From f22f11ca4f05a4c4efa3ba9e10ba118e3a1ae3df Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 3 Mar 2026 14:26:18 +0700 Subject: [PATCH 01/47] Initial implementation of CUDA interop unit test --- 76_CudaInterop/CMakeLists.txt | 24 + .../app_resources/vectorAdd_kernel.cu | 42 ++ 76_CudaInterop/main.cpp | 543 ++++++++++++++++++ CMakeLists.txt | 1 + 4 files changed, 610 insertions(+) create mode 100644 76_CudaInterop/CMakeLists.txt create mode 100644 76_CudaInterop/app_resources/vectorAdd_kernel.cu create mode 100644 76_CudaInterop/main.cpp diff --git a/76_CudaInterop/CMakeLists.txt b/76_CudaInterop/CMakeLists.txt new file mode 100644 index 000000000..bc1624875 --- /dev/null +++ b/76_CudaInterop/CMakeLists.txt @@ -0,0 +1,24 @@ +include(common RESULT_VARIABLE RES) +if(NOT RES) + message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory") +endif() + +nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") + +if(NBL_EMBED_BUILTIN_RESOURCES) + set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) + set(RESOURCE_DIR "app_resources") + + get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) + + file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") + foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) + LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") + endforeach() + + ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") + + LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) +endif() \ No newline at end of file diff --git 
a/76_CudaInterop/app_resources/vectorAdd_kernel.cu b/76_CudaInterop/app_resources/vectorAdd_kernel.cu new file mode 100644 index 000000000..3baef0123 --- /dev/null +++ b/76_CudaInterop/app_resources/vectorAdd_kernel.cu @@ -0,0 +1,42 @@ +/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * CUDA Kernel Device code + * + * Computes the vector addition of A and B into C. The 3 vectors have the same + * number of elements numElements. 
+ */ + +extern "C" __global__ void vectorAdd(const float *A, const float *B, float *C, + int numElements) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) { + C[i] = A[i] + B[i]; + } +} \ No newline at end of file diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp new file mode 100644 index 000000000..85d10ad13 --- /dev/null +++ b/76_CudaInterop/main.cpp @@ -0,0 +1,543 @@ +#include "nbl/video/CCUDAHandler.h" +// #include "nbl/video/CCUDASharedMemory.h" +// #include "nbl/video/CCUDASharedSemaphore.h" + +#include "nbl/application_templates/MonoDeviceApplication.hpp" +#include "nbl/examples/common/BuiltinResourcesApplication.hpp" + +using namespace nbl; +using namespace core; +using namespace system; +using namespace asset; +using namespace video; + +/* +The start of the main function starts like in most other example. We ask the +user for the desired renderer and start it up. +*/ + +bool check_cuda_err(cudaError_enum err, auto& cu, auto& logger, auto file, auto line) +{ + if (auto re = err; CUDA_SUCCESS != re) + { + const char* name = 0, * str = 0; + cu.pcuGetErrorName(re, &name); + cu.pcuGetErrorString(re, &str); + logger->log("%s:%d %s:\n\t%s\n", system::ILogger::ELL_ERROR, file, line, name, str); + return false; + } + return true; +} + +bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto line, std::string const& log) +{ + if (auto re = err; NVRTC_SUCCESS != re) + { + const char* str = cudaHandler->getNVRTCFunctionTable().pnvrtcGetErrorString(re); + logger->log("%s:%d %s\n%s\n", system::ILogger::ELL_ERROR, file, line, str, log.c_str()); + return false; + } + return true; +} + +#define ASSERT_SUCCESS(expr) { auto re = check_cuda_err((expr), cu, m_logger, __FILE__, __LINE__); assert(re); } +#define ASSERT_SUCCESS_NV(expr, log) { auto re = check_nv_err((expr), cudaHandler, m_logger, __FILE__, __LINE__, log); assert(re); } + + +using namespace nbl::core; +using namespace nbl::hlsl; +using namespace 
nbl::system; +using namespace nbl::asset; +using namespace nbl::video; +using namespace nbl::examples; +using namespace nbl::application_templates; + +class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplication +{ + using device_base_t = MonoDeviceApplication; + using asset_base_t = BuiltinResourcesApplication; + + static constexpr uint32_t gridDim[3] = { 4096,1,1 }; + static constexpr uint32_t blockDim[3] = { 1024,1,1 }; + static constexpr size_t numElements = gridDim[0] * blockDim[0]; + static constexpr size_t size = sizeof(float) * numElements; + +public: + // Yay thanks to multiple inheritance we cannot forward ctors anymore + CUDA2VKApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : + system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + + smart_refctd_ptr cudaHandler; + smart_refctd_ptr cudaDevice; + + IQueue* queue; + + // CPU memory which we fill with random numbers between [-1,1] that will be copied to corresponding cudaMemory + std::array, 2> cpuBufs; + // // CUDA resources that we input to the kernel 'vectorAdd_kernel.cu' + // // Kernel writes to cudaMemories[2] which we later use to export and read on nabla side + // std::array, 3> cudaMemories = {}; + // // A semaphore created in CUDA which will alias a Nabla semaphore to help sync between the CUDA kernel and Nabla device to host transfer + // smart_refctd_ptr cudaSemaphore; + + // our Buffer that is bound to cudaMemories[2] + smart_refctd_ptr importedBuf; + // our Image that is also bound to cudaMemories[2] + smart_refctd_ptr importedImg; + + // host visible buffers that we use to copy from the resources above after CUDA kernel is done writing + smart_refctd_ptr stagingBufs[2]; + + // Nabla semaphore for sync + smart_refctd_ptr semaphore; + + smart_refctd_ptr commandPool; + smart_refctd_ptr cmd[2]; + + // a device filter helps you create a set of physical 
devices that satisfy your requirements in terms of features, limits etc. + virtual void filterDevices(core::set& physicalDevices) const + { + device_base_t::filterDevices(physicalDevices); + auto& cuDevices = cudaHandler->getAvailableDevices(); + std::erase_if(physicalDevices, [&cuDevices](auto pdev) { + return cuDevices.end() == std::find_if(cuDevices.begin(), cuDevices.end(), [pdev](auto& cuDev) { return !memcmp(pdev->getProperties().deviceUUID, &cuDev.uuid, 16); }); + }); + } + + bool onAppInitialized(smart_refctd_ptr&& system) override + { + // Remember to call the base class initialization! + if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + + cudaHandler = CCUDAHandler::create(m_system.get(), smart_refctd_ptr(m_logger)); + if (!cudaHandler) + return logFail("Could not create a CUDA handler!"); + + if (!device_base_t::onAppInitialized(std::move(system))) + return false; + + cudaDevice = cudaHandler->createDevice(smart_refctd_ptr_dynamic_cast(m_api), m_physicalDevice); + if (!cudaDevice) + return logFail("Could not create a CUDA Device!"); + + + queue = device_base_t::getComputeQueue(); + + createResources(); + + smart_refctd_ptr ptx; + { + IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = m_logger.get(); + lp.workingDirectory = ""; // virtual root + // this time we load a shader directly from a file + auto assetBundle = m_assetMgr->getAsset("app_resources/vectorAdd_kernel.cu", lp); + const auto assets = assetBundle.getContents(); + if (assets.empty()) + return logFail("Could not load kernel!"); + + smart_refctd_ptr source = IAsset::castDown(assets[0]); + std::string log; + auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(std::string((const char*)source->getPointer(), source->getSize()), + "app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), 0, 0, 0, &log); + ASSERT_SUCCESS_NV(res, log); + + ptx = std::move(ptx_); + } + CUmodule module; + CUfunction kernel; + CUstream stream; + + auto& cu = 
cudaHandler->getCUDAFunctionTable(); + + ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr)); + ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd")); + ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); + + // launchKernel(kernel, stream); + + ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream)); + ASSERT_SUCCESS(cu.pcuModuleUnload(module)); + ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream)); + + m_device->waitIdle(); + + // testInterop(); + + return true; + } + + void createResources() + { + auto& cu = cudaHandler->getCUDAFunctionTable(); + + for (auto& buf : cpuBufs) + { + ICPUBuffer::SCreationParams params = {}; + params.size = size; + buf = ICPUBuffer::create(std::move(params)); + } + + for (auto j = 0; j < 2; j++) + for (auto i = 0; i < numElements; i++) + reinterpret_cast(cpuBufs[j]->getPointer())[i] = rand() / float(RAND_MAX); + + + // create and allocate CUmem with CUDA and slap it inside a simple IReferenceCounted wrapper + // ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[0], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); + // ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[1], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); + // ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[2], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); + // + // semaphore = m_device->createSemaphore(0, { .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 }); + // ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cudaSemaphore, semaphore.get())); + // { + // // export the CUmem we have just created into a refctd IDeviceMemoryAllocation + // auto devmemory = cudaMemories[2]->exportAsMemory(m_device.get()); + // if (!devmemory) + // logFail("Failed to export CUDA memory!"); + // + // + // // create an importing external buffer on Nabla side + // 
IGPUBuffer::SCreationParams params = {}; + // params.size = devmemory->getAllocationSize(); + // params.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT; + // params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; + // importedBuf = m_device->createBuffer(std::move(params)); + // if (!importedBuf) + // logFail("Failed to create an external buffer"); + // + // // bind that imported IDeviceMemoryAllocation to the external buffer we've just created + // ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = importedBuf.get(), .binding = {.memory = devmemory.get() } }; + // bool re = m_device->bindBufferMemory(1, &bindInfo); + // if (!re) logFail("Failed to bind CUDA memory to buffer"); + // } + // + // { + // // same thing as above + // // we create an external image and bind the imported external memory to it + // // now we have 2 different resources that are bound to the same memory + // IImage::SCreationParams params = {}; + // params.type = IGPUImage::ET_2D; + // params.samples = IGPUImage::ESCF_1_BIT; + // params.format = EF_R32_SFLOAT; + // params.extent = { gridDim[0], blockDim[0], 1 }; + // params.mipLevels = 1; + // params.arrayLayers = 1; + // params.usage = IGPUImage::EUF_TRANSFER_SRC_BIT; + // importedImg = cudaMemories[2]->createAndBindImage(m_device.get(), std::move(params)); + // if (!importedImg) logFail("Failed to create an external image"); + // } + // + // commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + // bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 2, cmd, smart_refctd_ptr(m_logger)); + // + // stagingBufs[0] = createStaging(); + // stagingBufs[1] = createStaging(); + } + + // smart_refctd_ptr createExternalBuffer(IDeviceMemoryAllocation* mem) + // { + // IGPUBuffer::SCreationParams params = {}; + // params.size = mem->getAllocationSize(); + // params.usage = 
asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT; + // params.externalHandleTypes = mem->getCreationParams().externalHandleType; + // auto buf = m_device->createBuffer(std::move(params)); + // ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = buf.get(), .binding = {.memory = mem } }; + // m_device->bindBufferMemory(1, &bindInfo); + // return buf; + // } + + // smart_refctd_ptr createStaging(size_t sz = size) + // { + // auto buf = m_device->createBuffer({ {.size = sz, .usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT} }); + // auto req = buf->getMemoryReqs(); + // req.memoryTypeBits &= m_device->getPhysicalDevice()->getDownStreamingMemoryTypeBits(); + // auto allocation = m_device->allocate(req, buf.get()); + // + // void* mapping = allocation.memory->map(IDeviceMemoryAllocation::MemoryRange(0, req.size), IDeviceMemoryAllocation::EMCAF_READ); + // if (!mapping) + // logFail("Failed to map an staging buffer"); + // memset(mapping, 0, req.size); + // return buf; + // }; + + // void launchKernel(CUfunction kernel, CUstream stream) + // { + // + // // First we record a release ownership transfer to let vulkan know that resources are going to be used in an external API + // { + // IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { + // .barrier = { + // .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE, + // .otherQueueFamilyIndex = IQueue::FamilyExternal, + // }, + // .range = {.buffer = importedBuf, }, + // }; + // + // IGPUCommandBuffer::SImageMemoryBarrier imgBarrier = { + // .barrier = { + // .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE, + // .otherQueueFamilyIndex = IQueue::FamilyExternal, + // }, + // .image = importedImg.get(), + // .subresourceRange = { + // .aspectMask = IImage::EAF_COLOR_BIT, + // .levelCount = 1u, + // .layerCount = 1u, + // } + // }; + // // start recording + // bool re = true; + // re &= 
cmd[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + // re &= cmd[0]->pipelineBarrier(EDF_NONE, { .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}, .imgBarriers = {&imgBarrier,&imgBarrier + 1} }); + // re &= cmd[0]->end(); + // + // IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = semaphore.get(), .value = 1 }; + // IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[0].get()}; + // IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1}, .signalSemaphores = {&signalInfo,&signalInfo + 1} }; + // auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 }); + // re &= IQueue::RESULT::SUCCESS == submitRe; + // if (!re) + // logFail("Something went wrong readying resources for CUDA"); + // } + // + // auto& cu = cudaHandler->getCUDAFunctionTable(); + // // Launch kernel + // { + // CUdeviceptr ptrs[] = { + // cudaMemories[0]->getDeviceptr(), + // cudaMemories[1]->getDeviceptr(), + // cudaMemories[2]->getDeviceptr(), + // }; + // auto numEles = numElements; + // void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numEles }; + // ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), size, stream)); + // ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), size, stream)); + // + // auto semaphore = cudaSemaphore->getInternalObject(); + // CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } }; + // ASSERT_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream)); // Wait for release op from vulkan + // ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr)); + // CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } }; + // ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream)); // Signal the imported semaphore + // } + // + // // After the cuda kernel 
has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA + // { + // IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { + // .barrier = { + // .dep = { + // .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, + // .dstAccessMask = ACCESS_FLAGS::TRANSFER_READ_BIT, + // }, + // .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE, + // .otherQueueFamilyIndex = IQueue::FamilyExternal, + // }, + // .range = { .buffer = importedBuf, }, + // }; + // bool re = true; + // re &= cmd[1]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + // + // re &= cmd[1]->pipelineBarrier(EDF_NONE, {.bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}}); + // + // IGPUCommandBuffer::SBufferCopy region = { .size = size }; + // re &= cmd[1]->copyBuffer(importedBuf.get(), stagingBufs[0].get(), 1, ®ion); + // + // IGPUCommandBuffer::SImageMemoryBarrier imgBarrier = { + // .barrier = { + // .dep = { + // .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, + // .dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS, + // }, + // .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE, + // .otherQueueFamilyIndex = IQueue::FamilyExternal, + // }, + // .image = importedImg.get(), + // .subresourceRange = { + // .aspectMask = IImage::EAF_COLOR_BIT, + // .levelCount = 1u, + // .layerCount = 1u, + // }, + // .oldLayout = IImage::LAYOUT::PREINITIALIZED, + // .newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL, + // }; + // + // re &= cmd[1]->pipelineBarrier(EDF_NONE, {.imgBarriers = {&imgBarrier,&imgBarrier + 1}}); + // + // IImage::SBufferCopy imgRegion = { + // .imageSubresource = { + // .aspectMask = imgBarrier.subresourceRange.aspectMask, + // .layerCount = imgBarrier.subresourceRange.layerCount, + // }, + // .imageExtent = importedImg->getCreationParameters().extent, + // }; + // + // re &= cmd[1]->copyImageToBuffer(importedImg.get(), imgBarrier.newLayout, stagingBufs[1].get(), 1, &imgRegion); + 
// re &= cmd[1]->end(); + // + // IQueue::SSubmitInfo::SSemaphoreInfo waitInfo= { .semaphore = semaphore.get(), .value = 2 }; + // IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = semaphore.get(), .value = 3 }; + // IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[1].get() }; + // IQueue::SSubmitInfo submitInfo = { + // .waitSemaphores = {&waitInfo,&waitInfo + 1}, + // .commandBuffers = {&cmdInfo, &cmdInfo + 1}, + // .signalSemaphores = {&signalInfo,&signalInfo + 1} + // }; + // auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 }); + // re &= IQueue::RESULT::SUCCESS == submitRe; + // if (!re) + // logFail("Something went wrong copying results from CUDA"); + // } + // + // ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, [](void* userData) { decltype(this)(userData)->kernelCallback(); }, this)); + // } + + // void kernelCallback() + // { + // // Make sure we are also done with the readback + // auto wait = std::array{ISemaphore::SWaitInfo{.semaphore = semaphore.get(), .value = 3}}; + // m_device->waitForSemaphores(wait, true, -1); + // + // float* A = reinterpret_cast(cpuBufs[0]->getPointer()); + // float* B = reinterpret_cast(cpuBufs[1]->getPointer()); + // + // float* CBuf = reinterpret_cast(stagingBufs[0]->getBoundMemory().memory->getMappedPointer()); + // float* CImg = reinterpret_cast(stagingBufs[1]->getBoundMemory().memory->getMappedPointer()); + // + // if(memcmp(CBuf, CImg, size)) + // logFail("Buffer and Image memories do not match!"); + // + // for (auto i = 0; i < numElements; i++) + // { + // bool re = (abs(CBuf[i] - A[i] - B[i]) < 0.01f) && (abs(CImg[i] - A[i] - B[i]) < 0.01f); + // if(!re) + // logFail("Element at index %d is incorrect!", i); + // } + // + // std::cout << "Success\n"; + // } + + + // void testInterop() + // { + // { + // IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = { + // .size = size, + // .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(), + // .alignmentLog2 = 10, + // }; + // + // 
for (size_t i = 0; i < (1 << 8); ++i) + // { + // auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory; + // assert(memory); + // auto tmpBuf = createExternalBuffer(memory.get()); + // } + // } + // + // smart_refctd_ptr escaped; + // { + // IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = { + // .size = size, + // .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(), + // .alignmentLog2 = 10, + // }; + // + // auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory; + // + // auto tmpBuf = createExternalBuffer(memory.get()); + // auto staging = createStaging(); + // + // auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer(); + // for (uint32_t i = 0; i < size / 4; ++i) + // ptr[i] = i; + // + // smart_refctd_ptr cmd; + // commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd); + // cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + // IGPUCommandBuffer::SBufferCopy region = { .size = size }; + // assert(cmd->copyBuffer(staging.get(), tmpBuf.get(), 1, ®ion)); + // cmd->end(); + // IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd.get() }; + // IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} }; + // queue->submit({ &submitInfo,&submitInfo + 1 }); + // m_device->waitIdle(); + // escaped = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, memory->getCreationParams().externalHandle).memory; + // } + // + // //{ + // // constexpr size_t M = 32; + // // auto staging = createStaging(size * M); + // + // // auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer(); + // // for (uint32_t i = 0; i < (M * size) / 4; ++i) + // // ptr[i] = rand(); + // + // // std::vector> cmd(1 
<< 10); + // // commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1 << 10, cmd.data()); + // + // // for (size_t i = 0; i < 1 << 10; ++i) + // // { + // // IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = { + // // .size = size * M, + // // .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(), + // // .alignmentLog2 = 10, + // // }; + // // RE: + // // auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory; + // + // // if (!memory) + // // { + // // m_device->waitIdle(); + // // for (size_t j = 0; j < i; ++j) + // // cmd[j] = 0; + // // goto END; + // // } + // // assert(memory); + // // auto tmpBuf = createExternalBuffer(memory.get()); + // + // // cmd[i]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + // // IGPUCommandBuffer::SBufferCopy region = { .size = size * M }; + // // assert(cmd[i]->copyBuffer(staging.get(), tmpBuf.get(), 1, ®ion)); + // // cmd[i]->end(); + // // IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[i].get() }; + // // IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} }; + // // assert(IQueue::RESULT::SUCCESS == queue->submit({ &submitInfo,&submitInfo + 1 })); + // // } + // //END: + // // m_device->waitIdle(); + // //} + // + // { + // auto tmpBuf = createExternalBuffer(escaped.get()); + // auto staging = createStaging(); + // + // smart_refctd_ptr cmd; + // commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd); + // cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + // IGPUCommandBuffer::SBufferCopy region = { .size = size }; + // assert(cmd->copyBuffer(tmpBuf.get(), staging.get(), 1, ®ion)); + // cmd->end(); + // IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd.get() }; + // IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} }; + // auto qre = queue->submit({ &submitInfo,&submitInfo + 1 
}); + // assert(IQueue::RESULT::SUCCESS == qre); + // m_device->waitIdle(); + // + // auto& ptr = *(std::array*)staging->getBoundMemory().memory->getMappedPointer(); + // for (uint32_t i = 0; i < size / 4; ++i) + // assert(ptr[i] == i); + // } + // + // } + + + // Whether to keep invoking the above. In this example because its headless GPU compute, we do all the work in the app initialization. + bool keepRunning() override { return false; } + + // Platforms like WASM expect the main entry point to periodically return control, hence if you want a crossplatform app, you have to let the framework deal with your "game loop" + void workLoopBody() override {} +}; + +NBL_MAIN_FUNC(CUDA2VKApp) \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index d945c547a..7c7990c06 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -111,6 +111,7 @@ if(NBL_BUILD_EXAMPLES) endif() add_subdirectory(74_QuantizedSequenceTests) + add_subdirectory(76_CudaInterop) # add new examples *before* NBL_GET_ALL_TARGETS invocation, it gathers recursively all targets created so far in this subdirectory NBL_GET_ALL_TARGETS(TARGETS) From b8abd200a1a83ce4592f7ad3290d07ae02b4f538 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 23 Mar 2026 17:00:19 +0700 Subject: [PATCH 02/47] Dummy --- 71_RayTracingPipeline/main.cpp | 2 +- 76_CudaInterop/main.cpp | 706 +++++++++++++++++---------------- 2 files changed, 359 insertions(+), 349 deletions(-) diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index f6b64c5ca..70ab21994 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -1245,7 +1245,7 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public Bui auto retval = device->allocate(info); // map what is mappable by default so ReBAR checks succeed if (retval.isValid() && retval.memory->isMappable()) - retval.memory->map({ .offset = 0,.length = info.size }); + retval.memory->map({ .offset = 0,.length = 
info.allocationSize }); return retval; } diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index 85d10ad13..c4b4fd5fe 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -76,9 +76,9 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica std::array, 2> cpuBufs; // // CUDA resources that we input to the kernel 'vectorAdd_kernel.cu' // // Kernel writes to cudaMemories[2] which we later use to export and read on nabla side - // std::array, 3> cudaMemories = {}; + std::array, 3> cudaMemories = {}; // // A semaphore created in CUDA which will alias a Nabla semaphore to help sync between the CUDA kernel and Nabla device to host transfer - // smart_refctd_ptr cudaSemaphore; + smart_refctd_ptr cudaSemaphore; // our Buffer that is bound to cudaMemories[2] smart_refctd_ptr importedBuf; @@ -155,7 +155,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd")); ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); - // launchKernel(kernel, stream); + launchKernel(kernel, stream); ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream)); ASSERT_SUCCESS(cu.pcuModuleUnload(module)); @@ -163,7 +163,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica m_device->waitIdle(); - // testInterop(); + testInterop(); return true; } @@ -185,352 +185,362 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica // create and allocate CUmem with CUDA and slap it inside a simple IReferenceCounted wrapper - // ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[0], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); - // ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[1], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); - // 
ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[2], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); - // - // semaphore = m_device->createSemaphore(0, { .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 }); - // ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cudaSemaphore, semaphore.get())); - // { - // // export the CUmem we have just created into a refctd IDeviceMemoryAllocation - // auto devmemory = cudaMemories[2]->exportAsMemory(m_device.get()); - // if (!devmemory) - // logFail("Failed to export CUDA memory!"); - // - // - // // create an importing external buffer on Nabla side - // IGPUBuffer::SCreationParams params = {}; - // params.size = devmemory->getAllocationSize(); - // params.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT; - // params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; - // importedBuf = m_device->createBuffer(std::move(params)); - // if (!importedBuf) - // logFail("Failed to create an external buffer"); - // - // // bind that imported IDeviceMemoryAllocation to the external buffer we've just created - // ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = importedBuf.get(), .binding = {.memory = devmemory.get() } }; - // bool re = m_device->bindBufferMemory(1, &bindInfo); - // if (!re) logFail("Failed to bind CUDA memory to buffer"); - // } - // - // { - // // same thing as above - // // we create an external image and bind the imported external memory to it - // // now we have 2 different resources that are bound to the same memory - // IImage::SCreationParams params = {}; - // params.type = IGPUImage::ET_2D; - // params.samples = IGPUImage::ESCF_1_BIT; - // params.format = EF_R32_SFLOAT; - // params.extent = { gridDim[0], blockDim[0], 1 }; - // params.mipLevels = 1; - // params.arrayLayers = 1; - // params.usage = IGPUImage::EUF_TRANSFER_SRC_BIT; - // importedImg = cudaMemories[2]->createAndBindImage(m_device.get(), 
std::move(params)); - // if (!importedImg) logFail("Failed to create an external image"); - // } - // - // commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - // bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 2, cmd, smart_refctd_ptr(m_logger)); - // - // stagingBufs[0] = createStaging(); - // stagingBufs[1] = createStaging(); + ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[0], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); + ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[1], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); + ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[2], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); + + semaphore = m_device->createSemaphore(0, { .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 }); + ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cudaSemaphore, semaphore.get())); + { + // export the CUmem we have just created into a refctd IDeviceMemoryAllocation + auto devmemory = cudaMemories[2]->exportAsMemory(m_device.get()); + if (!devmemory) + logFail("Failed to export CUDA memory!"); + + + // create an importing external buffer on Nabla side + IGPUBuffer::SCreationParams params = {}; + params.size = devmemory->getAllocationSize(); + params.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT; + params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; + importedBuf = m_device->createBuffer(std::move(params)); + if (!importedBuf) + logFail("Failed to create an external buffer"); + + // bind that imported IDeviceMemoryAllocation to the external buffer we've just created + ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = importedBuf.get(), .binding = {.memory = devmemory.get() } }; + bool re = 
m_device->bindBufferMemory(1, &bindInfo); + if (!re) logFail("Failed to bind CUDA memory to buffer"); + } + + { + // same thing as above + // we create an external image and bind the imported external memory to it + // now we have 2 different resources that are bound to the same memory + + auto devmemory = cudaMemories[2]->exportAsMemory(m_device.get()); + if (!devmemory) + logFail("Failed to export CUDA memory!"); + + IGPUImage::SCreationParams params = {}; + params.type = IGPUImage::ET_2D; + params.samples = IGPUImage::ESCF_1_BIT; + params.format = EF_R32_SFLOAT; + params.extent = { gridDim[0], blockDim[0], 1 }; + params.mipLevels = 1; + params.arrayLayers = 1; + params.usage = IGPUImage::EUF_TRANSFER_SRC_BIT; + params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; + importedImg = m_device->createImage(std::move(params)); + if (!importedImg) logFail("Failed to create an external image"); + // bind that imported IDeviceMemoryAllocation to the external buffer we've just created + ILogicalDevice::SBindImageMemoryInfo bindInfo = { .image = importedImg.get(), .binding = {.memory = devmemory.get() } }; + bool re = m_device->bindImageMemory(1, &bindInfo); + if (!re) logFail("Failed to bind CUDA memory to buffer"); + } + + commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 2, cmd, smart_refctd_ptr(m_logger)); + + stagingBufs[0] = createStaging(); + stagingBufs[1] = createStaging(); + } + + smart_refctd_ptr createExternalBuffer(IDeviceMemoryAllocation* mem) + { + IGPUBuffer::SCreationParams params = {}; + params.size = mem->getAllocationSize(); + params.usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT; + params.externalHandleTypes = mem->getCreationParams().externalHandleType; + auto buf = m_device->createBuffer(std::move(params)); + 
ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = buf.get(), .binding = {.memory = mem } }; + m_device->bindBufferMemory(1, &bindInfo); + return buf; + } + + smart_refctd_ptr createStaging(size_t sz = size) + { + auto buf = m_device->createBuffer({ {.size = sz, .usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT} }); + auto req = buf->getMemoryReqs(); + req.memoryTypeBits &= m_device->getPhysicalDevice()->getDownStreamingMemoryTypeBits(); + auto allocation = m_device->allocate(req, buf.get()); + + void* mapping = allocation.memory->map(IDeviceMemoryAllocation::MemoryRange(0, req.size), IDeviceMemoryAllocation::EMCAF_READ); + if (!mapping) + logFail("Failed to map an staging buffer"); + memset(mapping, 0, req.size); + return buf; + }; + + void launchKernel(CUfunction kernel, CUstream stream) + { + + // First we record a release ownership transfer to let vulkan know that resources are going to be used in an external API + { + IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { + .barrier = { + .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE, + .otherQueueFamilyIndex = IQueue::FamilyExternal, + }, + .range = {.buffer = importedBuf, }, + }; + + IGPUCommandBuffer::SImageMemoryBarrier imgBarrier = { + .barrier = { + .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE, + .otherQueueFamilyIndex = IQueue::FamilyExternal, + }, + .image = importedImg.get(), + .subresourceRange = { + .aspectMask = IImage::EAF_COLOR_BIT, + .levelCount = 1u, + .layerCount = 1u, + } + }; + // start recording + bool re = true; + re &= cmd[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + re &= cmd[0]->pipelineBarrier(EDF_NONE, { .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}, .imgBarriers = {&imgBarrier,&imgBarrier + 1} }); + re &= cmd[0]->end(); + + IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = semaphore.get(), .value = 1 }; + 
IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[0].get()}; + IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1}, .signalSemaphores = {&signalInfo,&signalInfo + 1} }; + auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 }); + re &= IQueue::RESULT::SUCCESS == submitRe; + if (!re) + logFail("Something went wrong readying resources for CUDA"); + } + + auto& cu = cudaHandler->getCUDAFunctionTable(); + // Launch kernel + { + CUdeviceptr ptrs[] = { + cudaMemories[0]->getDeviceptr(), + cudaMemories[1]->getDeviceptr(), + cudaMemories[2]->getDeviceptr(), + }; + auto numEles = numElements; + void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numEles }; + ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), size, stream)); + ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), size, stream)); + + auto semaphore = cudaSemaphore->getInternalObject(); + CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } }; + ASSERT_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream)); // Wait for release op from vulkan + ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr)); + CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } }; + // ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream)); // Signal the imported semaphore + } + + // After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA + { + IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { + .barrier = { + .dep = { + .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, + .dstAccessMask = ACCESS_FLAGS::TRANSFER_READ_BIT, + }, + .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE, + .otherQueueFamilyIndex = 
IQueue::FamilyExternal, + }, + .range = { .buffer = importedBuf, }, + }; + bool re = true; + re &= cmd[1]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + + re &= cmd[1]->pipelineBarrier(EDF_NONE, {.bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}}); + + IGPUCommandBuffer::SBufferCopy region = { .size = size }; + re &= cmd[1]->copyBuffer(importedBuf.get(), stagingBufs[0].get(), 1, ®ion); + + IGPUCommandBuffer::SImageMemoryBarrier imgBarrier = { + .barrier = { + .dep = { + .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, + .dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS, + }, + .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE, + .otherQueueFamilyIndex = IQueue::FamilyExternal, + }, + .image = importedImg.get(), + .subresourceRange = { + .aspectMask = IImage::EAF_COLOR_BIT, + .levelCount = 1u, + .layerCount = 1u, + }, + .oldLayout = IImage::LAYOUT::PREINITIALIZED, + .newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL, + }; + + re &= cmd[1]->pipelineBarrier(EDF_NONE, {.imgBarriers = {&imgBarrier,&imgBarrier + 1}}); + + IImage::SBufferCopy imgRegion = { + .imageSubresource = { + .aspectMask = imgBarrier.subresourceRange.aspectMask, + .layerCount = imgBarrier.subresourceRange.layerCount, + }, + .imageExtent = importedImg->getCreationParameters().extent, + }; + + re &= cmd[1]->copyImageToBuffer(importedImg.get(), imgBarrier.newLayout, stagingBufs[1].get(), 1, &imgRegion); + re &= cmd[1]->end(); + + IQueue::SSubmitInfo::SSemaphoreInfo waitInfo= { .semaphore = semaphore.get(), .value = 2 }; + IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = semaphore.get(), .value = 3 }; + IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[1].get() }; + IQueue::SSubmitInfo submitInfo = { + .waitSemaphores = {&waitInfo,&waitInfo + 1}, + .commandBuffers = {&cmdInfo, &cmdInfo + 1}, + .signalSemaphores = {&signalInfo,&signalInfo + 1} + }; + auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 }); + re &= 
IQueue::RESULT::SUCCESS == submitRe; + if (!re) + logFail("Something went wrong copying results from CUDA"); + } + + ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, [](void* userData) { decltype(this)(userData)->kernelCallback(); }, this)); + } + + void kernelCallback() + { + // Make sure we are also done with the readback + auto wait = std::array{ISemaphore::SWaitInfo{.semaphore = semaphore.get(), .value = 2}}; + m_device->waitForSemaphores(wait, true, -1); + + float* A = reinterpret_cast(cpuBufs[0]->getPointer()); + float* B = reinterpret_cast(cpuBufs[1]->getPointer()); + + float* CBuf = reinterpret_cast(stagingBufs[0]->getBoundMemory().memory->getMappedPointer()); + float* CImg = reinterpret_cast(stagingBufs[1]->getBoundMemory().memory->getMappedPointer()); + + if(memcmp(CBuf, CImg, size)) + logFail("Buffer and Image memories do not match!"); + + for (auto i = 0; i < numElements; i++) + { + bool re = (abs(CBuf[i] - A[i] - B[i]) < 0.01f) && (abs(CImg[i] - A[i] - B[i]) < 0.01f); + if(!re) + logFail("Element at index %d is incorrect!", i); + } + + std::cout << "Success\n"; } - // smart_refctd_ptr createExternalBuffer(IDeviceMemoryAllocation* mem) - // { - // IGPUBuffer::SCreationParams params = {}; - // params.size = mem->getAllocationSize(); - // params.usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT; - // params.externalHandleTypes = mem->getCreationParams().externalHandleType; - // auto buf = m_device->createBuffer(std::move(params)); - // ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = buf.get(), .binding = {.memory = mem } }; - // m_device->bindBufferMemory(1, &bindInfo); - // return buf; - // } - - // smart_refctd_ptr createStaging(size_t sz = size) - // { - // auto buf = m_device->createBuffer({ {.size = sz, .usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT} }); - // auto req = buf->getMemoryReqs(); - // req.memoryTypeBits &= 
m_device->getPhysicalDevice()->getDownStreamingMemoryTypeBits(); - // auto allocation = m_device->allocate(req, buf.get()); - // - // void* mapping = allocation.memory->map(IDeviceMemoryAllocation::MemoryRange(0, req.size), IDeviceMemoryAllocation::EMCAF_READ); - // if (!mapping) - // logFail("Failed to map an staging buffer"); - // memset(mapping, 0, req.size); - // return buf; - // }; - - // void launchKernel(CUfunction kernel, CUstream stream) - // { - // - // // First we record a release ownership transfer to let vulkan know that resources are going to be used in an external API - // { - // IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { - // .barrier = { - // .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE, - // .otherQueueFamilyIndex = IQueue::FamilyExternal, - // }, - // .range = {.buffer = importedBuf, }, - // }; - // - // IGPUCommandBuffer::SImageMemoryBarrier imgBarrier = { - // .barrier = { - // .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE, - // .otherQueueFamilyIndex = IQueue::FamilyExternal, - // }, - // .image = importedImg.get(), - // .subresourceRange = { - // .aspectMask = IImage::EAF_COLOR_BIT, - // .levelCount = 1u, - // .layerCount = 1u, - // } - // }; - // // start recording - // bool re = true; - // re &= cmd[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - // re &= cmd[0]->pipelineBarrier(EDF_NONE, { .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}, .imgBarriers = {&imgBarrier,&imgBarrier + 1} }); - // re &= cmd[0]->end(); - // - // IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = semaphore.get(), .value = 1 }; - // IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[0].get()}; - // IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1}, .signalSemaphores = {&signalInfo,&signalInfo + 1} }; - // auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 }); - // re &= IQueue::RESULT::SUCCESS == submitRe; - 
// if (!re) - // logFail("Something went wrong readying resources for CUDA"); - // } - // - // auto& cu = cudaHandler->getCUDAFunctionTable(); - // // Launch kernel - // { - // CUdeviceptr ptrs[] = { - // cudaMemories[0]->getDeviceptr(), - // cudaMemories[1]->getDeviceptr(), - // cudaMemories[2]->getDeviceptr(), - // }; - // auto numEles = numElements; - // void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numEles }; - // ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), size, stream)); - // ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), size, stream)); - // - // auto semaphore = cudaSemaphore->getInternalObject(); - // CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } }; - // ASSERT_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream)); // Wait for release op from vulkan - // ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr)); - // CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } }; - // ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream)); // Signal the imported semaphore - // } - // - // // After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA - // { - // IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { - // .barrier = { - // .dep = { - // .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, - // .dstAccessMask = ACCESS_FLAGS::TRANSFER_READ_BIT, - // }, - // .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE, - // .otherQueueFamilyIndex = IQueue::FamilyExternal, - // }, - // .range = { .buffer = importedBuf, }, - // }; - // bool re = true; - // re &= cmd[1]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - // - // re &= cmd[1]->pipelineBarrier(EDF_NONE, 
{.bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}}); - // - // IGPUCommandBuffer::SBufferCopy region = { .size = size }; - // re &= cmd[1]->copyBuffer(importedBuf.get(), stagingBufs[0].get(), 1, ®ion); - // - // IGPUCommandBuffer::SImageMemoryBarrier imgBarrier = { - // .barrier = { - // .dep = { - // .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, - // .dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS, - // }, - // .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE, - // .otherQueueFamilyIndex = IQueue::FamilyExternal, - // }, - // .image = importedImg.get(), - // .subresourceRange = { - // .aspectMask = IImage::EAF_COLOR_BIT, - // .levelCount = 1u, - // .layerCount = 1u, - // }, - // .oldLayout = IImage::LAYOUT::PREINITIALIZED, - // .newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL, - // }; - // - // re &= cmd[1]->pipelineBarrier(EDF_NONE, {.imgBarriers = {&imgBarrier,&imgBarrier + 1}}); - // - // IImage::SBufferCopy imgRegion = { - // .imageSubresource = { - // .aspectMask = imgBarrier.subresourceRange.aspectMask, - // .layerCount = imgBarrier.subresourceRange.layerCount, - // }, - // .imageExtent = importedImg->getCreationParameters().extent, - // }; - // - // re &= cmd[1]->copyImageToBuffer(importedImg.get(), imgBarrier.newLayout, stagingBufs[1].get(), 1, &imgRegion); - // re &= cmd[1]->end(); - // - // IQueue::SSubmitInfo::SSemaphoreInfo waitInfo= { .semaphore = semaphore.get(), .value = 2 }; - // IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = semaphore.get(), .value = 3 }; - // IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[1].get() }; - // IQueue::SSubmitInfo submitInfo = { - // .waitSemaphores = {&waitInfo,&waitInfo + 1}, - // .commandBuffers = {&cmdInfo, &cmdInfo + 1}, - // .signalSemaphores = {&signalInfo,&signalInfo + 1} - // }; - // auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 }); - // re &= IQueue::RESULT::SUCCESS == submitRe; - // if (!re) - // logFail("Something went wrong 
copying results from CUDA"); - // } - // - // ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, [](void* userData) { decltype(this)(userData)->kernelCallback(); }, this)); - // } - - // void kernelCallback() - // { - // // Make sure we are also done with the readback - // auto wait = std::array{ISemaphore::SWaitInfo{.semaphore = semaphore.get(), .value = 3}}; - // m_device->waitForSemaphores(wait, true, -1); - // - // float* A = reinterpret_cast(cpuBufs[0]->getPointer()); - // float* B = reinterpret_cast(cpuBufs[1]->getPointer()); - // - // float* CBuf = reinterpret_cast(stagingBufs[0]->getBoundMemory().memory->getMappedPointer()); - // float* CImg = reinterpret_cast(stagingBufs[1]->getBoundMemory().memory->getMappedPointer()); - // - // if(memcmp(CBuf, CImg, size)) - // logFail("Buffer and Image memories do not match!"); - // - // for (auto i = 0; i < numElements; i++) - // { - // bool re = (abs(CBuf[i] - A[i] - B[i]) < 0.01f) && (abs(CImg[i] - A[i] - B[i]) < 0.01f); - // if(!re) - // logFail("Element at index %d is incorrect!", i); - // } - // - // std::cout << "Success\n"; - // } - - - // void testInterop() - // { - // { - // IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = { - // .size = size, - // .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(), - // .alignmentLog2 = 10, - // }; - // - // for (size_t i = 0; i < (1 << 8); ++i) - // { - // auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory; - // assert(memory); - // auto tmpBuf = createExternalBuffer(memory.get()); - // } - // } - // - // smart_refctd_ptr escaped; - // { - // IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = { - // .size = size, - // .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(), - // .alignmentLog2 = 10, - // }; - // - // auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, 
CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory; - // - // auto tmpBuf = createExternalBuffer(memory.get()); - // auto staging = createStaging(); - // - // auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer(); - // for (uint32_t i = 0; i < size / 4; ++i) - // ptr[i] = i; - // - // smart_refctd_ptr cmd; - // commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd); - // cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - // IGPUCommandBuffer::SBufferCopy region = { .size = size }; - // assert(cmd->copyBuffer(staging.get(), tmpBuf.get(), 1, ®ion)); - // cmd->end(); - // IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd.get() }; - // IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} }; - // queue->submit({ &submitInfo,&submitInfo + 1 }); - // m_device->waitIdle(); - // escaped = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, memory->getCreationParams().externalHandle).memory; - // } - // - // //{ - // // constexpr size_t M = 32; - // // auto staging = createStaging(size * M); - // - // // auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer(); - // // for (uint32_t i = 0; i < (M * size) / 4; ++i) - // // ptr[i] = rand(); - // - // // std::vector> cmd(1 << 10); - // // commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1 << 10, cmd.data()); - // - // // for (size_t i = 0; i < 1 << 10; ++i) - // // { - // // IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = { - // // .size = size * M, - // // .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(), - // // .alignmentLog2 = 10, - // // }; - // // RE: - // // auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory; - // - // // if (!memory) - // // { - // // m_device->waitIdle(); - // // for 
(size_t j = 0; j < i; ++j) - // // cmd[j] = 0; - // // goto END; - // // } - // // assert(memory); - // // auto tmpBuf = createExternalBuffer(memory.get()); - // - // // cmd[i]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - // // IGPUCommandBuffer::SBufferCopy region = { .size = size * M }; - // // assert(cmd[i]->copyBuffer(staging.get(), tmpBuf.get(), 1, ®ion)); - // // cmd[i]->end(); - // // IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[i].get() }; - // // IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} }; - // // assert(IQueue::RESULT::SUCCESS == queue->submit({ &submitInfo,&submitInfo + 1 })); - // // } - // //END: - // // m_device->waitIdle(); - // //} - // - // { - // auto tmpBuf = createExternalBuffer(escaped.get()); - // auto staging = createStaging(); - // - // smart_refctd_ptr cmd; - // commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd); - // cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - // IGPUCommandBuffer::SBufferCopy region = { .size = size }; - // assert(cmd->copyBuffer(tmpBuf.get(), staging.get(), 1, ®ion)); - // cmd->end(); - // IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd.get() }; - // IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} }; - // auto qre = queue->submit({ &submitInfo,&submitInfo + 1 }); - // assert(IQueue::RESULT::SUCCESS == qre); - // m_device->waitIdle(); - // - // auto& ptr = *(std::array*)staging->getBoundMemory().memory->getMappedPointer(); - // for (uint32_t i = 0; i < size / 4; ++i) - // assert(ptr[i] == i); - // } - // - // } + + void testInterop() + { + { + IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = { + .size = size, + .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(), + .alignmentLog2 = 10, + }; + + for (size_t i = 0; i < (1 << 8); ++i) + { + auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, 
CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory; + assert(memory); + auto tmpBuf = createExternalBuffer(memory.get()); + } + } + + smart_refctd_ptr escaped; + { + IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = { + .size = size, + .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(), + .alignmentLog2 = 10, + }; + + auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory; + + auto tmpBuf = createExternalBuffer(memory.get()); + auto staging = createStaging(); + + auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer(); + for (uint32_t i = 0; i < size / 4; ++i) + ptr[i] = i; + + smart_refctd_ptr cmd; + commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd); + cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + IGPUCommandBuffer::SBufferCopy region = { .size = size }; + assert(cmd->copyBuffer(staging.get(), tmpBuf.get(), 1, ®ion)); + cmd->end(); + IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd.get() }; + IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} }; + queue->submit({ &submitInfo,&submitInfo + 1 }); + m_device->waitIdle(); + escaped = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, memory->getCreationParams().externalHandle).memory; + } + + { + constexpr size_t M = 32; + auto staging = createStaging(size * M); + + auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer(); + for (uint32_t i = 0; i < (M * size) / 4; ++i) + ptr[i] = rand(); + + std::vector> cmd(1 << 10); + commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1 << 10, cmd.data()); + + for (size_t i = 0; i < 1 << 10; ++i) + { + IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = { + .size = size * M, + .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(), + 
.alignmentLog2 = 10, + }; + RE: + auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory; + + if (!memory) + { + m_device->waitIdle(); + for (size_t j = 0; j < i; ++j) + cmd[j] = 0; + goto END; + } + assert(memory); + auto tmpBuf = createExternalBuffer(memory.get()); + + cmd[i]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + IGPUCommandBuffer::SBufferCopy region = { .size = size * M }; + assert(cmd[i]->copyBuffer(staging.get(), tmpBuf.get(), 1, ®ion)); + cmd[i]->end(); + IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[i].get() }; + IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} }; + assert(IQueue::RESULT::SUCCESS == queue->submit({ &submitInfo,&submitInfo + 1 })); + } + END: + m_device->waitIdle(); + } + + { + auto tmpBuf = createExternalBuffer(escaped.get()); + auto staging = createStaging(); + + smart_refctd_ptr cmd; + commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd); + cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + IGPUCommandBuffer::SBufferCopy region = { .size = size }; + assert(cmd->copyBuffer(tmpBuf.get(), staging.get(), 1, ®ion)); + cmd->end(); + IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd.get() }; + IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} }; + auto qre = queue->submit({ &submitInfo,&submitInfo + 1 }); + assert(IQueue::RESULT::SUCCESS == qre); + m_device->waitIdle(); + + auto& ptr = *(std::array*)staging->getBoundMemory().memory->getMappedPointer(); + for (uint32_t i = 0; i < size / 4; ++i) + assert(ptr[i] == i); + } + + } // Whether to keep invoking the above. In this example because its headless GPU compute, we do all the work in the app initialization. 
From 93ca5efe588ca85c1eaf81a486b611df98403580 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 01:09:08 +0700 Subject: [PATCH 03/47] Refactor test into separate section --- .../app_resources/vectorAdd_kernel.cu | 6 +- 76_CudaInterop/main.cpp | 686 +++++++++--------- 2 files changed, 350 insertions(+), 342 deletions(-) diff --git a/76_CudaInterop/app_resources/vectorAdd_kernel.cu b/76_CudaInterop/app_resources/vectorAdd_kernel.cu index 3baef0123..35876a627 100644 --- a/76_CudaInterop/app_resources/vectorAdd_kernel.cu +++ b/76_CudaInterop/app_resources/vectorAdd_kernel.cu @@ -33,10 +33,8 @@ */ extern "C" __global__ void vectorAdd(const float *A, const float *B, float *C, - int numElements) { + size_t numElements) { int i = blockDim.x * blockIdx.x + threadIdx.x; - - if (i < numElements) { + if (i < numElements) C[i] = A[i] + B[i]; - } } \ No newline at end of file diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index c4b4fd5fe..2a64f9428 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -1,6 +1,6 @@ #include "nbl/video/CCUDAHandler.h" -// #include "nbl/video/CCUDASharedMemory.h" -// #include "nbl/video/CCUDASharedSemaphore.h" +// #include "nbl/video/CCUDAExportableMemory.h" +// #include "nbl/video/CCUDAImportedSemaphore.h" #include "nbl/application_templates/MonoDeviceApplication.hpp" #include "nbl/examples/common/BuiltinResourcesApplication.hpp" @@ -57,10 +57,6 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica using device_base_t = MonoDeviceApplication; using asset_base_t = BuiltinResourcesApplication; - static constexpr uint32_t gridDim[3] = { 4096,1,1 }; - static constexpr uint32_t blockDim[3] = { 1024,1,1 }; - static constexpr size_t numElements = gridDim[0] * blockDim[0]; - static constexpr size_t size = sizeof(float) * numElements; public: // Yay thanks to multiple inheritance we cannot forward ctors anymore @@ -72,27 +68,6 @@ class CUDA2VKApp : public virtual 
MonoDeviceApplication, BuiltinResourcesApplica IQueue* queue; - // CPU memory which we fill with random numbers between [-1,1] that will be copied to corresponding cudaMemory - std::array, 2> cpuBufs; - // // CUDA resources that we input to the kernel 'vectorAdd_kernel.cu' - // // Kernel writes to cudaMemories[2] which we later use to export and read on nabla side - std::array, 3> cudaMemories = {}; - // // A semaphore created in CUDA which will alias a Nabla semaphore to help sync between the CUDA kernel and Nabla device to host transfer - smart_refctd_ptr cudaSemaphore; - - // our Buffer that is bound to cudaMemories[2] - smart_refctd_ptr importedBuf; - // our Image that is also bound to cudaMemories[2] - smart_refctd_ptr importedImg; - - // host visible buffers that we use to copy from the resources above after CUDA kernel is done writing - smart_refctd_ptr stagingBufs[2]; - - // Nabla semaphore for sync - smart_refctd_ptr semaphore; - - smart_refctd_ptr commandPool; - smart_refctd_ptr cmd[2]; // a device filter helps you create a set of physical devices that satisfy your requirements in terms of features, limits etc. 
virtual void filterDevices(core::set& physicalDevices) const @@ -121,10 +96,47 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica if (!cudaDevice) return logFail("Could not create a CUDA Device!"); - - queue = device_base_t::getComputeQueue(); - - createResources(); + testSharedResource(); + testDestruction(); + testLargeAllocations(); + + return true; + } + + smart_refctd_ptr createExternalBuffer(IDeviceMemoryAllocation* mem) + { + IGPUBuffer::SCreationParams params = {}; + params.size = mem->getAllocationSize(); + params.usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT; + params.externalHandleTypes = mem->getCreationParams().externalHandleType; + auto buf = m_device->createBuffer(std::move(params)); + ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = buf.get(), .binding = {.memory = mem } }; + m_device->bindBufferMemory(1, &bindInfo); + return buf; + } + + smart_refctd_ptr createStaging(size_t sz) + { + auto buf = m_device->createBuffer({ {.size = sz, .usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT} }); + auto req = buf->getMemoryReqs(); + req.memoryTypeBits &= m_device->getPhysicalDevice()->getDownStreamingMemoryTypeBits() + & m_device->getPhysicalDevice()->getHostVisibleMemoryTypeBits() + & m_device->getPhysicalDevice()->getMemoryTypeBitsFromMemoryTypeFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT); + auto allocation = m_device->allocate(req, buf.get()); + + void* mapping = allocation.memory->map(IDeviceMemoryAllocation::MemoryRange(0, req.size), IDeviceMemoryAllocation::EMCAF_READ); + if (!mapping) + logFail("Failed to map an staging buffer"); + memset(mapping, 0, req.size); + return buf; + }; + + void testSharedResource() + { + static constexpr uint32_t GridDim[3] = { 4096,1,1 }; + static constexpr uint32_t BlockDim[3] = { 1024,1,1 }; + static constexpr size_t NumElements = GridDim[0] * BlockDim[0]; + static constexpr size_t BufferSize = 
sizeof(float) * NumElements; smart_refctd_ptr ptx; { @@ -135,7 +147,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica auto assetBundle = m_assetMgr->getAsset("app_resources/vectorAdd_kernel.cu", lp); const auto assets = assetBundle.getContents(); if (assets.empty()) - return logFail("Could not load kernel!"); + logFail("Could not load kernel!"); smart_refctd_ptr source = IAsset::castDown(assets[0]); std::string log; @@ -145,197 +157,137 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica ptx = std::move(ptx_); } + + auto& cu = cudaHandler->getCUDAFunctionTable(); + CUmodule module; CUfunction kernel; CUstream stream; - auto& cu = cudaHandler->getCUDAFunctionTable(); - ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr)); ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd")); ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); - launchKernel(kernel, stream); - - ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream)); - ASSERT_SUCCESS(cu.pcuModuleUnload(module)); - ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream)); - - m_device->waitIdle(); - - testInterop(); - - return true; - } - - void createResources() - { - auto& cu = cudaHandler->getCUDAFunctionTable(); + // CPU memory which we fill with random numbers between [-1,1] that will be copied to corresponding cudaMemory + std::array, 2> cpuBufs; for (auto& buf : cpuBufs) { - ICPUBuffer::SCreationParams params = {}; - params.size = size; - buf = ICPUBuffer::create(std::move(params)); + ICPUBuffer::SCreationParams params = {}; + params.size = BufferSize; + buf = ICPUBuffer::create(std::move(params)); } - for (auto j = 0; j < 2; j++) - for (auto i = 0; i < numElements; i++) - reinterpret_cast(cpuBufs[j]->getPointer())[i] = rand() / float(RAND_MAX); + for (auto buf_i = 0; buf_i < cpuBufs.size(); buf_i++) + for (auto elem_i = 0; elem_i < NumElements; elem_i++) + 
reinterpret_cast(cpuBufs[buf_i]->getPointer())[elem_i] = rand() / float(RAND_MAX); + constexpr auto InputCount = 2; + // // CUDA resources that we input to the kernel 'vectorAdd_kernel.cu' + // // Kernel writes to cudaInputMemories[2] which we later use to export and read on nabla side + std::array, InputCount> cudaInputMemories = {}; + std::array, InputCount> vulkanMemories = {}; + std::array, InputCount> vulkanInputBuffers = {}; + std::array, InputCount> inputStagingBuffers = {}; - // create and allocate CUmem with CUDA and slap it inside a simple IReferenceCounted wrapper - ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[0], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); - ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[1], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); - ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[2], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); - - semaphore = m_device->createSemaphore(0, { .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 }); - ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cudaSemaphore, semaphore.get())); + for (auto input_i = 0; input_i < InputCount; input_i++) { - // export the CUmem we have just created into a refctd IDeviceMemoryAllocation - auto devmemory = cudaMemories[2]->exportAsMemory(m_device.get()); - if (!devmemory) - logFail("Failed to export CUDA memory!"); - - - // create an importing external buffer on Nabla side - IGPUBuffer::SCreationParams params = {}; - params.size = devmemory->getAllocationSize(); - params.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT; - params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; - importedBuf = m_device->createBuffer(std::move(params)); - if (!importedBuf) - logFail("Failed to create an external buffer"); - - // bind that imported 
IDeviceMemoryAllocation to the external buffer we've just created - ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = importedBuf.get(), .binding = {.memory = devmemory.get() } }; - bool re = m_device->bindBufferMemory(1, &bindInfo); - if (!re) logFail("Failed to bind CUDA memory to buffer"); + // create and allocate CUmem with CUDA and slap it inside a simple IReferenceCounted wrapper + ASSERT_SUCCESS(cudaDevice->createExportableMemory(&cudaInputMemories[input_i], { .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); + vulkanMemories[input_i] = cudaInputMemories[input_i]->exportAsMemory(m_device.get(), nullptr); + vulkanInputBuffers[input_i] = createExternalBuffer(vulkanMemories[input_i].get()); + inputStagingBuffers[input_i] = createStaging(BufferSize); } + + IGPUBuffer::SCreationParams outputBufferParams; + outputBufferParams.size = cudaDevice->roundToGranularity(CU_MEM_LOCATION_TYPE_DEVICE, BufferSize); + outputBufferParams.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT; + outputBufferParams.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; + const auto outputBuf = m_device->createBuffer(std::move(outputBufferParams)); + auto outputMemReq = outputBuf->getMemoryReqs(); + auto allocation = m_device->allocate(outputMemReq, outputBuf.get(), IDeviceMemoryAllocation::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE); + core::smart_refctd_ptr cudaOutputMemory; + ASSERT_SUCCESS(cudaDevice->importExternalMemory(&cudaOutputMemory, allocation.memory.get())); - { - // same thing as above - // we create an external image and bind the imported external memory to it - // now we have 2 different resources that are bound to the same memory - - auto devmemory = cudaMemories[2]->exportAsMemory(m_device.get()); - if (!devmemory) - logFail("Failed to export CUDA memory!"); - - IGPUImage::SCreationParams params = {}; - params.type = IGPUImage::ET_2D; - params.samples = 
IGPUImage::ESCF_1_BIT; - params.format = EF_R32_SFLOAT; - params.extent = { gridDim[0], blockDim[0], 1 }; - params.mipLevels = 1; - params.arrayLayers = 1; - params.usage = IGPUImage::EUF_TRANSFER_SRC_BIT; - params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; - importedImg = m_device->createImage(std::move(params)); - if (!importedImg) logFail("Failed to create an external image"); - // bind that imported IDeviceMemoryAllocation to the external buffer we've just created - ILogicalDevice::SBindImageMemoryInfo bindInfo = { .image = importedImg.get(), .binding = {.memory = devmemory.get() } }; - bool re = m_device->bindImageMemory(1, &bindInfo); - if (!re) logFail("Failed to bind CUDA memory to buffer"); - } + ISemaphore::SCreationParams semParams; + semParams.externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32; + auto semaphore = m_device->createSemaphore(0, std::move(semParams)); + core::smart_refctd_ptr cudaSemaphore; + ASSERT_SUCCESS(cudaDevice->importExternalSemaphore(&cudaSemaphore, semaphore.get())); - commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 2, cmd, smart_refctd_ptr(m_logger)); + std::array, 2> cmd; + auto commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, cmd.size(), cmd.data(), smart_refctd_ptr(m_logger)); - stagingBufs[0] = createStaging(); - stagingBufs[1] = createStaging(); - } - - smart_refctd_ptr createExternalBuffer(IDeviceMemoryAllocation* mem) - { - IGPUBuffer::SCreationParams params = {}; - params.size = mem->getAllocationSize(); - params.usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT; - params.externalHandleTypes = mem->getCreationParams().externalHandleType; - auto buf 
= m_device->createBuffer(std::move(params)); - ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = buf.get(), .binding = {.memory = mem } }; - m_device->bindBufferMemory(1, &bindInfo); - return buf; - } - - smart_refctd_ptr createStaging(size_t sz = size) - { - auto buf = m_device->createBuffer({ {.size = sz, .usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT} }); - auto req = buf->getMemoryReqs(); - req.memoryTypeBits &= m_device->getPhysicalDevice()->getDownStreamingMemoryTypeBits(); - auto allocation = m_device->allocate(req, buf.get()); - - void* mapping = allocation.memory->map(IDeviceMemoryAllocation::MemoryRange(0, req.size), IDeviceMemoryAllocation::EMCAF_READ); - if (!mapping) - logFail("Failed to map an staging buffer"); - memset(mapping, 0, req.size); - return buf; - }; + const auto outputStagingBuffer = createStaging(BufferSize); - void launchKernel(CUfunction kernel, CUstream stream) - { - // First we record a release ownership transfer to let vulkan know that resources are going to be used in an external API { - IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { - .barrier = { - .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE, - .otherQueueFamilyIndex = IQueue::FamilyExternal, - }, - .range = {.buffer = importedBuf, }, - }; - - IGPUCommandBuffer::SImageMemoryBarrier imgBarrier = { + const IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { .barrier = { + .dep = { + .srcStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, + .srcAccessMask = ACCESS_FLAGS::MEMORY_WRITE_BITS, + }, .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE, .otherQueueFamilyIndex = IQueue::FamilyExternal, }, - .image = importedImg.get(), - .subresourceRange = { - .aspectMask = IImage::EAF_COLOR_BIT, - .levelCount = 1u, - .layerCount = 1u, - } + .range = { + .offset = 0, + .size = outputBuf->getSize(), + .buffer = outputBuf, + }, }; + // start recording bool re = 
true; re &= cmd[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - re &= cmd[0]->pipelineBarrier(EDF_NONE, { .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}, .imgBarriers = {&imgBarrier,&imgBarrier + 1} }); + re &= cmd[0]->pipelineBarrier(EDF_NONE, { + .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1} + }); re &= cmd[0]->end(); - IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = semaphore.get(), .value = 1 }; - IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[0].get()}; - IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1}, .signalSemaphores = {&signalInfo,&signalInfo + 1} }; - auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 }); + const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { + .semaphore = semaphore.get(), + .value = 1, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, + }; + const IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[0].get() }; + const IQueue::SSubmitInfo submitInfo = { + .commandBuffers = {&cmdInfo, &cmdInfo + 1}, + .signalSemaphores = {&signalInfo, &signalInfo + 1} + }; + const auto submitRe = queue->submit({ &submitInfo, &submitInfo + 1 }); re &= IQueue::RESULT::SUCCESS == submitRe; - if (!re) - logFail("Something went wrong readying resources for CUDA"); + if (!re) logFail("Something went wrong readying resources for CUDA"); } - auto& cu = cudaHandler->getCUDAFunctionTable(); // Launch kernel { + CUdeviceptr outputBufPtr; + cudaOutputMemory->getMappedBuffer(&outputBufPtr); CUdeviceptr ptrs[] = { - cudaMemories[0]->getDeviceptr(), - cudaMemories[1]->getDeviceptr(), - cudaMemories[2]->getDeviceptr(), + cudaInputMemories[0]->getDeviceptr(), + cudaInputMemories[1]->getDeviceptr(), + outputBufPtr }; - auto numEles = numElements; - void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numEles }; - ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), size, stream)); - ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], 
cpuBufs[1]->getPointer(), size, stream)); + auto numElements = &NumElements; + void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numElements }; + ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream)); + ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), BufferSize, stream)); auto semaphore = cudaSemaphore->getInternalObject(); - CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } }; + const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } }; ASSERT_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream)); // Wait for release op from vulkan - ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr)); - CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } }; - // ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream)); // Signal the imported semaphore + ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, stream, parameters, nullptr)); + const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } }; + ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream)); // Signal the imported semaphore } + ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream)); // After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA { - IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { + const IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { .barrier = { .dep = { .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, @@ -344,202 +296,260 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica .ownershipOp = 
IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE, .otherQueueFamilyIndex = IQueue::FamilyExternal, }, - .range = { .buffer = importedBuf, }, + .range = { + .offset = 0, + .size = outputBuf->getSize(), + .buffer = outputBuf, + }, }; bool re = true; re &= cmd[1]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - re &= cmd[1]->pipelineBarrier(EDF_NONE, {.bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}}); - - IGPUCommandBuffer::SBufferCopy region = { .size = size }; - re &= cmd[1]->copyBuffer(importedBuf.get(), stagingBufs[0].get(), 1, ®ion); - - IGPUCommandBuffer::SImageMemoryBarrier imgBarrier = { - .barrier = { - .dep = { - .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, - .dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS, - }, - .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE, - .otherQueueFamilyIndex = IQueue::FamilyExternal, - }, - .image = importedImg.get(), - .subresourceRange = { - .aspectMask = IImage::EAF_COLOR_BIT, - .levelCount = 1u, - .layerCount = 1u, - }, - .oldLayout = IImage::LAYOUT::PREINITIALIZED, - .newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL, + const auto region = IGPUCommandBuffer::SBufferCopy{ + .srcOffset = 0, + .dstOffset = 0, + .size = BufferSize }; - - re &= cmd[1]->pipelineBarrier(EDF_NONE, {.imgBarriers = {&imgBarrier,&imgBarrier + 1}}); - - IImage::SBufferCopy imgRegion = { - .imageSubresource = { - .aspectMask = imgBarrier.subresourceRange.aspectMask, - .layerCount = imgBarrier.subresourceRange.layerCount, - }, - .imageExtent = importedImg->getCreationParameters().extent, - }; - - re &= cmd[1]->copyImageToBuffer(importedImg.get(), imgBarrier.newLayout, stagingBufs[1].get(), 1, &imgRegion); - re &= cmd[1]->end(); + re &= cmd[1]->copyBuffer(outputBuf.get(), outputStagingBuffer.get(), 1, ®ion); + for (auto input_i = 0; input_i < InputCount; input_i++) + re &= cmd[1]->copyBuffer(vulkanInputBuffers[input_i].get(), inputStagingBuffers[input_i].get(), 1, ®ion); + cmd[1]->end(); 
- IQueue::SSubmitInfo::SSemaphoreInfo waitInfo= { .semaphore = semaphore.get(), .value = 2 }; - IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = semaphore.get(), .value = 3 }; - IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[1].get() }; - IQueue::SSubmitInfo submitInfo = { - .waitSemaphores = {&waitInfo,&waitInfo + 1}, - .commandBuffers = {&cmdInfo, &cmdInfo + 1}, - .signalSemaphores = {&signalInfo,&signalInfo + 1} + const IQueue::SSubmitInfo::SSemaphoreInfo waitInfo= { + .semaphore = semaphore.get(), + .value = 2, + .stageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, + }; + const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { + .semaphore = semaphore.get(), + .value = 3, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, }; - auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 }); + const IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[1].get() }; + const IQueue::SSubmitInfo submitInfo = { + .waitSemaphores = { &waitInfo, &waitInfo + 1 }, + .commandBuffers = { &cmdInfo, &cmdInfo + 1 }, + .signalSemaphores = { &signalInfo, &signalInfo + 1 } + }; + const auto submitRe = queue->submit({ &submitInfo, &submitInfo + 1 }); re &= IQueue::RESULT::SUCCESS == submitRe; if (!re) logFail("Something went wrong copying results from CUDA"); - } - - ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, [](void* userData) { decltype(this)(userData)->kernelCallback(); }, this)); - } - void kernelCallback() - { - // Make sure we are also done with the readback - auto wait = std::array{ISemaphore::SWaitInfo{.semaphore = semaphore.get(), .value = 2}}; - m_device->waitForSemaphores(wait, true, -1); - - float* A = reinterpret_cast(cpuBufs[0]->getPointer()); - float* B = reinterpret_cast(cpuBufs[1]->getPointer()); - - float* CBuf = reinterpret_cast(stagingBufs[0]->getBoundMemory().memory->getMappedPointer()); - float* CImg = reinterpret_cast(stagingBufs[1]->getBoundMemory().memory->getMappedPointer()); - - if(memcmp(CBuf, CImg, size)) - logFail("Buffer and 
Image memories do not match!"); - - for (auto i = 0; i < numElements; i++) - { - bool re = (abs(CBuf[i] - A[i] - B[i]) < 0.01f) && (abs(CImg[i] - A[i] - B[i]) < 0.01f); - if(!re) - logFail("Element at index %d is incorrect!", i); } - - std::cout << "Success\n"; - } - - void testInterop() - { + struct CallbackContext { - IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = { - .size = size, - .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(), - .alignmentLog2 = 10, + core::smart_refctd_ptr semaphore; + std::array, InputCount> cpuBuffers; + std::array, InputCount> inputStagingBuffers; + core::smart_refctd_ptr outputStagingBuffer; + core::smart_refctd_ptr device; + core::smart_refctd_ptr logger; + }; + + CallbackContext ctx; + ctx.semaphore = semaphore; + ctx.cpuBuffers = cpuBufs; + ctx.inputStagingBuffers = inputStagingBuffers; + ctx.outputStagingBuffer = outputStagingBuffer; + ctx.device = m_device; + ctx.logger = m_logger; + + auto cudaCallback = [](void* userData) + { + const auto* ctx = reinterpret_cast(userData); + + // Make sure we are also done with the readback + const auto wait = std::array{ + ISemaphore::SWaitInfo{ + .semaphore = ctx->semaphore.get(), + .value = 3, + } }; - - for (size_t i = 0; i < (1 << 8); ++i) + ctx->device->blockForSemaphores(wait, true); + + auto* stagingMem = ctx->outputStagingBuffer->getBoundMemory().memory; + if (!stagingMem->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) { - auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory; - assert(memory); - auto tmpBuf = createExternalBuffer(memory.get()); + ILogicalDevice::MappedMemoryRange range(stagingMem, 0, stagingMem->getAllocationSize()); + ctx->device->invalidateMappedMemoryRanges(1, &range); } - } - + + const auto* inputs1 = reinterpret_cast(ctx->cpuBuffers[0]->getPointer()); + const auto* inputs2 = 
reinterpret_cast(ctx->cpuBuffers[1]->getPointer()); + + const auto* outputs = reinterpret_cast(ctx->outputStagingBuffer->getBoundMemory().memory->getMappedPointer()); + const auto* inputsInStaging1 = reinterpret_cast(ctx->inputStagingBuffers[0]->getBoundMemory().memory->getMappedPointer()); + const auto* inputsInStaging2 = reinterpret_cast(ctx->inputStagingBuffers[1]->getBoundMemory().memory->getMappedPointer()); + + for (auto elem_i = 0; elem_i < NumElements; elem_i++) + { + const auto input1 = inputs1[elem_i]; + const auto input2 = inputs2[elem_i]; + const auto inputInStaging1 = inputsInStaging1[elem_i]; + const auto inputInStaging2 = inputsInStaging2[elem_i]; + if (inputInStaging1 != input1) + ctx->logger->log("Input1 in Staging %d is incorrect!", ILogger::ELL_ERROR, elem_i); + if (inputInStaging2 != input2) + ctx->logger->log("Input2 in Staging %d is incorrect!", ILogger::ELL_ERROR, elem_i); + + const auto output = outputs[elem_i]; + const auto expected = input1 + input2; + const auto diff = abs(output - expected); + bool re = diff < 0.01; + if (!re) + ctx->logger->log("TestSharedResources: Element at index %d is incorrect!", ILogger::ELL_ERROR, elem_i); + } + + ctx->logger->log("TestSharedResources Complete", ILogger::ELL_INFO); + }; + + ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, cudaCallback, &ctx)); + ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream)); + + ASSERT_SUCCESS(cu.pcuModuleUnload(module)); + ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream)); + } + + void testDestruction() + { + + auto commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + constexpr auto ElementCount = 1024; + constexpr auto BufferSize = ElementCount * sizeof(int); + auto& cu = cudaHandler->getCUDAFunctionTable(); smart_refctd_ptr escaped; { - IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = { - .size = size, - .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(), - .alignmentLog2 = 10, - }; - - 
auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory; - - auto tmpBuf = createExternalBuffer(memory.get()); - auto staging = createStaging(); - + core::smart_refctd_ptr cudaMemory; + ASSERT_SUCCESS(cudaDevice->createExportableMemory(&cudaMemory, { .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); + escaped = cudaMemory->exportAsMemory(m_device.get()); + if (!escaped) logFail("Fail to export CUDA memory!"); + + auto tmpBuf = createExternalBuffer(escaped.get()); + auto staging = createStaging(BufferSize); + auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer(); - for (uint32_t i = 0; i < size / 4; ++i) + for (uint32_t i = 0; i < ElementCount; ++i) ptr[i] = i; - - smart_refctd_ptr cmd; - commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd); - cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - IGPUCommandBuffer::SBufferCopy region = { .size = size }; - assert(cmd->copyBuffer(staging.get(), tmpBuf.get(), 1, ®ion)); - cmd->end(); - IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd.get() }; - IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} }; - queue->submit({ &submitInfo,&submitInfo + 1 }); - m_device->waitIdle(); - escaped = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, memory->getCreationParams().externalHandle).memory; - } - - { - constexpr size_t M = 32; - auto staging = createStaging(size * M); - - auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer(); - for (uint32_t i = 0; i < (M * size) / 4; ++i) - ptr[i] = rand(); - - std::vector> cmd(1 << 10); - commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1 << 10, cmd.data()); - - for (size_t i = 0; i < 1 << 10; ++i) - { - 
IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = { - .size = size * M, - .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(), - .alignmentLog2 = 10, - }; - RE: - auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory; - - if (!memory) - { - m_device->waitIdle(); - for (size_t j = 0; j < i; ++j) - cmd[j] = 0; - goto END; - } - assert(memory); - auto tmpBuf = createExternalBuffer(memory.get()); - - cmd[i]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - IGPUCommandBuffer::SBufferCopy region = { .size = size * M }; - assert(cmd[i]->copyBuffer(staging.get(), tmpBuf.get(), 1, ®ion)); - cmd[i]->end(); - IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[i].get() }; - IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} }; - assert(IQueue::RESULT::SUCCESS == queue->submit({ &submitInfo,&submitInfo + 1 })); - } - END: + + const auto semaphore = m_device->createSemaphore(0); + IQueue::SSubmitInfo::SSemaphoreInfo semInfo; + semInfo.semaphore = semaphore.get(); + semInfo.stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS; + semInfo.value = 1; + + smart_refctd_ptr cmdBuffer; + commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmdBuffer); + cmdBuffer->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + IGPUCommandBuffer::SBufferCopy region = { .size = BufferSize }; + assert(cmdBuffer->copyBuffer(staging.get(), tmpBuf.get(), 1, ®ion)); + cmdBuffer->end(); + IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmdBuffer.get() }; + const IQueue::SSubmitInfo submitInfo = { + .commandBuffers = {&cmdInfo, &cmdInfo + 1}, + .signalSemaphores = {&semInfo, 1} + }; + auto qre = queue->submit({ &submitInfo, &submitInfo + 1 }); + assert(IQueue::RESULT::SUCCESS == qre); m_device->waitIdle(); - } - + } + { auto tmpBuf = createExternalBuffer(escaped.get()); - auto staging = createStaging(); - + auto staging = 
createStaging(BufferSize); + + const auto semaphore = m_device->createSemaphore(0); + IQueue::SSubmitInfo::SSemaphoreInfo semInfo; + semInfo.semaphore = semaphore.get(); + semInfo.stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS; + semInfo.value = 1; + smart_refctd_ptr cmd; commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd); cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - IGPUCommandBuffer::SBufferCopy region = { .size = size }; + IGPUCommandBuffer::SBufferCopy region = { .size = BufferSize }; assert(cmd->copyBuffer(tmpBuf.get(), staging.get(), 1, ®ion)); cmd->end(); IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd.get() }; - IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} }; - auto qre = queue->submit({ &submitInfo,&submitInfo + 1 }); + const IQueue::SSubmitInfo submitInfo = { + .commandBuffers = {&cmdInfo, &cmdInfo + 1}, + .signalSemaphores = {&semInfo, 1} + }; + auto qre = queue->submit({ &submitInfo, &submitInfo + 1 }); assert(IQueue::RESULT::SUCCESS == qre); + m_device->waitIdle(); - - auto& ptr = *(std::array*)staging->getBoundMemory().memory->getMappedPointer(); - for (uint32_t i = 0; i < size / 4; ++i) - assert(ptr[i] == i); + + auto& ptr = *(std::array*)staging->getBoundMemory().memory->getMappedPointer(); + for (uint32_t i = 0; i < ElementCount; ++i) + { + if (ptr[i] != i) logFail("Test Destruction: Element %d is incorrect", i); + } + m_logger->log("Test Destruction complete", ILogger::ELL_INFO); } + // { + // constexpr size_t M = 32; + // auto staging = createStaging(size * M); + // + // auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer(); + // for (uint32_t i = 0; i < (M * size) / 4; ++i) + // ptr[i] = rand(); + // + // std::vector> cmd(1 << 10); + // commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1 << 10, cmd.data()); + // + // for (size_t i = 0; i < 1 << 10; ++i) + // { + // 
IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = { + // .size = size * M, + // .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(), + // .alignmentLog2 = 10, + // }; + // RE: + // auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory; + // + // if (!memory) + // { + // m_device->waitIdle(); + // for (size_t j = 0; j < i; ++j) + // cmd[j] = 0; + // goto END; + // } + // assert(memory); + // auto tmpBuf = createExternalBuffer(memory.get()); + // + // cmd[i]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + // IGPUCommandBuffer::SBufferCopy region = { .size = size * M }; + // assert(cmd[i]->copyBuffer(staging.get(), tmpBuf.get(), 1, ®ion)); + // cmd[i]->end(); + // IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[i].get() }; + // IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} }; + // assert(IQueue::RESULT::SUCCESS == queue->submit({ &submitInfo,&submitInfo + 1 })); + // } + // END: + // m_device->waitIdle(); + // } + + } + + void testLargeAllocations() + { + // TODO(kevin): Calculate BufferSize that is big enough to fill the machine VRAM + constexpr auto BufferSize = 1024; + IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = { + .size = BufferSize, + .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(), + .alignmentLog2 = 10, + }; + + for (size_t i = 0; i < (1 << 8); ++i) + { + auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory; + assert(memory); + auto tmpBuf = createExternalBuffer(memory.get()); + } } From 03d2ce251e39cd58057a52d6728ec73484f0216d Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 24 Apr 2026 00:52:33 +0700 Subject: [PATCH 04/47] Update to follow latest commit on main repo --- 76_CudaInterop/main.cpp | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) 
diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index 2a64f9428..2c4f819b2 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -96,6 +96,9 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica if (!cudaDevice) return logFail("Could not create a CUDA Device!"); + + queue = getComputeQueue(); + testSharedResource(); testDestruction(); testLargeAllocations(); @@ -193,7 +196,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica for (auto input_i = 0; input_i < InputCount; input_i++) { // create and allocate CUmem with CUDA and slap it inside a simple IReferenceCounted wrapper - ASSERT_SUCCESS(cudaDevice->createExportableMemory(&cudaInputMemories[input_i], { .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); + cudaInputMemories[input_i] = cudaDevice->createExportableMemory({ .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }); vulkanMemories[input_i] = cudaInputMemories[input_i]->exportAsMemory(m_device.get(), nullptr); vulkanInputBuffers[input_i] = createExternalBuffer(vulkanMemories[input_i].get()); inputStagingBuffers[input_i] = createStaging(BufferSize); @@ -205,15 +208,18 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica outputBufferParams.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; const auto outputBuf = m_device->createBuffer(std::move(outputBufferParams)); auto outputMemReq = outputBuf->getMemoryReqs(); + auto allocation = m_device->allocate(outputMemReq, outputBuf.get(), IDeviceMemoryAllocation::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE); - core::smart_refctd_ptr cudaOutputMemory; - ASSERT_SUCCESS(cudaDevice->importExternalMemory(&cudaOutputMemory, allocation.memory.get())); + const auto cudaOutputMemory = cudaDevice->importExternalMemory(core::smart_refctd_ptr(allocation.memory)); + if (!cudaOutputMemory) + logFail("Fail to 
import Vulkan Memory into CUDA!"); ISemaphore::SCreationParams semParams; semParams.externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32; auto semaphore = m_device->createSemaphore(0, std::move(semParams)); - core::smart_refctd_ptr cudaSemaphore; - ASSERT_SUCCESS(cudaDevice->importExternalSemaphore(&cudaSemaphore, semaphore.get())); + const auto cudaSemaphore = cudaDevice->importExternalSemaphore(core::smart_refctd_ptr(semaphore)); + if (!cudaSemaphore) + logFail("Fail to import Vulkan Semaphore into CUDA!"); std::array, 2> cmd; auto commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); @@ -414,15 +420,15 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica void testDestruction() { - auto commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); constexpr auto ElementCount = 1024; constexpr auto BufferSize = ElementCount * sizeof(int); auto& cu = cudaHandler->getCUDAFunctionTable(); smart_refctd_ptr escaped; { - core::smart_refctd_ptr cudaMemory; - ASSERT_SUCCESS(cudaDevice->createExportableMemory(&cudaMemory, { .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE })); + const auto cudaMemory = cudaDevice->createExportableMemory({ .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }); + if (!cudaMemory) logFail("Fail to create exportable memory!"); + escaped = cudaMemory->exportAsMemory(m_device.get()); if (!escaped) logFail("Fail to export CUDA memory!"); From 1e120e8956181d8de7931f1fd2e8bb350a046c2a Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 25 Apr 2026 17:18:45 +0700 Subject: [PATCH 05/47] Fix ex 67 due to changes in memory allocation --- 67_RayQueryGeometry/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp index 63346ac4c..2f196e140 
100644 --- a/67_RayQueryGeometry/main.cpp +++ b/67_RayQueryGeometry/main.cpp @@ -664,7 +664,7 @@ class RayQueryGeometryApp final : public SimpleWindowedApplication, public Built auto retval = device->allocate(info); // map what is mappable by default so ReBAR checks succeed if (retval.isValid() && retval.memory->isMappable()) - retval.memory->map({.offset=0,.length=info.size}); + retval.memory->map({.offset=0,.length=info.allocationSize}); return retval; } From fc00a68b3dec9f4c3ff81419ea77e5f85f5ff4ce Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 30 Apr 2026 15:03:58 +0700 Subject: [PATCH 06/47] ASSERT_SUCCESS into ASSERT_CUDA_SUCCESS --- 76_CudaInterop/main.cpp | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index 2c4f819b2..8231586d5 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -40,8 +40,7 @@ bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto lin return true; } -#define ASSERT_SUCCESS(expr) { auto re = check_cuda_err((expr), cu, m_logger, __FILE__, __LINE__); assert(re); } -#define ASSERT_SUCCESS_NV(expr, log) { auto re = check_nv_err((expr), cudaHandler, m_logger, __FILE__, __LINE__, log); assert(re); } +#define ASSERT_CUDA_SUCCESS_NV(expr, log) { auto re = check_nv_err((expr), cudaHandler, m_logger, __FILE__, __LINE__, log); assert(re); } using namespace nbl::core; @@ -156,7 +155,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica std::string log; auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(std::string((const char*)source->getPointer(), source->getSize()), "app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), 0, 0, 0, &log); - ASSERT_SUCCESS_NV(res, log); + ASSERT_CUDA_SUCCESS_NV(res, log); ptx = std::move(ptx_); } @@ -167,9 +166,9 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica CUfunction kernel; CUstream 
stream; - ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr)); - ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd")); - ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); + ASSERT_CUDA_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr), cudaHandler); + ASSERT_CUDA_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"), cudaHandler); + ASSERT_CUDA_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), cudaHandler); // CPU memory which we fill with random numbers between [-1,1] that will be copied to corresponding cudaMemory std::array, 2> cpuBufs; @@ -279,17 +278,17 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica }; auto numElements = &NumElements; void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numElements }; - ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream)); - ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), BufferSize, stream)); + ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream), cudaHandler); + ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), BufferSize, stream), cudaHandler); auto semaphore = cudaSemaphore->getInternalObject(); const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } }; - ASSERT_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream)); // Wait for release op from vulkan - ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, stream, parameters, nullptr)); + ASSERT_CUDA_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream), cudaHandler); // Wait for release op from vulkan + ASSERT_CUDA_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], 
BlockDim[2], 0, stream, parameters, nullptr), cudaHandler); const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } }; - ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream)); // Signal the imported semaphore + ASSERT_CUDA_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream), cudaHandler); // Signal the imported semaphore } - ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream)); + ASSERT_CUDA_SUCCESS(cu.pcuStreamSynchronize(stream), cudaHandler); // After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA { @@ -411,11 +410,11 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica ctx->logger->log("TestSharedResources Complete", ILogger::ELL_INFO); }; - ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, cudaCallback, &ctx)); - ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream)); + ASSERT_CUDA_SUCCESS(cu.pcuLaunchHostFunc(stream, cudaCallback, &ctx), cudaHandler); + ASSERT_CUDA_SUCCESS(cu.pcuStreamSynchronize(stream), cudaHandler); - ASSERT_SUCCESS(cu.pcuModuleUnload(module)); - ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream)); + ASSERT_CUDA_SUCCESS(cu.pcuModuleUnload(module), cudaHandler); + ASSERT_CUDA_SUCCESS(cu.pcuStreamDestroy_v2(stream), cudaHandler); } void testDestruction() From 00572257f2370be17e118f3186ea032119e186cd Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 4 May 2026 14:22:22 +0700 Subject: [PATCH 07/47] Refactor ASSERT_CUDA_SUCCESS --- 76_CudaInterop/main.cpp | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index 8231586d5..84dbac39f 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -16,19 +16,6 @@ The start of the main function starts like in most other example. We ask the user for the desired renderer and start it up. 
*/ -bool check_cuda_err(cudaError_enum err, auto& cu, auto& logger, auto file, auto line) -{ - if (auto re = err; CUDA_SUCCESS != re) - { - const char* name = 0, * str = 0; - cu.pcuGetErrorName(re, &name); - cu.pcuGetErrorString(re, &str); - logger->log("%s:%d %s:\n\t%s\n", system::ILogger::ELL_ERROR, file, line, name, str); - return false; - } - return true; -} - bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto line, std::string const& log) { if (auto re = err; NVRTC_SUCCESS != re) @@ -40,7 +27,7 @@ bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto lin return true; } -#define ASSERT_CUDA_SUCCESS_NV(expr, log) { auto re = check_nv_err((expr), cudaHandler, m_logger, __FILE__, __LINE__, log); assert(re); } +#define ASSERT_NV_SUCCESS(expr, log) { auto re = check_nv_err((expr), cudaHandler, m_logger, __FILE__, __LINE__, log); assert(re); } using namespace nbl::core; @@ -155,7 +142,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica std::string log; auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(std::string((const char*)source->getPointer(), source->getSize()), "app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), 0, 0, 0, &log); - ASSERT_CUDA_SUCCESS_NV(res, log); + ASSERT_NV_SUCCESS(res, log); ptx = std::move(ptx_); } From 82d05923f15c09f1f1de771c14b9c1b89c5ca28b Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 4 May 2026 14:22:49 +0700 Subject: [PATCH 08/47] Slight naming refactor --- 76_CudaInterop/main.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index 84dbac39f..5fd8151bf 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -85,7 +85,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica queue = getComputeQueue(); - testSharedResource(); + testVectorAddKernel(); testDestruction(); testLargeAllocations(); @@ -120,7 +120,7 
@@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica return buf; }; - void testSharedResource() + void testVectorAddKernel() { static constexpr uint32_t GridDim[3] = { 4096,1,1 }; static constexpr uint32_t BlockDim[3] = { 1024,1,1 }; @@ -389,8 +389,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica const auto output = outputs[elem_i]; const auto expected = input1 + input2; const auto diff = abs(output - expected); - bool re = diff < 0.01; - if (!re) + if (diff < 0.01) ctx->logger->log("TestSharedResources: Element at index %d is incorrect!", ILogger::ELL_ERROR, elem_i); } From a229db2993e35af7b09c8fd5393b8e16d7ff6435 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 4 May 2026 14:24:35 +0700 Subject: [PATCH 09/47] Remove unused commented code --- 76_CudaInterop/main.cpp | 43 ----------------------------------------- 1 file changed, 43 deletions(-) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index 5fd8151bf..dfd214384 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -480,49 +480,6 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica m_logger->log("Test Destruction complete", ILogger::ELL_INFO); } - // { - // constexpr size_t M = 32; - // auto staging = createStaging(size * M); - // - // auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer(); - // for (uint32_t i = 0; i < (M * size) / 4; ++i) - // ptr[i] = rand(); - // - // std::vector> cmd(1 << 10); - // commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1 << 10, cmd.data()); - // - // for (size_t i = 0; i < 1 << 10; ++i) - // { - // IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = { - // .size = size * M, - // .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(), - // .alignmentLog2 = 10, - // }; - // RE: - // auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, 
CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory; - // - // if (!memory) - // { - // m_device->waitIdle(); - // for (size_t j = 0; j < i; ++j) - // cmd[j] = 0; - // goto END; - // } - // assert(memory); - // auto tmpBuf = createExternalBuffer(memory.get()); - // - // cmd[i]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - // IGPUCommandBuffer::SBufferCopy region = { .size = size * M }; - // assert(cmd[i]->copyBuffer(staging.get(), tmpBuf.get(), 1, &region)); - // cmd[i]->end(); - // IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[i].get() }; - // IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} }; - // assert(IQueue::RESULT::SUCCESS == queue->submit({ &submitInfo,&submitInfo + 1 })); - // } - // END: - // m_device->waitIdle(); - // } - } void testLargeAllocations() From feac63dc5b968a6351381a05a1632019b8d19749 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 6 May 2026 09:28:11 +0200 Subject: [PATCH 10/47] Build CUDA interop example through extension target --- 76_CudaInterop/CMakeLists.txt | 8 +++++++- 76_CudaInterop/main.cpp | 6 ++---- CMakeLists.txt | 6 ++++-- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/76_CudaInterop/CMakeLists.txt b/76_CudaInterop/CMakeLists.txt index bc1624875..c904da699 100644 --- a/76_CudaInterop/CMakeLists.txt +++ b/76_CudaInterop/CMakeLists.txt @@ -5,6 +5,12 @@ endif() nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") +if(NOT TARGET Nabla::ext::CUDAInterop) + message(FATAL_ERROR "76_CudaInterop requires the CUDA interop extension target") +endif() + +target_link_libraries(${EXECUTABLE_NAME} PRIVATE Nabla::ext::CUDAInterop) + if(NBL_EMBED_BUILTIN_RESOURCES) set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) set(RESOURCE_DIR "app_resources") @@ -21,4 +27,4 @@ if(NBL_EMBED_BUILTIN_RESOURCES) ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" 
"nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) -endif() \ No newline at end of file +endif() diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index dfd214384..9108e08f4 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -1,6 +1,4 @@ -#include "nbl/video/CCUDAHandler.h" -// #include "nbl/video/CCUDAExportableMemory.h" -// #include "nbl/video/CCUDAImportedSemaphore.h" +#include "nbl/ext/CUDAInterop/CUDAInterop.h" #include "nbl/application_templates/MonoDeviceApplication.hpp" #include "nbl/examples/common/BuiltinResourcesApplication.hpp" @@ -508,4 +506,4 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica void workLoopBody() override {} }; -NBL_MAIN_FUNC(CUDA2VKApp) \ No newline at end of file +NBL_MAIN_FUNC(CUDA2VKApp) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c7990c06..0715f1064 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -111,7 +111,9 @@ if(NBL_BUILD_EXAMPLES) endif() add_subdirectory(74_QuantizedSequenceTests) - add_subdirectory(76_CudaInterop) + if (NBL_COMPILE_WITH_CUDA) + add_subdirectory(76_CudaInterop) + endif() # add new examples *before* NBL_GET_ALL_TARGETS invocation, it gathers recursively all targets created so far in this subdirectory NBL_GET_ALL_TARGETS(TARGETS) @@ -137,4 +139,4 @@ if(NBL_BUILD_EXAMPLES) endforeach() NBL_ADJUST_FOLDERS(examples) -endif() \ No newline at end of file +endif() From 6f136a224c516182d8d7883407a04be529063a9e Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 6 May 2026 11:42:32 +0200 Subject: [PATCH 11/47] Simplify CUDA interop example link --- 76_CudaInterop/CMakeLists.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/76_CudaInterop/CMakeLists.txt b/76_CudaInterop/CMakeLists.txt index c904da699..8eb08f70b 100644 --- a/76_CudaInterop/CMakeLists.txt +++ b/76_CudaInterop/CMakeLists.txt @@ -5,10 +5,6 @@ endif() 
nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") -if(NOT TARGET Nabla::ext::CUDAInterop) - message(FATAL_ERROR "76_CudaInterop requires the CUDA interop extension target") -endif() - target_link_libraries(${EXECUTABLE_NAME} PRIVATE Nabla::ext::CUDAInterop) if(NBL_EMBED_BUILTIN_RESOURCES) From b17beb26f27a9cb8347a25fd5587c7cc9310d589 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 6 May 2026 16:16:29 +0200 Subject: [PATCH 12/47] Use CUDA interop native target --- 76_CudaInterop/CMakeLists.txt | 2 +- 76_CudaInterop/main.cpp | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/76_CudaInterop/CMakeLists.txt b/76_CudaInterop/CMakeLists.txt index 8eb08f70b..bd4f1914b 100644 --- a/76_CudaInterop/CMakeLists.txt +++ b/76_CudaInterop/CMakeLists.txt @@ -5,7 +5,7 @@ endif() nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") -target_link_libraries(${EXECUTABLE_NAME} PRIVATE Nabla::ext::CUDAInterop) +target_link_libraries(${EXECUTABLE_NAME} PRIVATE Nabla::ext::CUDAInteropNative) if(NBL_EMBED_BUILTIN_RESOURCES) set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index 9108e08f4..becdfbe50 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -1,4 +1,4 @@ -#include "nbl/ext/CUDAInterop/CUDAInterop.h" +#include "nbl/ext/CUDAInterop/CUDAInteropNative.h" #include "nbl/application_templates/MonoDeviceApplication.hpp" #include "nbl/examples/common/BuiltinResourcesApplication.hpp" @@ -16,9 +16,9 @@ user for the desired renderer and start it up. 
bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto line, std::string const& log) { - if (auto re = err; NVRTC_SUCCESS != re) + if (auto re = err; NVRTC_SUCCESS != re) { - const char* str = cudaHandler->getNVRTCFunctionTable().pnvrtcGetErrorString(re); + const char* str = cuda_native::getNVRTCFunctionTable(*cudaHandler).pnvrtcGetErrorString(re); logger->log("%s:%d %s\n%s\n", system::ILogger::ELL_ERROR, file, line, str, log.c_str()); return false; } @@ -59,7 +59,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica device_base_t::filterDevices(physicalDevices); auto& cuDevices = cudaHandler->getAvailableDevices(); std::erase_if(physicalDevices, [&cuDevices](auto pdev) { - return cuDevices.end() == std::find_if(cuDevices.begin(), cuDevices.end(), [pdev](auto& cuDev) { return !memcmp(pdev->getProperties().deviceUUID, &cuDev.uuid, 16); }); + return cuDevices.end() == std::find_if(cuDevices.begin(), cuDevices.end(), [pdev](auto& cuDev) { return !memcmp(pdev->getProperties().deviceUUID, cuDev.uuid.data(), 16); }); }); } @@ -138,14 +138,14 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica smart_refctd_ptr source = IAsset::castDown(assets[0]); std::string log; - auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(std::string((const char*)source->getPointer(), source->getSize()), + auto [ptx_, res] = cuda_native::compileDirectlyToPTX(*cudaHandler, std::string((const char*)source->getPointer(), source->getSize()), "app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), 0, 0, 0, &log); ASSERT_NV_SUCCESS(res, log); ptx = std::move(ptx_); } - auto& cu = cudaHandler->getCUDAFunctionTable(); + auto& cu = cuda_native::getCUDAFunctionTable(*cudaHandler); CUmodule module; CUfunction kernel; @@ -187,7 +187,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica } IGPUBuffer::SCreationParams outputBufferParams; - outputBufferParams.size = 
cudaDevice->roundToGranularity(CU_MEM_LOCATION_TYPE_DEVICE, BufferSize); + outputBufferParams.size = cudaDevice->roundToGranularity(ECUDAMemoryLocation::DEVICE, BufferSize); outputBufferParams.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT; outputBufferParams.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; const auto outputBuf = m_device->createBuffer(std::move(outputBufferParams)); @@ -255,10 +255,10 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica // Launch kernel { CUdeviceptr outputBufPtr; - cudaOutputMemory->getMappedBuffer(&outputBufPtr); + cuda_native::getMappedBuffer(*cudaOutputMemory, &outputBufPtr); CUdeviceptr ptrs[] = { - cudaInputMemories[0]->getDeviceptr(), - cudaInputMemories[1]->getDeviceptr(), + cuda_native::getDeviceptr(*cudaInputMemories[0]), + cuda_native::getDeviceptr(*cudaInputMemories[1]), outputBufPtr }; auto numElements = &NumElements; @@ -266,7 +266,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream), cudaHandler); ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), BufferSize, stream), cudaHandler); - auto semaphore = cudaSemaphore->getInternalObject(); + auto semaphore = cuda_native::getInternalObject(*cudaSemaphore); const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } }; ASSERT_CUDA_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream), cudaHandler); // Wait for release op from vulkan ASSERT_CUDA_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, stream, parameters, nullptr), cudaHandler); @@ -406,7 +406,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica auto commandPool = m_device->createCommandPool(queue->getFamilyIndex(), 
IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); constexpr auto ElementCount = 1024; constexpr auto BufferSize = ElementCount * sizeof(int); - auto& cu = cudaHandler->getCUDAFunctionTable(); + auto& cu = cuda_native::getCUDAFunctionTable(*cudaHandler); smart_refctd_ptr escaped; { const auto cudaMemory = cudaDevice->createExportableMemory({ .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }); From fd50fda4952096febc8ab9df94e441d55e54e7bf Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 6 May 2026 16:39:53 +0200 Subject: [PATCH 13/47] Use native CUDA accessors --- 76_CudaInterop/main.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index becdfbe50..289b0c0b1 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -8,6 +8,7 @@ using namespace core; using namespace system; using namespace asset; using namespace video; +namespace cuda = nbl::video::cuda_native; /* The start of the main function starts like in most other example. 
We ask the @@ -18,7 +19,7 @@ bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto lin { if (auto re = err; NVRTC_SUCCESS != re) { - const char* str = cuda_native::getNVRTCFunctionTable(*cudaHandler).pnvrtcGetErrorString(re); + const char* str = cuda::getNVRTCFunctionTable(cudaHandler).pnvrtcGetErrorString(re); logger->log("%s:%d %s\n%s\n", system::ILogger::ELL_ERROR, file, line, str, log.c_str()); return false; } @@ -138,14 +139,14 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica smart_refctd_ptr source = IAsset::castDown(assets[0]); std::string log; - auto [ptx_, res] = cuda_native::compileDirectlyToPTX(*cudaHandler, std::string((const char*)source->getPointer(), source->getSize()), + auto [ptx_, res] = cuda::compileDirectlyToPTX(cudaHandler, std::string((const char*)source->getPointer(), source->getSize()), "app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), 0, 0, 0, &log); ASSERT_NV_SUCCESS(res, log); ptx = std::move(ptx_); } - auto& cu = cuda_native::getCUDAFunctionTable(*cudaHandler); + auto& cu = cuda::getCUDAFunctionTable(cudaHandler); CUmodule module; CUfunction kernel; @@ -255,10 +256,10 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica // Launch kernel { CUdeviceptr outputBufPtr; - cuda_native::getMappedBuffer(*cudaOutputMemory, &outputBufPtr); + cuda::getMappedBuffer(cudaOutputMemory, &outputBufPtr); CUdeviceptr ptrs[] = { - cuda_native::getDeviceptr(*cudaInputMemories[0]), - cuda_native::getDeviceptr(*cudaInputMemories[1]), + cuda::getDeviceptr(cudaInputMemories[0]), + cuda::getDeviceptr(cudaInputMemories[1]), outputBufPtr }; auto numElements = &NumElements; @@ -266,7 +267,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream), cudaHandler); ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], 
cpuBufs[1]->getPointer(), BufferSize, stream), cudaHandler); - auto semaphore = cuda_native::getInternalObject(*cudaSemaphore); + auto semaphore = cuda::getInternalObject(cudaSemaphore); const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } }; ASSERT_CUDA_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream), cudaHandler); // Wait for release op from vulkan ASSERT_CUDA_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, stream, parameters, nullptr), cudaHandler); @@ -406,7 +407,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica auto commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); constexpr auto ElementCount = 1024; constexpr auto BufferSize = ElementCount * sizeof(int); - auto& cu = cuda_native::getCUDAFunctionTable(*cudaHandler); + auto& cu = cuda::getCUDAFunctionTable(cudaHandler); smart_refctd_ptr escaped; { const auto cudaMemory = cudaDevice->createExportableMemory({ .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }); From 24525f0ee735f19d92a688fe85be84667b79af66 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 6 May 2026 17:40:31 +0200 Subject: [PATCH 14/47] Use CUDA interop target --- 76_CudaInterop/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/76_CudaInterop/CMakeLists.txt b/76_CudaInterop/CMakeLists.txt index bd4f1914b..8eb08f70b 100644 --- a/76_CudaInterop/CMakeLists.txt +++ b/76_CudaInterop/CMakeLists.txt @@ -5,7 +5,7 @@ endif() nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") -target_link_libraries(${EXECUTABLE_NAME} PRIVATE Nabla::ext::CUDAInteropNative) +target_link_libraries(${EXECUTABLE_NAME} PRIVATE Nabla::ext::CUDAInterop) if(NBL_EMBED_BUILTIN_RESOURCES) set(_BR_TARGET_ 
${EXECUTABLE_NAME}_builtinResourceData) From 4671898c61f7b00de9e3e88d039b199e3b16cc0b Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 7 May 2026 09:24:56 +0200 Subject: [PATCH 15/47] Use CUDA native interop helper --- 76_CudaInterop/main.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index 289b0c0b1..64616a6b7 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -181,14 +181,15 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica for (auto input_i = 0; input_i < InputCount; input_i++) { // create and allocate CUmem with CUDA and slap it inside a simple IReferenceCounted wrapper - cudaInputMemories[input_i] = cudaDevice->createExportableMemory({ .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }); + cudaInputMemories[input_i] = cuda_native::createExportableMemory(cudaDevice, { .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }); + assert(cudaInputMemories[input_i]); vulkanMemories[input_i] = cudaInputMemories[input_i]->exportAsMemory(m_device.get(), nullptr); vulkanInputBuffers[input_i] = createExternalBuffer(vulkanMemories[input_i].get()); inputStagingBuffers[input_i] = createStaging(BufferSize); } IGPUBuffer::SCreationParams outputBufferParams; - outputBufferParams.size = cudaDevice->roundToGranularity(ECUDAMemoryLocation::DEVICE, BufferSize); + outputBufferParams.size = cuda_native::roundToGranularity(cudaDevice, CU_MEM_LOCATION_TYPE_DEVICE, BufferSize); outputBufferParams.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT; outputBufferParams.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; const auto outputBuf = m_device->createBuffer(std::move(outputBufferParams)); @@ -410,9 +411,8 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica auto& cu = 
cuda::getCUDAFunctionTable(cudaHandler); smart_refctd_ptr escaped; { - const auto cudaMemory = cudaDevice->createExportableMemory({ .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }); - if (!cudaMemory) logFail("Fail to create exportable memory!"); - + core::smart_refctd_ptr cudaMemory = cuda_native::createExportableMemory(cudaDevice, { .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }); + assert(cudaMemory); escaped = cudaMemory->exportAsMemory(m_device.get()); if (!escaped) logFail("Fail to export CUDA memory!"); From acdcfc8e0feb29a81a274b951c010c7b95f07230 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 7 May 2026 11:58:06 +0200 Subject: [PATCH 16/47] Use CUDA interop helper in example --- 76_CudaInterop/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/76_CudaInterop/CMakeLists.txt b/76_CudaInterop/CMakeLists.txt index 8eb08f70b..de9f9d6b8 100644 --- a/76_CudaInterop/CMakeLists.txt +++ b/76_CudaInterop/CMakeLists.txt @@ -5,7 +5,7 @@ endif() nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") -target_link_libraries(${EXECUTABLE_NAME} PRIVATE Nabla::ext::CUDAInterop) +nbl_target_link_cuda_interop(${EXECUTABLE_NAME} PRIVATE) if(NBL_EMBED_BUILTIN_RESOURCES) set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) From d5aa23b9648f5854830101a2bda722a402775238 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 7 May 2026 16:05:25 +0200 Subject: [PATCH 17/47] Use CUDA interop accessors --- 76_CudaInterop/main.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index 64616a6b7..6f7fee94c 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -19,7 +19,7 @@ bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto lin { if (auto re = err; NVRTC_SUCCESS != re) { - const 
char* str = cuda::getNVRTCFunctionTable(cudaHandler).pnvrtcGetErrorString(re); + const char* str = cuda::CCUDAHandlerAccessor::getNVRTCFunctionTable(*cudaHandler).pnvrtcGetErrorString(re); logger->log("%s:%d %s\n%s\n", system::ILogger::ELL_ERROR, file, line, str, log.c_str()); return false; } @@ -139,14 +139,14 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica smart_refctd_ptr source = IAsset::castDown(assets[0]); std::string log; - auto [ptx_, res] = cuda::compileDirectlyToPTX(cudaHandler, std::string((const char*)source->getPointer(), source->getSize()), + auto [ptx_, res] = cuda::CCUDAHandlerAccessor::compileDirectlyToPTX(*cudaHandler, std::string((const char*)source->getPointer(), source->getSize()), "app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), 0, 0, 0, &log); ASSERT_NV_SUCCESS(res, log); ptx = std::move(ptx_); } - auto& cu = cuda::getCUDAFunctionTable(cudaHandler); + auto& cu = cuda::CCUDAHandlerAccessor::getCUDAFunctionTable(*cudaHandler); CUmodule module; CUfunction kernel; @@ -181,7 +181,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica for (auto input_i = 0; input_i < InputCount; input_i++) { // create and allocate CUmem with CUDA and slap it inside a simple IReferenceCounted wrapper - cudaInputMemories[input_i] = cuda_native::createExportableMemory(cudaDevice, { .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }); + cudaInputMemories[input_i] = cuda_native::CCUDADeviceAccessor::createExportableMemory(*cudaDevice, { .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }); assert(cudaInputMemories[input_i]); vulkanMemories[input_i] = cudaInputMemories[input_i]->exportAsMemory(m_device.get(), nullptr); vulkanInputBuffers[input_i] = createExternalBuffer(vulkanMemories[input_i].get()); @@ -189,7 +189,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica } 
IGPUBuffer::SCreationParams outputBufferParams; - outputBufferParams.size = cuda_native::roundToGranularity(cudaDevice, CU_MEM_LOCATION_TYPE_DEVICE, BufferSize); + outputBufferParams.size = cuda_native::CCUDADeviceAccessor::roundToGranularity(*cudaDevice, CU_MEM_LOCATION_TYPE_DEVICE, BufferSize); outputBufferParams.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT; outputBufferParams.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; const auto outputBuf = m_device->createBuffer(std::move(outputBufferParams)); @@ -257,10 +257,10 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica // Launch kernel { CUdeviceptr outputBufPtr; - cuda::getMappedBuffer(cudaOutputMemory, &outputBufPtr); + cuda::CCUDAImportedMemoryAccessor::getMappedBuffer(*cudaOutputMemory, &outputBufPtr); CUdeviceptr ptrs[] = { - cuda::getDeviceptr(cudaInputMemories[0]), - cuda::getDeviceptr(cudaInputMemories[1]), + cuda::CCUDAExportableMemoryAccessor::getDeviceptr(*cudaInputMemories[0]), + cuda::CCUDAExportableMemoryAccessor::getDeviceptr(*cudaInputMemories[1]), outputBufPtr }; auto numElements = &NumElements; @@ -268,7 +268,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream), cudaHandler); ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), BufferSize, stream), cudaHandler); - auto semaphore = cuda::getInternalObject(cudaSemaphore); + auto semaphore = cuda::CCUDAImportedSemaphoreAccessor::getInternalObject(*cudaSemaphore); const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } }; ASSERT_CUDA_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream), cudaHandler); // Wait for release op from vulkan ASSERT_CUDA_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], 
BlockDim[2], 0, stream, parameters, nullptr), cudaHandler); @@ -408,10 +408,10 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica auto commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); constexpr auto ElementCount = 1024; constexpr auto BufferSize = ElementCount * sizeof(int); - auto& cu = cuda::getCUDAFunctionTable(cudaHandler); + auto& cu = cuda::CCUDAHandlerAccessor::getCUDAFunctionTable(*cudaHandler); smart_refctd_ptr escaped; { - core::smart_refctd_ptr cudaMemory = cuda_native::createExportableMemory(cudaDevice, { .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }); + core::smart_refctd_ptr cudaMemory = cuda_native::CCUDADeviceAccessor::createExportableMemory(*cudaDevice, { .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }); assert(cudaMemory); escaped = cudaMemory->exportAsMemory(m_device.get()); if (!escaped) logFail("Fail to export CUDA memory!"); From 5031a249c5cd892190e74aca69ed15c6144575c0 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 7 May 2026 16:35:27 +0200 Subject: [PATCH 18/47] Use explicit CUDA compile log --- 76_CudaInterop/main.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index 6f7fee94c..b4dffcd31 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -139,11 +139,11 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica smart_refctd_ptr source = IAsset::castDown(assets[0]); std::string log; - auto [ptx_, res] = cuda::CCUDAHandlerAccessor::compileDirectlyToPTX(*cudaHandler, std::string((const char*)source->getPointer(), source->getSize()), - "app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), 0, 0, 0, &log); - ASSERT_NV_SUCCESS(res, log); + auto compile = 
cuda::CCUDAHandlerAccessor::compileDirectlyToPTX(*cudaHandler, std::string((const char*)source->getPointer(), source->getSize()), + "app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), log, 0, 0, 0); + ASSERT_NV_SUCCESS(compile.result, log); - ptx = std::move(ptx_); + ptx = std::move(compile.ptx); } auto& cu = cuda::CCUDAHandlerAccessor::getCUDAFunctionTable(*cudaHandler); From 7b5817a6d45c62a70fbe617022b6026a83939ff5 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 7 May 2026 18:11:24 +0200 Subject: [PATCH 19/47] Fix CUDA interop example assert helper --- 76_CudaInterop/main.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index b4dffcd31..f528dc561 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -27,6 +27,7 @@ bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto lin } #define ASSERT_NV_SUCCESS(expr, log) { auto re = check_nv_err((expr), cudaHandler, m_logger, __FILE__, __LINE__, log); assert(re); } +#define ASSERT_CUDA_SUCCESS(expr, handler) { auto re = cuda::CCUDAHandlerAccessor::defaultHandleResult(*(handler), (expr)); assert(re); } using namespace nbl::core; From 2d415af102ebf710ea2bb369b3f0eca5544652f7 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Fri, 8 May 2026 17:06:48 +0200 Subject: [PATCH 20/47] Use opaque CUDA interop handles --- 76_CudaInterop/main.cpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index f528dc561..38c336da0 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -19,7 +19,7 @@ bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto lin { if (auto re = err; NVRTC_SUCCESS != re) { - const char* str = cuda::CCUDAHandlerAccessor::getNVRTCFunctionTable(*cudaHandler).pnvrtcGetErrorString(re); + const char* str = 
cuda::getNVRTCFunctionTable(*cudaHandler).pnvrtcGetErrorString(re); logger->log("%s:%d %s\n%s\n", system::ILogger::ELL_ERROR, file, line, str, log.c_str()); return false; } @@ -27,7 +27,7 @@ bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto lin } #define ASSERT_NV_SUCCESS(expr, log) { auto re = check_nv_err((expr), cudaHandler, m_logger, __FILE__, __LINE__, log); assert(re); } -#define ASSERT_CUDA_SUCCESS(expr, handler) { auto re = cuda::CCUDAHandlerAccessor::defaultHandleResult(*(handler), (expr)); assert(re); } +#define ASSERT_CUDA_SUCCESS(expr, handler) { auto re = cuda::defaultHandleResult(*(handler), (expr)); assert(re); } using namespace nbl::core; @@ -140,14 +140,14 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica smart_refctd_ptr source = IAsset::castDown(assets[0]); std::string log; - auto compile = cuda::CCUDAHandlerAccessor::compileDirectlyToPTX(*cudaHandler, std::string((const char*)source->getPointer(), source->getSize()), + auto compile = cuda::compileDirectlyToPTX(*cudaHandler, std::string((const char*)source->getPointer(), source->getSize()), "app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), log, 0, 0, 0); ASSERT_NV_SUCCESS(compile.result, log); ptx = std::move(compile.ptx); } - auto& cu = cuda::CCUDAHandlerAccessor::getCUDAFunctionTable(*cudaHandler); + auto& cu = cuda::getCUDAFunctionTable(*cudaHandler); CUmodule module; CUfunction kernel; @@ -182,7 +182,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica for (auto input_i = 0; input_i < InputCount; input_i++) { // create and allocate CUmem with CUDA and slap it inside a simple IReferenceCounted wrapper - cudaInputMemories[input_i] = cuda_native::CCUDADeviceAccessor::createExportableMemory(*cudaDevice, { .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }); + cudaInputMemories[input_i] = cuda::createExportableMemory(*cudaDevice, { .size = 
BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }); assert(cudaInputMemories[input_i]); vulkanMemories[input_i] = cudaInputMemories[input_i]->exportAsMemory(m_device.get(), nullptr); vulkanInputBuffers[input_i] = createExternalBuffer(vulkanMemories[input_i].get()); @@ -190,7 +190,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica } IGPUBuffer::SCreationParams outputBufferParams; - outputBufferParams.size = cuda_native::CCUDADeviceAccessor::roundToGranularity(*cudaDevice, CU_MEM_LOCATION_TYPE_DEVICE, BufferSize); + outputBufferParams.size = cuda::roundToGranularity(*cudaDevice, CU_MEM_LOCATION_TYPE_DEVICE, BufferSize); outputBufferParams.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT; outputBufferParams.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; const auto outputBuf = m_device->createBuffer(std::move(outputBufferParams)); @@ -257,11 +257,11 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica // Launch kernel { - CUdeviceptr outputBufPtr; - cuda::CCUDAImportedMemoryAccessor::getMappedBuffer(*cudaOutputMemory, &outputBufPtr); + cuda::SCUdeviceptr outputBufPtr; + cudaOutputMemory->getMappedBuffer(outputBufPtr.opaque()); CUdeviceptr ptrs[] = { - cuda::CCUDAExportableMemoryAccessor::getDeviceptr(*cudaInputMemories[0]), - cuda::CCUDAExportableMemoryAccessor::getDeviceptr(*cudaInputMemories[1]), + cuda::SCUdeviceptr(cudaInputMemories[0]->getDeviceptr()), + cuda::SCUdeviceptr(cudaInputMemories[1]->getDeviceptr()), outputBufPtr }; auto numElements = &NumElements; @@ -269,7 +269,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream), cudaHandler); ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), BufferSize, stream), cudaHandler); - auto semaphore = 
cuda::CCUDAImportedSemaphoreAccessor::getInternalObject(*cudaSemaphore); + CUexternalSemaphore semaphore = cuda::SCUexternalSemaphore(cudaSemaphore->getInternalObject()); const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } }; ASSERT_CUDA_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream), cudaHandler); // Wait for release op from vulkan ASSERT_CUDA_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, stream, parameters, nullptr), cudaHandler); @@ -409,10 +409,10 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica auto commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); constexpr auto ElementCount = 1024; constexpr auto BufferSize = ElementCount * sizeof(int); - auto& cu = cuda::CCUDAHandlerAccessor::getCUDAFunctionTable(*cudaHandler); + auto& cu = cuda::getCUDAFunctionTable(*cudaHandler); smart_refctd_ptr escaped; { - core::smart_refctd_ptr cudaMemory = cuda_native::CCUDADeviceAccessor::createExportableMemory(*cudaDevice, { .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }); + core::smart_refctd_ptr cudaMemory = cuda::createExportableMemory(*cudaDevice, { .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }); assert(cudaMemory); escaped = cudaMemory->exportAsMemory(m_device.get()); if (!escaped) logFail("Fail to export CUDA memory!"); From e289ee14f5b8f05004726e6f03c81a9a2e768219 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Sat, 9 May 2026 13:21:43 +0200 Subject: [PATCH 21/47] Use opaque CUDA interop calls --- 76_CudaInterop/main.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index 38c336da0..ec9d8b25f 100644 --- a/76_CudaInterop/main.cpp +++ 
b/76_CudaInterop/main.cpp @@ -19,7 +19,7 @@ bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto lin { if (auto re = err; NVRTC_SUCCESS != re) { - const char* str = cuda::getNVRTCFunctionTable(*cudaHandler).pnvrtcGetErrorString(re); + const char* str = cudaHandler->getNVRTCFunctionTable().pnvrtcGetErrorString(re); logger->log("%s:%d %s\n%s\n", system::ILogger::ELL_ERROR, file, line, str, log.c_str()); return false; } @@ -147,7 +147,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica ptx = std::move(compile.ptx); } - auto& cu = cuda::getCUDAFunctionTable(*cudaHandler); + auto& cu = cudaHandler->getCUDAFunctionTable(); CUmodule module; CUfunction kernel; @@ -182,7 +182,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica for (auto input_i = 0; input_i < InputCount; input_i++) { // create and allocate CUmem with CUDA and slap it inside a simple IReferenceCounted wrapper - cudaInputMemories[input_i] = cuda::createExportableMemory(*cudaDevice, { .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }); + cudaInputMemories[input_i] = cudaDevice->createExportableMemory({ .size = BufferSize, .alignment = sizeof(float), .locationType = CU_MEM_LOCATION_TYPE_DEVICE }); assert(cudaInputMemories[input_i]); vulkanMemories[input_i] = cudaInputMemories[input_i]->exportAsMemory(m_device.get(), nullptr); vulkanInputBuffers[input_i] = createExternalBuffer(vulkanMemories[input_i].get()); @@ -190,7 +190,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica } IGPUBuffer::SCreationParams outputBufferParams; - outputBufferParams.size = cuda::roundToGranularity(*cudaDevice, CU_MEM_LOCATION_TYPE_DEVICE, BufferSize); + outputBufferParams.size = cudaDevice->roundToGranularity(CU_MEM_LOCATION_TYPE_DEVICE, BufferSize); outputBufferParams.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT; 
outputBufferParams.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; const auto outputBuf = m_device->createBuffer(std::move(outputBufferParams)); @@ -409,10 +409,10 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica auto commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); constexpr auto ElementCount = 1024; constexpr auto BufferSize = ElementCount * sizeof(int); - auto& cu = cuda::getCUDAFunctionTable(*cudaHandler); + auto& cu = cudaHandler->getCUDAFunctionTable(); smart_refctd_ptr escaped; { - core::smart_refctd_ptr cudaMemory = cuda::createExportableMemory(*cudaDevice, { .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }); + core::smart_refctd_ptr cudaMemory = cudaDevice->createExportableMemory({ .size = BufferSize, .alignment = sizeof(float), .locationType = CU_MEM_LOCATION_TYPE_DEVICE }); assert(cudaMemory); escaped = cudaMemory->exportAsMemory(m_device.get()); if (!escaped) logFail("Fail to export CUDA memory!"); From b4601fc685176d6729b095a9637c1662d0a29503 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Sat, 9 May 2026 16:53:35 +0200 Subject: [PATCH 22/47] Use native CUDA interop conversion --- 76_CudaInterop/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index ec9d8b25f..263e3dcce 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -258,7 +258,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica // Launch kernel { cuda::SCUdeviceptr outputBufPtr; - cudaOutputMemory->getMappedBuffer(outputBufPtr.opaque()); + cudaOutputMemory->getMappedBuffer(outputBufPtr); CUdeviceptr ptrs[] = { cuda::SCUdeviceptr(cudaInputMemories[0]->getDeviceptr()), cuda::SCUdeviceptr(cudaInputMemories[1]->getDeviceptr()), From d373d313d3e70579d650c7804af8a2785cfede9a Mon Sep 17 00:00:00 2001 
From: Arkadiusz Lachowicz Date: Sun, 10 May 2026 10:19:11 +0200 Subject: [PATCH 23/47] Fix CUDA interop smoke validation --- 76_CudaInterop/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index 263e3dcce..d66688710 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -390,7 +390,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica const auto output = outputs[elem_i]; const auto expected = input1 + input2; const auto diff = abs(output - expected); - if (diff < 0.01) + if (diff >= 0.01f) ctx->logger->log("TestSharedResources: Element at index %d is incorrect!", ILogger::ELL_ERROR, elem_i); } From a6268bc9953b8d8a795b3b2eee8dbd897b05706e Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Sun, 10 May 2026 15:36:54 +0200 Subject: [PATCH 24/47] Use CUDA interop assert helper --- 76_CudaInterop/main.cpp | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index d66688710..f090a4500 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -27,7 +27,6 @@ bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto lin } #define ASSERT_NV_SUCCESS(expr, log) { auto re = check_nv_err((expr), cudaHandler, m_logger, __FILE__, __LINE__, log); assert(re); } -#define ASSERT_CUDA_SUCCESS(expr, handler) { auto re = cuda::defaultHandleResult(*(handler), (expr)); assert(re); } using namespace nbl::core; @@ -153,9 +152,9 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica CUfunction kernel; CUstream stream; - ASSERT_CUDA_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr), cudaHandler); - ASSERT_CUDA_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"), cudaHandler); - ASSERT_CUDA_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), cudaHandler); + 
NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr), *cudaHandler); + NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"), *cudaHandler); + NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), *cudaHandler); // CPU memory which we fill with random numbers between [-1,1] that will be copied to corresponding cudaMemory std::array, 2> cpuBufs; @@ -266,17 +265,17 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica }; auto numElements = &NumElements; void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numElements }; - ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream), cudaHandler); - ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), BufferSize, stream), cudaHandler); + NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream), *cudaHandler); + NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), BufferSize, stream), *cudaHandler); CUexternalSemaphore semaphore = cuda::SCUexternalSemaphore(cudaSemaphore->getInternalObject()); const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } }; - ASSERT_CUDA_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream), cudaHandler); // Wait for release op from vulkan - ASSERT_CUDA_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, stream, parameters, nullptr), cudaHandler); + NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream), *cudaHandler); // Wait for release op from vulkan + NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, stream, parameters, nullptr), 
*cudaHandler); const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } }; - ASSERT_CUDA_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream), cudaHandler); // Signal the imported semaphore + NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream), *cudaHandler); // Signal the imported semaphore } - ASSERT_CUDA_SUCCESS(cu.pcuStreamSynchronize(stream), cudaHandler); + NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream), *cudaHandler); // After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA { @@ -397,11 +396,11 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica ctx->logger->log("TestSharedResources Complete", ILogger::ELL_INFO); }; - ASSERT_CUDA_SUCCESS(cu.pcuLaunchHostFunc(stream, cudaCallback, &ctx), cudaHandler); - ASSERT_CUDA_SUCCESS(cu.pcuStreamSynchronize(stream), cudaHandler); + NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, cudaCallback, &ctx), *cudaHandler); + NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream), *cudaHandler); - ASSERT_CUDA_SUCCESS(cu.pcuModuleUnload(module), cudaHandler); - ASSERT_CUDA_SUCCESS(cu.pcuStreamDestroy_v2(stream), cudaHandler); + NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuModuleUnload(module), *cudaHandler); + NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream), *cudaHandler); } void testDestruction() From eb8f44a1b5ef38d1416a6fdc9a43e8e0215ec0bf Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Sun, 10 May 2026 19:38:51 +0200 Subject: [PATCH 25/47] Use native CUDA interop handles in EX76 --- 76_CudaInterop/main.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index f090a4500..fd05e4b79 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -8,7 +8,6 @@ using 
namespace core; using namespace system; using namespace asset; using namespace video; -namespace cuda = nbl::video::cuda_native; /* The start of the main function starts like in most other example. We ask the @@ -139,7 +138,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica smart_refctd_ptr source = IAsset::castDown(assets[0]); std::string log; - auto compile = cuda::compileDirectlyToPTX(*cudaHandler, std::string((const char*)source->getPointer(), source->getSize()), + auto compile = cudaHandler->compileDirectlyToPTX(std::string((const char*)source->getPointer(), source->getSize()), "app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), log, 0, 0, 0); ASSERT_NV_SUCCESS(compile.result, log); @@ -256,11 +255,11 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica // Launch kernel { - cuda::SCUdeviceptr outputBufPtr; + CUdeviceptr outputBufPtr = 0; cudaOutputMemory->getMappedBuffer(outputBufPtr); CUdeviceptr ptrs[] = { - cuda::SCUdeviceptr(cudaInputMemories[0]->getDeviceptr()), - cuda::SCUdeviceptr(cudaInputMemories[1]->getDeviceptr()), + cudaInputMemories[0]->getDeviceptr(), + cudaInputMemories[1]->getDeviceptr(), outputBufPtr }; auto numElements = &NumElements; @@ -268,7 +267,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream), *cudaHandler); NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), BufferSize, stream), *cudaHandler); - CUexternalSemaphore semaphore = cuda::SCUexternalSemaphore(cudaSemaphore->getInternalObject()); + CUexternalSemaphore semaphore = cudaSemaphore->getInternalObject(); const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } }; NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream), 
*cudaHandler); // Wait for release op from vulkan NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, stream, parameters, nullptr), *cudaHandler); From 39441760d335467158a340ad366302235ba6c30e Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Sun, 10 May 2026 19:56:53 +0200 Subject: [PATCH 26/47] Pass CUDA handler pointer to assert macro --- 76_CudaInterop/main.cpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index fd05e4b79..3026bf451 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -151,9 +151,9 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica CUfunction kernel; CUstream stream; - NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr), *cudaHandler); - NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"), *cudaHandler); - NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), *cudaHandler); + NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr), cudaHandler); + NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"), cudaHandler); + NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), cudaHandler); // CPU memory which we fill with random numbers between [-1,1] that will be copied to corresponding cudaMemory std::array, 2> cpuBufs; @@ -264,17 +264,17 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica }; auto numElements = &NumElements; void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numElements }; - NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream), *cudaHandler); - 
NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), BufferSize, stream), *cudaHandler); + NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream), cudaHandler); + NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), BufferSize, stream), cudaHandler); CUexternalSemaphore semaphore = cudaSemaphore->getInternalObject(); const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } }; - NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream), *cudaHandler); // Wait for release op from vulkan - NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, stream, parameters, nullptr), *cudaHandler); + NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream), cudaHandler); // Wait for release op from vulkan + NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, stream, parameters, nullptr), cudaHandler); const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } }; - NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream), *cudaHandler); // Signal the imported semaphore + NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream), cudaHandler); // Signal the imported semaphore } - NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream), *cudaHandler); + NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream), cudaHandler); // After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA { @@ -395,11 +395,11 @@ class CUDA2VKApp : public virtual 
MonoDeviceApplication, BuiltinResourcesApplica ctx->logger->log("TestSharedResources Complete", ILogger::ELL_INFO); }; - NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, cudaCallback, &ctx), *cudaHandler); - NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream), *cudaHandler); + NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, cudaCallback, &ctx), cudaHandler); + NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream), cudaHandler); - NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuModuleUnload(module), *cudaHandler); - NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream), *cudaHandler); + NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuModuleUnload(module), cudaHandler); + NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream), cudaHandler); } void testDestruction() From b4a8725d54ca960e0d2c353ef08d5f40aa4c4e04 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 11 May 2026 09:26:33 +0200 Subject: [PATCH 27/47] Polish CUDA interop example usage --- 76_CudaInterop/main.cpp | 72 +++++++++++++++++++++++++++++++++-------- 1 file changed, 58 insertions(+), 14 deletions(-) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index 3026bf451..e2e326102 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -25,6 +25,7 @@ bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto lin return true; } +#define ASSERT_SUCCESS(expr) NBL_CUDA_INTEROP_ASSERT_SUCCESS((expr), cudaHandler) #define ASSERT_NV_SUCCESS(expr, log) { auto re = check_nv_err((expr), cudaHandler, m_logger, __FILE__, __LINE__, log); assert(re); } @@ -139,7 +140,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica smart_refctd_ptr source = IAsset::castDown(assets[0]); std::string log; auto compile = cudaHandler->compileDirectlyToPTX(std::string((const char*)source->getPointer(), source->getSize()), - "app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), log, 0, 0, 
0); + "app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), &log, 0, 0, 0); ASSERT_NV_SUCCESS(compile.result, log); ptx = std::move(compile.ptx); @@ -151,9 +152,9 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica CUfunction kernel; CUstream stream; - NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr), cudaHandler); - NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"), cudaHandler); - NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), cudaHandler); + ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr)); + ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd")); + ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); // CPU memory which we fill with random numbers between [-1,1] that will be copied to corresponding cudaMemory std::array, 2> cpuBufs; @@ -264,17 +265,17 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica }; auto numElements = &NumElements; void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numElements }; - NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream), cudaHandler); - NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), BufferSize, stream), cudaHandler); + ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream)); + ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), BufferSize, stream)); CUexternalSemaphore semaphore = cudaSemaphore->getInternalObject(); const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } }; - NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream), cudaHandler); // Wait for release op from vulkan - 
NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, stream, parameters, nullptr), cudaHandler); + ASSERT_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream)); // Wait for release op from vulkan + ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, stream, parameters, nullptr)); const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } }; - NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream), cudaHandler); // Signal the imported semaphore + ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream)); // Signal the imported semaphore } - NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream), cudaHandler); + ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream)); // After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA { @@ -395,11 +396,11 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica ctx->logger->log("TestSharedResources Complete", ILogger::ELL_INFO); }; - NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, cudaCallback, &ctx), cudaHandler); - NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream), cudaHandler); + ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, cudaCallback, &ctx)); + ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream)); - NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuModuleUnload(module), cudaHandler); - NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream), cudaHandler); + ASSERT_SUCCESS(cu.pcuModuleUnload(module)); + ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream)); } void testDestruction() @@ -478,6 +479,49 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica m_logger->log("Test Destruction 
complete", ILogger::ELL_INFO); } + // { + // constexpr size_t M = 32; + // auto staging = createStaging(size * M); + // + // auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer(); + // for (uint32_t i = 0; i < (M * size) / 4; ++i) + // ptr[i] = rand(); + // + // std::vector> cmd(1 << 10); + // commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1 << 10, cmd.data()); + // + // for (size_t i = 0; i < 1 << 10; ++i) + // { + // IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = { + // .size = size * M, + // .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(), + // .alignmentLog2 = 10, + // }; + // RE: + // auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory; + // + // if (!memory) + // { + // m_device->waitIdle(); + // for (size_t j = 0; j < i; ++j) + // cmd[j] = 0; + // goto END; + // } + // assert(memory); + // auto tmpBuf = createExternalBuffer(memory.get()); + // + // cmd[i]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + // IGPUCommandBuffer::SBufferCopy region = { .size = size * M }; + // assert(cmd[i]->copyBuffer(staging.get(), tmpBuf.get(), 1, ®ion)); + // cmd[i]->end(); + // IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[i].get() }; + // IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} }; + // assert(IQueue::RESULT::SUCCESS == queue->submit({ &submitInfo,&submitInfo + 1 })); + // } + // END: + // m_device->waitIdle(); + // } + } void testLargeAllocations() From 39d02e26023c72a7d3241e5df85e9b7c4afacb84 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 11 May 2026 15:17:11 +0200 Subject: [PATCH 28/47] Fix path tracer allocation size access --- 40_PathTracer/src/renderer/CRenderer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/40_PathTracer/src/renderer/CRenderer.cpp b/40_PathTracer/src/renderer/CRenderer.cpp index 
aa0a456ff..798cdb987 100644 --- a/40_PathTracer/src/renderer/CRenderer.cpp +++ b/40_PathTracer/src/renderer/CRenderer.cpp @@ -553,7 +553,7 @@ core::smart_refctd_ptr CRenderer::createScene(CScene::SCreationParams&& auto retval = device->allocate(info); // map what is mappable by default so ReBAR checks succeed if (retval.isValid() && retval.memory->isMappable()) - retval.memory->map({.offset=0,.length=info.size}); + retval.memory->map({.offset=0,.length=info.allocationSize}); return retval; } @@ -896,4 +896,4 @@ IQueue::SSubmitInfo::SSemaphoreInfo CRenderer::SSubmit::operator()(std::span Date: Tue, 12 May 2026 17:06:28 +0700 Subject: [PATCH 29/47] Initial implementation of testWmmaGemm test --- .../app_resources/wmmaGemm_b1_kernel.cu | 53 ++ .../app_resources/wmmaGemm_kernel.cu | 107 +++ 76_CudaInterop/main.cpp | 654 ++++++++++++++++-- 3 files changed, 749 insertions(+), 65 deletions(-) create mode 100644 76_CudaInterop/app_resources/wmmaGemm_b1_kernel.cu create mode 100644 76_CudaInterop/app_resources/wmmaGemm_kernel.cu diff --git a/76_CudaInterop/app_resources/wmmaGemm_b1_kernel.cu b/76_CudaInterop/app_resources/wmmaGemm_b1_kernel.cu new file mode 100644 index 000000000..56d376fae --- /dev/null +++ b/76_CudaInterop/app_resources/wmmaGemm_b1_kernel.cu @@ -0,0 +1,53 @@ +#include +#include + +using namespace nvcuda; + +// Define WMMA parameters +const int WMMA_M = 8; +const int WMMA_N = 8; +const int WMMA_K = 128; + +extern "C" __global__ void b1_wmma_gemm_kernel(int* a, int* b, int* c, + int M, int N, int K) { + // Leading dimensions + int lda = K; + int ldb = K; + int ldc = N; + + // Tile indices + int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / 32; + int warpN = (blockIdx.y * blockDim.y + threadIdx.y); + + // Fragments + wmma::fragment a_frag; + wmma::fragment b_frag; + wmma::fragment acc_frag; + + // Initialize accumulator with zeros + wmma::fill_fragment(acc_frag, 0); + + // Loop over the K-dimension + for (int i = 0; i < K; i += WMMA_K) { + int aRow = 
warpM * WMMA_M; + int aCol = i / 32; // Indexing uint32_t + + int bRow = i / 32; + int bCol = warpN * WMMA_N; + + // Load fragments + // Note: load_matrix_sync handles the bit-packing layout internally + wmma::load_matrix_sync(a_frag, a + (aRow * lda / 32 + aCol), lda); + wmma::load_matrix_sync(b_frag, b + (bCol * ldb / 32 + bRow), ldb); + + // Perform XOR-Popcount MMA + wmma::bmma_sync(acc_frag, a_frag, b_frag, acc_frag, wmma::experimental::bmmaBitOpAND); + } + + // Store the result + int cRow = warpM * WMMA_M; + int cCol = warpN * WMMA_N; + int* outputLoc = c + (cRow * ldc + cCol); + wmma::store_matrix_sync(outputLoc, acc_frag, ldc, wmma::mem_row_major); + +} diff --git a/76_CudaInterop/app_resources/wmmaGemm_kernel.cu b/76_CudaInterop/app_resources/wmmaGemm_kernel.cu new file mode 100644 index 000000000..523590e8c --- /dev/null +++ b/76_CudaInterop/app_resources/wmmaGemm_kernel.cu @@ -0,0 +1,107 @@ +/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * CUDA Kernel Device code + * + * Computes the vector addition of A and B into C. The 3 vectors have the same + * number of elements numElements. + */ + + +// GPU configuration. + +#define WARP_SIZE 32 + +// MMA matrix tile dimensions. + +#define M 16 +#define N 16 +#define K 16 + +#define WMMA_M 16 +#define WMMA_N 16 +#define WMMA_K 16 + +#include +#include + +using namespace nvcuda; + +extern "C" __global__ void wmmaGemm(half *a, half *b, float *c, float *d, int m_ld, int n_ld, int k_ld, float alpha, float beta) +{ + // Leading dimensions. Packed with no transpositions. 
+ int lda = k_ld; + int ldb = k_ld; + int ldc = n_ld; + + // Tile using a 2D grid + int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / warpSize; + int warpN = (blockIdx.y * blockDim.y + threadIdx.y); + + // Declare the fragments + wmma::fragment a_frag; + wmma::fragment b_frag; + wmma::fragment acc_frag; + wmma::fragment c_frag; + + wmma::fill_fragment(acc_frag, 0.0f); + + // Loop over k + for (int i = 0; i < k_ld; i += WMMA_K) { + int aCol = i; + int aRow = warpM * WMMA_M; + int bCol = warpN * WMMA_N; + int bRow = i; + + // Bounds checking + if (aRow < m_ld && aCol < k_ld && bRow < k_ld && bCol < n_ld) { + // Load the inputs + wmma::load_matrix_sync(a_frag, a + aCol + aRow * lda, lda); + wmma::load_matrix_sync(b_frag, b + bRow + bCol * ldb, ldb); + + // Perform the matrix multiplication + wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag); + } + } + + // Load in the current value of c, scale it by beta, and add this our result + // scaled by alpha + int cCol = warpN * WMMA_N; + int cRow = warpM * WMMA_M; + + if (cRow < m_ld && cCol < n_ld) { + wmma::load_matrix_sync(c_frag, c + cCol + cRow * ldc, ldc, wmma::mem_row_major); + + for (int i = 0; i < c_frag.num_elements; i++) { + c_frag.x[i] = alpha * acc_frag.x[i] + beta * c_frag.x[i]; + } + + // Store the output + wmma::store_matrix_sync(d + cCol + cRow * ldc, c_frag, ldc, wmma::mem_row_major); + } +} diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index dfd214384..11a8768bf 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -11,23 +11,26 @@ using namespace system; using namespace asset; using namespace video; +#define WARP_SIZE 32 + + /* The start of the main function starts like in most other example. We ask the user for the desired renderer and start it up. 
*/ -bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto line, std::string const& log) +bool check_nv_err(auto err, auto& m_cuHandler, auto& logger, auto file, auto line, std::string const& log) { if (auto re = err; NVRTC_SUCCESS != re) { - const char* str = cudaHandler->getNVRTCFunctionTable().pnvrtcGetErrorString(re); + const char* str = m_cuHandler->getNVRTCFunctionTable().pnvrtcGetErrorString(re); logger->log("%s:%d %s\n%s\n", system::ILogger::ELL_ERROR, file, line, str, log.c_str()); return false; } return true; } -#define ASSERT_NV_SUCCESS(expr, log) { auto re = check_nv_err((expr), cudaHandler, m_logger, __FILE__, __LINE__, log); assert(re); } +#define ASSERT_NV_SUCCESS(expr, log) { auto re = check_nv_err((expr), m_cuHandler, m_logger, __FILE__, __LINE__, log); assert(re); } using namespace nbl::core; @@ -49,8 +52,9 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica CUDA2VKApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} - smart_refctd_ptr cudaHandler; - smart_refctd_ptr cudaDevice; + smart_refctd_ptr m_cuHandler; + smart_refctd_ptr m_cuDevice; + smart_refctd_ptr m_utils; IQueue* queue; @@ -59,7 +63,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica virtual void filterDevices(core::set& physicalDevices) const { device_base_t::filterDevices(physicalDevices); - auto& cuDevices = cudaHandler->getAvailableDevices(); + auto& cuDevices = m_cuHandler->getAvailableDevices(); std::erase_if(physicalDevices, [&cuDevices](auto pdev) { return cuDevices.end() == std::find_if(cuDevices.begin(), cuDevices.end(), [pdev](auto& cuDev) { return !memcmp(pdev->getProperties().deviceUUID, &cuDev.uuid, 16); }); }); @@ -71,23 +75,29 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica if 
(!asset_base_t::onAppInitialized(smart_refctd_ptr(system))) return false; - cudaHandler = CCUDAHandler::create(m_system.get(), smart_refctd_ptr(m_logger)); - if (!cudaHandler) + m_cuHandler = CCUDAHandler::create(m_system.get(), smart_refctd_ptr(m_logger)); + if (!m_cuHandler) return logFail("Could not create a CUDA handler!"); if (!device_base_t::onAppInitialized(std::move(system))) return false; - cudaDevice = cudaHandler->createDevice(smart_refctd_ptr_dynamic_cast(m_api), m_physicalDevice); - if (!cudaDevice) + m_utils = IUtilities::create(core::smart_refctd_ptr(m_device), core::smart_refctd_ptr(m_logger)); + if (!m_utils) + return logFail("Could not create IUtilities!"); + + m_cuDevice = m_cuHandler->createDevice(smart_refctd_ptr_dynamic_cast(m_api), m_physicalDevice); + if (!m_cuDevice) return logFail("Could not create a CUDA Device!"); queue = getComputeQueue(); - testVectorAddKernel(); - testDestruction(); - testLargeAllocations(); + testWmmaGemB1(); + // testWmmaGemm(); + // testVectorAddKernel(); + // testDestruction(); + // testLargeAllocations(); return true; } @@ -120,42 +130,60 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica return buf; }; + smart_refctd_ptr compilePtx(const char* filepath) + { + IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = m_logger.get(); + lp.workingDirectory = ""; // virtual root + // this time we load a shader directly from a file + auto assetBundle = m_assetMgr->getAsset(filepath, lp); + const auto assets = assetBundle.getContents(); + if (assets.empty()) + logFail("Could not load kernel!"); + + smart_refctd_ptr source = IAsset::castDown(assets[0]); + std::string log; + auto [ptx, res] = m_cuHandler->compileDirectlyToPTX(std::string((const char*)source->getPointer(), source->getSize()), + filepath, m_cuDevice->geDefaultCompileOptions(), 0, 0, 0, &log); + ASSERT_NV_SUCCESS(res, log); + + return ptx; + } + + std::tuple, smart_refctd_ptr> createSharedBuffer(uint32_t size) + { + 
IGPUBuffer::SCreationParams vkBufferParams; + vkBufferParams.size = m_cuDevice->roundToGranularity(CU_MEM_LOCATION_TYPE_DEVICE, size); + vkBufferParams.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT; + vkBufferParams.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; + const auto outputBuf = m_device->createBuffer(std::move(vkBufferParams)); + auto outputMemReq = outputBuf->getMemoryReqs(); + + auto allocation = m_device->allocate(outputMemReq, outputBuf.get(), IDeviceMemoryAllocation::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE); + const auto cudaOutputMemory = m_cuDevice->importExternalMemory(core::smart_refctd_ptr(allocation.memory)); + if (!cudaOutputMemory) + logFail("Fail to import Vulkan Memory into CUDA!"); + + return std::tuple(std::move(outputBuf), std::move(cudaOutputMemory)); + } + void testVectorAddKernel() { static constexpr uint32_t GridDim[3] = { 4096,1,1 }; - static constexpr uint32_t BlockDim[3] = { 1024,1,1 }; + static constexpr uint32_t BlockDim[3] = { 1,1,1 }; static constexpr size_t NumElements = GridDim[0] * BlockDim[0]; static constexpr size_t BufferSize = sizeof(float) * NumElements; - smart_refctd_ptr ptx; - { - IAssetLoader::SAssetLoadParams lp = {}; - lp.logger = m_logger.get(); - lp.workingDirectory = ""; // virtual root - // this time we load a shader directly from a file - auto assetBundle = m_assetMgr->getAsset("app_resources/vectorAdd_kernel.cu", lp); - const auto assets = assetBundle.getContents(); - if (assets.empty()) - logFail("Could not load kernel!"); - - smart_refctd_ptr source = IAsset::castDown(assets[0]); - std::string log; - auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(std::string((const char*)source->getPointer(), source->getSize()), - "app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), 0, 0, 0, &log); - ASSERT_NV_SUCCESS(res, log); - - ptx = std::move(ptx_); - } - - auto& cu = cudaHandler->getCUDAFunctionTable(); + const auto ptx 
= compilePtx("app_resources/vectorAdd_kernel.cu"); + auto& cu = m_cuHandler->getCUDAFunctionTable(); CUmodule module; CUfunction kernel; CUstream stream; - ASSERT_CUDA_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr), cudaHandler); - ASSERT_CUDA_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"), cudaHandler); - ASSERT_CUDA_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), cudaHandler); + ASSERT_CUDA_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr), m_cuHandler); + ASSERT_CUDA_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"), m_cuHandler); + ASSERT_CUDA_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), m_cuHandler); // CPU memory which we fill with random numbers between [-1,1] that will be copied to corresponding cudaMemory std::array, 2> cpuBufs; @@ -182,28 +210,18 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica for (auto input_i = 0; input_i < InputCount; input_i++) { // create and allocate CUmem with CUDA and slap it inside a simple IReferenceCounted wrapper - cudaInputMemories[input_i] = cudaDevice->createExportableMemory({ .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }); + cudaInputMemories[input_i] = m_cuDevice->createExportableMemory({ .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }); vulkanMemories[input_i] = cudaInputMemories[input_i]->exportAsMemory(m_device.get(), nullptr); vulkanInputBuffers[input_i] = createExternalBuffer(vulkanMemories[input_i].get()); inputStagingBuffers[input_i] = createStaging(BufferSize); } - IGPUBuffer::SCreationParams outputBufferParams; - outputBufferParams.size = cudaDevice->roundToGranularity(CU_MEM_LOCATION_TYPE_DEVICE, BufferSize); - outputBufferParams.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT; - outputBufferParams.externalHandleTypes = 
CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; - const auto outputBuf = m_device->createBuffer(std::move(outputBufferParams)); - auto outputMemReq = outputBuf->getMemoryReqs(); - - auto allocation = m_device->allocate(outputMemReq, outputBuf.get(), IDeviceMemoryAllocation::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE); - const auto cudaOutputMemory = cudaDevice->importExternalMemory(core::smart_refctd_ptr(allocation.memory)); - if (!cudaOutputMemory) - logFail("Fail to import Vulkan Memory into CUDA!"); + auto [outputBuf, cudaOutputMemory] = createSharedBuffer(BufferSize); ISemaphore::SCreationParams semParams; semParams.externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32; auto semaphore = m_device->createSemaphore(0, std::move(semParams)); - const auto cudaSemaphore = cudaDevice->importExternalSemaphore(core::smart_refctd_ptr(semaphore)); + const auto cudaSemaphore = m_cuDevice->importExternalSemaphore(core::smart_refctd_ptr(semaphore)); if (!cudaSemaphore) logFail("Fail to import Vulkan Semaphore into CUDA!"); @@ -265,17 +283,16 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica }; auto numElements = &NumElements; void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numElements }; - ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream), cudaHandler); - ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), BufferSize, stream), cudaHandler); + ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream), m_cuHandler); + ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), BufferSize, stream), m_cuHandler); auto semaphore = cudaSemaphore->getInternalObject(); const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } }; - ASSERT_CUDA_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream), cudaHandler); // Wait for release op from vulkan - 
ASSERT_CUDA_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, stream, parameters, nullptr), cudaHandler); + ASSERT_CUDA_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream), m_cuHandler); // Wait for release op from vulkan + ASSERT_CUDA_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, stream, parameters, nullptr), m_cuHandler); const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } }; - ASSERT_CUDA_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream), cudaHandler); // Signal the imported semaphore + ASSERT_CUDA_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream), m_cuHandler); // Signal the imported semaphore } - ASSERT_CUDA_SUCCESS(cu.pcuStreamSynchronize(stream), cudaHandler); // After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA { @@ -389,18 +406,526 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica const auto output = outputs[elem_i]; const auto expected = input1 + input2; const auto diff = abs(output - expected); - if (diff < 0.01) + if (diff > 0.01) ctx->logger->log("TestSharedResources: Element at index %d is incorrect!", ILogger::ELL_ERROR, elem_i); } ctx->logger->log("TestSharedResources Complete", ILogger::ELL_INFO); }; - ASSERT_CUDA_SUCCESS(cu.pcuLaunchHostFunc(stream, cudaCallback, &ctx), cudaHandler); - ASSERT_CUDA_SUCCESS(cu.pcuStreamSynchronize(stream), cudaHandler); + ASSERT_CUDA_SUCCESS(cu.pcuLaunchHostFunc(stream, cudaCallback, &ctx), m_cuHandler); + ASSERT_CUDA_SUCCESS(cu.pcuStreamSynchronize(stream), m_cuHandler); - ASSERT_CUDA_SUCCESS(cu.pcuModuleUnload(module), cudaHandler); - ASSERT_CUDA_SUCCESS(cu.pcuStreamDestroy_v2(stream), cudaHandler); + 
ASSERT_CUDA_SUCCESS(cu.pcuModuleUnload(module), m_cuHandler); + ASSERT_CUDA_SUCCESS(cu.pcuStreamDestroy_v2(stream), m_cuHandler); + } + + void testWmmaGemm() + { + // x = M, y = N, z = K + constexpr auto WmmaSize = uint32_t3{ 16, 16, 16 }; + constexpr auto TileCount = uint32_t3{ 64, 64, 64 }; + constexpr auto ElementCount = WmmaSize * TileCount; + constexpr auto BlockDim = uint32_t2{ 128, 4 }; + // TODO(kevin): Check if this calculation of GridDim correct. Currently we only handle square matrix. So, it doesn't matter + constexpr auto GridDim = uint32_t2(ElementCount.x / BlockDim.x, ElementCount.y / BlockDim.y); + const float Alpha = 1.1f; + const float Beta = 1.2f; + + const auto ptx = compilePtx("app_resources/wmmaGemm_kernel.cu"); + auto& cu = m_cuHandler->getCUDAFunctionTable(); + + CUmodule module; + CUfunction kernel; + CUstream stream; + + ASSERT_CUDA_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr), m_cuHandler); + ASSERT_CUDA_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "wmmaGemm"), m_cuHandler); + ASSERT_CUDA_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), m_cuHandler); + + const auto elementsPerBlock = uint32_t2{ (WmmaSize.x * BlockDim.x) / WARP_SIZE, (WmmaSize.y * BlockDim.y) }; + uint32_t2 gridDim = { + ElementCount.x + (elementsPerBlock.x - 1) / elementsPerBlock.x, + ElementCount.y + (elementsPerBlock.y - 1) / elementsPerBlock.y + }; + + + auto [vkBufferMatA, cuMemMatA] = createSharedBuffer(sizeof(half) * ElementCount.x * ElementCount.z); + auto [vkBufferMatB, cuMemMatB] = createSharedBuffer(sizeof(half) * ElementCount.z * ElementCount.y); + auto [vkBufferMatC, cuMemMatC] = createSharedBuffer(sizeof(float) * ElementCount.x * ElementCount.y); + auto [vkBufferMatD, cuMemMatD] = createSharedBuffer(sizeof(float) * ElementCount.x * ElementCount.y); + + core::vector cpuMatA(ElementCount.x * ElementCount.z), cpuMatB(ElementCount.z * ElementCount.y); + core::vector cpuMatC(ElementCount.x * ElementCount.y); + + 
auto initCpuMatrix = [ElementCount](half* a, half* b, float* c) + { + for (int i = 0; i < ElementCount.x; i++) { + for (int j = 0; j < ElementCount.z; j++) { + a[i * ElementCount.z + j] = (half)(rand() % 3); + } + } + + for (int i = 0; i < ElementCount.y; i++) { + for (int j = 0; j < ElementCount.z; j++) { + b[i * ElementCount.x + j] = (half)(rand() % 3); + } + } + + for (int t = 0; t < ElementCount.x * ElementCount.y; t++) { + c[t] = static_cast(rand() % 3); + } + }; + initCpuMatrix(cpuMatA.data(), cpuMatB.data(), cpuMatC.data()); + + + ISemaphore::SCreationParams semParams; + semParams.externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32; + auto semaphore = m_device->createSemaphore(0, std::move(semParams)); + const auto cudaSemaphore = m_cuDevice->importExternalSemaphore(core::smart_refctd_ptr(semaphore)); + if (!cudaSemaphore) + logFail("Fail to import Vulkan Semaphore into CUDA!"); + + std::array, 2> cmd; + auto commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, cmd.size(), cmd.data(), smart_refctd_ptr(m_logger)); + + const auto outputStagingBuffer = createStaging(vkBufferMatD->getSize()); + + // First we record a release ownership transfer to let vulkan know that resources are going to be used in an external API + { + const IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { + .barrier = { + .dep = { + .srcStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, + .srcAccessMask = ACCESS_FLAGS::MEMORY_WRITE_BITS, + }, + .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE, + .otherQueueFamilyIndex = IQueue::FamilyExternal, + }, + .range = { + .offset = 0, + .size = vkBufferMatD->getSize(), + .buffer = vkBufferMatD, + }, + }; + + // start recording + bool re = true; + re &= cmd[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + re &= cmd[0]->pipelineBarrier(EDF_NONE, 
{ + .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1} + }); + re &= cmd[0]->end(); + + const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { + .semaphore = semaphore.get(), + .value = 1, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, + }; + const IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[0].get() }; + const IQueue::SSubmitInfo submitInfo = { + .commandBuffers = {&cmdInfo, &cmdInfo + 1}, + .signalSemaphores = {&signalInfo, &signalInfo + 1} + }; + const auto submitRe = queue->submit({ &submitInfo, &submitInfo + 1 }); + re &= IQueue::RESULT::SUCCESS == submitRe; + if (!re) logFail("Something went wrong readying resources for CUDA"); + } + + // Launch kernel + { + CUdeviceptr matrixAPtr, matrixBPtr, matrixCPtr, matrixDPtr; + cuMemMatA->getMappedBuffer(&matrixAPtr); + cuMemMatB->getMappedBuffer(&matrixBPtr); + cuMemMatC->getMappedBuffer(&matrixCPtr); + cuMemMatD->getMappedBuffer(&matrixDPtr); + CUdeviceptr ptrs[] = { + matrixAPtr, + matrixBPtr, + matrixCPtr, + matrixDPtr, + }; + ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(matrixAPtr, cpuMatA.data(), cpuMatA.size() * sizeof(half), stream), m_cuHandler); + ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(matrixBPtr, cpuMatB.data(), cpuMatB.size() * sizeof(half), stream), m_cuHandler); + ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(matrixCPtr, cpuMatC.data(), cpuMatC.size() * sizeof(float), stream), m_cuHandler); + + int m_ld = ElementCount.x; + int n_ld = ElementCount.y; + int k_ld = ElementCount.z; + float alpha = Alpha; + float beta = Beta; + void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &ptrs[3], &m_ld, &n_ld, &k_ld, &alpha, &beta }; + + auto semaphore = cudaSemaphore->getInternalObject(); + const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } }; + ASSERT_CUDA_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream), m_cuHandler); // Wait for release op from vulkan + ASSERT_CUDA_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim.x, 
GridDim.y, 1, BlockDim.x, BlockDim.y, 1, 0, stream, parameters, nullptr), m_cuHandler); + const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } }; + ASSERT_CUDA_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream), m_cuHandler); // Signal the imported semaphore + } + + // After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA + { + const IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { + .barrier = { + .dep = { + .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, + .dstAccessMask = ACCESS_FLAGS::TRANSFER_READ_BIT, + }, + .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE, + .otherQueueFamilyIndex = IQueue::FamilyExternal, + }, + .range = { + .offset = 0, + .size = vkBufferMatD->getSize(), + .buffer = vkBufferMatD, + }, + }; + bool re = true; + re &= cmd[1]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + re &= cmd[1]->pipelineBarrier(EDF_NONE, + { + .bufBarriers = std::span{ &bufBarrier, &bufBarrier + 1 } + }); + const auto region = IGPUCommandBuffer::SBufferCopy{ + .srcOffset = 0, + .dstOffset = 0, + .size = vkBufferMatD->getSize() + }; + re &= cmd[1]->copyBuffer(vkBufferMatD.get(), outputStagingBuffer.get(), 1, ®ion); + re &= cmd[1]->end(); + + const IQueue::SSubmitInfo::SSemaphoreInfo waitInfo= { + .semaphore = semaphore.get(), + .value = 2, + .stageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, + }; + const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { + .semaphore = semaphore.get(), + .value = 3, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, + }; + const IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[1].get() }; + const IQueue::SSubmitInfo submitInfo = { + .waitSemaphores = { &waitInfo, &waitInfo + 1 }, + .commandBuffers = { &cmdInfo, &cmdInfo + 1 }, + .signalSemaphores = { &signalInfo, &signalInfo + 1 } + }; + const auto submitRe = queue->submit({ 
&submitInfo, &submitInfo + 1 }); + re &= IQueue::RESULT::SUCCESS == submitRe; + if (!re) + logFail("Something went wrong copying results from CUDA"); + } + + auto matMultiplyOnHost = [&]( + const half* A, + const half* B, + float* C) + { + const auto numARows = ElementCount.x; + const auto numAColumns = ElementCount.z; + const auto numBRows = ElementCount.z; + const auto numBColumns = ElementCount.y; + const auto numCRows = ElementCount.x; + const auto numCColumns = ElementCount.y; + for (int i = 0; i < numCRows; i++) { + for (int j = 0; j < numCColumns; j++) { + float temp = 0.0; + + for (int k = 0; k < numAColumns; k++) { + temp += (float)A[i * numAColumns + k] * (float)B[j * numBRows + k]; + } + + C[i * numCColumns + j] = temp * Alpha + Beta * C[i * numCColumns + j]; + } + } + }; + matMultiplyOnHost(cpuMatA.data(), cpuMatB.data(), cpuMatC.data()); + + struct CallbackContext + { + core::smart_refctd_ptr semaphore; + core::smart_refctd_ptr outputStagingBuffer; + core::smart_refctd_ptr device; + core::smart_refctd_ptr logger; + const float* expectedOutput; + }; + + CallbackContext ctx; + ctx.semaphore = semaphore; + ctx.outputStagingBuffer = outputStagingBuffer; + ctx.device = m_device; + ctx.logger = m_logger; + ctx.expectedOutput = cpuMatC.data(); + + auto cudaCallback = [](void* userData) + { + const auto* ctx = reinterpret_cast(userData); + + // Make sure we are also done with the readback + const auto wait = std::array{ + ISemaphore::SWaitInfo{ + .semaphore = ctx->semaphore.get(), + .value = 3, + } + }; + ctx->device->blockForSemaphores(wait, true); + + auto* stagingMem = ctx->outputStagingBuffer->getBoundMemory().memory; + if (!stagingMem->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + { + ILogicalDevice::MappedMemoryRange range(stagingMem, 0, stagingMem->getAllocationSize()); + ctx->device->invalidateMappedMemoryRanges(1, &range); + } + + + const auto* outputs = 
reinterpret_cast(ctx->outputStagingBuffer->getBoundMemory().memory->getMappedPointer()); + + for (auto elem_i = 0; elem_i < ElementCount.x * ElementCount.y; elem_i++) + { + const auto output = outputs[elem_i]; + const auto diff = abs(output - ctx->expectedOutput[elem_i]); + if (diff > 0.01) + ctx->logger->log("TestSharedResources: Element at index %d is incorrect!", ILogger::ELL_ERROR, elem_i); + } + + ctx->logger->log("Test Wmma Gemm Complete", ILogger::ELL_INFO); + }; + + ASSERT_CUDA_SUCCESS(cu.pcuLaunchHostFunc(stream, cudaCallback, &ctx), m_cuHandler); + ASSERT_CUDA_SUCCESS(cu.pcuStreamSynchronize(stream), m_cuHandler); + + ASSERT_CUDA_SUCCESS(cu.pcuModuleUnload(module), m_cuHandler); + ASSERT_CUDA_SUCCESS(cu.pcuStreamDestroy_v2(stream), m_cuHandler); + } + + void testWmmaGemB1() + { + // b1 WMMA dimensions: M=8, N=8, K=128 + constexpr auto WmmaSize = uint32_t3{ 8, 8, 128 }; + constexpr auto TileCount = uint32_t3{ 128, 128, 8 }; // Adjust for b1 dimensions + constexpr auto ElementCount = WmmaSize * TileCount; // M=1024, N=1024, K=1024 + constexpr auto BlockDim = uint32_t2{ 32, 1 }; // 1 warp per block + constexpr auto GridDim = uint32_t2( + (ElementCount.x + WmmaSize.x - 1) / WmmaSize.x, // M tiles + (ElementCount.y + WmmaSize.y - 1) / WmmaSize.y // N tiles + ); + + const auto ptx = compilePtx("app_resources/wmmaGemm_b1_kernel.cu"); + auto& cu = m_cuHandler->getCUDAFunctionTable(); + + CUmodule module; + CUfunction kernel; + CUstream stream; + + ASSERT_CUDA_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr), m_cuHandler); + ASSERT_CUDA_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "b1_wmma_gemm_kernel"), m_cuHandler); + ASSERT_CUDA_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), m_cuHandler); + + // Calculate buffer sizes (bits packed into uint32_t) + const size_t matA_size = (ElementCount.x * ElementCount.z) / 32 * sizeof(uint32_t); // M x K bits + const size_t matB_size = (ElementCount.z * ElementCount.y) / 32 * 
sizeof(uint32_t); // K x N bits + const size_t matC_size = ElementCount.x * ElementCount.y * sizeof(int32_t); // M x N ints + + auto [vkBufferMatA, cuMemMatA] = createSharedBuffer(matA_size); + auto [vkBufferMatB, cuMemMatB] = createSharedBuffer(matB_size); + auto [vkBufferMatC, cuMemMatC] = createSharedBuffer(matC_size); + + // CPU matrices for initialization and verification + core::vector cpuMatA(ElementCount.x * ElementCount.z / 32); + core::vector cpuMatB(ElementCount.z * ElementCount.y / 32); + core::vector cpuMatC_expected(ElementCount.x * ElementCount.y); + + // Initialize with simple patterns for verification + auto initBinaryMatrices = [&]() + { + // Fill cpuMatA with reverse diagonal pattern + std::fill(cpuMatA.begin(), cpuMatA.end(), 0); + + for (int i = 0; i < ElementCount.x; i++) + { + auto j = ElementCount.z - 1 - i; + auto bitIdx = i * ElementCount.z + j; + auto wordIdx = bitIdx / 32; + auto bitOffset = bitIdx % 32; + cpuMatA[wordIdx] |= (1u << bitOffset); + } + + // Fill cpuMatB with random bits + for (auto& val : cpuMatB) val = rand(); + + // Compute expected result: For bmma with bmmaBitOpAND + // C[i][j] = popcount(A[i,:] AND B[:,j]) + for (int i = 0; i < ElementCount.x; i++) { + for (int j = 0; j < ElementCount.y; j++) { + const int k = ElementCount.z - 1 - i; + const int b_bit_idx = j * ElementCount.z + k; // col-major + const int32_t bit = (cpuMatB[b_bit_idx / 32] >> (b_bit_idx % 32)) & 1; + cpuMatC_expected[i * ElementCount.y + j] = bit; + } + } + }; + initBinaryMatrices(); + + ISemaphore::SCreationParams semParams; + semParams.externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32; + auto semaphore = m_device->createSemaphore(0, std::move(semParams)); + const auto cudaSemaphore = m_cuDevice->importExternalSemaphore(core::smart_refctd_ptr(semaphore)); + if (!cudaSemaphore) + logFail("Fail to import Vulkan Semaphore into CUDA!"); + + std::array, 2> cmd; + auto commandPool = m_device->createCommandPool(queue->getFamilyIndex(), 
IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, cmd.size(), cmd.data(), smart_refctd_ptr(m_logger)); + + const auto outputStagingBuffer = createStaging(vkBufferMatC->getSize()); + + // Release ownership to CUDA + { + const IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { + .barrier = { + .dep = { + .srcStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, + .srcAccessMask = ACCESS_FLAGS::MEMORY_WRITE_BITS, + }, + .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE, + .otherQueueFamilyIndex = IQueue::FamilyExternal, + }, + .range = { .offset = 0, .size = vkBufferMatC->getSize(), .buffer = vkBufferMatC }, + }; + + cmd[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + cmd[0]->pipelineBarrier(EDF_NONE, {.bufBarriers = std::span{&bufBarrier, &bufBarrier + 1}}); + cmd[0]->end(); + + const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { + .semaphore = semaphore.get(), .value = 1, .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, + }; + const IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[0].get() }; + const IQueue::SSubmitInfo submitInfo = { + .commandBuffers = {&cmdInfo, &cmdInfo + 1}, .signalSemaphores = {&signalInfo, &signalInfo + 1} + }; + queue->submit({ &submitInfo, &submitInfo + 1 }); + } + + // Launch CUDA kernel + { + CUdeviceptr matrixAPtr, matrixBPtr, matrixCPtr; + cuMemMatA->getMappedBuffer(&matrixAPtr); + cuMemMatB->getMappedBuffer(&matrixBPtr); + cuMemMatC->getMappedBuffer(&matrixCPtr); + + ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(matrixAPtr, cpuMatA.data(), matA_size, stream), m_cuHandler); + ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(matrixBPtr, cpuMatB.data(), matB_size, stream), m_cuHandler); + core::vector cpuMatC(ElementCount.x * ElementCount.y, 15); + ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(matrixCPtr, cpuMatC.data(), matC_size, stream), m_cuHandler); + + void* parameters[] = { &matrixAPtr, 
&matrixBPtr, &matrixCPtr, + (void*)&ElementCount.x, (void*)&ElementCount.y, (void*)&ElementCount.z }; + + auto semaphore_cu = cudaSemaphore->getInternalObject(); + const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } }; + ASSERT_CUDA_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore_cu, &waitParams, 1, stream), m_cuHandler); + + ASSERT_CUDA_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim.x, GridDim.y, 1, + BlockDim.x, BlockDim.y, 1, + 0, stream, parameters, nullptr), m_cuHandler); + + const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } }; + ASSERT_CUDA_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore_cu, &signalParams, 1, stream), m_cuHandler); + } + + // Acquire ownership and copy results back + { + const IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { + .barrier = { + .dep = { + .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, + .dstAccessMask = ACCESS_FLAGS::TRANSFER_READ_BIT, + }, + .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE, + .otherQueueFamilyIndex = IQueue::FamilyExternal, + }, + .range = { .offset = 0, .size = vkBufferMatC->getSize(), .buffer = vkBufferMatC }, + }; + + cmd[1]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + cmd[1]->pipelineBarrier(EDF_NONE, {.bufBarriers = std::span{&bufBarrier, &bufBarrier + 1}}); + const auto region = IGPUCommandBuffer::SBufferCopy{ .srcOffset = 0, .dstOffset = 0, .size = matC_size }; + cmd[1]->copyBuffer(vkBufferMatC.get(), outputStagingBuffer.get(), 1, ®ion); + cmd[1]->end(); + + const IQueue::SSubmitInfo::SSemaphoreInfo waitInfo = { + .semaphore = semaphore.get(), .value = 2, .stageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, + }; + const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { + .semaphore = semaphore.get(), .value = 3, .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, + }; + const IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[1].get() }; + const 
IQueue::SSubmitInfo submitInfo = { + .waitSemaphores = { &waitInfo, &waitInfo + 1 }, + .commandBuffers = { &cmdInfo, &cmdInfo + 1 }, + .signalSemaphores = { &signalInfo, &signalInfo + 1 } + }; + queue->submit({ &submitInfo, &submitInfo + 1 }); + } + + // Wait and verify results + const auto wait = std::array{ ISemaphore::SWaitInfo{.semaphore = semaphore.get(), .value = 3} }; + m_device->blockForSemaphores(wait, true); + + auto* stagingMem = outputStagingBuffer->getBoundMemory().memory; + if (!stagingMem->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + { + ILogicalDevice::MappedMemoryRange range(stagingMem, 0, stagingMem->getAllocationSize()); + m_device->invalidateMappedMemoryRanges(1, &range); + } + + const auto* results = reinterpret_cast(stagingMem->getMappedPointer()); + + // Verify results + bool success = true; + int errors = 0; + for (int i = 0; i < ElementCount.x * ElementCount.y; i++) { + const auto expected = [&] + { + // Since we are multiplying reverse diagonal matrix to matrixB. The result should be matrix b but each column reversed. + // The calculation below is to get the index of cpuMatB if the column is reversed to get the expected bit. 
+ const auto row = i / ElementCount.y; + const auto col = i % ElementCount.y; + const auto expectedCol = col; + const auto expectedRow = ElementCount.z - row - 1; + const auto expectedIdx = expectedCol * ElementCount.z + expectedRow; + const auto expectedWordIdx = expectedIdx / 32; + const auto expectedBitOffset = expectedIdx % 32; + return (cpuMatB[expectedWordIdx] >> expectedBitOffset) & uint32_t(1); + }(); + + // const auto expected = [&] + // { + // const auto row = i / ElementCount.y; // row-major + // const auto col = i % ElementCount.y; + // const auto k = ElementCount.z - 1 - row; // reverse-diagonal A + // const auto bIdx = col * ElementCount.z + k; // col-major B + // return (cpuMatB[bIdx / 32] >> (bIdx % 32)) & uint32_t(1); + // }(); + + // const auto expected = cpuMatC_expected[i]; + + const auto result = results[i]; + if (result != expected) { + m_logger->log("WMMA b1 test error at [%d]: GPU=%d, CPU=%d", + system::ILogger::ELL_ERROR, i, results[i], expected); + errors++; + success = false; + } + } + + if (success) + m_logger->log("b1 WMMA test PASSED!", system::ILogger::ELL_INFO); + else + m_logger->log("b1 WMMA test FAILED with %d errors!", system::ILogger::ELL_ERROR, errors); } void testDestruction() @@ -408,10 +933,10 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica auto commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); constexpr auto ElementCount = 1024; constexpr auto BufferSize = ElementCount * sizeof(int); - auto& cu = cudaHandler->getCUDAFunctionTable(); + auto& cu = m_cuHandler->getCUDAFunctionTable(); smart_refctd_ptr escaped; { - const auto cudaMemory = cudaDevice->createExportableMemory({ .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }); + const auto cudaMemory = m_cuDevice->createExportableMemory({ .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }); 
if (!cudaMemory) logFail("Fail to create exportable memory!"); escaped = cudaMemory->exportAsMemory(m_device.get()); @@ -500,7 +1025,6 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica } } - // Whether to keep invoking the above. In this example because its headless GPU compute, we do all the work in the app initialization. bool keepRunning() override { return false; } From 8e84dcdf277f77ce1a4f1804f1a4323a32451601 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 12 May 2026 17:13:55 +0700 Subject: [PATCH 30/47] Remove test for WmmaGemm half --- .../app_resources/wmmaGemm_kernel.cu | 107 ------- 76_CudaInterop/main.cpp | 281 ------------------ 2 files changed, 388 deletions(-) delete mode 100644 76_CudaInterop/app_resources/wmmaGemm_kernel.cu diff --git a/76_CudaInterop/app_resources/wmmaGemm_kernel.cu b/76_CudaInterop/app_resources/wmmaGemm_kernel.cu deleted file mode 100644 index 523590e8c..000000000 --- a/76_CudaInterop/app_resources/wmmaGemm_kernel.cu +++ /dev/null @@ -1,107 +0,0 @@ -/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of NVIDIA CORPORATION nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/** - * CUDA Kernel Device code - * - * Computes the vector addition of A and B into C. The 3 vectors have the same - * number of elements numElements. - */ - - -// GPU configuration. - -#define WARP_SIZE 32 - -// MMA matrix tile dimensions. - -#define M 16 -#define N 16 -#define K 16 - -#define WMMA_M 16 -#define WMMA_N 16 -#define WMMA_K 16 - -#include -#include - -using namespace nvcuda; - -extern "C" __global__ void wmmaGemm(half *a, half *b, float *c, float *d, int m_ld, int n_ld, int k_ld, float alpha, float beta) -{ - // Leading dimensions. Packed with no transpositions. 
- int lda = k_ld; - int ldb = k_ld; - int ldc = n_ld; - - // Tile using a 2D grid - int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / warpSize; - int warpN = (blockIdx.y * blockDim.y + threadIdx.y); - - // Declare the fragments - wmma::fragment a_frag; - wmma::fragment b_frag; - wmma::fragment acc_frag; - wmma::fragment c_frag; - - wmma::fill_fragment(acc_frag, 0.0f); - - // Loop over k - for (int i = 0; i < k_ld; i += WMMA_K) { - int aCol = i; - int aRow = warpM * WMMA_M; - int bCol = warpN * WMMA_N; - int bRow = i; - - // Bounds checking - if (aRow < m_ld && aCol < k_ld && bRow < k_ld && bCol < n_ld) { - // Load the inputs - wmma::load_matrix_sync(a_frag, a + aCol + aRow * lda, lda); - wmma::load_matrix_sync(b_frag, b + bRow + bCol * ldb, ldb); - - // Perform the matrix multiplication - wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag); - } - } - - // Load in the current value of c, scale it by beta, and add this our result - // scaled by alpha - int cCol = warpN * WMMA_N; - int cRow = warpM * WMMA_M; - - if (cRow < m_ld && cCol < n_ld) { - wmma::load_matrix_sync(c_frag, c + cCol + cRow * ldc, ldc, wmma::mem_row_major); - - for (int i = 0; i < c_frag.num_elements; i++) { - c_frag.x[i] = alpha * acc_frag.x[i] + beta * c_frag.x[i]; - } - - // Store the output - wmma::store_matrix_sync(d + cCol + cRow * ldc, c_frag, ldc, wmma::mem_row_major); - } -} diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index 11a8768bf..4b7f532c7 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -94,7 +94,6 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica queue = getComputeQueue(); testWmmaGemB1(); - // testWmmaGemm(); // testVectorAddKernel(); // testDestruction(); // testLargeAllocations(); @@ -420,286 +419,6 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica ASSERT_CUDA_SUCCESS(cu.pcuStreamDestroy_v2(stream), m_cuHandler); } - void testWmmaGemm() - { - // x = M, y = N, z = K - 
constexpr auto WmmaSize = uint32_t3{ 16, 16, 16 }; - constexpr auto TileCount = uint32_t3{ 64, 64, 64 }; - constexpr auto ElementCount = WmmaSize * TileCount; - constexpr auto BlockDim = uint32_t2{ 128, 4 }; - // TODO(kevin): Check if this calculation of GridDim correct. Currently we only handle square matrix. So, it doesn't matter - constexpr auto GridDim = uint32_t2(ElementCount.x / BlockDim.x, ElementCount.y / BlockDim.y); - const float Alpha = 1.1f; - const float Beta = 1.2f; - - const auto ptx = compilePtx("app_resources/wmmaGemm_kernel.cu"); - auto& cu = m_cuHandler->getCUDAFunctionTable(); - - CUmodule module; - CUfunction kernel; - CUstream stream; - - ASSERT_CUDA_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr), m_cuHandler); - ASSERT_CUDA_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "wmmaGemm"), m_cuHandler); - ASSERT_CUDA_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), m_cuHandler); - - const auto elementsPerBlock = uint32_t2{ (WmmaSize.x * BlockDim.x) / WARP_SIZE, (WmmaSize.y * BlockDim.y) }; - uint32_t2 gridDim = { - ElementCount.x + (elementsPerBlock.x - 1) / elementsPerBlock.x, - ElementCount.y + (elementsPerBlock.y - 1) / elementsPerBlock.y - }; - - - auto [vkBufferMatA, cuMemMatA] = createSharedBuffer(sizeof(half) * ElementCount.x * ElementCount.z); - auto [vkBufferMatB, cuMemMatB] = createSharedBuffer(sizeof(half) * ElementCount.z * ElementCount.y); - auto [vkBufferMatC, cuMemMatC] = createSharedBuffer(sizeof(float) * ElementCount.x * ElementCount.y); - auto [vkBufferMatD, cuMemMatD] = createSharedBuffer(sizeof(float) * ElementCount.x * ElementCount.y); - - core::vector cpuMatA(ElementCount.x * ElementCount.z), cpuMatB(ElementCount.z * ElementCount.y); - core::vector cpuMatC(ElementCount.x * ElementCount.y); - - auto initCpuMatrix = [ElementCount](half* a, half* b, float* c) - { - for (int i = 0; i < ElementCount.x; i++) { - for (int j = 0; j < ElementCount.z; j++) { - a[i * ElementCount.z + j] = 
(half)(rand() % 3); - } - } - - for (int i = 0; i < ElementCount.y; i++) { - for (int j = 0; j < ElementCount.z; j++) { - b[i * ElementCount.x + j] = (half)(rand() % 3); - } - } - - for (int t = 0; t < ElementCount.x * ElementCount.y; t++) { - c[t] = static_cast(rand() % 3); - } - }; - initCpuMatrix(cpuMatA.data(), cpuMatB.data(), cpuMatC.data()); - - - ISemaphore::SCreationParams semParams; - semParams.externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32; - auto semaphore = m_device->createSemaphore(0, std::move(semParams)); - const auto cudaSemaphore = m_cuDevice->importExternalSemaphore(core::smart_refctd_ptr(semaphore)); - if (!cudaSemaphore) - logFail("Fail to import Vulkan Semaphore into CUDA!"); - - std::array, 2> cmd; - auto commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, cmd.size(), cmd.data(), smart_refctd_ptr(m_logger)); - - const auto outputStagingBuffer = createStaging(vkBufferMatD->getSize()); - - // First we record a release ownership transfer to let vulkan know that resources are going to be used in an external API - { - const IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { - .barrier = { - .dep = { - .srcStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, - .srcAccessMask = ACCESS_FLAGS::MEMORY_WRITE_BITS, - }, - .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE, - .otherQueueFamilyIndex = IQueue::FamilyExternal, - }, - .range = { - .offset = 0, - .size = vkBufferMatD->getSize(), - .buffer = vkBufferMatD, - }, - }; - - // start recording - bool re = true; - re &= cmd[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - re &= cmd[0]->pipelineBarrier(EDF_NONE, { - .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1} - }); - re &= cmd[0]->end(); - - const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { - .semaphore = semaphore.get(), - .value 
= 1, - .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, - }; - const IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[0].get() }; - const IQueue::SSubmitInfo submitInfo = { - .commandBuffers = {&cmdInfo, &cmdInfo + 1}, - .signalSemaphores = {&signalInfo, &signalInfo + 1} - }; - const auto submitRe = queue->submit({ &submitInfo, &submitInfo + 1 }); - re &= IQueue::RESULT::SUCCESS == submitRe; - if (!re) logFail("Something went wrong readying resources for CUDA"); - } - - // Launch kernel - { - CUdeviceptr matrixAPtr, matrixBPtr, matrixCPtr, matrixDPtr; - cuMemMatA->getMappedBuffer(&matrixAPtr); - cuMemMatB->getMappedBuffer(&matrixBPtr); - cuMemMatC->getMappedBuffer(&matrixCPtr); - cuMemMatD->getMappedBuffer(&matrixDPtr); - CUdeviceptr ptrs[] = { - matrixAPtr, - matrixBPtr, - matrixCPtr, - matrixDPtr, - }; - ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(matrixAPtr, cpuMatA.data(), cpuMatA.size() * sizeof(half), stream), m_cuHandler); - ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(matrixBPtr, cpuMatB.data(), cpuMatB.size() * sizeof(half), stream), m_cuHandler); - ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(matrixCPtr, cpuMatC.data(), cpuMatC.size() * sizeof(float), stream), m_cuHandler); - - int m_ld = ElementCount.x; - int n_ld = ElementCount.y; - int k_ld = ElementCount.z; - float alpha = Alpha; - float beta = Beta; - void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &ptrs[3], &m_ld, &n_ld, &k_ld, &alpha, &beta }; - - auto semaphore = cudaSemaphore->getInternalObject(); - const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } }; - ASSERT_CUDA_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream), m_cuHandler); // Wait for release op from vulkan - ASSERT_CUDA_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim.x, GridDim.y, 1, BlockDim.x, BlockDim.y, 1, 0, stream, parameters, nullptr), m_cuHandler); - const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } }; 
- ASSERT_CUDA_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream), m_cuHandler); // Signal the imported semaphore - } - - // After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA - { - const IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { - .barrier = { - .dep = { - .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, - .dstAccessMask = ACCESS_FLAGS::TRANSFER_READ_BIT, - }, - .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE, - .otherQueueFamilyIndex = IQueue::FamilyExternal, - }, - .range = { - .offset = 0, - .size = vkBufferMatD->getSize(), - .buffer = vkBufferMatD, - }, - }; - bool re = true; - re &= cmd[1]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - re &= cmd[1]->pipelineBarrier(EDF_NONE, - { - .bufBarriers = std::span{ &bufBarrier, &bufBarrier + 1 } - }); - const auto region = IGPUCommandBuffer::SBufferCopy{ - .srcOffset = 0, - .dstOffset = 0, - .size = vkBufferMatD->getSize() - }; - re &= cmd[1]->copyBuffer(vkBufferMatD.get(), outputStagingBuffer.get(), 1, ®ion); - re &= cmd[1]->end(); - - const IQueue::SSubmitInfo::SSemaphoreInfo waitInfo= { - .semaphore = semaphore.get(), - .value = 2, - .stageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, - }; - const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { - .semaphore = semaphore.get(), - .value = 3, - .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, - }; - const IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[1].get() }; - const IQueue::SSubmitInfo submitInfo = { - .waitSemaphores = { &waitInfo, &waitInfo + 1 }, - .commandBuffers = { &cmdInfo, &cmdInfo + 1 }, - .signalSemaphores = { &signalInfo, &signalInfo + 1 } - }; - const auto submitRe = queue->submit({ &submitInfo, &submitInfo + 1 }); - re &= IQueue::RESULT::SUCCESS == submitRe; - if (!re) - logFail("Something went wrong copying results from CUDA"); - } - - auto matMultiplyOnHost = [&]( - const 
half* A, - const half* B, - float* C) - { - const auto numARows = ElementCount.x; - const auto numAColumns = ElementCount.z; - const auto numBRows = ElementCount.z; - const auto numBColumns = ElementCount.y; - const auto numCRows = ElementCount.x; - const auto numCColumns = ElementCount.y; - for (int i = 0; i < numCRows; i++) { - for (int j = 0; j < numCColumns; j++) { - float temp = 0.0; - - for (int k = 0; k < numAColumns; k++) { - temp += (float)A[i * numAColumns + k] * (float)B[j * numBRows + k]; - } - - C[i * numCColumns + j] = temp * Alpha + Beta * C[i * numCColumns + j]; - } - } - }; - matMultiplyOnHost(cpuMatA.data(), cpuMatB.data(), cpuMatC.data()); - - struct CallbackContext - { - core::smart_refctd_ptr semaphore; - core::smart_refctd_ptr outputStagingBuffer; - core::smart_refctd_ptr device; - core::smart_refctd_ptr logger; - const float* expectedOutput; - }; - - CallbackContext ctx; - ctx.semaphore = semaphore; - ctx.outputStagingBuffer = outputStagingBuffer; - ctx.device = m_device; - ctx.logger = m_logger; - ctx.expectedOutput = cpuMatC.data(); - - auto cudaCallback = [](void* userData) - { - const auto* ctx = reinterpret_cast(userData); - - // Make sure we are also done with the readback - const auto wait = std::array{ - ISemaphore::SWaitInfo{ - .semaphore = ctx->semaphore.get(), - .value = 3, - } - }; - ctx->device->blockForSemaphores(wait, true); - - auto* stagingMem = ctx->outputStagingBuffer->getBoundMemory().memory; - if (!stagingMem->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) - { - ILogicalDevice::MappedMemoryRange range(stagingMem, 0, stagingMem->getAllocationSize()); - ctx->device->invalidateMappedMemoryRanges(1, &range); - } - - - const auto* outputs = reinterpret_cast(ctx->outputStagingBuffer->getBoundMemory().memory->getMappedPointer()); - - for (auto elem_i = 0; elem_i < ElementCount.x * ElementCount.y; elem_i++) - { - const auto output = outputs[elem_i]; - const auto diff = abs(output - 
ctx->expectedOutput[elem_i]); - if (diff > 0.01) - ctx->logger->log("TestSharedResources: Element at index %d is incorrect!", ILogger::ELL_ERROR, elem_i); - } - - ctx->logger->log("Test Wmma Gemm Complete", ILogger::ELL_INFO); - }; - - ASSERT_CUDA_SUCCESS(cu.pcuLaunchHostFunc(stream, cudaCallback, &ctx), m_cuHandler); - ASSERT_CUDA_SUCCESS(cu.pcuStreamSynchronize(stream), m_cuHandler); - - ASSERT_CUDA_SUCCESS(cu.pcuModuleUnload(module), m_cuHandler); - ASSERT_CUDA_SUCCESS(cu.pcuStreamDestroy_v2(stream), m_cuHandler); - } - void testWmmaGemB1() { // b1 WMMA dimensions: M=8, N=8, K=128 From 96b8b3ec938b03672981cf0fc70494f971bb1a2f Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 19 May 2026 23:52:50 +0700 Subject: [PATCH 31/47] Update test to follow the update on vk_cuda_interop main branch --- 76_CudaInterop/main.cpp | 88 ++++++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 45 deletions(-) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index 087b7d181..516a6fe8b 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -93,13 +93,23 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica queue = getComputeQueue(); testWmmaGemB1(); - // testVectorAddKernel(); - // testDestruction(); - // testLargeAllocations(); + testVectorAddKernel(); + testDestruction(); + testLargeAllocations(); return true; } + smart_refctd_ptr createExternalBuffer2(uint64_t size, core::bitflag externalHandleTypes) + { + IGPUBuffer::SCreationParams params = {}; + params.size = size; + params.usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT; + params.externalHandleTypes = externalHandleTypes; + auto buf = m_device->createBuffer(std::move(params)); + return buf; + } + smart_refctd_ptr createExternalBuffer(IDeviceMemoryAllocation* mem) { IGPUBuffer::SCreationParams params = {}; @@ -119,7 +129,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica 
req.memoryTypeBits &= m_device->getPhysicalDevice()->getDownStreamingMemoryTypeBits() & m_device->getPhysicalDevice()->getHostVisibleMemoryTypeBits() & m_device->getPhysicalDevice()->getMemoryTypeBitsFromMemoryTypeFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT); - auto allocation = m_device->allocate(req, buf.get()); + auto allocation = m_device->allocate(req, { buf.get() }); void* mapping = allocation.memory->map(IDeviceMemoryAllocation::MemoryRange(0, req.size), IDeviceMemoryAllocation::EMCAF_READ); if (!mapping) @@ -153,11 +163,11 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica IGPUBuffer::SCreationParams vkBufferParams; vkBufferParams.size = m_cuDevice->roundToGranularity(CU_MEM_LOCATION_TYPE_DEVICE, size); vkBufferParams.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT; - vkBufferParams.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE; + vkBufferParams.externalHandleTypes = CCUDADevice::ExternalMemoryHandleType; const auto outputBuf = m_device->createBuffer(std::move(vkBufferParams)); auto outputMemReq = outputBuf->getMemoryReqs(); - auto allocation = m_device->allocate(outputMemReq, outputBuf.get(), IDeviceMemoryAllocation::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE); + auto allocation = m_device->allocate(outputMemReq, { outputBuf.get(), IDeviceMemoryAllocation::EMAF_NONE, CCUDADevice::ExternalMemoryHandleType }); const auto cudaOutputMemory = m_cuDevice->importExternalMemory(core::smart_refctd_ptr(allocation.memory)); if (!cudaOutputMemory) logFail("Fail to import Vulkan Memory into CUDA!"); @@ -217,8 +227,9 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica auto [outputBuf, cudaOutputMemory] = createSharedBuffer(BufferSize); ISemaphore::SCreationParams semParams; + semParams.initialValue = 0; semParams.externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32; - auto semaphore = m_device->createSemaphore(0, std::move(semParams)); 
+ auto semaphore = m_device->createSemaphore(std::move(semParams)); const auto cudaSemaphore = m_cuDevice->importExternalSemaphore(core::smart_refctd_ptr(semaphore)); if (!cudaSemaphore) logFail("Fail to import Vulkan Semaphore into CUDA!"); @@ -387,20 +398,11 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica const auto* inputs2 = reinterpret_cast(ctx->cpuBuffers[1]->getPointer()); const auto* outputs = reinterpret_cast(ctx->outputStagingBuffer->getBoundMemory().memory->getMappedPointer()); - const auto* inputsInStaging1 = reinterpret_cast(ctx->inputStagingBuffers[0]->getBoundMemory().memory->getMappedPointer()); - const auto* inputsInStaging2 = reinterpret_cast(ctx->inputStagingBuffers[1]->getBoundMemory().memory->getMappedPointer()); for (auto elem_i = 0; elem_i < NumElements; elem_i++) { const auto input1 = inputs1[elem_i]; const auto input2 = inputs2[elem_i]; - const auto inputInStaging1 = inputsInStaging1[elem_i]; - const auto inputInStaging2 = inputsInStaging2[elem_i]; - if (inputInStaging1 != input1) - ctx->logger->log("Input1 in Staging %d is incorrect!", ILogger::ELL_ERROR, elem_i); - if (inputInStaging2 != input2) - ctx->logger->log("Input2 in Staging %d is incorrect!", ILogger::ELL_ERROR, elem_i); - const auto output = outputs[elem_i]; const auto expected = input1 + input2; const auto diff = abs(output - expected); @@ -450,10 +452,23 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica auto [vkBufferMatB, cuMemMatB] = createSharedBuffer(matB_size); auto [vkBufferMatC, cuMemMatC] = createSharedBuffer(matC_size); + // ICPUBuffer::SCreationParams cpuBufferParamsA; + // cpuBufferParamsA.size = ElementCount.x * ElementCount.z / 32; + // const auto cpuBufferA = ICPUBuffer::create(std::move(cpuBufferParamsA)); + + // ICPUBuffer::SCreationParams cpuBufferParamsB; + // cpuBufferParamsB.size = ElementCount.x * ElementCount.z / 32; + // const auto cpuBufferB = 
ICPUBuffer::create(std::move(cpuBufferParamsB)); + // + // std::array inputBuffers = {cpuBufferA.get(), cpuBufferB.get()}; + // + // CAssetConverter::SInputs inputs = {}; + // std::get>(inputs.assets) = inputBuffers; + // CPU matrices for initialization and verification core::vector cpuMatA(ElementCount.x * ElementCount.z / 32); core::vector cpuMatB(ElementCount.z * ElementCount.y / 32); - core::vector cpuMatC_expected(ElementCount.x * ElementCount.y); + // Initialize with simple patterns for verification auto initBinaryMatrices = [&]() @@ -473,22 +488,13 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica // Fill cpuMatB with random bits for (auto& val : cpuMatB) val = rand(); - // Compute expected result: For bmma with bmmaBitOpAND - // C[i][j] = popcount(A[i,:] AND B[:,j]) - for (int i = 0; i < ElementCount.x; i++) { - for (int j = 0; j < ElementCount.y; j++) { - const int k = ElementCount.z - 1 - i; - const int b_bit_idx = j * ElementCount.z + k; // col-major - const int32_t bit = (cpuMatB[b_bit_idx / 32] >> (b_bit_idx % 32)) & 1; - cpuMatC_expected[i * ElementCount.y + j] = bit; - } - } }; initBinaryMatrices(); ISemaphore::SCreationParams semParams; semParams.externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32; - auto semaphore = m_device->createSemaphore(0, std::move(semParams)); + semParams.initialValue = 0; + auto semaphore = m_device->createSemaphore(std::move(semParams)); const auto cudaSemaphore = m_cuDevice->importExternalSemaphore(core::smart_refctd_ptr(semaphore)); if (!cudaSemaphore) logFail("Fail to import Vulkan Semaphore into CUDA!"); @@ -619,18 +625,6 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica const auto expectedBitOffset = expectedIdx % 32; return (cpuMatB[expectedWordIdx] >> expectedBitOffset) & uint32_t(1); }(); - - // const auto expected = [&] - // { - // const auto row = i / ElementCount.y; // row-major - // const auto col = i % ElementCount.y; - // const auto k = 
ElementCount.z - 1 - row; // reverse-diagonal A - // const auto bIdx = col * ElementCount.z + k; // col-major B - // return (cpuMatB[bIdx / 32] >> (bIdx % 32)) & uint32_t(1); - // }(); - - // const auto expected = cpuMatC_expected[i]; - const auto result = results[i]; if (result != expected) { m_logger->log("WMMA b1 test error at [%d]: GPU=%d, CPU=%d", @@ -657,17 +651,19 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica const auto cudaMemory = m_cuDevice->createExportableMemory({ .size = BufferSize, .alignment = sizeof(float), .locationType = CU_MEM_LOCATION_TYPE_DEVICE }); if (!cudaMemory) logFail("Fail to create exportable memory!"); - escaped = cudaMemory->exportAsMemory(m_device.get()); + auto tmpBuf = createExternalBuffer2(cudaMemory->getCreationParams().granularSize, IDeviceMemoryAllocation::EHT_OPAQUE_WIN32); + escaped = cudaMemory->exportAsMemory(m_device.get(), tmpBuf.get()); if (!escaped) logFail("Fail to export CUDA memory!"); - auto tmpBuf = createExternalBuffer(escaped.get()); auto staging = createStaging(BufferSize); auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer(); for (uint32_t i = 0; i < ElementCount; ++i) ptr[i] = i; - const auto semaphore = m_device->createSemaphore(0); + ISemaphore::SCreationParams semParams; + semParams.initialValue = 0; + const auto semaphore = m_device->createSemaphore(std::move(semParams)); IQueue::SSubmitInfo::SSemaphoreInfo semInfo; semInfo.semaphore = semaphore.get(); semInfo.stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS; @@ -693,7 +689,9 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica auto tmpBuf = createExternalBuffer(escaped.get()); auto staging = createStaging(BufferSize); - const auto semaphore = m_device->createSemaphore(0); + ISemaphore::SCreationParams semParams; + semParams.initialValue = 0; + const auto semaphore = m_device->createSemaphore(std::move(semParams)); IQueue::SSubmitInfo::SSemaphoreInfo semInfo; 
semInfo.semaphore = semaphore.get(); semInfo.stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS; @@ -737,7 +735,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica for (size_t i = 0; i < (1 << 8); ++i) { - auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory; + auto memory = m_device->allocate(reqs, { nullptr, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::ExternalMemoryHandleType }).memory; assert(memory); auto tmpBuf = createExternalBuffer(memory.get()); } From 58f20e55ac6653f2a2b64a966358e61fcde5a660 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 22 May 2026 14:51:33 +0700 Subject: [PATCH 32/47] Fix to parameter passing when calling allocate --- 71_RayTracingPipeline/main.cpp | 4 ++-- .../nbl/examples/common/CSwapchainFramebuffersAndDepth.hpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp index c74ab6686..d46894954 100644 --- a/71_RayTracingPipeline/main.cpp +++ b/71_RayTracingPipeline/main.cpp @@ -216,7 +216,7 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public Bui } }); - if (!m_hdrImage || !m_device->allocate(m_hdrImage->getMemoryReqs(), m_hdrImage.get()).isValid()) + if (!m_hdrImage || !m_device->allocate(m_hdrImage->getMemoryReqs(), { m_hdrImage.get() }).isValid()) return logFail("Could not create HDR Image"); m_hdrImageView = m_device->createImageView({ @@ -1353,7 +1353,7 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public Bui auto reqs = scratchBuffer->getMemoryReqs(); reqs.memoryTypeBits &= m_physicalDevice->getDirectVRAMAccessMemoryTypeBits(); - auto allocation = m_device->allocate(reqs, scratchBuffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); + auto allocation = m_device->allocate(reqs, { scratchBuffer.get(), 
IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT }); allocation.memory->map({ .offset = 0,.length = reqs.size }); scratchAlloc = make_smart_refctd_ptr( diff --git a/common/include/nbl/examples/common/CSwapchainFramebuffersAndDepth.hpp b/common/include/nbl/examples/common/CSwapchainFramebuffersAndDepth.hpp index c7d780fdf..949026a3c 100644 --- a/common/include/nbl/examples/common/CSwapchainFramebuffersAndDepth.hpp +++ b/common/include/nbl/examples/common/CSwapchainFramebuffersAndDepth.hpp @@ -76,7 +76,7 @@ class CSwapchainFramebuffersAndDepth final : public video::CDefaultSwapchainFram .depthUsage = IGPUImage::EUF_RENDER_ATTACHMENT_BIT } }); - device->allocate(image->getMemoryReqs(), image.get()); + device->allocate(image->getMemoryReqs(), { image.get() }); m_depthBuffer = device->createImageView({ .flags = IGPUImageView::ECF_NONE, From dc3a11446aba697238708174d41f44a5e29ec922 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 25 May 2026 10:41:24 +0700 Subject: [PATCH 33/47] Use RAII exiter for module and stream cleanup --- 76_CudaInterop/main.cpp | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index 516a6fe8b..512807574 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -186,12 +186,20 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica auto& cu = m_cuHandler->getCUDAFunctionTable(); CUmodule module; - CUfunction kernel; - CUstream stream; - ASSERT_CUDA_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr), m_cuHandler); + auto moduleCleanup = nbl::core::makeRAIIExiter([&]() + { + cu.pcuModuleUnload(module); + }); + + CUfunction kernel; ASSERT_CUDA_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"), m_cuHandler); + + CUstream stream; ASSERT_CUDA_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), m_cuHandler); + auto streamCleanup = nbl::core::makeRAIIExiter([&] { + 
cu.pcuStreamDestroy_v2(stream); + }); // CPU memory which we fill with random numbers between [-1,1] that will be copied to corresponding cudaMemory std::array, 2> cpuBufs; @@ -416,8 +424,6 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica ASSERT_CUDA_SUCCESS(cu.pcuLaunchHostFunc(stream, cudaCallback, &ctx), m_cuHandler); ASSERT_CUDA_SUCCESS(cu.pcuStreamSynchronize(stream), m_cuHandler); - ASSERT_CUDA_SUCCESS(cu.pcuModuleUnload(module), m_cuHandler); - ASSERT_CUDA_SUCCESS(cu.pcuStreamDestroy_v2(stream), m_cuHandler); } void testWmmaGemB1() From b5caa9c08bee767a604c6b8a51aa71aa6fb5efcb Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 25 May 2026 10:42:02 +0700 Subject: [PATCH 34/47] Use c++ random facility --- 76_CudaInterop/main.cpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index 512807574..c392bcc33 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -54,6 +54,8 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica smart_refctd_ptr m_cuHandler; smart_refctd_ptr m_cuDevice; smart_refctd_ptr m_utils; + std::random_device m_randomDevice; + std::mt19937 m_randGenerator; IQueue* queue; @@ -211,9 +213,15 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica buf = ICPUBuffer::create(std::move(params)); } + std::uniform_real_distribution dist(-RAND_MAX, RAND_MAX); for (auto buf_i = 0; buf_i < cpuBufs.size(); buf_i++) + { for (auto elem_i = 0; elem_i < NumElements; elem_i++) - reinterpret_cast(cpuBufs[buf_i]->getPointer())[elem_i] = rand() / float(RAND_MAX); + { + auto* data = reinterpret_cast(cpuBufs[buf_i]->getPointer()); + data[elem_i] = dist(m_randGenerator); + } + } constexpr auto InputCount = 2; // // CUDA resources that we input to the kernel 'vectorAdd_kernel.cu' @@ -491,8 +499,9 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica 
cpuMatA[wordIdx] |= (1u << bitOffset); } + std::uniform_int_distribution dist; // Fill cpuMatB with random bits - for (auto& val : cpuMatB) val = rand(); + for (auto& val : cpuMatB) val = dist(m_randGenerator); }; initBinaryMatrices(); From bed15b46acb90aafdb512abb3f51b656bac5e958 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 25 May 2026 10:42:28 +0700 Subject: [PATCH 35/47] Fix conversion of cpuBuffer to gpuBuffer by setting contentHash --- 76_CudaInterop/main.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index c392bcc33..74d9f2403 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -498,10 +498,12 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica auto bitOffset = bitIdx % 32; cpuMatA[wordIdx] |= (1u << bitOffset); } + cpuBufferA->setContentHash(cpuBufferA->computeContentHash()); std::uniform_int_distribution dist; // Fill cpuMatB with random bits for (auto& val : cpuMatB) val = dist(m_randGenerator); + cpuBufferB->setContentHash(cpuBufferB->computeContentHash()); }; initBinaryMatrices(); From 48c19621c882f82bd070af04966ca77ad48c60f9 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 25 May 2026 10:42:50 +0700 Subject: [PATCH 36/47] Fix block dimension of testVectorAddKernel --- 76_CudaInterop/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index 74d9f2403..db1f08f74 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -180,7 +180,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica void testVectorAddKernel() { static constexpr uint32_t GridDim[3] = { 4096,1,1 }; - static constexpr uint32_t BlockDim[3] = { 1,1,1 }; + static constexpr uint32_t BlockDim[3] = { 1024,1,1 }; static constexpr size_t NumElements = GridDim[0] * BlockDim[0]; static constexpr size_t BufferSize = sizeof(float) * NumElements; From 
da64d2e9176930f369e3d91a5b45aad76cfc4757 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 25 May 2026 10:43:31 +0700 Subject: [PATCH 37/47] Implement the rest of testWmmaGemB1 --- 76_CudaInterop/main.cpp | 137 ++++++++++++++++++++++++++++------------ 1 file changed, 95 insertions(+), 42 deletions(-) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index db1f08f74..adee6a671 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -49,7 +49,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica public: // Yay thanks to multiple inheritance we cannot forward ctors anymore CUDA2VKApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : - system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD), m_randGenerator(m_randomDevice()) {} smart_refctd_ptr m_cuHandler; smart_refctd_ptr m_cuDevice; @@ -94,15 +94,15 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica queue = getComputeQueue(); - testWmmaGemB1(); testVectorAddKernel(); + testWmmaGemB1(); testDestruction(); testLargeAllocations(); return true; } - smart_refctd_ptr createExternalBuffer2(uint64_t size, core::bitflag externalHandleTypes) + smart_refctd_ptr createExternalBuffer(uint64_t size, core::bitflag externalHandleTypes) { IGPUBuffer::SCreationParams params = {}; params.size = size; @@ -117,7 +117,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica IGPUBuffer::SCreationParams params = {}; params.size = mem->getAllocationSize(); params.usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT; - params.externalHandleTypes = mem->getCreationParams().externalHandleType; + params.externalHandleTypes = mem->getCreationParams().externalHandleTypes; auto buf = 
m_device->createBuffer(std::move(params)); ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = buf.get(), .binding = {.memory = mem } }; m_device->bindBufferMemory(1, &bindInfo); @@ -244,7 +244,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica ISemaphore::SCreationParams semParams; semParams.initialValue = 0; - semParams.externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32; + semParams.externalHandleTypes = CCUDADevice::ExternalSemaphoreHandleType; auto semaphore = m_device->createSemaphore(std::move(semParams)); const auto cudaSemaphore = m_cuDevice->importExternalSemaphore(core::smart_refctd_ptr(semaphore)); if (!cudaSemaphore) @@ -462,28 +462,20 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica const size_t matB_size = (ElementCount.z * ElementCount.y) / 32 * sizeof(uint32_t); // K x N bits const size_t matC_size = ElementCount.x * ElementCount.y * sizeof(int32_t); // M x N ints - auto [vkBufferMatA, cuMemMatA] = createSharedBuffer(matA_size); - auto [vkBufferMatB, cuMemMatB] = createSharedBuffer(matB_size); auto [vkBufferMatC, cuMemMatC] = createSharedBuffer(matC_size); - // ICPUBuffer::SCreationParams cpuBufferParamsA; - // cpuBufferParamsA.size = ElementCount.x * ElementCount.z / 32; - // const auto cpuBufferA = ICPUBuffer::create(std::move(cpuBufferParamsA)); - - // ICPUBuffer::SCreationParams cpuBufferParamsB; - // cpuBufferParamsB.size = ElementCount.x * ElementCount.z / 32; - // const auto cpuBufferB = ICPUBuffer::create(std::move(cpuBufferParamsB)); - // - // std::array inputBuffers = {cpuBufferA.get(), cpuBufferB.get()}; - // - // CAssetConverter::SInputs inputs = {}; - // std::get>(inputs.assets) = inputBuffers; - - // CPU matrices for initialization and verification - core::vector cpuMatA(ElementCount.x * ElementCount.z / 32); - core::vector cpuMatB(ElementCount.z * ElementCount.y / 32); - - + ICPUBuffer::SCreationParams cpuBufferParamsA; + cpuBufferParamsA.size = matA_size; 
+ const auto cpuBufferA = ICPUBuffer::create(std::move(cpuBufferParamsA)); + const auto cpuBufferAData = reinterpret_cast(cpuBufferA->getPointer()); + const auto cpuMatA = std::span(cpuBufferAData, matA_size / sizeof(uint32_t)); + + ICPUBuffer::SCreationParams cpuBufferParamsB; + cpuBufferParamsB.size = matB_size; + const auto cpuBufferB = ICPUBuffer::create(std::move(cpuBufferParamsB)); + const auto cpuBufferBData = reinterpret_cast(cpuBufferB->getPointer()); + const auto cpuMatB = std::span(cpuBufferBData, matB_size / sizeof(uint32_t)); + // Initialize with simple patterns for verification auto initBinaryMatrices = [&]() { @@ -507,7 +499,81 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica }; initBinaryMatrices(); + + std::array inputBuffers = {cpuBufferA.get(), cpuBufferB.get()}; + + CAssetConverter::SInputs inputs = {}; + std::get>(inputs.assets) = inputBuffers; + std::array, std::size(inputBuffers)> inputBufferPatches; + for (auto& inputPatch : inputBufferPatches) + { + inputPatch.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT; + inputPatch.externalHandleTypes = CCUDADevice::ExternalMemoryHandleType; + } + std::get>(inputs.patches) = inputBufferPatches; + smart_refctd_ptr converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} }); + auto reservation = converter->reserve(inputs); + if (!reservation) + { + logFail("reserve failed!"); + return; + } + + // Create transfer queue resources + auto transferQueue = getComputeQueue(); + auto transferCmdPool = m_device->createCommandPool( + transferQueue->getFamilyIndex(), + IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT + ); + + // SIntendedSubmitInfo needs at least one scratch cmdbuf in RECORDING state + smart_refctd_ptr transferCmdBuf; + transferCmdPool->createCommandBuffers( + IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, 
&transferCmdBuf, smart_refctd_ptr(m_logger) + ); + transferCmdBuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + + auto transferScratchSemaphore = m_device->createSemaphore({ .initialValue = 0 }); + + IQueue::SSubmitInfo::SCommandBufferInfo transferCmdBufInfo = { + transferCmdBuf.get() + }; + SIntendedSubmitInfo transferSubmitInfo; + transferSubmitInfo.queue = transferQueue; + transferSubmitInfo.scratchCommandBuffers = { &transferCmdBufInfo, 1 }; + transferSubmitInfo.scratchSemaphore = { + .semaphore = transferScratchSemaphore.get(), + .value = 0, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, + }; + + nbl::video::CAssetConverter::SConvertParams convertParams = {}; + convertParams.utilities = m_utils.get(); + convertParams.transfer = &transferSubmitInfo; + auto future = reservation.convert(convertParams); + if (future.copy() != IQueue::RESULT::SUCCESS) + { + logFail("CAssetConverter convert failed!"); + return; + } + + auto gpuBuffers = reservation.getGPUObjects(); + auto gpuBufferA = gpuBuffers[0].value; + const auto boundedMemA = gpuBufferA->getBoundMemory(); + auto cuMemMatA = m_cuDevice->importExternalMemory(core::smart_refctd_ptr(boundedMemA.memory)); + + auto gpuBufferB = gpuBuffers[1].value; + const auto boundedMemB = gpuBufferB->getBoundMemory(); + auto cuMemMatB = m_cuDevice->importExternalMemory( + core::smart_refctd_ptr(boundedMemB.memory)); + + std::array, 2> cmd; + auto commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, cmd.size(), cmd.data(), smart_refctd_ptr(m_logger)); + + const auto outputStagingBuffer = createStaging(vkBufferMatC->getSize()); + ISemaphore::SCreationParams semParams; semParams.externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32; semParams.initialValue = 0; @@ -515,12 +581,6 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica const auto 
cudaSemaphore = m_cuDevice->importExternalSemaphore(core::smart_refctd_ptr(semaphore)); if (!cudaSemaphore) logFail("Fail to import Vulkan Semaphore into CUDA!"); - - std::array, 2> cmd; - auto commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, cmd.size(), cmd.data(), smart_refctd_ptr(m_logger)); - - const auto outputStagingBuffer = createStaging(vkBufferMatC->getSize()); // Release ownership to CUDA { @@ -553,15 +613,10 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica // Launch CUDA kernel { CUdeviceptr matrixAPtr, matrixBPtr, matrixCPtr; - cuMemMatA->getMappedBuffer(&matrixAPtr); - cuMemMatB->getMappedBuffer(&matrixBPtr); + cuMemMatA->getMappedBuffer(&matrixAPtr, gpuBufferA->getSize(), gpuBufferA->getBoundMemory().offset); + cuMemMatB->getMappedBuffer(&matrixBPtr, gpuBufferB->getSize(), gpuBufferB->getBoundMemory().offset); cuMemMatC->getMappedBuffer(&matrixCPtr); - ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(matrixAPtr, cpuMatA.data(), matA_size, stream), m_cuHandler); - ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(matrixBPtr, cpuMatB.data(), matB_size, stream), m_cuHandler); - core::vector cpuMatC(ElementCount.x * ElementCount.y, 15); - ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(matrixCPtr, cpuMatC.data(), matC_size, stream), m_cuHandler); - void* parameters[] = { &matrixAPtr, &matrixBPtr, &matrixCPtr, (void*)&ElementCount.x, (void*)&ElementCount.y, (void*)&ElementCount.z }; @@ -569,9 +624,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } }; ASSERT_CUDA_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore_cu, &waitParams, 1, stream), m_cuHandler); - ASSERT_CUDA_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim.x, GridDim.y, 1, - BlockDim.x, BlockDim.y, 1, - 
0, stream, parameters, nullptr), m_cuHandler); + ASSERT_CUDA_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim.x, GridDim.y, 1, BlockDim.x, BlockDim.y, 1, 0, stream, parameters, nullptr), m_cuHandler); const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } }; ASSERT_CUDA_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore_cu, &signalParams, 1, stream), m_cuHandler); @@ -668,7 +721,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica const auto cudaMemory = m_cuDevice->createExportableMemory({ .size = BufferSize, .alignment = sizeof(float), .locationType = CU_MEM_LOCATION_TYPE_DEVICE }); if (!cudaMemory) logFail("Fail to create exportable memory!"); - auto tmpBuf = createExternalBuffer2(cudaMemory->getCreationParams().granularSize, IDeviceMemoryAllocation::EHT_OPAQUE_WIN32); + auto tmpBuf = createExternalBuffer(cudaMemory->getCreationParams().granularSize, IDeviceMemoryAllocation::EHT_OPAQUE_WIN32); escaped = cudaMemory->exportAsMemory(m_device.get(), tmpBuf.get()); if (!escaped) logFail("Fail to export CUDA memory!"); From ca8a8adb338c37c1ad7af5de22a201bc7e73e3f9 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 25 May 2026 11:17:18 +0700 Subject: [PATCH 38/47] Improve the comment on testVectorAddKernel --- 76_CudaInterop/main.cpp | 112 ++++++++++++++++++++++------------------ 1 file changed, 63 insertions(+), 49 deletions(-) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index adee6a671..59f31f081 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -179,6 +179,23 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica void testVectorAddKernel() { + // This function demonstrates bidirectional resource sharing between CUDA and Vulkan: + // + // Shared Resources: + // - 3 buffers: 2 input buffers + 1 output buffer for vector addition results + // - 1 semaphore for synchronization + // + // Memory Allocation Patterns: + // - Input 
buffers: Allocated by CUDA (CCUDADevice::createExportableMemory) → imported to Vulkan + // - Output buffer: Allocated by Vulkan → imported to CUDA (CCUDADevice::importExternalMemory) + // + // Synchronization: + // - Semaphore: Created by Vulkan → imported to CUDA + // - Demonstrates bidirectional signaling: CUDA signals → Vulkan waits, and vice versa + // + // Data Flow: + // - CUDA kernel writes to shared buffer → Vulkan reads the results + static constexpr uint32_t GridDim[3] = { 4096,1,1 }; static constexpr uint32_t BlockDim[3] = { 1024,1,1 }; static constexpr size_t NumElements = GridDim[0] * BlockDim[0]; @@ -189,8 +206,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica CUmodule module; ASSERT_CUDA_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr), m_cuHandler); - auto moduleCleanup = nbl::core::makeRAIIExiter([&]() - { + auto moduleCleanup = nbl::core::makeRAIIExiter([&]() { cu.pcuModuleUnload(module); }); @@ -224,28 +240,35 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica } constexpr auto InputCount = 2; - // // CUDA resources that we input to the kernel 'vectorAdd_kernel.cu' - // // Kernel writes to cudaInputMemories[2] which we later use to export and read on nabla side + // Create CUDA-allocated input buffers that will be exported to Vulkan + // This demonstrates the CUDA → Vulkan memory sharing pattern std::array, InputCount> cudaInputMemories = {}; std::array, InputCount> vulkanMemories = {}; std::array, InputCount> vulkanInputBuffers = {}; std::array, InputCount> inputStagingBuffers = {}; - for (auto input_i = 0; input_i < InputCount; input_i++) + auto initInputBuffers = [&] { - // create and allocate CUmem with CUDA and slap it inside a simple IReferenceCounted wrapper - cudaInputMemories[input_i] = m_cuDevice->createExportableMemory({ .size = BufferSize, .alignment = sizeof(float), .locationType = CU_MEM_LOCATION_TYPE_DEVICE }); - 
vulkanMemories[input_i] = cudaInputMemories[input_i]->exportAsMemory(m_device.get(), nullptr); - vulkanInputBuffers[input_i] = createExternalBuffer(vulkanMemories[input_i].get()); - inputStagingBuffers[input_i] = createStaging(BufferSize); - } + for (auto input_i = 0; input_i < InputCount; input_i++) + { + cudaInputMemories[input_i] = m_cuDevice->createExportableMemory({ .size = BufferSize, .alignment = sizeof(float), .locationType = CU_MEM_LOCATION_TYPE_DEVICE }); + vulkanMemories[input_i] = cudaInputMemories[input_i]->exportAsMemory(m_device.get(), nullptr); + vulkanInputBuffers[input_i] = createExternalBuffer(vulkanMemories[input_i].get()); + inputStagingBuffers[input_i] = createStaging(BufferSize); + } + }; + initInputBuffers(); + // Create Vulkan-allocated output buffer and import to CUDA + // This demonstrates the Vulkan → CUDA memory sharing pattern auto [outputBuf, cudaOutputMemory] = createSharedBuffer(BufferSize); + // Create timeline semaphore for cross-API synchronization + // Timeline values: 0=initial, 1=release vulkan output buffer ownership, 2=cuda kernel done, 3=copy done ISemaphore::SCreationParams semParams; semParams.initialValue = 0; semParams.externalHandleTypes = CCUDADevice::ExternalSemaphoreHandleType; - auto semaphore = m_device->createSemaphore(std::move(semParams)); + const auto semaphore = m_device->createSemaphore(std::move(semParams)); const auto cudaSemaphore = m_cuDevice->importExternalSemaphore(core::smart_refctd_ptr(semaphore)); if (!cudaSemaphore) logFail("Fail to import Vulkan Semaphore into CUDA!"); @@ -256,7 +279,8 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica const auto outputStagingBuffer = createStaging(BufferSize); - // First we record a release ownership transfer to let vulkan know that resources are going to be used in an external API + // === Phase 1: Vulkan releases ownership to external queue (CUDA) === + // Signal semaphore to value=1 after ownership transfer { const 
IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { .barrier = { @@ -297,8 +321,13 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica if (!re) logFail("Something went wrong readying resources for CUDA"); } - // Launch kernel + // === Phase 2: CUDA executes kernel === + // 1. Copy input data from CPU to CUDA device memory + // 2. Wait for semaphore value=1 (ownership released) + // 3. Launch vectorAdd kernel + // 4. Signal semaphore to value=2 (kernel complete) { + // Step 1 CUdeviceptr outputBufPtr; cudaOutputMemory->getMappedBuffer(&outputBufPtr); CUdeviceptr ptrs[] = { @@ -311,15 +340,22 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream), m_cuHandler); ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), BufferSize, stream), m_cuHandler); + // Step 2 CUexternalSemaphore semaphore = cudaSemaphore->getInternalObject(); const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } }; ASSERT_CUDA_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream), m_cuHandler); // Wait for release op from vulkan + + // Step 3 ASSERT_CUDA_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, stream, parameters, nullptr), m_cuHandler); + + // Step 4 const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } }; ASSERT_CUDA_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream), m_cuHandler); // Signal the imported semaphore } - - // After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA + + // === Phase 3: Vulkan acquires ownership and copies results === + // Wait for semaphore value=2, then copy output to staging buffer + // Signal semaphore 
to value=3 after copy completes { const IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { .barrier = { @@ -372,48 +408,28 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica } - struct CallbackContext + // === Phase 4: Validate the output buffer content === { - core::smart_refctd_ptr semaphore; - std::array, InputCount> cpuBuffers; - std::array, InputCount> inputStagingBuffers; - core::smart_refctd_ptr outputStagingBuffer; - core::smart_refctd_ptr device; - core::smart_refctd_ptr logger; - }; - - CallbackContext ctx; - ctx.semaphore = semaphore; - ctx.cpuBuffers = cpuBufs; - ctx.inputStagingBuffers = inputStagingBuffers; - ctx.outputStagingBuffer = outputStagingBuffer; - ctx.device = m_device; - ctx.logger = m_logger; - - auto cudaCallback = [](void* userData) - { - const auto* ctx = reinterpret_cast(userData); - // Make sure we are also done with the readback const auto wait = std::array{ ISemaphore::SWaitInfo{ - .semaphore = ctx->semaphore.get(), + .semaphore = semaphore.get(), .value = 3, } }; - ctx->device->blockForSemaphores(wait, true); + m_device->blockForSemaphores(wait, true); - auto* stagingMem = ctx->outputStagingBuffer->getBoundMemory().memory; + auto* stagingMem = outputStagingBuffer->getBoundMemory().memory; if (!stagingMem->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) { ILogicalDevice::MappedMemoryRange range(stagingMem, 0, stagingMem->getAllocationSize()); - ctx->device->invalidateMappedMemoryRanges(1, &range); + m_device->invalidateMappedMemoryRanges(1, &range); } - const auto* inputs1 = reinterpret_cast(ctx->cpuBuffers[0]->getPointer()); - const auto* inputs2 = reinterpret_cast(ctx->cpuBuffers[1]->getPointer()); + const auto* inputs1 = reinterpret_cast(cpuBufs[0]->getPointer()); + const auto* inputs2 = reinterpret_cast(cpuBufs[1]->getPointer()); - const auto* outputs = reinterpret_cast(ctx->outputStagingBuffer->getBoundMemory().memory->getMappedPointer()); + const auto* 
outputs = reinterpret_cast(outputStagingBuffer->getBoundMemory().memory->getMappedPointer()); for (auto elem_i = 0; elem_i < NumElements; elem_i++) { @@ -423,14 +439,12 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica const auto expected = input1 + input2; const auto diff = abs(output - expected); if (diff > 0.01) - ctx->logger->log("TestSharedResources: Element at index %d is incorrect!", ILogger::ELL_ERROR, elem_i); + m_logger->log("TestVectorAdd: Element at index %d is incorrect!", ILogger::ELL_ERROR, elem_i); } - ctx->logger->log("TestSharedResources Complete", ILogger::ELL_INFO); - }; + m_logger->log("TestVectorAdd Complete", ILogger::ELL_INFO); - ASSERT_CUDA_SUCCESS(cu.pcuLaunchHostFunc(stream, cudaCallback, &ctx), m_cuHandler); - ASSERT_CUDA_SUCCESS(cu.pcuStreamSynchronize(stream), m_cuHandler); + } } From f07899c26a04fdd73d0d8ef79a092be6b25af400 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 25 May 2026 12:16:11 +0700 Subject: [PATCH 39/47] Proper resource cleanup for testWmmaGemB1 --- 76_CudaInterop/main.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index 59f31f081..ef0f6477a 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -464,12 +464,19 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica auto& cu = m_cuHandler->getCUDAFunctionTable(); CUmodule module; - CUfunction kernel; - CUstream stream; - ASSERT_CUDA_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr), m_cuHandler); + auto moduleCleanup = nbl::core::makeRAIIExiter([&]() { + cu.pcuModuleUnload(module); + }); + + CUfunction kernel; ASSERT_CUDA_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "b1_wmma_gemm_kernel"), m_cuHandler); + + CUstream stream; ASSERT_CUDA_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), m_cuHandler); + auto streamCleanup = nbl::core::makeRAIIExiter([&] { + 
cu.pcuStreamDestroy_v2(stream); + }); // Calculate buffer sizes (bits packed into uint32_t) const size_t matA_size = (ElementCount.x * ElementCount.z) / 32 * sizeof(uint32_t); // M x K bits From 7ead24c7d5335f5c3bf71c11c25ebaa86fa6a8c5 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 25 May 2026 15:28:25 +0700 Subject: [PATCH 40/47] Use SyncPoint* constant instead of magic number --- 76_CudaInterop/main.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index ef0f6477a..b717a354d 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -265,8 +265,12 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica // Create timeline semaphore for cross-API synchronization // Timeline values: 0=initial, 1=release vulkan output buffer ownership, 2=cuda kernel done, 3=copy done + static constexpr uint64_t SyncPointInitial = 0; + static constexpr uint64_t SyncPointReleased = 1; + static constexpr uint64_t SyncPointKernelDone = 2; + static constexpr uint64_t SyncPointCopyDone = 3; ISemaphore::SCreationParams semParams; - semParams.initialValue = 0; + semParams.initialValue = SyncPointInitial; semParams.externalHandleTypes = CCUDADevice::ExternalSemaphoreHandleType; const auto semaphore = m_device->createSemaphore(std::move(semParams)); const auto cudaSemaphore = m_cuDevice->importExternalSemaphore(core::smart_refctd_ptr(semaphore)); @@ -308,7 +312,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = semaphore.get(), - .value = 1, + .value = SyncPointReleased, .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, }; const IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[0].get() }; @@ -342,14 +346,14 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica // Step 2 CUexternalSemaphore semaphore = 
cudaSemaphore->getInternalObject(); - const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } }; + const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = SyncPointReleased } } }; ASSERT_CUDA_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream), m_cuHandler); // Wait for release op from vulkan // Step 3 ASSERT_CUDA_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, stream, parameters, nullptr), m_cuHandler); // Step 4 - const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } }; + const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = SyncPointKernelDone } } }; ASSERT_CUDA_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream), m_cuHandler); // Signal the imported semaphore } @@ -387,12 +391,12 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica const IQueue::SSubmitInfo::SSemaphoreInfo waitInfo= { .semaphore = semaphore.get(), - .value = 2, + .value = SyncPointKernelDone, .stageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, }; const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = semaphore.get(), - .value = 3, + .value = SyncPointCopyDone, .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, }; const IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[1].get() }; @@ -414,7 +418,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica const auto wait = std::array{ ISemaphore::SWaitInfo{ .semaphore = semaphore.get(), - .value = 3, + .value = SyncPointCopyDone, } }; m_device->blockForSemaphores(wait, true); From 78835314611a9933344359e6a8b28cee02717f8c Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 25 May 2026 15:29:08 +0700 Subject: [PATCH 41/47] Refactor testWmmaGemmb1 --- 76_CudaInterop/main.cpp | 166 
++++++++++++++++++++++++---------------- 1 file changed, 102 insertions(+), 64 deletions(-) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index b717a354d..d9d3138af 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -454,6 +454,32 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica void testWmmaGemB1() { + // This function demonstrates a key advantage of CUDA-Vulkan interoperability: + // accessing CUDA-exclusive hardware features that Vulkan cannot natively support. + // + // WMMA (Warp Matrix Multiply-Accumulate) with b1 (1-bit) primitives leverages + // specialized Tensor Core instructions for ultra-efficient binary matrix operations. + // Since Vulkan lacks native support for 1-bit matrix operations, this test showcases + // how applications can: + // 1. Allocate and manage matrices using Vulkan's memory system + // 2. Share those buffers with CUDA via external memory handles + // 3. Execute CUDA-exclusive Tensor Core operations (b1 WMMA GEMM) + // 4. Retrieve results back to Vulkan for further GPU processing or readback + // + // Test methodology: + // - Matrix A (M×K): 1-bit reverse diagonal matrix (1s on anti-diagonal, 0s elsewhere) + // - Matrix B (K×N): 1-bit random matrix + // - Matrix C (M×N): Result stored as int32s (popcount of bitwise AND per row/col pair) + // + // Verification strategy: + // Multiplying a reverse diagonal matrix by any matrix B produces a result where each + // column of B is reversed. 
This makes verification trivial: C[i,j] should equal B[K-1-i, j] + // Example with K=4: + // [0 0 0 1] [b00 b01] [b30 b31] + // [0 0 1 0] × [b10 b11] = [b20 b21] + // [0 1 0 0] [b20 b21] [b10 b11] + // [1 0 0 0] [b30 b31] [b00 b01] + // b1 WMMA dimensions: M=8, N=8, K=128 constexpr auto WmmaSize = uint32_t3{ 8, 8, 128 }; constexpr auto TileCount = uint32_t3{ 128, 128, 8 }; // Adjust for b1 dimensions @@ -463,6 +489,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica (ElementCount.x + WmmaSize.x - 1) / WmmaSize.x, // M tiles (ElementCount.y + WmmaSize.y - 1) / WmmaSize.y // N tiles ); + static constexpr auto BitsPerUint32 = 32; const auto ptx = compilePtx("app_resources/wmmaGemm_b1_kernel.cu"); auto& cu = m_cuHandler->getCUDAFunctionTable(); @@ -483,11 +510,16 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica }); // Calculate buffer sizes (bits packed into uint32_t) - const size_t matA_size = (ElementCount.x * ElementCount.z) / 32 * sizeof(uint32_t); // M x K bits - const size_t matB_size = (ElementCount.z * ElementCount.y) / 32 * sizeof(uint32_t); // K x N bits + const size_t matA_size = (ElementCount.x * ElementCount.z) / BitsPerUint32 * sizeof(uint32_t); // M x K bits + const size_t matB_size = (ElementCount.z * ElementCount.y) / BitsPerUint32 * sizeof(uint32_t); // K x N bits const size_t matC_size = ElementCount.x * ElementCount.y * sizeof(int32_t); // M x N ints auto [vkBufferMatC, cuMemMatC] = createSharedBuffer(matC_size); + if (!vkBufferMatC || !cuMemMatC) + { + logFail("Failed to create shared buffer for matrix C"); + return; + } ICPUBuffer::SCreationParams cpuBufferParamsA; cpuBufferParamsA.size = matA_size; @@ -506,13 +538,12 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica { // Fill cpuMatA with reverse diagonal pattern std::fill(cpuMatA.begin(), cpuMatA.end(), 0); - for (int i = 0; i < ElementCount.x; i++) { auto j = ElementCount.z - 1 - i; auto 
bitIdx = i * ElementCount.z + j; - auto wordIdx = bitIdx / 32; - auto bitOffset = bitIdx % 32; + auto wordIdx = bitIdx / BitsPerUint32; + auto bitOffset = bitIdx % BitsPerUint32; cpuMatA[wordIdx] |= (1u << bitOffset); } cpuBufferA->setContentHash(cpuBufferA->computeContentHash()); @@ -599,9 +630,13 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica const auto outputStagingBuffer = createStaging(vkBufferMatC->getSize()); + static constexpr uint64_t SyncPointInitial = 0; + static constexpr uint64_t SyncPointReleased = 1; + static constexpr uint64_t SyncPointKernelDone = 2; + static constexpr uint64_t SyncPointCopyDone = 3; ISemaphore::SCreationParams semParams; semParams.externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32; - semParams.initialValue = 0; + semParams.initialValue = SyncPointInitial; auto semaphore = m_device->createSemaphore(std::move(semParams)); const auto cudaSemaphore = m_cuDevice->importExternalSemaphore(core::smart_refctd_ptr(semaphore)); if (!cudaSemaphore) @@ -626,7 +661,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica cmd[0]->end(); const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { - .semaphore = semaphore.get(), .value = 1, .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, + .semaphore = semaphore.get(), .value = SyncPointReleased, .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, }; const IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[0].get() }; const IQueue::SSubmitInfo submitInfo = { @@ -646,12 +681,12 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica (void*)&ElementCount.x, (void*)&ElementCount.y, (void*)&ElementCount.z }; CUexternalSemaphore semaphore_cu = cudaSemaphore->getInternalObject(); - const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } }; + const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = SyncPointReleased } } }; 
ASSERT_CUDA_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore_cu, &waitParams, 1, stream), m_cuHandler); ASSERT_CUDA_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim.x, GridDim.y, 1, BlockDim.x, BlockDim.y, 1, 0, stream, parameters, nullptr), m_cuHandler); - const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } }; + const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = SyncPointKernelDone } } }; ASSERT_CUDA_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore_cu, &signalParams, 1, stream), m_cuHandler); } @@ -676,10 +711,10 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica cmd[1]->end(); const IQueue::SSubmitInfo::SSemaphoreInfo waitInfo = { - .semaphore = semaphore.get(), .value = 2, .stageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, + .semaphore = semaphore.get(), .value = SyncPointKernelDone, .stageMask = PIPELINE_STAGE_FLAGS::COPY_BIT, }; const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { - .semaphore = semaphore.get(), .value = 3, .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, + .semaphore = semaphore.get(), .value = SyncPointCopyDone, .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, }; const IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[1].get() }; const IQueue::SSubmitInfo submitInfo = { @@ -691,10 +726,11 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica } // Wait and verify results - const auto wait = std::array{ ISemaphore::SWaitInfo{.semaphore = semaphore.get(), .value = 3} }; - m_device->blockForSemaphores(wait, true); + { + const auto wait = std::array{ ISemaphore::SWaitInfo{.semaphore = semaphore.get(), .value = SyncPointCopyDone} }; + m_device->blockForSemaphores(wait, true); - auto* stagingMem = outputStagingBuffer->getBoundMemory().memory; + auto* stagingMem = outputStagingBuffer->getBoundMemory().memory; if 
(!stagingMem->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) { ILogicalDevice::MappedMemoryRange range(stagingMem, 0, stagingMem->getAllocationSize()); @@ -713,77 +749,78 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica // The calculation below is to get the index of cpuMatB if the column is reversed to get the expected bit. const auto row = i / ElementCount.y; const auto col = i % ElementCount.y; - const auto expectedCol = col; - const auto expectedRow = ElementCount.z - row - 1; - const auto expectedIdx = expectedCol * ElementCount.z + expectedRow; - const auto expectedWordIdx = expectedIdx / 32; - const auto expectedBitOffset = expectedIdx % 32; - return (cpuMatB[expectedWordIdx] >> expectedBitOffset) & uint32_t(1); - }(); - const auto result = results[i]; - if (result != expected) { - m_logger->log("WMMA b1 test error at [%d]: GPU=%d, CPU=%d", - system::ILogger::ELL_ERROR, i, results[i], expected); - errors++; - success = false; + const auto expectedCol = col; + const auto expectedRow = ElementCount.z - row - 1; + const auto expectedIdx = expectedCol * ElementCount.z + expectedRow; + const auto expectedWordIdx = expectedIdx / BitsPerUint32; + const auto expectedBitOffset = expectedIdx % BitsPerUint32; + return (cpuMatB[expectedWordIdx] >> expectedBitOffset) & uint32_t(1); + }(); + const auto result = results[i]; + if (result != expected) { + m_logger->log("WMMA b1 test error at [%d]: GPU=%d, CPU=%d", + system::ILogger::ELL_ERROR, i, results[i], expected); + errors++; + success = false; + constexpr int MaxErrorsToReport = 10; + if (errors == MaxErrorsToReport) break; + } } + + if (success) + m_logger->log("b1 WMMA test PASSED!", system::ILogger::ELL_INFO); + else + m_logger->log("b1 WMMA test FAILED with %d errors!", system::ILogger::ELL_ERROR, errors); } - - if (success) - m_logger->log("b1 WMMA test PASSED!", system::ILogger::ELL_INFO); - else - m_logger->log("b1 WMMA test FAILED with %d 
errors!", system::ILogger::ELL_ERROR, errors); } void testDestruction() { + + // Tests proper resource lifetime management across CUDA-Vulkan interop by creating exportable CUDA memory, + // copying data to it, then destroying the CUDA memory object while keeping the exported Vulkan memory alive. + // Verifies that the exported memory remains valid and accessible after the original CUDA object is destroyed, + // confirming correct reference counting and external memory handle semantics. + auto commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); constexpr auto ElementCount = 1024; constexpr auto BufferSize = ElementCount * sizeof(int); + + // Construct testData + core::vector testData(ElementCount); + std::iota(testData.begin(), testData.end(), 0); + auto& cu = m_cuHandler->getCUDAFunctionTable(); + + // This vulkan memory will outlive the CUDA memory object below smart_refctd_ptr escaped; { + // Create exportable CUDA memory - this object will be destroyed at the end of this scope const auto cudaMemory = m_cuDevice->createExportableMemory({ .size = BufferSize, .alignment = sizeof(float), .locationType = CU_MEM_LOCATION_TYPE_DEVICE }); if (!cudaMemory) logFail("Fail to create exportable memory!"); - auto tmpBuf = createExternalBuffer(cudaMemory->getCreationParams().granularSize, IDeviceMemoryAllocation::EHT_OPAQUE_WIN32); - escaped = cudaMemory->exportAsMemory(m_device.get(), tmpBuf.get()); + // Export CUDA memory as Vulkan device memory - this reference will persist + escaped = cudaMemory->exportAsMemory(m_device.get()); if (!escaped) logFail("Fail to export CUDA memory!"); - - auto staging = createStaging(BufferSize); - - auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer(); - for (uint32_t i = 0; i < ElementCount; ++i) - ptr[i] = i; - - ISemaphore::SCreationParams semParams; - semParams.initialValue = 0; - const auto semaphore = 
m_device->createSemaphore(std::move(semParams)); - IQueue::SSubmitInfo::SSemaphoreInfo semInfo; - semInfo.semaphore = semaphore.get(); - semInfo.stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS; - semInfo.value = 1; - - smart_refctd_ptr cmdBuffer; - commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmdBuffer); - cmdBuffer->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - IGPUCommandBuffer::SBufferCopy region = { .size = BufferSize }; - assert(cmdBuffer->copyBuffer(staging.get(), tmpBuf.get(), 1, ®ion)); - cmdBuffer->end(); - IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmdBuffer.get() }; - const IQueue::SSubmitInfo submitInfo = { - .commandBuffers = {&cmdInfo, &cmdInfo + 1}, - .signalSemaphores = {&semInfo, 1} - }; - auto qre = queue->submit({ &submitInfo, &submitInfo + 1 }); - assert(IQueue::RESULT::SUCCESS == qre); - m_device->waitIdle(); - } + + // Copy testData into cudaMemory + CUstream stream; + ASSERT_CUDA_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), m_cuHandler); + auto streamCleanup = nbl::core::makeRAIIExiter([&] { + cu.pcuStreamDestroy_v2(stream); + }); + ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(cudaMemory->getDeviceptr(), testData.data(), BufferSize, stream), m_cuHandler); + ASSERT_CUDA_SUCCESS(cu.pcuStreamSynchronize(stream), m_cuHandler); + + } + // CRITICAL: cudaMemory object destroyed here, but escaped memory should remain valid { + // Re-import the exported memory - this tests if the memory survived CUDA object destruction auto tmpBuf = createExternalBuffer(escaped.get()); auto staging = createStaging(BufferSize); + // Setup synchronization for readback ISemaphore::SCreationParams semParams; semParams.initialValue = 0; const auto semaphore = m_device->createSemaphore(std::move(semParams)); @@ -792,6 +829,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica semInfo.stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS; semInfo.value = 1; + // Copy data 
back from the persistent buffer to staging for verification smart_refctd_ptr cmd; commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd); cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); From 3cd947c256390e44e8212ba9506fcfc1974c26cd Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 25 May 2026 15:40:03 +0700 Subject: [PATCH 42/47] Misc refactor on testWmmaGemmb1 --- 76_CudaInterop/main.cpp | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index d9d3138af..e06bfd970 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -737,14 +737,13 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica m_device->invalidateMappedMemoryRanges(1, &range); } - const auto* results = reinterpret_cast(stagingMem->getMappedPointer()); - - // Verify results - bool success = true; - int errors = 0; - for (int i = 0; i < ElementCount.x * ElementCount.y; i++) { - const auto expected = [&] - { + const auto* results = reinterpret_cast(stagingMem->getMappedPointer()); + + // Verify results + int errorCount = 0; + for (int i = 0; i < ElementCount.x * ElementCount.y; i++) { + const auto expected = [&] + { // Since we are multiplying reverse diagonal matrix to matrixB. The result should be matrix b but each column reversed. // The calculation below is to get the index of cpuMatB if the column is reversed to get the expected bit. 
const auto row = i / ElementCount.y; @@ -760,17 +759,16 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica if (result != expected) { m_logger->log("WMMA b1 test error at [%d]: GPU=%d, CPU=%d", system::ILogger::ELL_ERROR, i, results[i], expected); - errors++; - success = false; + errorCount++; constexpr int MaxErrorsToReport = 10; - if (errors == MaxErrorsToReport) break; + if (errorCount == MaxErrorsToReport) break; } } - if (success) + if (errorCount == 0) m_logger->log("b1 WMMA test PASSED!", system::ILogger::ELL_INFO); else - m_logger->log("b1 WMMA test FAILED with %d errors!", system::ILogger::ELL_ERROR, errors); + m_logger->log("b1 WMMA test FAILED with %d errors!", system::ILogger::ELL_ERROR, errorCount); } } From 5110e9b9f0074b0c1ab3a75c215b76016010cb0a Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 25 May 2026 16:26:18 +0700 Subject: [PATCH 43/47] Remove testLargeAllocations --- 76_CudaInterop/main.cpp | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index e06bfd970..ac7182369 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -97,7 +97,6 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica testVectorAddKernel(); testWmmaGemB1(); testDestruction(); - testLargeAllocations(); return true; } @@ -854,22 +853,6 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica } - void testLargeAllocations() - { - // TODO(kevin): Calculate BufferSize that is big enough to fill the machine VRAM - constexpr auto BufferSize = 1024; - IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = { - .size = BufferSize, - .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(), - .alignmentLog2 = 10, - }; - - for (size_t i = 0; i < (1 << 8); ++i) - { - auto memory = m_device->allocate(reqs, { nullptr, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, 
CCUDADevice::ExternalMemoryHandleType }).memory; - assert(memory); - auto tmpBuf = createExternalBuffer(memory.get()); - } } // Whether to keep invoking the above. In this example because its headless GPU compute, we do all the work in the app initialization. From 347fae0d4a7480748126355e94fb0ff21193f6fd Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 25 May 2026 16:26:46 +0700 Subject: [PATCH 44/47] Remove unused method --- 76_CudaInterop/main.cpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index ac7182369..63062e1f8 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -101,15 +101,6 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica return true; } - smart_refctd_ptr createExternalBuffer(uint64_t size, core::bitflag externalHandleTypes) - { - IGPUBuffer::SCreationParams params = {}; - params.size = size; - params.usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT; - params.externalHandleTypes = externalHandleTypes; - auto buf = m_device->createBuffer(std::move(params)); - return buf; - } smart_refctd_ptr createExternalBuffer(IDeviceMemoryAllocation* mem) { From 1aebc5a6e6f0e365aafe1e2ac4c742d7c4d3b6aa Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 25 May 2026 16:27:10 +0700 Subject: [PATCH 45/47] Misc refactor on testDestruction --- 76_CudaInterop/main.cpp | 46 +++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index 63062e1f8..6b0a5931d 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -101,7 +101,6 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica return true; } - smart_refctd_ptr createExternalBuffer(IDeviceMemoryAllocation* mem) { IGPUBuffer::SCreationParams params = {}; @@ -721,11 +720,11 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, 
BuiltinResourcesApplica m_device->blockForSemaphores(wait, true); auto* stagingMem = outputStagingBuffer->getBoundMemory().memory; - if (!stagingMem->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) - { - ILogicalDevice::MappedMemoryRange range(stagingMem, 0, stagingMem->getAllocationSize()); - m_device->invalidateMappedMemoryRanges(1, &range); - } + if (!stagingMem->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + { + ILogicalDevice::MappedMemoryRange range(stagingMem, 0, stagingMem->getAllocationSize()); + m_device->invalidateMappedMemoryRanges(1, &range); + } const auto* results = reinterpret_cast(stagingMem->getMappedPointer()); @@ -734,10 +733,10 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica for (int i = 0; i < ElementCount.x * ElementCount.y; i++) { const auto expected = [&] { - // Since we are multiplying reverse diagonal matrix to matrixB. The result should be matrix b but each column reversed. - // The calculation below is to get the index of cpuMatB if the column is reversed to get the expected bit. - const auto row = i / ElementCount.y; - const auto col = i % ElementCount.y; + // Since we are multiplying reverse diagonal matrix to matrixB. The result should be matrix b but each column reversed. + // The calculation below is to get the index of cpuMatB if the column is reversed to get the expected bit. 
+ const auto row = i / ElementCount.y; + const auto col = i % ElementCount.y; const auto expectedCol = col; const auto expectedRow = ElementCount.z - row - 1; const auto expectedIdx = expectedCol * ElementCount.z + expectedRow; @@ -808,14 +807,17 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica auto tmpBuf = createExternalBuffer(escaped.get()); auto staging = createStaging(BufferSize); + // Setup synchronization for readback ISemaphore::SCreationParams semParams; semParams.initialValue = 0; const auto semaphore = m_device->createSemaphore(std::move(semParams)); + static constexpr auto SyncPointCopyDone = 1; + IQueue::SSubmitInfo::SSemaphoreInfo semInfo; semInfo.semaphore = semaphore.get(); semInfo.stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS; - semInfo.value = 1; + semInfo.value = SyncPointCopyDone; // Copy data back from the persistent buffer to staging for verification smart_refctd_ptr cmd; @@ -832,20 +834,34 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica auto qre = queue->submit({ &submitInfo, &submitInfo + 1 }); assert(IQueue::RESULT::SUCCESS == qre); + ISemaphore::SWaitInfo waitInfo = { + .semaphore = semaphore.get(), + .value = SyncPointCopyDone + }; + m_device->blockForSemaphores({ &waitInfo, 1 }); m_device->waitIdle(); + // Verify the data remains intact after CUDA object destruction auto& ptr = *(std::array*)staging->getBoundMemory().memory->getMappedPointer(); + auto errorCount = 0; + static const auto MaxErrorCount = 10; for (uint32_t i = 0; i < ElementCount; ++i) { - if (ptr[i] != i) logFail("Test Destruction: Element %d is incorrect", i); + if (ptr[i] != testData[i]) { + logFail("Destruction test error at [%d]: value=%d, expected=%d", i, ptr[i], testData[i]); + errorCount++; + if (errorCount == MaxErrorCount) break; + } } - m_logger->log("Test Destruction complete", ILogger::ELL_INFO); + + if (errorCount == 0) + m_logger->log("Destruction test PASSED!", 
system::ILogger::ELL_INFO); + else + m_logger->log("Destruction test FAILED with %d errors!", system::ILogger::ELL_ERROR, errorCount); } } - } - // Whether to keep invoking the above. In this example because its headless GPU compute, we do all the work in the app initialization. bool keepRunning() override { return false; } From 08d594c79bea5917f6380393c0bcde78a5c2a3ec Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 25 May 2026 16:55:45 +0700 Subject: [PATCH 46/47] Fix the acquire barrier --- 76_CudaInterop/main.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp index 6b0a5931d..612b01ba2 100644 --- a/76_CudaInterop/main.cpp +++ b/76_CudaInterop/main.cpp @@ -277,10 +277,6 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica { const IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { .barrier = { - .dep = { - .srcStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, - .srcAccessMask = ACCESS_FLAGS::MEMORY_WRITE_BITS, - }, .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE, .otherQueueFamilyIndex = IQueue::FamilyExternal, }, @@ -635,10 +631,6 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica { const IGPUCommandBuffer::SBufferMemoryBarrier bufBarrier = { .barrier = { - .dep = { - .srcStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS, - .srcAccessMask = ACCESS_FLAGS::MEMORY_WRITE_BITS, - }, .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE, .otherQueueFamilyIndex = IQueue::FamilyExternal, }, From a4b1f5ffcfee386a0b020db1fc3fe4481a365b4e Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 25 May 2026 16:56:30 +0700 Subject: [PATCH 47/47] Remove unnecessary comment --- 76_CudaInterop/app_resources/wmmaGemm_b1_kernel.cu | 3 --- 1 file changed, 3 deletions(-) diff --git a/76_CudaInterop/app_resources/wmmaGemm_b1_kernel.cu b/76_CudaInterop/app_resources/wmmaGemm_b1_kernel.cu index 
56d376fae..ef6ccad12 100644 --- a/76_CudaInterop/app_resources/wmmaGemm_b1_kernel.cu +++ b/76_CudaInterop/app_resources/wmmaGemm_b1_kernel.cu @@ -35,12 +35,9 @@ extern "C" __global__ void b1_wmma_gemm_kernel(int* a, int* b, int* c, int bRow = i / 32; int bCol = warpN * WMMA_N; - // Load fragments - // Note: load_matrix_sync handles the bit-packing layout internally wmma::load_matrix_sync(a_frag, a + (aRow * lda / 32 + aCol), lda); wmma::load_matrix_sync(b_frag, b + (bCol * ldb / 32 + bRow), ldb); - // Perform XOR-Popcount MMA wmma::bmma_sync(acc_frag, a_frag, b_frag, acc_frag, wmma::experimental::bmmaBitOpAND); }