From f22f11ca4f05a4c4efa3ba9e10ba118e3a1ae3df Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Tue, 3 Mar 2026 14:26:18 +0700
Subject: [PATCH 01/47] Initial implementation of CUDA interop unit test

---
 76_CudaInterop/CMakeLists.txt                 |  24 +
 .../app_resources/vectorAdd_kernel.cu         |  42 ++
 76_CudaInterop/main.cpp                       | 543 ++++++++++++++++++
 CMakeLists.txt                                |   1 +
 4 files changed, 610 insertions(+)
 create mode 100644 76_CudaInterop/CMakeLists.txt
 create mode 100644 76_CudaInterop/app_resources/vectorAdd_kernel.cu
 create mode 100644 76_CudaInterop/main.cpp

diff --git a/76_CudaInterop/CMakeLists.txt b/76_CudaInterop/CMakeLists.txt
new file mode 100644
index 000000000..bc1624875
--- /dev/null
+++ b/76_CudaInterop/CMakeLists.txt
@@ -0,0 +1,24 @@
+include(common RESULT_VARIABLE RES)
+if(NOT RES)
+	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
+endif()
+# Create this example's executable via the project helper (presumably it defines EXECUTABLE_NAME, used below - confirm in common.cmake).
+nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
+# Only when NBL_EMBED_BUILTIN_RESOURCES is set: register every file under app_resources/ as a builtin resource of the executable.
+if(NBL_EMBED_BUILTIN_RESOURCES)
+	set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
+	set(RESOURCE_DIR "app_resources")
+	# Absolute paths handed to the helper macros: the search root (this source dir) and the output dirs under the binary dir.
+	get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
+	get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
+	get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)
+	# Collect all files below app_resources/ (paths relative to that directory) and append each one to RESOURCES_TO_EMBED.
+    file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
+    foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
+      LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
+    endforeach()
+	# Define the resource target; "nbl::this_example::builtin" is presumably the C++ namespace of the generated accessors - confirm in the macro.
+	ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
+	# Attach the resource target to the example executable.
+	LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
+endif()
\ No newline at end of file
diff --git a/76_CudaInterop/app_resources/vectorAdd_kernel.cu b/76_CudaInterop/app_resources/vectorAdd_kernel.cu
new file mode 100644
index 000000000..3baef0123
--- /dev/null
+++ b/76_CudaInterop/app_resources/vectorAdd_kernel.cu
@@ -0,0 +1,42 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * CUDA Kernel Device code
+ *
+ * Computes the vector addition of A and B into C. The 3 vectors have the same
+ * number of elements numElements.
+ */
+
+extern "C" __global__ void vectorAdd(const float *A, const float *B, float *C,
+                                     int numElements) {
+  // Grid-stride loop with 64-bit indices: correct for any launch size and immune to
+  // 32-bit wrap-around of the thread index. A non-positive numElements is a no-op.
+  const long long stride = (long long)gridDim.x * blockDim.x;
+  for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x; i < numElements; i += stride)
+    C[i] = A[i] + B[i];
+}
\ No newline at end of file
diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
new file mode 100644
index 000000000..85d10ad13
--- /dev/null
+++ b/76_CudaInterop/main.cpp
@@ -0,0 +1,543 @@
+#include "nbl/video/CCUDAHandler.h"
+// #include "nbl/video/CCUDASharedMemory.h"
+// #include "nbl/video/CCUDASharedSemaphore.h"
+
+#include "nbl/application_templates/MonoDeviceApplication.hpp"
+#include "nbl/examples/common/BuiltinResourcesApplication.hpp"
+
+using namespace nbl;
+using namespace core;
+using namespace system;
+using namespace asset;
+using namespace video;
+
+/*
+Helpers that turn CUDA driver API and NVRTC status codes into log messages.
+They return false on failure so the ASSERT_SUCCESS* macros below can assert on the result.
+*/
+
+bool check_cuda_err(cudaError_enum err, auto& cu, auto& logger, auto file, auto line)
+{
+    if (auto re = err; CUDA_SUCCESS != re) 
+    {
+        const char* name = 0, * str = 0;
+        cu.pcuGetErrorName(re, &name);
+        cu.pcuGetErrorString(re, &str);
+        logger->log("%s:%d %s:\n\t%s\n", system::ILogger::ELL_ERROR, file, line, name, str);
+        return false;
+    }
+    return true;
+}
+
+bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto line, std::string const& log)
+{
+    if (auto re = err; NVRTC_SUCCESS != re) 
+    {
+        const char* str = cudaHandler->getNVRTCFunctionTable().pnvrtcGetErrorString(re); 
+        logger->log("%s:%d %s\n%s\n", system::ILogger::ELL_ERROR, file, line, str, log.c_str());
+        return false;
+    }
+    return true;
+}
+// Evaluate `expr`, log any failure, then assert() on the result (with NDEBUG the assert vanishes, so failures are only logged). They expect `cu` / `cudaHandler` and `m_logger` to be in scope at the call site.
+#define ASSERT_SUCCESS(expr) { auto re = check_cuda_err((expr), cu, m_logger, __FILE__, __LINE__); assert(re); }
+#define ASSERT_SUCCESS_NV(expr, log) { auto re = check_nv_err((expr), cudaHandler, m_logger, __FILE__, __LINE__, log); assert(re); }
+
+
+using namespace nbl::core;
+using namespace nbl::hlsl;
+using namespace nbl::system;
+using namespace nbl::asset;
+using namespace nbl::video;
+using namespace nbl::examples;
+using namespace nbl::application_templates;
+
+class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplication
+{
+    using device_base_t = MonoDeviceApplication;
+    using asset_base_t = BuiltinResourcesApplication;
+
+    static constexpr uint32_t gridDim[3] = { 4096,1,1 };
+    static constexpr uint32_t blockDim[3] = { 1024,1,1 };
+    static constexpr size_t numElements = gridDim[0] * blockDim[0];
+    static constexpr size_t size = sizeof(float) * numElements;
+
+public:
+    // Multiple inheritance means base ctors cannot simply be forwarded, so system::IApplicationFramework is initialized explicitly here with all four working directories
+    CUDA2VKApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
+        system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
+
+    smart_refctd_ptr<CCUDAHandler> cudaHandler;
+    smart_refctd_ptr<CCUDADevice> cudaDevice;
+
+    IQueue* queue;
+
+    // CPU memory which we fill with random numbers in [0,1] that will be copied to corresponding cudaMemory
+    std::array<smart_refctd_ptr<ICPUBuffer>, 2> cpuBufs;
+    // // CUDA resources that we input to the kernel 'vectorAdd_kernel.cu'
+    // // Kernel writes to cudaMemories[2] which we later use to export and read on nabla side
+    // std::array<smart_refctd_ptr<CCUDASharedMemory>, 3> cudaMemories = {};
+    // // A semaphore created in CUDA which will alias a Nabla semaphore to help sync between the CUDA kernel and Nabla device to host transfer
+    // smart_refctd_ptr<CCUDASharedSemaphore> cudaSemaphore;
+
+    // our Buffer that is bound to cudaMemories[2]
+    smart_refctd_ptr<IGPUBuffer> importedBuf;
+    // our Image that is also bound to cudaMemories[2]
+    smart_refctd_ptr<IGPUImage> importedImg;
+
+    // host visible buffers that we use to copy from the resources above after CUDA kernel is done writing
+    smart_refctd_ptr<IGPUBuffer> stagingBufs[2];
+
+    // Nabla semaphore for sync
+    smart_refctd_ptr<ISemaphore> semaphore;
+
+    smart_refctd_ptr<IGPUCommandPool> commandPool;
+    smart_refctd_ptr<IGPUCommandBuffer> cmd[2];
+
+    // a device filter helps you create a set of physical devices that satisfy your requirements in terms of features, limits etc.
+    virtual void filterDevices(core::set<video::IPhysicalDevice*>& physicalDevices) const
+    {
+        // start from the base class' selection, then drop every device CUDA did not report (matched by the 16-byte device UUID)
+        device_base_t::filterDevices(physicalDevices);
+        const auto& cudaDevices = cudaHandler->getAvailableDevices();
+        const auto unknownToCuda = [&cudaDevices](auto physDev) { return std::none_of(cudaDevices.begin(), cudaDevices.end(), [physDev](auto& candidate) { return 0 == memcmp(physDev->getProperties().deviceUUID, &candidate.uuid, 16); }); };
+        std::erase_if(physicalDevices, unknownToCuda);
+    }
+
+    bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+    {
+        // Remember to call the base class initialization!
+        if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system)))
+            return false;
+        // cudaHandler must exist before device_base_t initializes: filterDevices() dereferences it (presumably called during device selection)
+        cudaHandler = CCUDAHandler::create(m_system.get(), smart_refctd_ptr<ILogger>(m_logger));
+        if (!cudaHandler) 
+            return logFail("Could not create a CUDA handler!");
+
+        if (!device_base_t::onAppInitialized(std::move(system)))
+            return false;
+
+        cudaDevice = cudaHandler->createDevice(smart_refctd_ptr_dynamic_cast<CVulkanConnection>(m_api), m_physicalDevice);
+        if (!cudaDevice) 
+            return logFail("Could not create a CUDA Device!");
+
+        queue = device_base_t::getComputeQueue();
+        createResources();
+
+        smart_refctd_ptr<ICPUBuffer> ptx;
+        {
+            IAssetLoader::SAssetLoadParams lp = {};
+            lp.logger = m_logger.get();
+            lp.workingDirectory = ""; // virtual root
+            // load the CUDA kernel source through the asset manager and compile it to PTX at runtime
+            auto assetBundle = m_assetMgr->getAsset("app_resources/vectorAdd_kernel.cu", lp);
+            const auto assets = assetBundle.getContents();
+            if (assets.empty())
+                return logFail("Could not load kernel!");
+            smart_refctd_ptr<ICPUBuffer> source = IAsset::castDown<ICPUBuffer>(assets[0]);
+            if (!source) // castDown presumably yields null when the asset is not an ICPUBuffer; never dereference it blindly
+                return logFail("Kernel asset is not a buffer!");
+            std::string log;
+            auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(std::string((const char*)source->getPointer(), source->getSize()), 
+                "app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), 0, 0, 0, &log);
+            ASSERT_SUCCESS_NV(res, log);
+            ptx = std::move(ptx_);
+            if (!ptx) // assert() vanishes under NDEBUG, so a failed compile must not fall through to ptx->getPointer()
+                return logFail("Could not compile kernel to PTX!");
+        }
+        CUmodule   module;
+        CUfunction kernel;
+        CUstream   stream;
+
+        auto& cu = cudaHandler->getCUDAFunctionTable();
+
+        ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr));
+        ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"));
+        ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
+
+        // launchKernel(kernel, stream);
+
+        ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream));
+        ASSERT_SUCCESS(cu.pcuModuleUnload(module));
+        ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream));
+
+        m_device->waitIdle();
+        
+        // testInterop();
+
+        return true;
+    }
+
+    void createResources()
+    {
+        auto& cu = cudaHandler->getCUDAFunctionTable();
+
+        for (auto& buf : cpuBufs)
+        {
+          ICPUBuffer::SCreationParams params = {};
+          params.size = size;
+          buf = ICPUBuffer::create(std::move(params));
+        }
+
+        for (auto j = 0; j < 2; j++)
+            for (auto i = 0; i < numElements; i++)
+                reinterpret_cast<float*>(cpuBufs[j]->getPointer())[i] = rand() / float(RAND_MAX);
+
+
+        // create and allocate CUmem with CUDA and slap it inside a simple IReferenceCounted wrapper
+        // ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[0], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
+        // ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[1], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
+        // ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[2], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
+        //
+        // semaphore = m_device->createSemaphore(0, { .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 });
+        // ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cudaSemaphore, semaphore.get()));
+        // {
+        //     // export the CUmem we have just created into a refctd IDeviceMemoryAllocation
+        //     auto devmemory = cudaMemories[2]->exportAsMemory(m_device.get());
+        //     if (!devmemory)
+        //         logFail("Failed to export CUDA memory!");
+        //
+        //
+        //     // create an importing external buffer on Nabla side
+        //     IGPUBuffer::SCreationParams params = {};
+        //     params.size = devmemory->getAllocationSize();
+        //     params.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT;
+        //     params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE;
+        //     importedBuf = m_device->createBuffer(std::move(params));
+        //     if (!importedBuf) 
+        //         logFail("Failed to create an external buffer");
+        //
+        //     // bind that imported IDeviceMemoryAllocation to the external buffer we've just created
+        //     ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = importedBuf.get(), .binding = {.memory = devmemory.get() } };
+        //     bool re = m_device->bindBufferMemory(1, &bindInfo);
+        //     if (!re) logFail("Failed to bind CUDA memory to buffer");
+        // }
+        //
+        // {
+        //     // same thing as above
+        //     // we create an external image and bind the imported external memory to it
+        //     // now we have 2 different resources that are bound to the same memory
+        //     IImage::SCreationParams params = {};
+        //     params.type = IGPUImage::ET_2D;
+        //     params.samples = IGPUImage::ESCF_1_BIT;
+        //     params.format = EF_R32_SFLOAT;
+        //     params.extent = { gridDim[0], blockDim[0], 1 };
+        //     params.mipLevels = 1;
+        //     params.arrayLayers = 1;
+        //     params.usage = IGPUImage::EUF_TRANSFER_SRC_BIT;
+        //     importedImg = cudaMemories[2]->createAndBindImage(m_device.get(), std::move(params));
+        //     if (!importedImg) logFail("Failed to create an external image");
+        // }
+        //
+        // commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+        // bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 2, cmd, smart_refctd_ptr(m_logger));
+        //
+        // stagingBufs[0] = createStaging();
+        // stagingBufs[1] = createStaging();
+    }
+
+    // smart_refctd_ptr<IGPUBuffer> createExternalBuffer(IDeviceMemoryAllocation* mem)
+    // {
+    //     IGPUBuffer::SCreationParams params = {};
+    //     params.size = mem->getAllocationSize();
+    //     params.usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT;
+    //     params.externalHandleTypes = mem->getCreationParams().externalHandleType;
+    //     auto buf = m_device->createBuffer(std::move(params));
+    //     ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = buf.get(), .binding = {.memory = mem } };
+    //     m_device->bindBufferMemory(1, &bindInfo);
+    //     return buf;
+    // }
+
+    // smart_refctd_ptr<IGPUBuffer> createStaging(size_t sz = size)
+    // {
+    //     auto buf = m_device->createBuffer({ {.size = sz, .usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT} });
+    //     auto req = buf->getMemoryReqs();
+    //     req.memoryTypeBits &= m_device->getPhysicalDevice()->getDownStreamingMemoryTypeBits();
+    //     auto allocation = m_device->allocate(req, buf.get());
+    //
+    //     void* mapping = allocation.memory->map(IDeviceMemoryAllocation::MemoryRange(0, req.size), IDeviceMemoryAllocation::EMCAF_READ);
+    //     if (!mapping)
+    //         logFail("Failed to map an staging buffer");
+    //     memset(mapping, 0, req.size);
+    //     return buf;
+    // };
+
+    // void launchKernel(CUfunction kernel, CUstream stream)
+    // {
+    //
+    //     // First we record a release ownership transfer to let vulkan know that resources are going to be used in an external API
+    //     {
+    //         IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> bufBarrier = {
+    //                 .barrier = {
+    //                     .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE,
+    //                     .otherQueueFamilyIndex = IQueue::FamilyExternal,
+    //                 },
+    //                 .range = {.buffer = importedBuf, },
+    //         };
+    //
+    //         IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> imgBarrier = {
+    //             .barrier = {
+    //                 .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE,
+    //                 .otherQueueFamilyIndex = IQueue::FamilyExternal,
+    //             },
+    //             .image = importedImg.get(),
+    //             .subresourceRange = {
+    //                 .aspectMask = IImage::EAF_COLOR_BIT,
+    //                 .levelCount = 1u,
+    //                 .layerCount = 1u,
+    //             }
+    //         };
+    //         // start recording
+    //         bool re = true;
+    //         re &= cmd[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+    //         re &= cmd[0]->pipelineBarrier(EDF_NONE, { .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}, .imgBarriers = {&imgBarrier,&imgBarrier + 1} });
+    //         re &= cmd[0]->end();
+    //
+    //         IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = semaphore.get(), .value = 1 };
+    //         IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[0].get()};
+    //         IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1}, .signalSemaphores = {&signalInfo,&signalInfo + 1} };
+    //         auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 });
+    //         re &= IQueue::RESULT::SUCCESS == submitRe;
+    //         if (!re)
+    //             logFail("Something went wrong readying resources for CUDA");
+    //     }
+    //     
+    //     auto& cu = cudaHandler->getCUDAFunctionTable();
+    //     // Launch kernel
+    //     {
+    //         CUdeviceptr ptrs[] = {
+    //             cudaMemories[0]->getDeviceptr(),
+    //             cudaMemories[1]->getDeviceptr(),
+    //             cudaMemories[2]->getDeviceptr(),
+    //         };
+    //         auto numEles = numElements;
+    //         void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numEles };
+    //         ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), size, stream));
+    //         ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), size, stream));
+    //
+    //         auto semaphore = cudaSemaphore->getInternalObject();
+    //         CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } };
+    //         ASSERT_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream)); // Wait for release op from vulkan
+    //         ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr));
+    //         CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } };
+    //         ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream)); // Signal the imported semaphore
+    //     }
+    //     
+    //     // After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA
+    //     {
+    //         IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> bufBarrier = {
+    //             .barrier = {
+    //                 .dep = {
+    //                     .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
+    //                     .dstAccessMask = ACCESS_FLAGS::TRANSFER_READ_BIT,
+    //                 },
+    //                 .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE,
+    //                 .otherQueueFamilyIndex = IQueue::FamilyExternal,
+    //             },
+    //             .range = { .buffer = importedBuf, },
+    //         };
+    //         bool re = true;
+    //         re &= cmd[1]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+    //
+    //         re &= cmd[1]->pipelineBarrier(EDF_NONE, {.bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}});
+    //
+    //         IGPUCommandBuffer::SBufferCopy region = { .size = size };
+    //         re &= cmd[1]->copyBuffer(importedBuf.get(), stagingBufs[0].get(), 1, &region);
+    //
+    //         IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> imgBarrier = {
+    //             .barrier = { 
+    //                 .dep = { 
+    //                     .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
+    //                     .dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS,
+    //                 },
+    //                 .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE,
+    //                 .otherQueueFamilyIndex = IQueue::FamilyExternal,
+    //             },
+    //             .image = importedImg.get(),
+    //             .subresourceRange = {
+    //                 .aspectMask = IImage::EAF_COLOR_BIT,
+    //                 .levelCount = 1u,
+    //                 .layerCount = 1u,
+    //             },
+    //             .oldLayout = IImage::LAYOUT::PREINITIALIZED,
+    //             .newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL,
+    //         };
+    //
+    //         re &= cmd[1]->pipelineBarrier(EDF_NONE, {.imgBarriers = {&imgBarrier,&imgBarrier + 1}});
+    //
+    //         IImage::SBufferCopy imgRegion = {
+    //             .imageSubresource = {
+    //                 .aspectMask = imgBarrier.subresourceRange.aspectMask,
+    //                 .layerCount = imgBarrier.subresourceRange.layerCount,
+    //             },
+    //             .imageExtent = importedImg->getCreationParameters().extent,
+    //         };
+    //
+    //         re &= cmd[1]->copyImageToBuffer(importedImg.get(), imgBarrier.newLayout, stagingBufs[1].get(), 1, &imgRegion);
+    //         re &= cmd[1]->end();
+    //         
+    //         IQueue::SSubmitInfo::SSemaphoreInfo waitInfo= { .semaphore = semaphore.get(), .value = 2 };
+    //         IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = semaphore.get(), .value = 3 };
+    //         IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[1].get() };
+    //         IQueue::SSubmitInfo submitInfo = { 
+    //             .waitSemaphores = {&waitInfo,&waitInfo + 1},
+    //             .commandBuffers = {&cmdInfo, &cmdInfo + 1},  
+    //             .signalSemaphores = {&signalInfo,&signalInfo + 1} 
+    //         };
+    //         auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 });
+    //         re &= IQueue::RESULT::SUCCESS == submitRe;
+    //         if (!re)
+    //             logFail("Something went wrong copying results from CUDA");
+    //     }
+    //     
+    //     ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, [](void* userData) { decltype(this)(userData)->kernelCallback(); }, this));
+    // }
+
+    // void kernelCallback()
+    // {
+    //     // Make sure we are also done with the readback
+    //     auto wait = std::array{ISemaphore::SWaitInfo{.semaphore = semaphore.get(), .value = 3}};
+    //     m_device->waitForSemaphores(wait, true, -1);
+    //
+    //     float* A = reinterpret_cast<float*>(cpuBufs[0]->getPointer());
+    //     float* B = reinterpret_cast<float*>(cpuBufs[1]->getPointer());
+    //
+    //     float* CBuf = reinterpret_cast<float*>(stagingBufs[0]->getBoundMemory().memory->getMappedPointer());
+    //     float* CImg = reinterpret_cast<float*>(stagingBufs[1]->getBoundMemory().memory->getMappedPointer());
+    //
+    //     if(memcmp(CBuf, CImg, size))
+    //         logFail("Buffer and Image memories do not match!");
+    //
+    //     for (auto i = 0; i < numElements; i++)
+    //     {
+    //         bool re = (abs(CBuf[i] - A[i] - B[i]) < 0.01f) && (abs(CImg[i] - A[i] - B[i]) < 0.01f);
+    //         if(!re)
+    //             logFail("Element at index %d is incorrect!", i);
+    //     }
+    //     
+    //     std::cout << "Success\n";
+    // }
+
+
+    // void testInterop()
+    // {
+    //     {
+    //         IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = {
+    //             .size = size,
+    //             .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(),
+    //             .alignmentLog2 = 10,
+    //         };
+    //
+    //         for (size_t i = 0; i < (1 << 8); ++i)
+    //         {
+    //             auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory;
+    //             assert(memory);
+    //             auto tmpBuf = createExternalBuffer(memory.get());
+    //         }
+    //     }
+    //
+    //     smart_refctd_ptr<IDeviceMemoryAllocation> escaped;
+    //     {
+    //         IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = {
+    //             .size = size,
+    //             .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(),
+    //             .alignmentLog2 = 10,
+    //         };
+    //
+    //         auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory;
+    //
+    //         auto tmpBuf = createExternalBuffer(memory.get());
+    //         auto staging = createStaging();
+    //
+    //         auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer();
+    //         for (uint32_t i = 0; i < size / 4; ++i)
+    //             ptr[i] = i;
+    //
+    //         smart_refctd_ptr<IGPUCommandBuffer> cmd;
+    //         commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd);
+    //         cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+    //         IGPUCommandBuffer::SBufferCopy region = { .size = size };
+    //         assert(cmd->copyBuffer(staging.get(), tmpBuf.get(), 1, &region));
+    //         cmd->end();
+    //         IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd.get() };
+    //         IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} };
+    //         queue->submit({ &submitInfo,&submitInfo + 1 });
+    //         m_device->waitIdle();
+    //         escaped = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, memory->getCreationParams().externalHandle).memory;
+    //     }
+    //
+    //     //{
+    //     //    constexpr size_t M = 32;
+    //     //    auto staging = createStaging(size * M);
+    //
+    //     //    auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer();
+    //     //    for (uint32_t i = 0; i < (M * size) / 4; ++i)
+    //     //        ptr[i] = rand();
+    //
+    //     //    std::vector<smart_refctd_ptr<IGPUCommandBuffer>> cmd(1 << 10);
+    //     //    commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1 << 10, cmd.data());
+    //
+    //     //    for (size_t i = 0; i < 1 << 10; ++i)
+    //     //    {
+    //     //        IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = {
+    //     //            .size = size * M,
+    //     //            .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(),
+    //     //            .alignmentLog2 = 10,
+    //     //        };
+    //     //    RE:
+    //     //        auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory;
+    //
+    //     //        if (!memory)
+    //     //        {
+    //     //            m_device->waitIdle();
+    //     //            for (size_t j = 0; j < i; ++j)
+    //     //                cmd[j] = 0;
+    //     //            goto END;
+    //     //        }
+    //     //        assert(memory);
+    //     //        auto tmpBuf = createExternalBuffer(memory.get());
+    //
+    //     //        cmd[i]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+    //     //        IGPUCommandBuffer::SBufferCopy region = { .size = size * M };
+    //     //        assert(cmd[i]->copyBuffer(staging.get(), tmpBuf.get(), 1, &region));
+    //     //        cmd[i]->end();
+    //     //        IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[i].get() };
+    //     //        IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} };
+    //     //        assert(IQueue::RESULT::SUCCESS == queue->submit({ &submitInfo,&submitInfo + 1 }));
+    //     //    }
+    //     //END:
+    //     //    m_device->waitIdle();
+    //     //}
+    //
+    //     {
+    //         auto tmpBuf = createExternalBuffer(escaped.get());
+    //         auto staging = createStaging();
+    //
+    //         smart_refctd_ptr<IGPUCommandBuffer> cmd;
+    //         commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd);
+    //         cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+    //         IGPUCommandBuffer::SBufferCopy region = { .size = size };
+    //         assert(cmd->copyBuffer(tmpBuf.get(), staging.get(), 1, &region));
+    //         cmd->end();
+    //         IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd.get() };
+    //         IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} };
+    //         auto qre = queue->submit({ &submitInfo,&submitInfo + 1 });
+    //         assert(IQueue::RESULT::SUCCESS == qre);
+    //         m_device->waitIdle();
+    //
+    //         auto& ptr = *(std::array<uint32_t, size>*)staging->getBoundMemory().memory->getMappedPointer();
+    //         for (uint32_t i = 0; i < size / 4; ++i)
+    //             assert(ptr[i] == i);
+    //     }
+    //
+    // }
+
+
+    // Whether to keep invoking the above. In this example because its headless GPU compute, we do all the work in the app initialization.
+    bool keepRunning() override { return false; }
+
+    // Platforms like WASM expect the main entry point to periodically return control, hence if you want a crossplatform app, you have to let the framework deal with your "game loop"
+    void workLoopBody() override {}
+};
+
+NBL_MAIN_FUNC(CUDA2VKApp)
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d945c547a..7c7990c06 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -111,6 +111,7 @@ if(NBL_BUILD_EXAMPLES)
 	endif()
 
 	add_subdirectory(74_QuantizedSequenceTests)
+	add_subdirectory(76_CudaInterop)
 
 	# add new examples *before* NBL_GET_ALL_TARGETS invocation, it gathers recursively all targets created so far in this subdirectory
 	NBL_GET_ALL_TARGETS(TARGETS)

From b8abd200a1a83ce4592f7ad3290d07ae02b4f538 Mon Sep 17 00:00:00 2001
From: kevyuu <kevin.kayu@gmail.com>
Date: Mon, 23 Mar 2026 17:00:19 +0700
Subject: [PATCH 02/47] Dummy

---
 71_RayTracingPipeline/main.cpp |   2 +-
 76_CudaInterop/main.cpp        | 706 +++++++++++++++++----------------
 2 files changed, 359 insertions(+), 349 deletions(-)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index f6b64c5ca..70ab21994 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -1245,7 +1245,7 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public Bui
 				auto retval = device->allocate(info);
 				// map what is mappable by default so ReBAR checks succeed
 				if (retval.isValid() && retval.memory->isMappable())
-					retval.memory->map({ .offset = 0,.length = info.size });
+					retval.memory->map({ .offset = 0,.length = info.allocationSize });
 				return retval;
 			}
 
diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index 85d10ad13..c4b4fd5fe 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -76,9 +76,9 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
     std::array<smart_refctd_ptr<ICPUBuffer>, 2> cpuBufs;
     // // CUDA resources that we input to the kernel 'vectorAdd_kernel.cu'
     // // Kernel writes to cudaMemories[2] which we later use to export and read on nabla side
-    // std::array<smart_refctd_ptr<CCUDASharedMemory>, 3> cudaMemories = {};
+    std::array<smart_refctd_ptr<CCUDASharedMemory>, 3> cudaMemories = {};
     // // A semaphore created in CUDA which will alias a Nabla semaphore to help sync between the CUDA kernel and Nabla device to host transfer
-    // smart_refctd_ptr<CCUDASharedSemaphore> cudaSemaphore;
+    smart_refctd_ptr<CCUDASharedSemaphore> cudaSemaphore;
 
     // our Buffer that is bound to cudaMemories[2]
     smart_refctd_ptr<IGPUBuffer> importedBuf;
@@ -155,7 +155,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"));
         ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
 
-        // launchKernel(kernel, stream);
+        launchKernel(kernel, stream);
 
         ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream));
         ASSERT_SUCCESS(cu.pcuModuleUnload(module));
@@ -163,7 +163,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
 
         m_device->waitIdle();
         
-        // testInterop();
+        testInterop();
 
         return true;
     }
@@ -185,352 +185,362 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
 
 
         // create and allocate CUmem with CUDA and slap it inside a simple IReferenceCounted wrapper
-        // ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[0], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
-        // ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[1], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
-        // ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[2], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
-        //
-        // semaphore = m_device->createSemaphore(0, { .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 });
-        // ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cudaSemaphore, semaphore.get()));
-        // {
-        //     // export the CUmem we have just created into a refctd IDeviceMemoryAllocation
-        //     auto devmemory = cudaMemories[2]->exportAsMemory(m_device.get());
-        //     if (!devmemory)
-        //         logFail("Failed to export CUDA memory!");
-        //
-        //
-        //     // create an importing external buffer on Nabla side
-        //     IGPUBuffer::SCreationParams params = {};
-        //     params.size = devmemory->getAllocationSize();
-        //     params.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT;
-        //     params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE;
-        //     importedBuf = m_device->createBuffer(std::move(params));
-        //     if (!importedBuf) 
-        //         logFail("Failed to create an external buffer");
-        //
-        //     // bind that imported IDeviceMemoryAllocation to the external buffer we've just created
-        //     ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = importedBuf.get(), .binding = {.memory = devmemory.get() } };
-        //     bool re = m_device->bindBufferMemory(1, &bindInfo);
-        //     if (!re) logFail("Failed to bind CUDA memory to buffer");
-        // }
-        //
-        // {
-        //     // same thing as above
-        //     // we create an external image and bind the imported external memory to it
-        //     // now we have 2 different resources that are bound to the same memory
-        //     IImage::SCreationParams params = {};
-        //     params.type = IGPUImage::ET_2D;
-        //     params.samples = IGPUImage::ESCF_1_BIT;
-        //     params.format = EF_R32_SFLOAT;
-        //     params.extent = { gridDim[0], blockDim[0], 1 };
-        //     params.mipLevels = 1;
-        //     params.arrayLayers = 1;
-        //     params.usage = IGPUImage::EUF_TRANSFER_SRC_BIT;
-        //     importedImg = cudaMemories[2]->createAndBindImage(m_device.get(), std::move(params));
-        //     if (!importedImg) logFail("Failed to create an external image");
-        // }
-        //
-        // commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
-        // bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 2, cmd, smart_refctd_ptr(m_logger));
-        //
-        // stagingBufs[0] = createStaging();
-        // stagingBufs[1] = createStaging();
+        ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[0], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
+        ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[1], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
+        ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[2], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
+        
+        semaphore = m_device->createSemaphore(0, { .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 });
+        ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cudaSemaphore, semaphore.get()));
+        {
+            // export the CUmem we have just created into a refctd IDeviceMemoryAllocation
+            auto devmemory = cudaMemories[2]->exportAsMemory(m_device.get());
+            if (!devmemory)
+                logFail("Failed to export CUDA memory!");
+            
+            
+            // create an importing external buffer on Nabla side
+            IGPUBuffer::SCreationParams params = {};
+            params.size = devmemory->getAllocationSize();
+            params.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT;
+            params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE;
+            importedBuf = m_device->createBuffer(std::move(params));
+            if (!importedBuf) 
+                logFail("Failed to create an external buffer");
+            
+            // bind that imported IDeviceMemoryAllocation to the external buffer we've just created
+            ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = importedBuf.get(), .binding = {.memory = devmemory.get() } };
+            bool re = m_device->bindBufferMemory(1, &bindInfo);
+                if (!re) logFail("Failed to bind CUDA memory to buffer");
+        }
+        
+        {
+            // same thing as above
+            // we create an external image and bind the imported external memory to it
+            // now we have 2 different resources that are bound to the same memory
+
+            auto devmemory = cudaMemories[2]->exportAsMemory(m_device.get());
+            if (!devmemory)
+                logFail("Failed to export CUDA memory!");
+
+            IGPUImage::SCreationParams params = {};
+            params.type = IGPUImage::ET_2D;
+            params.samples = IGPUImage::ESCF_1_BIT;
+            params.format = EF_R32_SFLOAT;
+            params.extent = { gridDim[0], blockDim[0], 1 };
+            params.mipLevels = 1;
+            params.arrayLayers = 1;
+            params.usage = IGPUImage::EUF_TRANSFER_SRC_BIT;
+            params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE;
+            importedImg = m_device->createImage(std::move(params));
+            if (!importedImg) logFail("Failed to create an external image");
+            // bind that imported IDeviceMemoryAllocation to the external image we've just created
+            ILogicalDevice::SBindImageMemoryInfo bindInfo = { .image = importedImg.get(), .binding = {.memory = devmemory.get() } };
+            bool re = m_device->bindImageMemory(1, &bindInfo);
+                if (!re) logFail("Failed to bind CUDA memory to buffer");
+        }
+        
+        commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+        bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 2, cmd, smart_refctd_ptr(m_logger));
+        
+        stagingBufs[0] = createStaging();
+        stagingBufs[1] = createStaging();
+    }
+
+    // Creates a TRANSFER_SRC|TRANSFER_DST buffer spanning the whole of `mem` (must be non-null) and binds it to that allocation; returns a null pointer if creation or binding fails.
+    smart_refctd_ptr<IGPUBuffer> createExternalBuffer(IDeviceMemoryAllocation* mem)
+    {
+        IGPUBuffer::SCreationParams params = {};
+        params.size = mem->getAllocationSize();
+        params.usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT;
+        params.externalHandleTypes = mem->getCreationParams().externalHandleType; // reuse the handle type the allocation was created with
+        auto buf = m_device->createBuffer(std::move(params));
+        ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = buf.get(), .binding = {.memory = mem } };
+        return (buf && m_device->bindBufferMemory(1, &bindInfo)) ? buf : smart_refctd_ptr<IGPUBuffer>(); // never hand out an unbound buffer
+    }
+
+    // Creates a `sz`-byte transfer src/dst buffer on one of the physical device's down-streaming memory types, maps it and zero-fills the mapping; logs and returns nullptr on failure.
+    smart_refctd_ptr<IGPUBuffer> createStaging(size_t sz = size)
+    {
+        auto buf = m_device->createBuffer({ {.size = sz, .usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT} });
+        if (!buf) { logFail("Failed to create a staging buffer"); return nullptr; }
+        auto req = buf->getMemoryReqs();
+        req.memoryTypeBits &= m_device->getPhysicalDevice()->getDownStreamingMemoryTypeBits();
+        auto allocation = m_device->allocate(req, buf.get());
+        void* mapping = allocation.isValid() ? allocation.memory->map(IDeviceMemoryAllocation::MemoryRange(0, req.size), IDeviceMemoryAllocation::EMCAF_READ) : nullptr;
+        if (!mapping) { logFail("Failed to map an staging buffer"); return nullptr; } // bail out instead of memset-ing a null mapping
+        memset(mapping, 0, req.size); // NOTE(review): mapping is requested with EMCAF_READ only but written here - confirm that is permitted
+        return buf;
+    };
+
+    // Vector-add round trip: releases the imported buffer/image to the external queue family (signals 1), runs `kernel` on `stream` between a wait on 1 and a signal of 2, submits the Vulkan readback (waits 2, signals 3) and finally enqueues kernelCallback() on `stream`.
+    void launchKernel(CUfunction kernel, CUstream stream)
+    {
+        // First we record a release ownership transfer to let vulkan know that resources are going to be used in an external API
+        {
+            IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> bufBarrier = {
+                    .barrier = {
+                        .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE,
+                        .otherQueueFamilyIndex = IQueue::FamilyExternal,
+                    },
+                    .range = {.buffer = importedBuf, },
+            };
+    
+            IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> imgBarrier = {
+                .barrier = {
+                    .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE,
+                    .otherQueueFamilyIndex = IQueue::FamilyExternal,
+                },
+                .image = importedImg.get(),
+                .subresourceRange = {
+                    .aspectMask = IImage::EAF_COLOR_BIT,
+                    .levelCount = 1u,
+                    .layerCount = 1u,
+                }
+            };
+            // start recording
+            bool re = true;
+            re &= cmd[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+            re &= cmd[0]->pipelineBarrier(EDF_NONE, { .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}, .imgBarriers = {&imgBarrier,&imgBarrier + 1} });
+            re &= cmd[0]->end();
+            // The submission signals 1 once the release barriers have executed; the CUDA stream waits on that value below
+            IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = semaphore.get(), .value = 1 };
+            IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[0].get()};
+            IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1}, .signalSemaphores = {&signalInfo,&signalInfo + 1} };
+            auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 });
+            re &= IQueue::RESULT::SUCCESS == submitRe;
+            if (!re)
+                logFail("Something went wrong readying resources for CUDA");
+        }
+        
+        auto& cu = cudaHandler->getCUDAFunctionTable();
+        // Launch kernel
+        {
+            CUdeviceptr ptrs[] = {
+                cudaMemories[0]->getDeviceptr(),
+                cudaMemories[1]->getDeviceptr(),
+                cudaMemories[2]->getDeviceptr(),
+            };
+            auto numEles = numElements;
+            void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numEles };
+            ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), size, stream));
+            ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), size, stream));
+            // Stream order: wait for the Vulkan release (1), run the kernel, then signal 2 which the readback submission below waits on
+            auto cuSemaphore = cudaSemaphore->getInternalObject(); // named so it does not shadow the `semaphore` member
+            CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } };
+            ASSERT_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&cuSemaphore, &waitParams, 1, stream)); // Wait for release op from vulkan
+            ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr));
+            CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } };
+            ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&cuSemaphore, &signalParams, 1, stream)); // Signal the imported semaphore; without it the wait on 2 below never completes
+        }
+        
+        // After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA
+        {
+            IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> bufBarrier = {
+                .barrier = {
+                    .dep = {
+                        .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
+                        .dstAccessMask = ACCESS_FLAGS::TRANSFER_READ_BIT,
+                    },
+                    .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE,
+                    .otherQueueFamilyIndex = IQueue::FamilyExternal,
+                },
+                .range = { .buffer = importedBuf, },
+            };
+            bool re = true;
+            re &= cmd[1]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+        
+            re &= cmd[1]->pipelineBarrier(EDF_NONE, {.bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}});
+        
+            IGPUCommandBuffer::SBufferCopy region = { .size = size };
+            re &= cmd[1]->copyBuffer(importedBuf.get(), stagingBufs[0].get(), 1, &region);
+            // NOTE(review): the release barrier recorded in cmd[0] leaves the image layouts defaulted - confirm they are compatible with this acquire
+            IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> imgBarrier = {
+                .barrier = { 
+                    .dep = { 
+                        .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
+                        .dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS,
+                    },
+                    .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE,
+                    .otherQueueFamilyIndex = IQueue::FamilyExternal,
+                },
+                .image = importedImg.get(),
+                .subresourceRange = {
+                    .aspectMask = IImage::EAF_COLOR_BIT,
+                    .levelCount = 1u,
+                    .layerCount = 1u,
+                },
+                .oldLayout = IImage::LAYOUT::PREINITIALIZED,
+                .newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL,
+            };
+        
+            re &= cmd[1]->pipelineBarrier(EDF_NONE, {.imgBarriers = {&imgBarrier,&imgBarrier + 1}});
+        
+            IImage::SBufferCopy imgRegion = {
+                .imageSubresource = {
+                    .aspectMask = imgBarrier.subresourceRange.aspectMask,
+                    .layerCount = imgBarrier.subresourceRange.layerCount,
+                },
+                .imageExtent = importedImg->getCreationParameters().extent,
+            };
+        
+            re &= cmd[1]->copyImageToBuffer(importedImg.get(), imgBarrier.newLayout, stagingBufs[1].get(), 1, &imgRegion);
+            re &= cmd[1]->end();
+            // Wait for CUDA's signal (2) before the copies execute, then signal 3 once they have finished
+            IQueue::SSubmitInfo::SSemaphoreInfo waitInfo= { .semaphore = semaphore.get(), .value = 2 };
+            IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = semaphore.get(), .value = 3 };
+            IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[1].get() };
+            IQueue::SSubmitInfo submitInfo = { 
+                .waitSemaphores = {&waitInfo,&waitInfo + 1},
+                .commandBuffers = {&cmdInfo, &cmdInfo + 1},  
+                .signalSemaphores = {&signalInfo,&signalInfo + 1} 
+            };
+            auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 });
+            re &= IQueue::RESULT::SUCCESS == submitRe;
+            if (!re)
+                logFail("Something went wrong copying results from CUDA");
+        }
+        // Enqueue the host-side validation behind all of the stream work recorded above
+        ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, [](void* userData) { decltype(this)(userData)->kernelCallback(); }, this));
+    }
+
+    // Validates the interop result: blocks until the Vulkan readback has finished, then checks that both staging copies (via buffer and via image) equal A + B element-wise.
+    void kernelCallback()
+    {
+        // The readback submission signals 3 when its copies are done; waiting on a lower value could read the staging buffers too early
+        auto wait = std::array{ISemaphore::SWaitInfo{.semaphore = semaphore.get(), .value = 3}};
+        m_device->waitForSemaphores(wait, true, -1);
+    
+        float* A = reinterpret_cast<float*>(cpuBufs[0]->getPointer());
+        float* B = reinterpret_cast<float*>(cpuBufs[1]->getPointer());
+        float* CBuf = reinterpret_cast<float*>(stagingBufs[0]->getBoundMemory().memory->getMappedPointer());
+        float* CImg = reinterpret_cast<float*>(stagingBufs[1]->getBoundMemory().memory->getMappedPointer());
+    
+        bool ok = memcmp(CBuf, CImg, size) == 0; // both resources alias the same memory, so the downloads must be bit-identical
+        if (!ok)
+            logFail("Buffer and Image memories do not match!");
+        for (auto i = 0; i < numElements; i++)
+        {
+            if ((abs(CBuf[i] - A[i] - B[i]) < 0.01f) && (abs(CImg[i] - A[i] - B[i]) < 0.01f))
+                continue;
+            ok = false;
+            logFail("Element at index %d is incorrect!", i);
+        }
+        if (ok) // only report success when no check above failed
+            std::cout << "Success\n";
     }
 
-    // smart_refctd_ptr<IGPUBuffer> createExternalBuffer(IDeviceMemoryAllocation* mem)
-    // {
-    //     IGPUBuffer::SCreationParams params = {};
-    //     params.size = mem->getAllocationSize();
-    //     params.usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT;
-    //     params.externalHandleTypes = mem->getCreationParams().externalHandleType;
-    //     auto buf = m_device->createBuffer(std::move(params));
-    //     ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = buf.get(), .binding = {.memory = mem } };
-    //     m_device->bindBufferMemory(1, &bindInfo);
-    //     return buf;
-    // }
-
-    // smart_refctd_ptr<IGPUBuffer> createStaging(size_t sz = size)
-    // {
-    //     auto buf = m_device->createBuffer({ {.size = sz, .usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT} });
-    //     auto req = buf->getMemoryReqs();
-    //     req.memoryTypeBits &= m_device->getPhysicalDevice()->getDownStreamingMemoryTypeBits();
-    //     auto allocation = m_device->allocate(req, buf.get());
-    //
-    //     void* mapping = allocation.memory->map(IDeviceMemoryAllocation::MemoryRange(0, req.size), IDeviceMemoryAllocation::EMCAF_READ);
-    //     if (!mapping)
-    //         logFail("Failed to map an staging buffer");
-    //     memset(mapping, 0, req.size);
-    //     return buf;
-    // };
-
-    // void launchKernel(CUfunction kernel, CUstream stream)
-    // {
-    //
-    //     // First we record a release ownership transfer to let vulkan know that resources are going to be used in an external API
-    //     {
-    //         IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> bufBarrier = {
-    //                 .barrier = {
-    //                     .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE,
-    //                     .otherQueueFamilyIndex = IQueue::FamilyExternal,
-    //                 },
-    //                 .range = {.buffer = importedBuf, },
-    //         };
-    //
-    //         IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> imgBarrier = {
-    //             .barrier = {
-    //                 .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE,
-    //                 .otherQueueFamilyIndex = IQueue::FamilyExternal,
-    //             },
-    //             .image = importedImg.get(),
-    //             .subresourceRange = {
-    //                 .aspectMask = IImage::EAF_COLOR_BIT,
-    //                 .levelCount = 1u,
-    //                 .layerCount = 1u,
-    //             }
-    //         };
-    //         // start recording
-    //         bool re = true;
-    //         re &= cmd[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-    //         re &= cmd[0]->pipelineBarrier(EDF_NONE, { .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}, .imgBarriers = {&imgBarrier,&imgBarrier + 1} });
-    //         re &= cmd[0]->end();
-    //
-    //         IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = semaphore.get(), .value = 1 };
-    //         IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[0].get()};
-    //         IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1}, .signalSemaphores = {&signalInfo,&signalInfo + 1} };
-    //         auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 });
-    //         re &= IQueue::RESULT::SUCCESS == submitRe;
-    //         if (!re)
-    //             logFail("Something went wrong readying resources for CUDA");
-    //     }
-    //     
-    //     auto& cu = cudaHandler->getCUDAFunctionTable();
-    //     // Launch kernel
-    //     {
-    //         CUdeviceptr ptrs[] = {
-    //             cudaMemories[0]->getDeviceptr(),
-    //             cudaMemories[1]->getDeviceptr(),
-    //             cudaMemories[2]->getDeviceptr(),
-    //         };
-    //         auto numEles = numElements;
-    //         void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numEles };
-    //         ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), size, stream));
-    //         ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), size, stream));
-    //
-    //         auto semaphore = cudaSemaphore->getInternalObject();
-    //         CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } };
-    //         ASSERT_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream)); // Wait for release op from vulkan
-    //         ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr));
-    //         CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } };
-    //         ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream)); // Signal the imported semaphore
-    //     }
-    //     
-    //     // After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA
-    //     {
-    //         IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> bufBarrier = {
-    //             .barrier = {
-    //                 .dep = {
-    //                     .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
-    //                     .dstAccessMask = ACCESS_FLAGS::TRANSFER_READ_BIT,
-    //                 },
-    //                 .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE,
-    //                 .otherQueueFamilyIndex = IQueue::FamilyExternal,
-    //             },
-    //             .range = { .buffer = importedBuf, },
-    //         };
-    //         bool re = true;
-    //         re &= cmd[1]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-    //
-    //         re &= cmd[1]->pipelineBarrier(EDF_NONE, {.bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}});
-    //
-    //         IGPUCommandBuffer::SBufferCopy region = { .size = size };
-    //         re &= cmd[1]->copyBuffer(importedBuf.get(), stagingBufs[0].get(), 1, &region);
-    //
-    //         IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> imgBarrier = {
-    //             .barrier = { 
-    //                 .dep = { 
-    //                     .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
-    //                     .dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS,
-    //                 },
-    //                 .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE,
-    //                 .otherQueueFamilyIndex = IQueue::FamilyExternal,
-    //             },
-    //             .image = importedImg.get(),
-    //             .subresourceRange = {
-    //                 .aspectMask = IImage::EAF_COLOR_BIT,
-    //                 .levelCount = 1u,
-    //                 .layerCount = 1u,
-    //             },
-    //             .oldLayout = IImage::LAYOUT::PREINITIALIZED,
-    //             .newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL,
-    //         };
-    //
-    //         re &= cmd[1]->pipelineBarrier(EDF_NONE, {.imgBarriers = {&imgBarrier,&imgBarrier + 1}});
-    //
-    //         IImage::SBufferCopy imgRegion = {
-    //             .imageSubresource = {
-    //                 .aspectMask = imgBarrier.subresourceRange.aspectMask,
-    //                 .layerCount = imgBarrier.subresourceRange.layerCount,
-    //             },
-    //             .imageExtent = importedImg->getCreationParameters().extent,
-    //         };
-    //
-    //         re &= cmd[1]->copyImageToBuffer(importedImg.get(), imgBarrier.newLayout, stagingBufs[1].get(), 1, &imgRegion);
-    //         re &= cmd[1]->end();
-    //         
-    //         IQueue::SSubmitInfo::SSemaphoreInfo waitInfo= { .semaphore = semaphore.get(), .value = 2 };
-    //         IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = semaphore.get(), .value = 3 };
-    //         IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[1].get() };
-    //         IQueue::SSubmitInfo submitInfo = { 
-    //             .waitSemaphores = {&waitInfo,&waitInfo + 1},
-    //             .commandBuffers = {&cmdInfo, &cmdInfo + 1},  
-    //             .signalSemaphores = {&signalInfo,&signalInfo + 1} 
-    //         };
-    //         auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 });
-    //         re &= IQueue::RESULT::SUCCESS == submitRe;
-    //         if (!re)
-    //             logFail("Something went wrong copying results from CUDA");
-    //     }
-    //     
-    //     ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, [](void* userData) { decltype(this)(userData)->kernelCallback(); }, this));
-    // }
-
-    // void kernelCallback()
-    // {
-    //     // Make sure we are also done with the readback
-    //     auto wait = std::array{ISemaphore::SWaitInfo{.semaphore = semaphore.get(), .value = 3}};
-    //     m_device->waitForSemaphores(wait, true, -1);
-    //
-    //     float* A = reinterpret_cast<float*>(cpuBufs[0]->getPointer());
-    //     float* B = reinterpret_cast<float*>(cpuBufs[1]->getPointer());
-    //
-    //     float* CBuf = reinterpret_cast<float*>(stagingBufs[0]->getBoundMemory().memory->getMappedPointer());
-    //     float* CImg = reinterpret_cast<float*>(stagingBufs[1]->getBoundMemory().memory->getMappedPointer());
-    //
-    //     if(memcmp(CBuf, CImg, size))
-    //         logFail("Buffer and Image memories do not match!");
-    //
-    //     for (auto i = 0; i < numElements; i++)
-    //     {
-    //         bool re = (abs(CBuf[i] - A[i] - B[i]) < 0.01f) && (abs(CImg[i] - A[i] - B[i]) < 0.01f);
-    //         if(!re)
-    //             logFail("Element at index %d is incorrect!", i);
-    //     }
-    //     
-    //     std::cout << "Success\n";
-    // }
-
-
-    // void testInterop()
-    // {
-    //     {
-    //         IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = {
-    //             .size = size,
-    //             .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(),
-    //             .alignmentLog2 = 10,
-    //         };
-    //
-    //         for (size_t i = 0; i < (1 << 8); ++i)
-    //         {
-    //             auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory;
-    //             assert(memory);
-    //             auto tmpBuf = createExternalBuffer(memory.get());
-    //         }
-    //     }
-    //
-    //     smart_refctd_ptr<IDeviceMemoryAllocation> escaped;
-    //     {
-    //         IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = {
-    //             .size = size,
-    //             .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(),
-    //             .alignmentLog2 = 10,
-    //         };
-    //
-    //         auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory;
-    //
-    //         auto tmpBuf = createExternalBuffer(memory.get());
-    //         auto staging = createStaging();
-    //
-    //         auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer();
-    //         for (uint32_t i = 0; i < size / 4; ++i)
-    //             ptr[i] = i;
-    //
-    //         smart_refctd_ptr<IGPUCommandBuffer> cmd;
-    //         commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd);
-    //         cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-    //         IGPUCommandBuffer::SBufferCopy region = { .size = size };
-    //         assert(cmd->copyBuffer(staging.get(), tmpBuf.get(), 1, &region));
-    //         cmd->end();
-    //         IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd.get() };
-    //         IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} };
-    //         queue->submit({ &submitInfo,&submitInfo + 1 });
-    //         m_device->waitIdle();
-    //         escaped = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, memory->getCreationParams().externalHandle).memory;
-    //     }
-    //
-    //     //{
-    //     //    constexpr size_t M = 32;
-    //     //    auto staging = createStaging(size * M);
-    //
-    //     //    auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer();
-    //     //    for (uint32_t i = 0; i < (M * size) / 4; ++i)
-    //     //        ptr[i] = rand();
-    //
-    //     //    std::vector<smart_refctd_ptr<IGPUCommandBuffer>> cmd(1 << 10);
-    //     //    commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1 << 10, cmd.data());
-    //
-    //     //    for (size_t i = 0; i < 1 << 10; ++i)
-    //     //    {
-    //     //        IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = {
-    //     //            .size = size * M,
-    //     //            .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(),
-    //     //            .alignmentLog2 = 10,
-    //     //        };
-    //     //    RE:
-    //     //        auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory;
-    //
-    //     //        if (!memory)
-    //     //        {
-    //     //            m_device->waitIdle();
-    //     //            for (size_t j = 0; j < i; ++j)
-    //     //                cmd[j] = 0;
-    //     //            goto END;
-    //     //        }
-    //     //        assert(memory);
-    //     //        auto tmpBuf = createExternalBuffer(memory.get());
-    //
-    //     //        cmd[i]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-    //     //        IGPUCommandBuffer::SBufferCopy region = { .size = size * M };
-    //     //        assert(cmd[i]->copyBuffer(staging.get(), tmpBuf.get(), 1, &region));
-    //     //        cmd[i]->end();
-    //     //        IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[i].get() };
-    //     //        IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} };
-    //     //        assert(IQueue::RESULT::SUCCESS == queue->submit({ &submitInfo,&submitInfo + 1 }));
-    //     //    }
-    //     //END:
-    //     //    m_device->waitIdle();
-    //     //}
-    //
-    //     {
-    //         auto tmpBuf = createExternalBuffer(escaped.get());
-    //         auto staging = createStaging();
-    //
-    //         smart_refctd_ptr<IGPUCommandBuffer> cmd;
-    //         commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd);
-    //         cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-    //         IGPUCommandBuffer::SBufferCopy region = { .size = size };
-    //         assert(cmd->copyBuffer(tmpBuf.get(), staging.get(), 1, &region));
-    //         cmd->end();
-    //         IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd.get() };
-    //         IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} };
-    //         auto qre = queue->submit({ &submitInfo,&submitInfo + 1 });
-    //         assert(IQueue::RESULT::SUCCESS == qre);
-    //         m_device->waitIdle();
-    //
-    //         auto& ptr = *(std::array<uint32_t, size>*)staging->getBoundMemory().memory->getMappedPointer();
-    //         for (uint32_t i = 0; i < size / 4; ++i)
-    //             assert(ptr[i] == i);
-    //     }
-    //
-    // }
+    // Stress-tests allocations made with CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE: allocation churn, an upload whose external
+    // handle is passed to a second allocation (`escaped`), a run of large allocations, and a final readback through `escaped`.
+    // Calls with side effects are kept outside assert() so they still execute when NDEBUG is defined.
+    void testInterop()
+    {
+        // Phase 1: 256 short-lived allocations, each bound to a throw-away buffer that is dropped at the end of its iteration.
+        {
+            IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = {
+                .size = size,
+                .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(),
+                .alignmentLog2 = 10,
+            };
+            for (size_t i = 0; i < (1 << 8); ++i)
+            {
+                auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory;
+                if (!memory) { logFail("Failed to allocate external memory during the churn phase"); return; }
+                auto tmpBuf = createExternalBuffer(memory.get());
+            }
+        }
+        // Phase 2: upload the pattern ptr[i] = i into a fresh allocation, then allocate `escaped` from that allocation's external handle.
+        smart_refctd_ptr<IDeviceMemoryAllocation> escaped;
+        {
+            IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = {
+                .size = size,
+                .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(),
+                .alignmentLog2 = 10,
+            };
+            auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory;
+            if (!memory) { logFail("Failed to allocate external memory for the upload phase"); return; }
+            auto tmpBuf = createExternalBuffer(memory.get());
+            auto staging = createStaging();
+            auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer();
+            for (uint32_t i = 0; i < size / 4; ++i) ptr[i] = i;
+
+            smart_refctd_ptr<IGPUCommandBuffer> cmd;
+            commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd);
+            cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+            IGPUCommandBuffer::SBufferCopy region = { .size = size };
+            [[maybe_unused]] const bool copied = cmd->copyBuffer(staging.get(), tmpBuf.get(), 1, &region);
+            assert(copied);
+            cmd->end();
+            IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd.get() };
+            IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} };
+            [[maybe_unused]] const auto submitted = queue->submit({ &submitInfo,&submitInfo + 1 });
+            assert(IQueue::RESULT::SUCCESS == submitted);
+            m_device->waitIdle();
+            // `escaped` is allocated from the first allocation's external handle and is the only thing that leaves this scope.
+            escaped = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, memory->getCreationParams().externalHandle).memory;
+            if (!escaped) { logFail("Failed to allocate from the external handle of the uploaded memory"); return; }
+        }
+        // Phase 3: up to 1024 allocations of M * size bytes, each receiving a copy of random data; stops at the first allocation failure.
+        {
+            constexpr size_t M = 32;
+            constexpr uint32_t MaxAllocations = 1u << 10;
+            auto staging = createStaging(size * M);
+            auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer();
+            for (uint32_t i = 0; i < (M * size) / 4; ++i) ptr[i] = rand();
+
+            std::vector<smart_refctd_ptr<IGPUCommandBuffer>> cmd(MaxAllocations);
+            commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, MaxAllocations, cmd.data());
+
+            for (size_t i = 0; i < MaxAllocations; ++i)
+            {
+                IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = {
+                    .size = size * M,
+                    .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(),
+                    .alignmentLog2 = 10,
+                };
+                auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory;
+                if (!memory)
+                {
+                    // allocate() returned null (presumably out of device memory): wait for the submitted copies, drop their command buffers, stop.
+                    m_device->waitIdle();
+                    for (size_t j = 0; j < i; ++j) cmd[j] = nullptr;
+                    break;
+                }
+                auto tmpBuf = createExternalBuffer(memory.get());
+                cmd[i]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+                IGPUCommandBuffer::SBufferCopy region = { .size = size * M };
+                [[maybe_unused]] const bool copied = cmd[i]->copyBuffer(staging.get(), tmpBuf.get(), 1, &region);
+                assert(copied);
+                cmd[i]->end();
+                IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[i].get() };
+                IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} };
+                [[maybe_unused]] const auto submitted = queue->submit({ &submitInfo,&submitInfo + 1 });
+                assert(IQueue::RESULT::SUCCESS == submitted);
+            }
+            m_device->waitIdle();
+        }
+        // Phase 4: copy `escaped` back into a staging buffer and check that the phase 2 pattern is still there.
+        {
+            auto tmpBuf = createExternalBuffer(escaped.get());
+            auto staging = createStaging();
+
+            smart_refctd_ptr<IGPUCommandBuffer> cmd;
+            commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd);
+            cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+            IGPUCommandBuffer::SBufferCopy region = { .size = size };
+            [[maybe_unused]] const bool copied = cmd->copyBuffer(tmpBuf.get(), staging.get(), 1, &region);
+            assert(copied);
+            cmd->end();
+            IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd.get() };
+            IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} };
+            [[maybe_unused]] const auto qre = queue->submit({ &submitInfo,&submitInfo + 1 });
+            assert(IQueue::RESULT::SUCCESS == qre);
+            m_device->waitIdle();
+            const auto* readback = (const uint32_t*)staging->getBoundMemory().memory->getMappedPointer();
+            size_t mismatches = 0;
+            for (uint32_t i = 0; i < size / 4; ++i) mismatches += (readback[i] != i);
+            if (mismatches) logFail("Readback through the escaped allocation does not match the uploaded pattern");
+            assert(mismatches == 0);
+        }
+    }
 
 
     // Whether to keep invoking the above. In this example because its headless GPU compute, we do all the work in the app initialization.

From 93ca5efe588ca85c1eaf81a486b611df98403580 Mon Sep 17 00:00:00 2001
From: kevyuu <kyu.kevinyu@gmail.com>
Date: Thu, 23 Apr 2026 01:09:08 +0700
Subject: [PATCH 03/47] Refactor test into separate section

---
 .../app_resources/vectorAdd_kernel.cu         |   6 +-
 76_CudaInterop/main.cpp                       | 686 +++++++++---------
 2 files changed, 350 insertions(+), 342 deletions(-)

diff --git a/76_CudaInterop/app_resources/vectorAdd_kernel.cu b/76_CudaInterop/app_resources/vectorAdd_kernel.cu
index 3baef0123..35876a627 100644
--- a/76_CudaInterop/app_resources/vectorAdd_kernel.cu
+++ b/76_CudaInterop/app_resources/vectorAdd_kernel.cu
@@ -33,10 +33,8 @@
  */
 
 extern "C" __global__ void vectorAdd(const float *A, const float *B, float *C,
-                                     int numElements) {
+                                     size_t numElements) {
   int i = blockDim.x * blockIdx.x + threadIdx.x;
-
-  if (i < numElements) {
+  if (i < numElements)
     C[i] = A[i] + B[i];
-  }
 }
\ No newline at end of file
diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index c4b4fd5fe..2a64f9428 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -1,6 +1,6 @@
 #include "nbl/video/CCUDAHandler.h"
-// #include "nbl/video/CCUDASharedMemory.h"
-// #include "nbl/video/CCUDASharedSemaphore.h"
+// #include "nbl/video/CCUDAExportableMemory.h"
+// #include "nbl/video/CCUDAImportedSemaphore.h"
 
 #include "nbl/application_templates/MonoDeviceApplication.hpp"
 #include "nbl/examples/common/BuiltinResourcesApplication.hpp"
@@ -57,10 +57,6 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
     using device_base_t = MonoDeviceApplication;
     using asset_base_t = BuiltinResourcesApplication;
 
-    static constexpr uint32_t gridDim[3] = { 4096,1,1 };
-    static constexpr uint32_t blockDim[3] = { 1024,1,1 };
-    static constexpr size_t numElements = gridDim[0] * blockDim[0];
-    static constexpr size_t size = sizeof(float) * numElements;
 
 public:
     // Yay thanks to multiple inheritance we cannot forward ctors anymore
@@ -72,27 +68,6 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
 
     IQueue* queue;
 
-    // CPU memory which we fill with random numbers between [-1,1] that will be copied to corresponding cudaMemory
-    std::array<smart_refctd_ptr<ICPUBuffer>, 2> cpuBufs;
-    // // CUDA resources that we input to the kernel 'vectorAdd_kernel.cu'
-    // // Kernel writes to cudaMemories[2] which we later use to export and read on nabla side
-    std::array<smart_refctd_ptr<CCUDASharedMemory>, 3> cudaMemories = {};
-    // // A semaphore created in CUDA which will alias a Nabla semaphore to help sync between the CUDA kernel and Nabla device to host transfer
-    smart_refctd_ptr<CCUDASharedSemaphore> cudaSemaphore;
-
-    // our Buffer that is bound to cudaMemories[2]
-    smart_refctd_ptr<IGPUBuffer> importedBuf;
-    // our Image that is also bound to cudaMemories[2]
-    smart_refctd_ptr<IGPUImage> importedImg;
-
-    // host visible buffers that we use to copy from the resources above after CUDA kernel is done writing
-    smart_refctd_ptr<IGPUBuffer> stagingBufs[2];
-
-    // Nabla semaphore for sync
-    smart_refctd_ptr<ISemaphore> semaphore;
-
-    smart_refctd_ptr<IGPUCommandPool> commandPool;
-    smart_refctd_ptr<IGPUCommandBuffer> cmd[2];
 
     // a device filter helps you create a set of physical devices that satisfy your requirements in terms of features, limits etc.
     virtual void filterDevices(core::set<video::IPhysicalDevice*>& physicalDevices) const
@@ -121,10 +96,47 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         if (!cudaDevice) 
             return logFail("Could not create a CUDA Device!");
 
-        
-        queue = device_base_t::getComputeQueue();
-        
-        createResources();
+        testSharedResource();
+        testDestruction();
+        testLargeAllocations();
+
+        return true;
+    }
+    // Creates a transfer src/dst buffer sized to `mem` (must be non-null) with `mem`'s external handle type and binds `mem` to it; returns nullptr after logging if creation or binding fails.
+    smart_refctd_ptr<IGPUBuffer> createExternalBuffer(IDeviceMemoryAllocation* mem)
+    {
+        IGPUBuffer::SCreationParams params = {};
+        params.size = mem->getAllocationSize();
+        params.usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT;
+        params.externalHandleTypes = mem->getCreationParams().externalHandleType;
+        auto buf = m_device->createBuffer(std::move(params));
+        ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = buf.get(), .binding = {.memory = mem } };
+        if (!buf || !m_device->bindBufferMemory(1, &bindInfo)) { logFail("Failed to create an external buffer or bind its memory"); return nullptr; }
+        return buf;
+    }
+    // Creates a transfer src/dst buffer of `sz` bytes on down-streaming, host-visible, host-coherent memory, maps it and zero-fills it; returns nullptr after logging on any failure.
+    smart_refctd_ptr<IGPUBuffer> createStaging(size_t sz)
+    {
+        auto buf = m_device->createBuffer({ {.size = sz, .usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT} });
+        if (!buf) { logFail("Failed to create a staging buffer"); return nullptr; }
+        auto req = buf->getMemoryReqs();
+        req.memoryTypeBits &= m_device->getPhysicalDevice()->getDownStreamingMemoryTypeBits() & m_device->getPhysicalDevice()->getHostVisibleMemoryTypeBits()
+                            & m_device->getPhysicalDevice()->getMemoryTypeBitsFromMemoryTypeFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT);
+        auto allocation = m_device->allocate(req, buf.get());
+        if (!allocation.memory) { logFail("Failed to allocate memory for a staging buffer"); return nullptr; }
+        // NOTE(review): the mapping is requested with EMCAF_READ although it is written by the memset below — confirm the access flag.
+        void* mapping = allocation.memory->map(IDeviceMemoryAllocation::MemoryRange(0, req.size), IDeviceMemoryAllocation::EMCAF_READ);
+        if (!mapping) { logFail("Failed to map an staging buffer"); return nullptr; } // previously fell through to memset(nullptr, ...)
+        memset(mapping, 0, req.size);
+        return buf;
+    };
+
+    void testSharedResource()
+    {
+        static constexpr uint32_t GridDim[3] = { 4096,1,1 };
+        static constexpr uint32_t BlockDim[3] = { 1024,1,1 };
+        static constexpr size_t NumElements = GridDim[0] * BlockDim[0];
+        static constexpr size_t BufferSize = sizeof(float) * NumElements;
 
         smart_refctd_ptr<ICPUBuffer> ptx;
         {
@@ -135,7 +147,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             auto assetBundle = m_assetMgr->getAsset("app_resources/vectorAdd_kernel.cu", lp);
             const auto assets = assetBundle.getContents();
             if (assets.empty())
-                return logFail("Could not load kernel!");
+                logFail("Could not load kernel!");
 
             smart_refctd_ptr<ICPUBuffer> source = IAsset::castDown<ICPUBuffer>(assets[0]);
             std::string log;
@@ -145,197 +157,137 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
 
             ptx = std::move(ptx_);
         }
+
+        auto& cu = cudaHandler->getCUDAFunctionTable();
+
         CUmodule   module;
         CUfunction kernel;
         CUstream   stream;
 
-        auto& cu = cudaHandler->getCUDAFunctionTable();
-
         ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr));
         ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"));
         ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
 
-        launchKernel(kernel, stream);
-
-        ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream));
-        ASSERT_SUCCESS(cu.pcuModuleUnload(module));
-        ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream));
-
-        m_device->waitIdle();
-        
-        testInterop();
-
-        return true;
-    }
-
-    void createResources()
-    {
-        auto& cu = cudaHandler->getCUDAFunctionTable();
+        // CPU memory which we fill with random numbers between [-1,1] that will be copied to corresponding cudaMemory
+        std::array<smart_refctd_ptr<ICPUBuffer>, 2> cpuBufs;
 
         for (auto& buf : cpuBufs)
         {
-          ICPUBuffer::SCreationParams params = {};
-          params.size = size;
-          buf = ICPUBuffer::create(std::move(params));
+            ICPUBuffer::SCreationParams params = {};
+            params.size = BufferSize;
+            buf = ICPUBuffer::create(std::move(params));
         }
 
-        for (auto j = 0; j < 2; j++)
-            for (auto i = 0; i < numElements; i++)
-                reinterpret_cast<float*>(cpuBufs[j]->getPointer())[i] = rand() / float(RAND_MAX);
+        for (auto buf_i = 0; buf_i < cpuBufs.size(); buf_i++)
+            for (auto elem_i = 0; elem_i < NumElements; elem_i++)
+                reinterpret_cast<float*>(cpuBufs[buf_i]->getPointer())[elem_i] = rand() / float(RAND_MAX);
 
+        constexpr auto InputCount = 2;
+        // CUDA resources that we input to the kernel 'vectorAdd_kernel.cu'
+        // The kernel writes its result to `cudaOutputMemory`, which is imported from a Vulkan allocation further below
+        std::array<smart_refctd_ptr<CCUDAExportableMemory>, InputCount> cudaInputMemories = {};
+        std::array<smart_refctd_ptr<IDeviceMemoryAllocation>, InputCount> vulkanMemories = {};
+        std::array<smart_refctd_ptr<IGPUBuffer>, InputCount> vulkanInputBuffers = {};
+        std::array<smart_refctd_ptr<IGPUBuffer>, InputCount> inputStagingBuffers = {};
 
-        // create and allocate CUmem with CUDA and slap it inside a simple IReferenceCounted wrapper
-        ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[0], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
-        ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[1], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
-        ASSERT_SUCCESS(cudaDevice->createSharedMemory(&cudaMemories[2], { .size = size, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
-        
-        semaphore = m_device->createSemaphore(0, { .externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32 });
-        ASSERT_SUCCESS(cudaDevice->importGPUSemaphore(&cudaSemaphore, semaphore.get()));
+        for (auto input_i = 0; input_i < InputCount; input_i++)
         {
-            // export the CUmem we have just created into a refctd IDeviceMemoryAllocation
-            auto devmemory = cudaMemories[2]->exportAsMemory(m_device.get());
-            if (!devmemory)
-                logFail("Failed to export CUDA memory!");
-            
-            
-            // create an importing external buffer on Nabla side
-            IGPUBuffer::SCreationParams params = {};
-            params.size = devmemory->getAllocationSize();
-            params.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT;
-            params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE;
-            importedBuf = m_device->createBuffer(std::move(params));
-            if (!importedBuf) 
-                logFail("Failed to create an external buffer");
-            
-            // bind that imported IDeviceMemoryAllocation to the external buffer we've just created
-            ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = importedBuf.get(), .binding = {.memory = devmemory.get() } };
-            bool re = m_device->bindBufferMemory(1, &bindInfo);
-                if (!re) logFail("Failed to bind CUDA memory to buffer");
+          // create and allocate CUmem with CUDA and slap it inside a simple IReferenceCounted wrapper
+          ASSERT_SUCCESS(cudaDevice->createExportableMemory(&cudaInputMemories[input_i], { .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
+          vulkanMemories[input_i] = cudaInputMemories[input_i]->exportAsMemory(m_device.get(), nullptr);
+          vulkanInputBuffers[input_i] = createExternalBuffer(vulkanMemories[input_i].get());
+          inputStagingBuffers[input_i] = createStaging(BufferSize);
         }
+
+        IGPUBuffer::SCreationParams outputBufferParams;
+        outputBufferParams.size = cudaDevice->roundToGranularity(CU_MEM_LOCATION_TYPE_DEVICE, BufferSize);
+        outputBufferParams.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT;
+        outputBufferParams.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE;
+        const auto outputBuf = m_device->createBuffer(std::move(outputBufferParams));
+        auto outputMemReq = outputBuf->getMemoryReqs();
+        auto allocation = m_device->allocate(outputMemReq, outputBuf.get(), IDeviceMemoryAllocation::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE);
+        core::smart_refctd_ptr<CCUDAImportedMemory> cudaOutputMemory;
+        ASSERT_SUCCESS(cudaDevice->importExternalMemory(&cudaOutputMemory, allocation.memory.get()));
         
-        {
-            // same thing as above
-            // we create an external image and bind the imported external memory to it
-            // now we have 2 different resources that are bound to the same memory
-
-            auto devmemory = cudaMemories[2]->exportAsMemory(m_device.get());
-            if (!devmemory)
-                logFail("Failed to export CUDA memory!");
-
-            IGPUImage::SCreationParams params = {};
-            params.type = IGPUImage::ET_2D;
-            params.samples = IGPUImage::ESCF_1_BIT;
-            params.format = EF_R32_SFLOAT;
-            params.extent = { gridDim[0], blockDim[0], 1 };
-            params.mipLevels = 1;
-            params.arrayLayers = 1;
-            params.usage = IGPUImage::EUF_TRANSFER_SRC_BIT;
-            params.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE;
-            importedImg = m_device->createImage(std::move(params));
-            if (!importedImg) logFail("Failed to create an external image");
-            // bind that imported IDeviceMemoryAllocation to the external buffer we've just created
-            ILogicalDevice::SBindImageMemoryInfo bindInfo = { .image = importedImg.get(), .binding = {.memory = devmemory.get() } };
-            bool re = m_device->bindImageMemory(1, &bindInfo);
-                if (!re) logFail("Failed to bind CUDA memory to buffer");
-        }
+        ISemaphore::SCreationParams semParams;
+        semParams.externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32;
+        auto semaphore = m_device->createSemaphore(0, std::move(semParams));
+        core::smart_refctd_ptr<CCUDAImportedSemaphore> cudaSemaphore;
+        ASSERT_SUCCESS(cudaDevice->importExternalSemaphore(&cudaSemaphore, semaphore.get()));
         
-        commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
-        bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 2, cmd, smart_refctd_ptr(m_logger));
+        std::array<smart_refctd_ptr<IGPUCommandBuffer>, 2> cmd;
+        auto commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+        bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, cmd.size(), cmd.data(), smart_refctd_ptr(m_logger));
         
-        stagingBufs[0] = createStaging();
-        stagingBufs[1] = createStaging();
-    }
-
-    smart_refctd_ptr<IGPUBuffer> createExternalBuffer(IDeviceMemoryAllocation* mem)
-    {
-        IGPUBuffer::SCreationParams params = {};
-        params.size = mem->getAllocationSize();
-        params.usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT;
-        params.externalHandleTypes = mem->getCreationParams().externalHandleType;
-        auto buf = m_device->createBuffer(std::move(params));
-        ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = buf.get(), .binding = {.memory = mem } };
-        m_device->bindBufferMemory(1, &bindInfo);
-        return buf;
-    }
-
-    smart_refctd_ptr<IGPUBuffer> createStaging(size_t sz = size)
-    {
-        auto buf = m_device->createBuffer({ {.size = sz, .usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT} });
-        auto req = buf->getMemoryReqs();
-        req.memoryTypeBits &= m_device->getPhysicalDevice()->getDownStreamingMemoryTypeBits();
-        auto allocation = m_device->allocate(req, buf.get());
-    
-        void* mapping = allocation.memory->map(IDeviceMemoryAllocation::MemoryRange(0, req.size), IDeviceMemoryAllocation::EMCAF_READ);
-        if (!mapping)
-            logFail("Failed to map an staging buffer");
-        memset(mapping, 0, req.size);
-        return buf;
-    };
+        const auto outputStagingBuffer = createStaging(BufferSize);
 
-    void launchKernel(CUfunction kernel, CUstream stream)
-    {
-    
         // First we record a release ownership transfer to let vulkan know that resources are going to be used in an external API
         {
-            IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> bufBarrier = {
-                    .barrier = {
-                        .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE,
-                        .otherQueueFamilyIndex = IQueue::FamilyExternal,
-                    },
-                    .range = {.buffer = importedBuf, },
-            };
-    
-            IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> imgBarrier = {
+            const IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> bufBarrier = {
                 .barrier = {
+                    .dep = {
+                        .srcStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS,
+                        .srcAccessMask = ACCESS_FLAGS::MEMORY_WRITE_BITS,
+                    },
                     .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE,
                     .otherQueueFamilyIndex = IQueue::FamilyExternal,
                 },
-                .image = importedImg.get(),
-                .subresourceRange = {
-                    .aspectMask = IImage::EAF_COLOR_BIT,
-                    .levelCount = 1u,
-                    .layerCount = 1u,
-                }
+                .range = {
+                  .offset = 0, 
+                  .size = outputBuf->getSize(), 
+                  .buffer = outputBuf, 
+                },
             };
+    
             // start recording
             bool re = true;
             re &= cmd[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-            re &= cmd[0]->pipelineBarrier(EDF_NONE, { .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}, .imgBarriers = {&imgBarrier,&imgBarrier + 1} });
+            re &= cmd[0]->pipelineBarrier(EDF_NONE, {
+              .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}
+            });
             re &= cmd[0]->end();
     
-            IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = semaphore.get(), .value = 1 };
-            IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[0].get()};
-            IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1}, .signalSemaphores = {&signalInfo,&signalInfo + 1} };
-            auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 });
+            const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = {
+              .semaphore = semaphore.get(), 
+              .value = 1,
+              .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS,
+            };
+            const IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[0].get() };
+            const IQueue::SSubmitInfo submitInfo = {
+              .commandBuffers = {&cmdInfo, &cmdInfo + 1}, 
+              .signalSemaphores = {&signalInfo, &signalInfo + 1}
+            };
+            const auto submitRe = queue->submit({ &submitInfo, &submitInfo + 1 });
             re &= IQueue::RESULT::SUCCESS == submitRe;
-            if (!re)
-                logFail("Something went wrong readying resources for CUDA");
+            if (!re) logFail("Something went wrong readying resources for CUDA");
         }
         
-        auto& cu = cudaHandler->getCUDAFunctionTable();
         // Launch kernel
         {
+            CUdeviceptr outputBufPtr;
+            cudaOutputMemory->getMappedBuffer(&outputBufPtr);
             CUdeviceptr ptrs[] = {
-                cudaMemories[0]->getDeviceptr(),
-                cudaMemories[1]->getDeviceptr(),
-                cudaMemories[2]->getDeviceptr(),
+              cudaInputMemories[0]->getDeviceptr(),
+              cudaInputMemories[1]->getDeviceptr(),
+              outputBufPtr
             };
-            auto numEles = numElements;
-            void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numEles };
-            ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), size, stream));
-            ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), size, stream));
+            auto numElements = &NumElements;
+            void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numElements };
+            ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream));
+            ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), BufferSize, stream));
     
             auto semaphore = cudaSemaphore->getInternalObject();
-            CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } };
+            const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } };
             ASSERT_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream)); // Wait for release op from vulkan
-            ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, gridDim[0], gridDim[1], gridDim[2], blockDim[0], blockDim[1], blockDim[2], 0, stream, parameters, nullptr));
-            CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } };
-            // ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream)); // Signal the imported semaphore
+            ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, stream, parameters, nullptr));
+            const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } };
+            ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream)); // Signal the imported semaphore
         }
+        ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream));
         
         // After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA
         {
-            IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> bufBarrier = {
+            const IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> bufBarrier = {
                 .barrier = {
                     .dep = {
                         .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
@@ -344,202 +296,260 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
                     .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE,
                     .otherQueueFamilyIndex = IQueue::FamilyExternal,
                 },
-                .range = { .buffer = importedBuf, },
+                .range = { 
+                  .offset = 0,
+                  .size = outputBuf->getSize(),
+                  .buffer = outputBuf, 
+                },
             };
             bool re = true;
             re &= cmd[1]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-        
             re &= cmd[1]->pipelineBarrier(EDF_NONE, {.bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}});
-        
-            IGPUCommandBuffer::SBufferCopy region = { .size = size };
-            re &= cmd[1]->copyBuffer(importedBuf.get(), stagingBufs[0].get(), 1, &region);
-        
-            IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> imgBarrier = {
-                .barrier = { 
-                    .dep = { 
-                        .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
-                        .dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS,
-                    },
-                    .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE,
-                    .otherQueueFamilyIndex = IQueue::FamilyExternal,
-                },
-                .image = importedImg.get(),
-                .subresourceRange = {
-                    .aspectMask = IImage::EAF_COLOR_BIT,
-                    .levelCount = 1u,
-                    .layerCount = 1u,
-                },
-                .oldLayout = IImage::LAYOUT::PREINITIALIZED,
-                .newLayout = IImage::LAYOUT::TRANSFER_SRC_OPTIMAL,
+            const auto region = IGPUCommandBuffer::SBufferCopy{ 
+              .srcOffset = 0,
+              .dstOffset = 0,
+              .size = BufferSize 
             };
-        
-            re &= cmd[1]->pipelineBarrier(EDF_NONE, {.imgBarriers = {&imgBarrier,&imgBarrier + 1}});
-        
-            IImage::SBufferCopy imgRegion = {
-                .imageSubresource = {
-                    .aspectMask = imgBarrier.subresourceRange.aspectMask,
-                    .layerCount = imgBarrier.subresourceRange.layerCount,
-                },
-                .imageExtent = importedImg->getCreationParameters().extent,
-            };
-        
-            re &= cmd[1]->copyImageToBuffer(importedImg.get(), imgBarrier.newLayout, stagingBufs[1].get(), 1, &imgRegion);
-            re &= cmd[1]->end();
+            re &= cmd[1]->copyBuffer(outputBuf.get(), outputStagingBuffer.get(), 1, &region);
+            for (auto input_i = 0; input_i < InputCount; input_i++)
+              re &= cmd[1]->copyBuffer(vulkanInputBuffers[input_i].get(), inputStagingBuffers[input_i].get(), 1, &region);
+            cmd[1]->end();
             
-            IQueue::SSubmitInfo::SSemaphoreInfo waitInfo= { .semaphore = semaphore.get(), .value = 2 };
-            IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { .semaphore = semaphore.get(), .value = 3 };
-            IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[1].get() };
-            IQueue::SSubmitInfo submitInfo = { 
-                .waitSemaphores = {&waitInfo,&waitInfo + 1},
-                .commandBuffers = {&cmdInfo, &cmdInfo + 1},  
-                .signalSemaphores = {&signalInfo,&signalInfo + 1} 
+            const IQueue::SSubmitInfo::SSemaphoreInfo waitInfo= {
+              .semaphore = semaphore.get(), 
+              .value = 2,
+              .stageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
+            };
+            const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = {
+              .semaphore = semaphore.get(), 
+              .value = 3,
+              .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS,
             };
-            auto submitRe = queue->submit({ &submitInfo,&submitInfo + 1 });
+            const IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[1].get() };
+            const IQueue::SSubmitInfo submitInfo = { 
+                .waitSemaphores = { &waitInfo, &waitInfo + 1 },
+                .commandBuffers = { &cmdInfo, &cmdInfo + 1 },  
+                .signalSemaphores = { &signalInfo, &signalInfo + 1 } 
+            };
+            const auto submitRe = queue->submit({ &submitInfo, &submitInfo + 1 });
             re &= IQueue::RESULT::SUCCESS == submitRe;
             if (!re)
                 logFail("Something went wrong copying results from CUDA");
-        }
-        
-        ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, [](void* userData) { decltype(this)(userData)->kernelCallback(); }, this));
-    }
 
-    void kernelCallback()
-    {
-        // Make sure we are also done with the readback
-        auto wait = std::array{ISemaphore::SWaitInfo{.semaphore = semaphore.get(), .value = 2}};
-        m_device->waitForSemaphores(wait, true, -1);
-    
-        float* A = reinterpret_cast<float*>(cpuBufs[0]->getPointer());
-        float* B = reinterpret_cast<float*>(cpuBufs[1]->getPointer());
-    
-        float* CBuf = reinterpret_cast<float*>(stagingBufs[0]->getBoundMemory().memory->getMappedPointer());
-        float* CImg = reinterpret_cast<float*>(stagingBufs[1]->getBoundMemory().memory->getMappedPointer());
-    
-        if(memcmp(CBuf, CImg, size))
-            logFail("Buffer and Image memories do not match!");
-    
-        for (auto i = 0; i < numElements; i++)
-        {
-            bool re = (abs(CBuf[i] - A[i] - B[i]) < 0.01f) && (abs(CImg[i] - A[i] - B[i]) < 0.01f);
-            if(!re)
-                logFail("Element at index %d is incorrect!", i);
         }
-        
-        std::cout << "Success\n";
-    }
 
-
-    void testInterop()
-    {
+        struct CallbackContext
         {
-            IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = {
-                .size = size,
-                .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(),
-                .alignmentLog2 = 10,
+            core::smart_refctd_ptr<ISemaphore> semaphore;
+            std::array<core::smart_refctd_ptr<ICPUBuffer>, InputCount> cpuBuffers;
+            std::array<core::smart_refctd_ptr<IGPUBuffer>, InputCount> inputStagingBuffers;
+            core::smart_refctd_ptr<IGPUBuffer> outputStagingBuffer;
+            core::smart_refctd_ptr<video::ILogicalDevice> device;
+            core::smart_refctd_ptr<system::ILogger> logger;
+        };
+
+        CallbackContext ctx;
+        ctx.semaphore = semaphore;
+        ctx.cpuBuffers = cpuBufs;
+        ctx.inputStagingBuffers = inputStagingBuffers;
+        ctx.outputStagingBuffer = outputStagingBuffer;
+        ctx.device = m_device;
+        ctx.logger = m_logger;
+
+        auto cudaCallback = [](void* userData)
+        {
+            const auto* ctx = reinterpret_cast<CallbackContext*>(userData);
+
+            // Make sure we are also done with the readback 
+            const auto wait = std::array{
+              ISemaphore::SWaitInfo{
+                .semaphore = ctx->semaphore.get(), 
+                .value = 3,
+              }
             };
-    
-            for (size_t i = 0; i < (1 << 8); ++i)
+            ctx->device->blockForSemaphores(wait, true);
+
+            auto* stagingMem = ctx->outputStagingBuffer->getBoundMemory().memory;
+            if (!stagingMem->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
             {
-                auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory;
-                assert(memory);
-                auto tmpBuf = createExternalBuffer(memory.get());
+                ILogicalDevice::MappedMemoryRange range(stagingMem, 0, stagingMem->getAllocationSize());
+                ctx->device->invalidateMappedMemoryRanges(1, &range);
             }
-        }
-    
+
+            const auto* inputs1 = reinterpret_cast<float*>(ctx->cpuBuffers[0]->getPointer());
+            const auto* inputs2 = reinterpret_cast<float*>(ctx->cpuBuffers[1]->getPointer());
+
+            const auto* outputs = reinterpret_cast<float*>(ctx->outputStagingBuffer->getBoundMemory().memory->getMappedPointer());
+            const auto* inputsInStaging1 = reinterpret_cast<float*>(ctx->inputStagingBuffers[0]->getBoundMemory().memory->getMappedPointer());
+            const auto* inputsInStaging2 = reinterpret_cast<float*>(ctx->inputStagingBuffers[1]->getBoundMemory().memory->getMappedPointer());
+
+            for (auto elem_i = 0; elem_i < NumElements; elem_i++)
+            {
+              const auto input1 = inputs1[elem_i];
+              const auto input2 = inputs2[elem_i];
+              const auto inputInStaging1 = inputsInStaging1[elem_i];
+              const auto inputInStaging2 = inputsInStaging2[elem_i];
+              if (inputInStaging1 != input1)
+                ctx->logger->log("Input1 in Staging %d is incorrect!", ILogger::ELL_ERROR, elem_i);
+              if (inputInStaging2 != input2)
+                ctx->logger->log("Input2 in Staging %d is incorrect!", ILogger::ELL_ERROR, elem_i);
+
+              const auto output = outputs[elem_i];
+              const auto expected = input1 + input2;
+              const auto diff = abs(output - expected);
+              bool re = diff < 0.01;
+              if (!re)
+                ctx->logger->log("TestSharedResources: Element at index %d is incorrect!", ILogger::ELL_ERROR, elem_i);
+            }
+
+            ctx->logger->log("TestSharedResources Complete", ILogger::ELL_INFO);
+        };
+
+        ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, cudaCallback, &ctx));
+        ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream));
+
+        ASSERT_SUCCESS(cu.pcuModuleUnload(module));
+        ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream));
+    }
+
+    void testDestruction()
+    {
+
+        auto commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+        constexpr auto ElementCount = 1024;
+        constexpr auto BufferSize = ElementCount * sizeof(int);
+        auto& cu = cudaHandler->getCUDAFunctionTable();
         smart_refctd_ptr<IDeviceMemoryAllocation> escaped;
         {
-            IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = {
-                .size = size,
-                .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(),
-                .alignmentLog2 = 10,
-            };
-    
-            auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory;
-    
-            auto tmpBuf = createExternalBuffer(memory.get());
-            auto staging = createStaging();
-    
+            core::smart_refctd_ptr<CCUDAExportableMemory> cudaMemory;
+            ASSERT_SUCCESS(cudaDevice->createExportableMemory(&cudaMemory, { .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
+            escaped = cudaMemory->exportAsMemory(m_device.get());
+            if (!escaped) logFail("Fail to export CUDA memory!");
+        
+            auto tmpBuf = createExternalBuffer(escaped.get());
+            auto staging = createStaging(BufferSize);
+        
             auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer();
-            for (uint32_t i = 0; i < size / 4; ++i)
+            for (uint32_t i = 0; i < ElementCount; ++i)
                 ptr[i] = i;
-    
-            smart_refctd_ptr<IGPUCommandBuffer> cmd;
-            commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd);
-            cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-            IGPUCommandBuffer::SBufferCopy region = { .size = size };
-            assert(cmd->copyBuffer(staging.get(), tmpBuf.get(), 1, &region));
-            cmd->end();
-            IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd.get() };
-            IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} };
-            queue->submit({ &submitInfo,&submitInfo + 1 });
-            m_device->waitIdle();
-            escaped = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, memory->getCreationParams().externalHandle).memory;
-        }
-    
-        {
-            constexpr size_t M = 32;
-            auto staging = createStaging(size * M);
-    
-            auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer();
-            for (uint32_t i = 0; i < (M * size) / 4; ++i)
-                ptr[i] = rand();
-    
-            std::vector<smart_refctd_ptr<IGPUCommandBuffer>> cmd(1 << 10);
-            commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1 << 10, cmd.data());
-    
-            for (size_t i = 0; i < 1 << 10; ++i)
-            {
-                IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = {
-                    .size = size * M,
-                    .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(),
-                    .alignmentLog2 = 10,
-                };
-            RE:
-                auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory;
-    
-                if (!memory)
-                {
-                    m_device->waitIdle();
-                    for (size_t j = 0; j < i; ++j)
-                        cmd[j] = 0;
-                    goto END;
-                }
-                assert(memory);
-                auto tmpBuf = createExternalBuffer(memory.get());
-    
-                cmd[i]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-                IGPUCommandBuffer::SBufferCopy region = { .size = size * M };
-                assert(cmd[i]->copyBuffer(staging.get(), tmpBuf.get(), 1, &region));
-                cmd[i]->end();
-                IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[i].get() };
-                IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} };
-                assert(IQueue::RESULT::SUCCESS == queue->submit({ &submitInfo,&submitInfo + 1 }));
-            }
-        END:
+        
+            const auto semaphore = m_device->createSemaphore(0);
+            IQueue::SSubmitInfo::SSemaphoreInfo semInfo;
+            semInfo.semaphore = semaphore.get();
+            semInfo.stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS;
+            semInfo.value = 1;
+        
+            smart_refctd_ptr<IGPUCommandBuffer> cmdBuffer;
+            commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmdBuffer);
+            cmdBuffer->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+            IGPUCommandBuffer::SBufferCopy region = { .size = BufferSize };
+            assert(cmdBuffer->copyBuffer(staging.get(), tmpBuf.get(), 1, &region));
+            cmdBuffer->end();
+            IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmdBuffer.get() };
+            const IQueue::SSubmitInfo submitInfo = {
+              .commandBuffers = {&cmdInfo, &cmdInfo + 1}, 
+              .signalSemaphores = {&semInfo, 1}
+            };
+            auto qre = queue->submit({ &submitInfo, &submitInfo + 1 });
+            assert(IQueue::RESULT::SUCCESS == qre);
             m_device->waitIdle();
-        }
-    
+        }        
+        
         {
             auto tmpBuf = createExternalBuffer(escaped.get());
-            auto staging = createStaging();
-    
+            auto staging = createStaging(BufferSize);
+        
+            const auto semaphore = m_device->createSemaphore(0);
+            IQueue::SSubmitInfo::SSemaphoreInfo semInfo;
+            semInfo.semaphore = semaphore.get();
+            semInfo.stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS;
+            semInfo.value = 1;
+        
             smart_refctd_ptr<IGPUCommandBuffer> cmd;
             commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd);
             cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-            IGPUCommandBuffer::SBufferCopy region = { .size = size };
+            IGPUCommandBuffer::SBufferCopy region = { .size = BufferSize };
             assert(cmd->copyBuffer(tmpBuf.get(), staging.get(), 1, &region));
             cmd->end();
             IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd.get() };
-            IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} };
-            auto qre = queue->submit({ &submitInfo,&submitInfo + 1 });
+            const IQueue::SSubmitInfo submitInfo = {
+              .commandBuffers = {&cmdInfo, &cmdInfo + 1}, 
+              .signalSemaphores = {&semInfo, 1}
+            };
+            auto qre = queue->submit({ &submitInfo, &submitInfo + 1 });
             assert(IQueue::RESULT::SUCCESS == qre);
+        
             m_device->waitIdle();
-    
-            auto& ptr = *(std::array<uint32_t, size>*)staging->getBoundMemory().memory->getMappedPointer();
-            for (uint32_t i = 0; i < size / 4; ++i)
-                assert(ptr[i] == i);
+        
+            auto& ptr = *(std::array<uint32_t, BufferSize>*)staging->getBoundMemory().memory->getMappedPointer();
+            for (uint32_t i = 0; i < ElementCount; ++i)
+            {
+                if (ptr[i] != i) logFail("Test Destruction: Element %d is incorrect", i);
+            }
+            m_logger->log("Test Destruction complete", ILogger::ELL_INFO);
         }
     
+        // {
+        //     constexpr size_t M = 32;
+        //     auto staging = createStaging(size * M);
+        //
+        //     auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer();
+        //     for (uint32_t i = 0; i < (M * size) / 4; ++i)
+        //         ptr[i] = rand();
+        //
+        //     std::vector<smart_refctd_ptr<IGPUCommandBuffer>> cmd(1 << 10);
+        //     commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1 << 10, cmd.data());
+        //
+        //     for (size_t i = 0; i < 1 << 10; ++i)
+        //     {
+        //         IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = {
+        //             .size = size * M,
+        //             .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(),
+        //             .alignmentLog2 = 10,
+        //         };
+        //     RE:
+        //         auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory;
+        //
+        //         if (!memory)
+        //         {
+        //             m_device->waitIdle();
+        //             for (size_t j = 0; j < i; ++j)
+        //                 cmd[j] = 0;
+        //             goto END;
+        //         }
+        //         assert(memory);
+        //         auto tmpBuf = createExternalBuffer(memory.get());
+        //
+        //         cmd[i]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+        //         IGPUCommandBuffer::SBufferCopy region = { .size = size * M };
+        //         assert(cmd[i]->copyBuffer(staging.get(), tmpBuf.get(), 1, &region));
+        //         cmd[i]->end();
+        //         IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[i].get() };
+        //         IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} };
+        //         assert(IQueue::RESULT::SUCCESS == queue->submit({ &submitInfo,&submitInfo + 1 }));
+        //     }
+        // END:
+        //     m_device->waitIdle();
+        // }
+    
+    }
+
+    void testLargeAllocations()
+    {
+        // TODO(kevin): Calculate BufferSize that is big enough to fill the machine VRAM
+        constexpr auto BufferSize = 1024;
+        IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = {
+            .size = BufferSize,
+            .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(),
+            .alignmentLog2 = 10,
+        };
+    
+        for (size_t i = 0; i < (1 << 8); ++i)
+        {
+            auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory;
+            assert(memory);
+            auto tmpBuf = createExternalBuffer(memory.get());
+        }
     }
 
 

From 03d2ce251e39cd58057a52d6728ec73484f0216d Mon Sep 17 00:00:00 2001
From: kevyuu <kyu.kevinyu@gmail.com>
Date: Fri, 24 Apr 2026 00:52:33 +0700
Subject: [PATCH 04/47] Update to follow latest commit on main repo

---
 76_CudaInterop/main.cpp | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index 2a64f9428..2c4f819b2 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -96,6 +96,9 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         if (!cudaDevice) 
             return logFail("Could not create a CUDA Device!");
 
+
+        queue = getComputeQueue();
+
         testSharedResource();
         testDestruction();
         testLargeAllocations();
@@ -193,7 +196,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         for (auto input_i = 0; input_i < InputCount; input_i++)
         {
           // create and allocate CUmem with CUDA and slap it inside a simple IReferenceCounted wrapper
-          ASSERT_SUCCESS(cudaDevice->createExportableMemory(&cudaInputMemories[input_i], { .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
+          cudaInputMemories[input_i] = cudaDevice->createExportableMemory({ .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE });
           vulkanMemories[input_i] = cudaInputMemories[input_i]->exportAsMemory(m_device.get(), nullptr);
           vulkanInputBuffers[input_i] = createExternalBuffer(vulkanMemories[input_i].get());
           inputStagingBuffers[input_i] = createStaging(BufferSize);
@@ -205,15 +208,18 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         outputBufferParams.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE;
         const auto outputBuf = m_device->createBuffer(std::move(outputBufferParams));
         auto outputMemReq = outputBuf->getMemoryReqs();
+
         auto allocation = m_device->allocate(outputMemReq, outputBuf.get(), IDeviceMemoryAllocation::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE);
-        core::smart_refctd_ptr<CCUDAImportedMemory> cudaOutputMemory;
-        ASSERT_SUCCESS(cudaDevice->importExternalMemory(&cudaOutputMemory, allocation.memory.get()));
+        const auto cudaOutputMemory = cudaDevice->importExternalMemory(core::smart_refctd_ptr(allocation.memory));
+        if (!cudaOutputMemory)
+          logFail("Fail to import Vulkan Memory into CUDA!");
         
         ISemaphore::SCreationParams semParams;
         semParams.externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32;
         auto semaphore = m_device->createSemaphore(0, std::move(semParams));
-        core::smart_refctd_ptr<CCUDAImportedSemaphore> cudaSemaphore;
-        ASSERT_SUCCESS(cudaDevice->importExternalSemaphore(&cudaSemaphore, semaphore.get()));
+        const auto cudaSemaphore = cudaDevice->importExternalSemaphore(core::smart_refctd_ptr(semaphore));
+        if (!cudaSemaphore)
+          logFail("Fail to import Vulkan Semaphore into CUDA!");
         
         std::array<smart_refctd_ptr<IGPUCommandBuffer>, 2> cmd;
         auto commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
@@ -414,15 +420,15 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
 
     void testDestruction()
     {
-
         auto commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
         constexpr auto ElementCount = 1024;
         constexpr auto BufferSize = ElementCount * sizeof(int);
         auto& cu = cudaHandler->getCUDAFunctionTable();
         smart_refctd_ptr<IDeviceMemoryAllocation> escaped;
         {
-            core::smart_refctd_ptr<CCUDAExportableMemory> cudaMemory;
-            ASSERT_SUCCESS(cudaDevice->createExportableMemory(&cudaMemory, { .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE }));
+            const auto cudaMemory = cudaDevice->createExportableMemory({ .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE });
+            if (!cudaMemory) logFail("Fail to create exportable memory!");
+
             escaped = cudaMemory->exportAsMemory(m_device.get());
             if (!escaped) logFail("Fail to export CUDA memory!");
         

From 1e120e8956181d8de7931f1fd2e8bb350a046c2a Mon Sep 17 00:00:00 2001
From: kevyuu <kyu.kevinyu@gmail.com>
Date: Sat, 25 Apr 2026 17:18:45 +0700
Subject: [PATCH 05/47] Fix example 67: map info.allocationSize after memory allocation changes

---
 67_RayQueryGeometry/main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/67_RayQueryGeometry/main.cpp b/67_RayQueryGeometry/main.cpp
index 63346ac4c..2f196e140 100644
--- a/67_RayQueryGeometry/main.cpp
+++ b/67_RayQueryGeometry/main.cpp
@@ -664,7 +664,7 @@ class RayQueryGeometryApp final : public SimpleWindowedApplication, public Built
 					auto retval = device->allocate(info);
 					// map what is mappable by default so ReBAR checks succeed
 					if (retval.isValid() && retval.memory->isMappable())
-						retval.memory->map({.offset=0,.length=info.size});
+						retval.memory->map({.offset=0,.length=info.allocationSize});
 					return retval;
 				}
 

From fc00a68b3dec9f4c3ff81419ea77e5f85f5ff4ce Mon Sep 17 00:00:00 2001
From: kevyuu <kyu.kevinyu@gmail.com>
Date: Thu, 30 Apr 2026 15:03:58 +0700
Subject: [PATCH 06/47] Replace ASSERT_SUCCESS with ASSERT_CUDA_SUCCESS

---
 76_CudaInterop/main.cpp | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index 2c4f819b2..8231586d5 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -40,8 +40,7 @@ bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto lin
     return true;
 }
 
-#define ASSERT_SUCCESS(expr) { auto re = check_cuda_err((expr), cu, m_logger, __FILE__, __LINE__); assert(re); }
-#define ASSERT_SUCCESS_NV(expr, log) { auto re = check_nv_err((expr), cudaHandler, m_logger, __FILE__, __LINE__, log); assert(re); }
+#define ASSERT_CUDA_SUCCESS_NV(expr, log) { auto re = check_nv_err((expr), cudaHandler, m_logger, __FILE__, __LINE__, log); assert(re); }
 
 
 using namespace nbl::core;
@@ -156,7 +155,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             std::string log;
             auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(std::string((const char*)source->getPointer(), source->getSize()), 
                 "app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), 0, 0, 0, &log);
-            ASSERT_SUCCESS_NV(res, log);
+            ASSERT_CUDA_SUCCESS_NV(res, log);
 
             ptx = std::move(ptx_);
         }
@@ -167,9 +166,9 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         CUfunction kernel;
         CUstream   stream;
 
-        ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr));
-        ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"));
-        ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
+        ASSERT_CUDA_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr), cudaHandler);
+        ASSERT_CUDA_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"), cudaHandler);
+        ASSERT_CUDA_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), cudaHandler);
 
         // CPU memory which we fill with random numbers between [-1,1] that will be copied to corresponding cudaMemory
         std::array<smart_refctd_ptr<ICPUBuffer>, 2> cpuBufs;
@@ -279,17 +278,17 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             };
             auto numElements = &NumElements;
             void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numElements };
-            ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream));
-            ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), BufferSize, stream));
+            ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream), cudaHandler);
+            ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), BufferSize, stream), cudaHandler);
     
             auto semaphore = cudaSemaphore->getInternalObject();
             const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } };
-            ASSERT_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream)); // Wait for release op from vulkan
-            ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, stream, parameters, nullptr));
+            ASSERT_CUDA_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream), cudaHandler); // Wait for release op from vulkan
+            ASSERT_CUDA_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, stream, parameters, nullptr), cudaHandler);
             const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } };
-            ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream)); // Signal the imported semaphore
+            ASSERT_CUDA_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream), cudaHandler); // Signal the imported semaphore
         }
-        ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream));
+        ASSERT_CUDA_SUCCESS(cu.pcuStreamSynchronize(stream), cudaHandler);
         
         // After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA
         {
@@ -411,11 +410,11 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             ctx->logger->log("TestSharedResources Complete", ILogger::ELL_INFO);
         };
 
-        ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, cudaCallback, &ctx));
-        ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream));
+        ASSERT_CUDA_SUCCESS(cu.pcuLaunchHostFunc(stream, cudaCallback, &ctx), cudaHandler);
+        ASSERT_CUDA_SUCCESS(cu.pcuStreamSynchronize(stream), cudaHandler);
 
-        ASSERT_SUCCESS(cu.pcuModuleUnload(module));
-        ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream));
+        ASSERT_CUDA_SUCCESS(cu.pcuModuleUnload(module), cudaHandler);
+        ASSERT_CUDA_SUCCESS(cu.pcuStreamDestroy_v2(stream), cudaHandler);
     }
 
     void testDestruction()

From 00572257f2370be17e118f3186ea032119e186cd Mon Sep 17 00:00:00 2001
From: kevyuu <kyu.kevinyu@gmail.com>
Date: Mon, 4 May 2026 14:22:22 +0700
Subject: [PATCH 07/47] Remove local check_cuda_err; rename ASSERT_CUDA_SUCCESS_NV to ASSERT_NV_SUCCESS

---
 76_CudaInterop/main.cpp | 17 ++---------------
 1 file changed, 2 insertions(+), 15 deletions(-)

diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index 8231586d5..84dbac39f 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -16,19 +16,6 @@ The start of the main function starts like in most other example. We ask the
 user for the desired renderer and start it up.
 */
 
-bool check_cuda_err(cudaError_enum err, auto& cu, auto& logger, auto file, auto line)
-{
-    if (auto re = err; CUDA_SUCCESS != re) 
-    {
-        const char* name = 0, * str = 0;
-        cu.pcuGetErrorName(re, &name);
-        cu.pcuGetErrorString(re, &str);
-        logger->log("%s:%d %s:\n\t%s\n", system::ILogger::ELL_ERROR, file, line, name, str);
-        return false;
-    }
-    return true;
-}
-
 bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto line, std::string const& log)
 {
     if (auto re = err; NVRTC_SUCCESS != re) 
@@ -40,7 +27,7 @@ bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto lin
     return true;
 }
 
-#define ASSERT_CUDA_SUCCESS_NV(expr, log) { auto re = check_nv_err((expr), cudaHandler, m_logger, __FILE__, __LINE__, log); assert(re); }
+#define ASSERT_NV_SUCCESS(expr, log) { auto re = check_nv_err((expr), cudaHandler, m_logger, __FILE__, __LINE__, log); assert(re); }
 
 
 using namespace nbl::core;
@@ -155,7 +142,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             std::string log;
             auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(std::string((const char*)source->getPointer(), source->getSize()), 
                 "app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), 0, 0, 0, &log);
-            ASSERT_CUDA_SUCCESS_NV(res, log);
+            ASSERT_NV_SUCCESS(res, log);
 
             ptx = std::move(ptx_);
         }

From 82d05923f15c09f1f1de771c14b9c1b89c5ca28b Mon Sep 17 00:00:00 2001
From: kevyuu <kyu.kevinyu@gmail.com>
Date: Mon, 4 May 2026 14:22:49 +0700
Subject: [PATCH 08/47] Slight naming refactor

---
 76_CudaInterop/main.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index 84dbac39f..5fd8151bf 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -85,7 +85,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
 
         queue = getComputeQueue();
 
-        testSharedResource();
+        testVectorAddKernel();
         testDestruction();
         testLargeAllocations();
 
@@ -120,7 +120,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         return buf;
     };
 
-    void testSharedResource()
+    void testVectorAddKernel()
     {
         static constexpr uint32_t GridDim[3] = { 4096,1,1 };
         static constexpr uint32_t BlockDim[3] = { 1024,1,1 };
@@ -389,8 +389,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
               const auto output = outputs[elem_i];
               const auto expected = input1 + input2;
               const auto diff = abs(output - expected);
-              bool re = diff < 0.01;
-              if (!re)
+              if (!(diff < 0.01)) // out of tolerance; negated '<' so a NaN diff is reported as well
                 ctx->logger->log("TestSharedResources: Element at index %d is incorrect!", ILogger::ELL_ERROR, elem_i);
             }
 

From a229db2993e35af7b09c8fd5393b8e16d7ff6435 Mon Sep 17 00:00:00 2001
From: kevyuu <kyu.kevinyu@gmail.com>
Date: Mon, 4 May 2026 14:24:35 +0700
Subject: [PATCH 09/47] Remove unused commented code

---
 76_CudaInterop/main.cpp | 43 -----------------------------------------
 1 file changed, 43 deletions(-)

diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index 5fd8151bf..dfd214384 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -480,49 +480,6 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             m_logger->log("Test Destruction complete", ILogger::ELL_INFO);
         }
     
-        // {
-        //     constexpr size_t M = 32;
-        //     auto staging = createStaging(size * M);
-        //
-        //     auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer();
-        //     for (uint32_t i = 0; i < (M * size) / 4; ++i)
-        //         ptr[i] = rand();
-        //
-        //     std::vector<smart_refctd_ptr<IGPUCommandBuffer>> cmd(1 << 10);
-        //     commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1 << 10, cmd.data());
-        //
-        //     for (size_t i = 0; i < 1 << 10; ++i)
-        //     {
-        //         IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = {
-        //             .size = size * M,
-        //             .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(),
-        //             .alignmentLog2 = 10,
-        //         };
-        //     RE:
-        //         auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory;
-        //
-        //         if (!memory)
-        //         {
-        //             m_device->waitIdle();
-        //             for (size_t j = 0; j < i; ++j)
-        //                 cmd[j] = 0;
-        //             goto END;
-        //         }
-        //         assert(memory);
-        //         auto tmpBuf = createExternalBuffer(memory.get());
-        //
-        //         cmd[i]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-        //         IGPUCommandBuffer::SBufferCopy region = { .size = size * M };
-        //         assert(cmd[i]->copyBuffer(staging.get(), tmpBuf.get(), 1, &region));
-        //         cmd[i]->end();
-        //         IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[i].get() };
-        //         IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} };
-        //         assert(IQueue::RESULT::SUCCESS == queue->submit({ &submitInfo,&submitInfo + 1 }));
-        //     }
-        // END:
-        //     m_device->waitIdle();
-        // }
-    
     }
 
     void testLargeAllocations()

From feac63dc5b968a6351381a05a1632019b8d19749 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Wed, 6 May 2026 09:28:11 +0200
Subject: [PATCH 10/47] Build CUDA interop example through extension target

---
 76_CudaInterop/CMakeLists.txt | 8 +++++++-
 76_CudaInterop/main.cpp       | 6 ++----
 CMakeLists.txt                | 6 ++++--
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/76_CudaInterop/CMakeLists.txt b/76_CudaInterop/CMakeLists.txt
index bc1624875..c904da699 100644
--- a/76_CudaInterop/CMakeLists.txt
+++ b/76_CudaInterop/CMakeLists.txt
@@ -5,6 +5,12 @@ endif()
 
 nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
 
+if(NOT TARGET Nabla::ext::CUDAInterop)
+	message(FATAL_ERROR "76_CudaInterop requires the CUDA interop extension target")
+endif()
+
+target_link_libraries(${EXECUTABLE_NAME} PRIVATE Nabla::ext::CUDAInterop)
+
 if(NBL_EMBED_BUILTIN_RESOURCES)
 	set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
 	set(RESOURCE_DIR "app_resources")
@@ -21,4 +27,4 @@ if(NBL_EMBED_BUILTIN_RESOURCES)
 	ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
 
 	LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
-endif()
\ No newline at end of file
+endif()
diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index dfd214384..9108e08f4 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -1,6 +1,4 @@
-#include "nbl/video/CCUDAHandler.h"
-// #include "nbl/video/CCUDAExportableMemory.h"
-// #include "nbl/video/CCUDAImportedSemaphore.h"
+#include "nbl/ext/CUDAInterop/CUDAInterop.h"
 
 #include "nbl/application_templates/MonoDeviceApplication.hpp"
 #include "nbl/examples/common/BuiltinResourcesApplication.hpp"
@@ -508,4 +506,4 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
     void workLoopBody() override {}
 };
 
-NBL_MAIN_FUNC(CUDA2VKApp)
\ No newline at end of file
+NBL_MAIN_FUNC(CUDA2VKApp)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7c7990c06..0715f1064 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -111,7 +111,9 @@ if(NBL_BUILD_EXAMPLES)
 	endif()
 
 	add_subdirectory(74_QuantizedSequenceTests)
-	add_subdirectory(76_CudaInterop)
+	if (NBL_COMPILE_WITH_CUDA)
+		add_subdirectory(76_CudaInterop)
+	endif()
 
 	# add new examples *before* NBL_GET_ALL_TARGETS invocation, it gathers recursively all targets created so far in this subdirectory
 	NBL_GET_ALL_TARGETS(TARGETS)
@@ -137,4 +139,4 @@ if(NBL_BUILD_EXAMPLES)
     endforeach()
 
 	NBL_ADJUST_FOLDERS(examples)
-endif()
\ No newline at end of file
+endif()

From 6f136a224c516182d8d7883407a04be529063a9e Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Wed, 6 May 2026 11:42:32 +0200
Subject: [PATCH 11/47] Simplify CUDA interop example link

---
 76_CudaInterop/CMakeLists.txt | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/76_CudaInterop/CMakeLists.txt b/76_CudaInterop/CMakeLists.txt
index c904da699..8eb08f70b 100644
--- a/76_CudaInterop/CMakeLists.txt
+++ b/76_CudaInterop/CMakeLists.txt
@@ -5,10 +5,6 @@ endif()
 
 nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
 
-if(NOT TARGET Nabla::ext::CUDAInterop)
-	message(FATAL_ERROR "76_CudaInterop requires the CUDA interop extension target")
-endif()
-
 target_link_libraries(${EXECUTABLE_NAME} PRIVATE Nabla::ext::CUDAInterop)
 
 if(NBL_EMBED_BUILTIN_RESOURCES)

From b17beb26f27a9cb8347a25fd5587c7cc9310d589 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Wed, 6 May 2026 16:16:29 +0200
Subject: [PATCH 12/47] Use CUDA interop native target

---
 76_CudaInterop/CMakeLists.txt |  2 +-
 76_CudaInterop/main.cpp       | 24 ++++++++++++------------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/76_CudaInterop/CMakeLists.txt b/76_CudaInterop/CMakeLists.txt
index 8eb08f70b..bd4f1914b 100644
--- a/76_CudaInterop/CMakeLists.txt
+++ b/76_CudaInterop/CMakeLists.txt
@@ -5,7 +5,7 @@ endif()
 
 nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
 
-target_link_libraries(${EXECUTABLE_NAME} PRIVATE Nabla::ext::CUDAInterop)
+target_link_libraries(${EXECUTABLE_NAME} PRIVATE Nabla::ext::CUDAInteropNative)
 
 if(NBL_EMBED_BUILTIN_RESOURCES)
 	set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index 9108e08f4..becdfbe50 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -1,4 +1,4 @@
-#include "nbl/ext/CUDAInterop/CUDAInterop.h"
+#include "nbl/ext/CUDAInterop/CUDAInteropNative.h"
 
 #include "nbl/application_templates/MonoDeviceApplication.hpp"
 #include "nbl/examples/common/BuiltinResourcesApplication.hpp"
@@ -16,9 +16,9 @@ user for the desired renderer and start it up.
 
 bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto line, std::string const& log)
 {
-    if (auto re = err; NVRTC_SUCCESS != re) 
+    if (auto re = err; NVRTC_SUCCESS != re)
     {
-        const char* str = cudaHandler->getNVRTCFunctionTable().pnvrtcGetErrorString(re); 
+        const char* str = cuda_native::getNVRTCFunctionTable(*cudaHandler).pnvrtcGetErrorString(re);
         logger->log("%s:%d %s\n%s\n", system::ILogger::ELL_ERROR, file, line, str, log.c_str());
         return false;
     }
@@ -59,7 +59,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         device_base_t::filterDevices(physicalDevices);
         auto& cuDevices = cudaHandler->getAvailableDevices();
         std::erase_if(physicalDevices, [&cuDevices](auto pdev) {
-            return cuDevices.end() == std::find_if(cuDevices.begin(), cuDevices.end(), [pdev](auto& cuDev) { return !memcmp(pdev->getProperties().deviceUUID, &cuDev.uuid, 16);  });
+            return cuDevices.end() == std::find_if(cuDevices.begin(), cuDevices.end(), [pdev](auto& cuDev) { return !memcmp(pdev->getProperties().deviceUUID, cuDev.uuid.data(), 16);  });
         });
     }
 
@@ -138,14 +138,14 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
 
             smart_refctd_ptr<ICPUBuffer> source = IAsset::castDown<ICPUBuffer>(assets[0]);
             std::string log;
-            auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(std::string((const char*)source->getPointer(), source->getSize()), 
+            auto [ptx_, res] = cuda_native::compileDirectlyToPTX(*cudaHandler, std::string((const char*)source->getPointer(), source->getSize()),
                 "app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), 0, 0, 0, &log);
             ASSERT_NV_SUCCESS(res, log);
 
             ptx = std::move(ptx_);
         }
 
-        auto& cu = cudaHandler->getCUDAFunctionTable();
+        auto& cu = cuda_native::getCUDAFunctionTable(*cudaHandler);
 
         CUmodule   module;
         CUfunction kernel;
@@ -187,7 +187,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         }
 
         IGPUBuffer::SCreationParams outputBufferParams;
-        outputBufferParams.size = cudaDevice->roundToGranularity(CU_MEM_LOCATION_TYPE_DEVICE, BufferSize);
+        outputBufferParams.size = cudaDevice->roundToGranularity(ECUDAMemoryLocation::DEVICE, BufferSize);
         outputBufferParams.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT;
         outputBufferParams.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE;
         const auto outputBuf = m_device->createBuffer(std::move(outputBufferParams));
@@ -255,10 +255,10 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         // Launch kernel
         {
             CUdeviceptr outputBufPtr;
-            cudaOutputMemory->getMappedBuffer(&outputBufPtr);
+            cuda_native::getMappedBuffer(*cudaOutputMemory, &outputBufPtr);
             CUdeviceptr ptrs[] = {
-              cudaInputMemories[0]->getDeviceptr(),
-              cudaInputMemories[1]->getDeviceptr(),
+              cuda_native::getDeviceptr(*cudaInputMemories[0]),
+              cuda_native::getDeviceptr(*cudaInputMemories[1]),
               outputBufPtr
             };
             auto numElements = &NumElements;
@@ -266,7 +266,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream), cudaHandler);
             ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), BufferSize, stream), cudaHandler);
     
-            auto semaphore = cudaSemaphore->getInternalObject();
+            auto semaphore = cuda_native::getInternalObject(*cudaSemaphore);
             const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } };
             ASSERT_CUDA_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream), cudaHandler); // Wait for release op from vulkan
             ASSERT_CUDA_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, stream, parameters, nullptr), cudaHandler);
@@ -406,7 +406,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         auto commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
         constexpr auto ElementCount = 1024;
         constexpr auto BufferSize = ElementCount * sizeof(int);
-        auto& cu = cudaHandler->getCUDAFunctionTable();
+        auto& cu = cuda_native::getCUDAFunctionTable(*cudaHandler);
         smart_refctd_ptr<IDeviceMemoryAllocation> escaped;
         {
             const auto cudaMemory = cudaDevice->createExportableMemory({ .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE });

From fd50fda4952096febc8ab9df94e441d55e54e7bf Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Wed, 6 May 2026 16:39:53 +0200
Subject: [PATCH 13/47] Use native CUDA accessors

---
 76_CudaInterop/main.cpp | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index becdfbe50..289b0c0b1 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -8,6 +8,7 @@ using namespace core;
 using namespace system;
 using namespace asset;
 using namespace video;
+namespace cuda = nbl::video::cuda_native;
 
 /*
 The start of the main function starts like in most other example. We ask the
@@ -18,7 +19,7 @@ bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto lin
 {
     if (auto re = err; NVRTC_SUCCESS != re)
     {
-        const char* str = cuda_native::getNVRTCFunctionTable(*cudaHandler).pnvrtcGetErrorString(re);
+        const char* str = cuda::getNVRTCFunctionTable(cudaHandler).pnvrtcGetErrorString(re);
         logger->log("%s:%d %s\n%s\n", system::ILogger::ELL_ERROR, file, line, str, log.c_str());
         return false;
     }
@@ -138,14 +139,14 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
 
             smart_refctd_ptr<ICPUBuffer> source = IAsset::castDown<ICPUBuffer>(assets[0]);
             std::string log;
-            auto [ptx_, res] = cuda_native::compileDirectlyToPTX(*cudaHandler, std::string((const char*)source->getPointer(), source->getSize()),
+            auto [ptx_, res] = cuda::compileDirectlyToPTX(cudaHandler, std::string((const char*)source->getPointer(), source->getSize()),
                 "app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), 0, 0, 0, &log);
             ASSERT_NV_SUCCESS(res, log);
 
             ptx = std::move(ptx_);
         }
 
-        auto& cu = cuda_native::getCUDAFunctionTable(*cudaHandler);
+        auto& cu = cuda::getCUDAFunctionTable(cudaHandler);
 
         CUmodule   module;
         CUfunction kernel;
@@ -255,10 +256,10 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         // Launch kernel
         {
             CUdeviceptr outputBufPtr;
-            cuda_native::getMappedBuffer(*cudaOutputMemory, &outputBufPtr);
+            cuda::getMappedBuffer(cudaOutputMemory, &outputBufPtr);
             CUdeviceptr ptrs[] = {
-              cuda_native::getDeviceptr(*cudaInputMemories[0]),
-              cuda_native::getDeviceptr(*cudaInputMemories[1]),
+              cuda::getDeviceptr(cudaInputMemories[0]),
+              cuda::getDeviceptr(cudaInputMemories[1]),
               outputBufPtr
             };
             auto numElements = &NumElements;
@@ -266,7 +267,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream), cudaHandler);
             ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), BufferSize, stream), cudaHandler);
     
-            auto semaphore = cuda_native::getInternalObject(*cudaSemaphore);
+            auto semaphore = cuda::getInternalObject(cudaSemaphore);
             const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } };
             ASSERT_CUDA_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream), cudaHandler); // Wait for release op from vulkan
             ASSERT_CUDA_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, stream, parameters, nullptr), cudaHandler);
@@ -406,7 +407,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         auto commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
         constexpr auto ElementCount = 1024;
         constexpr auto BufferSize = ElementCount * sizeof(int);
-        auto& cu = cuda_native::getCUDAFunctionTable(*cudaHandler);
+        auto& cu = cuda::getCUDAFunctionTable(cudaHandler);
         smart_refctd_ptr<IDeviceMemoryAllocation> escaped;
         {
             const auto cudaMemory = cudaDevice->createExportableMemory({ .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE });

From 24525f0ee735f19d92a688fe85be84667b79af66 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Wed, 6 May 2026 17:40:31 +0200
Subject: [PATCH 14/47] Use CUDA interop target

---
 76_CudaInterop/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/76_CudaInterop/CMakeLists.txt b/76_CudaInterop/CMakeLists.txt
index bd4f1914b..8eb08f70b 100644
--- a/76_CudaInterop/CMakeLists.txt
+++ b/76_CudaInterop/CMakeLists.txt
@@ -5,7 +5,7 @@ endif()
 
 nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
 
-target_link_libraries(${EXECUTABLE_NAME} PRIVATE Nabla::ext::CUDAInteropNative)
+target_link_libraries(${EXECUTABLE_NAME} PRIVATE Nabla::ext::CUDAInterop)
 
 if(NBL_EMBED_BUILTIN_RESOURCES)
 	set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)

From 4671898c61f7b00de9e3e88d039b199e3b16cc0b Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Thu, 7 May 2026 09:24:56 +0200
Subject: [PATCH 15/47] Use CUDA native interop helper

---
 76_CudaInterop/main.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index 289b0c0b1..64616a6b7 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -181,14 +181,15 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         for (auto input_i = 0; input_i < InputCount; input_i++)
         {
           // create and allocate CUmem with CUDA and slap it inside a simple IReferenceCounted wrapper
-          cudaInputMemories[input_i] = cudaDevice->createExportableMemory({ .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE });
+            cudaInputMemories[input_i] = cuda_native::createExportableMemory(cudaDevice, { .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE });
+          assert(cudaInputMemories[input_i]);
           vulkanMemories[input_i] = cudaInputMemories[input_i]->exportAsMemory(m_device.get(), nullptr);
           vulkanInputBuffers[input_i] = createExternalBuffer(vulkanMemories[input_i].get());
           inputStagingBuffers[input_i] = createStaging(BufferSize);
         }
 
         IGPUBuffer::SCreationParams outputBufferParams;
-        outputBufferParams.size = cudaDevice->roundToGranularity(ECUDAMemoryLocation::DEVICE, BufferSize);
+        outputBufferParams.size = cuda_native::roundToGranularity(cudaDevice, CU_MEM_LOCATION_TYPE_DEVICE, BufferSize);
         outputBufferParams.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT;
         outputBufferParams.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE;
         const auto outputBuf = m_device->createBuffer(std::move(outputBufferParams));
@@ -410,9 +411,8 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         auto& cu = cuda::getCUDAFunctionTable(cudaHandler);
         smart_refctd_ptr<IDeviceMemoryAllocation> escaped;
         {
-            const auto cudaMemory = cudaDevice->createExportableMemory({ .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE });
-            if (!cudaMemory) logFail("Fail to create exportable memory!");
-
+            core::smart_refctd_ptr<CCUDAExportableMemory> cudaMemory = cuda_native::createExportableMemory(cudaDevice, { .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE });
+            assert(cudaMemory);
             escaped = cudaMemory->exportAsMemory(m_device.get());
             if (!escaped) logFail("Fail to export CUDA memory!");
         

From acdcfc8e0feb29a81a274b951c010c7b95f07230 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Thu, 7 May 2026 11:58:06 +0200
Subject: [PATCH 16/47] Use CUDA interop helper in example

---
 76_CudaInterop/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/76_CudaInterop/CMakeLists.txt b/76_CudaInterop/CMakeLists.txt
index 8eb08f70b..de9f9d6b8 100644
--- a/76_CudaInterop/CMakeLists.txt
+++ b/76_CudaInterop/CMakeLists.txt
@@ -5,7 +5,7 @@ endif()
 
 nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
 
-target_link_libraries(${EXECUTABLE_NAME} PRIVATE Nabla::ext::CUDAInterop)
+nbl_target_link_cuda_interop(${EXECUTABLE_NAME} PRIVATE)
 
 if(NBL_EMBED_BUILTIN_RESOURCES)
 	set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)

From d5aa23b9648f5854830101a2bda722a402775238 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Thu, 7 May 2026 16:05:25 +0200
Subject: [PATCH 17/47] Use CUDA interop accessors

---
 76_CudaInterop/main.cpp | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index 64616a6b7..6f7fee94c 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -19,7 +19,7 @@ bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto lin
 {
     if (auto re = err; NVRTC_SUCCESS != re)
     {
-        const char* str = cuda::getNVRTCFunctionTable(cudaHandler).pnvrtcGetErrorString(re);
+        const char* str = cuda::CCUDAHandlerAccessor::getNVRTCFunctionTable(*cudaHandler).pnvrtcGetErrorString(re);
         logger->log("%s:%d %s\n%s\n", system::ILogger::ELL_ERROR, file, line, str, log.c_str());
         return false;
     }
@@ -139,14 +139,14 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
 
             smart_refctd_ptr<ICPUBuffer> source = IAsset::castDown<ICPUBuffer>(assets[0]);
             std::string log;
-            auto [ptx_, res] = cuda::compileDirectlyToPTX(cudaHandler, std::string((const char*)source->getPointer(), source->getSize()),
+            auto [ptx_, res] = cuda::CCUDAHandlerAccessor::compileDirectlyToPTX(*cudaHandler, std::string((const char*)source->getPointer(), source->getSize()),
                 "app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), 0, 0, 0, &log);
             ASSERT_NV_SUCCESS(res, log);
 
             ptx = std::move(ptx_);
         }
 
-        auto& cu = cuda::getCUDAFunctionTable(cudaHandler);
+        auto& cu = cuda::CCUDAHandlerAccessor::getCUDAFunctionTable(*cudaHandler);
 
         CUmodule   module;
         CUfunction kernel;
@@ -181,7 +181,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         for (auto input_i = 0; input_i < InputCount; input_i++)
         {
           // create and allocate CUmem with CUDA and slap it inside a simple IReferenceCounted wrapper
-            cudaInputMemories[input_i] = cuda_native::createExportableMemory(cudaDevice, { .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE });
+            cudaInputMemories[input_i] = cuda_native::CCUDADeviceAccessor::createExportableMemory(*cudaDevice, { .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE });
           assert(cudaInputMemories[input_i]);
           vulkanMemories[input_i] = cudaInputMemories[input_i]->exportAsMemory(m_device.get(), nullptr);
           vulkanInputBuffers[input_i] = createExternalBuffer(vulkanMemories[input_i].get());
@@ -189,7 +189,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         }
 
         IGPUBuffer::SCreationParams outputBufferParams;
-        outputBufferParams.size = cuda_native::roundToGranularity(cudaDevice, CU_MEM_LOCATION_TYPE_DEVICE, BufferSize);
+        outputBufferParams.size = cuda_native::CCUDADeviceAccessor::roundToGranularity(*cudaDevice, CU_MEM_LOCATION_TYPE_DEVICE, BufferSize);
         outputBufferParams.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT;
         outputBufferParams.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE;
         const auto outputBuf = m_device->createBuffer(std::move(outputBufferParams));
@@ -257,10 +257,10 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         // Launch kernel
         {
             CUdeviceptr outputBufPtr;
-            cuda::getMappedBuffer(cudaOutputMemory, &outputBufPtr);
+            cuda::CCUDAImportedMemoryAccessor::getMappedBuffer(*cudaOutputMemory, &outputBufPtr);
             CUdeviceptr ptrs[] = {
-              cuda::getDeviceptr(cudaInputMemories[0]),
-              cuda::getDeviceptr(cudaInputMemories[1]),
+              cuda::CCUDAExportableMemoryAccessor::getDeviceptr(*cudaInputMemories[0]),
+              cuda::CCUDAExportableMemoryAccessor::getDeviceptr(*cudaInputMemories[1]),
               outputBufPtr
             };
             auto numElements = &NumElements;
@@ -268,7 +268,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream), cudaHandler);
             ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), BufferSize, stream), cudaHandler);
     
-            auto semaphore = cuda::getInternalObject(cudaSemaphore);
+            auto semaphore = cuda::CCUDAImportedSemaphoreAccessor::getInternalObject(*cudaSemaphore);
             const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } };
             ASSERT_CUDA_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream), cudaHandler); // Wait for release op from vulkan
             ASSERT_CUDA_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, stream, parameters, nullptr), cudaHandler);
@@ -408,10 +408,10 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         auto commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
         constexpr auto ElementCount = 1024;
         constexpr auto BufferSize = ElementCount * sizeof(int);
-        auto& cu = cuda::getCUDAFunctionTable(cudaHandler);
+        auto& cu = cuda::CCUDAHandlerAccessor::getCUDAFunctionTable(*cudaHandler);
         smart_refctd_ptr<IDeviceMemoryAllocation> escaped;
         {
-            core::smart_refctd_ptr<CCUDAExportableMemory> cudaMemory = cuda_native::createExportableMemory(cudaDevice, { .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE });
+            core::smart_refctd_ptr<CCUDAExportableMemory> cudaMemory = cuda_native::CCUDADeviceAccessor::createExportableMemory(*cudaDevice, { .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE });
             assert(cudaMemory);
             escaped = cudaMemory->exportAsMemory(m_device.get());
             if (!escaped) logFail("Fail to export CUDA memory!");

From 5031a249c5cd892190e74aca69ed15c6144575c0 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Thu, 7 May 2026 16:35:27 +0200
Subject: [PATCH 18/47] Use explicit CUDA compile log

---
 76_CudaInterop/main.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index 6f7fee94c..b4dffcd31 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -139,11 +139,11 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
 
             smart_refctd_ptr<ICPUBuffer> source = IAsset::castDown<ICPUBuffer>(assets[0]);
             std::string log;
-            auto [ptx_, res] = cuda::CCUDAHandlerAccessor::compileDirectlyToPTX(*cudaHandler, std::string((const char*)source->getPointer(), source->getSize()),
-                "app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), 0, 0, 0, &log);
-            ASSERT_NV_SUCCESS(res, log);
+            auto compile = cuda::CCUDAHandlerAccessor::compileDirectlyToPTX(*cudaHandler, std::string((const char*)source->getPointer(), source->getSize()),
+                "app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), log, 0, 0, 0);
+            ASSERT_NV_SUCCESS(compile.result, log);
 
-            ptx = std::move(ptx_);
+            ptx = std::move(compile.ptx);
         }
 
         auto& cu = cuda::CCUDAHandlerAccessor::getCUDAFunctionTable(*cudaHandler);

From 7b5817a6d45c62a70fbe617022b6026a83939ff5 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Thu, 7 May 2026 18:11:24 +0200
Subject: [PATCH 19/47] Fix CUDA interop example assert helper

---
 76_CudaInterop/main.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index b4dffcd31..f528dc561 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -27,6 +27,7 @@ bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto lin
 }
 
 #define ASSERT_NV_SUCCESS(expr, log) { auto re = check_nv_err((expr), cudaHandler, m_logger, __FILE__, __LINE__, log); assert(re); }
+#define ASSERT_CUDA_SUCCESS(expr, handler) { auto re = cuda::CCUDAHandlerAccessor::defaultHandleResult(*(handler), (expr)); assert(re); }
 
 
 using namespace nbl::core;

From 2d415af102ebf710ea2bb369b3f0eca5544652f7 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Fri, 8 May 2026 17:06:48 +0200
Subject: [PATCH 20/47] Use opaque CUDA interop handles

---
 76_CudaInterop/main.cpp | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index f528dc561..38c336da0 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -19,7 +19,7 @@ bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto lin
 {
     if (auto re = err; NVRTC_SUCCESS != re)
     {
-        const char* str = cuda::CCUDAHandlerAccessor::getNVRTCFunctionTable(*cudaHandler).pnvrtcGetErrorString(re);
+        const char* str = cuda::getNVRTCFunctionTable(*cudaHandler).pnvrtcGetErrorString(re);
         logger->log("%s:%d %s\n%s\n", system::ILogger::ELL_ERROR, file, line, str, log.c_str());
         return false;
     }
@@ -27,7 +27,7 @@ bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto lin
 }
 
 #define ASSERT_NV_SUCCESS(expr, log) { auto re = check_nv_err((expr), cudaHandler, m_logger, __FILE__, __LINE__, log); assert(re); }
-#define ASSERT_CUDA_SUCCESS(expr, handler) { auto re = cuda::CCUDAHandlerAccessor::defaultHandleResult(*(handler), (expr)); assert(re); }
+#define ASSERT_CUDA_SUCCESS(expr, handler) { auto re = cuda::defaultHandleResult(*(handler), (expr)); assert(re); }
 
 
 using namespace nbl::core;
@@ -140,14 +140,14 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
 
             smart_refctd_ptr<ICPUBuffer> source = IAsset::castDown<ICPUBuffer>(assets[0]);
             std::string log;
-            auto compile = cuda::CCUDAHandlerAccessor::compileDirectlyToPTX(*cudaHandler, std::string((const char*)source->getPointer(), source->getSize()),
+            auto compile = cuda::compileDirectlyToPTX(*cudaHandler, std::string((const char*)source->getPointer(), source->getSize()),
                 "app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), log, 0, 0, 0);
             ASSERT_NV_SUCCESS(compile.result, log);
 
             ptx = std::move(compile.ptx);
         }
 
-        auto& cu = cuda::CCUDAHandlerAccessor::getCUDAFunctionTable(*cudaHandler);
+        auto& cu = cuda::getCUDAFunctionTable(*cudaHandler);
 
         CUmodule   module;
         CUfunction kernel;
@@ -182,7 +182,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         for (auto input_i = 0; input_i < InputCount; input_i++)
         {
           // create and allocate CUmem with CUDA and slap it inside a simple IReferenceCounted wrapper
-            cudaInputMemories[input_i] = cuda_native::CCUDADeviceAccessor::createExportableMemory(*cudaDevice, { .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE });
+            cudaInputMemories[input_i] = cuda::createExportableMemory(*cudaDevice, { .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE });
           assert(cudaInputMemories[input_i]);
           vulkanMemories[input_i] = cudaInputMemories[input_i]->exportAsMemory(m_device.get(), nullptr);
           vulkanInputBuffers[input_i] = createExternalBuffer(vulkanMemories[input_i].get());
@@ -190,7 +190,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         }
 
         IGPUBuffer::SCreationParams outputBufferParams;
-        outputBufferParams.size = cuda_native::CCUDADeviceAccessor::roundToGranularity(*cudaDevice, CU_MEM_LOCATION_TYPE_DEVICE, BufferSize);
+        outputBufferParams.size = cuda::roundToGranularity(*cudaDevice, CU_MEM_LOCATION_TYPE_DEVICE, BufferSize);
         outputBufferParams.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT;
         outputBufferParams.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE;
         const auto outputBuf = m_device->createBuffer(std::move(outputBufferParams));
@@ -257,11 +257,11 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         
         // Launch kernel
         {
-            CUdeviceptr outputBufPtr;
-            cuda::CCUDAImportedMemoryAccessor::getMappedBuffer(*cudaOutputMemory, &outputBufPtr);
+            cuda::SCUdeviceptr outputBufPtr;
+            cudaOutputMemory->getMappedBuffer(outputBufPtr.opaque());
             CUdeviceptr ptrs[] = {
-              cuda::CCUDAExportableMemoryAccessor::getDeviceptr(*cudaInputMemories[0]),
-              cuda::CCUDAExportableMemoryAccessor::getDeviceptr(*cudaInputMemories[1]),
+              cuda::SCUdeviceptr(cudaInputMemories[0]->getDeviceptr()),
+              cuda::SCUdeviceptr(cudaInputMemories[1]->getDeviceptr()),
               outputBufPtr
             };
             auto numElements = &NumElements;
@@ -269,7 +269,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream), cudaHandler);
             ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), BufferSize, stream), cudaHandler);
     
-            auto semaphore = cuda::CCUDAImportedSemaphoreAccessor::getInternalObject(*cudaSemaphore);
+            CUexternalSemaphore semaphore = cuda::SCUexternalSemaphore(cudaSemaphore->getInternalObject());
             const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } };
             ASSERT_CUDA_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream), cudaHandler); // Wait for release op from vulkan
             ASSERT_CUDA_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, stream, parameters, nullptr), cudaHandler);
@@ -409,10 +409,10 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         auto commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
         constexpr auto ElementCount = 1024;
         constexpr auto BufferSize = ElementCount * sizeof(int);
-        auto& cu = cuda::CCUDAHandlerAccessor::getCUDAFunctionTable(*cudaHandler);
+        auto& cu = cuda::getCUDAFunctionTable(*cudaHandler);
         smart_refctd_ptr<IDeviceMemoryAllocation> escaped;
         {
-            core::smart_refctd_ptr<CCUDAExportableMemory> cudaMemory = cuda_native::CCUDADeviceAccessor::createExportableMemory(*cudaDevice, { .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE });
+            core::smart_refctd_ptr<CCUDAExportableMemory> cudaMemory = cuda::createExportableMemory(*cudaDevice, { .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE });
             assert(cudaMemory);
             escaped = cudaMemory->exportAsMemory(m_device.get());
             if (!escaped) logFail("Fail to export CUDA memory!");

From e289ee14f5b8f05004726e6f03c81a9a2e768219 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Sat, 9 May 2026 13:21:43 +0200
Subject: [PATCH 21/47] Use opaque CUDA interop calls

---
 76_CudaInterop/main.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index 38c336da0..ec9d8b25f 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -19,7 +19,7 @@ bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto lin
 {
     if (auto re = err; NVRTC_SUCCESS != re)
     {
-        const char* str = cuda::getNVRTCFunctionTable(*cudaHandler).pnvrtcGetErrorString(re);
+        const char* str = cudaHandler->getNVRTCFunctionTable().pnvrtcGetErrorString(re);
         logger->log("%s:%d %s\n%s\n", system::ILogger::ELL_ERROR, file, line, str, log.c_str());
         return false;
     }
@@ -147,7 +147,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             ptx = std::move(compile.ptx);
         }
 
-        auto& cu = cuda::getCUDAFunctionTable(*cudaHandler);
+        auto& cu = cudaHandler->getCUDAFunctionTable();
 
         CUmodule   module;
         CUfunction kernel;
@@ -182,7 +182,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         for (auto input_i = 0; input_i < InputCount; input_i++)
         {
           // create and allocate CUmem with CUDA and slap it inside a simple IReferenceCounted wrapper
-            cudaInputMemories[input_i] = cuda::createExportableMemory(*cudaDevice, { .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE });
+            cudaInputMemories[input_i] = cudaDevice->createExportableMemory({ .size = BufferSize, .alignment = sizeof(float), .locationType = CU_MEM_LOCATION_TYPE_DEVICE });
           assert(cudaInputMemories[input_i]);
           vulkanMemories[input_i] = cudaInputMemories[input_i]->exportAsMemory(m_device.get(), nullptr);
           vulkanInputBuffers[input_i] = createExternalBuffer(vulkanMemories[input_i].get());
@@ -190,7 +190,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         }
 
         IGPUBuffer::SCreationParams outputBufferParams;
-        outputBufferParams.size = cuda::roundToGranularity(*cudaDevice, CU_MEM_LOCATION_TYPE_DEVICE, BufferSize);
+        outputBufferParams.size = cudaDevice->roundToGranularity(CU_MEM_LOCATION_TYPE_DEVICE, BufferSize);
         outputBufferParams.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT;
         outputBufferParams.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE;
         const auto outputBuf = m_device->createBuffer(std::move(outputBufferParams));
@@ -409,10 +409,10 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         auto commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
         constexpr auto ElementCount = 1024;
         constexpr auto BufferSize = ElementCount * sizeof(int);
-        auto& cu = cuda::getCUDAFunctionTable(*cudaHandler);
+        auto& cu = cudaHandler->getCUDAFunctionTable();
         smart_refctd_ptr<IDeviceMemoryAllocation> escaped;
         {
-            core::smart_refctd_ptr<CCUDAExportableMemory> cudaMemory = cuda::createExportableMemory(*cudaDevice, { .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE });
+            core::smart_refctd_ptr<CCUDAExportableMemory> cudaMemory = cudaDevice->createExportableMemory({ .size = BufferSize, .alignment = sizeof(float), .locationType = CU_MEM_LOCATION_TYPE_DEVICE });
             assert(cudaMemory);
             escaped = cudaMemory->exportAsMemory(m_device.get());
             if (!escaped) logFail("Fail to export CUDA memory!");

From b4601fc685176d6729b095a9637c1662d0a29503 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Sat, 9 May 2026 16:53:35 +0200
Subject: [PATCH 22/47] Use native CUDA interop conversion

---
 76_CudaInterop/main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index ec9d8b25f..263e3dcce 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -258,7 +258,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         // Launch kernel
         {
             cuda::SCUdeviceptr outputBufPtr;
-            cudaOutputMemory->getMappedBuffer(outputBufPtr.opaque());
+            cudaOutputMemory->getMappedBuffer(outputBufPtr);
             CUdeviceptr ptrs[] = {
               cuda::SCUdeviceptr(cudaInputMemories[0]->getDeviceptr()),
               cuda::SCUdeviceptr(cudaInputMemories[1]->getDeviceptr()),

From d373d313d3e70579d650c7804af8a2785cfede9a Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Sun, 10 May 2026 10:19:11 +0200
Subject: [PATCH 23/47] Fix CUDA interop smoke validation

---
 76_CudaInterop/main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index 263e3dcce..d66688710 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -390,7 +390,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
               const auto output = outputs[elem_i];
               const auto expected = input1 + input2;
               const auto diff = abs(output - expected);
-              if (diff < 0.01)
+              if (diff >= 0.01f)
                 ctx->logger->log("TestSharedResources: Element at index %d is incorrect!", ILogger::ELL_ERROR, elem_i);
             }
 

From a6268bc9953b8d8a795b3b2eee8dbd897b05706e Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Sun, 10 May 2026 15:36:54 +0200
Subject: [PATCH 24/47] Use CUDA interop assert helper

---
 76_CudaInterop/main.cpp | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index d66688710..f090a4500 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -27,7 +27,6 @@ bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto lin
 }
 
 #define ASSERT_NV_SUCCESS(expr, log) { auto re = check_nv_err((expr), cudaHandler, m_logger, __FILE__, __LINE__, log); assert(re); }
-#define ASSERT_CUDA_SUCCESS(expr, handler) { auto re = cuda::defaultHandleResult(*(handler), (expr)); assert(re); }
 
 
 using namespace nbl::core;
@@ -153,9 +152,9 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         CUfunction kernel;
         CUstream   stream;
 
-        ASSERT_CUDA_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr), cudaHandler);
-        ASSERT_CUDA_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"), cudaHandler);
-        ASSERT_CUDA_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), cudaHandler);
+        NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr), *cudaHandler);
+        NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"), *cudaHandler);
+        NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), *cudaHandler);
 
         // CPU memory which we fill with random numbers between [-1,1] that will be copied to corresponding cudaMemory
         std::array<smart_refctd_ptr<ICPUBuffer>, 2> cpuBufs;
@@ -266,17 +265,17 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             };
             auto numElements = &NumElements;
             void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numElements };
-            ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream), cudaHandler);
-            ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), BufferSize, stream), cudaHandler);
+            NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream), *cudaHandler);
+            NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), BufferSize, stream), *cudaHandler);
     
             CUexternalSemaphore semaphore = cuda::SCUexternalSemaphore(cudaSemaphore->getInternalObject());
             const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } };
-            ASSERT_CUDA_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream), cudaHandler); // Wait for release op from vulkan
-            ASSERT_CUDA_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, stream, parameters, nullptr), cudaHandler);
+            NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream), *cudaHandler); // Wait for release op from vulkan
+            NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, stream, parameters, nullptr), *cudaHandler);
             const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } };
-            ASSERT_CUDA_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream), cudaHandler); // Signal the imported semaphore
+            NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream), *cudaHandler); // Signal the imported semaphore
         }
-        ASSERT_CUDA_SUCCESS(cu.pcuStreamSynchronize(stream), cudaHandler);
+        NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream), *cudaHandler);
         
         // After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA
         {
@@ -397,11 +396,11 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             ctx->logger->log("TestSharedResources Complete", ILogger::ELL_INFO);
         };
 
-        ASSERT_CUDA_SUCCESS(cu.pcuLaunchHostFunc(stream, cudaCallback, &ctx), cudaHandler);
-        ASSERT_CUDA_SUCCESS(cu.pcuStreamSynchronize(stream), cudaHandler);
+        NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, cudaCallback, &ctx), *cudaHandler);
+        NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream), *cudaHandler);
 
-        ASSERT_CUDA_SUCCESS(cu.pcuModuleUnload(module), cudaHandler);
-        ASSERT_CUDA_SUCCESS(cu.pcuStreamDestroy_v2(stream), cudaHandler);
+        NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuModuleUnload(module), *cudaHandler);
+        NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream), *cudaHandler);
     }
 
     void testDestruction()

From eb8f44a1b5ef38d1416a6fdc9a43e8e0215ec0bf Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Sun, 10 May 2026 19:38:51 +0200
Subject: [PATCH 25/47] Use native CUDA interop handles in EX76

---
 76_CudaInterop/main.cpp | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index f090a4500..fd05e4b79 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -8,7 +8,6 @@ using namespace core;
 using namespace system;
 using namespace asset;
 using namespace video;
-namespace cuda = nbl::video::cuda_native;
 
 /*
 The start of the main function starts like in most other example. We ask the
@@ -139,7 +138,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
 
             smart_refctd_ptr<ICPUBuffer> source = IAsset::castDown<ICPUBuffer>(assets[0]);
             std::string log;
-            auto compile = cuda::compileDirectlyToPTX(*cudaHandler, std::string((const char*)source->getPointer(), source->getSize()),
+            auto compile = cudaHandler->compileDirectlyToPTX(std::string((const char*)source->getPointer(), source->getSize()),
                 "app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), log, 0, 0, 0);
             ASSERT_NV_SUCCESS(compile.result, log);
 
@@ -256,11 +255,11 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         
         // Launch kernel
         {
-            cuda::SCUdeviceptr outputBufPtr;
+            CUdeviceptr outputBufPtr = 0;
             cudaOutputMemory->getMappedBuffer(outputBufPtr);
             CUdeviceptr ptrs[] = {
-              cuda::SCUdeviceptr(cudaInputMemories[0]->getDeviceptr()),
-              cuda::SCUdeviceptr(cudaInputMemories[1]->getDeviceptr()),
+              cudaInputMemories[0]->getDeviceptr(),
+              cudaInputMemories[1]->getDeviceptr(),
               outputBufPtr
             };
             auto numElements = &NumElements;
@@ -268,7 +267,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream), *cudaHandler);
             NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), BufferSize, stream), *cudaHandler);
     
-            CUexternalSemaphore semaphore = cuda::SCUexternalSemaphore(cudaSemaphore->getInternalObject());
+            CUexternalSemaphore semaphore = cudaSemaphore->getInternalObject();
             const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } };
             NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream), *cudaHandler); // Wait for release op from vulkan
             NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, stream, parameters, nullptr), *cudaHandler);

From 39441760d335467158a340ad366302235ba6c30e Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Sun, 10 May 2026 19:56:53 +0200
Subject: [PATCH 26/47] Pass CUDA handler pointer to assert macro

---
 76_CudaInterop/main.cpp | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index fd05e4b79..3026bf451 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -151,9 +151,9 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         CUfunction kernel;
         CUstream   stream;
 
-        NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr), *cudaHandler);
-        NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"), *cudaHandler);
-        NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), *cudaHandler);
+        NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr), cudaHandler);
+        NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"), cudaHandler);
+        NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), cudaHandler);
 
         // CPU memory which we fill with random numbers between [-1,1] that will be copied to corresponding cudaMemory
         std::array<smart_refctd_ptr<ICPUBuffer>, 2> cpuBufs;
@@ -264,17 +264,17 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             };
             auto numElements = &NumElements;
             void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numElements };
-            NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream), *cudaHandler);
-            NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), BufferSize, stream), *cudaHandler);
+            NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream), cudaHandler);
+            NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), BufferSize, stream), cudaHandler);
     
             CUexternalSemaphore semaphore = cudaSemaphore->getInternalObject();
             const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } };
-            NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream), *cudaHandler); // Wait for release op from vulkan
-            NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, stream, parameters, nullptr), *cudaHandler);
+            NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream), cudaHandler); // Wait for release op from vulkan
+            NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, stream, parameters, nullptr), cudaHandler);
             const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } };
-            NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream), *cudaHandler); // Signal the imported semaphore
+            NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream), cudaHandler); // Signal the imported semaphore
         }
-        NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream), *cudaHandler);
+        NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream), cudaHandler);
         
         // After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA
         {
@@ -395,11 +395,11 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             ctx->logger->log("TestSharedResources Complete", ILogger::ELL_INFO);
         };
 
-        NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, cudaCallback, &ctx), *cudaHandler);
-        NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream), *cudaHandler);
+        NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, cudaCallback, &ctx), cudaHandler);
+        NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream), cudaHandler);
 
-        NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuModuleUnload(module), *cudaHandler);
-        NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream), *cudaHandler);
+        NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuModuleUnload(module), cudaHandler);
+        NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream), cudaHandler);
     }
 
     void testDestruction()

From b4a8725d54ca960e0d2c353ef08d5f40aa4c4e04 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Mon, 11 May 2026 09:26:33 +0200
Subject: [PATCH 27/47] Polish CUDA interop example usage

---
 76_CudaInterop/main.cpp | 72 +++++++++++++++++++++++++++++++++--------
 1 file changed, 58 insertions(+), 14 deletions(-)

diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index 3026bf451..e2e326102 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -25,6 +25,7 @@ bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto lin
     return true;
 }
 
+#define ASSERT_SUCCESS(expr) NBL_CUDA_INTEROP_ASSERT_SUCCESS((expr), cudaHandler)
 #define ASSERT_NV_SUCCESS(expr, log) { auto re = check_nv_err((expr), cudaHandler, m_logger, __FILE__, __LINE__, log); assert(re); }
 
 
@@ -139,7 +140,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             smart_refctd_ptr<ICPUBuffer> source = IAsset::castDown<ICPUBuffer>(assets[0]);
             std::string log;
             auto compile = cudaHandler->compileDirectlyToPTX(std::string((const char*)source->getPointer(), source->getSize()),
-                "app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), log, 0, 0, 0);
+                "app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), &log, 0, 0, 0);
             ASSERT_NV_SUCCESS(compile.result, log);
 
             ptx = std::move(compile.ptx);
@@ -151,9 +152,9 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         CUfunction kernel;
         CUstream   stream;
 
-        NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr), cudaHandler);
-        NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"), cudaHandler);
-        NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), cudaHandler);
+        ASSERT_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr));
+        ASSERT_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"));
+        ASSERT_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));
 
         // CPU memory which we fill with random numbers between [-1,1] that will be copied to corresponding cudaMemory
         std::array<smart_refctd_ptr<ICPUBuffer>, 2> cpuBufs;
@@ -264,17 +265,17 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             };
             auto numElements = &NumElements;
             void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numElements };
-            NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream), cudaHandler);
-            NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), BufferSize, stream), cudaHandler);
+            ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream));
+            ASSERT_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), BufferSize, stream));
     
             CUexternalSemaphore semaphore = cudaSemaphore->getInternalObject();
             const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } };
-            NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream), cudaHandler); // Wait for release op from vulkan
-            NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, stream, parameters, nullptr), cudaHandler);
+            ASSERT_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream)); // Wait for release op from vulkan
+            ASSERT_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, stream, parameters, nullptr));
             const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } };
-            NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream), cudaHandler); // Signal the imported semaphore
+            ASSERT_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream)); // Signal the imported semaphore
         }
-        NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream), cudaHandler);
+        ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream));
         
         // After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA
         {
@@ -395,11 +396,11 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             ctx->logger->log("TestSharedResources Complete", ILogger::ELL_INFO);
         };
 
-        NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, cudaCallback, &ctx), cudaHandler);
-        NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream), cudaHandler);
+        ASSERT_SUCCESS(cu.pcuLaunchHostFunc(stream, cudaCallback, &ctx));
+        ASSERT_SUCCESS(cu.pcuStreamSynchronize(stream));
 
-        NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuModuleUnload(module), cudaHandler);
-        NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream), cudaHandler);
+        ASSERT_SUCCESS(cu.pcuModuleUnload(module));
+        ASSERT_SUCCESS(cu.pcuStreamDestroy_v2(stream));
     }
 
     void testDestruction()
@@ -478,6 +479,49 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             m_logger->log("Test Destruction complete", ILogger::ELL_INFO);
         }
     
+        // {
+        //     constexpr size_t M = 32;
+        //     auto staging = createStaging(size * M);
+        //
+        //     auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer();
+        //     for (uint32_t i = 0; i < (M * size) / 4; ++i)
+        //         ptr[i] = rand();
+        //
+        //     std::vector<smart_refctd_ptr<IGPUCommandBuffer>> cmd(1 << 10);
+        //     commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1 << 10, cmd.data());
+        //
+        //     for (size_t i = 0; i < 1 << 10; ++i)
+        //     {
+        //         IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = {
+        //             .size = size * M,
+        //             .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(),
+        //             .alignmentLog2 = 10,
+        //         };
+        //     RE:
+        //         auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory;
+        //
+        //         if (!memory)
+        //         {
+        //             m_device->waitIdle();
+        //             for (size_t j = 0; j < i; ++j)
+        //                 cmd[j] = 0;
+        //             goto END;
+        //         }
+        //         assert(memory);
+        //         auto tmpBuf = createExternalBuffer(memory.get());
+        //
+        //         cmd[i]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+        //         IGPUCommandBuffer::SBufferCopy region = { .size = size * M };
+        //         assert(cmd[i]->copyBuffer(staging.get(), tmpBuf.get(), 1, &region));
+        //         cmd[i]->end();
+        //         IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[i].get() };
+        //         IQueue::SSubmitInfo submitInfo = { .commandBuffers = {&cmdInfo, &cmdInfo + 1} };
+        //         assert(IQueue::RESULT::SUCCESS == queue->submit({ &submitInfo,&submitInfo + 1 }));
+        //     }
+        // END:
+        //     m_device->waitIdle();
+        // }
+
     }
 
     void testLargeAllocations()

From 39d02e26023c72a7d3241e5df85e9b7c4afacb84 Mon Sep 17 00:00:00 2001
From: Arkadiusz Lachowicz <areklachowicz@gmail.com>
Date: Mon, 11 May 2026 15:17:11 +0200
Subject: [PATCH 28/47] Fix path tracer allocation size access

---
 40_PathTracer/src/renderer/CRenderer.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/40_PathTracer/src/renderer/CRenderer.cpp b/40_PathTracer/src/renderer/CRenderer.cpp
index aa0a456ff..798cdb987 100644
--- a/40_PathTracer/src/renderer/CRenderer.cpp
+++ b/40_PathTracer/src/renderer/CRenderer.cpp
@@ -553,7 +553,7 @@ core::smart_refctd_ptr<CScene> CRenderer::createScene(CScene::SCreationParams&&
 				auto retval = device->allocate(info);
 				// map what is mappable by default so ReBAR checks succeed
 				if (retval.isValid() && retval.memory->isMappable())
-					retval.memory->map({.offset=0,.length=info.size});
+					retval.memory->map({.offset=0,.length=info.allocationSize});
 				return retval;
 			}
 
@@ -896,4 +896,4 @@ IQueue::SSubmitInfo::SSemaphoreInfo CRenderer::SSubmit::operator()(std::span<con
 	return rendered[0];
 }
 
-}
\ No newline at end of file
+}

From 951bc9949d93731ace3747d69bdf9f11dc493ab0 Mon Sep 17 00:00:00 2001
From: kevyuu <kyu.kevinyu@gmail.com>
Date: Tue, 12 May 2026 17:06:28 +0700
Subject: [PATCH 29/47] Initial implementation of testWmmaGemm test

---
 .../app_resources/wmmaGemm_b1_kernel.cu       |  53 ++
 .../app_resources/wmmaGemm_kernel.cu          | 107 +++
 76_CudaInterop/main.cpp                       | 654 ++++++++++++++++--
 3 files changed, 749 insertions(+), 65 deletions(-)
 create mode 100644 76_CudaInterop/app_resources/wmmaGemm_b1_kernel.cu
 create mode 100644 76_CudaInterop/app_resources/wmmaGemm_kernel.cu

diff --git a/76_CudaInterop/app_resources/wmmaGemm_b1_kernel.cu b/76_CudaInterop/app_resources/wmmaGemm_b1_kernel.cu
new file mode 100644
index 000000000..56d376fae
--- /dev/null
+++ b/76_CudaInterop/app_resources/wmmaGemm_b1_kernel.cu
@@ -0,0 +1,53 @@
+#include <mma.h>
+#include <cuda_runtime.h>
+
+using namespace nvcuda;
+
+// Define WMMA parameters
+const int WMMA_M = 8;
+const int WMMA_N = 8;
+const int WMMA_K = 128;
+
+// 1-bit GEMM on tensor cores: each warp computes one 8x8 int tile of c as
+// popc(a AND b) summed over K bits, consuming 128 bits per WMMA step.
+//   a: row-major bit matrix (K bits per row), b: column-major bit matrix
+//   (K bits per column), both packed into 32-bit words; c: row-major, N columns.
+// Assumes blockDim.x is a multiple of 32 and K a multiple of 128 - TODO confirm.
+extern "C" __global__ void b1_wmma_gemm_kernel(int* a, int* b, int* c, int M, int N, int K) {
+    // Leading dimensions, in elements (bits for a/b, ints for c)
+    int lda = K;
+    int ldb = K;
+    int ldc = N;
+
+    // One warp per output tile; (cRow, cCol) is the tile's top-left corner
+    int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
+    int warpN = (blockIdx.y * blockDim.y + threadIdx.y);
+    int cRow = warpM * WMMA_M;
+    int cCol = warpN * WMMA_N;
+
+    // Skip tiles that start outside c (same guard style as the fp16 wmmaGemm kernel);
+    // the test depends only on the warp's tile indices.
+    if (cRow >= M || cCol >= N) return;
+
+    // Fragments
+    wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, wmma::experimental::precision::b1, wmma::row_major> a_frag;
+    wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, wmma::experimental::precision::b1, wmma::col_major> b_frag;
+    wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, int> acc_frag;
+
+    // Initialize accumulator with zeros
+    wmma::fill_fragment(acc_frag, 0);
+
+    // Loop over the K-dimension, one 128-bit slab per iteration
+    for (int i = 0; i < K; i += WMMA_K) {
+        int kWord = i / 32; // slab offset in 32-bit words
+        // Row/column starts are converted from bits to 32-bit words
+        wmma::load_matrix_sync(a_frag, a + (cRow * lda / 32 + kWord), lda);
+        wmma::load_matrix_sync(b_frag, b + (cCol * ldb / 32 + kWord), ldb);
+
+        // AND + popcount MMA (bmmaBitOpAND), accumulated into acc_frag
+        wmma::bmma_sync(acc_frag, a_frag, b_frag, acc_frag, wmma::experimental::bmmaBitOpAND);
+    }
+
+    // Store the result
+    wmma::store_matrix_sync(c + (cRow * ldc + cCol), acc_frag, ldc, wmma::mem_row_major);
+}
diff --git a/76_CudaInterop/app_resources/wmmaGemm_kernel.cu b/76_CudaInterop/app_resources/wmmaGemm_kernel.cu
new file mode 100644
index 000000000..523590e8c
--- /dev/null
+++ b/76_CudaInterop/app_resources/wmmaGemm_kernel.cu
@@ -0,0 +1,107 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * CUDA Kernel Device code
+ *
+ * Computes the matrix product D = alpha * A * B + beta * C with WMMA,
+ * where each warp produces one 16x16 tile of D.
+ */
+
+
+// GPU configuration.
+
+#define WARP_SIZE 32
+
+// MMA matrix tile dimensions.
+
+#define M 16
+#define N 16
+#define K 16
+
+#define WMMA_M 16
+#define WMMA_N 16
+#define WMMA_K 16
+
+#include <cuda_fp16.h>
+#include <mma.h>
+
+using namespace nvcuda;
+
+// Warp-level tensor-core GEMM: d = alpha * (a * b) + beta * c.
+// Every warp produces one WMMA_M x WMMA_N tile of d, selected by (warpM, warpN).
+//   a           half matrix, loaded row-major with leading dimension k_ld
+//   b           half matrix, loaded column-major with leading dimension k_ld
+//   c, d        float matrices, row-major with leading dimension n_ld
+//   m_ld, n_ld  row / column extents used to reject out-of-range tiles
+//   k_ld        extent of the reduction dimension
+extern "C" __global__ void wmmaGemm(half *a, half *b, float *c, float *d, int m_ld, int n_ld, int k_ld, float alpha, float beta)
+{
+    // Strides of the packed, untransposed operands.
+    const int strideA = k_ld;
+    const int strideB = k_ld;
+    const int strideC = n_ld;
+
+    // Origin of this warp's tile within the 2D launch.
+    const int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / warpSize;
+    const int warpN = (blockIdx.y * blockDim.y + threadIdx.y);
+    const int tileRow = warpM * WMMA_M;
+    const int tileCol = warpN * WMMA_N;
+
+    // Operand tiles, the running product and the blended output tile.
+    wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> tileA;
+    wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, wmma::col_major> tileB;
+    wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, float> product;
+    wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, float> blended;
+
+    wmma::fill_fragment(product, 0.0f);
+
+    // A tile whose origin lies outside the matrix is neither read nor written.
+    if (tileRow >= m_ld || tileCol >= n_ld)
+        return;
+
+    // Walk the reduction dimension one fragment depth at a time.
+    for (int kBase = 0; kBase < k_ld; kBase += WMMA_K) {
+        half *srcA = a + kBase + tileRow * strideA;
+        half *srcB = b + kBase + tileCol * strideB;
+
+        wmma::load_matrix_sync(tileA, srcA, strideA);
+        wmma::load_matrix_sync(tileB, srcB, strideB);
+
+        // product += tileA * tileB
+        wmma::mma_sync(product, tileA, tileB, product);
+    }
+
+    // Fetch the matching tile of c and blend it with the scaled product.
+    const int tileOffset = tileCol + tileRow * strideC;
+    wmma::load_matrix_sync(blended, c + tileOffset, strideC, wmma::mem_row_major);
+
+    for (int e = 0; e < blended.num_elements; e++)
+        blended.x[e] = alpha * product.x[e] + beta * blended.x[e];
+
+    wmma::store_matrix_sync(d + tileOffset, blended, strideC, wmma::mem_row_major);
+}
diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index dfd214384..11a8768bf 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -11,23 +11,26 @@ using namespace system;
 using namespace asset;
 using namespace video;
 
+#define WARP_SIZE 32
+
+
 /*
 The start of the main function starts like in most other example. We ask the
 user for the desired renderer and start it up.
 */
 
-bool check_nv_err(auto err, auto& cudaHandler, auto& logger, auto file, auto line, std::string const& log)
+bool check_nv_err(auto err, auto& m_cuHandler, auto& logger, auto file, auto line, std::string const& log)
 {
     if (auto re = err; NVRTC_SUCCESS != re) 
     {
-        const char* str = cudaHandler->getNVRTCFunctionTable().pnvrtcGetErrorString(re); 
+        const char* str = m_cuHandler->getNVRTCFunctionTable().pnvrtcGetErrorString(re); 
         logger->log("%s:%d %s\n%s\n", system::ILogger::ELL_ERROR, file, line, str, log.c_str());
         return false;
     }
     return true;
 }
 
-#define ASSERT_NV_SUCCESS(expr, log) { auto re = check_nv_err((expr), cudaHandler, m_logger, __FILE__, __LINE__, log); assert(re); }
+#define ASSERT_NV_SUCCESS(expr, log) { auto re = check_nv_err((expr), m_cuHandler, m_logger, __FILE__, __LINE__, log); assert(re); }
 
 
 using namespace nbl::core;
@@ -49,8 +52,9 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
     CUDA2VKApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
         system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
 
-    smart_refctd_ptr<CCUDAHandler> cudaHandler;
-    smart_refctd_ptr<CCUDADevice> cudaDevice;
+    smart_refctd_ptr<CCUDAHandler> m_cuHandler;
+    smart_refctd_ptr<CCUDADevice> m_cuDevice;
+    smart_refctd_ptr<IUtilities> m_utils;
 
     IQueue* queue;
 
@@ -59,7 +63,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
     virtual void filterDevices(core::set<video::IPhysicalDevice*>& physicalDevices) const
     {
         device_base_t::filterDevices(physicalDevices);
-        auto& cuDevices = cudaHandler->getAvailableDevices();
+        auto& cuDevices = m_cuHandler->getAvailableDevices();
         std::erase_if(physicalDevices, [&cuDevices](auto pdev) {
             return cuDevices.end() == std::find_if(cuDevices.begin(), cuDevices.end(), [pdev](auto& cuDev) { return !memcmp(pdev->getProperties().deviceUUID, &cuDev.uuid, 16);  });
         });
@@ -71,23 +75,29 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system)))
             return false;
 
-        cudaHandler = CCUDAHandler::create(m_system.get(), smart_refctd_ptr<ILogger>(m_logger));
-        if (!cudaHandler) 
+        m_cuHandler = CCUDAHandler::create(m_system.get(), smart_refctd_ptr<ILogger>(m_logger));
+        if (!m_cuHandler) 
             return logFail("Could not create a CUDA handler!");
 
         if (!device_base_t::onAppInitialized(std::move(system)))
             return false;
 
-        cudaDevice = cudaHandler->createDevice(smart_refctd_ptr_dynamic_cast<CVulkanConnection>(m_api), m_physicalDevice);
-        if (!cudaDevice) 
+        m_utils = IUtilities::create(core::smart_refctd_ptr(m_device), core::smart_refctd_ptr<system::ILogger>(m_logger));
+        if (!m_utils)
+            return logFail("Could not create IUtilities!");
+
+        m_cuDevice = m_cuHandler->createDevice(smart_refctd_ptr_dynamic_cast<CVulkanConnection>(m_api), m_physicalDevice);
+        if (!m_cuDevice) 
             return logFail("Could not create a CUDA Device!");
 
 
         queue = getComputeQueue();
 
-        testVectorAddKernel();
-        testDestruction();
-        testLargeAllocations();
+        testWmmaGemB1();
+        // testWmmaGemm();
+        // testVectorAddKernel();
+        // testDestruction();
+        // testLargeAllocations();
 
         return true;
     }
@@ -120,42 +130,60 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         return buf;
     };
 
+    // Loads the CUDA source at `filepath` through the asset manager and compiles it to PTX; returns nullptr if nothing was loaded.
+    smart_refctd_ptr<ICPUBuffer> compilePtx(const char* filepath)
+    {
+        IAssetLoader::SAssetLoadParams lp = {};
+        lp.logger = m_logger.get();
+        lp.workingDirectory = ""; // virtual root
+        auto assetBundle = m_assetMgr->getAsset(filepath, lp);
+        const auto assets = assetBundle.getContents();
+        if (assets.empty())
+        {
+            logFail("Could not load kernel!");
+            return nullptr; // logFail returns to the caller; do not index an empty bundle
+        }
+        smart_refctd_ptr<ICPUBuffer> source = IAsset::castDown<ICPUBuffer>(assets[0]);
+        std::string log; // receives the compiler output, reported by ASSERT_NV_SUCCESS on failure
+        auto [ptx, res] = m_cuHandler->compileDirectlyToPTX(std::string((const char*)source->getPointer(), source->getSize()), filepath, m_cuDevice->geDefaultCompileOptions(), 0, 0, 0, &log);
+        ASSERT_NV_SUCCESS(res, log);
+        return ptx;
+    }
+
+    // Creates a Vulkan buffer of `size` bytes (passed through roundToGranularity), allocates memory for it with the CUDA external handle type and imports that memory into CUDA; the returned CUDA memory is null if the import failed.
+    std::tuple<smart_refctd_ptr<IGPUBuffer>, smart_refctd_ptr<CCUDAImportedMemory>> createSharedBuffer(uint32_t size)
+    {
+        IGPUBuffer::SCreationParams vkBufferParams;
+        vkBufferParams.size = m_cuDevice->roundToGranularity(CU_MEM_LOCATION_TYPE_DEVICE, size);
+        vkBufferParams.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT;
+        vkBufferParams.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE;
+        auto outputBuf = m_device->createBuffer(std::move(vkBufferParams));
+        auto outputMemReq = outputBuf->getMemoryReqs();
+        auto allocation = m_device->allocate(outputMemReq, outputBuf.get(), IDeviceMemoryAllocation::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE);
+        auto cudaOutputMemory = m_cuDevice->importExternalMemory(core::smart_refctd_ptr(allocation.memory));
+        if (!cudaOutputMemory)
+          logFail("Fail to import Vulkan Memory into CUDA!");
+        // The locals are non-const so that std::move really moves the smart pointers instead of copying them
+        return std::tuple(std::move(outputBuf), std::move(cudaOutputMemory));
+    }
+
     void testVectorAddKernel()
     {
         static constexpr uint32_t GridDim[3] = { 4096,1,1 };
-        static constexpr uint32_t BlockDim[3] = { 1024,1,1 };
+        static constexpr uint32_t BlockDim[3] = { 1,1,1 };
         static constexpr size_t NumElements = GridDim[0] * BlockDim[0];
         static constexpr size_t BufferSize = sizeof(float) * NumElements;
 
-        smart_refctd_ptr<ICPUBuffer> ptx;
-        {
-            IAssetLoader::SAssetLoadParams lp = {};
-            lp.logger = m_logger.get();
-            lp.workingDirectory = ""; // virtual root
-            // this time we load a shader directly from a file
-            auto assetBundle = m_assetMgr->getAsset("app_resources/vectorAdd_kernel.cu", lp);
-            const auto assets = assetBundle.getContents();
-            if (assets.empty())
-                logFail("Could not load kernel!");
-
-            smart_refctd_ptr<ICPUBuffer> source = IAsset::castDown<ICPUBuffer>(assets[0]);
-            std::string log;
-            auto [ptx_, res] = cudaHandler->compileDirectlyToPTX(std::string((const char*)source->getPointer(), source->getSize()), 
-                "app_resources/vectorAdd_kernel.cu", cudaDevice->geDefaultCompileOptions(), 0, 0, 0, &log);
-            ASSERT_NV_SUCCESS(res, log);
-
-            ptx = std::move(ptx_);
-        }
-
-        auto& cu = cudaHandler->getCUDAFunctionTable();
+        const auto ptx = compilePtx("app_resources/vectorAdd_kernel.cu");
+        auto& cu = m_cuHandler->getCUDAFunctionTable();
 
         CUmodule   module;
         CUfunction kernel;
         CUstream   stream;
 
-        ASSERT_CUDA_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr), cudaHandler);
-        ASSERT_CUDA_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"), cudaHandler);
-        ASSERT_CUDA_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), cudaHandler);
+        ASSERT_CUDA_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr), m_cuHandler);
+        ASSERT_CUDA_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"), m_cuHandler);
+        ASSERT_CUDA_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), m_cuHandler);
 
         // CPU memory which we fill with random numbers between [-1,1] that will be copied to corresponding cudaMemory
         std::array<smart_refctd_ptr<ICPUBuffer>, 2> cpuBufs;
@@ -182,28 +210,18 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         for (auto input_i = 0; input_i < InputCount; input_i++)
         {
           // create and allocate CUmem with CUDA and slap it inside a simple IReferenceCounted wrapper
-          cudaInputMemories[input_i] = cudaDevice->createExportableMemory({ .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE });
+          cudaInputMemories[input_i] = m_cuDevice->createExportableMemory({ .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE });
           vulkanMemories[input_i] = cudaInputMemories[input_i]->exportAsMemory(m_device.get(), nullptr);
           vulkanInputBuffers[input_i] = createExternalBuffer(vulkanMemories[input_i].get());
           inputStagingBuffers[input_i] = createStaging(BufferSize);
         }
 
-        IGPUBuffer::SCreationParams outputBufferParams;
-        outputBufferParams.size = cudaDevice->roundToGranularity(CU_MEM_LOCATION_TYPE_DEVICE, BufferSize);
-        outputBufferParams.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT;
-        outputBufferParams.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE;
-        const auto outputBuf = m_device->createBuffer(std::move(outputBufferParams));
-        auto outputMemReq = outputBuf->getMemoryReqs();
-
-        auto allocation = m_device->allocate(outputMemReq, outputBuf.get(), IDeviceMemoryAllocation::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE);
-        const auto cudaOutputMemory = cudaDevice->importExternalMemory(core::smart_refctd_ptr(allocation.memory));
-        if (!cudaOutputMemory)
-          logFail("Fail to import Vulkan Memory into CUDA!");
+        auto [outputBuf, cudaOutputMemory] = createSharedBuffer(BufferSize);
         
         ISemaphore::SCreationParams semParams;
         semParams.externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32;
         auto semaphore = m_device->createSemaphore(0, std::move(semParams));
-        const auto cudaSemaphore = cudaDevice->importExternalSemaphore(core::smart_refctd_ptr(semaphore));
+        const auto cudaSemaphore = m_cuDevice->importExternalSemaphore(core::smart_refctd_ptr(semaphore));
         if (!cudaSemaphore)
           logFail("Fail to import Vulkan Semaphore into CUDA!");
         
@@ -265,17 +283,16 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             };
             auto numElements = &NumElements;
             void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &numElements };
-            ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream), cudaHandler);
-            ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), BufferSize, stream), cudaHandler);
+            ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream), m_cuHandler);
+            ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), BufferSize, stream), m_cuHandler);
     
             auto semaphore = cudaSemaphore->getInternalObject();
             const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } };
-            ASSERT_CUDA_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream), cudaHandler); // Wait for release op from vulkan
-            ASSERT_CUDA_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, stream, parameters, nullptr), cudaHandler);
+            ASSERT_CUDA_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream), m_cuHandler); // Wait for release op from vulkan
+            ASSERT_CUDA_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, stream, parameters, nullptr), m_cuHandler);
             const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } };
-            ASSERT_CUDA_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream), cudaHandler); // Signal the imported semaphore
+            ASSERT_CUDA_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream), m_cuHandler); // Signal the imported semaphore
         }
-        ASSERT_CUDA_SUCCESS(cu.pcuStreamSynchronize(stream), cudaHandler);
         
         // After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA
         {
@@ -389,18 +406,526 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
               const auto output = outputs[elem_i];
               const auto expected = input1 + input2;
               const auto diff = abs(output - expected);
-              if (diff < 0.01)
+              if (diff > 0.01)
                 ctx->logger->log("TestSharedResources: Element at index %d is incorrect!", ILogger::ELL_ERROR, elem_i);
             }
 
             ctx->logger->log("TestSharedResources Complete", ILogger::ELL_INFO);
         };
 
-        ASSERT_CUDA_SUCCESS(cu.pcuLaunchHostFunc(stream, cudaCallback, &ctx), cudaHandler);
-        ASSERT_CUDA_SUCCESS(cu.pcuStreamSynchronize(stream), cudaHandler);
+        ASSERT_CUDA_SUCCESS(cu.pcuLaunchHostFunc(stream, cudaCallback, &ctx), m_cuHandler);
+        ASSERT_CUDA_SUCCESS(cu.pcuStreamSynchronize(stream), m_cuHandler);
 
-        ASSERT_CUDA_SUCCESS(cu.pcuModuleUnload(module), cudaHandler);
-        ASSERT_CUDA_SUCCESS(cu.pcuStreamDestroy_v2(stream), cudaHandler);
+        ASSERT_CUDA_SUCCESS(cu.pcuModuleUnload(module), m_cuHandler);
+        ASSERT_CUDA_SUCCESS(cu.pcuStreamDestroy_v2(stream), m_cuHandler);
+    }
+
+    void testWmmaGemm()
+    {
+        // x = M, y = N, z = K
+        constexpr auto WmmaSize = uint32_t3{ 16, 16, 16 };
+        constexpr auto TileCount = uint32_t3{ 64, 64, 64 };
+        constexpr auto ElementCount = WmmaSize * TileCount;
+        constexpr auto BlockDim = uint32_t2{ 128, 4 };
+        // TODO(kevin): Check if this calculation of GridDim correct. Currently we only handle square matrix. So, it doesn't matter
+        constexpr auto GridDim = uint32_t2(ElementCount.x / BlockDim.x, ElementCount.y / BlockDim.y);
+        const float Alpha = 1.1f;
+        const float Beta  = 1.2f;
+
+        const auto ptx = compilePtx("app_resources/wmmaGemm_kernel.cu");
+        auto& cu = m_cuHandler->getCUDAFunctionTable();
+
+        CUmodule   module;
+        CUfunction kernel;
+        CUstream   stream;
+
+        ASSERT_CUDA_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr), m_cuHandler);
+        ASSERT_CUDA_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "wmmaGemm"), m_cuHandler);
+        ASSERT_CUDA_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), m_cuHandler);
+
+        const auto elementsPerBlock = uint32_t2{ (WmmaSize.x * BlockDim.x) / WARP_SIZE, (WmmaSize.y * BlockDim.y) };
+        uint32_t2 gridDim = {
+            ElementCount.x + (elementsPerBlock.x - 1) / elementsPerBlock.x,
+            ElementCount.y + (elementsPerBlock.y - 1) / elementsPerBlock.y
+        };
+
+
+        auto [vkBufferMatA, cuMemMatA] = createSharedBuffer(sizeof(half) * ElementCount.x * ElementCount.z);
+        auto [vkBufferMatB, cuMemMatB] = createSharedBuffer(sizeof(half) * ElementCount.z * ElementCount.y);
+        auto [vkBufferMatC, cuMemMatC] = createSharedBuffer(sizeof(float) * ElementCount.x * ElementCount.y);
+        auto [vkBufferMatD, cuMemMatD] = createSharedBuffer(sizeof(float) * ElementCount.x * ElementCount.y);
+
+        core::vector<half> cpuMatA(ElementCount.x * ElementCount.z), cpuMatB(ElementCount.z * ElementCount.y);
+        core::vector<float> cpuMatC(ElementCount.x * ElementCount.y);
+
+        auto initCpuMatrix = [ElementCount](half* a, half* b, float* c)
+        {
+            for (int i = 0; i < ElementCount.x; i++) {
+                for (int j = 0; j < ElementCount.z; j++) {
+                    a[i * ElementCount.z + j] = (half)(rand() % 3);
+                }
+            }
+
+            for (int i = 0; i < ElementCount.y; i++) {
+                for (int j = 0; j < ElementCount.z; j++) {
+                    b[i * ElementCount.x + j] = (half)(rand() % 3);
+                }
+            }
+
+            for (int t = 0; t < ElementCount.x * ElementCount.y; t++) {
+                c[t] = static_cast<float>(rand() % 3);
+            }
+        };
+        initCpuMatrix(cpuMatA.data(), cpuMatB.data(), cpuMatC.data());
+
+
+        ISemaphore::SCreationParams semParams;
+        semParams.externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32;
+        auto semaphore = m_device->createSemaphore(0, std::move(semParams));
+        const auto cudaSemaphore = m_cuDevice->importExternalSemaphore(core::smart_refctd_ptr(semaphore));
+        if (!cudaSemaphore)
+          logFail("Fail to import Vulkan Semaphore into CUDA!");
+        
+        std::array<smart_refctd_ptr<IGPUCommandBuffer>, 2> cmd;
+        auto commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+        bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, cmd.size(), cmd.data(), smart_refctd_ptr(m_logger));
+
+        const auto outputStagingBuffer = createStaging(vkBufferMatD->getSize());
+
+        // First we record a release ownership transfer to let vulkan know that resources are going to be used in an external API
+        {
+            const IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> bufBarrier = {
+                .barrier = {
+                    .dep = {
+                        .srcStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS,
+                        .srcAccessMask = ACCESS_FLAGS::MEMORY_WRITE_BITS,
+                    },
+                    .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE,
+                    .otherQueueFamilyIndex = IQueue::FamilyExternal,
+                },
+                .range = {
+                  .offset = 0, 
+                  .size = vkBufferMatD->getSize(), 
+                  .buffer = vkBufferMatD, 
+                },
+            };
+    
+            // start recording
+            bool re = true;
+            re &= cmd[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+            re &= cmd[0]->pipelineBarrier(EDF_NONE, {
+              .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}
+            });
+            re &= cmd[0]->end();
+    
+            const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = {
+              .semaphore = semaphore.get(), 
+              .value = 1,
+              .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS,
+            };
+            const IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[0].get() };
+            const IQueue::SSubmitInfo submitInfo = {
+              .commandBuffers = {&cmdInfo, &cmdInfo + 1}, 
+              .signalSemaphores = {&signalInfo, &signalInfo + 1}
+            };
+            const auto submitRe = queue->submit({ &submitInfo, &submitInfo + 1 });
+            re &= IQueue::RESULT::SUCCESS == submitRe;
+            if (!re) logFail("Something went wrong readying resources for CUDA");
+        }
+
+        // Launch kernel
+        {
+            CUdeviceptr matrixAPtr, matrixBPtr, matrixCPtr, matrixDPtr;
+            cuMemMatA->getMappedBuffer(&matrixAPtr);
+            cuMemMatB->getMappedBuffer(&matrixBPtr);
+            cuMemMatC->getMappedBuffer(&matrixCPtr);
+            cuMemMatD->getMappedBuffer(&matrixDPtr);
+            CUdeviceptr ptrs[] = {
+                matrixAPtr,
+                matrixBPtr,
+                matrixCPtr,
+                matrixDPtr,
+            };
+            ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(matrixAPtr, cpuMatA.data(), cpuMatA.size() * sizeof(half), stream), m_cuHandler);
+            ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(matrixBPtr, cpuMatB.data(), cpuMatB.size() * sizeof(half), stream), m_cuHandler);
+            ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(matrixCPtr, cpuMatC.data(), cpuMatC.size() * sizeof(float), stream), m_cuHandler);
+
+            int m_ld = ElementCount.x; 
+            int n_ld = ElementCount.y;  
+            int k_ld = ElementCount.z; 
+            float alpha = Alpha; 
+            float beta = Beta;
+            void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &ptrs[3], &m_ld, &n_ld, &k_ld, &alpha, &beta };
+
+            auto semaphore = cudaSemaphore->getInternalObject();
+            const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } };
+            ASSERT_CUDA_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream), m_cuHandler); // Wait for release op from vulkan
+            ASSERT_CUDA_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim.x, GridDim.y, 1, BlockDim.x, BlockDim.y, 1, 0, stream, parameters, nullptr), m_cuHandler);
+            const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } };
+            ASSERT_CUDA_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream), m_cuHandler); // Signal the imported semaphore
+        }
+
+        // After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA
+        {
+            const IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> bufBarrier = {
+                .barrier = {
+                    .dep = {
+                        .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
+                        .dstAccessMask = ACCESS_FLAGS::TRANSFER_READ_BIT,
+                    },
+                    .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE,
+                    .otherQueueFamilyIndex = IQueue::FamilyExternal,
+                },
+                .range = { 
+                  .offset = 0,
+                  .size = vkBufferMatD->getSize(),
+                  .buffer = vkBufferMatD, 
+                },
+            };
+            bool re = true;
+            re &= cmd[1]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+            re &= cmd[1]->pipelineBarrier(EDF_NONE,
+            {
+              .bufBarriers = std::span{ &bufBarrier, &bufBarrier + 1 }
+            });
+            const auto region = IGPUCommandBuffer::SBufferCopy{ 
+              .srcOffset = 0,
+              .dstOffset = 0,
+              .size = vkBufferMatD->getSize() 
+            };
+            re &= cmd[1]->copyBuffer(vkBufferMatD.get(), outputStagingBuffer.get(), 1, &region);
+            re &= cmd[1]->end();
+            
+            const IQueue::SSubmitInfo::SSemaphoreInfo waitInfo= {
+              .semaphore = semaphore.get(), 
+              .value = 2,
+              .stageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
+            };
+            const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = {
+              .semaphore = semaphore.get(), 
+              .value = 3,
+              .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS,
+            };
+            const IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[1].get() };
+            const IQueue::SSubmitInfo submitInfo = { 
+                .waitSemaphores = { &waitInfo, &waitInfo + 1 },
+                .commandBuffers = { &cmdInfo, &cmdInfo + 1 },  
+                .signalSemaphores = { &signalInfo, &signalInfo + 1 } 
+            };
+            const auto submitRe = queue->submit({ &submitInfo, &submitInfo + 1 });
+            re &= IQueue::RESULT::SUCCESS == submitRe;
+            if (!re)
+                logFail("Something went wrong copying results from CUDA");
+        } 
+
+        auto matMultiplyOnHost = [&](
+            const half* A,
+            const half* B,
+            float* C)
+        {
+            const auto numARows = ElementCount.x;
+            const auto numAColumns = ElementCount.z;
+            const auto numBRows = ElementCount.z;
+            const auto numBColumns = ElementCount.y;
+            const auto numCRows = ElementCount.x;
+            const auto numCColumns = ElementCount.y;
+            for (int i = 0; i < numCRows; i++) {
+                for (int j = 0; j < numCColumns; j++) {
+                    float temp = 0.0;
+
+                    for (int k = 0; k < numAColumns; k++) {
+                        temp += (float)A[i * numAColumns + k] * (float)B[j * numBRows + k];
+                    }
+
+                    C[i * numCColumns + j] = temp * Alpha + Beta * C[i * numCColumns + j];
+                }
+            }
+        };
+        matMultiplyOnHost(cpuMatA.data(), cpuMatB.data(), cpuMatC.data());
+
+        struct CallbackContext
+        {
+            core::smart_refctd_ptr<ISemaphore> semaphore;
+            core::smart_refctd_ptr<IGPUBuffer> outputStagingBuffer;
+            core::smart_refctd_ptr<video::ILogicalDevice> device;
+            core::smart_refctd_ptr<system::ILogger> logger;
+            const float* expectedOutput;
+        };
+
+        CallbackContext ctx;
+        ctx.semaphore = semaphore;
+        ctx.outputStagingBuffer = outputStagingBuffer;
+        ctx.device = m_device;
+        ctx.logger = m_logger;
+        ctx.expectedOutput = cpuMatC.data();
+
+        auto cudaCallback = [](void* userData)
+        {
+            const auto* ctx = reinterpret_cast<CallbackContext*>(userData);
+
+            // Make sure we are also done with the readback 
+            const auto wait = std::array{
+              ISemaphore::SWaitInfo{
+                .semaphore = ctx->semaphore.get(), 
+                .value = 3,
+              }
+            };
+            ctx->device->blockForSemaphores(wait, true);
+
+            auto* stagingMem = ctx->outputStagingBuffer->getBoundMemory().memory;
+            if (!stagingMem->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
+            {
+                ILogicalDevice::MappedMemoryRange range(stagingMem, 0, stagingMem->getAllocationSize());
+                ctx->device->invalidateMappedMemoryRanges(1, &range);
+            }
+
+
+            const auto* outputs = reinterpret_cast<float*>(ctx->outputStagingBuffer->getBoundMemory().memory->getMappedPointer());
+
+            for (auto elem_i = 0; elem_i < ElementCount.x * ElementCount.y; elem_i++)
+            {
+              const auto output = outputs[elem_i];
+              const auto diff = abs(output - ctx->expectedOutput[elem_i]);
+              if (diff > 0.01)
+                ctx->logger->log("TestSharedResources: Element at index %d is incorrect!", ILogger::ELL_ERROR, elem_i);
+            }
+
+            ctx->logger->log("Test Wmma Gemm Complete", ILogger::ELL_INFO);
+        };
+
+        ASSERT_CUDA_SUCCESS(cu.pcuLaunchHostFunc(stream, cudaCallback, &ctx), m_cuHandler);
+        ASSERT_CUDA_SUCCESS(cu.pcuStreamSynchronize(stream), m_cuHandler);
+
+        ASSERT_CUDA_SUCCESS(cu.pcuModuleUnload(module), m_cuHandler);
+        ASSERT_CUDA_SUCCESS(cu.pcuStreamDestroy_v2(stream), m_cuHandler);
+    }
+
+    void testWmmaGemB1()
+    {
+        // b1 WMMA dimensions: M=8, N=8, K=128
+        //
+        // Runs the 1-bit WMMA GEMM kernel on buffers shared between Vulkan and CUDA:
+        //   1. Vulkan releases the result buffer to the external queue family and signals the semaphore to 1
+        //   2. CUDA uploads the inputs, waits for 1, launches the kernel and signals 2
+        //   3. Vulkan waits for 2, acquires the buffer, copies it into a staging buffer and signals 3
+        //   4. the host waits for 3 and compares the readback against the CPU reference
+        constexpr auto WmmaSize = uint32_t3{ 8, 8, 128 };
+        constexpr auto TileCount = uint32_t3{ 128, 128, 8 };  // Adjust for b1 dimensions
+        constexpr auto ElementCount = WmmaSize * TileCount; // M=1024, N=1024, K=1024
+        constexpr auto BlockDim = uint32_t2{ 32, 1 };       // 1 warp per block
+        constexpr auto GridDim = uint32_t2(
+            (ElementCount.x + WmmaSize.x - 1) / WmmaSize.x,  // M tiles
+            (ElementCount.y + WmmaSize.y - 1) / WmmaSize.y   // N tiles
+        );
+
+        const auto ptx = compilePtx("app_resources/wmmaGemm_b1_kernel.cu");
+        auto& cu = m_cuHandler->getCUDAFunctionTable();
+
+        CUmodule   module;
+        CUfunction kernel;
+        CUstream   stream;
+
+        ASSERT_CUDA_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr), m_cuHandler);
+        ASSERT_CUDA_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "b1_wmma_gemm_kernel"), m_cuHandler);
+        ASSERT_CUDA_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), m_cuHandler);
+
+        // Calculate buffer sizes (bits packed into uint32_t)
+        const size_t matA_size = (ElementCount.x * ElementCount.z) / 32 * sizeof(uint32_t); // M x K bits
+        const size_t matB_size = (ElementCount.z * ElementCount.y) / 32 * sizeof(uint32_t); // K x N bits
+        const size_t matC_size = ElementCount.x * ElementCount.y * sizeof(int32_t);         // M x N ints
+
+        auto [vkBufferMatA, cuMemMatA] = createSharedBuffer(matA_size);
+        auto [vkBufferMatB, cuMemMatB] = createSharedBuffer(matB_size);
+        auto [vkBufferMatC, cuMemMatC] = createSharedBuffer(matC_size);
+
+        // CPU matrices for initialization and verification
+        core::vector<uint32_t> cpuMatA(ElementCount.x * ElementCount.z / 32);
+        core::vector<uint32_t> cpuMatB(ElementCount.z * ElementCount.y / 32);
+        core::vector<int32_t> cpuMatC_expected(ElementCount.x * ElementCount.y);
+
+        // Initialize with simple patterns for verification
+        auto initBinaryMatrices = [&]()
+        {
+            // Fill cpuMatA with reverse diagonal pattern
+            std::fill(cpuMatA.begin(), cpuMatA.end(), 0);
+
+            for (int i = 0; i < ElementCount.x; i++)
+            {
+              auto j = ElementCount.z - 1 - i;
+              auto bitIdx = i * ElementCount.z + j;
+              auto wordIdx = bitIdx / 32;
+              auto bitOffset = bitIdx % 32;
+              cpuMatA[wordIdx] |= (1u << bitOffset);
+            }
+
+            // Fill cpuMatB with random bits
+            for (auto& val : cpuMatB) val = rand();
+
+            // Compute expected result: For bmma with bmmaBitOpAND
+            // C[i][j] = popcount(A[i,:] AND B[:,j])
+            for (int i = 0; i < ElementCount.x; i++) {
+                for (int j = 0; j < ElementCount.y; j++) {
+                    const int k = ElementCount.z - 1 - i;
+                    const int b_bit_idx = j * ElementCount.z + k; // col-major
+                    const int32_t bit = (cpuMatB[b_bit_idx / 32] >> (b_bit_idx % 32)) & 1;
+                    cpuMatC_expected[i * ElementCount.y + j] = bit;
+                }
+            }
+        };
+        initBinaryMatrices();
+
+        ISemaphore::SCreationParams semParams;
+        semParams.externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32;
+        auto semaphore = m_device->createSemaphore(0, std::move(semParams));
+        const auto cudaSemaphore = m_cuDevice->importExternalSemaphore(core::smart_refctd_ptr(semaphore));
+        if (!cudaSemaphore)
+        {
+            // Every later step dereferences the imported semaphore, so release the CUDA objects and bail out
+            logFail("Fail to import Vulkan Semaphore into CUDA!");
+            ASSERT_CUDA_SUCCESS(cu.pcuModuleUnload(module), m_cuHandler);
+            ASSERT_CUDA_SUCCESS(cu.pcuStreamDestroy_v2(stream), m_cuHandler);
+            return;
+        }
+
+        std::array<smart_refctd_ptr<IGPUCommandBuffer>, 2> cmd;
+        auto commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+        commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, cmd.size(), cmd.data(), smart_refctd_ptr(m_logger));
+
+        const auto outputStagingBuffer = createStaging(vkBufferMatC->getSize());
+
+        // Release ownership to CUDA
+        {
+            const IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> bufBarrier = {
+                .barrier = {
+                    .dep = {
+                        .srcStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS,
+                        .srcAccessMask = ACCESS_FLAGS::MEMORY_WRITE_BITS,
+                    },
+                    .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE,
+                    .otherQueueFamilyIndex = IQueue::FamilyExternal,
+                },
+                .range = { .offset = 0, .size = vkBufferMatC->getSize(), .buffer = vkBufferMatC },
+            };
+
+            bool re = true; // accumulates recording + submission results, same pattern as the other tests
+            re &= cmd[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+            re &= cmd[0]->pipelineBarrier(EDF_NONE, {.bufBarriers = std::span{&bufBarrier, &bufBarrier + 1}});
+            re &= cmd[0]->end();
+
+            const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = {
+              .semaphore = semaphore.get(), .value = 1, .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS,
+            };
+            const IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[0].get() };
+            const IQueue::SSubmitInfo submitInfo = {
+              .commandBuffers = {&cmdInfo, &cmdInfo + 1}, .signalSemaphores = {&signalInfo, &signalInfo + 1}
+            };
+            re &= IQueue::RESULT::SUCCESS == queue->submit({ &submitInfo, &submitInfo + 1 });
+            if (!re) logFail("Something went wrong readying resources for CUDA");
+        }
+
+        // Launch CUDA kernel
+        {
+            CUdeviceptr matrixAPtr, matrixBPtr, matrixCPtr;
+            cuMemMatA->getMappedBuffer(&matrixAPtr);
+            cuMemMatB->getMappedBuffer(&matrixBPtr);
+            cuMemMatC->getMappedBuffer(&matrixCPtr);
+
+            ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(matrixAPtr, cpuMatA.data(), matA_size, stream), m_cuHandler);
+            ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(matrixBPtr, cpuMatB.data(), matB_size, stream), m_cuHandler);
+            core::vector<int32_t> cpuMatC(ElementCount.x * ElementCount.y, 15);
+            ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(matrixCPtr, cpuMatC.data(), matC_size, stream), m_cuHandler);
+
+            void* parameters[] = { &matrixAPtr, &matrixBPtr, &matrixCPtr, 
+                                   (void*)&ElementCount.x, (void*)&ElementCount.y, (void*)&ElementCount.z };
+
+            auto semaphore_cu = cudaSemaphore->getInternalObject();
+            const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } };
+            ASSERT_CUDA_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore_cu, &waitParams, 1, stream), m_cuHandler);
+
+            ASSERT_CUDA_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim.x, GridDim.y, 1, 
+                                                   BlockDim.x, BlockDim.y, 1, 
+                                                   0, stream, parameters, nullptr), m_cuHandler);
+
+            const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } };
+            ASSERT_CUDA_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore_cu, &signalParams, 1, stream), m_cuHandler);
+        }
+
+        // Acquire ownership and copy results back
+        {
+            const IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> bufBarrier = {
+                .barrier = {
+                    .dep = {
+                        .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
+                        .dstAccessMask = ACCESS_FLAGS::TRANSFER_READ_BIT,
+                    },
+                    .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE,
+                    .otherQueueFamilyIndex = IQueue::FamilyExternal,
+                },
+                .range = { .offset = 0, .size = vkBufferMatC->getSize(), .buffer = vkBufferMatC },
+            };
+
+            bool re = true;
+            re &= cmd[1]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+            re &= cmd[1]->pipelineBarrier(EDF_NONE, {.bufBarriers = std::span{&bufBarrier, &bufBarrier + 1}});
+            const auto region = IGPUCommandBuffer::SBufferCopy{ .srcOffset = 0, .dstOffset = 0, .size = matC_size };
+            re &= cmd[1]->copyBuffer(vkBufferMatC.get(), outputStagingBuffer.get(), 1, &region);
+            re &= cmd[1]->end();
+
+            const IQueue::SSubmitInfo::SSemaphoreInfo waitInfo = {
+              .semaphore = semaphore.get(), .value = 2, .stageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
+            };
+            const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = {
+              .semaphore = semaphore.get(), .value = 3, .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS,
+            };
+            const IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[1].get() };
+            const IQueue::SSubmitInfo submitInfo = { 
+                .waitSemaphores = { &waitInfo, &waitInfo + 1 },
+                .commandBuffers = { &cmdInfo, &cmdInfo + 1 },  
+                .signalSemaphores = { &signalInfo, &signalInfo + 1 } 
+            };
+            re &= IQueue::RESULT::SUCCESS == queue->submit({ &submitInfo, &submitInfo + 1 });
+            if (!re) logFail("Something went wrong copying results from CUDA");
+        }
+
+        // Wait and verify results
+        const auto wait = std::array{ ISemaphore::SWaitInfo{.semaphore = semaphore.get(), .value = 3} };
+        m_device->blockForSemaphores(wait, true);
+
+        auto* stagingMem = outputStagingBuffer->getBoundMemory().memory;
+        if (!stagingMem->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
+        {
+            ILogicalDevice::MappedMemoryRange range(stagingMem, 0, stagingMem->getAllocationSize());
+            m_device->invalidateMappedMemoryRanges(1, &range);
+        }
+
+        const auto* results = reinterpret_cast<int32_t*>(stagingMem->getMappedPointer());
+
+        // Compare against the reference filled in by initBinaryMatrices(): A is a reverse diagonal, so
+        // C[row][col] must equal bit (K - 1 - row) of column `col` of the column-major matrix B.
+        constexpr int MaxLoggedErrors = 16;
+        const int resultCount = static_cast<int>(ElementCount.x * ElementCount.y);
+        int errors = 0;
+        for (int i = 0; i < resultCount; i++)
+        {
+            const auto expected = cpuMatC_expected[i];
+            const auto result = results[i];
+            if (result == expected)
+                continue;
+            // Only the first few mismatches are printed, a broken kernel would otherwise log one line per element
+            if (errors < MaxLoggedErrors)
+                m_logger->log("WMMA b1 test error at [%d]: GPU=%d, CPU=%d", system::ILogger::ELL_ERROR, i, result, expected);
+            errors++;
+        }
+
+        if (errors == 0)
+            m_logger->log("b1 WMMA test PASSED!", system::ILogger::ELL_INFO);
+        else
+            m_logger->log("b1 WMMA test FAILED with %d errors!", system::ILogger::ELL_ERROR, errors);
+
+        // Drain the stream before unloading the module the kernel was loaded from, then release both
+        ASSERT_CUDA_SUCCESS(cu.pcuStreamSynchronize(stream), m_cuHandler);
+        ASSERT_CUDA_SUCCESS(cu.pcuModuleUnload(module), m_cuHandler);
+        ASSERT_CUDA_SUCCESS(cu.pcuStreamDestroy_v2(stream), m_cuHandler);
     }
 
     void testDestruction()
@@ -408,10 +933,10 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         auto commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
         constexpr auto ElementCount = 1024;
         constexpr auto BufferSize = ElementCount * sizeof(int);
-        auto& cu = cudaHandler->getCUDAFunctionTable();
+        auto& cu = m_cuHandler->getCUDAFunctionTable();
         smart_refctd_ptr<IDeviceMemoryAllocation> escaped;
         {
-            const auto cudaMemory = cudaDevice->createExportableMemory({ .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE });
+            const auto cudaMemory = m_cuDevice->createExportableMemory({ .size = BufferSize, .alignment = sizeof(float), .location = CU_MEM_LOCATION_TYPE_DEVICE });
             if (!cudaMemory) logFail("Fail to create exportable memory!");
 
             escaped = cudaMemory->exportAsMemory(m_device.get());
@@ -500,7 +1025,6 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         }
     }
 
-
     // Whether to keep invoking the above. In this example because its headless GPU compute, we do all the work in the app initialization.
     bool keepRunning() override { return false; }
 

From 8e84dcdf277f77ce1a4f1804f1a4323a32451601 Mon Sep 17 00:00:00 2001
From: kevyuu <kyu.kevinyu@gmail.com>
Date: Tue, 12 May 2026 17:13:55 +0700
Subject: [PATCH 30/47] Remove test for WmmaGemm half

---
 .../app_resources/wmmaGemm_kernel.cu          | 107 -------
 76_CudaInterop/main.cpp                       | 281 ------------------
 2 files changed, 388 deletions(-)
 delete mode 100644 76_CudaInterop/app_resources/wmmaGemm_kernel.cu

diff --git a/76_CudaInterop/app_resources/wmmaGemm_kernel.cu b/76_CudaInterop/app_resources/wmmaGemm_kernel.cu
deleted file mode 100644
index 523590e8c..000000000
--- a/76_CudaInterop/app_resources/wmmaGemm_kernel.cu
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *  * Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  * Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *  * Neither the name of NVIDIA CORPORATION nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/**
- * CUDA Kernel Device code
- *
- * Computes the vector addition of A and B into C. The 3 vectors have the same
- * number of elements numElements.
- */
-
-
-// GPU configuration.
-
-#define WARP_SIZE 32
-
-// MMA matrix tile dimensions.
-
-#define M 16
-#define N 16
-#define K 16
-
-#define WMMA_M 16
-#define WMMA_N 16
-#define WMMA_K 16
-
-#include <cuda_fp16.h>
-#include <mma.h>
-
-using namespace nvcuda;
-
-extern "C" __global__ void wmmaGemm(half *a, half *b, float *c, float *d, int m_ld, int n_ld, int k_ld, float alpha, float beta)
-{
-    // Leading dimensions. Packed with no transpositions.
-    int lda = k_ld;
-    int ldb = k_ld;
-    int ldc = n_ld;
-
-    // Tile using a 2D grid
-    int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / warpSize;
-    int warpN = (blockIdx.y * blockDim.y + threadIdx.y);
-
-    // Declare the fragments
-    wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> a_frag;
-    wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, wmma::col_major> b_frag;
-    wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, float>              acc_frag;
-    wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, float>              c_frag;
-
-    wmma::fill_fragment(acc_frag, 0.0f);
-
-    // Loop over k
-    for (int i = 0; i < k_ld; i += WMMA_K) {
-        int aCol = i;
-        int aRow = warpM * WMMA_M;
-        int bCol = warpN * WMMA_N;
-        int bRow = i;
-
-        // Bounds checking
-        if (aRow < m_ld && aCol < k_ld && bRow < k_ld && bCol < n_ld) {
-            // Load the inputs
-            wmma::load_matrix_sync(a_frag, a + aCol + aRow * lda, lda);
-            wmma::load_matrix_sync(b_frag, b + bRow + bCol * ldb, ldb);
-
-            // Perform the matrix multiplication
-            wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag);
-        }
-    }
-
-    // Load in the current value of c, scale it by beta, and add this our result
-    // scaled by alpha
-    int cCol = warpN * WMMA_N;
-    int cRow = warpM * WMMA_M;
-
-    if (cRow < m_ld && cCol < n_ld) {
-        wmma::load_matrix_sync(c_frag, c + cCol + cRow * ldc, ldc, wmma::mem_row_major);
-
-        for (int i = 0; i < c_frag.num_elements; i++) {
-            c_frag.x[i] = alpha * acc_frag.x[i] + beta * c_frag.x[i];
-        }
-
-        // Store the output
-        wmma::store_matrix_sync(d + cCol + cRow * ldc, c_frag, ldc, wmma::mem_row_major);
-    }
-}
diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index 11a8768bf..4b7f532c7 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -94,7 +94,6 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         queue = getComputeQueue();
 
         testWmmaGemB1();
-        // testWmmaGemm();
         // testVectorAddKernel();
         // testDestruction();
         // testLargeAllocations();
@@ -420,286 +419,6 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         ASSERT_CUDA_SUCCESS(cu.pcuStreamDestroy_v2(stream), m_cuHandler);
     }
 
-    void testWmmaGemm()
-    {
-        // x = M, y = N, z = K
-        constexpr auto WmmaSize = uint32_t3{ 16, 16, 16 };
-        constexpr auto TileCount = uint32_t3{ 64, 64, 64 };
-        constexpr auto ElementCount = WmmaSize * TileCount;
-        constexpr auto BlockDim = uint32_t2{ 128, 4 };
-        // TODO(kevin): Check if this calculation of GridDim correct. Currently we only handle square matrix. So, it doesn't matter
-        constexpr auto GridDim = uint32_t2(ElementCount.x / BlockDim.x, ElementCount.y / BlockDim.y);
-        const float Alpha = 1.1f;
-        const float Beta  = 1.2f;
-
-        const auto ptx = compilePtx("app_resources/wmmaGemm_kernel.cu");
-        auto& cu = m_cuHandler->getCUDAFunctionTable();
-
-        CUmodule   module;
-        CUfunction kernel;
-        CUstream   stream;
-
-        ASSERT_CUDA_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr), m_cuHandler);
-        ASSERT_CUDA_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "wmmaGemm"), m_cuHandler);
-        ASSERT_CUDA_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), m_cuHandler);
-
-        const auto elementsPerBlock = uint32_t2{ (WmmaSize.x * BlockDim.x) / WARP_SIZE, (WmmaSize.y * BlockDim.y) };
-        uint32_t2 gridDim = {
-            ElementCount.x + (elementsPerBlock.x - 1) / elementsPerBlock.x,
-            ElementCount.y + (elementsPerBlock.y - 1) / elementsPerBlock.y
-        };
-
-
-        auto [vkBufferMatA, cuMemMatA] = createSharedBuffer(sizeof(half) * ElementCount.x * ElementCount.z);
-        auto [vkBufferMatB, cuMemMatB] = createSharedBuffer(sizeof(half) * ElementCount.z * ElementCount.y);
-        auto [vkBufferMatC, cuMemMatC] = createSharedBuffer(sizeof(float) * ElementCount.x * ElementCount.y);
-        auto [vkBufferMatD, cuMemMatD] = createSharedBuffer(sizeof(float) * ElementCount.x * ElementCount.y);
-
-        core::vector<half> cpuMatA(ElementCount.x * ElementCount.z), cpuMatB(ElementCount.z * ElementCount.y);
-        core::vector<float> cpuMatC(ElementCount.x * ElementCount.y);
-
-        auto initCpuMatrix = [ElementCount](half* a, half* b, float* c)
-        {
-            for (int i = 0; i < ElementCount.x; i++) {
-                for (int j = 0; j < ElementCount.z; j++) {
-                    a[i * ElementCount.z + j] = (half)(rand() % 3);
-                }
-            }
-
-            for (int i = 0; i < ElementCount.y; i++) {
-                for (int j = 0; j < ElementCount.z; j++) {
-                    b[i * ElementCount.x + j] = (half)(rand() % 3);
-                }
-            }
-
-            for (int t = 0; t < ElementCount.x * ElementCount.y; t++) {
-                c[t] = static_cast<float>(rand() % 3);
-            }
-        };
-        initCpuMatrix(cpuMatA.data(), cpuMatB.data(), cpuMatC.data());
-
-
-        ISemaphore::SCreationParams semParams;
-        semParams.externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32;
-        auto semaphore = m_device->createSemaphore(0, std::move(semParams));
-        const auto cudaSemaphore = m_cuDevice->importExternalSemaphore(core::smart_refctd_ptr(semaphore));
-        if (!cudaSemaphore)
-          logFail("Fail to import Vulkan Semaphore into CUDA!");
-        
-        std::array<smart_refctd_ptr<IGPUCommandBuffer>, 2> cmd;
-        auto commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
-        bool re = commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, cmd.size(), cmd.data(), smart_refctd_ptr(m_logger));
-
-        const auto outputStagingBuffer = createStaging(vkBufferMatD->getSize());
-
-        // First we record a release ownership transfer to let vulkan know that resources are going to be used in an external API
-        {
-            const IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> bufBarrier = {
-                .barrier = {
-                    .dep = {
-                        .srcStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS,
-                        .srcAccessMask = ACCESS_FLAGS::MEMORY_WRITE_BITS,
-                    },
-                    .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE,
-                    .otherQueueFamilyIndex = IQueue::FamilyExternal,
-                },
-                .range = {
-                  .offset = 0, 
-                  .size = vkBufferMatD->getSize(), 
-                  .buffer = vkBufferMatD, 
-                },
-            };
-    
-            // start recording
-            bool re = true;
-            re &= cmd[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-            re &= cmd[0]->pipelineBarrier(EDF_NONE, {
-              .bufBarriers = std::span{&bufBarrier,&bufBarrier + 1}
-            });
-            re &= cmd[0]->end();
-    
-            const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = {
-              .semaphore = semaphore.get(), 
-              .value = 1,
-              .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS,
-            };
-            const IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[0].get() };
-            const IQueue::SSubmitInfo submitInfo = {
-              .commandBuffers = {&cmdInfo, &cmdInfo + 1}, 
-              .signalSemaphores = {&signalInfo, &signalInfo + 1}
-            };
-            const auto submitRe = queue->submit({ &submitInfo, &submitInfo + 1 });
-            re &= IQueue::RESULT::SUCCESS == submitRe;
-            if (!re) logFail("Something went wrong readying resources for CUDA");
-        }
-
-        // Launch kernel
-        {
-            CUdeviceptr matrixAPtr, matrixBPtr, matrixCPtr, matrixDPtr;
-            cuMemMatA->getMappedBuffer(&matrixAPtr);
-            cuMemMatB->getMappedBuffer(&matrixBPtr);
-            cuMemMatC->getMappedBuffer(&matrixCPtr);
-            cuMemMatD->getMappedBuffer(&matrixDPtr);
-            CUdeviceptr ptrs[] = {
-                matrixAPtr,
-                matrixBPtr,
-                matrixCPtr,
-                matrixDPtr,
-            };
-            ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(matrixAPtr, cpuMatA.data(), cpuMatA.size() * sizeof(half), stream), m_cuHandler);
-            ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(matrixBPtr, cpuMatB.data(), cpuMatB.size() * sizeof(half), stream), m_cuHandler);
-            ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(matrixCPtr, cpuMatC.data(), cpuMatC.size() * sizeof(float), stream), m_cuHandler);
-
-            int m_ld = ElementCount.x; 
-            int n_ld = ElementCount.y;  
-            int k_ld = ElementCount.z; 
-            float alpha = Alpha; 
-            float beta = Beta;
-            void* parameters[] = { &ptrs[0], &ptrs[1], &ptrs[2], &ptrs[3], &m_ld, &n_ld, &k_ld, &alpha, &beta };
-
-            auto semaphore = cudaSemaphore->getInternalObject();
-            const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } };
-            ASSERT_CUDA_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream), m_cuHandler); // Wait for release op from vulkan
-            ASSERT_CUDA_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim.x, GridDim.y, 1, BlockDim.x, BlockDim.y, 1, 0, stream, parameters, nullptr), m_cuHandler);
-            const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } };
-            ASSERT_CUDA_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream), m_cuHandler); // Signal the imported semaphore
-        }
-
-        // After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA
-        {
-            const IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> bufBarrier = {
-                .barrier = {
-                    .dep = {
-                        .dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
-                        .dstAccessMask = ACCESS_FLAGS::TRANSFER_READ_BIT,
-                    },
-                    .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE,
-                    .otherQueueFamilyIndex = IQueue::FamilyExternal,
-                },
-                .range = { 
-                  .offset = 0,
-                  .size = vkBufferMatD->getSize(),
-                  .buffer = vkBufferMatD, 
-                },
-            };
-            bool re = true;
-            re &= cmd[1]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-            re &= cmd[1]->pipelineBarrier(EDF_NONE,
-            {
-              .bufBarriers = std::span{ &bufBarrier, &bufBarrier + 1 }
-            });
-            const auto region = IGPUCommandBuffer::SBufferCopy{ 
-              .srcOffset = 0,
-              .dstOffset = 0,
-              .size = vkBufferMatD->getSize() 
-            };
-            re &= cmd[1]->copyBuffer(vkBufferMatD.get(), outputStagingBuffer.get(), 1, &region);
-            re &= cmd[1]->end();
-            
-            const IQueue::SSubmitInfo::SSemaphoreInfo waitInfo= {
-              .semaphore = semaphore.get(), 
-              .value = 2,
-              .stageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
-            };
-            const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = {
-              .semaphore = semaphore.get(), 
-              .value = 3,
-              .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS,
-            };
-            const IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[1].get() };
-            const IQueue::SSubmitInfo submitInfo = { 
-                .waitSemaphores = { &waitInfo, &waitInfo + 1 },
-                .commandBuffers = { &cmdInfo, &cmdInfo + 1 },  
-                .signalSemaphores = { &signalInfo, &signalInfo + 1 } 
-            };
-            const auto submitRe = queue->submit({ &submitInfo, &submitInfo + 1 });
-            re &= IQueue::RESULT::SUCCESS == submitRe;
-            if (!re)
-                logFail("Something went wrong copying results from CUDA");
-        } 
-
-        auto matMultiplyOnHost = [&](
-            const half* A,
-            const half* B,
-            float* C)
-        {
-            const auto numARows = ElementCount.x;
-            const auto numAColumns = ElementCount.z;
-            const auto numBRows = ElementCount.z;
-            const auto numBColumns = ElementCount.y;
-            const auto numCRows = ElementCount.x;
-            const auto numCColumns = ElementCount.y;
-            for (int i = 0; i < numCRows; i++) {
-                for (int j = 0; j < numCColumns; j++) {
-                    float temp = 0.0;
-
-                    for (int k = 0; k < numAColumns; k++) {
-                        temp += (float)A[i * numAColumns + k] * (float)B[j * numBRows + k];
-                    }
-
-                    C[i * numCColumns + j] = temp * Alpha + Beta * C[i * numCColumns + j];
-                }
-            }
-        };
-        matMultiplyOnHost(cpuMatA.data(), cpuMatB.data(), cpuMatC.data());
-
-        struct CallbackContext
-        {
-            core::smart_refctd_ptr<ISemaphore> semaphore;
-            core::smart_refctd_ptr<IGPUBuffer> outputStagingBuffer;
-            core::smart_refctd_ptr<video::ILogicalDevice> device;
-            core::smart_refctd_ptr<system::ILogger> logger;
-            const float* expectedOutput;
-        };
-
-        CallbackContext ctx;
-        ctx.semaphore = semaphore;
-        ctx.outputStagingBuffer = outputStagingBuffer;
-        ctx.device = m_device;
-        ctx.logger = m_logger;
-        ctx.expectedOutput = cpuMatC.data();
-
-        auto cudaCallback = [](void* userData)
-        {
-            const auto* ctx = reinterpret_cast<CallbackContext*>(userData);
-
-            // Make sure we are also done with the readback 
-            const auto wait = std::array{
-              ISemaphore::SWaitInfo{
-                .semaphore = ctx->semaphore.get(), 
-                .value = 3,
-              }
-            };
-            ctx->device->blockForSemaphores(wait, true);
-
-            auto* stagingMem = ctx->outputStagingBuffer->getBoundMemory().memory;
-            if (!stagingMem->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
-            {
-                ILogicalDevice::MappedMemoryRange range(stagingMem, 0, stagingMem->getAllocationSize());
-                ctx->device->invalidateMappedMemoryRanges(1, &range);
-            }
-
-
-            const auto* outputs = reinterpret_cast<float*>(ctx->outputStagingBuffer->getBoundMemory().memory->getMappedPointer());
-
-            for (auto elem_i = 0; elem_i < ElementCount.x * ElementCount.y; elem_i++)
-            {
-              const auto output = outputs[elem_i];
-              const auto diff = abs(output - ctx->expectedOutput[elem_i]);
-              if (diff > 0.01)
-                ctx->logger->log("TestSharedResources: Element at index %d is incorrect!", ILogger::ELL_ERROR, elem_i);
-            }
-
-            ctx->logger->log("Test Wmma Gemm Complete", ILogger::ELL_INFO);
-        };
-
-        ASSERT_CUDA_SUCCESS(cu.pcuLaunchHostFunc(stream, cudaCallback, &ctx), m_cuHandler);
-        ASSERT_CUDA_SUCCESS(cu.pcuStreamSynchronize(stream), m_cuHandler);
-
-        ASSERT_CUDA_SUCCESS(cu.pcuModuleUnload(module), m_cuHandler);
-        ASSERT_CUDA_SUCCESS(cu.pcuStreamDestroy_v2(stream), m_cuHandler);
-    }
-
     void testWmmaGemB1()
     {
         // b1 WMMA dimensions: M=8, N=8, K=128

From 96b8b3ec938b03672981cf0fc70494f971bb1a2f Mon Sep 17 00:00:00 2001
From: kevyuu <kyu.kevinyu@gmail.com>
Date: Tue, 19 May 2026 23:52:50 +0700
Subject: [PATCH 31/47] Update test to follow the update on vk_cuda_interop
 main branch

---
 76_CudaInterop/main.cpp | 88 ++++++++++++++++++++---------------------
 1 file changed, 43 insertions(+), 45 deletions(-)

diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index 087b7d181..516a6fe8b 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -93,13 +93,23 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         queue = getComputeQueue();
 
         testWmmaGemB1();
-        // testVectorAddKernel();
-        // testDestruction();
-        // testLargeAllocations();
+        testVectorAddKernel();
+        testDestruction();
+        testLargeAllocations();
 
         return true;
     }
 
+    // Creates a transfer src/dst buffer of `size` bytes declaring `externalHandleTypes`; no memory is allocated or bound here.
+    smart_refctd_ptr<IGPUBuffer> createExternalBuffer2(uint64_t size, core::bitflag<IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE> externalHandleTypes)
+    {
+        IGPUBuffer::SCreationParams params = {};
+        params.size = size;
+        params.usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT;
+        params.externalHandleTypes = externalHandleTypes;
+        return m_device->createBuffer(std::move(params));
+    }
+
     smart_refctd_ptr<IGPUBuffer> createExternalBuffer(IDeviceMemoryAllocation* mem)
     {
         IGPUBuffer::SCreationParams params = {};
@@ -119,7 +129,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         req.memoryTypeBits &= m_device->getPhysicalDevice()->getDownStreamingMemoryTypeBits()
                             & m_device->getPhysicalDevice()->getHostVisibleMemoryTypeBits()
                             & m_device->getPhysicalDevice()->getMemoryTypeBitsFromMemoryTypeFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT);
-        auto allocation = m_device->allocate(req, buf.get());
+        auto allocation = m_device->allocate(req, { buf.get() });
     
         void* mapping = allocation.memory->map(IDeviceMemoryAllocation::MemoryRange(0, req.size), IDeviceMemoryAllocation::EMCAF_READ);
         if (!mapping)
@@ -153,11 +163,11 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         IGPUBuffer::SCreationParams vkBufferParams;
         vkBufferParams.size = m_cuDevice->roundToGranularity(CU_MEM_LOCATION_TYPE_DEVICE, size);
         vkBufferParams.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT;
-        vkBufferParams.externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE;
+        vkBufferParams.externalHandleTypes = CCUDADevice::ExternalMemoryHandleType;
         const auto outputBuf = m_device->createBuffer(std::move(vkBufferParams));
         auto outputMemReq = outputBuf->getMemoryReqs();
 
-        auto allocation = m_device->allocate(outputMemReq, outputBuf.get(), IDeviceMemoryAllocation::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE);
+        auto allocation = m_device->allocate(outputMemReq, { outputBuf.get(), IDeviceMemoryAllocation::EMAF_NONE, CCUDADevice::ExternalMemoryHandleType });
         const auto cudaOutputMemory = m_cuDevice->importExternalMemory(core::smart_refctd_ptr(allocation.memory));
         if (!cudaOutputMemory)
           logFail("Fail to import Vulkan Memory into CUDA!");
@@ -217,8 +227,9 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         auto [outputBuf, cudaOutputMemory] = createSharedBuffer(BufferSize);
         
         ISemaphore::SCreationParams semParams;
+        semParams.initialValue = 0;
         semParams.externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32;
-        auto semaphore = m_device->createSemaphore(0, std::move(semParams));
+        auto semaphore = m_device->createSemaphore(std::move(semParams));
         const auto cudaSemaphore = m_cuDevice->importExternalSemaphore(core::smart_refctd_ptr(semaphore));
         if (!cudaSemaphore)
           logFail("Fail to import Vulkan Semaphore into CUDA!");
@@ -387,20 +398,11 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             const auto* inputs2 = reinterpret_cast<float*>(ctx->cpuBuffers[1]->getPointer());
 
             const auto* outputs = reinterpret_cast<float*>(ctx->outputStagingBuffer->getBoundMemory().memory->getMappedPointer());
-            const auto* inputsInStaging1 = reinterpret_cast<float*>(ctx->inputStagingBuffers[0]->getBoundMemory().memory->getMappedPointer());
-            const auto* inputsInStaging2 = reinterpret_cast<float*>(ctx->inputStagingBuffers[1]->getBoundMemory().memory->getMappedPointer());
 
             for (auto elem_i = 0; elem_i < NumElements; elem_i++)
             {
               const auto input1 = inputs1[elem_i];
               const auto input2 = inputs2[elem_i];
-              const auto inputInStaging1 = inputsInStaging1[elem_i];
-              const auto inputInStaging2 = inputsInStaging2[elem_i];
-              if (inputInStaging1 != input1)
-                ctx->logger->log("Input1 in Staging %d is incorrect!", ILogger::ELL_ERROR, elem_i);
-              if (inputInStaging2 != input2)
-                ctx->logger->log("Input2 in Staging %d is incorrect!", ILogger::ELL_ERROR, elem_i);
-
               const auto output = outputs[elem_i];
               const auto expected = input1 + input2;
               const auto diff = abs(output - expected);
@@ -450,10 +452,23 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         auto [vkBufferMatB, cuMemMatB] = createSharedBuffer(matB_size);
         auto [vkBufferMatC, cuMemMatC] = createSharedBuffer(matC_size);
 
+        // ICPUBuffer::SCreationParams cpuBufferParamsA;
+        // cpuBufferParamsA.size = ElementCount.x * ElementCount.z / 32;
+        // const auto cpuBufferA = ICPUBuffer::create(std::move(cpuBufferParamsA));
+
+        // ICPUBuffer::SCreationParams cpuBufferParamsB;
+        // cpuBufferParamsB.size = ElementCount.x * ElementCount.z / 32;
+        // const auto cpuBufferB = ICPUBuffer::create(std::move(cpuBufferParamsB));
+        //
+        // std::array inputBuffers = {cpuBufferA.get(), cpuBufferB.get()};
+        //
+        // CAssetConverter::SInputs inputs = {};
+        // std::get<CAssetConverter::SInputs::asset_span_t<ICPUBuffer>>(inputs.assets) = inputBuffers;
+
         // CPU matrices for initialization and verification
         core::vector<uint32_t> cpuMatA(ElementCount.x * ElementCount.z / 32);
         core::vector<uint32_t> cpuMatB(ElementCount.z * ElementCount.y / 32);
-        core::vector<int32_t> cpuMatC_expected(ElementCount.x * ElementCount.y);
+
 
         // Initialize with simple patterns for verification
         auto initBinaryMatrices = [&]()
@@ -473,22 +488,13 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             // Fill cpuMatB with random bits
             for (auto& val : cpuMatB) val = rand();
             
-            // Compute expected result: For bmma with bmmaBitOpAND
-            // C[i][j] = popcount(A[i,:] AND B[:,j])
-            for (int i = 0; i < ElementCount.x; i++) {
-                for (int j = 0; j < ElementCount.y; j++) {
-                    const int k = ElementCount.z - 1 - i;
-                    const int b_bit_idx = j * ElementCount.z + k; // col-major
-                    const int32_t bit = (cpuMatB[b_bit_idx / 32] >> (b_bit_idx % 32)) & 1;
-                    cpuMatC_expected[i * ElementCount.y + j] = bit;
-                }
-            }
         };
         initBinaryMatrices();
   
         ISemaphore::SCreationParams semParams;
         semParams.externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32;
-        auto semaphore = m_device->createSemaphore(0, std::move(semParams));
+        semParams.initialValue = 0;
+        auto semaphore = m_device->createSemaphore(std::move(semParams));
         const auto cudaSemaphore = m_cuDevice->importExternalSemaphore(core::smart_refctd_ptr(semaphore));
         if (!cudaSemaphore)
           logFail("Fail to import Vulkan Semaphore into CUDA!");
@@ -619,18 +625,6 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
                 const auto expectedBitOffset = expectedIdx % 32;
                 return (cpuMatB[expectedWordIdx] >> expectedBitOffset) & uint32_t(1);
             }();
-
-            // const auto expected = [&]
-            // {
-            //     const auto row = i / ElementCount.y;            // row-major
-            //     const auto col = i % ElementCount.y;
-            //     const auto k   = ElementCount.z - 1 - row;      // reverse-diagonal A
-            //     const auto bIdx = col * ElementCount.z + k;     // col-major B
-            //     return (cpuMatB[bIdx / 32] >> (bIdx % 32)) & uint32_t(1);
-            // }();
-
-            // const auto expected = cpuMatC_expected[i];
-
             const auto result = results[i];
             if (result != expected) {
                 m_logger->log("WMMA b1 test error at [%d]: GPU=%d, CPU=%d", 
@@ -657,17 +651,19 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             const auto cudaMemory = m_cuDevice->createExportableMemory({ .size = BufferSize, .alignment = sizeof(float), .locationType = CU_MEM_LOCATION_TYPE_DEVICE });
             if (!cudaMemory) logFail("Fail to create exportable memory!");
 
-            escaped = cudaMemory->exportAsMemory(m_device.get());
+            auto tmpBuf = createExternalBuffer2(cudaMemory->getCreationParams().granularSize, CCUDADevice::ExternalMemoryHandleType); // same handle type the other CUDA-shared buffers/allocations in this file declare
+            escaped = cudaMemory->exportAsMemory(m_device.get(), tmpBuf.get());
             if (!escaped) logFail("Fail to export CUDA memory!");
         
-            auto tmpBuf = createExternalBuffer(escaped.get());
             auto staging = createStaging(BufferSize);
         
             auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer();
             for (uint32_t i = 0; i < ElementCount; ++i)
                 ptr[i] = i;
         
-            const auto semaphore = m_device->createSemaphore(0);
+            ISemaphore::SCreationParams semParams;
+            semParams.initialValue = 0;
+            const auto semaphore = m_device->createSemaphore(std::move(semParams));
             IQueue::SSubmitInfo::SSemaphoreInfo semInfo;
             semInfo.semaphore = semaphore.get();
             semInfo.stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS;
@@ -693,7 +689,9 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             auto tmpBuf = createExternalBuffer(escaped.get());
             auto staging = createStaging(BufferSize);
         
-            const auto semaphore = m_device->createSemaphore(0);
+            ISemaphore::SCreationParams semParams;
+            semParams.initialValue = 0;
+            const auto semaphore = m_device->createSemaphore(std::move(semParams));
             IQueue::SSubmitInfo::SSemaphoreInfo semInfo;
             semInfo.semaphore = semaphore.get();
             semInfo.stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS;
@@ -737,7 +735,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
     
         for (size_t i = 0; i < (1 << 8); ++i)
         {
-            auto memory = m_device->allocate(reqs, 0, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE).memory;
+            auto memory = m_device->allocate(reqs, { nullptr, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::ExternalMemoryHandleType }).memory;
             assert(memory);
             auto tmpBuf = createExternalBuffer(memory.get());
         }

From 58f20e55ac6653f2a2b64a966358e61fcde5a660 Mon Sep 17 00:00:00 2001
From: kevyuu <kyu.kevinyu@gmail.com>
Date: Fri, 22 May 2026 14:51:33 +0700
Subject: [PATCH 32/47] Fix to parameter passing when calling allocate

---
 71_RayTracingPipeline/main.cpp                                | 4 ++--
 .../nbl/examples/common/CSwapchainFramebuffersAndDepth.hpp    | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/71_RayTracingPipeline/main.cpp b/71_RayTracingPipeline/main.cpp
index c74ab6686..d46894954 100644
--- a/71_RayTracingPipeline/main.cpp
+++ b/71_RayTracingPipeline/main.cpp
@@ -216,7 +216,7 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public Bui
 			}
 			});
 
-		if (!m_hdrImage || !m_device->allocate(m_hdrImage->getMemoryReqs(), m_hdrImage.get()).isValid())
+		if (!m_hdrImage || !m_device->allocate(m_hdrImage->getMemoryReqs(), { m_hdrImage.get() }).isValid())
 			return logFail("Could not create HDR Image");
 
 		m_hdrImageView = m_device->createImageView({
@@ -1353,7 +1353,7 @@ class RaytracingPipelineApp final : public SimpleWindowedApplication, public Bui
 				auto reqs = scratchBuffer->getMemoryReqs();
 				reqs.memoryTypeBits &= m_physicalDevice->getDirectVRAMAccessMemoryTypeBits();
 
-				auto allocation = m_device->allocate(reqs, scratchBuffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
+				auto allocation = m_device->allocate(reqs, { scratchBuffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT });
 				allocation.memory->map({ .offset = 0,.length = reqs.size });
 
 				scratchAlloc = make_smart_refctd_ptr<CAssetConverter::SConvertParams::scratch_for_device_AS_build_t>(
diff --git a/common/include/nbl/examples/common/CSwapchainFramebuffersAndDepth.hpp b/common/include/nbl/examples/common/CSwapchainFramebuffersAndDepth.hpp
index c7d780fdf..949026a3c 100644
--- a/common/include/nbl/examples/common/CSwapchainFramebuffersAndDepth.hpp
+++ b/common/include/nbl/examples/common/CSwapchainFramebuffersAndDepth.hpp
@@ -76,7 +76,7 @@ class CSwapchainFramebuffersAndDepth final : public video::CDefaultSwapchainFram
 					.depthUsage = IGPUImage::EUF_RENDER_ATTACHMENT_BIT
 				} });
 
-				device->allocate(image->getMemoryReqs(), image.get());
+				device->allocate(image->getMemoryReqs(), { image.get() });
 
 				m_depthBuffer = device->createImageView({
 					.flags = IGPUImageView::ECF_NONE,

From dc3a11446aba697238708174d41f44a5e29ec922 Mon Sep 17 00:00:00 2001
From: kevyuu <kyu.kevinyu@gmail.com>
Date: Mon, 25 May 2026 10:41:24 +0700
Subject: [PATCH 33/47] Use RAII exiter for module and stream cleanup

---
 76_CudaInterop/main.cpp | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index 516a6fe8b..512807574 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -186,12 +186,20 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         auto& cu = m_cuHandler->getCUDAFunctionTable();
 
         CUmodule   module;
-        CUfunction kernel;
-        CUstream   stream;
-
         ASSERT_CUDA_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr), m_cuHandler);
+        auto moduleCleanup = nbl::core::makeRAIIExiter([&]()
+        {
+            cu.pcuModuleUnload(module);
+        });
+
+        CUfunction kernel;
         ASSERT_CUDA_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "vectorAdd"), m_cuHandler);
+
+        CUstream   stream;
         ASSERT_CUDA_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), m_cuHandler);
+        auto streamCleanup = nbl::core::makeRAIIExiter([&] {
+            cu.pcuStreamDestroy_v2(stream);
+        });
 
         // CPU memory which we fill with random numbers between [-1,1] that will be copied to corresponding cudaMemory
         std::array<smart_refctd_ptr<ICPUBuffer>, 2> cpuBufs;
@@ -416,8 +424,6 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         ASSERT_CUDA_SUCCESS(cu.pcuLaunchHostFunc(stream, cudaCallback, &ctx), m_cuHandler);
         ASSERT_CUDA_SUCCESS(cu.pcuStreamSynchronize(stream), m_cuHandler);
 
-        ASSERT_CUDA_SUCCESS(cu.pcuModuleUnload(module), m_cuHandler);
-        ASSERT_CUDA_SUCCESS(cu.pcuStreamDestroy_v2(stream), m_cuHandler);
     }
 
     void testWmmaGemB1()

From b5caa9c08bee767a604c6b8a51aa71aa6fb5efcb Mon Sep 17 00:00:00 2001
From: kevyuu <kyu.kevinyu@gmail.com>
Date: Mon, 25 May 2026 10:42:02 +0700
Subject: [PATCH 34/47] Use c++ random facility

---
 76_CudaInterop/main.cpp | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index 512807574..c392bcc33 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -54,6 +54,8 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
     smart_refctd_ptr<CCUDAHandler> m_cuHandler;
     smart_refctd_ptr<CCUDADevice> m_cuDevice;
     smart_refctd_ptr<IUtilities> m_utils;
+    std::random_device m_randomDevice;
+    std::mt19937 m_randGenerator;
 
     IQueue* queue;
 
@@ -211,9 +213,15 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             buf = ICPUBuffer::create(std::move(params));
         }
 
+        std::uniform_real_distribution<float32_t> dist(-1.0f, 1.0f); // inputs in [-1,1) as the buffer comment above states, not +/-RAND_MAX
         for (auto buf_i = 0; buf_i < cpuBufs.size(); buf_i++)
+        {
+            auto* data = reinterpret_cast<float*>(cpuBufs[buf_i]->getPointer()); // looked up once per buffer instead of once per element
             for (auto elem_i = 0; elem_i < NumElements; elem_i++)
-                reinterpret_cast<float*>(cpuBufs[buf_i]->getPointer())[elem_i] = rand() / float(RAND_MAX);
+            {
+                data[elem_i] = dist(m_randGenerator);
+            }
+        }
 
         constexpr auto InputCount = 2;
         // // CUDA resources that we input to the kernel 'vectorAdd_kernel.cu'
@@ -491,8 +499,9 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
               cpuMatA[wordIdx] |= (1u << bitOffset);
             }
 
+            std::uniform_int_distribution<uint32_t> dist;
             // Fill cpuMatB with random bits
-            for (auto& val : cpuMatB) val = rand();
+            for (auto& val : cpuMatB) val = dist(m_randGenerator);
             
         };
         initBinaryMatrices();

From bed15b46acb90aafdb512abb3f51b656bac5e958 Mon Sep 17 00:00:00 2001
From: kevyuu <kyu.kevinyu@gmail.com>
Date: Mon, 25 May 2026 10:42:28 +0700
Subject: [PATCH 35/47] Fix conversion of cpuBuffer to gpuBuffer by setting
 contentHash

---
 76_CudaInterop/main.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index c392bcc33..74d9f2403 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -498,10 +498,12 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
               auto bitOffset = bitIdx % 32;
               cpuMatA[wordIdx] |= (1u << bitOffset);
             }
+            cpuBufferA->setContentHash(cpuBufferA->computeContentHash());
 
             std::uniform_int_distribution<uint32_t> dist;
             // Fill cpuMatB with random bits
             for (auto& val : cpuMatB) val = dist(m_randGenerator);
+            cpuBufferB->setContentHash(cpuBufferB->computeContentHash());
             
         };
         initBinaryMatrices();

From 48c19621c882f82bd070af04966ca77ad48c60f9 Mon Sep 17 00:00:00 2001
From: kevyuu <kyu.kevinyu@gmail.com>
Date: Mon, 25 May 2026 10:42:50 +0700
Subject: [PATCH 36/47] Fix block dimension of testVectorAddKernel

---
 76_CudaInterop/main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index 74d9f2403..db1f08f74 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -180,7 +180,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
     void testVectorAddKernel()
     {
         static constexpr uint32_t GridDim[3] = { 4096,1,1 };
-        static constexpr uint32_t BlockDim[3] = { 1,1,1 };
+        static constexpr uint32_t BlockDim[3] = { 1024,1,1 };
         static constexpr size_t NumElements = GridDim[0] * BlockDim[0];
         static constexpr size_t BufferSize = sizeof(float) * NumElements;
 

From da64d2e9176930f369e3d91a5b45aad76cfc4757 Mon Sep 17 00:00:00 2001
From: kevyuu <kyu.kevinyu@gmail.com>
Date: Mon, 25 May 2026 10:43:31 +0700
Subject: [PATCH 37/47] Implement the rest of testWmmaGemB1

---
 76_CudaInterop/main.cpp | 137 ++++++++++++++++++++++++++++------------
 1 file changed, 95 insertions(+), 42 deletions(-)

diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index db1f08f74..adee6a671 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -49,7 +49,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
 public:
     // Yay thanks to multiple inheritance we cannot forward ctors anymore
     CUDA2VKApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
-        system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
+        system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD), m_randGenerator(m_randomDevice()) {}
 
     smart_refctd_ptr<CCUDAHandler> m_cuHandler;
     smart_refctd_ptr<CCUDADevice> m_cuDevice;
@@ -94,15 +94,15 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
 
         queue = getComputeQueue();
 
-        testWmmaGemB1();
         testVectorAddKernel();
+        testWmmaGemB1();
         testDestruction();
         testLargeAllocations();
 
         return true;
     }
 
-    smart_refctd_ptr<IGPUBuffer> createExternalBuffer2(uint64_t size, core::bitflag<IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE> externalHandleTypes)
+    smart_refctd_ptr<IGPUBuffer> createExternalBuffer(uint64_t size, core::bitflag<IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE> externalHandleTypes)
     {
         IGPUBuffer::SCreationParams params = {};
         params.size = size;
@@ -117,7 +117,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         IGPUBuffer::SCreationParams params = {};
         params.size = mem->getAllocationSize();
         params.usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT;
-        params.externalHandleTypes = mem->getCreationParams().externalHandleType;
+        params.externalHandleTypes = mem->getCreationParams().externalHandleTypes;
         auto buf = m_device->createBuffer(std::move(params));
         ILogicalDevice::SBindBufferMemoryInfo bindInfo = { .buffer = buf.get(), .binding = {.memory = mem } };
         m_device->bindBufferMemory(1, &bindInfo);
@@ -244,7 +244,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         
         ISemaphore::SCreationParams semParams;
         semParams.initialValue = 0;
-        semParams.externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32;
+        semParams.externalHandleTypes = CCUDADevice::ExternalSemaphoreHandleType;
         auto semaphore = m_device->createSemaphore(std::move(semParams));
         const auto cudaSemaphore = m_cuDevice->importExternalSemaphore(core::smart_refctd_ptr(semaphore));
         if (!cudaSemaphore)
@@ -462,28 +462,20 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         const size_t matB_size = (ElementCount.z * ElementCount.y) / 32 * sizeof(uint32_t); // K x N bits
         const size_t matC_size = ElementCount.x * ElementCount.y * sizeof(int32_t);         // M x N ints
 
-        auto [vkBufferMatA, cuMemMatA] = createSharedBuffer(matA_size);
-        auto [vkBufferMatB, cuMemMatB] = createSharedBuffer(matB_size);
         auto [vkBufferMatC, cuMemMatC] = createSharedBuffer(matC_size);
 
-        // ICPUBuffer::SCreationParams cpuBufferParamsA;
-        // cpuBufferParamsA.size = ElementCount.x * ElementCount.z / 32;
-        // const auto cpuBufferA = ICPUBuffer::create(std::move(cpuBufferParamsA));
-
-        // ICPUBuffer::SCreationParams cpuBufferParamsB;
-        // cpuBufferParamsB.size = ElementCount.x * ElementCount.z / 32;
-        // const auto cpuBufferB = ICPUBuffer::create(std::move(cpuBufferParamsB));
-        //
-        // std::array inputBuffers = {cpuBufferA.get(), cpuBufferB.get()};
-        //
-        // CAssetConverter::SInputs inputs = {};
-        // std::get<CAssetConverter::SInputs::asset_span_t<ICPUBuffer>>(inputs.assets) = inputBuffers;
-
-        // CPU matrices for initialization and verification
-        core::vector<uint32_t> cpuMatA(ElementCount.x * ElementCount.z / 32);
-        core::vector<uint32_t> cpuMatB(ElementCount.z * ElementCount.y / 32);
-
-
+        ICPUBuffer::SCreationParams cpuBufferParamsA;
+        cpuBufferParamsA.size = matA_size;
+        const auto cpuBufferA = ICPUBuffer::create(std::move(cpuBufferParamsA));
+        const auto cpuBufferAData = reinterpret_cast<uint32_t*>(cpuBufferA->getPointer());
+        const auto cpuMatA = std::span(cpuBufferAData, matA_size / sizeof(uint32_t));
+
+        ICPUBuffer::SCreationParams cpuBufferParamsB;
+        cpuBufferParamsB.size = matB_size;
+        const auto cpuBufferB = ICPUBuffer::create(std::move(cpuBufferParamsB));
+        const auto cpuBufferBData = reinterpret_cast<uint32_t*>(cpuBufferB->getPointer());
+        const auto cpuMatB = std::span(cpuBufferBData, matB_size / sizeof(uint32_t));
+        
         // Initialize with simple patterns for verification
         auto initBinaryMatrices = [&]()
         {
@@ -507,7 +499,81 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             
         };
         initBinaryMatrices();
+
+        std::array inputBuffers = {cpuBufferA.get(), cpuBufferB.get()};
+
+        CAssetConverter::SInputs inputs = {};
+        std::get<CAssetConverter::SInputs::asset_span_t<ICPUBuffer>>(inputs.assets) = inputBuffers;
+        std::array<CAssetConverter::patch_t<asset::ICPUBuffer>, std::size(inputBuffers)> inputBufferPatches;
+        for (auto& inputPatch : inputBufferPatches)
+        {
+          inputPatch.usage = asset::IBuffer::EUF_STORAGE_BUFFER_BIT | asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT;
+          inputPatch.externalHandleTypes = CCUDADevice::ExternalMemoryHandleType;
+        }
+        std::get<CAssetConverter::SInputs::patch_span_t<ICPUBuffer>>(inputs.patches) = inputBufferPatches;
+        smart_refctd_ptr<CAssetConverter> converter = CAssetConverter::create({ .device = m_device.get(), .optimizer = {} });
+        auto reservation = converter->reserve(inputs);
+        if (!reservation)
+        {
+            logFail("reserve failed!");
+            return;
+        }
+
+        // Create transfer queue resources
+        auto transferQueue = getComputeQueue();
+        auto transferCmdPool = m_device->createCommandPool(
+          transferQueue->getFamilyIndex(),
+          IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT | IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT
+        );
+
+        // SIntendedSubmitInfo needs at least one scratch cmdbuf in RECORDING state
+        smart_refctd_ptr<IGPUCommandBuffer> transferCmdBuf;
+        transferCmdPool->createCommandBuffers(
+          IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &transferCmdBuf, smart_refctd_ptr(m_logger)
+        );
+        transferCmdBuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+
+        auto transferScratchSemaphore = m_device->createSemaphore({ .initialValue = 0 });
+
+        IQueue::SSubmitInfo::SCommandBufferInfo transferCmdBufInfo = {
+          transferCmdBuf.get()
+        };
+        SIntendedSubmitInfo transferSubmitInfo;
+        transferSubmitInfo.queue = transferQueue;
+        transferSubmitInfo.scratchCommandBuffers = { &transferCmdBufInfo, 1 };
+        transferSubmitInfo.scratchSemaphore = {
+            .semaphore = transferScratchSemaphore.get(),
+            .value = 0,
+            .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS,
+        };
+
+        nbl::video::CAssetConverter::SConvertParams convertParams = {};
+        convertParams.utilities = m_utils.get();
+        convertParams.transfer = &transferSubmitInfo;
   
+        auto future = reservation.convert(convertParams);
+        if (future.copy() != IQueue::RESULT::SUCCESS)
+        {
+            logFail("CAssetConverter convert failed!");
+            return;
+        }
+
+        auto gpuBuffers = reservation.getGPUObjects<ICPUBuffer>();
+        auto gpuBufferA = gpuBuffers[0].value;
+        const auto boundedMemA = gpuBufferA->getBoundMemory();
+        auto cuMemMatA = m_cuDevice->importExternalMemory(core::smart_refctd_ptr<IDeviceMemoryAllocation>(boundedMemA.memory));
+
+        auto gpuBufferB = gpuBuffers[1].value;
+        const auto boundedMemB = gpuBufferB->getBoundMemory();
+        auto cuMemMatB = m_cuDevice->importExternalMemory(
+          core::smart_refctd_ptr<IDeviceMemoryAllocation>(boundedMemB.memory));
+        
+        std::array<smart_refctd_ptr<IGPUCommandBuffer>, 2> cmd;
+        auto commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+        commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, cmd.size(), cmd.data(), smart_refctd_ptr(m_logger));
+
+        const auto outputStagingBuffer = createStaging(vkBufferMatC->getSize());
+
         ISemaphore::SCreationParams semParams;
         semParams.externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32;
         semParams.initialValue = 0;
@@ -515,12 +581,6 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         const auto cudaSemaphore = m_cuDevice->importExternalSemaphore(core::smart_refctd_ptr(semaphore));
         if (!cudaSemaphore)
           logFail("Fail to import Vulkan Semaphore into CUDA!");
-        
-        std::array<smart_refctd_ptr<IGPUCommandBuffer>, 2> cmd;
-        auto commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
-        commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, cmd.size(), cmd.data(), smart_refctd_ptr(m_logger));
-
-        const auto outputStagingBuffer = createStaging(vkBufferMatC->getSize());
 
         // Release ownership to CUDA
         {
@@ -553,15 +613,10 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         // Launch CUDA kernel
         {
             CUdeviceptr matrixAPtr, matrixBPtr, matrixCPtr;
-            cuMemMatA->getMappedBuffer(&matrixAPtr);
-            cuMemMatB->getMappedBuffer(&matrixBPtr);
+            cuMemMatA->getMappedBuffer(&matrixAPtr, gpuBufferA->getSize(), gpuBufferA->getBoundMemory().offset);
+            cuMemMatB->getMappedBuffer(&matrixBPtr, gpuBufferB->getSize(), gpuBufferB->getBoundMemory().offset);
             cuMemMatC->getMappedBuffer(&matrixCPtr);
 
-            ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(matrixAPtr, cpuMatA.data(), matA_size, stream), m_cuHandler);
-            ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(matrixBPtr, cpuMatB.data(), matB_size, stream), m_cuHandler);
-            core::vector<int32_t> cpuMatC(ElementCount.x * ElementCount.y, 15);
-            ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(matrixCPtr, cpuMatC.data(), matC_size, stream), m_cuHandler);
-
             void* parameters[] = { &matrixAPtr, &matrixBPtr, &matrixCPtr, 
                                    (void*)&ElementCount.x, (void*)&ElementCount.y, (void*)&ElementCount.z };
 
@@ -569,9 +624,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } };
             ASSERT_CUDA_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore_cu, &waitParams, 1, stream), m_cuHandler);
             
-            ASSERT_CUDA_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim.x, GridDim.y, 1, 
-                                                   BlockDim.x, BlockDim.y, 1, 
-                                                   0, stream, parameters, nullptr), m_cuHandler);
+            ASSERT_CUDA_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim.x, GridDim.y, 1, BlockDim.x, BlockDim.y, 1, 0, stream, parameters, nullptr), m_cuHandler);
             
             const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } };
             ASSERT_CUDA_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore_cu, &signalParams, 1, stream), m_cuHandler);
@@ -668,7 +721,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             const auto cudaMemory = m_cuDevice->createExportableMemory({ .size = BufferSize, .alignment = sizeof(float), .locationType = CU_MEM_LOCATION_TYPE_DEVICE });
             if (!cudaMemory) logFail("Fail to create exportable memory!");
 
-            auto tmpBuf = createExternalBuffer2(cudaMemory->getCreationParams().granularSize, IDeviceMemoryAllocation::EHT_OPAQUE_WIN32);
+            auto tmpBuf = createExternalBuffer(cudaMemory->getCreationParams().granularSize, IDeviceMemoryAllocation::EHT_OPAQUE_WIN32);
             escaped = cudaMemory->exportAsMemory(m_device.get(), tmpBuf.get());
             if (!escaped) logFail("Fail to export CUDA memory!");
         

From ca8a8adb338c37c1ad7af5de22a201bc7e73e3f9 Mon Sep 17 00:00:00 2001
From: kevyuu <kyu.kevinyu@gmail.com>
Date: Mon, 25 May 2026 11:17:18 +0700
Subject: [PATCH 38/47] Improve the comment on testVectorAddKernel

---
 76_CudaInterop/main.cpp | 112 ++++++++++++++++++++++------------------
 1 file changed, 63 insertions(+), 49 deletions(-)

diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index adee6a671..59f31f081 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -179,6 +179,23 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
 
     void testVectorAddKernel()
     {
+        // This function demonstrates bidirectional resource sharing between CUDA and Vulkan:
+        // 
+        // Shared Resources:
+        // - 3 buffers: 2 input buffers + 1 output buffer for vector addition results
+        // - 1 semaphore for synchronization
+        //
+        // Memory Allocation Patterns:
+        // - Input buffers: Allocated by CUDA (CCUDADevice::createExportableMemory) → imported to Vulkan
+        // - Output buffer: Allocated by Vulkan → imported to CUDA (CCUDADevice::importExternalMemory)
+        //
+        // Synchronization:
+        // - Semaphore: Created by Vulkan → imported to CUDA
+        // - Demonstrates bidirectional signaling: CUDA signals → Vulkan waits, and vice versa
+        //
+        // Data Flow:
+        // - CUDA kernel writes to shared buffer → Vulkan reads the results
+
         static constexpr uint32_t GridDim[3] = { 4096,1,1 };
         static constexpr uint32_t BlockDim[3] = { 1024,1,1 };
         static constexpr size_t NumElements = GridDim[0] * BlockDim[0];
@@ -189,8 +206,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
 
         CUmodule   module;
         ASSERT_CUDA_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr), m_cuHandler);
-        auto moduleCleanup = nbl::core::makeRAIIExiter([&]()
-        {
+        auto moduleCleanup = nbl::core::makeRAIIExiter([&]() {
             cu.pcuModuleUnload(module);
         });
 
@@ -224,28 +240,35 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         }
 
         constexpr auto InputCount = 2;
-        // // CUDA resources that we input to the kernel 'vectorAdd_kernel.cu'
-        // // Kernel writes to cudaInputMemories[2] which we later use to export and read on nabla side
+        // Create CUDA-allocated input buffers that will be exported to Vulkan
+        // This demonstrates the CUDA → Vulkan memory sharing pattern 
         std::array<smart_refctd_ptr<CCUDAExportableMemory>, InputCount> cudaInputMemories = {};
         std::array<smart_refctd_ptr<IDeviceMemoryAllocation>, InputCount> vulkanMemories = {};
         std::array<smart_refctd_ptr<IGPUBuffer>, InputCount> vulkanInputBuffers = {};
         std::array<smart_refctd_ptr<IGPUBuffer>, InputCount> inputStagingBuffers = {};
 
-        for (auto input_i = 0; input_i < InputCount; input_i++)
+        auto initInputBuffers = [&]
         {
-          // create and allocate CUmem with CUDA and slap it inside a simple IReferenceCounted wrapper
-          cudaInputMemories[input_i] = m_cuDevice->createExportableMemory({ .size = BufferSize, .alignment = sizeof(float), .locationType = CU_MEM_LOCATION_TYPE_DEVICE });
-          vulkanMemories[input_i] = cudaInputMemories[input_i]->exportAsMemory(m_device.get(), nullptr);
-          vulkanInputBuffers[input_i] = createExternalBuffer(vulkanMemories[input_i].get());
-          inputStagingBuffers[input_i] = createStaging(BufferSize);
-        }
+            for (auto input_i = 0; input_i < InputCount; input_i++)
+            {
+              cudaInputMemories[input_i] = m_cuDevice->createExportableMemory({ .size = BufferSize, .alignment = sizeof(float), .locationType = CU_MEM_LOCATION_TYPE_DEVICE });
+              vulkanMemories[input_i] = cudaInputMemories[input_i]->exportAsMemory(m_device.get(), nullptr);
+              vulkanInputBuffers[input_i] = createExternalBuffer(vulkanMemories[input_i].get());
+              inputStagingBuffers[input_i] = createStaging(BufferSize);
+            }
+        };
+        initInputBuffers();
 
+        // Create Vulkan-allocated output buffer and import to CUDA
+        // This demonstrates the Vulkan → CUDA memory sharing pattern
         auto [outputBuf, cudaOutputMemory] = createSharedBuffer(BufferSize);
         
+        // Create timeline semaphore for cross-API synchronization
+        // Timeline values: 0=initial, 1=release vulkan output buffer ownership, 2=cuda kernel done, 3=copy done 
         ISemaphore::SCreationParams semParams;
         semParams.initialValue = 0;
         semParams.externalHandleTypes = CCUDADevice::ExternalSemaphoreHandleType;
-        auto semaphore = m_device->createSemaphore(std::move(semParams));
+        const auto semaphore = m_device->createSemaphore(std::move(semParams));
         const auto cudaSemaphore = m_cuDevice->importExternalSemaphore(core::smart_refctd_ptr(semaphore));
         if (!cudaSemaphore)
           logFail("Fail to import Vulkan Semaphore into CUDA!");
@@ -256,7 +279,8 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         
         const auto outputStagingBuffer = createStaging(BufferSize);
 
-        // First we record a release ownership transfer to let vulkan know that resources are going to be used in an external API
+        // === Phase 1: Vulkan releases ownership to external queue (CUDA) ===
+        // Signal semaphore to value=1 after ownership transfer
         {
             const IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> bufBarrier = {
                 .barrier = {
@@ -297,8 +321,13 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             if (!re) logFail("Something went wrong readying resources for CUDA");
         }
         
-        // Launch kernel
+        // === Phase 2: CUDA executes kernel ===
+        // 1. Copy input data from CPU to CUDA device memory
+        // 2. Wait for semaphore value=1 (ownership released)
+        // 3. Launch vectorAdd kernel
+        // 4. Signal semaphore to value=2 (kernel complete)
         {
+            // Step 1
             CUdeviceptr outputBufPtr;
             cudaOutputMemory->getMappedBuffer(&outputBufPtr);
             CUdeviceptr ptrs[] = {
@@ -311,15 +340,22 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[0], cpuBufs[0]->getPointer(), BufferSize, stream), m_cuHandler);
             ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(ptrs[1], cpuBufs[1]->getPointer(), BufferSize, stream), m_cuHandler);
     
+            // Step 2
             CUexternalSemaphore semaphore = cudaSemaphore->getInternalObject();
             const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } };
             ASSERT_CUDA_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream), m_cuHandler); // Wait for release op from vulkan
+
+            // Step 3
             ASSERT_CUDA_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, stream, parameters, nullptr), m_cuHandler);
+
+            // Step 4
             const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } };
             ASSERT_CUDA_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream), m_cuHandler); // Signal the imported semaphore
         }
-        
-        // After the cuda kernel has signalled our exported vk semaphore, we will download the results through the buffer imported from CUDA
+
+        // === Phase 3: Vulkan acquires ownership and copies results ===
+        // Wait for semaphore value=2, then copy output to staging buffer
+        // Signal semaphore to value=3 after copy completes
         {
             const IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> bufBarrier = {
                 .barrier = {
@@ -372,48 +408,28 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
 
         }
 
-        struct CallbackContext
+        // === Phase 4: Validate the output buffer content ===
         {
-            core::smart_refctd_ptr<ISemaphore> semaphore;
-            std::array<core::smart_refctd_ptr<ICPUBuffer>, InputCount> cpuBuffers;
-            std::array<core::smart_refctd_ptr<IGPUBuffer>, InputCount> inputStagingBuffers;
-            core::smart_refctd_ptr<IGPUBuffer> outputStagingBuffer;
-            core::smart_refctd_ptr<video::ILogicalDevice> device;
-            core::smart_refctd_ptr<system::ILogger> logger;
-        };
-
-        CallbackContext ctx;
-        ctx.semaphore = semaphore;
-        ctx.cpuBuffers = cpuBufs;
-        ctx.inputStagingBuffers = inputStagingBuffers;
-        ctx.outputStagingBuffer = outputStagingBuffer;
-        ctx.device = m_device;
-        ctx.logger = m_logger;
-
-        auto cudaCallback = [](void* userData)
-        {
-            const auto* ctx = reinterpret_cast<CallbackContext*>(userData);
-
             // Make sure we are also done with the readback 
             const auto wait = std::array{
               ISemaphore::SWaitInfo{
-                .semaphore = ctx->semaphore.get(), 
+                .semaphore = semaphore.get(), 
                 .value = 3,
               }
             };
-            ctx->device->blockForSemaphores(wait, true);
+            m_device->blockForSemaphores(wait, true);
 
-            auto* stagingMem = ctx->outputStagingBuffer->getBoundMemory().memory;
+            auto* stagingMem = outputStagingBuffer->getBoundMemory().memory;
             if (!stagingMem->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
             {
                 ILogicalDevice::MappedMemoryRange range(stagingMem, 0, stagingMem->getAllocationSize());
-                ctx->device->invalidateMappedMemoryRanges(1, &range);
+                m_device->invalidateMappedMemoryRanges(1, &range);
             }
 
-            const auto* inputs1 = reinterpret_cast<float*>(ctx->cpuBuffers[0]->getPointer());
-            const auto* inputs2 = reinterpret_cast<float*>(ctx->cpuBuffers[1]->getPointer());
+            const auto* inputs1 = reinterpret_cast<float*>(cpuBufs[0]->getPointer());
+            const auto* inputs2 = reinterpret_cast<float*>(cpuBufs[1]->getPointer());
 
-            const auto* outputs = reinterpret_cast<float*>(ctx->outputStagingBuffer->getBoundMemory().memory->getMappedPointer());
+            const auto* outputs = reinterpret_cast<float*>(outputStagingBuffer->getBoundMemory().memory->getMappedPointer());
 
             for (auto elem_i = 0; elem_i < NumElements; elem_i++)
             {
@@ -423,14 +439,12 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
               const auto expected = input1 + input2;
               const auto diff = abs(output - expected);
               if (diff > 0.01)
-                ctx->logger->log("TestSharedResources: Element at index %d is incorrect!", ILogger::ELL_ERROR, elem_i);
+                m_logger->log("TestVectorAdd: Element at index %d is incorrect!", ILogger::ELL_ERROR, elem_i);
             }
 
-            ctx->logger->log("TestSharedResources Complete", ILogger::ELL_INFO);
-        };
+            m_logger->log("TestVectorAdd Complete", ILogger::ELL_INFO);
 
-        ASSERT_CUDA_SUCCESS(cu.pcuLaunchHostFunc(stream, cudaCallback, &ctx), m_cuHandler);
-        ASSERT_CUDA_SUCCESS(cu.pcuStreamSynchronize(stream), m_cuHandler);
+        }
 
     }
 

From f07899c26a04fdd73d0d8ef79a092be6b25af400 Mon Sep 17 00:00:00 2001
From: kevyuu <kyu.kevinyu@gmail.com>
Date: Mon, 25 May 2026 12:16:11 +0700
Subject: [PATCH 39/47] Proper resource cleanup for testWmmaGemB1

---
 76_CudaInterop/main.cpp | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index 59f31f081..ef0f6477a 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -464,12 +464,19 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         auto& cu = m_cuHandler->getCUDAFunctionTable();
 
         CUmodule   module;
-        CUfunction kernel;
-        CUstream   stream;
-
         ASSERT_CUDA_SUCCESS(cu.pcuModuleLoadDataEx(&module, ptx->getPointer(), 0u, nullptr, nullptr), m_cuHandler);
+        auto moduleCleanup = nbl::core::makeRAIIExiter([&]() {
+            cu.pcuModuleUnload(module);
+        });
+
+        CUfunction kernel;
         ASSERT_CUDA_SUCCESS(cu.pcuModuleGetFunction(&kernel, module, "b1_wmma_gemm_kernel"), m_cuHandler);
+
+        CUstream   stream;
         ASSERT_CUDA_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), m_cuHandler);
+        auto streamCleanup = nbl::core::makeRAIIExiter([&] {
+            cu.pcuStreamDestroy_v2(stream);
+        });
 
         // Calculate buffer sizes (bits packed into uint32_t)
         const size_t matA_size = (ElementCount.x * ElementCount.z) / 32 * sizeof(uint32_t); // M x K bits

From 7ead24c7d5335f5c3bf71c11c25ebaa86fa6a8c5 Mon Sep 17 00:00:00 2001
From: kevyuu <kyu.kevinyu@gmail.com>
Date: Mon, 25 May 2026 15:28:25 +0700
Subject: [PATCH 40/47] Use SyncPoint* constant instead of magic number

---
 76_CudaInterop/main.cpp | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index ef0f6477a..b717a354d 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -265,8 +265,12 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         
         // Create timeline semaphore for cross-API synchronization
         // Timeline values: 0=initial, 1=release vulkan output buffer ownership, 2=cuda kernel done, 3=copy done 
+        static constexpr uint64_t SyncPointInitial = 0;
+        static constexpr uint64_t SyncPointReleased = 1;
+        static constexpr uint64_t SyncPointKernelDone = 2;
+        static constexpr uint64_t SyncPointCopyDone = 3;
         ISemaphore::SCreationParams semParams;
-        semParams.initialValue = 0;
+        semParams.initialValue = SyncPointInitial;
         semParams.externalHandleTypes = CCUDADevice::ExternalSemaphoreHandleType;
         const auto semaphore = m_device->createSemaphore(std::move(semParams));
         const auto cudaSemaphore = m_cuDevice->importExternalSemaphore(core::smart_refctd_ptr(semaphore));
@@ -308,7 +312,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
     
             const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = {
               .semaphore = semaphore.get(), 
-              .value = 1,
+              .value = SyncPointReleased,
               .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS,
             };
             const IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[0].get() };
@@ -342,14 +346,14 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
     
             // Step 2
             CUexternalSemaphore semaphore = cudaSemaphore->getInternalObject();
-            const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } };
+            const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = SyncPointReleased } } };
             ASSERT_CUDA_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore, &waitParams, 1, stream), m_cuHandler); // Wait for release op from vulkan
 
             // Step 3
             ASSERT_CUDA_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim[0], GridDim[1], GridDim[2], BlockDim[0], BlockDim[1], BlockDim[2], 0, stream, parameters, nullptr), m_cuHandler);
 
             // Step 4
-            const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } };
+            const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = SyncPointKernelDone } } };
             ASSERT_CUDA_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore, &signalParams, 1, stream), m_cuHandler); // Signal the imported semaphore
         }
 
@@ -387,12 +391,12 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             
             const IQueue::SSubmitInfo::SSemaphoreInfo waitInfo= {
               .semaphore = semaphore.get(), 
-              .value = 2,
+              .value = SyncPointKernelDone,
               .stageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
             };
             const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = {
               .semaphore = semaphore.get(), 
-              .value = 3,
+              .value = SyncPointCopyDone,
               .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS,
             };
             const IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[1].get() };
@@ -414,7 +418,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             const auto wait = std::array{
               ISemaphore::SWaitInfo{
                 .semaphore = semaphore.get(), 
-                .value = 3,
+                .value = SyncPointCopyDone,
               }
             };
             m_device->blockForSemaphores(wait, true);

From 78835314611a9933344359e6a8b28cee02717f8c Mon Sep 17 00:00:00 2001
From: kevyuu <kyu.kevinyu@gmail.com>
Date: Mon, 25 May 2026 15:29:08 +0700
Subject: [PATCH 41/47] Refactor testWmmaGemmb1

---
 76_CudaInterop/main.cpp | 166 ++++++++++++++++++++++++----------------
 1 file changed, 102 insertions(+), 64 deletions(-)

diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index b717a354d..d9d3138af 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -454,6 +454,32 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
 
     void testWmmaGemB1()
     {
+        // This function demonstrates a key advantage of CUDA-Vulkan interoperability:
+        // accessing CUDA-exclusive hardware features that Vulkan cannot natively support.
+        //
+        // WMMA (Warp Matrix Multiply-Accumulate) with b1 (1-bit) primitives leverages
+        // specialized Tensor Core instructions for ultra-efficient binary matrix operations.
+        // Since Vulkan lacks native support for 1-bit matrix operations, this test showcases
+        // how applications can:
+        // 1. Allocate and manage matrices using Vulkan's memory system
+        // 2. Share those buffers with CUDA via external memory handles
+        // 3. Execute CUDA-exclusive Tensor Core operations (b1 WMMA GEMM)
+        // 4. Retrieve results back to Vulkan for further GPU processing or readback
+        //
+        // Test methodology:
+        // - Matrix A (M×K): 1-bit reverse diagonal matrix (1s on anti-diagonal, 0s elsewhere)
+        // - Matrix B (K×N): 1-bit random matrix
+        // - Matrix C (M×N): Result stored as int32s (popcount of bitwise AND per row/col pair)
+        //
+        // Verification strategy:
+        // Left-multiplying B by a reverse diagonal matrix reverses the order of B's rows, i.e. each
+        // column of B is reversed. This makes verification trivial: C[i,j] should equal B[K-1-i, j].
+        // Example with K=4:
+        //   [0 0 0 1]   [b00 b01]   [b30 b31]
+        //   [0 0 1 0] × [b10 b11] = [b20 b21]
+        //   [0 1 0 0]   [b20 b21]   [b10 b11]
+        //   [1 0 0 0]   [b30 b31]   [b00 b01]
+
         // b1 WMMA dimensions: M=8, N=8, K=128
         constexpr auto WmmaSize = uint32_t3{ 8, 8, 128 };
         constexpr auto TileCount = uint32_t3{ 128, 128, 8 };  // Adjust for b1 dimensions
@@ -463,6 +489,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             (ElementCount.x + WmmaSize.x - 1) / WmmaSize.x,  // M tiles
             (ElementCount.y + WmmaSize.y - 1) / WmmaSize.y   // N tiles
         );
+        static constexpr auto BitsPerUint32 = 32;
 
         const auto ptx = compilePtx("app_resources/wmmaGemm_b1_kernel.cu");
         auto& cu = m_cuHandler->getCUDAFunctionTable();
@@ -483,11 +510,16 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         });
 
         // Calculate buffer sizes (bits packed into uint32_t)
-        const size_t matA_size = (ElementCount.x * ElementCount.z) / 32 * sizeof(uint32_t); // M x K bits
-        const size_t matB_size = (ElementCount.z * ElementCount.y) / 32 * sizeof(uint32_t); // K x N bits
+        const size_t matA_size = (ElementCount.x * ElementCount.z) / BitsPerUint32 * sizeof(uint32_t); // M x K bits
+        const size_t matB_size = (ElementCount.z * ElementCount.y) / BitsPerUint32 * sizeof(uint32_t); // K x N bits
         const size_t matC_size = ElementCount.x * ElementCount.y * sizeof(int32_t);         // M x N ints
 
         auto [vkBufferMatC, cuMemMatC] = createSharedBuffer(matC_size);
+        if (!vkBufferMatC || !cuMemMatC)
+        {
+            logFail("Failed to create shared buffer for matrix C");
+            return;
+        }
 
         ICPUBuffer::SCreationParams cpuBufferParamsA;
         cpuBufferParamsA.size = matA_size;
@@ -506,13 +538,12 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         {
             // Fill cpuMatA with reverse diagonal pattern
             std::fill(cpuMatA.begin(), cpuMatA.end(), 0);
-
             for (int i = 0; i < ElementCount.x; i++)
             {
               auto j = ElementCount.z - 1 - i;
               auto bitIdx = i * ElementCount.z + j;
-              auto wordIdx = bitIdx / 32;
-              auto bitOffset = bitIdx % 32;
+              auto wordIdx = bitIdx / BitsPerUint32;
+              auto bitOffset = bitIdx % BitsPerUint32;
               cpuMatA[wordIdx] |= (1u << bitOffset);
             }
             cpuBufferA->setContentHash(cpuBufferA->computeContentHash());
@@ -599,9 +630,13 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
 
         const auto outputStagingBuffer = createStaging(vkBufferMatC->getSize());
 
+        static constexpr uint64_t SyncPointInitial = 0;
+        static constexpr uint64_t SyncPointReleased = 1;
+        static constexpr uint64_t SyncPointKernelDone = 2;
+        static constexpr uint64_t SyncPointCopyDone = 3;
         ISemaphore::SCreationParams semParams;
         semParams.externalHandleTypes = ISemaphore::EHT_OPAQUE_WIN32;
-        semParams.initialValue = 0;
+        semParams.initialValue = SyncPointInitial;
         auto semaphore = m_device->createSemaphore(std::move(semParams));
         const auto cudaSemaphore = m_cuDevice->importExternalSemaphore(core::smart_refctd_ptr(semaphore));
         if (!cudaSemaphore)
@@ -626,7 +661,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             cmd[0]->end();
 
             const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = {
-              .semaphore = semaphore.get(), .value = 1, .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS,
+              .semaphore = semaphore.get(), .value = SyncPointReleased, .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS,
             };
             const IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[0].get() };
             const IQueue::SSubmitInfo submitInfo = {
@@ -646,12 +681,12 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
                                    (void*)&ElementCount.x, (void*)&ElementCount.y, (void*)&ElementCount.z };
 
             CUexternalSemaphore semaphore_cu = cudaSemaphore->getInternalObject();
-            const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = 1 } } };
+            const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams = { .params = {.fence = {.value = SyncPointReleased } } };
             ASSERT_CUDA_SUCCESS(cu.pcuWaitExternalSemaphoresAsync(&semaphore_cu, &waitParams, 1, stream), m_cuHandler);
             
             ASSERT_CUDA_SUCCESS(cu.pcuLaunchKernel(kernel, GridDim.x, GridDim.y, 1, BlockDim.x, BlockDim.y, 1, 0, stream, parameters, nullptr), m_cuHandler);
             
-            const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = 2 } } };
+            const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS signalParams = { .params = {.fence = {.value = SyncPointKernelDone } } };
             ASSERT_CUDA_SUCCESS(cu.pcuSignalExternalSemaphoresAsync(&semaphore_cu, &signalParams, 1, stream), m_cuHandler);
         }
 
@@ -676,10 +711,10 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             cmd[1]->end();
             
             const IQueue::SSubmitInfo::SSemaphoreInfo waitInfo = {
-              .semaphore = semaphore.get(), .value = 2, .stageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
+              .semaphore = semaphore.get(), .value = SyncPointKernelDone, .stageMask = PIPELINE_STAGE_FLAGS::COPY_BIT,
             };
             const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = {
-              .semaphore = semaphore.get(), .value = 3, .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS,
+              .semaphore = semaphore.get(), .value = SyncPointCopyDone, .stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS,
             };
             const IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmd[1].get() };
             const IQueue::SSubmitInfo submitInfo = { 
@@ -691,10 +726,11 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         }
 
         // Wait and verify results
-        const auto wait = std::array{ ISemaphore::SWaitInfo{.semaphore = semaphore.get(), .value = 3} };
-        m_device->blockForSemaphores(wait, true);
+        {
+            const auto wait = std::array{ ISemaphore::SWaitInfo{.semaphore = semaphore.get(), .value = SyncPointCopyDone} };
+            m_device->blockForSemaphores(wait, true);
 
-        auto* stagingMem = outputStagingBuffer->getBoundMemory().memory;
+            auto* stagingMem = outputStagingBuffer->getBoundMemory().memory;
         if (!stagingMem->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
         {
             ILogicalDevice::MappedMemoryRange range(stagingMem, 0, stagingMem->getAllocationSize());
@@ -713,77 +749,78 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
                 // The calculation below is to get the index of cpuMatB if the column is reversed to get the expected bit.
                 const auto row = i / ElementCount.y;
                 const auto col = i % ElementCount.y;
-                const auto expectedCol = col;
-                const auto expectedRow = ElementCount.z - row - 1;
-                const auto expectedIdx = expectedCol * ElementCount.z + expectedRow;
-                const auto expectedWordIdx = expectedIdx / 32;
-                const auto expectedBitOffset = expectedIdx % 32;
-                return (cpuMatB[expectedWordIdx] >> expectedBitOffset) & uint32_t(1);
-            }();
-            const auto result = results[i];
-            if (result != expected) {
-                m_logger->log("WMMA b1 test error at [%d]: GPU=%d, CPU=%d", 
-                             system::ILogger::ELL_ERROR, i, results[i], expected);
-                errors++;
-                success = false;
+                    const auto expectedCol = col;
+                    const auto expectedRow = ElementCount.z - row - 1;
+                    const auto expectedIdx = expectedCol * ElementCount.z + expectedRow;
+                    const auto expectedWordIdx = expectedIdx / BitsPerUint32;
+                    const auto expectedBitOffset = expectedIdx % BitsPerUint32;
+                    return (cpuMatB[expectedWordIdx] >> expectedBitOffset) & uint32_t(1);
+                }();
+                const auto result = results[i];
+                if (result != expected) {
+                    m_logger->log("WMMA b1 test error at [%d]: GPU=%d, CPU=%d", 
+                                 system::ILogger::ELL_ERROR, i, results[i], expected);
+                    errors++;
+                    success = false;
+                    constexpr int MaxErrorsToReport = 10;
+                    if (errors == MaxErrorsToReport) break;
+                }
             }
+            
+            if (success)
+                m_logger->log("b1 WMMA test PASSED!", system::ILogger::ELL_INFO);
+            else
+                m_logger->log("b1 WMMA test FAILED with %d errors!", system::ILogger::ELL_ERROR, errors);
         }
-        
-        if (success)
-            m_logger->log("b1 WMMA test PASSED!", system::ILogger::ELL_INFO);
-        else
-            m_logger->log("b1 WMMA test FAILED with %d errors!", system::ILogger::ELL_ERROR, errors);
     }
 
     void testDestruction()
     {
+        
+        // Tests proper resource lifetime management across CUDA-Vulkan interop by creating exportable CUDA memory,
+        // copying data to it, then destroying the CUDA memory object while keeping the exported Vulkan memory alive.
+        // Verifies that the exported memory remains valid and accessible after the original CUDA object is destroyed,
+        // confirming correct reference counting and external memory handle semantics.
+
         auto commandPool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
         constexpr auto ElementCount = 1024;
         constexpr auto BufferSize = ElementCount * sizeof(int);
+
+        // Construct testData
+        core::vector<uint32_t> testData(ElementCount);
+        std::iota(testData.begin(), testData.end(), 0);
+
         auto& cu = m_cuHandler->getCUDAFunctionTable();
+
+        // This vulkan memory will outlive the CUDA memory object below
         smart_refctd_ptr<IDeviceMemoryAllocation> escaped;
         {
+            // Create exportable CUDA memory - this object will be destroyed at the end of this scope
             const auto cudaMemory = m_cuDevice->createExportableMemory({ .size = BufferSize, .alignment = sizeof(float), .locationType = CU_MEM_LOCATION_TYPE_DEVICE });
             if (!cudaMemory) logFail("Fail to create exportable memory!");
 
-            auto tmpBuf = createExternalBuffer(cudaMemory->getCreationParams().granularSize, IDeviceMemoryAllocation::EHT_OPAQUE_WIN32);
-            escaped = cudaMemory->exportAsMemory(m_device.get(), tmpBuf.get());
+            // Export CUDA memory as Vulkan device memory - this reference will persist
+            escaped = cudaMemory->exportAsMemory(m_device.get());
             if (!escaped) logFail("Fail to export CUDA memory!");
-        
-            auto staging = createStaging(BufferSize);
-        
-            auto ptr = (uint32_t*)staging->getBoundMemory().memory->getMappedPointer();
-            for (uint32_t i = 0; i < ElementCount; ++i)
-                ptr[i] = i;
-        
-            ISemaphore::SCreationParams semParams;
-            semParams.initialValue = 0;
-            const auto semaphore = m_device->createSemaphore(std::move(semParams));
-            IQueue::SSubmitInfo::SSemaphoreInfo semInfo;
-            semInfo.semaphore = semaphore.get();
-            semInfo.stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS;
-            semInfo.value = 1;
-        
-            smart_refctd_ptr<IGPUCommandBuffer> cmdBuffer;
-            commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmdBuffer);
-            cmdBuffer->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-            IGPUCommandBuffer::SBufferCopy region = { .size = BufferSize };
-            assert(cmdBuffer->copyBuffer(staging.get(), tmpBuf.get(), 1, &region));
-            cmdBuffer->end();
-            IQueue::SSubmitInfo::SCommandBufferInfo cmdInfo = { cmdBuffer.get() };
-            const IQueue::SSubmitInfo submitInfo = {
-              .commandBuffers = {&cmdInfo, &cmdInfo + 1}, 
-              .signalSemaphores = {&semInfo, 1}
-            };
-            auto qre = queue->submit({ &submitInfo, &submitInfo + 1 });
-            assert(IQueue::RESULT::SUCCESS == qre);
-            m_device->waitIdle();
-        }        
+
+            // Copy testData into cudaMemory
+            CUstream stream;
+            ASSERT_CUDA_SUCCESS(cu.pcuStreamCreate(&stream, CU_STREAM_NON_BLOCKING), m_cuHandler);
+            auto streamCleanup = nbl::core::makeRAIIExiter([&] {
+                cu.pcuStreamDestroy_v2(stream);
+            });
+            ASSERT_CUDA_SUCCESS(cu.pcuMemcpyHtoDAsync_v2(cudaMemory->getDeviceptr(), testData.data(), BufferSize, stream), m_cuHandler);
+            ASSERT_CUDA_SUCCESS(cu.pcuStreamSynchronize(stream), m_cuHandler);
+
+        }
+        // CRITICAL: cudaMemory object destroyed here, but escaped memory should remain valid
         
         {
+            // Create a Vulkan buffer from the exported memory - this tests whether the memory survived CUDA object destruction
             auto tmpBuf = createExternalBuffer(escaped.get());
             auto staging = createStaging(BufferSize);
         
+            // Setup synchronization for readback
             ISemaphore::SCreationParams semParams;
             semParams.initialValue = 0;
             const auto semaphore = m_device->createSemaphore(std::move(semParams));
@@ -792,6 +829,7 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             semInfo.stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS;
             semInfo.value = 1;
         
+            // Copy data back from the persistent buffer to staging for verification
             smart_refctd_ptr<IGPUCommandBuffer> cmd;
             commandPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &cmd);
             cmd->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);

From 3cd947c256390e44e8212ba9506fcfc1974c26cd Mon Sep 17 00:00:00 2001
From: kevyuu <kyu.kevinyu@gmail.com>
Date: Mon, 25 May 2026 15:40:03 +0700
Subject: [PATCH 42/47] Misc refactor on testWmmaGemmb1

---
 76_CudaInterop/main.cpp | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index d9d3138af..e06bfd970 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -737,14 +737,13 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             m_device->invalidateMappedMemoryRanges(1, &range);
         }
 
-        const auto* results = reinterpret_cast<int32_t*>(stagingMem->getMappedPointer());
-        
-        // Verify results
-        bool success = true;
-        int errors = 0;
-        for (int i = 0; i < ElementCount.x * ElementCount.y; i++) {
-            const auto expected = [&]
-            {
+            const auto* results = reinterpret_cast<int32_t*>(stagingMem->getMappedPointer());
+            
+            // Verify results
+            int errorCount = 0;
+            for (int i = 0; i < ElementCount.x * ElementCount.y; i++) {
+                const auto expected = [&]
+                {
                 // Since we are multiplying reverse diagonal matrix to matrixB. The result should be matrix b but each column reversed.
                 // The calculation below is to get the index of cpuMatB if the column is reversed to get the expected bit.
                 const auto row = i / ElementCount.y;
@@ -760,17 +759,16 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
                 if (result != expected) {
                     m_logger->log("WMMA b1 test error at [%d]: GPU=%d, CPU=%d", 
                                  system::ILogger::ELL_ERROR, i, results[i], expected);
-                    errors++;
-                    success = false;
+                    errorCount++;
                     constexpr int MaxErrorsToReport = 10;
-                    if (errors == MaxErrorsToReport) break;
+                    if (errorCount == MaxErrorsToReport) break;
                 }
             }
             
-            if (success)
+            if (errorCount == 0)
                 m_logger->log("b1 WMMA test PASSED!", system::ILogger::ELL_INFO);
             else
-                m_logger->log("b1 WMMA test FAILED with %d errors!", system::ILogger::ELL_ERROR, errors);
+                m_logger->log("b1 WMMA test FAILED with %d errors!", system::ILogger::ELL_ERROR, errorCount);
         }
     }
 

From 5110e9b9f0074b0c1ab3a75c215b76016010cb0a Mon Sep 17 00:00:00 2001
From: kevyuu <kyu.kevinyu@gmail.com>
Date: Mon, 25 May 2026 16:26:18 +0700
Subject: [PATCH 43/47] Remove testLargeAllocations

---
 76_CudaInterop/main.cpp | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index e06bfd970..ac7182369 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -97,7 +97,6 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         testVectorAddKernel();
         testWmmaGemB1();
         testDestruction();
-        testLargeAllocations();
 
         return true;
     }
@@ -854,22 +853,6 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
     
     }
 
-    void testLargeAllocations()
-    {
-        // TODO(kevin): Calculate BufferSize that is big enough to fill the machine VRAM
-        constexpr auto BufferSize = 1024;
-        IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = {
-            .size = BufferSize,
-            .memoryTypeBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(),
-            .alignmentLog2 = 10,
-        };
-    
-        for (size_t i = 0; i < (1 << 8); ++i)
-        {
-            auto memory = m_device->allocate(reqs, { nullptr, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::ExternalMemoryHandleType }).memory;
-            assert(memory);
-            auto tmpBuf = createExternalBuffer(memory.get());
-        }
     }
 
     // Whether to keep invoking the above. In this example because its headless GPU compute, we do all the work in the app initialization.

From 347fae0d4a7480748126355e94fb0ff21193f6fd Mon Sep 17 00:00:00 2001
From: kevyuu <kyu.kevinyu@gmail.com>
Date: Mon, 25 May 2026 16:26:46 +0700
Subject: [PATCH 44/47] Remove unused method

---
 76_CudaInterop/main.cpp | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index ac7182369..63062e1f8 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -101,15 +101,6 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         return true;
     }
 
-    smart_refctd_ptr<IGPUBuffer> createExternalBuffer(uint64_t size, core::bitflag<IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE> externalHandleTypes)
-    {
-        IGPUBuffer::SCreationParams params = {};
-        params.size = size;
-        params.usage = asset::IBuffer::EUF_TRANSFER_SRC_BIT | asset::IBuffer::EUF_TRANSFER_DST_BIT;
-        params.externalHandleTypes = externalHandleTypes;
-        auto buf = m_device->createBuffer(std::move(params));
-        return buf;
-    }
 
     smart_refctd_ptr<IGPUBuffer> createExternalBuffer(IDeviceMemoryAllocation* mem)
     {

From 1aebc5a6e6f0e365aafe1e2ac4c742d7c4d3b6aa Mon Sep 17 00:00:00 2001
From: kevyuu <kyu.kevinyu@gmail.com>
Date: Mon, 25 May 2026 16:27:10 +0700
Subject: [PATCH 45/47] Misc refactor on testDestruction

---
 76_CudaInterop/main.cpp | 46 +++++++++++++++++++++++++++--------------
 1 file changed, 31 insertions(+), 15 deletions(-)

diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index 63062e1f8..6b0a5931d 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -101,7 +101,6 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         return true;
     }
 
-
     smart_refctd_ptr<IGPUBuffer> createExternalBuffer(IDeviceMemoryAllocation* mem)
     {
         IGPUBuffer::SCreationParams params = {};
@@ -721,11 +720,11 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             m_device->blockForSemaphores(wait, true);
 
             auto* stagingMem = outputStagingBuffer->getBoundMemory().memory;
-        if (!stagingMem->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
-        {
-            ILogicalDevice::MappedMemoryRange range(stagingMem, 0, stagingMem->getAllocationSize());
-            m_device->invalidateMappedMemoryRanges(1, &range);
-        }
+            if (!stagingMem->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
+            {
+                ILogicalDevice::MappedMemoryRange range(stagingMem, 0, stagingMem->getAllocationSize());
+                m_device->invalidateMappedMemoryRanges(1, &range);
+            }
 
             const auto* results = reinterpret_cast<int32_t*>(stagingMem->getMappedPointer());
             
@@ -734,10 +733,10 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             for (int i = 0; i < ElementCount.x * ElementCount.y; i++) {
                 const auto expected = [&]
                 {
-                // Since we are multiplying reverse diagonal matrix to matrixB. The result should be matrix b but each column reversed.
-                // The calculation below is to get the index of cpuMatB if the column is reversed to get the expected bit.
-                const auto row = i / ElementCount.y;
-                const auto col = i % ElementCount.y;
+                    // Since we multiply a reverse diagonal matrix by matrix B, the result should be matrix B with each column reversed.
+                    // The calculation below finds the index into cpuMatB of the bit expected at this position after that reversal.
+                    const auto row = i / ElementCount.y;
+                    const auto col = i % ElementCount.y;
                     const auto expectedCol = col;
                     const auto expectedRow = ElementCount.z - row - 1;
                     const auto expectedIdx = expectedCol * ElementCount.z + expectedRow;
@@ -808,14 +807,17 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             auto tmpBuf = createExternalBuffer(escaped.get());
             auto staging = createStaging(BufferSize);
         
+
             // Setup synchronization for readback
             ISemaphore::SCreationParams semParams;
             semParams.initialValue = 0;
             const auto semaphore = m_device->createSemaphore(std::move(semParams));
+            static constexpr auto SyncPointCopyDone = 1;
+
             IQueue::SSubmitInfo::SSemaphoreInfo semInfo;
             semInfo.semaphore = semaphore.get();
             semInfo.stageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS;
-            semInfo.value = 1;
+            semInfo.value = SyncPointCopyDone;
         
             // Copy data back from the persistent buffer to staging for verification
             smart_refctd_ptr<IGPUCommandBuffer> cmd;
@@ -832,20 +834,34 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
             auto qre = queue->submit({ &submitInfo, &submitInfo + 1 });
             assert(IQueue::RESULT::SUCCESS == qre);
         
+            ISemaphore::SWaitInfo waitInfo = {
+              .semaphore = semaphore.get(),
+              .value = SyncPointCopyDone
+            };
+            m_device->blockForSemaphores({ &waitInfo, 1 });
             m_device->waitIdle();
         
+            // Verify the data remains intact after CUDA object destruction
             auto& ptr = *(std::array<uint32_t, BufferSize>*)staging->getBoundMemory().memory->getMappedPointer();
+            auto errorCount = 0;
+            static const auto MaxErrorCount = 10;
             for (uint32_t i = 0; i < ElementCount; ++i)
             {
-                if (ptr[i] != i) logFail("Test Destruction: Element %d is incorrect", i);
+                if (ptr[i] != testData[i]) {
+                  logFail("Destruction test error at [%d]: value=%d, expected=%d", i, ptr[i], testData[i]);
+                  errorCount++;
+                  if (errorCount == MaxErrorCount) break;
+                }
             }
-            m_logger->log("Test Destruction complete", ILogger::ELL_INFO);
+            
+            if (errorCount == 0)
+                m_logger->log("Destruction test PASSED!", system::ILogger::ELL_INFO);
+            else
+                m_logger->log("Destruction test FAILED with %d errors!", system::ILogger::ELL_ERROR, errorCount);
         }
     
     }
 
-    }
-
     // Whether to keep invoking the above. In this example because its headless GPU compute, we do all the work in the app initialization.
     bool keepRunning() override { return false; }
 

From 08d594c79bea5917f6380393c0bcde78a5c2a3ec Mon Sep 17 00:00:00 2001
From: kevyuu <kyu.kevinyu@gmail.com>
Date: Mon, 25 May 2026 16:55:45 +0700
Subject: [PATCH 46/47] Fix the acquire barrier

---
 76_CudaInterop/main.cpp | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/76_CudaInterop/main.cpp b/76_CudaInterop/main.cpp
index 6b0a5931d..612b01ba2 100644
--- a/76_CudaInterop/main.cpp
+++ b/76_CudaInterop/main.cpp
@@ -277,10 +277,6 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         {
             const IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> bufBarrier = {
                 .barrier = {
-                    .dep = {
-                        .srcStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS,
-                        .srcAccessMask = ACCESS_FLAGS::MEMORY_WRITE_BITS,
-                    },
                     .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE,
                     .otherQueueFamilyIndex = IQueue::FamilyExternal,
                 },
@@ -635,10 +631,6 @@ class CUDA2VKApp : public virtual MonoDeviceApplication, BuiltinResourcesApplica
         {
             const IGPUCommandBuffer::SBufferMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> bufBarrier = {
                 .barrier = {
-                    .dep = {
-                        .srcStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS,
-                        .srcAccessMask = ACCESS_FLAGS::MEMORY_WRITE_BITS,
-                    },
                     .ownershipOp = IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE,
                     .otherQueueFamilyIndex = IQueue::FamilyExternal,
                 },

From a4b1f5ffcfee386a0b020db1fc3fe4481a365b4e Mon Sep 17 00:00:00 2001
From: kevyuu <kyu.kevinyu@gmail.com>
Date: Mon, 25 May 2026 16:56:30 +0700
Subject: [PATCH 47/47] Remove unnecessary comment

---
 76_CudaInterop/app_resources/wmmaGemm_b1_kernel.cu | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/76_CudaInterop/app_resources/wmmaGemm_b1_kernel.cu b/76_CudaInterop/app_resources/wmmaGemm_b1_kernel.cu
index 56d376fae..ef6ccad12 100644
--- a/76_CudaInterop/app_resources/wmmaGemm_b1_kernel.cu
+++ b/76_CudaInterop/app_resources/wmmaGemm_b1_kernel.cu
@@ -35,12 +35,9 @@ extern "C" __global__ void b1_wmma_gemm_kernel(int* a, int* b, int* c,
         int bRow = i / 32;
         int bCol = warpN * WMMA_N;
     
-        // Load fragments
-        // Note: load_matrix_sync handles the bit-packing layout internally
         wmma::load_matrix_sync(a_frag, a + (aRow * lda / 32 + aCol), lda);
         wmma::load_matrix_sync(b_frag, b + (bCol * ldb / 32 + bRow), ldb);
     
-        // Perform XOR-Popcount MMA
         wmma::bmma_sync(acc_frag, a_frag, b_frag, acc_frag, wmma::experimental::bmmaBitOpAND);
     }