|
| 1 | +#include "pyscheduler/library_export.hpp" |
| 2 | +#include <cuda_runtime.h> |
| 3 | +#include <dlpack.h> |
| 4 | +#include <memory> |
| 5 | + |
// Memory space in which a tensor's element buffer is allocated.
enum class DeviceType {
    CPU  = 0,  // host heap (allocated with new[])
    CUDA = 1,  // CUDA device memory (allocated with cudaMalloc)
};
| 10 | + |
// Maps a C++ element type to its DLPack DLDataType descriptor
// (type code, bits per lane, lanes). The primary template is left
// undefined on purpose: using an unmapped element type fails at
// compile time instead of silently producing a wrong dtype.
template <typename T>
struct DLPackTypeTraits;

// NOTE: If you get a compile-time error here, add a specialization
// for your element type below.

template <>
struct DLPackTypeTraits<float> {
    static constexpr DLDataType dtype = { kDLFloat, 32, 1 };
};

template <>
struct DLPackTypeTraits<double> {
    static constexpr DLDataType dtype = { kDLFloat, 64, 1 };
};

template <>
struct DLPackTypeTraits<int64_t> {
    static constexpr DLDataType dtype = { kDLInt, 64, 1 };
};

template <>
struct DLPackTypeTraits<int32_t> {
    static constexpr DLDataType dtype = { kDLInt, 32, 1 };
};

template <>
struct DLPackTypeTraits<uint8_t> {
    static constexpr DLDataType dtype = { kDLUInt, 8, 1 };
};
| 41 | + |
| 42 | +template <DeviceType Device, typename DataType, size_t... Dims> |
| 43 | +std::unique_ptr<DLManagedTensor> createDlpackTensor() { |
| 44 | + constexpr int ndim = sizeof...(Dims); |
| 45 | + constexpr int64_t num_items = (... * Dims); // C++17 fold expression |
| 46 | + |
| 47 | + // Allocate and set shape |
| 48 | + int64_t* shape = new int64_t[ndim]{ Dims... }; |
| 49 | + |
| 50 | + // Allocate tensor memory |
| 51 | + DataType* data; |
| 52 | + if constexpr(Device == DeviceType::CPU) { |
| 53 | + data = new T[num_items]; |
| 54 | + } else if constexpr(Device == DeviceType::CUDA) { |
| 55 | + cudaMalloc(&data, num_items * sizeof(T)); |
| 56 | + } |
| 57 | + |
| 58 | + // Create DLManagedTensor |
| 59 | + DLManagedTensor* managed_tensor = new DLManagedTensor(); |
| 60 | + managed_tensor->dl_tensor.data = data; |
| 61 | + managed_tensor->dl_tensor.device = { Device == DeviceType::CPU ? kDLCPU : kDLCUDA, 0 }; |
| 62 | + managed_tensor->dl_tensor.ndim = ndim; |
| 63 | + managed_tensor->dl_tensor.dtype = DLPackTypeTraits<DataType>::dtype; |
| 64 | + |
| 65 | + managed_tensor->dl_tensor.shape = shape; |
| 66 | + managed_tensor->dl_tensor.strides = nullptr; |
| 67 | + managed_tensor->dl_tensor.byte_offset = 0; |
| 68 | + managed_tensor->dl_tensor.shape = shape; |
| 69 | + managed_tensor->manager_ctx = nullptr; |
| 70 | + |
| 71 | + tensor->deleter = [](DLManagedTensor* self) { |
| 72 | + if(if constexpr Device == DeviceType::GPU) |
| 73 | + cudaFree(self->dl_tensor.data); |
| 74 | + else |
| 75 | + delete[] static_cast<DataType*>(self->dl_tensor.data); |
| 76 | + delete[] self->dl_tensor.shape; |
| 77 | + delete self; |
| 78 | + }; |
| 79 | + |
| 80 | + return std::unique_ptr<DLManagedTensor>(managed_tensor); |
| 81 | +} |
0 commit comments