Skip to content

Commit e3f59d9

Browse files
committed
more work on memory pool
1 parent f19e8e5 commit e3f59d9

9 files changed

Lines changed: 415 additions & 202 deletions

File tree

madspace/include/madspace/runtime/tensor.h

Lines changed: 49 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -165,10 +165,21 @@ class Tensor;
165165

166166
enum class DeviceType { cpu, cuda, hip };
167167

168+
// Hint passed to Device::allocate describing the role of an allocation so
// that pooling allocators can route it to a dedicated pool.
// AllocHint::normal must remain the first enumerator (value 0): pooled
// allocators treat `normal` as "no pooling" and map the remaining hints to
// pool indices via static_cast<std::size_t>(hint) - 1.
enum class AllocHint {
    normal,
    output,
    local,
    temporary,
    input_grad,
    local_grad,
    global_grad,
};
177+
168178
class Device {
169179
public:
170180
virtual ~Device() = default;
171-
virtual std::pair<void*, Tensor> allocate(std::size_t size) const = 0;
181+
virtual std::pair<void*, Tensor>
182+
allocate(std::size_t size, AllocHint hint) const = 0;
172183
virtual void free(void* ptr) const = 0;
173184
virtual void memcpy(void* to, void* from, std::size_t size) const = 0;
174185
virtual void tensor_copy(const Tensor& source, Tensor& target) const = 0;
@@ -199,19 +210,30 @@ class Tensor {
199210

200211
Tensor(Tensor&& other) noexcept : impl(other.impl) { other.impl = nullptr; }
201212

202-
    /// Allocate an uninitialized tensor with the given dtype/shape on the
    /// default CPU device. `hint` is forwarded to the allocator (see AllocHint).
    Tensor(DataType dtype, const Sizes& shape, AllocHint hint = AllocHint::normal) :
        Tensor(dtype, shape, cpu_device(), hint) {}

    /// Allocate an uninitialized tensor on `device`; `hint` is forwarded to
    /// the device's allocator.
    Tensor(
        DataType dtype,
        const Sizes& shape,
        DevicePtr device,
        AllocHint hint = AllocHint::normal
    ) :
        impl(new TensorImpl{dtype, shape, device}) {
        auto size = init_stride();
        allocate(size, *device, hint);
    }

    /// Same as above for device wrapper types exposing device_ptr()
    /// (e.g. an async/stream-bound device); the wrapper object itself
    /// performs the allocation, while impl records the underlying device.
    template <typename D>
    Tensor(
        DataType dtype,
        const Sizes& shape,
        const D& device,
        AllocHint hint = AllocHint::normal
    ) :
        impl(new TensorImpl{dtype, shape, device.device_ptr()}) {
        auto size = init_stride();
        allocate(size, device, hint);
    }
216238

217239
Tensor(
@@ -276,21 +298,21 @@ class Tensor {
276298
}) {}
277299

278300
    /// Create a shape-{1} tensor holding the single scalar `value`, copied
    /// onto `device`. Integer types map to dt_int, everything else to
    /// dt_float. `hint` is forwarded to the allocator.
    template <ScalarType T>
    Tensor(T value, DevicePtr device, AllocHint hint = AllocHint::normal) :
        impl(new TensorImpl{
            std::is_same_v<T, me_int_t> ? DataType::dt_int : DataType::dt_float,
            {1},
            device
        }) {
        auto size = init_stride();
        allocate(size, *device, hint);
        // Copy the scalar into the freshly allocated device buffer.
        device->memcpy(impl->data, &value, sizeof(value));
        // Non-negative integer scalars additionally double as batch-size
        // metadata for downstream shape bookkeeping.
        if (std::is_same_v<T, me_int_t> && value >= 0) {
            impl->batch_sizes.push_back(value);
        }
    }
292314

293-
Tensor(TensorValue value, DevicePtr device) :
315+
Tensor(TensorValue value, DevicePtr device, AllocHint hint = AllocHint::normal) :
294316
impl(new TensorImpl{
295317
std::visit(
296318
Overloaded{
@@ -309,7 +331,7 @@ class Tensor {
309331
device
310332
}) {
311333
auto size = init_stride();
312-
allocate(size, *device);
334+
allocate(size, *device, hint);
313335
std::visit(
314336
[&](auto& vec) { device->memcpy(impl->data, vec.data(), size); },
315337
std::get<1>(value)
@@ -510,35 +532,41 @@ class Tensor {
510532
void add(const Tensor& source) { add(source, *impl->device); }
511533

512534
    /// Return a freshly allocated copy of this tensor; the copy operation is
    /// executed by `device`, and `hint` is forwarded to the allocator.
    template <typename D>
    Tensor copy(const D& device, AllocHint hint = AllocHint::normal) const {
        check_impl();
        // NOTE(review): the new tensor is allocated on impl->device while the
        // transfer is executed by the `device` argument — confirm callers
        // always pass a device compatible with impl->device.
        Tensor tensor(impl->dtype, impl->shape, impl->device, hint);
        device.tensor_copy(*this, tensor);
        return tensor;
    }
    /// Copy using this tensor's own device for the transfer.
    Tensor copy(AllocHint hint = AllocHint::normal) const {
        return copy(*impl->device, hint);
    }
520544

521545
    /// True when every dimension of the tensor is stored contiguously.
    bool is_contiguous() const { return impl->contiguous_dims == impl->shape.size(); }

    /// Number of dimensions currently stored contiguously.
    std::size_t contiguous_dims() const { return impl->contiguous_dims; }

    /// Return a fully contiguous version of this tensor: the tensor itself if
    /// already contiguous, otherwise a copy made via `device` (with `hint`
    /// forwarded to the allocator).
    template <typename D>
    Tensor contiguous(const D& device, AllocHint hint = AllocHint::normal) const {
        check_impl();
        return is_contiguous() ? *this : copy(device, hint);
    }

    /// Contiguous version using this tensor's own device for the copy.
    Tensor contiguous(AllocHint hint = AllocHint::normal) const {
        return contiguous(*impl->device, hint);
    }
532558

533559
template <typename D>
534-
Tensor contiguous(std::size_t batch_size, const D& device) const {
560+
Tensor contiguous(
561+
std::size_t batch_size, const D& device, AllocHint hint = AllocHint::normal
562+
) const {
535563
check_impl();
536564
if (size(0) == batch_size) {
537-
return contiguous(device);
565+
return contiguous(device, hint);
538566
} else if (size(0) == 1) {
539567
auto shape = impl->shape;
540568
shape[0] = batch_size;
541-
Tensor tensor(impl->dtype, shape, impl->device);
569+
Tensor tensor(impl->dtype, shape, impl->device, hint);
542570
device.tensor_copy(*this, tensor);
543571
return tensor;
544572
} else {
@@ -597,8 +625,8 @@ class Tensor {
597625
}
598626

599627
template <typename D>
600-
void allocate(std::size_t size, const D& device) {
601-
auto [data, parent] = device.allocate(size);
628+
void allocate(std::size_t size, const D& device, AllocHint hint) {
629+
auto [data, parent] = device.allocate(size, hint);
602630
impl->data = data;
603631
if (parent) {
604632
impl->owns_data = false;

madspace/src/cpu/device.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ class CpuDevice : public Device {
1111
public:
1212
static constexpr bool is_concurrent = false;
1313

14-
    // Allocate `size` bytes of host memory. The AllocHint is ignored on the
    // CPU: there is no pooling here, every request is a plain new[].
    // The empty Tensor in the returned pair signals "no parent tensor", i.e.
    // the resulting allocation is owned by the caller.
    std::pair<void*, Tensor> allocate(std::size_t size, AllocHint hint) const override {
        return {new std::byte[size], Tensor()};
    }
1717

madspace/src/cpu/runtime.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@ namespace cpu {
1212

1313
class CpuRuntime : public Runtime {
1414
public:
15+
    // Placeholder hint table for the CPU runtime: indexing it always yields
    // AllocHint::normal, since the CPU device performs no pooled allocation.
    // Lets instruction code index per-output hints uniformly across runtimes.
    struct DummyAllocHints {
        AllocHint operator[](std::size_t index) const { return AllocHint::normal; }
    };
1518
struct Instruction {
1619
int opcode;
1720
SizeVec input_indices;
@@ -25,6 +28,8 @@ class CpuRuntime : public Runtime {
2528
std::size_t dependency_count;
2629
SizeVec dependent_instructions_backward;
2730
std::size_t dependency_count_backward;
31+
DummyAllocHints output_alloc_hints;
32+
DummyAllocHints input_grad_alloc_hints;
2833
};
2934

3035
CpuRuntime(const Function& function, ContextPtr context, bool concurrent);

madspace/src/gpu/device.cu

Lines changed: 87 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -48,53 +48,105 @@ void GpuDevice::tensor_cpu(const Tensor& source, Tensor& target) const {
4848
);
4949
}
5050

51-
/// Build a memory pool with pre-allocated backing buffers.
///
/// `cached_sizes` holds (pool_index, byte_size) pairs, typically obtained
/// from total_sizes() of a previous run; each entry pre-allocates one parent
/// tensor of that size on `device`.
MemPool::MemPool(
    const GpuDevice& device,
    const std::vector<std::pair<std::size_t, std::size_t>>& cached_sizes
) :
    _device(device) {
    // Determine how many pools are addressed by the cached sizes.
    std::size_t pool_count = 0;
    for (auto& [pool_index, size] : cached_sizes) {
        if (pool_index >= pool_count) {
            pool_count = pool_index + 1;
        }
    }
    _pools.resize(pool_count);

    // Pre-allocate one parent tensor per pool, rounded up to 8-byte words so
    // suballocations stay aligned.
    for (auto& [pool_index, size] : cached_sizes) {
        auto& pool = _pools.at(pool_index);
        std::size_t word_count = (size + 7) / 8;
        pool.parent_tensor = Tensor(DataType::dt_float, {word_count}, device);
        // BUG FIX: the pre-allocated byte count is the pool's *capacity*;
        // `size` is the bump-allocation cursor and must start at 0.
        // Previously `size` was set to the buffer size and `capacity` was
        // left 0, so allocate()'s `capacity - size` check underflowed and
        // bump-allocated past the end of the buffer.
        pool.capacity = word_count * 8;
        pool.size = 0;
        pool.needed_size = word_count * 8;
    }
}
6272

63-
std::pair<void*, Tensor>
64-
MemPool::allocate(std::size_t size, const GpuDevice& device, gpuStream_t stream) {
65-
AllocItem& alloc = _allocs.at(_alloc_index);
66-
++_alloc_index;
67-
PoolItem& pool = _pools.at(alloc.pool_index);
68-
if (size % alloc.size_factor != 0) {
69-
throw std::runtime_error("inconsistent pool allocation");
70-
}
71-
std::size_t batch_size = size / alloc.size_factor;
72-
if (!pool.parent_tensor) {
73-
pool.batch_size = batch_size;
74-
AsyncGpuDevice async_device(device, stream);
75-
pool.parent_tensor = Tensor(
76-
DataType::dt_float, {(batch_size * pool.size_factor + 7) / 8}, async_device
77-
);
78-
} else if (batch_size != pool.batch_size) {
79-
throw std::runtime_error("inconsistent pool allocation");
73+
MemPool::~MemPool() {
    // Release only the cached free blocks that were obtained directly from
    // the device (no parent tensor). Blocks carved out of a pool's parent
    // tensor are released when that Tensor is destroyed with the pool.
    // NOTE(review): pointers still registered in _allocs (allocated but never
    // free()d) are not released here — confirm callers always return all
    // allocations before the pool is destroyed, otherwise this leaks.
    for (PoolItem& pool : _pools) {
        for (auto& [size, item] : pool.free_pointers) {
            auto& [ptr, parent] = item;
            if (!parent) {
                check_error(gpuFree(ptr));
            }
        }
    }
}
8783

88-
std::pair<void*, Tensor> AsyncGpuDevice::allocate(std::size_t size) const {
89-
if (_mem_pool != nullptr) {
90-
return _mem_pool->allocate(size, _device, _stream);
84+
std::pair<void*, Tensor> MemPool::allocate(std::size_t pool_index, std::size_t size) {
85+
if (pool_index >= _pools.size()) {
86+
_pools.resize(pool_index);
87+
}
88+
PoolItem& pool = _pools.at(pool_index);
89+
if (auto search = pool.free_pointers.find(size);
90+
search != pool.free_pointers.end()) {
91+
std::pair<void*, Tensor> ret = *search->second;
92+
pool.free_pointers.erase(search);
93+
return ret;
94+
} else if (pool.capacity - pool.size >= size) {
95+
void* ptr = &static_cast<uint8_t*>(pool.parent_tensor.data())[pool.size];
96+
pool.size = (pool.size + size + 7) / 8 * 8;
97+
_allocs[ptr] = {
98+
.pool_index = pool_index,
99+
.size = size,
100+
.parent_tensor = pool.parent_tensor,
101+
};
102+
return {ptr, pool.parent_tensor};
91103
} else {
92104
void* ptr;
93-
check_error(gpuMallocAsync(&ptr, size, _stream));
105+
check_error(gpuMalloc(&ptr, size));
106+
_allocs[ptr] = {
107+
.pool_index = pool_index,
108+
.size = size,
109+
.parent_tensor = Tensor(),
110+
};
111+
pool.needed_size += (size + 7) / 8 * 8;
94112
return {ptr, Tensor()};
95113
}
96114
}
97115

116+
/// Return `ptr` to its pool. The block is cached in free_pointers for reuse
/// by same-size allocations; actual device memory is only released in the
/// destructor (or together with the pool's parent tensor).
///
/// Throws std::runtime_error if `ptr` was not handed out by this pool.
void MemPool::free(void* ptr) {
    // BUG FIX: the lookup and the check were fused into one malformed
    // statement (missing ';' before 'if'), which did not compile.
    auto search = _allocs.find(ptr);
    if (search == _allocs.end()) {
        throw std::runtime_error("address was not allocated using this pool");
    }
    auto& alloc = search->second;
    // BUG FIX: emplace cannot deduce a braced-init-list argument; build the
    // (ptr, parent) pair explicitly.
    _pools.at(alloc.pool_index)
        .free_pointers.emplace(alloc.size, std::make_pair(ptr, alloc.parent_tensor));
    _allocs.erase(search);
}
125+
126+
std::vector<std::pair<std::size_t, std::size_t>> MemPool::total_sizes() const {
127+
std::vector<std::pair<std::size_t, std::size_t>> ret;
128+
ret.reserve(_pools.size());
129+
for (std::size_t index = 0; PoolItem& pool : _pools) {
130+
if (pool.needed_size > 0) {
131+
ret.push_back({index, pool.needed_size});
132+
}
133+
++index;
134+
}
135+
return ret;
136+
}
137+
138+
std::pair<void*, Tensor>
139+
AsyncGpuDevice::allocate(std::size_t size, AllocHint hint) const {
140+
if (_mem_pool != nullptr && hint != AllocHint::normal) {
141+
return _mem_pool->allocate(static_cast<std::size_t>(hint) - 1, size);
142+
} else {
143+
_device.allocate(size, hint);
144+
// void* ptr;
145+
// check_error(gpuMallocAsync(&ptr, size, _stream));
146+
// return {ptr, Tensor()};
147+
}
148+
}
149+
98150
void AsyncGpuDevice::free(void* ptr) const { check_error(gpuFreeAsync(ptr, _stream)); }
99151

100152
void AsyncGpuDevice::memcpy(void* to, void* from, std::size_t size) const {

madspace/src/gpu/device.h

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -76,26 +76,28 @@ class GpuDevice : public Device {
7676

7777
class MemPool {
7878
public:
79-
struct AllocItem {
80-
std::size_t pool_index;
81-
std::size_t size_factor;
82-
std::size_t offset;
83-
};
84-
85-
MemPool(const SizeVec pool_factors, const std::vector<AllocItem>& allocs);
86-
std::pair<void*, Tensor>
87-
allocate(std::size_t size, const GpuDevice& device, gpuStream_t stream);
79+
MemPool(const std::vector<std::pair<std::size_t, std::size_t>>& cached_sizes);
80+
~MemPool();
81+
std::pair<void*, Tensor> allocate(std::size_t pool_index, std::size_t size);
82+
void free(void* ptr);
83+
std::vector<std::pair<std::size_t, std::size_t>> total_sizes() const;
8884

8985
private:
9086
struct PoolItem {
91-
std::size_t size_factor;
92-
std::size_t batch_size;
87+
Tensor parent_tensor;
88+
std::size_t size = 0;
89+
std::size_t capacity = 0;
90+
std::size_t needed_size = 0;
91+
std::unordered_multimap<std::size_t, std::pair<void*, Tensor>> free_pointers;
92+
};
93+
struct AllocItem {
94+
std::size_t pool_index;
95+
std::size_t size;
9396
Tensor parent_tensor;
9497
};
95-
96-
std::vector<AllocItem> _allocs;
9798
std::vector<PoolItem> _pools;
98-
std::size_t _alloc_index = 0;
99+
std::unordered_map<void*, AllocItem> _allocs;
100+
const GpuDevice& _device;
99101
};
100102

101103
class AsyncGpuDevice {
@@ -105,7 +107,7 @@ class AsyncGpuDevice {
105107
) :
106108
_device(device), _stream(stream), _mem_pool(mem_pool) {}
107109

108-
std::pair<void*, Tensor> allocate(std::size_t size) const;
110+
std::pair<void*, Tensor> allocate(std::size_t size, AllocHint hint) const;
109111
void free(void* ptr) const;
110112
void memcpy(void* to, void* from, std::size_t size) const;
111113

0 commit comments

Comments
 (0)