Skip to content

Commit f19e8e5

Browse files
committed
memory pool for gpu allocations
1 parent ea35e76 commit f19e8e5

9 files changed

Lines changed: 152 additions & 17 deletions

File tree

madspace/include/madspace/madcode/type.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ class BatchSize {
3434
};
3535
using Unnamed = std::shared_ptr<UnnamedBody>;
3636
using One = std::monostate;
37-
using Compound = std::unordered_map<std::variant<Named, Unnamed, One>, int>;
37+
using Compound = std::map<std::variant<Named, Unnamed, One>, int>;
3838

3939
static const BatchSize zero;
4040
static const BatchSize one;
@@ -46,6 +46,7 @@ class BatchSize {
4646
BatchSize operator-(const BatchSize& other) const { return add(other, -1); }
4747
bool operator==(const BatchSize& other) const { return value == other.value; }
4848
bool operator!=(const BatchSize& other) const { return value != other.value; }
49+
std::string to_string() const;
4950

5051
friend std::ostream& operator<<(std::ostream& out, const BatchSize& batch_size);
5152
friend void to_json(nlohmann::json& j, const BatchSize& batch_size);

madspace/include/madspace/runtime/tensor.h

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ enum class DeviceType { cpu, cuda, hip };
168168
class Device {
169169
public:
170170
virtual ~Device() = default;
171-
virtual void* allocate(std::size_t size) const = 0;
171+
virtual std::pair<void*, Tensor> allocate(std::size_t size) const = 0;
172172
virtual void free(void* ptr) const = 0;
173173
virtual void memcpy(void* to, void* from, std::size_t size) const = 0;
174174
virtual void tensor_copy(const Tensor& source, Tensor& target) const = 0;
@@ -204,14 +204,14 @@ class Tensor {
204204
// Construct a tensor of the given dtype/shape on `device` and allocate its
// storage through Tensor::allocate, so pool-backed devices can return a
// parent tensor that owns the underlying buffer.
Tensor(DataType dtype, const Sizes& shape, DevicePtr device) :
    impl(new TensorImpl{dtype, shape, device}) {
    // init_stride() fills the stride info and returns the required byte
    // count — presumably the total allocation size; confirm in init_stride().
    auto size = init_stride();
    allocate(size, *device);
}
209209

210210
// Same as the DevicePtr overload, but statically dispatched on the concrete
// device type D (e.g. AsyncGpuDevice), avoiding a virtual allocate() call.
template <typename D>
Tensor(DataType dtype, const Sizes& shape, const D& device) :
    impl(new TensorImpl{dtype, shape, device.device_ptr()}) {
    auto size = init_stride();
    allocate(size, device);
}
216216

217217
Tensor(
@@ -283,7 +283,7 @@ class Tensor {
283283
device
284284
}) {
285285
auto size = init_stride();
286-
impl->data = device->allocate(size);
286+
allocate(size, *device);
287287
device->memcpy(impl->data, &value, sizeof(value));
288288
if (std::is_same_v<T, me_int_t> && value >= 0) {
289289
impl->batch_sizes.push_back(value);
@@ -309,7 +309,7 @@ class Tensor {
309309
device
310310
}) {
311311
auto size = init_stride();
312-
impl->data = device->allocate(size);
312+
allocate(size, *device);
313313
std::visit(
314314
[&](auto& vec) { device->memcpy(impl->data, vec.data(), size); },
315315
std::get<1>(value)
@@ -596,6 +596,16 @@ class Tensor {
596596
}
597597
}
598598

599+
// Request `size` bytes from `device` and wire up ownership. Pool-backed
// devices return a non-empty parent tensor that owns the buffer; in that
// case this tensor becomes a non-owning view and keeps the parent alive
// through data_owner instead of freeing the pointer itself.
template <typename D>
void allocate(std::size_t size, const D& device) {
    auto [data, parent] = device.allocate(size);
    impl->data = data;
    if (parent) {
        // The buffer belongs to the pool's parent allocation: do not free it
        // on destruction, just hold a reference to the owner.
        impl->owns_data = false;
        impl->data_owner = parent.impl;
    }
}
608+
599609
TensorImpl* impl;
600610
};
601611

madspace/src/cpu/device.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@ class CpuDevice : public Device {
1111
public:
1212
static constexpr bool is_concurrent = false;
1313

14-
void* allocate(std::size_t size) const override { return new std::byte[size]; }
14+
std::pair<void*, Tensor> allocate(std::size_t size) const override {
15+
return {new std::byte[size], Tensor()};
16+
}
1517

1618
void free(void* ptr) const override { delete[] static_cast<std::byte*>(ptr); }
1719

madspace/src/gpu/device.cu

Lines changed: 47 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,11 @@ using namespace madspace;
66
using namespace madspace::gpu;
77
using namespace madspace::kernels;
88

9-
void* GpuDevice::allocate(std::size_t size) const {
9+
std::pair<void*, Tensor> GpuDevice::allocate(std::size_t size) const {
1010
activate();
1111
void* ptr;
1212
check_error(gpuMalloc(&ptr, size));
13-
return ptr;
13+
return {ptr, Tensor()};
1414
}
1515

1616
void GpuDevice::free(void* ptr) const {
@@ -48,10 +48,51 @@ void GpuDevice::tensor_cpu(const Tensor& source, Tensor& target) const {
4848
);
4949
}
5050

51-
void* AsyncGpuDevice::allocate(std::size_t size) const {
52-
void* ptr;
53-
check_error(gpuMallocAsync(&ptr, size, _stream));
54-
return ptr;
51+
// Build one pool slot per size factor. The backing parent tensors are not
// allocated here; they are created lazily on the first allocate() that
// targets the pool (once the batch size is known).
MemPool::MemPool(const SizeVec pool_factors, const std::vector<AllocItem>& allocs) :
    _allocs(allocs) {
    _pools.reserve(pool_factors.size());
    for (std::size_t factor : pool_factors) {
        PoolItem item{};
        item.size_factor = factor;
        item.batch_size = 0;
        item.parent_tensor = Tensor();
        _pools.push_back(std::move(item));
    }
}
62+
63+
std::pair<void*, Tensor>
64+
MemPool::allocate(std::size_t size, const GpuDevice& device, gpuStream_t stream) {
65+
AllocItem& alloc = _allocs.at(_alloc_index);
66+
++_alloc_index;
67+
PoolItem& pool = _pools.at(alloc.pool_index);
68+
if (size % alloc.size_factor != 0) {
69+
throw std::runtime_error("inconsistent pool allocation");
70+
}
71+
std::size_t batch_size = size / alloc.size_factor;
72+
if (!pool.parent_tensor) {
73+
pool.batch_size = batch_size;
74+
AsyncGpuDevice async_device(device, stream);
75+
pool.parent_tensor = Tensor(
76+
DataType::dt_float, {(batch_size * pool.size_factor + 7) / 8}, async_device
77+
);
78+
} else if (batch_size != pool.batch_size) {
79+
throw std::runtime_error("inconsistent pool allocation");
80+
}
81+
return {
82+
static_cast<uint8_t*>(pool.parent_tensor.data()) +
83+
pool.batch_size * alloc.offset,
84+
pool.parent_tensor
85+
};
86+
}
87+
88+
std::pair<void*, Tensor> AsyncGpuDevice::allocate(std::size_t size) const {
89+
if (_mem_pool != nullptr) {
90+
return _mem_pool->allocate(size, _device, _stream);
91+
} else {
92+
void* ptr;
93+
check_error(gpuMallocAsync(&ptr, size, _stream));
94+
return {ptr, Tensor()};
95+
}
5596
}
5697

5798
void AsyncGpuDevice::free(void* ptr) const { check_error(gpuFreeAsync(ptr, _stream)); }

madspace/src/gpu/device.h

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -74,12 +74,38 @@ class GpuDevice : public Device {
7474
int _index;
7575
};
7676

77+
// Arena-style allocator that packs a pre-planned sequence of GPU tensor
// allocations into a few large parent tensors ("pools"), replayed in the
// order the plan was recorded.
class MemPool {
public:
    // One planned allocation: which pool it lives in, its per-batch-element
    // byte size, and its per-batch-element byte offset within the pool.
    struct AllocItem {
        std::size_t pool_index;
        std::size_t size_factor;
        std::size_t offset;
    };

    MemPool(const SizeVec pool_factors, const std::vector<AllocItem>& allocs);
    // Returns the carved-out pointer plus the parent tensor that owns it.
    std::pair<void*, Tensor>
    allocate(std::size_t size, const GpuDevice& device, gpuStream_t stream);

private:
    // One backing buffer; parent_tensor is allocated lazily on first use,
    // at which point batch_size is fixed for the pool's lifetime.
    struct PoolItem {
        std::size_t size_factor;
        std::size_t batch_size;
        Tensor parent_tensor;
    };

    std::vector<AllocItem> _allocs;
    std::vector<PoolItem> _pools;
    std::size_t _alloc_index = 0;  // next entry of _allocs to hand out
};
100+
77101
class AsyncGpuDevice {
78102
public:
79-
AsyncGpuDevice(const GpuDevice& device, gpuStream_t stream) :
80-
_device(device), _stream(stream) {}
103+
AsyncGpuDevice(
104+
const GpuDevice& device, gpuStream_t stream, MemPool* mem_pool = nullptr
105+
) :
106+
_device(device), _stream(stream), _mem_pool(mem_pool) {}
81107

82-
void* allocate(std::size_t size) const;
108+
std::pair<void*, Tensor> allocate(std::size_t size) const;
83109
void free(void* ptr) const;
84110
void memcpy(void* to, void* from, std::size_t size) const;
85111

@@ -94,6 +120,7 @@ class AsyncGpuDevice {
94120
private:
95121
const GpuDevice& _device;
96122
gpuStream_t _stream;
123+
MemPool* _mem_pool;
97124
};
98125

99126
extern "C" int device_count();

madspace/src/gpu/runtime.cu

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -933,6 +933,42 @@ private:
933933
std::vector<bool> _sync_matrix;
934934
}
935935

936+
struct MemPoolTracker {
937+
void allocate(Type type, std::size_t group_index) {
938+
auto& pool = pools[{type.batch_size, group_index}];
939+
if (!pool.initialized) {
940+
pool.index = pools.size() - 1;
941+
}
942+
std::size_t size_factor;
943+
switch (type.dtype) {
944+
case DataType::dt_int:
945+
size_factor = sizeof(me_int_t);
946+
case DataType::dt_float:
947+
size_factor = sizeof(double);
948+
default:
949+
throw std::logic_error("invalid data type");
950+
}
951+
for (std::size_t size : type.shape) {
952+
size_factor *= size;
953+
}
954+
allocs.push_back({
955+
.pool_index = pool.index,
956+
.size_factor = size_factor,
957+
.offset = pool.total_size,
958+
});
959+
pool.total_size += size_factor;
960+
}
961+
962+
using PoolKey = std::pair<BatchSize, std::size_t>;
963+
struct PoolData {
964+
std::size_t index;
965+
std::size_t total_size = 0;
966+
bool initialized = false;
967+
};
968+
std::map<PoolKey, PoolData> pools;
969+
std::vector<MemPool::AllocItem> allocs;
970+
};
971+
936972
} // namespace
937973

938974
GpuRuntime::GpuRuntime(const Function& function, ContextPtr context) :

madspace/src/gpu/runtime.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,10 @@ class GpuRuntime : public Runtime {
4343
gpurandGenerator_t gpurand_generator() { return _gpurand_generator.get(); }
4444

4545
private:
46-
std::vector<Instruction> _instructions;
46+
void
47+
48+
std::vector<Instruction>
49+
_instructions;
4750
SizeVec _output_indices;
4851
std::size_t _input_count;
4952
TensorVec _locals_init;

madspace/src/kernels/operations.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#pragma once
22

3+
#include "madspace/madcode/function.h"
34
#include "madspace/runtime/tensor.h"
45
#include "madspace/util.h"
56

@@ -28,6 +29,12 @@ void batch_foreach(const I& instruction, TensorVec& locals, D& device) {
2829
foreach_func(inputs, outputs, batch_size, device);
2930
}
3031

32+
// Record the output allocations of one instruction call in the pool tracker.
// BUG FIX: MemPoolTracker::allocate takes (type, group_index); the original
// call passed only the type and could not compile. All outputs of a single
// call are placed in group 0 here — assumed correct for ungrouped
// instructions; confirm the intended grouping against the runtime planner.
void memory_batch_foreach(MemPoolTracker& mpt, const InstructionCall& instruction) {
    for (auto& output : instruction.outputs) {
        mpt.allocate(output.type, 0);
    }
}
37+
3138
template <
3239
auto foreach_func,
3340
int n_in,

madspace/src/madcode/type.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
#include "madspace/madcode/type.h"
22
#include "madspace/util.h"
33

4+
#include <sstream>
5+
46
using namespace madspace;
57
using json = nlohmann::json;
68

@@ -97,6 +99,12 @@ std::ostream& madspace::operator<<(std::ostream& out, const DataType& dtype) {
9799
return out;
98100
}
99101

102+
// Render this batch size as a string by reusing operator<<, so the string
// form and the stream form can never diverge.
std::string BatchSize::to_string() const {
    std::ostringstream stream;
    stream << *this;
    return stream.str();
}
107+
100108
std::ostream& madspace::operator<<(std::ostream& out, const BatchSize& batch_size) {
101109
std::visit(
102110
Overloaded{

0 commit comments

Comments
 (0)