keep memory pools alive between runtime calls

theoheimel · theoheimel · commit 75d2ff347bbd · 2026-04-08T14:50:59.000+02:00
diff --git a/madspace/include/madspace/runtime/tensor.h b/madspace/include/madspace/runtime/tensor.h
@@ -578,6 +578,11 @@ class Tensor {
         return contiguous(batch_size, *impl->device);
     }
 
+    bool is_only_reference() const { // true iff the impl's reference counter reads exactly 1 at call time
+        check_impl(); // NOTE(review): presumably rejects a tensor that has no impl before it is dereferenced — confirm
+        return impl->ref_count.load() == 1; // point-in-time snapshot; a later copy of this tensor changes the answer
+    }
+
 private:
     struct TensorImpl {
         DataType dtype;
diff --git a/madspace/src/gpu/device.cu b/madspace/src/gpu/device.cu
@@ -50,26 +50,32 @@ void GpuDevice::tensor_cpu(const Tensor& source, Tensor& target) const {
 
 MemPool::MemPool(
     const GpuDevice& device,
-    const std::vector<std::pair<std::size_t, std::size_t>>& cached_sizes,
+    const std::vector<std::tuple<std::size_t, std::size_t, Tensor>>& cached_sizes_and_tensors, // entries: (pool index, size in bytes, tensor to adopt or empty)
     gpuStream_t stream
 ) :
     _device(device) {
     std::size_t pool_count = 0;
-    for (auto& [pool_index, size] : cached_sizes) {
+    for (auto& [pool_index, size, parent_tensor] : cached_sizes_and_tensors) { // first pass: largest pool index + 1 gives the number of pools
         if (pool_index >= pool_count) {
             pool_count = pool_index + 1;
         }
     }
     _pools.resize(pool_count);
 
     AsyncGpuDevice async_device(device, stream);
-    for (auto& [pool_index, size] : cached_sizes) {
+    for (auto& [pool_index, size, parent_tensor] : cached_sizes_and_tensors) { // second pass: give each listed pool its backing tensor
         auto& pool = _pools.at(pool_index);
-        std::size_t word_count = (size + 7) / 8;
-        pool.parent_tensor = Tensor(DataType::dt_float, {word_count}, async_device);
-        pool.capacity = word_count * 8;
-        pool.needed_size = word_count * 8;
-        //println("create pool {} {}", pool_index, pool.size);
+        if (parent_tensor) { // a tensor was handed in: adopt it instead of allocating a new one
+            pool.parent_tensor = parent_tensor;
+            pool.capacity = parent_tensor.byte_size(); // capacity follows the adopted tensor; `size` is not consulted on this path
+            pool.needed_size = parent_tensor.byte_size();
+        } else { // nothing to adopt: allocate a fresh backing tensor through async_device
+            std::size_t word_count = (size + 7) / 8; // round the byte size up to a whole number of 8-byte words
+            pool.parent_tensor = Tensor(DataType::dt_float, {word_count}, async_device); // NOTE(review): assumes dt_float elements are 8 bytes wide — confirm
+            pool.capacity = word_count * 8;
+            pool.needed_size = word_count * 8;
+            //println("create pool {} {}", pool_index, pool.size);
+        }
     }
 }
 
@@ -86,10 +92,9 @@ MemPool::~MemPool() {
     }
 }
 
-void MemPool::reset(gpuStream_t stream) {
-    AsyncGpuDevice async_device(_device, stream);
-    for (PoolItem& pool : _pools) {
-        pool.parent_tensor.reset(async_device);
+std::vector<std::pair<std::size_t, Tensor>> MemPool::reset(gpuStream_t stream) {
+    std::vector<std::pair<std::size_t, Tensor>> parent_tensors;
+    for (std::size_t pool_index = 0; PoolItem& pool : _pools) {
         for (auto& stream_free_pointers : pool.free_pointers) {
             for (auto& [size, item] : stream_free_pointers) {
                 auto& [ptr, parent] = item;
@@ -98,8 +103,13 @@ void MemPool::reset(gpuStream_t stream) {
                 }
             }
         }
+        if (pool.parent_tensor) {
+            parent_tensors.push_back({pool_index, pool.parent_tensor});
+        }
+        ++pool_index;
     }
     _pools.clear();
+    return parent_tensors;
 }
 
 std::pair<void*, Tensor> MemPool::allocate(std::size_t pool_index, std::size_t size, gpuStream_t stream, std::size_t stream_index) {
diff --git a/madspace/src/gpu/device.h b/madspace/src/gpu/device.h
@@ -79,11 +79,11 @@ class MemPool {
 public:
     MemPool(
         const GpuDevice& device,
-        const std::vector<std::pair<std::size_t, std::size_t>>& cached_sizes,
+        const std::vector<std::tuple<std::size_t, std::size_t, Tensor>>& cached_sizes_and_tensors,
         gpuStream_t stream
     );
     ~MemPool();
-    void reset(gpuStream_t stream);
+    std::vector<std::pair<std::size_t, Tensor>> reset(gpuStream_t stream);
     std::pair<void*, Tensor> allocate(std::size_t pool_index, std::size_t size, gpuStream_t stream, std::size_t stream_index);
     bool free(void* ptr, std::size_t stream_index);
     std::vector<std::pair<std::size_t, std::size_t>> total_sizes() const;
diff --git a/madspace/src/gpu/runtime.cu b/madspace/src/gpu/runtime.cu
@@ -1043,18 +1043,18 @@ GpuRuntime::GpuRuntime(const Function& function_arg, ContextPtr context) :
             return handle;
         },
         [](gpurandGenerator_t handle) { check_error(gpurandDestroyGenerator(handle)); }
-    ) {
+    ),
+    _prev_caches(context->thread_pool(), []() { return TensorVec{}; }) {
     if (context->device()->device_type() != GpuDevice::gpu_device_type) {
         throw std::runtime_error("Context has incompatible device");
     }
     auto& gpu_device = *static_cast<const GpuDevice*>(_context->device());
     gpu_device.activate();
 
-    cudaMemPool_t pool;
-    check_error(cudaDeviceGetMemPool(&pool, 0));
-    uint64_t thresh = UINT64_MAX;
-    check_error(cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &thresh));
-
+    //cudaMemPool_t pool;
+    //check_error(cudaDeviceGetMemPool(&pool, 0));
+    //uint64_t thresh = UINT64_MAX;
+    //check_error(cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &thresh));
 
     Function function = sort_breadth_first(function_arg);
 
@@ -1352,7 +1352,7 @@ TensorVec GpuRuntime::run(const TensorVec& inputs) {
         check_error(gpuStreamWaitEvent(main_stream, events.at(event)));
     }
     update_pool_size_cache(mem_pool.total_sizes());
-    mem_pool.reset(main_stream);
+    update_cached_tensors(mem_pool.reset(main_stream));
     TensorVec outputs;
     for (auto index : _output_indices) {
         outputs.push_back(locals[index]);
@@ -1421,7 +1421,7 @@ std::tuple<TensorVec, TensorVec, std::vector<bool>> GpuRuntime::run_with_grad(
         check_error(gpuStreamWaitEvent(main_stream, events.at(event)));
     }
     update_pool_size_cache(mem_pool.total_sizes());
-    mem_pool.reset(main_stream);
+    update_cached_tensors(mem_pool.reset(main_stream));
     TensorVec outputs;
     for (auto index : _output_indices) {
         outputs.push_back(locals[index]);
@@ -1479,7 +1479,7 @@ GpuRuntime::run_backward(
         check_error(gpuStreamWaitEvent(main_stream, events.at(event)));
     }*/
     update_pool_size_cache(mem_pool.total_sizes());
-    mem_pool.reset(main_stream);
+    update_cached_tensors(mem_pool.reset(main_stream));
     std::vector<std::tuple<std::string, Tensor>> global_grads;
     for (auto& [name, index] : _grad_global_indices) {
         global_grads.push_back({name, local_grads[index]});
@@ -1488,11 +1488,21 @@ GpuRuntime::run_backward(
     return {{local_grads.begin(), local_grads.begin() + _input_count}, global_grads};
 }
 
-std::vector<std::pair<std::size_t, std::size_t>> GpuRuntime::load_pool_size_cache() {
+std::vector<std::tuple<std::size_t, std::size_t, Tensor>> GpuRuntime::load_pool_size_cache() { // returns (pool index, cached byte size, tensor to reuse or empty) per cached pool
     auto cache = _pool_size_cache.load();
-    std::vector<std::pair<std::size_t, std::size_t>> ret;
+    std::vector<std::tuple<std::size_t, std::size_t, Tensor>> ret; // stays empty while no size cache has been stored yet
     if (cache) {
-        ret = {cache->begin(), cache->end()};
+        auto& thread_prev_caches = _prev_caches.get(); // tensors recorded by update_cached_tensors, indexed by pool index
+        for (auto [pool_index, size] : *cache) {
+            Tensor new_cache; // left default-constructed when no earlier tensor can be reused
+            if (pool_index < thread_prev_caches.size()) {
+                Tensor& prev_cache = thread_prev_caches.at(pool_index);
+                if (prev_cache && prev_cache.is_only_reference() && prev_cache.byte_size() >= size) { // reuse only if nothing else holds it AND it still covers the cached size; MemPool takes its capacity from an adopted tensor, so a too-small one would keep the pool from growing
+                    new_cache = prev_cache;
+                }
+            }
+            ret.push_back({pool_index, size, new_cache}); // an empty tensor here makes MemPool allocate a fresh one of `size` bytes
+        }
     }
     return ret;
 }
@@ -1503,11 +1513,25 @@ void GpuRuntime::update_pool_size_cache(const std::vector<std::pair<std::size_t,
         std::make_shared<std::unordered_map<std::size_t, std::size_t>>(*cache) :
         std::make_shared<std::unordered_map<std::size_t, std::size_t>>();
     for (auto [pool_index, size] : total_sizes) {
-        (*new_cache)[pool_index] = std::max((*new_cache)[pool_index], size);
+        auto& cache_size = (*new_cache)[pool_index];
+        if (size > cache_size) {
+            // if the cache needs to be resized, add some padding to prevent frequent resizing
+            cache_size = size * 4 / 3;
+        }
     }
     _pool_size_cache.store(new_cache);
 }
 
+void GpuRuntime::update_cached_tensors(const std::vector<std::pair<std::size_t, Tensor>>& tensors) { // stores each (pool index, tensor) pair in _prev_caches
+    auto& thread_prev_caches = _prev_caches.get(); // NOTE(review): _prev_caches is a ThreadResource<TensorVec>; presumably get() yields the calling thread's vector — confirm
+    for (auto& [pool_index, tensor] : tensors) {
+        if (pool_index >= thread_prev_caches.size()) {
+            thread_prev_caches.resize(pool_index + 1); // grow so that pool_index becomes a valid slot
+        }
+        thread_prev_caches.at(pool_index) = tensor; // overwrites the tensor kept for this pool; slots not listed in `tensors` are left untouched
+    }
+}
+
 extern "C" Runtime*
 build_runtime(const Function& function, ContextPtr context, bool concurrent) {
     return new GpuRuntime(function, context);
diff --git a/madspace/src/gpu/runtime.h b/madspace/src/gpu/runtime.h
@@ -45,8 +45,9 @@ class GpuRuntime : public Runtime {
     gpurandGenerator_t gpurand_generator() { return _gpurand_generator.get(); }
 
 private:
-    std::vector<std::pair<std::size_t, std::size_t>> load_pool_size_cache();
+    std::vector<std::tuple<std::size_t, std::size_t, Tensor>> load_pool_size_cache();
     void update_pool_size_cache(const std::vector<std::pair<std::size_t, std::size_t>>& pool);
+    void update_cached_tensors(const std::vector<std::pair<std::size_t, Tensor>>& tensors);
     std::vector<Instruction> _instructions;
     SizeVec _output_indices;
     std::size_t _input_count;
@@ -61,6 +62,7 @@ class GpuRuntime : public Runtime {
     ThreadResource<gpublasHandle_t> _gpublas_handle;
     ThreadResource<gpurandGenerator_t> _gpurand_generator;
     std::atomic<std::shared_ptr<std::unordered_map<std::size_t, std::size_t>>> _pool_size_cache;
+    ThreadResource<TensorVec> _prev_caches;
 };
 
 extern "C" Runtime*