Skip to content

Commit 75d2ff3

Browse files
committed
keep memory pools alive between runtime calls
1 parent c57d050 commit 75d2ff3

5 files changed

Lines changed: 69 additions & 28 deletions

File tree

madspace/include/madspace/runtime/tensor.h

Lines changed: 5 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -578,6 +578,11 @@ class Tensor {
578578
return contiguous(batch_size, *impl->device);
579579
}
580580

581+
bool is_only_reference() const {
582+
check_impl();
583+
return impl->ref_count.load() == 1;
584+
}
585+
581586
private:
582587
struct TensorImpl {
583588
DataType dtype;

madspace/src/gpu/device.cu

Lines changed: 22 additions & 12 deletions
Original file line number · Diff line number · Diff line change
@@ -50,26 +50,32 @@ void GpuDevice::tensor_cpu(const Tensor& source, Tensor& target) const {
5050

5151
MemPool::MemPool(
5252
const GpuDevice& device,
53-
const std::vector<std::pair<std::size_t, std::size_t>>& cached_sizes,
53+
const std::vector<std::tuple<std::size_t, std::size_t, Tensor>>& cached_sizes_and_tensors,
5454
gpuStream_t stream
5555
) :
5656
_device(device) {
5757
std::size_t pool_count = 0;
58-
for (auto& [pool_index, size] : cached_sizes) {
58+
for (auto& [pool_index, size, parent_tensor] : cached_sizes_and_tensors) {
5959
if (pool_index >= pool_count) {
6060
pool_count = pool_index + 1;
6161
}
6262
}
6363
_pools.resize(pool_count);
6464

6565
AsyncGpuDevice async_device(device, stream);
66-
for (auto& [pool_index, size] : cached_sizes) {
66+
for (auto& [pool_index, size, parent_tensor] : cached_sizes_and_tensors) {
6767
auto& pool = _pools.at(pool_index);
68-
std::size_t word_count = (size + 7) / 8;
69-
pool.parent_tensor = Tensor(DataType::dt_float, {word_count}, async_device);
70-
pool.capacity = word_count * 8;
71-
pool.needed_size = word_count * 8;
72-
//println("create pool {} {}", pool_index, pool.size);
68+
if (parent_tensor) {
69+
pool.parent_tensor = parent_tensor;
70+
pool.capacity = parent_tensor.byte_size();
71+
pool.needed_size = parent_tensor.byte_size();
72+
} else {
73+
std::size_t word_count = (size + 7) / 8;
74+
pool.parent_tensor = Tensor(DataType::dt_float, {word_count}, async_device);
75+
pool.capacity = word_count * 8;
76+
pool.needed_size = word_count * 8;
77+
//println("create pool {} {}", pool_index, pool.size);
78+
}
7379
}
7480
}
7581

@@ -86,10 +92,9 @@ MemPool::~MemPool() {
8692
}
8793
}
8894

89-
void MemPool::reset(gpuStream_t stream) {
90-
AsyncGpuDevice async_device(_device, stream);
91-
for (PoolItem& pool : _pools) {
92-
pool.parent_tensor.reset(async_device);
95+
std::vector<std::pair<std::size_t, Tensor>> MemPool::reset(gpuStream_t stream) {
96+
std::vector<std::pair<std::size_t, Tensor>> parent_tensors;
97+
for (std::size_t pool_index = 0; PoolItem& pool : _pools) {
9398
for (auto& stream_free_pointers : pool.free_pointers) {
9499
for (auto& [size, item] : stream_free_pointers) {
95100
auto& [ptr, parent] = item;
@@ -98,8 +103,13 @@ void MemPool::reset(gpuStream_t stream) {
98103
}
99104
}
100105
}
106+
if (pool.parent_tensor) {
107+
parent_tensors.push_back({pool_index, pool.parent_tensor});
108+
}
109+
++pool_index;
101110
}
102111
_pools.clear();
112+
return parent_tensors;
103113
}
104114

105115
std::pair<void*, Tensor> MemPool::allocate(std::size_t pool_index, std::size_t size, gpuStream_t stream, std::size_t stream_index) {

madspace/src/gpu/device.h

Lines changed: 2 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -79,11 +79,11 @@ class MemPool {
7979
public:
8080
MemPool(
8181
const GpuDevice& device,
82-
const std::vector<std::pair<std::size_t, std::size_t>>& cached_sizes,
82+
const std::vector<std::tuple<std::size_t, std::size_t, Tensor>>& cached_sizes_and_tensors,
8383
gpuStream_t stream
8484
);
8585
~MemPool();
86-
void reset(gpuStream_t stream);
86+
std::vector<std::pair<std::size_t, Tensor>> reset(gpuStream_t stream);
8787
std::pair<void*, Tensor> allocate(std::size_t pool_index, std::size_t size, gpuStream_t stream, std::size_t stream_index);
8888
bool free(void* ptr, std::size_t stream_index);
8989
std::vector<std::pair<std::size_t, std::size_t>> total_sizes() const;

madspace/src/gpu/runtime.cu

Lines changed: 37 additions & 13 deletions
Original file line number · Diff line number · Diff line change
@@ -1043,18 +1043,18 @@ GpuRuntime::GpuRuntime(const Function& function_arg, ContextPtr context) :
10431043
return handle;
10441044
},
10451045
[](gpurandGenerator_t handle) { check_error(gpurandDestroyGenerator(handle)); }
1046-
) {
1046+
),
1047+
_prev_caches(context->thread_pool(), []() { return TensorVec{}; }) {
10471048
if (context->device()->device_type() != GpuDevice::gpu_device_type) {
10481049
throw std::runtime_error("Context has incompatible device");
10491050
}
10501051
auto& gpu_device = *static_cast<const GpuDevice*>(_context->device());
10511052
gpu_device.activate();
10521053

1053-
cudaMemPool_t pool;
1054-
check_error(cudaDeviceGetMemPool(&pool, 0));
1055-
uint64_t thresh = UINT64_MAX;
1056-
check_error(cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &thresh));
1057-
1054+
//cudaMemPool_t pool;
1055+
//check_error(cudaDeviceGetMemPool(&pool, 0));
1056+
//uint64_t thresh = UINT64_MAX;
1057+
//check_error(cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &thresh));
10581058

10591059
Function function = sort_breadth_first(function_arg);
10601060

@@ -1352,7 +1352,7 @@ TensorVec GpuRuntime::run(const TensorVec& inputs) {
13521352
check_error(gpuStreamWaitEvent(main_stream, events.at(event)));
13531353
}
13541354
update_pool_size_cache(mem_pool.total_sizes());
1355-
mem_pool.reset(main_stream);
1355+
update_cached_tensors(mem_pool.reset(main_stream));
13561356
TensorVec outputs;
13571357
for (auto index : _output_indices) {
13581358
outputs.push_back(locals[index]);
@@ -1421,7 +1421,7 @@ std::tuple<TensorVec, TensorVec, std::vector<bool>> GpuRuntime::run_with_grad(
14211421
check_error(gpuStreamWaitEvent(main_stream, events.at(event)));
14221422
}
14231423
update_pool_size_cache(mem_pool.total_sizes());
1424-
mem_pool.reset(main_stream);
1424+
update_cached_tensors(mem_pool.reset(main_stream));
14251425
TensorVec outputs;
14261426
for (auto index : _output_indices) {
14271427
outputs.push_back(locals[index]);
@@ -1479,7 +1479,7 @@ GpuRuntime::run_backward(
14791479
check_error(gpuStreamWaitEvent(main_stream, events.at(event)));
14801480
}*/
14811481
update_pool_size_cache(mem_pool.total_sizes());
1482-
mem_pool.reset(main_stream);
1482+
update_cached_tensors(mem_pool.reset(main_stream));
14831483
std::vector<std::tuple<std::string, Tensor>> global_grads;
14841484
for (auto& [name, index] : _grad_global_indices) {
14851485
global_grads.push_back({name, local_grads[index]});
@@ -1488,11 +1488,21 @@ GpuRuntime::run_backward(
14881488
return {{local_grads.begin(), local_grads.begin() + _input_count}, global_grads};
14891489
}
14901490

1491-
std::vector<std::pair<std::size_t, std::size_t>> GpuRuntime::load_pool_size_cache() {
1491+
std::vector<std::tuple<std::size_t, std::size_t, Tensor>> GpuRuntime::load_pool_size_cache() {
14921492
auto cache = _pool_size_cache.load();
1493-
std::vector<std::pair<std::size_t, std::size_t>> ret;
1493+
std::vector<std::tuple<std::size_t, std::size_t, Tensor>> ret;
14941494
if (cache) {
1495-
ret = {cache->begin(), cache->end()};
1495+
auto& thread_prev_caches = _prev_caches.get();
1496+
for (auto [pool_index, size] : *cache) {
1497+
Tensor new_cache;
1498+
if (pool_index < thread_prev_caches.size()) {
1499+
Tensor& prev_cache = thread_prev_caches.at(pool_index);
1500+
if (prev_cache && prev_cache.is_only_reference()) {
1501+
new_cache = prev_cache;
1502+
}
1503+
}
1504+
ret.push_back({pool_index, size, new_cache});
1505+
}
14961506
}
14971507
return ret;
14981508
}
@@ -1503,11 +1513,25 @@ void GpuRuntime::update_pool_size_cache(const std::vector<std::pair<std::size_t,
15031513
std::make_shared<std::unordered_map<std::size_t, std::size_t>>(*cache) :
15041514
std::make_shared<std::unordered_map<std::size_t, std::size_t>>();
15051515
for (auto [pool_index, size] : total_sizes) {
1506-
(*new_cache)[pool_index] = std::max((*new_cache)[pool_index], size);
1516+
auto& cache_size = (*new_cache)[pool_index];
1517+
if (size > cache_size) {
1518+
// if the cache needs to be resized, add some padding to prevent frequent resizing
1519+
cache_size = size * 4 / 3;
1520+
}
15071521
}
15081522
_pool_size_cache.store(new_cache);
15091523
}
15101524

1525+
void GpuRuntime::update_cached_tensors(const std::vector<std::pair<std::size_t, Tensor>>& tensors) {
1526+
auto& thread_prev_caches = _prev_caches.get();
1527+
for (auto& [pool_index, tensor] : tensors) {
1528+
if (pool_index >= thread_prev_caches.size()) {
1529+
thread_prev_caches.resize(pool_index + 1);
1530+
}
1531+
thread_prev_caches.at(pool_index) = tensor;
1532+
}
1533+
}
1534+
15111535
extern "C" Runtime*
15121536
build_runtime(const Function& function, ContextPtr context, bool concurrent) {
15131537
return new GpuRuntime(function, context);

madspace/src/gpu/runtime.h

Lines changed: 3 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -45,8 +45,9 @@ class GpuRuntime : public Runtime {
4545
gpurandGenerator_t gpurand_generator() { return _gpurand_generator.get(); }
4646

4747
private:
48-
std::vector<std::pair<std::size_t, std::size_t>> load_pool_size_cache();
48+
std::vector<std::tuple<std::size_t, std::size_t, Tensor>> load_pool_size_cache();
4949
void update_pool_size_cache(const std::vector<std::pair<std::size_t, std::size_t>>& pool);
50+
void update_cached_tensors(const std::vector<std::pair<std::size_t, Tensor>>& tensors);
5051
std::vector<Instruction> _instructions;
5152
SizeVec _output_indices;
5253
std::size_t _input_count;
@@ -61,6 +62,7 @@ class GpuRuntime : public Runtime {
6162
ThreadResource<gpublasHandle_t> _gpublas_handle;
6263
ThreadResource<gpurandGenerator_t> _gpurand_generator;
6364
std::atomic<std::shared_ptr<std::unordered_map<std::size_t, std::size_t>>> _pool_size_cache;
65+
ThreadResource<TensorVec> _prev_caches;
6466
};
6567

6668
extern "C" Runtime*

0 commit comments

Comments (0)