Skip to content

Commit fa762bc

Browse files
committed
memory pool improvements
1 parent 779f347 commit fa762bc

3 files changed

Lines changed: 71 additions & 29 deletions

File tree

madspace/src/gpu/device.cu

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,8 @@ void GpuDevice::tensor_cpu(const Tensor& source, Tensor& target) const {
5050

5151
MemPool::MemPool(
5252
const GpuDevice& device,
53-
const std::vector<std::pair<std::size_t, std::size_t>>& cached_sizes
53+
const std::vector<std::pair<std::size_t, std::size_t>>& cached_sizes,
54+
gpuStream_t stream
5455
) :
5556
_device(device) {
5657
std::size_t pool_count = 0;
@@ -61,10 +62,11 @@ MemPool::MemPool(
6162
}
6263
_pools.resize(pool_count);
6364

65+
AsyncGpuDevice async_device(device, stream);
6466
for (auto& [pool_index, size] : cached_sizes) {
6567
auto& pool = _pools.at(pool_index);
6668
std::size_t word_count = (size + 7) / 8;
67-
pool.parent_tensor = Tensor(DataType::dt_float, {word_count}, device);
69+
pool.parent_tensor = Tensor(DataType::dt_float, {word_count}, async_device);
6870
pool.capacity = word_count * 8;
6971
pool.needed_size = word_count * 8;
7072
//println("create pool {} {}", pool_index, pool.size);
@@ -82,7 +84,21 @@ MemPool::~MemPool() {
8284
}
8385
}
8486

85-
std::pair<void*, Tensor> MemPool::allocate(std::size_t pool_index, std::size_t size) {
87+
void MemPool::reset(gpuStream_t stream) {
88+
AsyncGpuDevice async_device(_device, stream);
89+
for (PoolItem& pool : _pools) {
90+
pool.parent_tensor.reset(async_device);
91+
for (auto& [size, item] : pool.free_pointers) {
92+
auto& [ptr, parent] = item;
93+
if (!parent) {
94+
check_error(gpuFreeAsync(ptr, stream));
95+
}
96+
}
97+
}
98+
_pools.clear();
99+
}
100+
101+
std::pair<void*, Tensor> MemPool::allocate(std::size_t pool_index, std::size_t size, gpuStream_t stream) {
86102
if (pool_index >= _pools.size()) {
87103
_pools.resize(pool_index + 1);
88104
}
@@ -110,7 +126,7 @@ std::pair<void*, Tensor> MemPool::allocate(std::size_t pool_index, std::size_t s
110126
return {ptr, pool.parent_tensor};
111127
} else {
112128
void* ptr;
113-
check_error(gpuMalloc(&ptr, size));
129+
check_error(gpuMallocAsync(&ptr, size, stream));
114130
_allocs[ptr] = {
115131
.pool_index = pool_index,
116132
.size = size,
@@ -122,16 +138,17 @@ std::pair<void*, Tensor> MemPool::allocate(std::size_t pool_index, std::size_t s
122138
}
123139
}
124140

125-
void MemPool::free(void* ptr) {
141+
bool MemPool::free(void* ptr) {
126142
auto search = _allocs.find(ptr);
127143
if (search == _allocs.end()) {
128-
throw std::runtime_error("address was not allocated using this pool");
144+
return false;
129145
}
130146
auto& alloc = search->second;
131147
_pools.at(alloc.pool_index)
132148
.free_pointers.emplace(alloc.size, std::pair<void*, Tensor>{ptr, alloc.parent_tensor});
133149
//println("free {} {} {}", ptr, alloc.pool_index, alloc.size);
134150
_allocs.erase(search);
151+
return true;
135152
}
136153

137154
std::vector<std::pair<std::size_t, std::size_t>> MemPool::total_sizes() const {
@@ -148,7 +165,7 @@ std::vector<std::pair<std::size_t, std::size_t>> MemPool::total_sizes() const {
148165

149166
std::pair<void*, Tensor>
150167
AsyncGpuDevice::allocate(std::size_t size, AllocHint hint) const {
151-
if (_mem_pool) {
168+
if (_mem_pool && size <= 4 * 1024 * 1024) {
152169
std::size_t pool_index;
153170
switch (hint) {
154171
case AllocHint::normal:
@@ -172,7 +189,7 @@ AsyncGpuDevice::allocate(std::size_t size, AllocHint hint) const {
172189
pool_index = 2;
173190
break;
174191
}
175-
return _mem_pool->allocate(pool_index, size);
192+
return _mem_pool->allocate(pool_index, size, _stream);
176193
} else {
177194
//_device.allocate(size, hint);
178195
void* ptr;
@@ -182,9 +199,7 @@ AsyncGpuDevice::allocate(std::size_t size, AllocHint hint) const {
182199
}
183200

184201
void AsyncGpuDevice::free(void* ptr) const {
185-
if (_mem_pool) {
186-
_mem_pool->free(ptr);
187-
} else {
202+
if (!_mem_pool || !_mem_pool->free(ptr)) {
188203
check_error(gpuFreeAsync(ptr, _stream));
189204
}
190205
}

madspace/src/gpu/device.h

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -77,10 +77,15 @@ class GpuDevice : public Device {
7777

7878
class MemPool {
7979
public:
80-
MemPool(const GpuDevice& device, const std::vector<std::pair<std::size_t, std::size_t>>& cached_sizes);
80+
MemPool(
81+
const GpuDevice& device,
82+
const std::vector<std::pair<std::size_t, std::size_t>>& cached_sizes,
83+
gpuStream_t stream
84+
);
8185
~MemPool();
82-
std::pair<void*, Tensor> allocate(std::size_t pool_index, std::size_t size);
83-
void free(void* ptr);
86+
void reset(gpuStream_t stream);
87+
std::pair<void*, Tensor> allocate(std::size_t pool_index, std::size_t size, gpuStream_t stream);
88+
bool free(void* ptr);
8489
std::vector<std::pair<std::size_t, std::size_t>> total_sizes() const;
8590

8691
private:
@@ -104,7 +109,7 @@ class MemPool {
104109
class AsyncGpuDevice {
105110
public:
106111
AsyncGpuDevice(
107-
const GpuDevice& device, gpuStream_t stream, std::size_t stream_index, MemPool* mem_pool = nullptr
112+
const GpuDevice& device, gpuStream_t stream, std::size_t stream_index = 0, MemPool* mem_pool = nullptr
108113
) :
109114
_device(device), _stream(stream), _stream_index(stream_index), _mem_pool(mem_pool) {}
110115

madspace/src/gpu/runtime.cu

Lines changed: 36 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,16 @@ void op_matmul(
171171
bias.reset(device);
172172
}
173173

174+
__global__ void kernel_one(
175+
std::size_t batch_size,
176+
GpuTensorView<double, 1, true> output
177+
) {
178+
me_int_t i = blockDim.x * blockIdx.x + threadIdx.x;
179+
if (i < batch_size) {
180+
output[i] = 1.;
181+
}
182+
}
183+
174184
void backward_op_matmul(
175185
const GpuRuntime::Instruction& instruction,
176186
TensorVec& locals,
@@ -268,11 +278,12 @@ void backward_op_matmul(
268278

269279
// compute bias_grad += sum_i output_grad_ij
270280
Tensor ones(DataType::dt_float, {batch_size}, device, AllocHint::temporary);
271-
thrust::fill_n(
272-
thrust_par.on(stream),
273-
thrust::device_pointer_cast(static_cast<double*>(ones.data())),
281+
launch_kernel(
282+
kernel_one,
283+
batch_size,
284+
device.stream(),
274285
batch_size,
275-
1.0
286+
ones.view<double, 1>()
276287
);
277288
check_error(gpublasDgemv(
278289
handle,
@@ -973,7 +984,9 @@ void op_histogram(
973984
class SyncTracker {
974985
public:
975986
SyncTracker(std::size_t stream_count) :
976-
_stream_count(stream_count), _sync_matrix(stream_count * stream_count, true) {}
987+
_stream_count(stream_count), _sync_matrix(stream_count * stream_count) {
988+
reset();
989+
}
977990

978991
bool is_in_sync_with(std::size_t this_stream, std::size_t other_stream) const {
979992
return _sync_matrix.at(this_stream * _stream_count + other_stream);
@@ -993,7 +1006,13 @@ public:
9931006
}
9941007
}
9951008
}
996-
void reset() { std::fill(_sync_matrix.begin(), _sync_matrix.end(), true); }
1009+
void reset() {
1010+
for (std::size_t i = 0; i < _stream_count; ++i) {
1011+
for (std::size_t j = 0; j < _stream_count; ++j) {
1012+
_sync_matrix.at(i * _stream_count + j) = i == j;
1013+
}
1014+
}
1015+
}
9971016

9981017
private:
9991018
std::size_t _stream_count;
@@ -1309,7 +1328,8 @@ TensorVec GpuRuntime::run(const TensorVec& inputs) {
13091328
gpu_device.activate();
13101329
auto locals = _locals_init;
13111330
std::copy(inputs.begin(), inputs.end(), locals.begin());
1312-
MemPool mem_pool(gpu_device, load_pool_size_cache());
1331+
gpuStream_t main_stream = streams.at(0);
1332+
MemPool mem_pool(gpu_device, load_pool_size_cache(), main_stream);
13131333

13141334
//println("----");
13151335
for (auto& instr : _instructions) {
@@ -1328,16 +1348,16 @@ TensorVec GpuRuntime::run(const TensorVec& inputs) {
13281348
check_error(gpuEventRecord(events.at(instr.record_event), stream));
13291349
}
13301350
}
1331-
gpuStream_t main_stream = streams.at(0);
13321351
for (auto event : _wait_events) {
13331352
check_error(gpuStreamWaitEvent(main_stream, events.at(event)));
13341353
}
1354+
update_pool_size_cache(mem_pool.total_sizes());
1355+
mem_pool.reset(main_stream);
13351356
TensorVec outputs;
13361357
for (auto index : _output_indices) {
13371358
outputs.push_back(locals[index]);
13381359
}
13391360
check_error(gpuStreamSynchronize(main_stream));
1340-
update_pool_size_cache(mem_pool.total_sizes());
13411361
return outputs;
13421362
}
13431363

@@ -1356,7 +1376,8 @@ std::tuple<TensorVec, TensorVec, std::vector<bool>> GpuRuntime::run_with_grad(
13561376
std::copy(
13571377
input_requires_grad.begin(), input_requires_grad.end(), requires_grad.begin()
13581378
);
1359-
MemPool mem_pool(gpu_device, load_pool_size_cache());
1379+
gpuStream_t main_stream = streams.at(0);
1380+
MemPool mem_pool(gpu_device, load_pool_size_cache(), main_stream);
13601381

13611382
for (auto [instr, instr_eval_grad] : zip(_instructions, eval_grad)) {
13621383
gpuStream_t stream = streams.at(instr.stream);
@@ -1396,16 +1417,16 @@ std::tuple<TensorVec, TensorVec, std::vector<bool>> GpuRuntime::run_with_grad(
13961417
check_error(gpuEventRecord(events.at(instr.record_event), stream));
13971418
}
13981419
}
1399-
gpuStream_t main_stream = streams.at(0);
14001420
for (auto event : _wait_events) {
14011421
check_error(gpuStreamWaitEvent(main_stream, events.at(event)));
14021422
}
1423+
update_pool_size_cache(mem_pool.total_sizes());
1424+
mem_pool.reset(main_stream);
14031425
TensorVec outputs;
14041426
for (auto index : _output_indices) {
14051427
outputs.push_back(locals[index]);
14061428
}
14071429
check_error(gpuStreamSynchronize(main_stream));
1408-
update_pool_size_cache(mem_pool.total_sizes());
14091430
return {outputs, locals, eval_grad};
14101431
}
14111432

@@ -1424,8 +1445,8 @@ GpuRuntime::run_backward(
14241445
for (auto [index, grad] : zip(_output_indices, output_grads)) {
14251446
local_grads[index] = grad;
14261447
}
1427-
MemPool mem_pool(gpu_device, load_pool_size_cache());
14281448
gpuStream_t main_stream = streams.at(0);
1449+
MemPool mem_pool(gpu_device, load_pool_size_cache(), main_stream);
14291450
for (auto [instr, instr_eval_grad] :
14301451
zip(std::views::reverse(_instructions), std::views::reverse(eval_grad))) {
14311452
/*gpuStream_t stream = streams.at(instr.stream);
@@ -1457,12 +1478,13 @@ GpuRuntime::run_backward(
14571478
/*for (auto event : _backward_wait_events) {
14581479
check_error(gpuStreamWaitEvent(main_stream, events.at(event)));
14591480
}*/
1481+
update_pool_size_cache(mem_pool.total_sizes());
1482+
mem_pool.reset(main_stream);
14601483
std::vector<std::tuple<std::string, Tensor>> global_grads;
14611484
for (auto& [name, index] : _grad_global_indices) {
14621485
global_grads.push_back({name, local_grads[index]});
14631486
}
14641487
check_error(gpuStreamSynchronize(main_stream));
1465-
update_pool_size_cache(mem_pool.total_sizes());
14661488
return {{local_grads.begin(), local_grads.begin() + _input_count}, global_grads};
14671489
}
14681490

0 commit comments

Comments
 (0)