Skip to content

Commit e3f59d9

Browse files
committed
more work on memory pool
1 parent f19e8e5 commit e3f59d9

9 files changed

Lines changed: 415 additions & 202 deletions

File tree

madspace/include/madspace/runtime/tensor.h

Lines changed: 49 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -165,10 +165,21 @@ class Tensor;
165165

166166
enum class DeviceType { cpu, cuda, hip };
167167

168+
// Hint passed to Device::allocate describing the role of an allocation so
// that pooling allocators can route it to a dedicated pool.
// AllocHint::normal must remain the first enumerator (value 0): pooled
// allocators treat `normal` as "no pooling" and map the remaining hints to
// pool indices via static_cast<std::size_t>(hint) - 1.
enum class AllocHint {
    normal,
    output,
    local,
    temporary,
    input_grad,
    local_grad,
    global_grad,
};
177+
168178
class Device {
169179
public:
170180
virtual ~Device() = default;
171-
virtual std::pair<void*, Tensor> allocate(std::size_t size) const = 0;
181+
virtual std::pair<void*, Tensor>
182+
allocate(std::size_t size, AllocHint hint) const = 0;
172183
virtual void free(void* ptr) const = 0;
173184
virtual void memcpy(void* to, void* from, std::size_t size) const = 0;
174185
virtual void tensor_copy(const Tensor& source, Tensor& target) const = 0;
@@ -199,19 +210,30 @@ class Tensor {
199210

200211
Tensor(Tensor&& other) noexcept : impl(other.impl) { other.impl = nullptr; }
201212

202-
    /// Allocate an uninitialized tensor with the given dtype/shape on the
    /// default CPU device. `hint` is forwarded to the allocator (see AllocHint).
    Tensor(DataType dtype, const Sizes& shape, AllocHint hint = AllocHint::normal) :
        Tensor(dtype, shape, cpu_device(), hint) {}

    /// Allocate an uninitialized tensor on `device`; `hint` is forwarded to
    /// the device's allocator.
    Tensor(
        DataType dtype,
        const Sizes& shape,
        DevicePtr device,
        AllocHint hint = AllocHint::normal
    ) :
        impl(new TensorImpl{dtype, shape, device}) {
        auto size = init_stride();
        allocate(size, *device, hint);
    }

    /// Same as above for device wrapper types exposing device_ptr()
    /// (e.g. an async/stream-bound device); the wrapper object itself
    /// performs the allocation, while impl records the underlying device.
    template <typename D>
    Tensor(
        DataType dtype,
        const Sizes& shape,
        const D& device,
        AllocHint hint = AllocHint::normal
    ) :
        impl(new TensorImpl{dtype, shape, device.device_ptr()}) {
        auto size = init_stride();
        allocate(size, device, hint);
    }
216238

217239
Tensor(
@@ -276,21 +298,21 @@ class Tensor {
276298
}) {}
277299

278300
    /// Create a shape-{1} tensor holding the single scalar `value`, copied
    /// onto `device`. Integer types map to dt_int, everything else to
    /// dt_float. `hint` is forwarded to the allocator.
    template <ScalarType T>
    Tensor(T value, DevicePtr device, AllocHint hint = AllocHint::normal) :
        impl(new TensorImpl{
            std::is_same_v<T, me_int_t> ? DataType::dt_int : DataType::dt_float,
            {1},
            device
        }) {
        auto size = init_stride();
        allocate(size, *device, hint);
        // Copy the scalar into the freshly allocated device buffer.
        device->memcpy(impl->data, &value, sizeof(value));
        // Non-negative integer scalars additionally double as batch-size
        // metadata for downstream shape bookkeeping.
        if (std::is_same_v<T, me_int_t> && value >= 0) {
            impl->batch_sizes.push_back(value);
        }
    }
292314

293-
Tensor(TensorValue value, DevicePtr device) :
315+
Tensor(TensorValue value, DevicePtr device, AllocHint hint = AllocHint::normal) :
294316
impl(new TensorImpl{
295317
std::visit(
296318
Overloaded{
@@ -309,7 +331,7 @@ class Tensor {
309331
device
310332
}) {
311333
auto size = init_stride();
312-
allocate(size, *device);
334+
allocate(size, *device, hint);
313335
std::visit(
314336
[&](auto& vec) { device->memcpy(impl->data, vec.data(), size); },
315337
std::get<1>(value)
@@ -510,35 +532,41 @@ class Tensor {
510532
void add(const Tensor& source) { add(source, *impl->device); }
511533

512534
    /// Return a freshly allocated copy of this tensor; the copy operation is
    /// executed by `device`, and `hint` is forwarded to the allocator.
    template <typename D>
    Tensor copy(const D& device, AllocHint hint = AllocHint::normal) const {
        check_impl();
        // NOTE(review): the new tensor is allocated on impl->device while the
        // transfer is executed by the `device` argument — confirm callers
        // always pass a device compatible with impl->device.
        Tensor tensor(impl->dtype, impl->shape, impl->device, hint);
        device.tensor_copy(*this, tensor);
        return tensor;
    }
    /// Copy using this tensor's own device for the transfer.
    Tensor copy(AllocHint hint = AllocHint::normal) const {
        return copy(*impl->device, hint);
    }
520544

521545
    /// True when every dimension of the tensor is stored contiguously.
    bool is_contiguous() const { return impl->contiguous_dims == impl->shape.size(); }

    /// Number of dimensions currently stored contiguously.
    std::size_t contiguous_dims() const { return impl->contiguous_dims; }

    /// Return a fully contiguous version of this tensor: the tensor itself if
    /// already contiguous, otherwise a copy made via `device` (with `hint`
    /// forwarded to the allocator).
    template <typename D>
    Tensor contiguous(const D& device, AllocHint hint = AllocHint::normal) const {
        check_impl();
        return is_contiguous() ? *this : copy(device, hint);
    }

    /// Contiguous version using this tensor's own device for the copy.
    Tensor contiguous(AllocHint hint = AllocHint::normal) const {
        return contiguous(*impl->device, hint);
    }
532558

533559
template <typename D>
534-
Tensor contiguous(std::size_t batch_size, const D& device) const {
560+
Tensor contiguous(
561+
std::size_t batch_size, const D& device, AllocHint hint = AllocHint::normal
562+
) const {
535563
check_impl();
536564
if (size(0) == batch_size) {
537-
return contiguous(device);
565+
return contiguous(device, hint);
538566
} else if (size(0) == 1) {
539567
auto shape = impl->shape;
540568
shape[0] = batch_size;
541-
Tensor tensor(impl->dtype, shape, impl->device);
569+
Tensor tensor(impl->dtype, shape, impl->device, hint);
542570
device.tensor_copy(*this, tensor);
543571
return tensor;
544572
} else {
@@ -597,8 +625,8 @@ class Tensor {
597625
}
598626

599627
template <typename D>
600-
void allocate(std::size_t size, const D& device) {
601-
auto [data, parent] = device.allocate(size);
628+
void allocate(std::size_t size, const D& device, AllocHint hint) {
629+
auto [data, parent] = device.allocate(size, hint);
602630
impl->data = data;
603631
if (parent) {
604632
impl->owns_data = false;

madspace/src/cpu/device.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ class CpuDevice : public Device {
1111
public:
1212
static constexpr bool is_concurrent = false;
1313

14-
    // Allocate `size` bytes of host memory. The AllocHint is ignored on the
    // CPU: there is no pooling here, every request is a plain new[].
    // The empty Tensor in the returned pair signals "no parent tensor", i.e.
    // the resulting allocation is owned by the caller.
    std::pair<void*, Tensor> allocate(std::size_t size, AllocHint hint) const override {
        return {new std::byte[size], Tensor()};
    }
1717

madspace/src/cpu/runtime.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@ namespace cpu {
1212

1313
class CpuRuntime : public Runtime {
1414
public:
15+
    // Placeholder hint table for the CPU runtime: indexing it always yields
    // AllocHint::normal, since the CPU device performs no pooled allocation.
    // Lets instruction code index per-output hints uniformly across runtimes.
    struct DummyAllocHints {
        AllocHint operator[](std::size_t index) const { return AllocHint::normal; }
    };
1518
struct Instruction {
1619
int opcode;
1720
SizeVec input_indices;
@@ -25,6 +28,8 @@ class CpuRuntime : public Runtime {
2528
std::size_t dependency_count;
2629
SizeVec dependent_instructions_backward;
2730
std::size_t dependency_count_backward;
31+
DummyAllocHints output_alloc_hints;
32+
DummyAllocHints input_grad_alloc_hints;
2833
};
2934

3035
CpuRuntime(const Function& function, ContextPtr context, bool concurrent);

madspace/src/gpu/device.cu

Lines changed: 87 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -48,53 +48,105 @@ void GpuDevice::tensor_cpu(const Tensor& source, Tensor& target) const {
4848
);
4949
}
5050

51-
/// Build a memory pool with pre-allocated backing buffers.
///
/// `cached_sizes` holds (pool_index, byte_size) pairs, typically obtained
/// from total_sizes() of a previous run; each entry pre-allocates one parent
/// tensor of that size on `device`.
MemPool::MemPool(
    const GpuDevice& device,
    const std::vector<std::pair<std::size_t, std::size_t>>& cached_sizes
) :
    _device(device) {
    // Determine how many pools are addressed by the cached sizes.
    std::size_t pool_count = 0;
    for (auto& [pool_index, size] : cached_sizes) {
        if (pool_index >= pool_count) {
            pool_count = pool_index + 1;
        }
    }
    _pools.resize(pool_count);

    // Pre-allocate one parent tensor per pool, rounded up to 8-byte words so
    // suballocations stay aligned.
    for (auto& [pool_index, size] : cached_sizes) {
        auto& pool = _pools.at(pool_index);
        std::size_t word_count = (size + 7) / 8;
        pool.parent_tensor = Tensor(DataType::dt_float, {word_count}, device);
        // BUG FIX: the pre-allocated byte count is the pool's *capacity*;
        // `size` is the bump-allocation cursor and must start at 0.
        // Previously `size` was set to the buffer size and `capacity` was
        // left 0, so allocate()'s `capacity - size` check underflowed and
        // bump-allocated past the end of the buffer.
        pool.capacity = word_count * 8;
        pool.size = 0;
        pool.needed_size = word_count * 8;
    }
}
6272

63-
std::pair<void*, Tensor>
64-
MemPool::allocate(std::size_t size, const GpuDevice& device, gpuStream_t stream) {
65-
AllocItem& alloc = _allocs.at(_alloc_index);
66-
++_alloc_index;
67-
PoolItem& pool = _pools.at(alloc.pool_index);
68-
if (size % alloc.size_factor != 0) {
69-
throw std::runtime_error("inconsistent pool allocation");
70-
}
71-
std::size_t batch_size = size / alloc.size_factor;
72-
if (!pool.parent_tensor) {
73-
pool.batch_size = batch_size;
74-
AsyncGpuDevice async_device(device, stream);
75-
pool.parent_tensor = Tensor(
76-
DataType::dt_float, {(batch_size * pool.size_factor + 7) / 8}, async_device
77-
);
78-
} else if (batch_size != pool.batch_size) {
79-
throw std::runtime_error("inconsistent pool allocation");
73+
MemPool::~MemPool() {
    // Release only the cached free blocks that were obtained directly from
    // the device (no parent tensor). Blocks carved out of a pool's parent
    // tensor are released when that Tensor is destroyed with the pool.
    // NOTE(review): pointers still registered in _allocs (allocated but never
    // free()d) are not released here — confirm callers always return all
    // allocations before the pool is destroyed, otherwise this leaks.
    for (PoolItem& pool : _pools) {
        for (auto& [size, item] : pool.free_pointers) {
            auto& [ptr, parent] = item;
            if (!parent) {
                check_error(gpuFree(ptr));
            }
        }
    }
}
8783

88-
std::pair<void*, Tensor> AsyncGpuDevice::allocate(std::size_t size) const {
89-
if (_mem_pool != nullptr) {
90-
return _mem_pool->allocate(size, _device, _stream);
84+
std::pair<void*, Tensor> MemPool::allocate(std::size_t pool_index, std::size_t size) {
85+
if (pool_index >= _pools.size()) {
86+
_pools.resize(pool_index);
87+
}
88+
PoolItem& pool = _pools.at(pool_index);
89+
if (auto search = pool.free_pointers.find(size);
90+
search != pool.free_pointers.end()) {
91+
std::pair<void*, Tensor> ret = *search->second;
92+
pool.free_pointers.erase(search);
93+
return ret;
94+
} else if (pool.capacity - pool.size >= size) {
95+
void* ptr = &static_cast<uint8_t*>(pool.parent_tensor.data())[pool.size];
96+
pool.size = (pool.size + size + 7) / 8 * 8;
97+
_allocs[ptr] = {
98+
.pool_index = pool_index,
99+
.size = size,
100+
.parent_tensor = pool.parent_tensor,
101+
};
102+
return {ptr, pool.parent_tensor};
91103
} else {
92104
void* ptr;
93-
check_error(gpuMallocAsync(&ptr, size, _stream));
105+
check_error(gpuMalloc(&ptr, size));
106+
_allocs[ptr] = {
107+
.pool_index = pool_index,
108+
.size = size,
109+
.parent_tensor = Tensor(),
110+
};
111+
pool.needed_size += (size + 7) / 8 * 8;
94112
return {ptr, Tensor()};
95113
}
96114
}
97115

116+
/// Return `ptr` to its pool. The block is cached in free_pointers for reuse
/// by same-size allocations; actual device memory is only released in the
/// destructor (or together with the pool's parent tensor).
///
/// Throws std::runtime_error if `ptr` was not handed out by this pool.
void MemPool::free(void* ptr) {
    // BUG FIX: the lookup and the check were fused into one malformed
    // statement (missing ';' before 'if'), which did not compile.
    auto search = _allocs.find(ptr);
    if (search == _allocs.end()) {
        throw std::runtime_error("address was not allocated using this pool");
    }
    auto& alloc = search->second;
    // BUG FIX: emplace cannot deduce a braced-init-list argument; build the
    // (ptr, parent) pair explicitly.
    _pools.at(alloc.pool_index)
        .free_pointers.emplace(alloc.size, std::make_pair(ptr, alloc.parent_tensor));
    _allocs.erase(search);
}
125+
126+
std::vector<std::pair<std::size_t, std::size_t>> MemPool::total_sizes() const {
127+
std::vector<std::pair<std::size_t, std::size_t>> ret;
128+
ret.reserve(_pools.size());
129+
for (std::size_t index = 0; PoolItem& pool : _pools) {
130+
if (pool.needed_size > 0) {
131+
ret.push_back({index, pool.needed_size});
132+
}
133+
++index;
134+
}
135+
return ret;
136+
}
137+
138+
std::pair<void*, Tensor>
139+
AsyncGpuDevice::allocate(std::size_t size, AllocHint hint) const {
140+
if (_mem_pool != nullptr && hint != AllocHint::normal) {
141+
return _mem_pool->allocate(static_cast<std::size_t>(hint) - 1, size);
142+
} else {
143+
_device.allocate(size, hint);
144+
// void* ptr;
145+
// check_error(gpuMallocAsync(&ptr, size, _stream));
146+
// return {ptr, Tensor()};
147+
}
148+
}
149+
98150
void AsyncGpuDevice::free(void* ptr) const { check_error(gpuFreeAsync(ptr, _stream)); }
99151

100152
void AsyncGpuDevice::memcpy(void* to, void* from, std::size_t size) const {

madspace/src/gpu/device.h

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -76,26 +76,28 @@ class GpuDevice : public Device {
7676

7777
class MemPool {
7878
public:
79-
struct AllocItem {
80-
std::size_t pool_index;
81-
std::size_t size_factor;
82-
std::size_t offset;
83-
};
84-
85-
MemPool(const SizeVec pool_factors, const std::vector<AllocItem>& allocs);
86-
std::pair<void*, Tensor>
87-
allocate(std::size_t size, const GpuDevice& device, gpuStream_t stream);
79+
MemPool(const std::vector<std::pair<std::size_t, std::size_t>>& cached_sizes);
80+
~MemPool();
81+
std::pair<void*, Tensor> allocate(std::size_t pool_index, std::size_t size);
82+
void free(void* ptr);
83+
std::vector<std::pair<std::size_t, std::size_t>> total_sizes() const;
8884

8985
private:
9086
struct PoolItem {
91-
std::size_t size_factor;
92-
std::size_t batch_size;
87+
Tensor parent_tensor;
88+
std::size_t size = 0;
89+
std::size_t capacity = 0;
90+
std::size_t needed_size = 0;
91+
std::unordered_multimap<std::size_t, std::pair<void*, Tensor>> free_pointers;
92+
};
93+
struct AllocItem {
94+
std::size_t pool_index;
95+
std::size_t size;
9396
Tensor parent_tensor;
9497
};
95-
96-
std::vector<AllocItem> _allocs;
9798
std::vector<PoolItem> _pools;
98-
std::size_t _alloc_index = 0;
99+
std::unordered_map<void*, AllocItem> _allocs;
100+
const GpuDevice& _device;
99101
};
100102

101103
class AsyncGpuDevice {
@@ -105,7 +107,7 @@ class AsyncGpuDevice {
105107
) :
106108
_device(device), _stream(stream), _mem_pool(mem_pool) {}
107109

108-
std::pair<void*, Tensor> allocate(std::size_t size) const;
110+
std::pair<void*, Tensor> allocate(std::size_t size, AllocHint hint) const;
109111
void free(void* ptr) const;
110112
void memcpy(void* to, void* from, std::size_t size) const;
111113

0 commit comments

Comments
 (0)