Skip to content

Commit f19e8e5

Browse files
committed
memory pool for gpu allocations
1 parent ea35e76 commit f19e8e5

9 files changed

Lines changed: 152 additions & 17 deletions

File tree

madspace/include/madspace/madcode/type.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ class BatchSize {
3434
};
3535
using Unnamed = std::shared_ptr<UnnamedBody>;
3636
using One = std::monostate;
37-
using Compound = std::unordered_map<std::variant<Named, Unnamed, One>, int>;
37+
using Compound = std::map<std::variant<Named, Unnamed, One>, int>;
3838

3939
static const BatchSize zero;
4040
static const BatchSize one;
@@ -46,6 +46,7 @@ class BatchSize {
4646
BatchSize operator-(const BatchSize& other) const { return add(other, -1); }
4747
bool operator==(const BatchSize& other) const { return value == other.value; }
4848
bool operator!=(const BatchSize& other) const { return value != other.value; }
49+
std::string to_string() const;
4950

5051
friend std::ostream& operator<<(std::ostream& out, const BatchSize& batch_size);
5152
friend void to_json(nlohmann::json& j, const BatchSize& batch_size);

madspace/include/madspace/runtime/tensor.h

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ enum class DeviceType { cpu, cuda, hip };
168168
class Device {
169169
public:
170170
virtual ~Device() = default;
171-
virtual void* allocate(std::size_t size) const = 0;
171+
virtual std::pair<void*, Tensor> allocate(std::size_t size) const = 0;
172172
virtual void free(void* ptr) const = 0;
173173
virtual void memcpy(void* to, void* from, std::size_t size) const = 0;
174174
virtual void tensor_copy(const Tensor& source, Tensor& target) const = 0;
@@ -204,14 +204,14 @@ class Tensor {
204204
// Construct a tensor of the given dtype/shape on `device` and allocate its
// storage through Tensor::allocate, so pool-backed devices can return a
// parent tensor that owns the underlying buffer.
Tensor(DataType dtype, const Sizes& shape, DevicePtr device) :
    impl(new TensorImpl{dtype, shape, device}) {
    // init_stride() fills the stride info and returns the required byte
    // count — presumably the total allocation size; confirm in init_stride().
    auto size = init_stride();
    allocate(size, *device);
}
209209

210210
// Same as the DevicePtr overload, but statically dispatched on the concrete
// device type D (e.g. AsyncGpuDevice), avoiding a virtual allocate() call.
template <typename D>
Tensor(DataType dtype, const Sizes& shape, const D& device) :
    impl(new TensorImpl{dtype, shape, device.device_ptr()}) {
    auto size = init_stride();
    allocate(size, device);
}
216216

217217
Tensor(
@@ -283,7 +283,7 @@ class Tensor {
283283
device
284284
}) {
285285
auto size = init_stride();
286-
impl->data = device->allocate(size);
286+
allocate(size, *device);
287287
device->memcpy(impl->data, &value, sizeof(value));
288288
if (std::is_same_v<T, me_int_t> && value >= 0) {
289289
impl->batch_sizes.push_back(value);
@@ -309,7 +309,7 @@ class Tensor {
309309
device
310310
}) {
311311
auto size = init_stride();
312-
impl->data = device->allocate(size);
312+
allocate(size, *device);
313313
std::visit(
314314
[&](auto& vec) { device->memcpy(impl->data, vec.data(), size); },
315315
std::get<1>(value)
@@ -596,6 +596,16 @@ class Tensor {
596596
}
597597
}
598598

599+
// Request `size` bytes from `device` and wire up ownership. Pool-backed
// devices return a non-empty parent tensor that owns the buffer; in that
// case this tensor becomes a non-owning view and keeps the parent alive
// through data_owner instead of freeing the pointer itself.
template <typename D>
void allocate(std::size_t size, const D& device) {
    auto [data, parent] = device.allocate(size);
    impl->data = data;
    if (parent) {
        // The buffer belongs to the pool's parent allocation: do not free it
        // on destruction, just hold a reference to the owner.
        impl->owns_data = false;
        impl->data_owner = parent.impl;
    }
}
608+
599609
TensorImpl* impl;
600610
};
601611

madspace/src/cpu/device.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@ class CpuDevice : public Device {
1111
public:
1212
static constexpr bool is_concurrent = false;
1313

14-
void* allocate(std::size_t size) const override { return new std::byte[size]; }
14+
std::pair<void*, Tensor> allocate(std::size_t size) const override {
15+
return {new std::byte[size], Tensor()};
16+
}
1517

1618
void free(void* ptr) const override { delete[] static_cast<std::byte*>(ptr); }
1719

madspace/src/gpu/device.cu

Lines changed: 47 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,11 @@ using namespace madspace;
66
using namespace madspace::gpu;
77
using namespace madspace::kernels;
88

9-
void* GpuDevice::allocate(std::size_t size) const {
9+
std::pair<void*, Tensor> GpuDevice::allocate(std::size_t size) const {
1010
activate();
1111
void* ptr;
1212
check_error(gpuMalloc(&ptr, size));
13-
return ptr;
13+
return {ptr, Tensor()};
1414
}
1515

1616
void GpuDevice::free(void* ptr) const {
@@ -48,10 +48,51 @@ void GpuDevice::tensor_cpu(const Tensor& source, Tensor& target) const {
4848
);
4949
}
5050

51-
void* AsyncGpuDevice::allocate(std::size_t size) const {
52-
void* ptr;
53-
check_error(gpuMallocAsync(&ptr, size, _stream));
54-
return ptr;
51+
// Build one pool slot per size factor. The backing parent tensors are not
// allocated here; they are created lazily on the first allocate() that
// targets the pool (once the batch size is known).
MemPool::MemPool(const SizeVec pool_factors, const std::vector<AllocItem>& allocs) :
    _allocs(allocs) {
    _pools.reserve(pool_factors.size());
    for (std::size_t factor : pool_factors) {
        PoolItem item{};
        item.size_factor = factor;
        item.batch_size = 0;
        item.parent_tensor = Tensor();
        _pools.push_back(std::move(item));
    }
}
62+
63+
std::pair<void*, Tensor>
64+
MemPool::allocate(std::size_t size, const GpuDevice& device, gpuStream_t stream) {
65+
AllocItem& alloc = _allocs.at(_alloc_index);
66+
++_alloc_index;
67+
PoolItem& pool = _pools.at(alloc.pool_index);
68+
if (size % alloc.size_factor != 0) {
69+
throw std::runtime_error("inconsistent pool allocation");
70+
}
71+
std::size_t batch_size = size / alloc.size_factor;
72+
if (!pool.parent_tensor) {
73+
pool.batch_size = batch_size;
74+
AsyncGpuDevice async_device(device, stream);
75+
pool.parent_tensor = Tensor(
76+
DataType::dt_float, {(batch_size * pool.size_factor + 7) / 8}, async_device
77+
);
78+
} else if (batch_size != pool.batch_size) {
79+
throw std::runtime_error("inconsistent pool allocation");
80+
}
81+
return {
82+
static_cast<uint8_t*>(pool.parent_tensor.data()) +
83+
pool.batch_size * alloc.offset,
84+
pool.parent_tensor
85+
};
86+
}
87+
88+
std::pair<void*, Tensor> AsyncGpuDevice::allocate(std::size_t size) const {
89+
if (_mem_pool != nullptr) {
90+
return _mem_pool->allocate(size, _device, _stream);
91+
} else {
92+
void* ptr;
93+
check_error(gpuMallocAsync(&ptr, size, _stream));
94+
return {ptr, Tensor()};
95+
}
5596
}
5697

5798
void AsyncGpuDevice::free(void* ptr) const { check_error(gpuFreeAsync(ptr, _stream)); }

madspace/src/gpu/device.h

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -74,12 +74,38 @@ class GpuDevice : public Device {
7474
int _index;
7575
};
7676

77+
// Arena-style allocator that packs a pre-planned sequence of GPU tensor
// allocations into a few large parent tensors ("pools"), replayed in the
// order the plan was recorded.
class MemPool {
public:
    // One planned allocation: which pool it lives in, its per-batch-element
    // byte size, and its per-batch-element byte offset within the pool.
    struct AllocItem {
        std::size_t pool_index;
        std::size_t size_factor;
        std::size_t offset;
    };

    MemPool(const SizeVec pool_factors, const std::vector<AllocItem>& allocs);
    // Returns the carved-out pointer plus the parent tensor that owns it.
    std::pair<void*, Tensor>
    allocate(std::size_t size, const GpuDevice& device, gpuStream_t stream);

private:
    // One backing buffer; parent_tensor is allocated lazily on first use,
    // at which point batch_size is fixed for the pool's lifetime.
    struct PoolItem {
        std::size_t size_factor;
        std::size_t batch_size;
        Tensor parent_tensor;
    };

    std::vector<AllocItem> _allocs;
    std::vector<PoolItem> _pools;
    std::size_t _alloc_index = 0;  // next entry of _allocs to hand out
};
100+
77101
class AsyncGpuDevice {
78102
public:
79-
AsyncGpuDevice(const GpuDevice& device, gpuStream_t stream) :
80-
_device(device), _stream(stream) {}
103+
AsyncGpuDevice(
104+
const GpuDevice& device, gpuStream_t stream, MemPool* mem_pool = nullptr
105+
) :
106+
_device(device), _stream(stream), _mem_pool(mem_pool) {}
81107

82-
void* allocate(std::size_t size) const;
108+
std::pair<void*, Tensor> allocate(std::size_t size) const;
83109
void free(void* ptr) const;
84110
void memcpy(void* to, void* from, std::size_t size) const;
85111

@@ -94,6 +120,7 @@ class AsyncGpuDevice {
94120
private:
95121
const GpuDevice& _device;
96122
gpuStream_t _stream;
123+
MemPool* _mem_pool;
97124
};
98125

99126
extern "C" int device_count();

madspace/src/gpu/runtime.cu

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -933,6 +933,42 @@ private:
933933
std::vector<bool> _sync_matrix;
934934
}
935935

936+
struct MemPoolTracker {
937+
void allocate(Type type, std::size_t group_index) {
938+
auto& pool = pools[{type.batch_size, group_index}];
939+
if (!pool.initialized) {
940+
pool.index = pools.size() - 1;
941+
}
942+
std::size_t size_factor;
943+
switch (type.dtype) {
944+
case DataType::dt_int:
945+
size_factor = sizeof(me_int_t);
946+
case DataType::dt_float:
947+
size_factor = sizeof(double);
948+
default:
949+
throw std::logic_error("invalid data type");
950+
}
951+
for (std::size_t size : type.shape) {
952+
size_factor *= size;
953+
}
954+
allocs.push_back({
955+
.pool_index = pool.index,
956+
.size_factor = size_factor,
957+
.offset = pool.total_size,
958+
});
959+
pool.total_size += size_factor;
960+
}
961+
962+
using PoolKey = std::pair<BatchSize, std::size_t>;
963+
struct PoolData {
964+
std::size_t index;
965+
std::size_t total_size = 0;
966+
bool initialized = false;
967+
};
968+
std::map<PoolKey, PoolData> pools;
969+
std::vector<MemPool::AllocItem> allocs;
970+
};
971+
936972
} // namespace
937973

938974
GpuRuntime::GpuRuntime(const Function& function, ContextPtr context) :

madspace/src/gpu/runtime.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,10 @@ class GpuRuntime : public Runtime {
4343
gpurandGenerator_t gpurand_generator() { return _gpurand_generator.get(); }
4444

4545
private:
46-
std::vector<Instruction> _instructions;
46+
void
47+
48+
std::vector<Instruction>
49+
_instructions;
4750
SizeVec _output_indices;
4851
std::size_t _input_count;
4952
TensorVec _locals_init;

madspace/src/kernels/operations.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#pragma once
22

3+
#include "madspace/madcode/function.h"
34
#include "madspace/runtime/tensor.h"
45
#include "madspace/util.h"
56

@@ -28,6 +29,12 @@ void batch_foreach(const I& instruction, TensorVec& locals, D& device) {
2829
foreach_func(inputs, outputs, batch_size, device);
2930
}
3031

32+
// Record the output allocations of one instruction call in the pool tracker.
// BUG FIX: MemPoolTracker::allocate takes (type, group_index); the original
// call passed only the type and could not compile. All outputs of a single
// call are placed in group 0 here — assumed correct for ungrouped
// instructions; confirm the intended grouping against the runtime planner.
void memory_batch_foreach(MemPoolTracker& mpt, const InstructionCall& instruction) {
    for (auto& output : instruction.outputs) {
        mpt.allocate(output.type, 0);
    }
}
37+
3138
template <
3239
auto foreach_func,
3340
int n_in,

madspace/src/madcode/type.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
#include "madspace/madcode/type.h"
22
#include "madspace/util.h"
33

4+
#include <sstream>
5+
46
using namespace madspace;
57
using json = nlohmann::json;
68

@@ -97,6 +99,12 @@ std::ostream& madspace::operator<<(std::ostream& out, const DataType& dtype) {
9799
return out;
98100
}
99101

102+
// Render this batch size as a string by reusing operator<<, so the string
// form and the stream form can never diverge.
std::string BatchSize::to_string() const {
    std::ostringstream stream;
    stream << *this;
    return stream.str();
}
107+
100108
std::ostream& madspace::operator<<(std::ostream& out, const BatchSize& batch_size) {
101109
std::visit(
102110
Overloaded{

0 commit comments

Comments
 (0)