Skip to content

Commit 779f347

Browse files
committed
new memory management working
1 parent 7ec3faf commit 779f347

11 files changed

Lines changed: 169 additions & 74 deletions

File tree

madspace/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ if (CMAKE_BUILD_TYPE STREQUAL "Release")
3636
endif()
3737
endif()
3838

39+
add_compile_options(-fno-omit-frame-pointer)
40+
3941
########################################################################################
4042
# Load dependencies #
4143
########################################################################################

madspace/include/madspace/runtime/runtime_base.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,17 +9,17 @@ namespace madspace {
99
class Runtime {
1010
public:
1111
virtual ~Runtime() = default;
12-
virtual TensorVec run(const TensorVec& inputs) const = 0;
12+
virtual TensorVec run(const TensorVec& inputs) = 0;
1313
virtual std::tuple<TensorVec, TensorVec, std::vector<bool>> run_with_grad(
1414
const TensorVec& inputs, const std::vector<bool>& input_requires_grad
15-
) const = 0;
15+
) = 0;
1616
virtual std::
1717
tuple<TensorVec, std::vector<std::tuple<std::string, madspace::Tensor>>>
1818
run_backward(
1919
const TensorVec& output_grads,
2020
const TensorVec& stored_locals,
2121
const std::vector<bool>& eval_grad
22-
) const = 0;
22+
) = 0;
2323
friend std::unique_ptr<Runtime>
2424
build_runtime(const Function& function, ContextPtr context, bool concurrent);
2525

madspace/include/madspace/runtime/tensor.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -534,7 +534,7 @@ class Tensor {
534534
template <typename D>
535535
Tensor copy(const D& device, AllocHint hint = AllocHint::normal) const {
536536
check_impl();
537-
Tensor tensor(impl->dtype, impl->shape, impl->device, hint);
537+
Tensor tensor(impl->dtype, impl->shape, device, hint);
538538
device.tensor_copy(*this, tensor);
539539
return tensor;
540540
}
@@ -629,6 +629,7 @@ class Tensor {
629629
auto [data, parent] = device.allocate(size, hint);
630630
impl->data = data;
631631
if (parent) {
632+
parent.impl->incref();
632633
impl->owns_data = false;
633634
impl->data_owner = parent.impl;
634635
}

madspace/src/cpu/runtime.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -870,7 +870,7 @@ CpuRuntime::CpuRuntime(const Function& function, ContextPtr context, bool concur
870870
}
871871
}
872872

873-
TensorVec CpuRuntime::run(const TensorVec& inputs) const {
873+
TensorVec CpuRuntime::run(const TensorVec& inputs) {
874874
if (_concurrent && _context->thread_pool().thread_count() > 1) {
875875
auto [outputs, locals, eval_grad] = run_concurrent(inputs, {}, false);
876876
return outputs;
@@ -881,7 +881,7 @@ TensorVec CpuRuntime::run(const TensorVec& inputs) const {
881881

882882
std::tuple<TensorVec, TensorVec, std::vector<bool>> CpuRuntime::run_with_grad(
883883
const TensorVec& inputs, const std::vector<bool>& input_requires_grad
884-
) const {
884+
) {
885885
if (_concurrent && _context->thread_pool().thread_count() > 1) {
886886
return run_concurrent(inputs, input_requires_grad, true);
887887
} else {
@@ -894,7 +894,7 @@ CpuRuntime::run_backward(
894894
const TensorVec& output_grads,
895895
const TensorVec& stored_locals,
896896
const std::vector<bool>& eval_grad
897-
) const {
897+
) {
898898
if (_concurrent && _context->thread_pool().thread_count() > 1) {
899899
return run_backward_concurrent(output_grads, stored_locals, eval_grad);
900900
} else {

madspace/src/cpu/runtime.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,15 +34,15 @@ class CpuRuntime : public Runtime {
3434

3535
CpuRuntime(const Function& function, ContextPtr context, bool concurrent);
3636

37-
TensorVec run(const TensorVec& inputs) const override;
37+
TensorVec run(const TensorVec& inputs) override;
3838
std::tuple<TensorVec, TensorVec, std::vector<bool>> run_with_grad(
3939
const TensorVec& inputs, const std::vector<bool>& input_requires_grad
40-
) const override;
40+
) override;
4141
std::tuple<TensorVec, std::vector<std::tuple<std::string, Tensor>>> run_backward(
4242
const TensorVec& output_grads,
4343
const TensorVec& stored_locals,
4444
const std::vector<bool>& eval_grad
45-
) const override;
45+
) override;
4646

4747
Context& context() { return *_context; }
4848
std::mt19937& rand_gen() { return _rand_gens.get(); }

madspace/src/gpu/device.cu

Lines changed: 72 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ using namespace madspace;
66
using namespace madspace::gpu;
77
using namespace madspace::kernels;
88

9-
std::pair<void*, Tensor> GpuDevice::allocate(std::size_t size) const {
9+
std::pair<void*, Tensor> GpuDevice::allocate(std::size_t size, AllocHint hint) const {
1010
activate();
1111
void* ptr;
1212
check_error(gpuMalloc(&ptr, size));
@@ -25,19 +25,19 @@ void GpuDevice::memcpy(void* to, void* from, std::size_t size) const {
2525

2626
void GpuDevice::tensor_copy(const Tensor& source, Tensor& target) const {
2727
activate();
28-
AsyncGpuDevice(*this, gpuStreamPerThread).tensor_copy(source, target);
28+
AsyncGpuDevice(*this, gpuStreamPerThread, 0).tensor_copy(source, target);
2929
check_error(gpuStreamSynchronize(gpuStreamPerThread));
3030
}
3131

3232
void GpuDevice::tensor_zero(Tensor& tensor) const {
3333
activate();
34-
AsyncGpuDevice(*this, gpuStreamPerThread).tensor_zero(tensor);
34+
AsyncGpuDevice(*this, gpuStreamPerThread, 0).tensor_zero(tensor);
3535
check_error(gpuStreamSynchronize(gpuStreamPerThread));
3636
}
3737

3838
void GpuDevice::tensor_add(const Tensor& source, Tensor& target) const {
3939
activate();
40-
AsyncGpuDevice(*this, gpuStreamPerThread).tensor_add(source, target);
40+
AsyncGpuDevice(*this, gpuStreamPerThread, 0).tensor_add(source, target);
4141
check_error(gpuStreamSynchronize(gpuStreamPerThread));
4242
}
4343

@@ -65,8 +65,9 @@ MemPool::MemPool(
6565
auto& pool = _pools.at(pool_index);
6666
std::size_t word_count = (size + 7) / 8;
6767
pool.parent_tensor = Tensor(DataType::dt_float, {word_count}, device);
68-
pool.size = word_count * 8;
68+
pool.capacity = word_count * 8;
6969
pool.needed_size = word_count * 8;
70+
//println("create pool {} {}", pool_index, pool.size);
7071
}
7172
}
7273

@@ -83,22 +84,29 @@ MemPool::~MemPool() {
8384

8485
std::pair<void*, Tensor> MemPool::allocate(std::size_t pool_index, std::size_t size) {
8586
if (pool_index >= _pools.size()) {
86-
_pools.resize(pool_index);
87+
_pools.resize(pool_index + 1);
8788
}
8889
PoolItem& pool = _pools.at(pool_index);
8990
if (auto search = pool.free_pointers.find(size);
9091
search != pool.free_pointers.end()) {
91-
std::pair<void*, Tensor> ret = *search->second;
92+
std::pair<void*, Tensor> ret = search->second;
93+
_allocs[ret.first] = {
94+
.pool_index = pool_index,
95+
.size = size,
96+
.parent_tensor = ret.second,
97+
};
98+
//println("reuse {} {} {}", ret.first, pool_index, size);
9299
pool.free_pointers.erase(search);
93100
return ret;
94-
} else if (pool.capacity - pool.size >= size) {
101+
} else if (pool.parent_tensor && pool.capacity - pool.size >= size) {
95102
void* ptr = &static_cast<uint8_t*>(pool.parent_tensor.data())[pool.size];
96103
pool.size = (pool.size + size + 7) / 8 * 8;
97104
_allocs[ptr] = {
98105
.pool_index = pool_index,
99106
.size = size,
100107
.parent_tensor = pool.parent_tensor,
101108
};
109+
//println("pooled {} {} {} {} {}", ptr, pool_index, size, pool.size, pool.capacity);
102110
return {ptr, pool.parent_tensor};
103111
} else {
104112
void* ptr;
@@ -108,25 +116,28 @@ std::pair<void*, Tensor> MemPool::allocate(std::size_t pool_index, std::size_t s
108116
.size = size,
109117
.parent_tensor = Tensor(),
110118
};
119+
//println("alloc {} {} {}", ptr, pool_index, size);
111120
pool.needed_size += (size + 7) / 8 * 8;
112121
return {ptr, Tensor()};
113122
}
114123
}
115124

116125
void MemPool::free(void* ptr) {
117-
auto search = _allocs.find(ptr) if (search == _allocs.end()) {
126+
auto search = _allocs.find(ptr);
127+
if (search == _allocs.end()) {
118128
throw std::runtime_error("address was not allocated using this pool");
119129
}
120130
auto& alloc = search->second;
121131
_pools.at(alloc.pool_index)
122-
.free_pointers.emplace(alloc.size, {ptr, alloc.parent_tensor});
132+
.free_pointers.emplace(alloc.size, std::pair<void*, Tensor>{ptr, alloc.parent_tensor});
133+
//println("free {} {} {}", ptr, alloc.pool_index, alloc.size);
123134
_allocs.erase(search);
124135
}
125136

126137
std::vector<std::pair<std::size_t, std::size_t>> MemPool::total_sizes() const {
127138
std::vector<std::pair<std::size_t, std::size_t>> ret;
128139
ret.reserve(_pools.size());
129-
for (std::size_t index = 0; PoolItem& pool : _pools) {
140+
for (std::size_t index = 0; auto& pool : _pools) {
130141
if (pool.needed_size > 0) {
131142
ret.push_back({index, pool.needed_size});
132143
}
@@ -137,17 +148,46 @@ std::vector<std::pair<std::size_t, std::size_t>> MemPool::total_sizes() const {
137148

138149
std::pair<void*, Tensor>
139150
AsyncGpuDevice::allocate(std::size_t size, AllocHint hint) const {
140-
if (_mem_pool != nullptr && hint != AllocHint::normal) {
141-
return _mem_pool->allocate(static_cast<std::size_t>(hint) - 1, size);
151+
if (_mem_pool) {
152+
std::size_t pool_index;
153+
switch (hint) {
154+
case AllocHint::normal:
155+
throw std::runtime_error("allocation without hint");
156+
case AllocHint::output:
157+
pool_index = 0;
158+
break;
159+
case AllocHint::local:
160+
pool_index = 3 + 3 * _stream_index;
161+
break;
162+
case AllocHint::temporary:
163+
pool_index = 4 + 3 * _stream_index;
164+
break;
165+
case AllocHint::input_grad:
166+
pool_index = 1;
167+
break;
168+
case AllocHint::local_grad:
169+
pool_index = 5 + 3 * _stream_index;
170+
break;
171+
case AllocHint::global_grad:
172+
pool_index = 2;
173+
break;
174+
}
175+
return _mem_pool->allocate(pool_index, size);
142176
} else {
143-
_device.allocate(size, hint);
144-
// void* ptr;
145-
// check_error(gpuMallocAsync(&ptr, size, _stream));
146-
// return {ptr, Tensor()};
177+
//_device.allocate(size, hint);
178+
void* ptr;
179+
check_error(gpuMallocAsync(&ptr, size, _stream));
180+
return {ptr, Tensor()};
147181
}
148182
}
149183

150-
void AsyncGpuDevice::free(void* ptr) const { check_error(gpuFreeAsync(ptr, _stream)); }
184+
void AsyncGpuDevice::free(void* ptr) const {
185+
if (_mem_pool) {
186+
_mem_pool->free(ptr);
187+
} else {
188+
check_error(gpuFreeAsync(ptr, _stream));
189+
}
190+
}
151191

152192
void AsyncGpuDevice::memcpy(void* to, void* from, std::size_t size) const {
153193
check_error(gpuMemcpyAsync(to, from, size, gpuMemcpyDefault, _stream));
@@ -170,13 +210,21 @@ void AsyncGpuDevice::tensor_copy(const Tensor& source, Tensor& target) const {
170210

171211
void AsyncGpuDevice::tensor_zero(Tensor& tensor) const {
172212
if (tensor.dtype() == DataType::dt_float) {
173-
tensor_foreach_dynamic<kernel_zero<GpuTypes>, 1, 1>(
174-
{&tensor}, {&tensor}, tensor.size(0), *this
175-
);
213+
if (tensor.is_contiguous()) {
214+
gpuMemsetAsync(tensor.data(), 0, tensor.byte_size(), _stream);
215+
} else {
216+
tensor_foreach_dynamic<kernel_zero<GpuTypes>, 1, 1>(
217+
{&tensor}, {&tensor}, tensor.size(0), *this
218+
);
219+
}
176220
} else if (tensor.dtype() == DataType::dt_int) {
177-
tensor_foreach_dynamic<kernel_zero_int<GpuTypes>, 1, 1>(
178-
{&tensor}, {&tensor}, tensor.size(0), *this
179-
);
221+
if (tensor.is_contiguous()) {
222+
gpuMemsetAsync(tensor.data(), 0, tensor.byte_size(), _stream);
223+
} else {
224+
tensor_foreach_dynamic<kernel_zero_int<GpuTypes>, 1, 1>(
225+
{&tensor}, {&tensor}, tensor.size(0), *this
226+
);
227+
}
180228
} else {
181229
throw std::runtime_error("invalid dtype in zero");
182230
}

madspace/src/gpu/device.h

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,8 @@ class GpuDevice : public Device {
3939
#else
4040
static constexpr DeviceType gpu_device_type = DeviceType::hip;
4141
#endif
42-
void* allocate(std::size_t size) const override;
42+
virtual std::pair<void*, Tensor>
43+
allocate(std::size_t size, AllocHint hint) const override;
4344
void free(void* ptr) const override;
4445
void memcpy(void* to, void* from, std::size_t size) const override;
4546

@@ -76,7 +77,7 @@ class GpuDevice : public Device {
7677

7778
class MemPool {
7879
public:
79-
MemPool(const std::vector<std::pair<std::size_t, std::size_t>>& cached_sizes);
80+
MemPool(const GpuDevice& device, const std::vector<std::pair<std::size_t, std::size_t>>& cached_sizes);
8081
~MemPool();
8182
std::pair<void*, Tensor> allocate(std::size_t pool_index, std::size_t size);
8283
void free(void* ptr);
@@ -103,9 +104,9 @@ class MemPool {
103104
class AsyncGpuDevice {
104105
public:
105106
AsyncGpuDevice(
106-
const GpuDevice& device, gpuStream_t stream, MemPool* mem_pool = nullptr
107+
const GpuDevice& device, gpuStream_t stream, std::size_t stream_index, MemPool* mem_pool = nullptr
107108
) :
108-
_device(device), _stream(stream), _mem_pool(mem_pool) {}
109+
_device(device), _stream(stream), _stream_index(stream_index), _mem_pool(mem_pool) {}
109110

110111
std::pair<void*, Tensor> allocate(std::size_t size, AllocHint hint) const;
111112
void free(void* ptr) const;
@@ -122,6 +123,7 @@ class AsyncGpuDevice {
122123
private:
123124
const GpuDevice& _device;
124125
gpuStream_t _stream;
126+
std::size_t _stream_index;
125127
MemPool* _mem_pool;
126128
};
127129

madspace/src/gpu/gpu_abstraction.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#define gpuMemcpy cudaMemcpy
1616
#define gpuMemcpyDefault cudaMemcpyDefault
1717
#define gpuMemcpyAsync cudaMemcpyAsync
18+
#define gpuMemsetAsync cudaMemsetAsync
1819
#define gpuStreamPerThread cudaStreamPerThread
1920
#define gpuStreamSynchronize cudaStreamSynchronize
2021
#define gpuStream_t cudaStream_t
@@ -70,6 +71,7 @@
7071
#define gpuMemcpy hipMemcpy
7172
#define gpuMemcpyDefault hipMemcpyDefault
7273
#define gpuMemcpyAsync hipMemcpyAsync
74+
#define gpuMemsetAsync hipMemsetAsync
7375
#define gpuStreamPerThread hipStreamPerThread
7476
#define gpuStreamSynchronize hipStreamSynchronize
7577
#define gpuStream_t hipStream_t

0 commit comments

Comments (0)