Skip to content

Commit 8220425

Browse files
committed
Fix: finalize onboard resources before device reset
- move rtDeviceReset to finalize after kernel args, uploaded binaries, profiling collectors, dump collectors, and tracked device allocations are released
- introduce attach_current_thread, prepare_run_context, and release_run_context to separate per-thread device binding from run-scoped stream lifecycle
- keep run_runtime cleanup lightweight by releasing only run-scoped streams while preserving session resources for explicit finalize
- simplify MemoryAllocator finalize semantics by relying on tracked-pointer set emptiness instead of a separate finalized flag
1 parent 6800c38 commit 8220425

12 files changed

Lines changed: 150 additions & 105 deletions

File tree

src/a2a3/platform/include/host/memory_allocator.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,6 @@ class MemoryAllocator {
9999

100100
private:
101101
std::set<void *> ptr_set_;
102-
bool finalized_{false};
103102
};
104103

105104
#endif // PLATFORM_MEMORY_ALLOCATOR_H_

src/a2a3/platform/onboard/host/device_runner.cpp

Lines changed: 45 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -236,8 +236,8 @@ std::thread DeviceRunner::create_thread(std::function<void()> fn) {
236236
int DeviceRunner::ensure_device_initialized(
237237
int device_id, const std::vector<uint8_t> &aicpu_so_binary, const std::vector<uint8_t> &aicore_kernel_binary
238238
) {
239-
// First ensure device is set and streams are created
240-
int rc = ensure_device_set(device_id);
239+
// First attach the current thread and create fresh run-scoped streams
240+
int rc = prepare_run_context(device_id);
241241
if (rc != 0) {
242242
return rc;
243243
}
@@ -246,20 +246,41 @@ int DeviceRunner::ensure_device_initialized(
246246
return ensure_binaries_loaded(aicpu_so_binary, aicore_kernel_binary);
247247
}
248248

249-
int DeviceRunner::ensure_device_set(int device_id) {
250-
// Always set device for the calling thread (CANN device context is per-thread)
249+
int DeviceRunner::attach_current_thread(int device_id) {
250+
if (device_id < 0) {
251+
LOG_ERROR("Invalid device_id: %d", device_id);
252+
return -1;
253+
}
254+
if (device_id_ != -1 && device_id_ != device_id) {
255+
LOG_ERROR(
256+
"DeviceRunner already initialized on device %d; reset/finalize before switching to device %d", device_id_,
257+
device_id
258+
);
259+
return -1;
260+
}
261+
262+
// CANN device context is per-thread, so every caller must attach explicitly.
251263
int rc = rtSetDevice(device_id);
252264
if (rc != 0) {
253265
LOG_ERROR("rtSetDevice(%d) failed: %d", device_id, rc);
254266
return rc;
255267
}
256268

257-
// Create streams only on first call
258-
if (stream_aicpu_ != nullptr) {
269+
device_id_ = device_id;
270+
return 0;
271+
}
272+
273+
int DeviceRunner::prepare_run_context(int device_id) {
274+
int rc = attach_current_thread(device_id);
275+
if (rc != 0) {
276+
return rc;
277+
}
278+
279+
if (stream_aicpu_ != nullptr && stream_aicore_ != nullptr) {
259280
return 0;
260281
}
261282

262-
device_id_ = device_id;
283+
release_run_context();
263284

264285
// Create streams
265286
rc = rtStreamCreate(&stream_aicpu_, 0);
@@ -280,7 +301,7 @@ int DeviceRunner::ensure_device_set(int device_id) {
280301
return 0;
281302
}
282303

283-
void DeviceRunner::reset_device_context() {
304+
void DeviceRunner::release_run_context() {
284305
// Destroy streams (they belong to the current thread's CANN context)
285306
if (stream_aicpu_ != nullptr) {
286307
rtStreamDestroy(stream_aicpu_);
@@ -290,7 +311,6 @@ void DeviceRunner::reset_device_context() {
290311
rtStreamDestroy(stream_aicore_);
291312
stream_aicore_ = nullptr;
292313
}
293-
rtDeviceReset(device_id_);
294314
}
295315

296316
int DeviceRunner::ensure_binaries_loaded(
@@ -629,10 +649,18 @@ void DeviceRunner::print_handshake_results() {
629649
}
630650

631651
int DeviceRunner::finalize() {
632-
if (stream_aicpu_ == nullptr) {
652+
if (device_id_ == -1) {
633653
return 0;
634654
}
635655

656+
int rc = attach_current_thread(device_id_);
657+
if (rc != 0) {
658+
LOG_ERROR("Failed to attach finalize thread to device %d: %d", device_id_, rc);
659+
return rc;
660+
}
661+
662+
release_run_context();
663+
636664
// Cleanup kernel args (deviceArgs)
637665
kernel_args_.finalize_device_args();
638666

@@ -652,16 +680,6 @@ int DeviceRunner::finalize() {
652680
func_id_to_addr_.clear();
653681
binaries_loaded_ = false;
654682

655-
// Destroy streams
656-
if (stream_aicpu_ != nullptr) {
657-
rtStreamDestroy(stream_aicpu_);
658-
stream_aicpu_ = nullptr;
659-
}
660-
if (stream_aicore_ != nullptr) {
661-
rtStreamDestroy(stream_aicore_);
662-
stream_aicore_ = nullptr;
663-
}
664-
665683
// Cleanup performance profiling
666684
if (perf_collector_.is_initialized()) {
667685
auto unregister_cb = [](void *dev_ptr, int device_id) -> int {
@@ -700,7 +718,14 @@ int DeviceRunner::finalize() {
700718
// Free all remaining allocations (including handshake buffer and binGmAddr)
701719
mem_alloc_.finalize();
702720

721+
rc = rtDeviceReset(device_id_);
722+
if (rc != 0) {
723+
LOG_ERROR("rtDeviceReset(%d) failed during finalize: %d", device_id_, rc);
724+
return rc;
725+
}
726+
703727
device_id_ = -1;
728+
block_dim_ = 0;
704729
worker_count_ = 0;
705730
aicore_kernel_binary_.clear();
706731

src/a2a3/platform/onboard/host/device_runner.h

Lines changed: 25 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -189,8 +189,7 @@ class DeviceRunner {
189189

190190
/**
191191
* Create a thread bound to this device.
192-
* The thread calls rtSetDevice(device_id) on entry
193-
* and rtDeviceReset(device_id) on exit.
192+
* The thread calls rtSetDevice(device_id) on entry.
194193
*/
195194
std::thread create_thread(std::function<void()> fn);
196195

@@ -325,7 +324,7 @@ class DeviceRunner {
325324
/**
326325
* Upload a kernel binary to device memory
327326
*
328-
* IMPORTANT: ensure_device_set() must be called before this function.
327+
* IMPORTANT: prepare_run_context() must be called before this function.
329328
* Kernels are immediately copied to device memory.
330329
*
331330
* Receives pre-extracted .text section binary data,
@@ -354,24 +353,35 @@ class DeviceRunner {
354353
void remove_kernel_binary(int func_id);
355354

356355
/**
357-
* Ensure device is set and streams are created (minimal initialization)
356+
* Attach the current host thread to the target device.
358357
*
359-
* This is called by set_device() C API to enable memory allocation
360-
* before init_runtime(). Only performs:
361-
* - rtSetDevice(device_id)
362-
* - Create AICPU and AICore streams
358+
* This is required before host-side runtime initialization may allocate or
359+
* free device memory on the current thread. No streams are created here.
363360
*
364361
* @param device_id Device ID (0-15)
365362
* @return 0 on success, error code on failure
366363
*/
367-
int ensure_device_set(int device_id);
364+
int attach_current_thread(int device_id);
368365

369366
/**
370-
* Reset per-thread CANN device context and clear cached streams.
371-
* Called after each run_runtime() completes so the next run on a
372-
* fresh thread can recreate streams in its own context.
367+
* Ensure the current thread has fresh run-scoped streams.
368+
*
369+
* This attaches the current thread to the target device and lazily creates
370+
* the AICPU/AICore streams used by a single run.
371+
*
372+
* @param device_id Device ID (0-15)
373+
* @return 0 on success, error code on failure
374+
*/
375+
int prepare_run_context(int device_id);
376+
377+
/**
378+
* Release run-scoped resources owned by the current thread.
379+
*
380+
* This destroys AICPU/AICore streams but intentionally preserves device
381+
* allocations, uploaded binaries, and other session state so they can be
382+
* finalized later before rtDeviceReset().
373383
*/
374-
void reset_device_context();
384+
void release_run_context();
375385

376386
private:
377387
// Internal state
@@ -405,7 +415,7 @@ class DeviceRunner {
405415
* Ensure device is initialized (lazy initialization)
406416
*
407417
* Checks if device is already initialized. If not, performs:
408-
* - rtSetDevice(device_id)
418+
* - Attach the current thread to the device
409419
* - Create AICPU and AICore streams
410420
* - Load AICPU SO to device memory
411421
* - Initialize device args
@@ -422,7 +432,7 @@ class DeviceRunner {
422432
/**
423433
* Load AICPU SO and initialize device args
424434
*
425-
* Called by run() after ensure_device_set(). Performs:
435+
* Called by run() after prepare_run_context(). Performs:
426436
* - Load AICPU SO to device memory
427437
* - Initialize device args
428438
*

src/a2a3/platform/onboard/host/memory_allocator.cpp

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -61,11 +61,6 @@ int MemoryAllocator::free(void *ptr) {
6161
}
6262

6363
int MemoryAllocator::finalize() {
64-
// Idempotent - safe to call multiple times
65-
if (finalized_) {
66-
return 0;
67-
}
68-
6964
int last_error = 0;
7065

7166
// Free all remaining tracked pointers
@@ -77,9 +72,8 @@ int MemoryAllocator::finalize() {
7772
}
7873
}
7974

80-
// Clear the set
75+
// Clear the set (empty set makes subsequent finalize() calls a no-op)
8176
ptr_set_.clear();
82-
finalized_ = true;
8377

8478
return last_error;
8579
}

src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -136,10 +136,10 @@ int run_runtime(
136136
});
137137

138138
try {
139-
int rc = runner->ensure_device_set(device_id);
139+
int rc = runner->attach_current_thread(device_id);
140140
if (rc != 0) return rc;
141-
auto device_guard = RAIIScopeGuard([runner]() {
142-
runner->reset_device_context();
141+
auto run_context_guard = RAIIScopeGuard([runner]() {
142+
runner->release_run_context();
143143
});
144144

145145
Runtime *r = new (runtime) Runtime();

src/a2a3/platform/sim/host/memory_allocator.cpp

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -54,19 +54,13 @@ int MemoryAllocator::free(void *ptr) {
5454
}
5555

5656
int MemoryAllocator::finalize() {
57-
// Idempotent - safe to call multiple times
58-
if (finalized_) {
59-
return 0;
60-
}
61-
6257
// Free all remaining tracked pointers
6358
for (void *ptr : ptr_set_) {
6459
std::free(ptr);
6560
}
6661

67-
// Clear the set
62+
// Clear the set (empty set makes subsequent finalize() calls a no-op)
6863
ptr_set_.clear();
69-
finalized_ = true;
7064

7165
return 0;
7266
}

src/a5/platform/include/host/memory_allocator.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,6 @@ class MemoryAllocator {
9999

100100
private:
101101
std::set<void *> ptr_set_;
102-
bool finalized_{false};
103102
};
104103

105104
#endif // PLATFORM_MEMORY_ALLOCATOR_H_

0 commit comments

Comments
 (0)