Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion src/a2a3/platform/include/host/memory_allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,6 @@ class MemoryAllocator {

private:
std::set<void *> ptr_set_;
bool finalized_{false};
};

#endif // PLATFORM_MEMORY_ALLOCATOR_H_
67 changes: 46 additions & 21 deletions src/a2a3/platform/onboard/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -236,8 +236,8 @@ std::thread DeviceRunner::create_thread(std::function<void()> fn) {
int DeviceRunner::ensure_device_initialized(
int device_id, const std::vector<uint8_t> &aicpu_so_binary, const std::vector<uint8_t> &aicore_kernel_binary
) {
// First ensure device is set and streams are created
int rc = ensure_device_set(device_id);
// First attach the current thread and make sure run-scoped streams exist (created only if missing)
int rc = prepare_run_context(device_id);
if (rc != 0) {
return rc;
}
Expand All @@ -246,20 +246,41 @@ int DeviceRunner::ensure_device_initialized(
return ensure_binaries_loaded(aicpu_so_binary, aicore_kernel_binary);
}

int DeviceRunner::ensure_device_set(int device_id) {
// Always set device for the calling thread (CANN device context is per-thread)
int DeviceRunner::attach_current_thread(int device_id) {
if (device_id < 0) {
LOG_ERROR("Invalid device_id: %d", device_id);
return -1;
}
if (device_id_ != -1 && device_id_ != device_id) {
LOG_ERROR(
"DeviceRunner already initialized on device %d; reset/finalize before switching to device %d", device_id_,
device_id
);
return -1;
}

// CANN device context is per-thread, so every caller must attach explicitly.
int rc = rtSetDevice(device_id);
if (rc != 0) {
LOG_ERROR("rtSetDevice(%d) failed: %d", device_id, rc);
return rc;
}

// Create streams only on first call
if (stream_aicpu_ != nullptr) {
device_id_ = device_id;
return 0;
}

int DeviceRunner::prepare_run_context(int device_id) {
int rc = attach_current_thread(device_id);
if (rc != 0) {
return rc;
}

if (stream_aicpu_ != nullptr && stream_aicore_ != nullptr) {
return 0;
}

device_id_ = device_id;
release_run_context();

// Create streams
rc = rtStreamCreate(&stream_aicpu_, 0);
Expand All @@ -280,7 +301,7 @@ int DeviceRunner::ensure_device_set(int device_id) {
return 0;
}

void DeviceRunner::reset_device_context() {
void DeviceRunner::release_run_context() {
// Destroy streams (they belong to the current thread's CANN context)
if (stream_aicpu_ != nullptr) {
rtStreamDestroy(stream_aicpu_);
Expand All @@ -290,7 +311,6 @@ void DeviceRunner::reset_device_context() {
rtStreamDestroy(stream_aicore_);
stream_aicore_ = nullptr;
}
rtDeviceReset(device_id_);
}

int DeviceRunner::ensure_binaries_loaded(
Expand Down Expand Up @@ -629,10 +649,18 @@ void DeviceRunner::print_handshake_results() {
}

int DeviceRunner::finalize() {
if (stream_aicpu_ == nullptr) {
if (device_id_ == -1) {
return 0;
}

int rc = attach_current_thread(device_id_);
if (rc != 0) {
LOG_ERROR("Failed to attach finalize thread to device %d: %d", device_id_, rc);
return rc;
}

release_run_context();

// Cleanup kernel args (deviceArgs)
kernel_args_.finalize_device_args();

Expand All @@ -652,16 +680,6 @@ int DeviceRunner::finalize() {
func_id_to_addr_.clear();
binaries_loaded_ = false;

// Destroy streams
if (stream_aicpu_ != nullptr) {
rtStreamDestroy(stream_aicpu_);
stream_aicpu_ = nullptr;
}
if (stream_aicore_ != nullptr) {
rtStreamDestroy(stream_aicore_);
stream_aicore_ = nullptr;
}

// Cleanup performance profiling
if (perf_collector_.is_initialized()) {
auto unregister_cb = [](void *dev_ptr, int device_id) -> int {
Expand Down Expand Up @@ -700,7 +718,14 @@ int DeviceRunner::finalize() {
// Free all remaining allocations (including handshake buffer and binGmAddr)
mem_alloc_.finalize();

rc = rtDeviceReset(device_id_);
if (rc != 0) {
LOG_ERROR("rtDeviceReset(%d) failed during finalize: %d", device_id_, rc);
return rc;
}

device_id_ = -1;
block_dim_ = 0;
worker_count_ = 0;
aicore_kernel_binary_.clear();

Expand Down Expand Up @@ -787,7 +812,7 @@ uint64_t DeviceRunner::upload_kernel_binary(int func_id, const uint8_t *bin_data

// Run context must be prepared first (prepare_run_context() must be called before upload_kernel_binary())
if (stream_aicpu_ == nullptr) {
LOG_ERROR("Device not set. Call set_device() before upload_kernel_binary()");
LOG_ERROR("Run context not prepared before upload_kernel_binary()");
return 0;
}

Expand Down
40 changes: 25 additions & 15 deletions src/a2a3/platform/onboard/host/device_runner.h
Original file line number Diff line number Diff line change
Expand Up @@ -189,8 +189,7 @@ class DeviceRunner {

/**
* Create a thread bound to this device.
* The thread calls rtSetDevice(device_id) on entry
* and rtDeviceReset(device_id) on exit.
* The thread calls rtSetDevice(device_id) on entry.
*/
std::thread create_thread(std::function<void()> fn);

Expand Down Expand Up @@ -325,7 +324,7 @@ class DeviceRunner {
/**
* Upload a kernel binary to device memory
*
* IMPORTANT: ensure_device_set() must be called before this function.
* IMPORTANT: prepare_run_context() must be called before this function.
* Kernels are immediately copied to device memory.
*
* Receives pre-extracted .text section binary data,
Expand Down Expand Up @@ -354,24 +353,35 @@ class DeviceRunner {
void remove_kernel_binary(int func_id);

/**
* Ensure device is set and streams are created (minimal initialization)
* Attach the current host thread to the target device.
*
* This is called by set_device() C API to enable memory allocation
* before init_runtime(). Only performs:
* - rtSetDevice(device_id)
* - Create AICPU and AICore streams
* This is required before host-side runtime initialization may allocate or
* free device memory on the current thread. No streams are created here.
*
* @param device_id Device ID (0-15)
* @return 0 on success, error code on failure
*/
int ensure_device_set(int device_id);
int attach_current_thread(int device_id);

/**
* Reset per-thread CANN device context and clear cached streams.
* Called after each run_runtime() completes so the next run on a
* fresh thread can recreate streams in its own context.
* Ensure the current thread has run-scoped streams, creating them if they do not exist yet.
*
* This attaches the current thread to the target device and lazily creates
* the AICPU/AICore streams used by a single run.
*
* @param device_id Device ID (0-15)
* @return 0 on success, error code on failure
*/
int prepare_run_context(int device_id);

/**
* Release run-scoped resources owned by the current thread.
*
* This destroys AICPU/AICore streams but intentionally preserves device
* allocations, uploaded binaries, and other session state so they can be
* finalized later before rtDeviceReset().
*/
void reset_device_context();
void release_run_context();

private:
// Internal state
Expand Down Expand Up @@ -405,7 +415,7 @@ class DeviceRunner {
* Ensure device is initialized (lazy initialization)
*
* Checks if device is already initialized. If not, performs:
* - rtSetDevice(device_id)
* - Attach the current thread to the device
* - Create AICPU and AICore streams
* - Load AICPU SO to device memory
* - Initialize device args
Expand All @@ -422,7 +432,7 @@ class DeviceRunner {
/**
* Load AICPU SO and initialize device args
*
* Called by run() after ensure_device_set(). Performs:
* Called by run() after prepare_run_context(). Performs:
* - Load AICPU SO to device memory
* - Initialize device args
*
Expand Down
8 changes: 1 addition & 7 deletions src/a2a3/platform/onboard/host/memory_allocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,11 +61,6 @@ int MemoryAllocator::free(void *ptr) {
}

int MemoryAllocator::finalize() {
// Idempotent - safe to call multiple times
if (finalized_) {
return 0;
}

int last_error = 0;

// Free all remaining tracked pointers
Expand All @@ -77,9 +72,8 @@ int MemoryAllocator::finalize() {
}
}

// Clear the set
// Clear the set (empty set makes subsequent finalize() calls a no-op)
ptr_set_.clear();
finalized_ = true;

return last_error;
}
6 changes: 3 additions & 3 deletions src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,10 +136,10 @@ int run_runtime(
});

try {
int rc = runner->ensure_device_set(device_id);
int rc = runner->prepare_run_context(device_id);
if (rc != 0) return rc;
auto device_guard = RAIIScopeGuard([runner]() {
runner->reset_device_context();
auto run_context_guard = RAIIScopeGuard([runner]() {
runner->release_run_context();
});

Runtime *r = new (runtime) Runtime();
Expand Down
8 changes: 1 addition & 7 deletions src/a2a3/platform/sim/host/memory_allocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,19 +54,13 @@ int MemoryAllocator::free(void *ptr) {
}

int MemoryAllocator::finalize() {
// Idempotent - safe to call multiple times
if (finalized_) {
return 0;
}

// Free all remaining tracked pointers
for (void *ptr : ptr_set_) {
std::free(ptr);
}

// Clear the set
// Clear the set (empty set makes subsequent finalize() calls a no-op)
ptr_set_.clear();
finalized_ = true;

return 0;
}
1 change: 0 additions & 1 deletion src/a5/platform/include/host/memory_allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,6 @@ class MemoryAllocator {

private:
std::set<void *> ptr_set_;
bool finalized_{false};
};

#endif // PLATFORM_MEMORY_ALLOCATOR_H_
Loading
Loading