@@ -236,8 +236,8 @@ std::thread DeviceRunner::create_thread(std::function<void()> fn) {
// Prepares the run context for device_id and then makes sure the kernel
// binaries are loaded.
//
// Parameters:
//   device_id            - device to attach to; passed to prepare_run_context.
//   aicpu_so_binary      - passed unchanged to ensure_binaries_loaded.
//   aicore_kernel_binary - passed unchanged to ensure_binaries_loaded.
//
// Returns the non-zero code from prepare_run_context if that step fails;
// otherwise the value returned by ensure_binaries_loaded.
//
// NOTE(review): the lines between the rc check and the final return are not
// visible in this excerpt, so any extra work done there is undocumented here.
236236int DeviceRunner::ensure_device_initialized (
237237 int device_id, const std::vector<uint8_t > &aicpu_so_binary, const std::vector<uint8_t > &aicore_kernel_binary
238238) {
239- // First ensure device is set and streams are created
240- int rc = ensure_device_set (device_id);
239+ // First attach the current thread and create fresh run-scoped streams
240+ int rc = prepare_run_context (device_id);
    // Propagate the failure code as-is; nothing else is attempted on error.
241241 if (rc != 0 ) {
242242 return rc;
243243 }
@@ -246,20 +246,41 @@ int DeviceRunner::ensure_device_initialized(
246246 return ensure_binaries_loaded (aicpu_so_binary, aicore_kernel_binary);
247247}
248248
249- int DeviceRunner::ensure_device_set (int device_id) {
250- // Always set device for the calling thread (CANN device context is per-thread)
// Binds the calling thread to device_id through rtSetDevice and records the
// id in device_id_.
//
// Validation performed before touching the runtime:
//   - a negative device_id is rejected;
//   - if device_id_ already holds a value other than -1, a different
//     device_id is rejected (the logged message asks for reset/finalize first).
//
// Returns:
//   -1  when either validation check fails,
//   rc  the non-zero code from rtSetDevice when that call fails,
//   0   on success.
//
// device_id_ is written only after rtSetDevice has succeeded, so a failed
// call leaves the previously recorded value untouched.
249+ int DeviceRunner::attach_current_thread (int device_id) {
250+ if (device_id < 0 ) {
251+ LOG_ERROR (" Invalid device_id: %d" , device_id);
252+ return -1 ;
253+ }
    // -1 is treated as the no-device-recorded-yet marker; any other recorded
    // value must match the requested device.
254+ if (device_id_ != -1 && device_id_ != device_id) {
255+ LOG_ERROR (
256+ " DeviceRunner already initialized on device %d; reset/finalize before switching to device %d" , device_id_,
257+ device_id
258+ );
259+ return -1 ;
260+ }
261+
262+ // CANN device context is per-thread, so every caller must attach explicitly.
251263 int rc = rtSetDevice (device_id);
252264 if (rc != 0 ) {
253265 LOG_ERROR (" rtSetDevice(%d) failed: %d" , device_id, rc);
254266 return rc;
255267 }
256268
257- // Create streams only on first call
258- if (stream_aicpu_ != nullptr ) {
    // Record the device only once the runtime has accepted it.
269+ device_id_ = device_id;
270+ return 0 ;
271+ }
272+
// Attaches the calling thread to device_id and makes sure the two run
// streams (stream_aicpu_ and stream_aicore_) are available.
//
// Steps visible here:
//   1. attach_current_thread(device_id); a non-zero code is returned as-is.
//   2. If both stream handles are already non-null, return 0 without
//      creating anything.
//   3. Otherwise call release_run_context() and then start creating the
//      streams with rtStreamCreate.
//
// Returns 0 on the paths shown, or the failure code from
// attach_current_thread.
//
// NOTE(review): when both streams already exist they are reused rather than
// recreated; confirm that reuse across calls is the intended behaviour.
// NOTE(review): the rest of the stream creation (everything after the first
// rtStreamCreate call) is not visible in this excerpt.
273+ int DeviceRunner::prepare_run_context (int device_id) {
274+ int rc = attach_current_thread (device_id);
275+ if (rc != 0 ) {
276+ return rc;
277+ }
278+
    // Fast path: both handles are set, so the existing streams are kept.
279+ if (stream_aicpu_ != nullptr && stream_aicore_ != nullptr ) {
259280 return 0 ;
260281 }
261282
262- device_id_ = device_id ;
    // Reached when at least one handle is null; drop whatever is left before
    // creating new streams.
283+ release_run_context () ;
263284
264285 // Create streams
265286 rc = rtStreamCreate (&stream_aicpu_, 0 );
@@ -280,7 +301,7 @@ int DeviceRunner::ensure_device_set(int device_id) {
280301 return 0 ;
281302}
282303
283- void DeviceRunner::reset_device_context () {
// Destroys the run streams held by this object.
//
// Each stream handle is checked against nullptr before rtStreamDestroy is
// called, so calling this with no streams is a no-op for that handle.
// stream_aicore_ is set back to nullptr after it is destroyed.
// The return value of rtStreamDestroy is not inspected.
//
// The device itself is not reset here; finalize() issues rtDeviceReset.
//
// NOTE(review): the lines between the two rtStreamDestroy calls are not
// visible in this excerpt; presumably stream_aicpu_ is cleared there in the
// same way as stream_aicore_ - confirm against the full file.
304+ void DeviceRunner::release_run_context () {
284305 // Destroy streams (they belong to the current thread's CANN context)
285306 if (stream_aicpu_ != nullptr ) {
286307 rtStreamDestroy (stream_aicpu_);
@@ -290,7 +311,6 @@ void DeviceRunner::reset_device_context() {
290311 rtStreamDestroy (stream_aicore_);
291312 stream_aicore_ = nullptr ;
292313 }
293- rtDeviceReset (device_id_);
294314}
295315
296316int DeviceRunner::ensure_binaries_loaded (
@@ -629,10 +649,18 @@ void DeviceRunner::print_handshake_results() {
629649}
630650
631651int DeviceRunner::finalize () {
632- if (stream_aicpu_ == nullptr ) {
652+ if (device_id_ == - 1 ) {
633653 return 0 ;
634654 }
635655
656+ int rc = attach_current_thread (device_id_);
657+ if (rc != 0 ) {
658+ LOG_ERROR (" Failed to attach finalize thread to device %d: %d" , device_id_, rc);
659+ return rc;
660+ }
661+
662+ release_run_context ();
663+
636664 // Cleanup kernel args (deviceArgs)
637665 kernel_args_.finalize_device_args ();
638666
@@ -652,16 +680,6 @@ int DeviceRunner::finalize() {
652680 func_id_to_addr_.clear ();
653681 binaries_loaded_ = false ;
654682
655- // Destroy streams
656- if (stream_aicpu_ != nullptr ) {
657- rtStreamDestroy (stream_aicpu_);
658- stream_aicpu_ = nullptr ;
659- }
660- if (stream_aicore_ != nullptr ) {
661- rtStreamDestroy (stream_aicore_);
662- stream_aicore_ = nullptr ;
663- }
664-
665683 // Cleanup performance profiling
666684 if (perf_collector_.is_initialized ()) {
667685 auto unregister_cb = [](void *dev_ptr, int device_id) -> int {
@@ -700,7 +718,14 @@ int DeviceRunner::finalize() {
700718 // Free all remaining allocations (including handshake buffer and binGmAddr)
701719 mem_alloc_.finalize ();
702720
721+ rc = rtDeviceReset (device_id_);
722+ if (rc != 0 ) {
723+ LOG_ERROR (" rtDeviceReset(%d) failed during finalize: %d" , device_id_, rc);
724+ return rc;
725+ }
726+
703727 device_id_ = -1 ;
728+ block_dim_ = 0 ;
704729 worker_count_ = 0 ;
705730 aicore_kernel_binary_.clear ();
706731
0 commit comments