Skip to content

Commit 1432557

Browse files
committed
fix: store Metal completion handler errors instead of throwing
Metal completion handlers run on dispatch queues where C++ exceptions cannot propagate — throwing causes std::terminate → SIGABRT, crashing the process with no diagnostic information. Instead, store the error message atomically in the CommandEncoder and check it at the next synchronous point (commit, synchronize). This converts fatal crashes into catchable runtime_error exceptions that the application can handle gracefully. Root cause analysis: the crash at 262K+ context reported as mlx#3216 was actually TWO separate issues: 1. Thread safety in stream management (fixed by PR #3281) 2. C++ exceptions thrown from Metal completion handler callbacks (fixed by this commit) The GPU watchdog error (kIOGPUCommandBufferCallbackErrorImpactingInteractivity) is a separate concern — macOS kills command buffers that block the GPU beyond the watchdog threshold. This commit ensures that error is reported as a Python RuntimeError instead of SIGABRT.
1 parent f35ce26 commit 1432557

3 files changed

Lines changed: 70 additions & 7 deletions

File tree

mlx/backend/metal/device.cpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -424,7 +424,28 @@ bool CommandEncoder::needs_commit() const {
424424
return (buffer_ops_ > max_ops) || ((buffer_sizes_ >> 20) > max_mb);
425425
}
426426

427+
void CommandEncoder::store_error(const std::string& msg) {
428+
std::lock_guard<std::mutex> lock(error_mtx_);
429+
if (!has_error_.load(std::memory_order_relaxed)) {
430+
error_msg_ = msg;
431+
has_error_.store(true, std::memory_order_release);
432+
}
433+
}
434+
435+
void CommandEncoder::check_stored_error() {
436+
if (has_error_.load(std::memory_order_acquire)) {
437+
std::string msg;
438+
{
439+
std::lock_guard<std::mutex> lock(error_mtx_);
440+
msg = std::move(error_msg_);
441+
has_error_.store(false, std::memory_order_release);
442+
}
443+
throw std::runtime_error(msg);
444+
}
445+
}
446+
427447
void CommandEncoder::commit() {
448+
check_stored_error();
428449
buffer_->commit();
429450
buffer_ = NS::RetainPtr(queue_->commandBufferWithUnretainedReferences());
430451
buffer_ops_ = 0;

mlx/backend/metal/device.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#pragma once
44

55
#include <Metal/Metal.hpp>
6+
#include <atomic>
67
#include <functional>
78
#include <mutex>
89
#include <shared_mutex>
@@ -90,6 +91,13 @@ class MLX_API CommandEncoder {
9091
bool needs_commit() const;
9192
void commit();
9293

94+
// Deferred error handling for completion handler errors.
95+
// Metal completion handlers run on dispatch queues where C++ exceptions
96+
// cannot propagate — throwing causes std::terminate → SIGABRT.
97+
// Instead, errors are stored and checked at the next synchronous point.
98+
void store_error(const std::string& msg);
99+
void check_stored_error();
100+
93101
MTL::CommandQueue* get_command_queue() const {
94102
return queue_.get();
95103
}
@@ -125,6 +133,12 @@ class MLX_API CommandEncoder {
125133
// A map of prior command encoder outputs to their corresponding fence.
126134
std::unordered_map<const void*, NS::SharedPtr<MTL::Fence>> prev_ce_outputs_;
127135
std::mutex outputs_mtx_;
136+
137+
// Deferred error from Metal completion handlers (set from dispatch queue,
138+
// checked from eval/synchronize thread).
139+
std::atomic<bool> has_error_{false};
140+
std::mutex error_mtx_;
141+
std::string error_msg_;
128142
};
129143

130144
class MLX_API Device {

mlx/backend/metal/eval.cpp

Lines changed: 35 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@ void new_stream(Stream stream) {
1717
}
1818
}
1919

20-
inline void check_error(MTL::CommandBuffer* cbuf) {
20+
// check_error_throw: for synchronous contexts where exceptions propagate.
21+
inline void check_error_throw(MTL::CommandBuffer* cbuf) {
2122
if (cbuf->status() == MTL::CommandBufferStatusError) {
2223
std::ostringstream msg;
2324
msg << "[METAL] Command buffer execution failed: "
@@ -56,19 +57,34 @@ void eval(array& arr) {
5657
buffers.erase(it);
5758
}
5859

60+
// Capture a pointer to the encoder for deferred error storage.
61+
// Completion handlers run on Metal's dispatch queue where C++ exceptions
62+
// cannot propagate (throwing → std::terminate → SIGABRT). Instead, store
63+
// the error and check it at the next synchronous point (commit, synchronize).
64+
auto* enc = &d.get_command_encoder(s.index);
5965
if (d.command_buffer_needs_commit(s.index)) {
6066
d.end_encoding(s.index);
6167
scheduler::notify_new_task(s);
6268
command_buffer->addCompletedHandler(
63-
[s, buffers = std::move(buffers)](MTL::CommandBuffer* cbuf) {
69+
[s, enc, buffers = std::move(buffers)](MTL::CommandBuffer* cbuf) {
6470
scheduler::notify_task_completion(s);
65-
check_error(cbuf);
71+
if (cbuf->status() == MTL::CommandBufferStatusError) {
72+
std::ostringstream msg;
73+
msg << "[METAL] Command buffer execution failed: "
74+
<< cbuf->error()->localizedDescription()->utf8String();
75+
enc->store_error(msg.str());
76+
}
6677
});
6778
d.commit_command_buffer(s.index);
6879
} else {
6980
command_buffer->addCompletedHandler(
70-
[buffers = std::move(buffers)](MTL::CommandBuffer* cbuf) {
71-
check_error(cbuf);
81+
[enc, buffers = std::move(buffers)](MTL::CommandBuffer* cbuf) {
82+
if (cbuf->status() == MTL::CommandBufferStatusError) {
83+
std::ostringstream msg;
84+
msg << "[METAL] Command buffer execution failed: "
85+
<< cbuf->error()->localizedDescription()->utf8String();
86+
enc->store_error(msg.str());
87+
}
7288
});
7389
}
7490
}
@@ -77,20 +93,32 @@ void finalize(Stream s) {
7793
auto pool = metal::new_scoped_memory_pool();
7894
auto& d = metal::device(s.device);
7995
auto cb = d.get_command_buffer(s.index);
96+
auto* enc = &d.get_command_encoder(s.index);
8097
d.end_encoding(s.index);
81-
cb->addCompletedHandler([](MTL::CommandBuffer* cbuf) { check_error(cbuf); });
98+
cb->addCompletedHandler([enc](MTL::CommandBuffer* cbuf) {
99+
if (cbuf->status() == MTL::CommandBufferStatusError) {
100+
std::ostringstream msg;
101+
msg << "[METAL] Command buffer execution failed: "
102+
<< cbuf->error()->localizedDescription()->utf8String();
103+
enc->store_error(msg.str());
104+
}
105+
});
82106
d.commit_command_buffer(s.index);
83107
}
84108

85109
void synchronize(Stream s) {
86110
auto pool = metal::new_scoped_memory_pool();
87111
auto& d = metal::device(s.device);
112+
auto& enc = d.get_command_encoder(s.index);
88113
auto cb = d.get_command_buffer(s.index);
89114
cb->retain();
90115
d.end_encoding(s.index);
91116
d.commit_command_buffer(s.index);
92117
cb->waitUntilCompleted();
93-
check_error(cb);
118+
// Check both this command buffer and any deferred errors from earlier
119+
// completion handlers that ran on Metal's dispatch queue.
120+
check_error_throw(cb);
121+
enc.check_stored_error();
94122
cb->release();
95123
}
96124

0 commit comments

Comments
 (0)