diff --git a/.gitignore b/.gitignore index 368b752..ef7cfce 100644 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,4 @@ Thumbs.db # Frontend frontend/node_modules/ frontend/dist/ +_crashtest/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 3434530..03f0c6d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,6 +55,7 @@ find_package(glaze REQUIRED) find_package(spdlog REQUIRED) find_package(GTest REQUIRED) find_package(Crow REQUIRED) +find_package(cpptrace REQUIRED) # runtime-only: crash-report symbolization add_subdirectory(sdk) add_subdirectory(runtime) diff --git a/conanfile.py b/conanfile.py index 90a7705..6ddce5a 100644 --- a/conanfile.py +++ b/conanfile.py @@ -20,5 +20,6 @@ def requirements(self): # Runtime deps self.requires("spdlog/1.17.0") self.requires("crowcpp-crow/1.3.0") + self.requires("cpptrace/0.8.3") # runtime-only: crash-report stack symbolization if self.options.with_tests: self.requires("gtest/1.15.0") diff --git a/modules/CMakeLists.txt b/modules/CMakeLists.txt index 982a18b..ec95bdb 100644 --- a/modules/CMakeLists.txt +++ b/modules/CMakeLists.txt @@ -4,6 +4,7 @@ set(LOOM_MODULES_OUTPUT_DIR "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/modules") # Build all example modules add_subdirectory(class_based) add_subdirectory(command_probe) +add_subdirectory(crasher) add_subdirectory(ethercat) add_subdirectory(example_motor) add_subdirectory(oscilloscope) diff --git a/modules/crasher/CMakeLists.txt b/modules/crasher/CMakeLists.txt new file mode 100644 index 0000000..c921d1e --- /dev/null +++ b/modules/crasher/CMakeLists.txt @@ -0,0 +1,17 @@ +add_library(crasher MODULE + crasher.cpp +) + +target_link_libraries(crasher PRIVATE + loom::sdk +) +target_include_directories(crasher PUBLIC + ${LOOM_MODULES_DIR} +) + +set_target_properties(crasher PROPERTIES + PREFIX "" + SUFFIX "${LOOM_MODULE_SUFFIX}" + LIBRARY_OUTPUT_DIRECTORY "${LOOM_MODULES_OUTPUT_DIR}" + CXX_VISIBILITY_PRESET hidden +) diff --git a/modules/crasher/crasher.cpp b/modules/crasher/crasher.cpp new file mode 100644 index 0000000..0bb64f1 --- /dev/null +++ b/modules/crasher/crasher.cpp @@ -0,0 +1,53 @@ +#include +#include + +#include +#include +#include +#include + +// ============================================================================ +// Crasher — a deliberately-faulting module for exercising crash diagnostics. +// +// Config picks the fault and when it fires: +// fault: none | throw | segfault | fpe | abort | loop +// phase: init | cyclic +// after_ticks: for phase=cyclic, fault on the Nth cyclic tick (lets the +// module load + run first, so the breadcrumb shows phase=cyclic). +// ============================================================================ + +struct CrasherConfig { + std::string fault = "none"; + std::string phase = "cyclic"; + uint64_t after_ticks = 50; +}; +struct CrasherRecipe { int _unused = 0; }; +struct CrasherRuntime { uint64_t cycle = 0; }; + +namespace { +[[maybe_unused]] void doFault(const std::string& f) { + if (f == "throw") throw std::runtime_error("crasher: intentional std::runtime_error"); + if (f == "segfault") { volatile int* p = nullptr; *p = 1; } // SIGSEGV / AV + if (f == "fpe") { volatile int a = 1, b = 0; volatile int c = a / b; (void)c; } // SIGFPE + if (f == "abort") std::abort(); // SIGABRT + if (f == "loop") { volatile bool spin = true; while (spin) {} } // hang (watchdog) +} +} // namespace + +class Crasher : public loom::Module { +public: + LOOM_MODULE_HEADER("Crasher", "1.0.0") + + void init(const loom::InitContext&) override { + if (config_.phase == "init") doFault(config_.fault); + } + void cyclic() override { + runtime_.cycle++; + if (config_.phase == "cyclic" && runtime_.cycle >= config_.after_ticks) + doFault(config_.fault); + } + void exit() override {} + void longRunning() override {} +}; + +LOOM_REGISTER_MODULE(Crasher) diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt index bd938cd..a314d77 100644 --- a/runtime/CMakeLists.txt +++ b/runtime/CMakeLists.txt @@ -36,6 +36,12 @@ add_library(loom_runtime STATIC src/opcua_rest_server.cpp src/oscilloscope.cpp src/module_watcher.cpp + src/diag/breadcrumb.cpp + src/diag/crash_handler.cpp + src/diag/symbolizer.cpp + src/diag/fault_report.cpp + src/diag/fault_store.cpp + src/diag/runtime_fault_sink.cpp ) target_include_directories(loom_runtime PUBLIC @@ -43,6 +49,9 @@ target_include_directories(loom_runtime PUBLIC $ ) +# Build type stamped into crash reports (works for single- and multi-config). +target_compile_definitions(loom_runtime PRIVATE LOOM_BUILD_TYPE="$") + target_link_libraries(loom_runtime PUBLIC loom::sdk spdlog::spdlog @@ -50,6 +59,10 @@ target_link_libraries(loom_runtime PUBLIC ${CMAKE_DL_LIBS} ) +# Symbolization for crash reports. PRIVATE: confined to the diag symbolizer TU, +# never exposed in loom_runtime's public headers (SDK/consumers stay clean of it). +target_link_libraries(loom_runtime PRIVATE cpptrace::cpptrace) + # --------------------------------------------------------------------------- # Thin executable — just calls loom::run(argc, argv). # --------------------------------------------------------------------------- diff --git a/runtime/conanfile.py b/runtime/conanfile.py index 78096f5..bb57980 100644 --- a/runtime/conanfile.py +++ b/runtime/conanfile.py @@ -22,6 +22,7 @@ def requirements(self): self.requires(f"loom/{self.version}@local/stable", transitive_headers=True) self.requires("spdlog/1.17.0", transitive_headers=True) self.requires("crowcpp-crow/1.3.0", transitive_headers=True) + self.requires("cpptrace/0.8.3") # impl-only: crash-report symbolization (not in public headers) def layout(self): cmake_layout(self) diff --git a/runtime/include/loom/diag/breadcrumb.h b/runtime/include/loom/diag/breadcrumb.h new file mode 100644 index 0000000..3cb6a0d --- /dev/null +++ b/runtime/include/loom/diag/breadcrumb.h @@ -0,0 +1,70 @@ +#pragma once + +#include + +// ============================================================================ +// loom::diag — execution breadcrumb +// +// A per-thread record of what the runtime is currently executing (which module, +// class, and lifecycle phase). Set cheaply via RAII around every module entry +// call; read by the crash handler (on the faulting thread) to attribute a fault +// to a specific module/phase. +// +// Stores *stable pointers* into the module's id/class strings (which outlive the +// call) plus a phase byte — no allocation, no copy. Reading the raw pointers/ +// bytes from a signal handler is allocator-free and async-signal-safe. +// ============================================================================ + +namespace loom::diag { + +enum class Phase : uint8_t { + None = 0, Init, PreCyclic, Cyclic, PostCyclic, LongRunning, Exit, Service, +}; + +/// Human-readable phase name (no allocation — safe in a signal handler). +inline const char* phaseName(Phase p) noexcept { + switch (p) { + case Phase::Init: return "init"; + case Phase::PreCyclic: return "preCyclic"; + case Phase::Cyclic: return "cyclic"; + case Phase::PostCyclic: return "postCyclic"; + case Phase::LongRunning: return "longRunning"; + case Phase::Exit: return "exit"; + case Phase::Service: return "service"; + case Phase::None: return "none"; + } + return "?"; +} + +struct Breadcrumb { + const char* moduleId = nullptr; // stable pointer into the module's id + const char* className = nullptr; // stable pointer into the module's class name + Phase phase = Phase::None; + uint64_t cycle = 0; +}; + +/// The current thread's breadcrumb. The crash handler runs on the faulting +/// thread, so reading this names the exact module/phase that was executing. +extern thread_local Breadcrumb tlsBreadcrumb; + +/// RAII: stamp the breadcrumb on construction, restore the previous value on +/// destruction (so nested calls — e.g. a service invoked from cyclic — unwind +/// correctly). +class BreadcrumbScope { +public: + BreadcrumbScope(Phase p, const char* moduleId, const char* className) noexcept + : prev_(tlsBreadcrumb) { + tlsBreadcrumb.moduleId = moduleId; + tlsBreadcrumb.className = className; + tlsBreadcrumb.phase = p; + } + ~BreadcrumbScope() noexcept { tlsBreadcrumb = prev_; } + + BreadcrumbScope(const BreadcrumbScope&) = delete; + BreadcrumbScope& operator=(const BreadcrumbScope&) = delete; + +private: + Breadcrumb prev_; +}; + +} // namespace loom::diag diff --git a/runtime/include/loom/diag/crash_handler.h b/runtime/include/loom/diag/crash_handler.h new file mode 100644 index 0000000..193414c --- /dev/null +++ b/runtime/include/loom/diag/crash_handler.h @@ -0,0 +1,28 @@ +#pragma once + +#include + +// ============================================================================ +// loom::diag — process-global crash handler (hardware-fault / unhandled path) +// +// Installs fatal-signal handlers (POSIX) / an unhandled-exception filter +// (Windows) / std::set_terminate, so a segfault/FPE/abort or an escaped C++ +// exception — in a module OR in the runtime itself — produces a crash report +// (faulting thread's breadcrumb + signal/exception + build identity + raw stack +// addresses) before the process exits. Symbolization is layered on later +// (Phase 2). Install once, early in startup. +// ============================================================================ + +namespace loom::diag { + +struct CrashConfig { + std::filesystem::path crashDir; // where crash reports are written (e.g. /crash) +}; + +class CrashHandler { +public: + /// Install the handlers. Idempotent; call once after logging is set up. + static void install(const CrashConfig& cfg); +}; + +} // namespace loom::diag diff --git a/runtime/include/loom/diag/fault_report.h b/runtime/include/loom/diag/fault_report.h new file mode 100644 index 0000000..d0e4f39 --- /dev/null +++ b/runtime/include/loom/diag/fault_report.h @@ -0,0 +1,66 @@ +#pragma once + +#include "loom/diag/breadcrumb.h" +#include "loom/diag/symbolizer.h" + +#include +#include +#include +#include + +// ============================================================================ +// loom::diag — structured fault report +// +// The machine-readable record of a single fault, written to +// /crash/.json and served over /api/faults for the LoomUI crash +// viewer. Built and serialized OFF the signal path only (it allocates): the +// exception path (scheduler guard) and the Windows unhandled-exception filter. +// The POSIX fatal-signal handler writes a raw text report instead (async- +// signal-safe) which is symbolized offline. +// ============================================================================ + +namespace loom::diag { + +enum class FaultKind : uint8_t { + Exception, ///< C++ exception caught by a module-call guard + Signal, ///< Fatal signal / SEH exception caught by the crash handler +}; + +const char* faultKindName(FaultKind); + +/// Module data sections captured at fault time (exception path only — reading a +/// module's state is safe off the signal path). Each holds raw JSON or "". +struct FaultSections { + std::string config; + std::string recipe; + std::string runtime; + std::string summary; +}; + +struct FaultReport { + std::string id; ///< Unique within a run, also the filename stem + int64_t tsMs = 0; ///< system_clock milliseconds + FaultKind kind = FaultKind::Exception; + int signalOrCode = 0; ///< signal number / SEH exception code (0 for exceptions) + std::string reason; ///< what() or signal/exception description + + // Build identity (so a report maps back to a commit + matching symbols). + std::string sdkVersion; + std::string gitSha; + std::string buildType; + + // Execution breadcrumb (which module/phase was running on the faulting thread). + std::string moduleId; ///< "" → runtime code (no module on the thread) + std::string className; + Phase phase = Phase::None; + uint64_t cycle = 0; + + std::vector frames; ///< symbolized stack (empty if unavailable) + std::optional sections; ///< captured live values (exception path) +}; + +/// Serialize to clean, nested JSON (sections embed as real JSON objects, not +/// escaped strings). Allocates — off-signal use only. +std::string toJson(const FaultReport&); + +} // namespace loom::diag diff --git a/runtime/include/loom/diag/fault_sink.h b/runtime/include/loom/diag/fault_sink.h new file mode 100644 index 0000000..b9928e4 --- /dev/null +++ b/runtime/include/loom/diag/fault_sink.h @@ -0,0 +1,37 @@ +#pragma once + +#include "loom/diag/breadcrumb.h" + +#include +#include + +// ============================================================================ +// loom::diag — fault sink interface +// +// The scheduler depends only on this interface (injected, not owned) so it +// stays thin: when a guarded module call throws, it reports a FaultEvent and +// the concrete sink (in the runtime layer) does the heavy lifting — capture +// live sections, build + persist a FaultReport, publish the `loom/faults` +// topic. Keeping the interface here lets diag stay free of DataEngine/Bus deps. +// ============================================================================ + +namespace loom::diag { + +struct FaultEvent { + std::string moduleId; + std::string className; + Phase phase = Phase::None; + uint64_t cycle = 0; + std::string message; ///< exception what() +}; + +class IFaultSink { +public: + virtual ~IFaultSink() = default; + + /// Called from the faulting worker thread, off the signal path. Implementations + /// must be thread-safe and must not throw. + virtual void onModuleFault(const FaultEvent&) = 0; +}; + +} // namespace loom::diag diff --git a/runtime/include/loom/diag/fault_store.h b/runtime/include/loom/diag/fault_store.h new file mode 100644 index 0000000..944a743 --- /dev/null +++ b/runtime/include/loom/diag/fault_store.h @@ -0,0 +1,64 @@ +#pragma once + +#include "loom/diag/fault_report.h" + +#include +#include +#include +#include +#include + +// ============================================================================ +// loom::diag — fault store +// +// Thread-safe registry of fault reports for this run, backed by JSON files in +// . On construction it scans the directory so crashes from PRIOR runs +// (including the signal-path text reports the process couldn't keep in memory) +// are visible too. The server's /api/faults[/:id] routes delegate here. +// ============================================================================ + +namespace loom::diag { + +class FaultStore { +public: + /// One row in the fault list — enough to render the LoomUI tree without + /// fetching every full report. + struct Summary { + std::string id; + int64_t tsMs = 0; + std::string kind; ///< "exception" | "signal" | "raw" + std::string moduleId; ///< "" → runtime code + std::string className; + std::string phase; + std::string reason; + }; + + explicit FaultStore(std::filesystem::path crashDir); + + /// Persist a live fault (JSON file + in-memory summary). Returns its id. + /// Safe to call from any worker thread; never throws. + std::string record(const FaultReport& report) noexcept; + + /// All known faults, newest first. + std::vector list() const; + + /// Full report JSON for one id ("" stem), or nullopt if unknown. + std::optional detailJson(const std::string& id) const; + + const std::filesystem::path& crashDir() const { return crashDir_; } + +private: + struct Entry { + Summary summary; + std::string rawJson; ///< full report JSON served by detailJson() + }; + + /// Load existing reports from disk (called once at construction). + void scanDir(); + + std::filesystem::path crashDir_; + mutable std::mutex mx_; + std::vector entries_; ///< append-only; newest at the back +}; + +} // namespace loom::diag diff --git a/runtime/include/loom/diag/guard.h b/runtime/include/loom/diag/guard.h new file mode 100644 index 0000000..1c96808 --- /dev/null +++ b/runtime/include/loom/diag/guard.h @@ -0,0 +1,47 @@ +#pragma once + +#include "loom/diag/breadcrumb.h" + +#include +#include + +// ============================================================================ +// loom::diag — module-call guard (C++ exception path) +// +// Wraps a module entry-point call in a breadcrumb + try/catch. Catches a thrown +// std::exception (or anything), reports it via the caller-supplied onFault, and +// returns false — turning an exception that would otherwise terminate the worker +// thread / process into a contained, attributed fault. Hardware faults +// (segfault/FPE) are NOT exceptions; those are captured by the crash handler. +// +// `onFault` is a template parameter (not std::function) so the happy path is +// zero-cost and allocation-free. The scheduler passes a small lambda that sets +// the module's faulted/Error state. +// ============================================================================ + +namespace loom::diag { + +struct FaultInfo { + const char* moduleId; + const char* className; + Phase phase; + std::string_view message; // exception what() (valid for the onFault call) +}; + +template +bool guard(Phase phase, const char* moduleId, const char* className, + Fn&& fn, OnFault&& onFault) { + BreadcrumbScope crumb(phase, moduleId, className); + try { + fn(); + return true; + } catch (const std::exception& e) { + onFault(FaultInfo{moduleId, className, phase, e.what()}); + return false; + } catch (...) { + onFault(FaultInfo{moduleId, className, phase, "unknown (non-std exception)"}); + return false; + } +} + +} // namespace loom::diag diff --git a/runtime/include/loom/diag/runtime_fault_sink.h b/runtime/include/loom/diag/runtime_fault_sink.h new file mode 100644 index 0000000..a0f1ad5 --- /dev/null +++ b/runtime/include/loom/diag/runtime_fault_sink.h @@ -0,0 +1,40 @@ +#pragma once + +#include "loom/diag/fault_sink.h" +#include "loom/diag/fault_store.h" + +#include +#include + +// ============================================================================ +// loom::diag — concrete fault sink (runtime layer) +// +// Bridges the scheduler's exception path to the rest of the runtime: builds a +// structured FaultReport from a module-call exception, captures the module's +// live data sections (safe off the signal path), persists it via the +// FaultStore, and publishes it on the `loom/faults` bus topic for the UI and +// any subscribing module. Keeping it here (not in the scheduler) is what lets +// diag stay free of DataEngine/Bus dependencies. +// ============================================================================ + +namespace loom { +class DataEngine; +class Bus; +} + +namespace loom::diag { + +class RuntimeFaultSink : public IFaultSink { +public: + RuntimeFaultSink(FaultStore& store, DataEngine& engine, Bus& bus); + + void onModuleFault(const FaultEvent& ev) override; + +private: + FaultStore& store_; + DataEngine& engine_; + Bus& bus_; + std::atomic seq_{0}; +}; + +} // namespace loom::diag diff --git a/runtime/include/loom/diag/symbolizer.h b/runtime/include/loom/diag/symbolizer.h new file mode 100644 index 0000000..e4d0236 --- /dev/null +++ b/runtime/include/loom/diag/symbolizer.h @@ -0,0 +1,31 @@ +#pragma once + +#include +#include +#include +#include + +// ============================================================================ +// loom::diag — stack symbolizer +// +// Resolves raw instruction addresses to function + file:line. Backed by cpptrace +// in symbolizer.cpp (the ONLY TU that includes cpptrace — it stays out of every +// header so consumers/SDK never see it). NOT async-signal-safe (it allocates and +// reads debug info): call only off the signal path — the Windows unhandled- +// exception filter, the std::set_terminate handler, or the offline --symbolize +// tool. The POSIX fatal-signal handler captures raw addresses and symbolizes +// later / offline. +// ============================================================================ + +namespace loom::diag { + +struct SymFrame { + uintptr_t address = 0; + std::string symbol; // function name ("" if unresolved) + std::string filename; // source file ("" if unavailable) + uint32_t line = 0; // 0 if unknown +}; + +std::vector symbolize(const void* const* addrs, std::size_t n); + +} // namespace loom::diag diff --git a/runtime/include/loom/runtime_core.h b/runtime/include/loom/runtime_core.h index 356cf6a..4958517 100644 --- a/runtime/include/loom/runtime_core.h +++ b/runtime/include/loom/runtime_core.h @@ -3,6 +3,8 @@ #include "loom/bus.h" #include "loom/data_engine.h" #include "loom/data_store.h" +#include "loom/diag/fault_store.h" +#include "loom/diag/runtime_fault_sink.h" #include "loom/instance_manifest.h" #include "loom/io_mapper.h" #include "loom/module.h" @@ -16,6 +18,7 @@ #include #include +#include #include #include #include @@ -92,6 +95,7 @@ class RuntimeCore : public IModuleRegistry { Bus& bus() { return bus_; } Oscilloscope& oscilloscope() { return oscilloscope_; } IOMapper& ioMapper() { return ioMapper_; } + diag::FaultStore& faultStore() { return faultStore_; } const RuntimeConfig& config() const { return config_; } const SchedulerConfig& schedulerConfig() const { return schedCfg_; } @@ -133,6 +137,12 @@ class RuntimeCore : public IModuleRegistry { ModuleWatcher watcher_; RuntimeHeap runtimeHeap_; + // Fault diagnostics: persistent store of fault reports + the sink that the + // scheduler notifies on a guarded exception. Declared after the subsystems + // it references so it is destroyed first. + diag::FaultStore faultStore_; + std::unique_ptr faultSink_; + std::shared_mutex moduleMutex_; }; diff --git a/runtime/include/loom/scheduler.h b/runtime/include/loom/scheduler.h index 6a0b86c..f0f42ca 100644 --- a/runtime/include/loom/scheduler.h +++ b/runtime/include/loom/scheduler.h @@ -4,6 +4,7 @@ #include "loom/scheduler_config.h" #include "loom/oscilloscope.h" #include "loom/data_engine.h" +#include "loom/diag/breadcrumb.h" #include #include @@ -19,6 +20,8 @@ #include #include +namespace loom::diag { class IFaultSink; } + namespace loom { // Forward declarations @@ -74,6 +77,17 @@ struct TaskState { std::atomic lastJitterUs{0}; ///< |actualStart − prevStart| − period (µs) std::atomic lastCyclicStartNs{0}; ///< Used internally to compute per-module jitter + // Last-fault diagnostics, written by the faulting worker thread in + // Scheduler::recordModuleFault() and read by the server thread. + // Synchronization: recordModuleFault writes these fields, THEN stores + // `faulted` with memory_order_release. Readers MUST load `faulted` with + // acquire and read these only when it is true. A module faults at most once + // (it is skipped afterward), so lastFaultMsg is effectively write-once — + // hence safe to read as a plain buffer after observing faulted==true. + std::atomic lastFaultMs{0}; ///< system_clock ms of last fault (0 = none) + std::atomic lastFaultPhase{0}; ///< Phase value at fault + char lastFaultMsg[256] = {}; + // Historical cycle/jitter data for charting (fixed-size ring buffer) MetricRingBuffer cycleHistory; mutable std::mutex cycleHistoryMx; @@ -146,6 +160,11 @@ class Scheduler { /// The scheduler will call executeForClass/executeForModule at appropriate times. void setIOMapper(IOMapper* mapper); + /// Inject the fault sink notified when a guarded module call throws. Not + /// owned; must outlive the scheduler. Optional — nullptr disables reporting + /// (the module is still quarantined). Set before startClasses(). + void setFaultSink(diag::IFaultSink* sink) { faultSink_ = sink; } + /// Stop a module. Removes from class (or stops isolated thread). Joins long-running thread. /// Does NOT call exit() — caller's responsibility. bool stop(const std::string& moduleId); @@ -270,6 +289,12 @@ class Scheduler { void classLoop(ClassRunnerState& runner); void isolatedLoop(LoadedModule& mod, TaskConfig config, TaskState& state); + /// Quarantine a module that threw from a guarded call and report the fault: + /// set faulted + ModuleState::Error, stamp last-fault fields, log, and notify + /// the fault sink (if any). Called from the worker thread, off the signal path. + void recordModuleFault(TaskState& state, LoadedModule& mod, + diag::Phase phase, std::string_view message); + // ---- State ------------------------------------------------------------------ SchedulerConfig schedCfg_; @@ -285,6 +310,7 @@ class Scheduler { ModuleLoader* loader_ = nullptr; std::shared_mutex* moduleMutex_ = nullptr; IOMapper* ioMapper_ = nullptr; + diag::IFaultSink* faultSink_ = nullptr; }; } // namespace loom diff --git a/runtime/src/diag/breadcrumb.cpp b/runtime/src/diag/breadcrumb.cpp new file mode 100644 index 0000000..400e6a9 --- /dev/null +++ b/runtime/src/diag/breadcrumb.cpp @@ -0,0 +1,7 @@ +#include "loom/diag/breadcrumb.h" + +namespace loom::diag { + +thread_local Breadcrumb tlsBreadcrumb; + +} // namespace loom::diag diff --git a/runtime/src/diag/crash_handler.cpp b/runtime/src/diag/crash_handler.cpp new file mode 100644 index 0000000..60ba4e1 --- /dev/null +++ b/runtime/src/diag/crash_handler.cpp @@ -0,0 +1,223 @@ +#include "loom/diag/crash_handler.h" +#include "loom/diag/breadcrumb.h" +#include "loom/diag/fault_report.h" +#include "loom/diag/symbolizer.h" +#include "loom/version.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef LOOM_BUILD_TYPE +#define LOOM_BUILD_TYPE "unknown" +#endif + +// ============================================================================ +// Crash handler. Phase 1: capture (breadcrumb + signal/exception + build id + +// raw stack addresses) and write a text crash report, then let the process die. +// Symbolization (cpptrace) and structured JSON come in later phases. +// +// POSIX handler bodies must be async-signal-safe: only open()/write()/_exit and +// raw reads — no malloc/printf/ofstream/locks. The Windows unhandled-exception +// filter is NOT signal-constrained, so it may use richer calls. +// ============================================================================ + +namespace loom::diag { +namespace { + +std::atomic_flag g_reporting = ATOMIC_FLAG_INIT; // first faulting thread wins +char g_reportPath[1024] = {}; // precomputed at install + +void buildIdentityLine(char* buf, size_t n) { + std::snprintf(buf, n, "build: sdk=%s git=%s type=%s", + loom::kSdkVersion, loom::kGitSha, LOOM_BUILD_TYPE); +} + +} // namespace +} // namespace loom::diag + +// --------------------------------------------------------------------------- +#if defined(_WIN32) +// --------------------------------------------------------------------------- +#include // CaptureStackBackTrace (RtlCaptureStackBackTrace, in kernel32) + +namespace loom::diag { +namespace { + +char g_reportId[256] = {}; // crash-report id / filename stem (precomputed at install) + +// Build a structured JSON crash report and write it to g_reportPath. The Windows +// unhandled-exception filter is NOT async-signal-constrained, so we may allocate: +// symbolize in-process via cpptrace and serialize the same FaultReport the +// exception path uses, so signal-path crashes surface in /api/faults too. +void writeReportWin(FaultKind kind, const char* reason, int code, + void* const* frames, unsigned nframes) { + const Breadcrumb& b = tlsBreadcrumb; // faulting thread + + FaultReport r; + r.id = g_reportId; + r.kind = kind; + r.signalOrCode = code; + r.reason = reason ? reason : ""; + r.sdkVersion = loom::kSdkVersion; + r.gitSha = loom::kGitSha; + r.buildType = LOOM_BUILD_TYPE; + r.moduleId = b.moduleId ? b.moduleId : ""; + r.className = b.className ? b.className : ""; + r.phase = b.phase; + r.cycle = b.cycle; + r.frames = symbolize(reinterpret_cast(frames), nframes); + + std::string json = toJson(r); + std::ofstream f(g_reportPath, std::ios::binary | std::ios::trunc); + if (f) f << json; +} + +LONG WINAPI unhandledFilter(EXCEPTION_POINTERS* ep) { + if (g_reporting.test_and_set()) return EXCEPTION_EXECUTE_HANDLER; + void* frames[64]; + unsigned n = CaptureStackBackTrace(0, 64, frames, nullptr); + const DWORD code = ep ? ep->ExceptionRecord->ExceptionCode : 0UL; + char reason[64]; + std::snprintf(reason, sizeof reason, "SEH exception 0x%08lx", code); + writeReportWin(FaultKind::Signal, reason, static_cast(code), frames, n); + return EXCEPTION_EXECUTE_HANDLER; // run default handler → terminate +} + +void terminateHandler() { + if (!g_reporting.test_and_set()) { + void* frames[64]; + unsigned n = CaptureStackBackTrace(0, 64, frames, nullptr); + writeReportWin(FaultKind::Signal, "std::terminate (unhandled C++ exception)", + 0, frames, n); + } + std::abort(); +} + +} // namespace + +void CrashHandler::install(const CrashConfig& cfg) { + std::error_code ec; + std::filesystem::create_directories(cfg.crashDir, ec); + std::snprintf(g_reportId, sizeof g_reportId, "loom-crash-%lu", + GetCurrentProcessId()); + auto path = (cfg.crashDir / (std::string(g_reportId) + ".json")).string(); + std::snprintf(g_reportPath, sizeof g_reportPath, "%s", path.c_str()); + SetUnhandledExceptionFilter(unhandledFilter); + std::set_terminate(terminateHandler); +} + +} // namespace loom::diag + +// --------------------------------------------------------------------------- +#else // POSIX +// --------------------------------------------------------------------------- +#include +#include +#include +#include // open(), O_WRONLY/O_CREAT/O_TRUNC (not transitively included on macOS) +#include + +namespace loom::diag { +namespace { + +// async-signal-safe helpers -------------------------------------------------- +// NB: std::strlen is NOT in the POSIX async-signal-safe list, so compute the +// length with a plain loop (which is) before the single write(). +void sWrite(int fd, const char* s) { + if (!s) return; + size_t n = 0; + while (s[n]) ++n; + ssize_t r = ::write(fd, s, n); (void)r; +} +void sWriteHex(int fd, uintptr_t v) { + char buf[2 + sizeof(uintptr_t) * 2]; buf[0] = '0'; buf[1] = 'x'; + const char* hex = "0123456789abcdef"; + int i = 2 + (int)sizeof(uintptr_t) * 2; + char* p = buf + i; + if (v == 0) { *--p = '0'; } else { while (v) { *--p = hex[v & 0xf]; v >>= 4; } } + sWrite(fd, "0x"); ssize_t r = ::write(fd, p, (buf + i) - p); (void)r; +} + +char g_buildId[256] = {}; // precomputed at install (no formatting in handler) +void* g_primeFrames[4]; // backtrace priming target + +void handler(int sig, siginfo_t*, void*) { + // Re-raise with kill() (async-signal-safe); the handler was installed with + // SA_RESETHAND, so the disposition is already SIG_DFL — no signal() needed + // (signal() is NOT async-signal-safe). + if (g_reporting.test_and_set()) { kill(getpid(), sig); _exit(128 + sig); } + int fd = ::open(g_reportPath, O_WRONLY | O_CREAT | O_TRUNC, 0644); + if (fd >= 0) { + const Breadcrumb& b = tlsBreadcrumb; + sWrite(fd, "=== Loom crash report ===\nsignal: "); + sWriteHex(fd, (uintptr_t)sig); + sWrite(fd, "\nmodule: "); sWrite(fd, b.moduleId ? b.moduleId : "(none/runtime)"); + sWrite(fd, " class: "); sWrite(fd, b.className ? b.className : "(none)"); + sWrite(fd, " phase: "); sWrite(fd, phaseName(b.phase)); + sWrite(fd, "\n"); sWrite(fd, g_buildId); sWrite(fd, "\n"); + sWrite(fd, "frames (raw addresses — symbolize offline):\n"); + void* frames[64]; + int n = backtrace(frames, 64); + for (int i = 0; i < n; ++i) { sWrite(fd, " "); sWriteHex(fd, (uintptr_t)frames[i]); sWrite(fd, "\n"); } + ::close(fd); + } + // Re-raise for a core dump with the default disposition. SA_RESETHAND already + // reset it to SIG_DFL on entry, so just kill(getpid(), sig) — async-signal-safe, + // unlike signal()/raise(). + kill(getpid(), sig); + _exit(128 + sig); +} + +void terminateHandler() { + if (!g_reporting.test_and_set()) { + int fd = ::open(g_reportPath, O_WRONLY | O_CREAT | O_TRUNC, 0644); + if (fd >= 0) { + sWrite(fd, "=== Loom crash report ===\nstd::terminate (unhandled C++ exception)\n"); + sWrite(fd, g_buildId); sWrite(fd, "\n"); + void* frames[64]; int n = backtrace(frames, 64); + for (int i = 0; i < n; ++i) { sWrite(fd, " "); sWriteHex(fd, (uintptr_t)frames[i]); sWrite(fd, "\n"); } + ::close(fd); + } + } + std::abort(); +} + +} // namespace + +void CrashHandler::install(const CrashConfig& cfg) { + std::error_code ec; + std::filesystem::create_directories(cfg.crashDir, ec); + auto path = (cfg.crashDir / ("loom-crash-" + std::to_string((long)getpid()) + ".txt")).string(); + std::snprintf(g_reportPath, sizeof g_reportPath, "%s", path.c_str()); + buildIdentityLine(g_buildId, sizeof g_buildId); + + // Prime backtrace() so its first (lazy libgcc) call already happened — the + // handler's backtrace() is then effectively async-signal-safe. + backtrace(g_primeFrames, 4); + + // Alternate signal stack so a stack-overflow SIGSEGV still has stack to run. + // Use a fixed compile-time size: since glibc 2.34, SIGSTKSZ expands to a + // sysconf() call (not a constant), so it can't size a static array. 64 KiB + // comfortably exceeds SIGSTKSZ on every supported platform. + static constexpr size_t kAltStackSize = 64 * 1024; + static char altStack[kAltStackSize]; + stack_t ss{}; ss.ss_sp = altStack; ss.ss_size = sizeof altStack; ss.ss_flags = 0; + sigaltstack(&ss, nullptr); + + struct sigaction sa{}; + sa.sa_sigaction = handler; + sigemptyset(&sa.sa_mask); + sa.sa_flags = SA_SIGINFO | SA_ONSTACK | SA_RESETHAND; + for (int sig : {SIGSEGV, SIGABRT, SIGFPE, SIGILL, SIGBUS}) sigaction(sig, &sa, nullptr); + + std::set_terminate(terminateHandler); +} + +} // namespace loom::diag +#endif diff --git a/runtime/src/diag/fault_report.cpp b/runtime/src/diag/fault_report.cpp new file mode 100644 index 0000000..caab24a --- /dev/null +++ b/runtime/src/diag/fault_report.cpp @@ -0,0 +1,133 @@ +#include "loom/diag/fault_report.h" + +#include +#include +#include + +namespace loom::diag { + +const char* faultKindName(FaultKind k) { + switch (k) { + case FaultKind::Exception: return "exception"; + case FaultKind::Signal: return "signal"; + } + return "unknown"; +} + +namespace { + +// Minimal JSON string escaper (RFC 8259). We hand-roll the report JSON so the +// captured `sections` can embed as real nested JSON rather than escaped strings. +void appendEscaped(std::string& out, std::string_view s) { + out += '"'; + for (char c : s) { + switch (c) { + case '"': out += "\\\""; break; + case '\\': out += "\\\\"; break; + case '\n': out += "\\n"; break; + case '\r': out += "\\r"; break; + case '\t': out += "\\t"; break; + default: + if (static_cast(c) < 0x20) { + char buf[8]; + std::snprintf(buf, sizeof buf, "\\u%04x", c); + out += buf; + } else { + out += c; + } + } + } + out += '"'; +} + +void appendKV(std::string& out, const char* key, std::string_view val, bool& first) { + if (!first) out += ','; + first = false; + out += '"'; out += key; out += "\":"; + appendEscaped(out, val); +} + +void appendKVRaw(std::string& out, const char* key, std::string_view rawJson, bool& first) { + if (!first) out += ','; + first = false; + out += '"'; out += key; out += "\":"; + out += (rawJson.empty() ? std::string_view{"null"} : rawJson); +} + +void appendKVNum(std::string& out, const char* key, long long val, bool& first) { + if (!first) out += ','; + first = false; + out += '"'; out += key; out += "\":"; + out += std::to_string(val); +} + +} // namespace + +std::string toJson(const FaultReport& r) { + std::string out; + out.reserve(1024 + r.frames.size() * 128); + out += '{'; + bool first = true; + + appendKV (out, "id", r.id, first); + appendKVNum(out, "ts", r.tsMs, first); + appendKV (out, "kind", faultKindName(r.kind), first); + appendKVNum(out, "signalOrCode", r.signalOrCode, first); + appendKV (out, "reason", r.reason, first); + + // build identity + out += ",\"build\":{"; + { + bool bfirst = true; + appendKV(out, "sdkVersion", r.sdkVersion, bfirst); + appendKV(out, "gitSha", r.gitSha, bfirst); + appendKV(out, "buildType", r.buildType, bfirst); + } + out += '}'; + + // breadcrumb + out += ",\"breadcrumb\":{"; + { + bool cfirst = true; + appendKV (out, "module", r.moduleId.empty() ? "" : r.moduleId, cfirst); + appendKV (out, "class", r.className, cfirst); + appendKV (out, "phase", phaseName(r.phase), cfirst); + appendKVNum(out, "cycle", static_cast(r.cycle), cfirst); + } + out += '}'; + + // frames + out += ",\"frames\":["; + for (std::size_t i = 0; i < r.frames.size(); ++i) { + const SymFrame& f = r.frames[i]; + if (i) out += ','; + out += '{'; + bool ffirst = true; + char addr[24]; + std::snprintf(addr, sizeof addr, "0x%016llx", + static_cast(f.address)); + appendKVNum(out, "idx", static_cast(i), ffirst); + appendKV (out, "address", addr, ffirst); + appendKV (out, "function", f.symbol, ffirst); + appendKV (out, "file", f.filename, ffirst); + appendKVNum(out, "line", f.line, ffirst); + out += '}'; + } + out += ']'; + + // sections (exception path only) + if (r.sections) { + out += ",\"sections\":{"; + bool sfirst = true; + appendKVRaw(out, "config", r.sections->config, sfirst); + appendKVRaw(out, "recipe", r.sections->recipe, sfirst); + appendKVRaw(out, "runtime", r.sections->runtime, sfirst); + appendKVRaw(out, "summary", r.sections->summary, sfirst); + out += '}'; + } + + out += '}'; + return out; +} + +} // namespace loom::diag diff --git a/runtime/src/diag/fault_store.cpp b/runtime/src/diag/fault_store.cpp new file mode 100644 index 0000000..fa62432 --- /dev/null +++ b/runtime/src/diag/fault_store.cpp @@ -0,0 +1,143 @@ +#include "loom/diag/fault_store.h" + +#include +#include + +#include +#include +#include +#include +#include + +namespace loom::diag { + +namespace { + +std::string readFile(const std::filesystem::path& p) { + std::ifstream f(p, std::ios::binary); + if (!f) return {}; + std::ostringstream ss; + ss << f.rdbuf(); + return ss.str(); +} + +std::string jstr(glz::json_t& j, const char* key) { + if (!j.is_object() || !j.contains(key)) return {}; + auto& v = j[key]; + return v.is_string() ? v.get() : std::string{}; +} + +int64_t jnum(glz::json_t& j, const char* key) { + if (!j.is_object() || !j.contains(key)) return 0; + auto& v = j[key]; + return v.is_number() ? static_cast(v.get()) : 0; +} + +// Build a Summary by parsing a report JSON blob. Falls back to a minimal +// summary (id only) if the blob isn't our JSON shape. +FaultStore::Summary summarize(const std::string& id, const std::string& json) { + FaultStore::Summary s; + s.id = id; + glz::json_t doc; + if (json.empty() || glz::read_json(doc, json) || !doc.is_object()) { + s.kind = "raw"; + return s; + } + s.tsMs = jnum(doc, "ts"); + s.kind = jstr(doc, "kind"); + s.reason = jstr(doc, "reason"); + if (doc.contains("breadcrumb") && doc["breadcrumb"].is_object()) { + auto& bc = doc["breadcrumb"]; + s.moduleId = jstr(bc, "module"); + s.className = jstr(bc, "class"); + s.phase = jstr(bc, "phase"); + } + return s; +} + +} // namespace + +FaultStore::FaultStore(std::filesystem::path crashDir) + : crashDir_(std::move(crashDir)) { + std::error_code ec; + std::filesystem::create_directories(crashDir_, ec); + scanDir(); +} + +void FaultStore::scanDir() { + std::error_code ec; + if (!std::filesystem::exists(crashDir_, ec)) return; + + for (const auto& de : std::filesystem::directory_iterator(crashDir_, ec)) { + if (ec || !de.is_regular_file()) continue; + const auto& path = de.path(); + const std::string ext = path.extension().string(); + const std::string stem = path.stem().string(); + + if (ext == ".json") { + std::string json = readFile(path); + entries_.push_back({summarize(stem, json), std::move(json)}); + } else if (ext == ".txt" && stem.rfind("loom-crash-", 0) == 0) { + // POSIX signal-path report (raw addresses) — wrap the text so the + // viewer can show it; symbolize offline via `loom --symbolize`. + std::string raw = readFile(path); + std::string wrapped = glz::write_json( + std::map{{"id", stem}, + {"kind", "raw"}, + {"raw", raw}}).value_or("{}"); + Summary s; + s.id = stem; s.kind = "raw"; s.reason = "raw report — symbolize offline"; + entries_.push_back({std::move(s), std::move(wrapped)}); + } + } + // Keep the invariant "newest at the back" (matches record()'s push_back), so + // list()'s reverse walk yields newest-first. Raw .txt reports have ts 0. + std::sort(entries_.begin(), entries_.end(), + [](const Entry& a, const Entry& b) { return a.summary.tsMs < b.summary.tsMs; }); +} + +std::string FaultStore::record(const FaultReport& report) noexcept { + try { + std::string json = toJson(report); + { + std::lock_guard lock(mx_); + const auto path = crashDir_ / (report.id + ".json"); + bool persisted = false; + { + std::ofstream f(path, std::ios::binary | std::ios::trunc); + if (f) { f << json; persisted = static_cast(f); } + } + // Keep the in-memory entry regardless (a fault that happened must stay + // visible in /api/faults for this run), but surface a persistence + // failure rather than silently implying the report was saved to disk. + if (!persisted) + spdlog::warn("FaultStore: fault '{}' kept in memory only — failed to write {}", + report.id, path.string()); + entries_.push_back({summarize(report.id, json), std::move(json)}); + } + return report.id; + } catch (const std::exception& e) { + spdlog::error("FaultStore::record failed: {}", e.what()); + return {}; + } catch (...) { + return {}; + } +} + +std::vector FaultStore::list() const { + std::lock_guard lock(mx_); + std::vector out; + out.reserve(entries_.size()); + for (auto it = entries_.rbegin(); it != entries_.rend(); ++it) + out.push_back(it->summary); + return out; +} + +std::optional FaultStore::detailJson(const std::string& id) const { + std::lock_guard lock(mx_); + for (auto it = entries_.rbegin(); it != entries_.rend(); ++it) + if (it->summary.id == id) return it->rawJson; + return std::nullopt; +} + +} // namespace loom::diag diff --git a/runtime/src/diag/runtime_fault_sink.cpp b/runtime/src/diag/runtime_fault_sink.cpp new file mode 100644 index 0000000..8ac5df0 --- /dev/null +++ b/runtime/src/diag/runtime_fault_sink.cpp @@ -0,0 +1,87 @@ +#include "loom/diag/runtime_fault_sink.h" + +#include "loom/diag/fault_report.h" +#include "loom/bus.h" +#include "loom/data_engine.h" +#include "loom/types.h" +#include "loom/version.h" + +#include + +#include +#include +#include + +#ifndef LOOM_BUILD_TYPE +#define LOOM_BUILD_TYPE "unknown" +#endif + +namespace loom::diag { + +RuntimeFaultSink::RuntimeFaultSink(FaultStore& store, DataEngine& engine, Bus& bus) + : store_(store), engine_(engine), bus_(bus) {} + +namespace { +std::string safeRead(DataEngine& engine, const std::string& id, DataSection sec) { + try { + return engine.readSection(id, sec); + } catch (...) { + return {}; + } +} + +// The report id becomes a filename (/.json), and the module +// portion ultimately comes from the HTTP instantiate request body — so a '/' or +// '..' could escape the crash directory. Map anything outside a safe set to '_'. +std::string sanitizeForFilename(std::string_view s) { + std::string out; + out.reserve(s.size()); + for (char c : s) { + const bool ok = (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || + (c >= '0' && c <= '9') || c == '.' || c == '_' || c == '-'; + out += ok ? c : '_'; + } + return out.empty() ? std::string{"module"} : out; +} +} // namespace + +void RuntimeFaultSink::onModuleFault(const FaultEvent& ev) { + try { + const int64_t nowMs = static_cast( + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count()); + + FaultReport r; + r.id = sanitizeForFilename(ev.moduleId) + "-" + std::to_string(nowMs) + "-" + + std::to_string(seq_.fetch_add(1, std::memory_order_relaxed)); + r.tsMs = nowMs; + r.kind = FaultKind::Exception; + r.signalOrCode = 0; + r.reason = ev.message; + r.sdkVersion = loom::kSdkVersion; + r.gitSha = loom::kGitSha; + r.buildType = LOOM_BUILD_TYPE; + r.moduleId = ev.moduleId; + r.className = ev.className; + r.phase = ev.phase; + r.cycle = ev.cycle; + + // Capture the module's live data sections — safe off the signal path. + FaultSections sections; + sections.config = safeRead(engine_, ev.moduleId, DataSection::Config); + sections.recipe = safeRead(engine_, ev.moduleId, DataSection::Recipe); + sections.runtime = safeRead(engine_, ev.moduleId, DataSection::Runtime); + sections.summary = safeRead(engine_, ev.moduleId, DataSection::Summary); + r.sections = std::move(sections); + + std::string json = toJson(r); + store_.record(r); + bus_.publish("loom/faults", json); + } catch (const std::exception& e) { + spdlog::error("RuntimeFaultSink failed to record fault: {}", e.what()); + } catch (...) { + // never propagate out of the worker thread + } +} + +} // namespace loom::diag diff --git a/runtime/src/diag/symbolizer.cpp b/runtime/src/diag/symbolizer.cpp new file mode 100644 index 0000000..4b650ce --- /dev/null +++ b/runtime/src/diag/symbolizer.cpp @@ -0,0 +1,31 @@ +#include "loom/diag/symbolizer.h" + +#include + +#include + +namespace loom::diag { + +std::vector symbolize(const void* const* addrs, std::size_t n) { + cpptrace::raw_trace raw; + raw.frames.reserve(n); + for (std::size_t i = 0; i < n; ++i) + raw.frames.push_back( + static_cast(reinterpret_cast(addrs[i]))); + + cpptrace::stacktrace st = raw.resolve(); + + std::vector out; + out.reserve(st.frames.size()); + for (const auto& f : st.frames) { + SymFrame sf; + sf.address = static_cast(f.raw_address); + sf.symbol = f.symbol; + sf.filename = f.filename; + sf.line = f.line.has_value() ? static_cast(f.line.value()) : 0u; + out.push_back(std::move(sf)); + } + return out; +} + +} // namespace loom::diag diff --git a/runtime/src/run.cpp b/runtime/src/run.cpp index 50cb37e..847a8ed 100644 --- a/runtime/src/run.cpp +++ b/runtime/src/run.cpp @@ -1,6 +1,7 @@ #include "loom/runtime_core.h" #include "loom/server.h" #include "loom/version.h" +#include "loom/diag/crash_handler.h" #include @@ -149,6 +150,11 @@ int run(int argc, char* argv[]) { std::signal(SIGINT, signalHandler); std::signal(SIGTERM, signalHandler); + // Process-global crash capture: any fatal signal / SEH / unhandled C++ + // exception writes a crash report (faulting module/phase + build id + stack) + // to /crash before the process dies. Covers module and runtime faults. + loom::diag::CrashHandler::install({std::filesystem::path(dataDir) / "crash"}); + RuntimeConfig runtimeCfg; runtimeCfg.moduleDir = moduleDirs.front(); for (size_t i = 1; i < moduleDirs.size(); ++i) { diff --git a/runtime/src/runtime_core.cpp b/runtime/src/runtime_core.cpp index 07d3b78..07bde8d 100644 --- a/runtime/src/runtime_core.cpp +++ b/runtime/src/runtime_core.cpp @@ -43,7 +43,8 @@ std::filesystem::path resolveModulePath(const RuntimeConfig& config, RuntimeCore::RuntimeCore(const RuntimeConfig& config) : config_(config), dataStore_(config.dataDir), - watcher_(config.moduleDir) { + watcher_(config.moduleDir), + faultStore_(config.dataDir / "crash") { // Load scheduler.json from the data directory. auto schedPath = config.dataDir / "scheduler.json"; bool schedExisted = std::filesystem::exists(schedPath); @@ -52,6 +53,11 @@ RuntimeCore::RuntimeCore(const RuntimeConfig& config) // Provide scheduler with pointers needed for cycle-aligned oscilloscope sampling. scheduler_.setSamplingTargets(&oscilloscope_, &dataEngine_, &loader_, &moduleMutex_); scheduler_.setIOMapper(&ioMapper_); + + // Wire fault reporting: a module-call exception is captured, persisted, and + // published on `loom/faults` by the sink (see runtime_fault_sink.cpp). + faultSink_ = std::make_unique(faultStore_, dataEngine_, bus_); + scheduler_.setFaultSink(faultSink_.get()); if (!schedExisted) { loom::saveSchedulerConfig(schedCfg_, schedPath); spdlog::info("Wrote default scheduler config to '{}'", schedPath.string()); diff --git a/runtime/src/scheduler.cpp b/runtime/src/scheduler.cpp index 734fd19..069a183 100644 --- a/runtime/src/scheduler.cpp +++ b/runtime/src/scheduler.cpp @@ -1,6 +1,8 @@ #include "loom/scheduler.h" #include "loom/scheduler_config.h" #include "loom/io_mapper.h" +#include "loom/diag/guard.h" +#include "loom/diag/fault_sink.h" #include @@ -8,6 +10,7 @@ #include #include #include +#include #include #include @@ -165,6 +168,35 @@ void Scheduler::setIOMapper(IOMapper* mapper) { ioMapper_ = mapper; } +void Scheduler::recordModuleFault(TaskState& state, LoadedModule& mod, + diag::Phase phase, std::string_view message) { + // Write the fault details FIRST, then publish `faulted` with release. A + // reader (the server) that observes faulted==true with acquire is then + // guaranteed to see a fully-written lastFaultMsg. A module faults at most + // once (it's skipped thereafter), so these fields are effectively write-once. + const int64_t nowMs = static_cast( + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count()); + state.lastFaultMs.store(nowMs); + state.lastFaultPhase.store(static_cast(phase)); + { + const std::size_t n = std::min(message.size(), sizeof(state.lastFaultMsg) - 1); + std::memcpy(state.lastFaultMsg, message.data(), n); + state.lastFaultMsg[n] = '\0'; + } + mod.state = ModuleState::Error; + state.faulted.store(true, std::memory_order_release); // publish last + + spdlog::error("Module '{}' faulted in {}: {}", + mod.id, diag::phaseName(phase), message); + + if (faultSink_) { + faultSink_->onModuleFault(diag::FaultEvent{ + mod.id, mod.className, phase, + state.cycleCount.load(), std::string(message)}); + } +} + Scheduler::~Scheduler() { stopAll(); } @@ -224,6 +256,7 @@ bool Scheduler::start(LoadedModule& mod, const TaskConfig& config, const InitCon // Init module before any thread touches it. initGuarded() opens the // extension-registration window around the user's init(). spdlog::info("Initializing module '{}' (reason: {})", mod.id, static_cast(ctx.reason)); + diag::BreadcrumbScope initCrumb(diag::Phase::Init, mod.id.c_str(), mod.className.c_str()); try { mod.instance->initGuarded(ctx); } catch (const std::exception& e) { @@ -231,6 +264,9 @@ bool Scheduler::start(LoadedModule& mod, const TaskConfig& config, const InitCon // of the scheduler and terminate the runtime — fail this module cleanly. spdlog::error("Module '{}' init() failed: {}", mod.id, e.what()); mod.state = ModuleState::Error; + if (faultSink_) + faultSink_->onModuleFault(diag::FaultEvent{ + mod.id, mod.className, diag::Phase::Init, 0, e.what()}); return false; } mod.state = ModuleState::Initialized; @@ -244,10 +280,15 @@ bool Scheduler::start(LoadedModule& mod, const TaskConfig& config, const InitCon // Start long-running thread immediately (independent of class membership). if (config.enableLongRunning) { - statePtr->longRunningThread = std::thread([&mod, statePtr]() { + statePtr->longRunningThread = std::thread([this, &mod, statePtr]() { spdlog::info("Long-running task started for '{}'", mod.id); while (statePtr->running.load()) { - mod.instance->longRunning(); + bool ok = diag::guard(diag::Phase::LongRunning, mod.id.c_str(), mod.className.c_str(), + [&]{ mod.instance->longRunning(); }, + [&](const diag::FaultInfo& f) { + recordModuleFault(*statePtr, mod, f.phase, f.message); + }); + if (!ok) break; // stop the loop rather than spin-faulting } spdlog::info("Long-running task ended for '{}'", mod.id); }); @@ -292,6 +333,11 @@ void Scheduler::startClasses() { bool Scheduler::stop(const std::string& moduleId) { std::thread cyclicToJoin, longRunToJoin; + // Keep the TaskState alive until AFTER the threads are joined: the cyclic / + // long-running threads hold a raw TaskState* (running flag, fault fields), so + // destroying it before the join is a use-after-free. We extract the owning + // unique_ptr into this local and let it drop at end of function, post-join. + std::unique_ptr stateToFree; { std::lock_guard lock(mutex_); @@ -328,11 +374,16 @@ bool Scheduler::stop(const std::string& moduleId) { spdlog::info("Module '{}' stopped ({} cycles, {} overruns)", moduleId, state.cycleCount.load(), state.overrunCount.load()); - tasks_.erase(moduleId); + // Transfer ownership out of the map (keeps the TaskState object alive via + // stateToFree) before erasing the now-empty slot. + stateToFree = std::move(stateIt->second); + tasks_.erase(stateIt); configs_.erase(moduleId); } - // Join outside the lock so we don't block the mutex. + // Join outside the lock so we don't block the mutex. stateToFree keeps the + // TaskState valid for the threads until they have fully exited here, after + // which it is destroyed. if (cyclicToJoin.joinable()) cyclicToJoin.join(); if (longRunToJoin.joinable()) longRunToJoin.join(); @@ -752,10 +803,18 @@ void Scheduler::classLoop(ClassRunnerState& runner) { // Reading it here without a lock is safe. auto execStart = std::chrono::steady_clock::now(); + // Mark a member faulted (skipped on subsequent sweeps/ticks via the + // faulted checks below) and report — used by the guarded calls. + auto faultMember = [&](auto& m, const diag::FaultInfo& f) { + recordModuleFault(*m.state, *m.mod, f.phase, f.message); + }; + // --- Sweep 1: preCyclic (e.g. read hardware inputs) --- for (auto& member : runner.members) { if (member.state->faulted.load()) continue; - member.mod->instance->preCyclicGuarded(); + diag::guard(diag::Phase::PreCyclic, member.moduleId.c_str(), member.mod->className.c_str(), + [&]{ member.mod->instance->preCyclicGuarded(); }, + [&](const diag::FaultInfo& f){ faultMember(member, f); }); } // --- Sweep 2: cyclic (do work) — timed, sampled --- @@ -775,7 +834,9 @@ void Scheduler::classLoop(ClassRunnerState& runner) { // Execute (cyclicGuarded acquires module's runtimeMutex_ so // server/watch threads can't race on runtime_ reads) - member.mod->instance->cyclicGuarded(); + diag::guard(diag::Phase::Cyclic, member.moduleId.c_str(), member.mod->className.c_str(), + [&]{ member.mod->instance->cyclicGuarded(); }, + [&](const diag::FaultInfo& f){ faultMember(member, f); }); // Lightweight sampling: use oscilloscope fast-path. // A member in runner.members is guaranteed alive — removeMember() pauses @@ -818,7 +879,9 @@ void Scheduler::classLoop(ClassRunnerState& runner) { // --- Sweep 3: postCyclic (e.g. flush outputs to hardware) --- for (auto& member : runner.members) { if (member.state->faulted.load()) continue; - member.mod->instance->postCyclicGuarded(); + diag::guard(diag::Phase::PostCyclic, member.moduleId.c_str(), member.mod->className.c_str(), + [&]{ member.mod->instance->postCyclicGuarded(); }, + [&](const diag::FaultInfo& f){ faultMember(member, f); }); } // --- Record total class cycle time --- @@ -921,21 +984,31 @@ void Scheduler::isolatedLoop(LoadedModule& mod, TaskConfig config, TaskState& st state.lastCyclicStartNs.store(startNs); auto t0 = std::chrono::steady_clock::now(); - mod.instance->cyclicGuarded(); + if (!state.faulted.load()) { + diag::guard(diag::Phase::Cyclic, mod.id.c_str(), mod.className.c_str(), + [&]{ mod.instance->cyclicGuarded(); }, + [&](const diag::FaultInfo& f){ + recordModuleFault(state, mod, f.phase, f.message); + }); + } auto t1 = std::chrono::steady_clock::now(); - // Execute I/O mappings for this isolated module - if (ioMapper_) { - ioMapper_->executeForModule(mod.id); - } + // Once quarantined, skip I/O mappings + sampling too (the class loop skips + // faulted members entirely) — don't keep touching a module in an error state. + if (!state.faulted.load()) { + // Execute I/O mappings for this isolated module + if (ioMapper_) { + ioMapper_->executeForModule(mod.id); + } - // Sample this isolated module using oscilloscope fast-path. - // The isolated thread owns this module exclusively; state.running guards lifetime. - if (oscilloscope_ && dataEngine_) { - int64_t nowMs = static_cast( - std::chrono::duration_cast( - std::chrono::system_clock::now().time_since_epoch()).count()); - oscilloscope_->sampleModule(mod.id, *dataEngine_, *mod.instance, nowMs); + // Sample this isolated module using oscilloscope fast-path. + // The isolated thread owns this module exclusively; state.running guards lifetime. + if (oscilloscope_ && dataEngine_) { + int64_t nowMs = static_cast( + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count()); + oscilloscope_->sampleModule(mod.id, *dataEngine_, *mod.instance, nowMs); + } } auto elapsed = std::chrono::duration_cast(t1 - t0); diff --git a/runtime/src/server.cpp b/runtime/src/server.cpp index 72a9c43..694c319 100644 --- a/runtime/src/server.cpp +++ b/runtime/src/server.cpp @@ -1,4 +1,6 @@ #include "loom/server.h" +#include "loom/diag/breadcrumb.h" +#include "loom/diag/fault_store.h" // Crow static serving: define CROW_STATIC_DIRECTORY as a C++ variable expression // so the path is resolved at runtime (when app.run() calls add_static_dir()). @@ -273,6 +275,19 @@ static std::string moduleInfoJson(const LoadedModule& mod, const Scheduler& sche json += ",\"maxCycleTimeUs\":" + std::to_string(ts->maxCycleTimeUs.load()); json += ",\"lastJitterUs\":" + std::to_string(ts->lastJitterUs.load()); + // Last-fault diagnostics (faulted modules are skipped by the scheduler). + // Acquire-load `faulted` and read the last-fault fields only when set: + // this pairs with the release store in Scheduler::recordModuleFault so we + // never read a half-written lastFaultMsg (see scheduler.h). + const bool faulted = ts->faulted.load(std::memory_order_acquire); + json += ",\"faulted\":" + std::string(faulted ? "true" : "false"); + if (faulted) { + json += ",\"lastFaultMs\":" + std::to_string(ts->lastFaultMs.load()); + json += ",\"lastFaultPhase\":\"" + + std::string(diag::phaseName(static_cast(ts->lastFaultPhase.load()))) + "\""; + json += ",\"lastFaultMsg\":\"" + jsonEscapeString(ts->lastFaultMsg) + "\""; + } + // Add cycle history { std::lock_guard lk(ts->cycleHistoryMx); @@ -357,6 +372,49 @@ void Server::start() { return resp; }); + // ===================================================================== + // GET /api/faults — List fault reports (newest first). Includes this + // run's exception faults and any persisted reports from prior runs + // (signal-path crashes the process couldn't keep in memory). + // ===================================================================== + CROW_ROUTE(app, "/api/faults") + ([this]() { + std::string json = "["; + bool first = true; + for (const auto& s : core_.faultStore().list()) { + if (!first) json += ","; + first = false; + json += "{"; + json += "\"id\":\"" + jsonEscapeString(s.id) + "\""; + json += ",\"ts\":" + std::to_string(s.tsMs); + json += ",\"kind\":\"" + jsonEscapeString(s.kind) + "\""; + json += ",\"module\":\"" + jsonEscapeString(s.moduleId) + "\""; + json += ",\"class\":\"" + jsonEscapeString(s.className) + "\""; + json += ",\"phase\":\"" + jsonEscapeString(s.phase) + "\""; + json += ",\"reason\":\"" + jsonEscapeString(s.reason) + "\""; + json += "}"; + } + json += "]"; + auto resp = crow::response(200, json); + resp.add_header("Content-Type", "application/json"); + resp.add_header("Access-Control-Allow-Origin", "*"); + return resp; + }); + + // ===================================================================== + // GET /api/faults/ — Full structured report for one fault. + // ===================================================================== + CROW_ROUTE(app, "/api/faults/") + ([this](const std::string& id) { + auto detail = core_.faultStore().detailJson(id); + crow::response resp = detail + ? crow::response(200, *detail) + : crow::response(404, R"({"error":"fault not found"})"); + resp.add_header("Content-Type", "application/json"); + resp.add_header("Access-Control-Allow-Origin", "*"); + return resp; + }); + // ===================================================================== // POST /api/modules/instantiate — Create a new instance from a .so // Body: { "id": "left_motor", "so": "libexample_motor.so" } diff --git a/sdk/CMakeLists.txt b/sdk/CMakeLists.txt index e6a34a2..a9880e5 100644 --- a/sdk/CMakeLists.txt +++ b/sdk/CMakeLists.txt @@ -14,6 +14,29 @@ else() set(LOOM_SDK_VERSION ${PROJECT_VERSION}) endif() +# Optional git identity, stamped into version.h. Degrades gracefully: if git or +# the .git dir is absent (source-tarball / conan-cache / no-git builds), kGitSha +# becomes "unknown" and the build NEVER fails. kSdkVersion + BuildInfo remain the +# always-present identifiers. +set(LOOM_GIT_SHA "unknown") +find_package(Git QUIET) +if(Git_FOUND AND EXISTS "${CMAKE_SOURCE_DIR}/.git") + execute_process( + COMMAND ${GIT_EXECUTABLE} -C "${CMAKE_SOURCE_DIR}" rev-parse --short=12 HEAD + OUTPUT_VARIABLE LOOM_GIT_SHA OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET) + if(NOT LOOM_GIT_SHA) + set(LOOM_GIT_SHA "unknown") + else() + execute_process( + COMMAND ${GIT_EXECUTABLE} -C "${CMAKE_SOURCE_DIR}" status --porcelain --untracked-files=no + OUTPUT_VARIABLE _loom_git_dirty OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET) + if(_loom_git_dirty) + set(LOOM_GIT_SHA "${LOOM_GIT_SHA}+dirty") + endif() + endif() +endif() +message(STATUS "Loom git sha: ${LOOM_GIT_SHA}") + # Generate version.h from the template so every module built against this SDK # has the exact version string baked in at compile time. configure_file( diff --git a/sdk/include/loom/bus.h b/sdk/include/loom/bus.h index f79043c..65c41a2 100644 --- a/sdk/include/loom/bus.h +++ b/sdk/include/loom/bus.h @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -147,7 +148,16 @@ class Bus { } handler = it->second; } - return handler(request); + // A throwing service handler must not unwind into the caller (which may + // be a cyclic thread or the HTTP thread) — return an error instead. + // Dependency-free (std only): keeps the SDK clean of diagnostics deps. + try { + return handler(request); + } catch (const std::exception& e) { + return {false, {}, std::string("service threw: ") + e.what()}; + } catch (...) { + return {false, {}, "service threw: unknown exception"}; + } } // ========================================================================= diff --git a/sdk/include/loom/version.h.in b/sdk/include/loom/version.h.in index 26b8690..eb6f847 100644 --- a/sdk/include/loom/version.h.in +++ b/sdk/include/loom/version.h.in @@ -7,4 +7,8 @@ namespace loom { /// to detect version mismatches at load time. inline constexpr const char* kSdkVersion = "@LOOM_SDK_VERSION@"; +/// Git commit the SDK was built from ("unknown" if git/.git was unavailable at +/// build time). Used in crash reports to map a fault back to exact source. +inline constexpr const char* kGitSha = "@LOOM_GIT_SHA@"; + } // namespace loom diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 6e92f7c..6d48af4 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -9,6 +9,7 @@ add_executable(loom_tests test_command_integration.cpp test_command_fb.cpp test_port.cpp + test_diag.cpp ) target_include_directories(loom_tests PRIVATE @@ -32,6 +33,9 @@ target_sources(loom_tests PRIVATE ${CMAKE_SOURCE_DIR}/runtime/src/scheduler.cpp ${CMAKE_SOURCE_DIR}/runtime/src/scheduler_config.cpp ${CMAKE_SOURCE_DIR}/runtime/src/runtime_heap.cpp + ${CMAKE_SOURCE_DIR}/runtime/src/diag/breadcrumb.cpp # scheduler.cpp references diag::tlsBreadcrumb + ${CMAKE_SOURCE_DIR}/runtime/src/diag/fault_report.cpp + ${CMAKE_SOURCE_DIR}/runtime/src/diag/fault_store.cpp # no cpptrace dep (symbolizer is header-only here) ) include(GoogleTest) diff --git a/tests/test_diag.cpp b/tests/test_diag.cpp new file mode 100644 index 0000000..466fe59 --- /dev/null +++ b/tests/test_diag.cpp @@ -0,0 +1,148 @@ +#include "loom/diag/breadcrumb.h" +#include "loom/diag/guard.h" +#include "loom/diag/fault_report.h" +#include "loom/diag/fault_store.h" + +#include +#include + +#include +#include +#include + +using namespace loom::diag; + +// --- BreadcrumbScope ------------------------------------------------------- + +TEST(DiagBreadcrumb, SetsAndRestores) { + EXPECT_EQ(tlsBreadcrumb.phase, Phase::None); + { + BreadcrumbScope s(Phase::Cyclic, "mod_a", "ClassA"); + EXPECT_EQ(tlsBreadcrumb.phase, Phase::Cyclic); + EXPECT_STREQ(tlsBreadcrumb.moduleId, "mod_a"); + EXPECT_STREQ(tlsBreadcrumb.className, "ClassA"); + } + EXPECT_EQ(tlsBreadcrumb.phase, Phase::None); // restored +} + +TEST(DiagBreadcrumb, NestedRestores) { + BreadcrumbScope outer(Phase::Cyclic, "mod_a", "A"); + { + BreadcrumbScope inner(Phase::Service, "mod_b", "B"); + EXPECT_EQ(tlsBreadcrumb.phase, Phase::Service); + EXPECT_STREQ(tlsBreadcrumb.moduleId, "mod_b"); + } + EXPECT_EQ(tlsBreadcrumb.phase, Phase::Cyclic); // back to outer + EXPECT_STREQ(tlsBreadcrumb.moduleId, "mod_a"); +} + +// --- guard ----------------------------------------------------------------- + +TEST(DiagGuard, NoThrowReturnsTrueNoFault) { + bool faulted = false; + bool ran = false; + bool ok = guard(Phase::Cyclic, "m", "C", + [&]{ ran = true; }, + [&](const FaultInfo&){ faulted = true; }); + EXPECT_TRUE(ok); + EXPECT_TRUE(ran); + EXPECT_FALSE(faulted); + EXPECT_EQ(tlsBreadcrumb.phase, Phase::None); // breadcrumb restored +} + +TEST(DiagGuard, StdExceptionCaughtReported) { + std::string msg; + Phase capturedPhase = Phase::None; + bool ok = guard(Phase::PreCyclic, "m", "C", + [&]{ throw std::runtime_error("boom"); }, + [&](const FaultInfo& f){ msg = std::string(f.message); capturedPhase = f.phase; }); + EXPECT_FALSE(ok); + EXPECT_EQ(msg, "boom"); + EXPECT_EQ(capturedPhase, Phase::PreCyclic); + EXPECT_EQ(tlsBreadcrumb.phase, Phase::None); // restored even on throw +} + +TEST(DiagGuard, NonStdThrowCaught) { + bool faulted = false; + bool ok = guard(Phase::Cyclic, "m", "C", + [&]{ throw 42; }, // non-std exception + [&](const FaultInfo&){ faulted = true; }); + EXPECT_FALSE(ok); + EXPECT_TRUE(faulted); +} + +// --- FaultReport JSON ------------------------------------------------------ + +static FaultReport sampleReport() { + FaultReport r; + r.id = "mod_a-123-0"; + r.tsMs = 123; + r.kind = FaultKind::Exception; + r.reason = "boom \"quoted\"\nnewline"; // exercise escaping + r.sdkVersion = "0.3.0"; + r.gitSha = "abc123"; + r.buildType = "Debug"; + r.moduleId = "mod_a"; + r.className = "ClassA"; + r.phase = Phase::Cyclic; + r.cycle = 7; + r.frames.push_back(SymFrame{0xdead, "foo()", "foo.cpp", 42}); + FaultSections s; + s.runtime = R"({"pos":1.5})"; + r.sections = s; + return r; +} + +TEST(DiagFaultReport, ToJsonIsParseableAndPreservesFields) { + std::string json = toJson(sampleReport()); + + glz::json_t doc; + ASSERT_FALSE(glz::read_json(doc, json)) << "report JSON must parse"; + ASSERT_TRUE(doc.is_object()); + + EXPECT_EQ(doc["id"].get(), "mod_a-123-0"); + EXPECT_EQ(doc["kind"].get(), "exception"); + EXPECT_EQ(doc["reason"].get(), "boom \"quoted\"\nnewline"); // round-trips escaping + EXPECT_EQ(doc["breadcrumb"]["module"].get(), "mod_a"); + EXPECT_EQ(doc["breadcrumb"]["phase"].get(), "cyclic"); + EXPECT_EQ(static_cast(doc["breadcrumb"]["cycle"].get()), 7); + ASSERT_TRUE(doc["frames"].is_array()); + EXPECT_EQ(doc["frames"][0]["function"].get(), "foo()"); + // sections embed as real nested JSON, not an escaped string + ASSERT_TRUE(doc["sections"]["runtime"].is_object()); + EXPECT_EQ(static_cast(doc["sections"]["runtime"]["pos"].get()), 1.5); +} + +// --- FaultStore ------------------------------------------------------------ + +TEST(DiagFaultStore, RecordListDetailRoundTrip) { + auto dir = std::filesystem::temp_directory_path() / + ("loom_faults_" + std::to_string(::testing::UnitTest::GetInstance()->random_seed())); + std::filesystem::remove_all(dir); + + FaultStore store(dir); + EXPECT_TRUE(store.list().empty()); + + std::string id = store.record(sampleReport()); + EXPECT_EQ(id, "mod_a-123-0"); + + auto list = store.list(); + ASSERT_EQ(list.size(), 1u); + EXPECT_EQ(list[0].id, "mod_a-123-0"); + EXPECT_EQ(list[0].kind, "exception"); + EXPECT_EQ(list[0].moduleId, "mod_a"); + EXPECT_EQ(list[0].phase, "cyclic"); + + auto detail = store.detailJson(id); + ASSERT_TRUE(detail.has_value()); + EXPECT_NE(detail->find("\"boom"), std::string::npos); + + EXPECT_FALSE(store.detailJson("nope").has_value()); + + // A fresh store over the same dir re-loads the persisted report. + FaultStore reopened(dir); + ASSERT_EQ(reopened.list().size(), 1u); + EXPECT_EQ(reopened.list()[0].id, "mod_a-123-0"); + + std::filesystem::remove_all(dir); +}