diff --git a/src/launch_manager_daemon/process_state_client_lib/include/score/lcm/iprocessstatereceiver.hpp b/src/launch_manager_daemon/process_state_client_lib/include/score/lcm/iprocessstatereceiver.hpp index c594155cc..d4d7e46d2 100644 --- a/src/launch_manager_daemon/process_state_client_lib/include/score/lcm/iprocessstatereceiver.hpp +++ b/src/launch_manager_daemon/process_state_client_lib/include/score/lcm/iprocessstatereceiver.hpp @@ -13,24 +13,27 @@ #ifndef IPROCESSSTATERECEIVER_HPP_INCLUDED #define IPROCESSSTATERECEIVER_HPP_INCLUDED -#include #include "score/result/result.h" #include #include +#include #include -namespace score { +namespace score +{ -namespace lcm { +namespace lcm +{ /// @brief IProcessStateReceiver interface for handling the information about each Process current state. -/// Health Monitor (HM) shall use this interface in order to properly receive +/// Alive Monitor (AM) shall use this interface in order to properly receive /// information about the current state from the posix processes running in the scope of an Adaptive Machine. -/// Each posix process state change is sent by Launch Manager (LCM) and can be read by HM. +/// Each posix process state change is sent by Launch Manager (LCM) and can be read by AM. -class IProcessStateReceiver { - public: +class IProcessStateReceiver +{ + public: virtual ~IProcessStateReceiver() noexcept = default; /// @brief Returns a queued PosixProcess that has not yet been parsed. @@ -38,8 +41,8 @@ class IProcessStateReceiver { virtual score::Result> getNextChangedPosixProcess() noexcept = 0; }; -} // namespace lcm +} // namespace lcm -} // namespace score +} // namespace score #endif diff --git a/src/launch_manager_daemon/recovery_client_lib/include/score/lcm/irecovery_client.h b/src/launch_manager_daemon/recovery_client_lib/include/score/lcm/irecovery_client.h index 623d60b15..46a6a1c54 100644 --- a/src/launch_manager_daemon/recovery_client_lib/include/score/lcm/irecovery_client.h +++ b/src/launch_manager_daemon/recovery_client_lib/include/score/lcm/irecovery_client.h @@ -13,25 +13,28 @@ #ifndef SCORE_LCM_IRECOVERYCLIENT_H_ #define SCORE_LCM_IRECOVERYCLIENT_H_ -#include #include +#include -namespace score { -namespace lcm { +namespace score +{ +namespace lcm +{ /// @brief Represents a recovery request for a failed process group. -struct RecoveryRequest { +struct RecoveryRequest +{ /// @brief The id of the process group the failed process is running in score::lcm::IdentifierHash process_group_identifier_{}; }; -/// @brief The RecoveryClient allows the HealthMonitor component to report supervision failures to the ProcessGroupManager -/// thus requesting recovery for a specific process group. -/// The requests are queued and periodically processed by the ProcessGroupManager. -/// In case the buffer is full and request cannot be queued, the overflow flag is set. -/// A detected overflow shall be handled as a critical failure by the ProcessGroupManager. -class IRecoveryClient { -public: +/// @brief The RecoveryClient allows the AliveMonitor component to report supervision failures to the +/// ProcessGroupManager thus requesting recovery for a specific process group. The requests are queued and periodically +/// processed by the ProcessGroupManager. In case the buffer is full and request cannot be queued, the overflow flag is +/// set. A detected overflow shall be handled as a critical failure by the ProcessGroupManager. +class IRecoveryClient +{ + public: IRecoveryClient() noexcept = default; virtual ~IRecoveryClient() noexcept = default; IRecoveryClient(const IRecoveryClient&) = delete; @@ -48,12 +51,13 @@ class IRecoveryClient { /// @return The request, or std::nullopt if no request is available virtual std::optional getNextRequest() noexcept = 0; - /// @brief Checks if overflow has been set, by previously calling `sendRecoveryRequest` while the queue was already full + /// @brief Checks if overflow has been set, by previously calling `sendRecoveryRequest` while the queue was already + /// full /// @details Since overflow is a critical error, the flag is never reset /// @return True if overflow has occurred, else false. virtual bool hasOverflow() const noexcept = 0; }; -} // namespace lcm -} // namespace score +} // namespace lcm +} // namespace score #endif diff --git a/src/launch_manager_daemon/src/main/main.cpp b/src/launch_manager_daemon/src/main/main.cpp index d86f2ed3f..3d70a3e97 100644 --- a/src/launch_manager_daemon/src/main/main.cpp +++ b/src/launch_manager_daemon/src/main/main.cpp @@ -17,7 +17,7 @@ #include -#include +#include #include #include #include @@ -134,11 +134,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] const char* argv[]) std::unique_ptr healthMonitor{ std::make_unique( recoveryClient, std::move(watchdog), process_state_notifier->constructReceiver())}; - std::unique_ptr healthMonitorThread{ - std::make_unique(std::move(healthMonitor))}; + std::unique_ptr aliveMonitorThread{ + std::make_unique(std::move(healthMonitor))}; std::unique_ptr process_group_manager = std::make_unique( - std::move(healthMonitorThread), recoveryClient, std::move(process_state_notifier)); + std::move(aliveMonitorThread), recoveryClient, std::move(process_state_notifier)); if (initializeLCMDaemon(*process_group_manager)) { diff --git a/src/launch_manager_daemon/src/process_group_manager/health_monitor_thread.cpp b/src/launch_manager_daemon/src/process_group_manager/alive_monitor_thread.cpp similarity index 54% rename from src/launch_manager_daemon/src/process_group_manager/health_monitor_thread.cpp rename to src/launch_manager_daemon/src/process_group_manager/alive_monitor_thread.cpp index 01b5b3b4a..3d765f916 100644 --- a/src/launch_manager_daemon/src/process_group_manager/health_monitor_thread.cpp +++ b/src/launch_manager_daemon/src/process_group_manager/alive_monitor_thread.cpp @@ -10,27 +10,30 @@ * * SPDX-License-Identifier: Apache-2.0 ********************************************************************************/ -#include "health_monitor_thread.hpp" +#include "alive_monitor_thread.hpp" namespace score { namespace lcm { -namespace internal +namespace internal { -HealthMonitorThread::HealthMonitorThread(std::unique_ptr health_monitor) : m_health_monitor(std::move(health_monitor)) { - +AliveMonitorThread::AliveMonitorThread(std::unique_ptr health_monitor) + : m_health_monitor(std::move(health_monitor)) +{ } -bool HealthMonitorThread::start() { +bool AliveMonitorThread::start() +{ score::lcm::saf::daemon::EInitCode init_status{score::lcm::saf::daemon::EInitCode::kNotInitialized}; - health_monitor_thread_ = std::thread([this, &init_status]() { + alive_monitor_thread_ = std::thread([this, &init_status]() { const auto initResult = m_health_monitor->init(); - + notifyInitializationComplete(init_status, initResult); - if (initResult == saf::daemon::EInitCode::kNoError) { + if (initResult == saf::daemon::EInitCode::kNoError) + { m_health_monitor->run(stop_thread_); } }); @@ -40,16 +43,18 @@ bool HealthMonitorThread::start() { return init_status == saf::daemon::EInitCode::kNoError; } -void HealthMonitorThread::stop() { +void AliveMonitorThread::stop() +{ stop_thread_.store(true); - if (health_monitor_thread_.joinable()) { - health_monitor_thread_.join(); + if (alive_monitor_thread_.joinable()) + { + alive_monitor_thread_.join(); } } -void HealthMonitorThread::notifyInitializationComplete( - score::lcm::saf::daemon::EInitCode& f_init_status_r, - const score::lcm::saf::daemon::EInitCode f_init_result) { +void AliveMonitorThread::notifyInitializationComplete(score::lcm::saf::daemon::EInitCode& f_init_status_r, + const score::lcm::saf::daemon::EInitCode f_init_result) +{ { std::lock_guard lk(m_initialization_mutex); f_init_status_r = f_init_result; @@ -57,16 +62,14 @@ void HealthMonitorThread::notifyInitializationComplete( m_initialization_cv.notify_all(); } -void HealthMonitorThread::waitForInitializationCompleted( - score::lcm::saf::daemon::EInitCode& f_init_status_r) { +void AliveMonitorThread::waitForInitializationCompleted(score::lcm::saf::daemon::EInitCode& f_init_status_r) +{ std::unique_lock lk(m_initialization_mutex); - m_initialization_cv.wait( - lk, - [&f_init_status_r]() { - return f_init_status_r != score::lcm::saf::daemon::EInitCode::kNotInitialized; - }); + m_initialization_cv.wait(lk, [&f_init_status_r]() { + return f_init_status_r != score::lcm::saf::daemon::EInitCode::kNotInitialized; + }); } -} -} -} +} // namespace internal +} // namespace lcm +} // namespace score diff --git a/src/launch_manager_daemon/src/process_group_manager/health_monitor_thread.hpp b/src/launch_manager_daemon/src/process_group_manager/alive_monitor_thread.hpp similarity index 51% rename from src/launch_manager_daemon/src/process_group_manager/health_monitor_thread.hpp rename to src/launch_manager_daemon/src/process_group_manager/alive_monitor_thread.hpp index b41a5635b..3a124ff10 100644 --- a/src/launch_manager_daemon/src/process_group_manager/health_monitor_thread.hpp +++ b/src/launch_manager_daemon/src/process_group_manager/alive_monitor_thread.hpp @@ -10,14 +10,14 @@ * * SPDX-License-Identifier: Apache-2.0 ********************************************************************************/ -#ifndef SCORE_LCM_HEALTH_MONITOR_HPP_INCLUDED -#define SCORE_LCM_HEALTH_MONITOR_HPP_INCLUDED +#ifndef SCORE_LCM_ALIVE_MONITOR_THREAD_HPP_INCLUDED +#define SCORE_LCM_ALIVE_MONITOR_THREAD_HPP_INCLUDED -#include -#include #include +#include +#include -#include +#include namespace score { @@ -26,31 +26,32 @@ namespace lcm namespace internal { -/// @brief HealthMonitor manages the lifecycle of the Health Monitor daemon in a separate thread. -class HealthMonitorThread final : public IHealthMonitorThread { - public: - HealthMonitorThread(std::unique_ptr health_monitor); +/// @brief AliveMonitor manages the lifecycle of the alive monitoring daemon in a separate thread. +class AliveMonitorThread final : public IAliveMonitorThread +{ + public: + AliveMonitorThread(std::unique_ptr health_monitor); - /// @brief Starts the Health Monitor thread. - /// @return true if the Health Monitor started successfully, false otherwise. + /// @brief Starts the Alive Monitor thread. + /// @return true if the Alive Monitor started successfully, false otherwise. bool start() override; - /// @brief Stops the Health Monitor thread. + /// @brief Stops the Alive Monitor thread. void stop() override; - private: - void notifyInitializationComplete( - score::lcm::saf::daemon::EInitCode& f_init_status_r, - const score::lcm::saf::daemon::EInitCode f_init_result); - void waitForInitializationCompleted(score::lcm::saf::daemon::EInitCode& f_init_status_r); + + private: + void notifyInitializationComplete(score::lcm::saf::daemon::EInitCode& f_init_status_r, + const score::lcm::saf::daemon::EInitCode f_init_result); + void waitForInitializationCompleted(score::lcm::saf::daemon::EInitCode& f_init_status_r); std::unique_ptr m_health_monitor{nullptr}; - std::thread health_monitor_thread_{}; + std::thread alive_monitor_thread_{}; std::atomic_bool stop_thread_{false}; std::mutex m_initialization_mutex{}; std::condition_variable m_initialization_cv{}; }; - -} -} -} + +} // namespace internal +} // namespace lcm +} // namespace score #endif diff --git a/src/launch_manager_daemon/src/process_group_manager/ihealth_monitor_thread.hpp b/src/launch_manager_daemon/src/process_group_manager/ialive_monitor_thread.hpp similarity index 72% rename from src/launch_manager_daemon/src/process_group_manager/ihealth_monitor_thread.hpp rename to src/launch_manager_daemon/src/process_group_manager/ialive_monitor_thread.hpp index a6011c574..739be487d 100644 --- a/src/launch_manager_daemon/src/process_group_manager/ihealth_monitor_thread.hpp +++ b/src/launch_manager_daemon/src/process_group_manager/ialive_monitor_thread.hpp @@ -11,9 +11,8 @@ * SPDX-License-Identifier: Apache-2.0 ********************************************************************************/ - -#ifndef SCORE_LCM_IHEALTH_MONITOR_HPP_INCLUDED -#define SCORE_LCM_IHEALTH_MONITOR_HPP_INCLUDED +#ifndef SCORE_LCM_IALIVE_MONITOR_THREAD_HPP_INCLUDED +#define SCORE_LCM_IALIVE_MONITOR_THREAD_HPP_INCLUDED namespace score { @@ -21,15 +20,16 @@ namespace lcm { namespace internal { -class IHealthMonitorThread { - public: +class IAliveMonitorThread +{ + public: virtual bool start() = 0; virtual void stop() = 0; - virtual ~IHealthMonitorThread() = default; + virtual ~IAliveMonitorThread() = default; }; -} -} -} +} // namespace internal +} // namespace lcm +} // namespace score -#endif \ No newline at end of file +#endif diff --git a/src/launch_manager_daemon/src/process_group_manager/processgroupmanager.cpp b/src/launch_manager_daemon/src/process_group_manager/processgroupmanager.cpp index 5c9a1b743..78998b26a 100644 --- a/src/launch_manager_daemon/src/process_group_manager/processgroupmanager.cpp +++ b/src/launch_manager_daemon/src/process_group_manager/processgroupmanager.cpp @@ -16,7 +16,7 @@ #include #include -#include +#include #include #include @@ -37,7 +37,7 @@ void ProcessGroupManager::cancel() my_signal_handler(SIGTERM); } -ProcessGroupManager::ProcessGroupManager(std::unique_ptr health_monitor, +ProcessGroupManager::ProcessGroupManager(std::unique_ptr alive_monitor_thread, std::shared_ptr recovery_client, std::unique_ptr process_state_notifier) : configuration_manager_(), @@ -49,7 +49,7 @@ ProcessGroupManager::ProcessGroupManager(std::unique_ptr h num_process_groups_(0U), process_groups_(), process_state_notifier_(std::move(process_state_notifier)), - health_monitor_thread_(std::move(health_monitor)), + alive_monitor_thread_(std::move(alive_monitor_thread)), recovery_client_(recovery_client) //, // ucm_polling_thread_( // [this](const Message::Action act, const Message::UpdateContext updateCtx, const lib::fun::string& swc) -> bool @@ -98,9 +98,9 @@ bool ProcessGroupManager::initialize() LM_LOG_DEBUG() << "Process Group initialization done"; createProcessComponentsObjects(); initializeGraphNodes(); - if (!health_monitor_thread_->start()) + if (!alive_monitor_thread_->start()) { - LM_LOG_ERROR() << "Health monitor thread failed to start"; + LM_LOG_ERROR() << "Alive monitor thread failed to start"; return false; } @@ -116,7 +116,7 @@ bool ProcessGroupManager::initialize() void ProcessGroupManager::deinitialize() { // ucm_polling_thread_.stopPolling(); - health_monitor_thread_->stop(); + alive_monitor_thread_->stop(); configuration_manager_.deinitialize(); process_groups_.clear(); @@ -528,13 +528,13 @@ inline void ProcessGroupManager::recoveryActionHandler() if (nullptr == pg) { - LM_LOG_ERROR() << "recoveryActionHandler: Unknown process group " << recovery_request->process_group_identifier_; + LM_LOG_ERROR() << "recoveryActionHandler: Unknown process group " + << recovery_request->process_group_identifier_; continue; } const IdentifierHash old_state = pg->getProcessGroupState(); - const IdentifierHash recovery_state = - configuration_manager_.getNameOfRecoveryState(pg->getProcessGroupName()); + const IdentifierHash recovery_state = configuration_manager_.getNameOfRecoveryState(pg->getProcessGroupName()); const GraphState graph_state = pg->getState(); LM_LOG_DEBUG() << "recoveryActionHandler: Processing recovery request for PG " diff --git a/src/launch_manager_daemon/src/process_group_manager/processgroupmanager.hpp b/src/launch_manager_daemon/src/process_group_manager/processgroupmanager.hpp index 2143c9b04..c49830ddd 100644 --- a/src/launch_manager_daemon/src/process_group_manager/processgroupmanager.hpp +++ b/src/launch_manager_daemon/src/process_group_manager/processgroupmanager.hpp @@ -11,56 +11,62 @@ * SPDX-License-Identifier: Apache-2.0 ********************************************************************************/ - #ifndef PROCESSGROUPMANAGER_HPP_INCLUDED #define PROCESSGROUPMANAGER_HPP_INCLUDED #include -#include #include +#include -#include -#include +#include #include -#include #include -#include +#include +#include #include -#include #include #include #include -#include -#include +#include #include +#include +#include +#include - -namespace score::lcm::internal { +namespace score::lcm::internal +{ /// @brief ProcessGroupManager provides the core functionality of LCM. /// Software that is deployed to the machine, should be managed through Process Groups. -/// A Process Group (PG) can be described as a set of applications, or executable files, that should be controlled in a coherent way. -/// Through a Process Group, Launch Manager will control the life cycle of Operating System (OS) processes. -/// They will be started and stopped when State Management (SM) request so and they will be started and stopped in a way, that is described by integrator through configuration. -/// When SM request PG change, ProcessGroupManager will use ConfigurationManager to figure out what processes shall be started, or stopped, as well as their startup configuration. -/// Then ProcessGroupManager will use Operating System Abstraction Layer (OSAL) to start, or stop, processes as per configuration. -/// Some of the responsibilities of ProcessGroupManager include: -/// Interaction with ConfigurationManager to ensure that, the list of processes that are running on Machine, is as configured by integrator. -/// Interaction with OSAL to start and stop processes. -/// Interaction with OSAL to discover when processes terminated in an unexpected way. -/// Fulfilling PG State transitions requests from SM, as well as informing SM about unexpected problems (for example process crashes). -class ProcessGroupManager final { - using WorkerQueue = MPMCConcurrentQueue, - static_cast(ProcessLimits::kMaxProcesses)>; - public: +/// A Process Group (PG) can be described as a set of applications, or executable files, that should be controlled in a +/// coherent way. Through a Process Group, Launch Manager will control the life cycle of Operating System (OS) +/// processes. They will be started and stopped when State Management (SM) request so and they will be started and +/// stopped in a way, that is described by integrator through configuration. When SM request PG change, +/// ProcessGroupManager will use ConfigurationManager to figure out what processes shall be started, or stopped, as well +/// as their startup configuration. Then ProcessGroupManager will use Operating System Abstraction Layer (OSAL) to +/// start, or stop, processes as per configuration. Some of the responsibilities of ProcessGroupManager include: +/// Interaction with ConfigurationManager to ensure that, the list of processes that are running on Machine, is as +/// configured by integrator. Interaction with OSAL to start and stop processes. Interaction with OSAL to discover +/// when processes terminated in an unexpected way. Fulfilling PG State transitions requests from SM, as well as +/// informing SM about unexpected problems (for example process crashes). +class ProcessGroupManager final +{ + using WorkerQueue = + MPMCConcurrentQueue, static_cast(ProcessLimits::kMaxProcesses)>; + + public: /// @brief Constructs a new ProcessGroupManager object. /// /// This constructor initializes the ProcessGroupManager instance, /// setting up any necessary internal state and preparing it for use. - /// @param health_monitor A unique pointer to an IHealthMonitor instance for managing health monitoring. + /// @param alive_monitor_thread A unique pointer to an IAliveMonitorThread instance for managing health + /// monitoring. /// @param recovery_client A shared pointer to an IRecoveryClient instance for handling recovery operations. - /// @param process_state_notifier A unique pointer to an IProcessStateNotifier instance for notifying the HM thread of process state changes. - ProcessGroupManager(std::unique_ptr health_monitor, std::shared_ptr recovery_client, std::unique_ptr process_state_notifier); + /// @param process_state_notifier A unique pointer to an IProcessStateNotifier instance for notifying the Alive + /// Monitor thread of process state changes. + ProcessGroupManager(std::unique_ptr alive_monitor_thread, + std::shared_ptr recovery_client, + std::unique_ptr process_state_notifier); /// @brief Initializes the process group manager. /// Loads the flat configuration through ConfigurationManager. @@ -102,7 +108,8 @@ class ProcessGroupManager final { /// @brief Send a response message to a Control Client /// @param msg the message to send, containing the Control Client id as the address to send it - /// @return true when either no error or the state manager no longer exists, false when the state manager had not read the previous response + /// @return true when either no error or the state manager no longer exists, false when the state manager had not + /// read the previous response bool sendResponse(ControlClientMessage msg); /// @brief Gets the process interface. @@ -128,7 +135,8 @@ class ProcessGroupManager final { /// if no more free shared memory, the PosixProcess is not sent. /// @param[in] f_posixProcess The PosixProcess to be queued /// @returns True on success, false for failure (corresponding to kCommunicationError). - bool queuePosixProcess(const score::lcm::PosixProcess& f_posixProcess) { + bool queuePosixProcess(const score::lcm::PosixProcess& f_posixProcess) + { return process_state_notifier_->queuePosixProcess(f_posixProcess); } @@ -138,8 +146,7 @@ class ProcessGroupManager final { /// @brief Set the internal pointer for the Launch Manager ProcessInfoNode void setLaunchManagerConfiguration(const OsProcess* launch_manager_config); - - private: + private: /// @brief Perform the function of Control Client handler /// @details (a) check for requests from any state manager processes in this process group\n /// (b) check to see if the process group has a pending response to send to a state manager @@ -169,7 +176,7 @@ class ProcessGroupManager final { /// @param pg Reference of the process group (Graph) to check for pending responses void controlClientResponses(Graph& pg); - /// @brief Handle recovery actions requested by the Health Monitor + /// @brief Handle recovery actions requested by the Alive Monitor void recoveryActionHandler(); /// @brief Manage the process group by starting any pending transitions that were requested @@ -214,8 +221,8 @@ class ProcessGroupManager final { /// @brief process a get execution error request /// @details If the process group given in the `process_group_state_` exists:\n /// if the corresponding graph is in the `kUndefined` state:\n - /// set the `execution_error_code_` of the message to the result of calling `getLastExecutionError` method of the graph\n - /// set the request code of the message to `kExecutionErrorRequestSuccess`\n + /// set the `execution_error_code_` of the message to the result of calling `getLastExecutionError` method + /// of the graph\n set the request code of the message to `kExecutionErrorRequestSuccess`\n /// else:\n /// set the request code of the message to `kExecutionErrorRequestFailed`\n /// else:\n @@ -242,7 +249,8 @@ class ProcessGroupManager final { /// @details cancel any Graph for a process group not in the "Off" state, wait for up to 2 seconds for all graphs /// to be no longer in the `kCancelled` state, start a transition of remaining process groups to "Off" state, /// and finally wait for up to a second for all graphs to complete. - /// @warning Side effect: Depending if it is needed to forcefully terminate processes, worker jobs might be stopped after this call + /// @warning Side effect: Depending if it is needed to forcefully terminate processes, worker jobs might be stopped + /// after this call void allProcessGroupsOff(); /// @brief Initializes the process groups. @@ -299,12 +307,11 @@ class ProcessGroupManager final { /// @brief pointer to the configuration for Launch Manager const OsProcess* launch_manager_config_{nullptr}; - std::unique_ptr health_monitor_thread_; + std::unique_ptr alive_monitor_thread_; std::shared_ptr recovery_client_{}; }; } // namespace score::lcm::internal - #endif /// PROCESSGROUPMANAGER_HPP_INCLUDED