22// License, v. 2.0. If a copy of the MPL was not distributed with this
33// file, You can obtain one at https://mozilla.org/MPL/2.0/.
44
5- //! A task that listens for hardware events from the
5+ //! A task that listens for changes in the [`HardwareView`] from the
66//! [`sled_hardware::HardwareManager`] and dispatches them to other parts
77//! of the bootstrap agent and sled-agent code.
88
99use crate :: services:: ServiceManager ;
1010use crate :: sled_agent:: SledAgent ;
1111use sled_agent_config_reconciler:: RawDisksSender ;
1212use sled_agent_types:: debug:: OperatorSwitchZonePolicy ;
13- use sled_hardware:: { HardwareManager , HardwareUpdate } ;
13+ use sled_hardware:: { HardwareManager , HardwareView } ;
1414use sled_hardware_types:: Baseboard ;
1515use sled_storage:: disk:: RawDisk ;
1616use slog:: Logger ;
17- use tokio:: sync:: broadcast :: error :: RecvError ;
18- use tokio:: sync:: { broadcast , oneshot , watch} ;
17+ use tokio:: sync:: oneshot ;
18+ use tokio:: sync:: watch;
1919
2020/// A handle controlling the behavior of a [`HardwareMonitor`]
2121#[ derive( Debug , Clone ) ]
@@ -52,15 +52,12 @@ pub struct HardwareMonitor {
5252 // Receive a onetime notification that the ServiceManager is ready
5353 service_manager_ready_rx : oneshot:: Receiver < ServiceManager > ,
5454
55- // Receive messages from the [`HardwareManager`]
56- hardware_rx : broadcast :: Receiver < HardwareUpdate > ,
55+ // Receive current view of hardware from the [`HardwareManager`]
56+ hardware_view_rx : watch :: Receiver < HardwareView > ,
5757
5858 // Receive the operator's policy controlling the switch zone
5959 switch_zone_policy_rx : watch:: Receiver < OperatorSwitchZonePolicy > ,
6060
61- // A reference to the hardware manager
62- hardware_manager : HardwareManager ,
63-
6461 // A handle to send raw disk updates to the config-reconciler system.
6562 raw_disks_tx : RawDisksSender ,
6663
@@ -96,8 +93,8 @@ impl HardwareMonitor {
9693 let ( sled_agent_started_tx, sled_agent_started_rx) = oneshot:: channel ( ) ;
9794 let ( service_manager_ready_tx, service_manager_ready_rx) =
9895 oneshot:: channel ( ) ;
99- let baseboard = hardware_manager. baseboard ( ) ;
100- let hardware_rx = hardware_manager . monitor ( ) ;
96+ let hardware_view_rx = hardware_manager. subscribe ( ) ;
97+ let baseboard = hardware_view_rx . borrow ( ) . baseboard ( ) ;
10198 let log = log. new ( o ! ( "component" => "HardwareMonitor" ) ) ;
10299 let ( switch_zone_policy_tx, switch_zone_policy_rx) =
103100 watch:: channel ( OperatorSwitchZonePolicy :: StartIfSwitchPresent ) ;
@@ -106,9 +103,8 @@ impl HardwareMonitor {
106103 baseboard,
107104 sled_agent_started_rx,
108105 service_manager_ready_rx,
109- hardware_rx ,
106+ hardware_view_rx ,
110107 switch_zone_policy_rx,
111- hardware_manager : hardware_manager. clone ( ) ,
112108 raw_disks_tx,
113109 sled_agent : None ,
114110 service_manager : None ,
@@ -150,14 +146,25 @@ impl HardwareMonitor {
150146 policy,
151147 ) . await ;
152148 }
153- update = self . hardware_rx. recv( ) => {
154- info!(
155- self . log,
156- "Received hardware update message" ;
157- "update" => ?update,
158- ) ;
159- self . handle_hardware_update( update. clone( ) ) . await
160- } ,
149+ result = self . hardware_view_rx. changed( ) => {
150+ match result {
151+ Ok ( ( ) ) => {
152+ info!(
153+ self . log,
154+ "Received notification hardware \
155+ view has changed"
156+ ) ;
157+ self . check_latest_hardware_snapshot( ) . await ;
158+ }
159+ Err ( _recv_error) => {
160+ // The `HardwareManager` monitoring task is an
161+ // infinite loop - the only way for us to get
162+ // an error from `changed()` here is if it panicked,
163+ // so we will propagate such a panic.
164+ panic!( "Hardware manager monitor task panicked" ) ;
165+ }
166+ }
167+ }
161168 Ok ( ( ) ) = self . switch_zone_policy_rx. changed( ) => {
162169 let policy = self . current_switch_zone_policy( ) ;
163170 info!(
@@ -179,70 +186,6 @@ impl HardwareMonitor {
179186 * self . switch_zone_policy_rx . borrow_and_update ( )
180187 }
181188
182- // Handle an update from the [`HardwareMonitor`]
183- async fn handle_hardware_update (
184- & mut self ,
185- update : Result < HardwareUpdate , RecvError > ,
186- ) {
187- match update {
188- Ok ( update) => match update {
189- HardwareUpdate :: TofinoAvailable => {
190- info ! (
191- self . log,
192- "Hardware monitor got TofinoAvailable message"
193- ) ;
194- let policy = self . current_switch_zone_policy ( ) ;
195- self . ensure_switch_zone_activated_or_deactivated (
196- true , policy,
197- )
198- . await
199- }
200- HardwareUpdate :: TofinoUnavailable => {
201- info ! (
202- self . log,
203- "Hardware monitor got TofinoUnavailable message"
204- ) ;
205- let policy = self . current_switch_zone_policy ( ) ;
206- self . ensure_switch_zone_activated_or_deactivated (
207- false , policy,
208- )
209- . await
210- }
211- HardwareUpdate :: TofinoDeviceChange => {
212- info ! (
213- self . log,
214- "Hardware monitor got TofinoDeviceChange message"
215- ) ;
216- if let Some ( sled_agent) = & mut self . sled_agent {
217- sled_agent. notify_nexus_about_self ( & self . log ) . await ;
218- }
219- }
220- HardwareUpdate :: DiskAdded ( disk) => {
221- self . raw_disks_tx
222- . add_or_update_raw_disk ( disk. into ( ) , & self . log ) ;
223- }
224- HardwareUpdate :: DiskRemoved ( disk) => {
225- self . raw_disks_tx
226- . remove_raw_disk ( disk. identity ( ) , & self . log ) ;
227- }
228- HardwareUpdate :: DiskUpdated ( disk) => {
229- self . raw_disks_tx
230- . add_or_update_raw_disk ( disk. into ( ) , & self . log ) ;
231- }
232- } ,
233- Err ( broadcast:: error:: RecvError :: Lagged ( count) ) => {
234- warn ! ( self . log, "Hardware monitor missed {count} messages" ) ;
235- self . check_latest_hardware_snapshot ( ) . await ;
236- }
237- Err ( broadcast:: error:: RecvError :: Closed ) => {
238- // The `HardwareManager` monitoring task is an infinite loop -
239- // the only way for us to get `Closed` here is if it panicked,
240- // so we will propagate such a panic.
241- panic ! ( "Hardware manager monitor task panicked" ) ;
242- }
243- }
244- }
245-
246189 async fn ensure_switch_zone_activated_or_deactivated (
247190 & mut self ,
248191 is_tofino_available : bool ,
@@ -296,29 +239,29 @@ impl HardwareMonitor {
296239 }
297240 }
298241
299- // Observe the current hardware state manually .
242+ // Act on the current hardware snapshot .
300243 //
301- // We use this when we're monitoring hardware for the first
302- // time, and if we miss notifications.
244+ // We use this on startup and any time the snapshot changes.
303245 async fn check_latest_hardware_snapshot ( & mut self ) {
304246 if let Some ( sled_agent) = & self . sled_agent {
305247 sled_agent. notify_nexus_about_self ( & self . log ) . await ;
306248 }
307249
250+ let snapshot = self . hardware_view_rx . borrow_and_update ( ) . clone ( ) ;
308251 info ! (
309252 self . log, "Checking current full hardware snapshot" ;
310- "disks " => ?self . hardware_manager . disks ( ) ,
253+ "snapshot " => ?snapshot ,
311254 ) ;
312255
313256 let policy = self . current_switch_zone_policy ( ) ;
314257 self . ensure_switch_zone_activated_or_deactivated (
315- self . hardware_manager . is_scrimlet_asic_available ( ) ,
258+ snapshot . is_scrimlet_asic_available ( ) ,
316259 policy,
317260 )
318261 . await ;
319262
320263 self . raw_disks_tx . set_raw_disks (
321- self . hardware_manager . disks ( ) . into_values ( ) . map ( RawDisk :: from) ,
264+ snapshot . into_disks ( ) . into_values ( ) . map ( RawDisk :: from) ,
322265 & self . log ,
323266 ) ;
324267 }
0 commit comments