Skip to content

Commit 87fbc42

Browse files
ben-zenBen LewisBrian-Perkins
authored
netvsp & net_mana - instrument VTL0 & host calls at risk of hanging (#3170)
There's a bunch of async operations with long or no timeout which are at risk of hanging the NetVSP code; adding spans to those actions makes them more obvious in tracing, with additional context to track down miscreant calls or make a misbehaving VM more obvious. --------- Co-authored-by: Ben Lewis <Ben.Lewis@microsoft.com> Co-authored-by: Brian Perkins <Brian-Perkins@users.noreply.github.com>
1 parent 122093b commit 87fbc42

3 files changed

Lines changed: 52 additions & 10 deletions

File tree

openhcl/underhill_core/src/emuplat/netvsp.rs

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -462,7 +462,14 @@ impl HclNetworkVFManagerWorker {
462462
// method.
463463
*self.guest_state.offered_to_guest.lock().await = false;
464464
// Give the network stack a chance to prepare for the removal.
465-
if let Err(err) = self.send_vf_state_change_notifications().await {
465+
if let Err(err) = self
466+
.send_vf_state_change_notifications()
467+
.instrument(tracing::info_span!(
468+
"sending VTL0 VF removal notice",
469+
vtl2_vfid,
470+
vtl0_bus = %bus_control))
471+
.await
472+
{
466473
tracing::error!(
467474
vtl2_vfid,
468475
err = err.as_ref() as &dyn std::error::Error,
@@ -840,13 +847,16 @@ impl HclNetworkVFManagerWorker {
840847
if !self.guest_state.is_offered_to_guest().await
841848
&& self.guest_state.vtl0_vfid().await.is_some()
842849
{
843-
tracing::info!(
844-
vtl2_vfid,
845-
vtl0_vfid = vtl0_vfid_from_bus_control(&self.vtl0_bus_control),
846-
"Adding VF to VTL0"
847-
);
848850
if let Vtl0Bus::Present(vtl0_bus_control) = &self.vtl0_bus_control {
849-
match vtl0_bus_control.offer_device().await {
851+
match vtl0_bus_control
852+
.offer_device()
853+
.instrument(tracing::info_span!(
854+
"adding VF to VTL0",
855+
vtl2_vfid,
856+
vtl0_vfid = vtl0_vfid_from_bus_control(&self.vtl0_bus_control)
857+
))
858+
.await
859+
{
850860
Ok(_) => {
851861
*self.guest_state.offered_to_guest.lock().await = true;
852862
}
@@ -860,6 +870,12 @@ impl HclNetworkVFManagerWorker {
860870
);
861871
}
862872
}
873+
} else {
874+
tracing::info!(
875+
vtl2_vfid,
876+
%self.vtl0_bus_control,
877+
"Ignoring VTL0 device request from guest"
878+
);
863879
}
864880
}
865881
}

vm/devices/net/net_mana/src/lib.rs

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ use std::sync::atomic::Ordering;
6969
use std::task::Context;
7070
use std::task::Poll;
7171
use thiserror::Error;
72+
use tracing::Instrument;
7273
use user_driver::DeviceBacking;
7374
use user_driver::DmaClient;
7475
use user_driver::interrupt::DeviceInterrupt;
@@ -490,6 +491,10 @@ impl<T: DeviceBacking> Endpoint for ManaEndpoint<T> {
490491
default_rxobj: None,
491492
indirection_table: None,
492493
})
494+
.instrument(tracing::info_span!(
495+
"clearing rx configuration",
496+
vport_id = self.vport.id()
497+
))
493498
.await
494499
{
495500
tracing::warn!(
@@ -499,12 +504,25 @@ impl<T: DeviceBacking> Endpoint for ManaEndpoint<T> {
499504
}
500505

501506
self.queues.clear();
502-
self.vport.destroy(std::mem::take(&mut self.arena)).await;
507+
self.vport
508+
.destroy(std::mem::take(&mut self.arena))
509+
.instrument(tracing::info_span!(
510+
"destroying vport resources",
511+
vport_id = self.vport.id()
512+
))
513+
.await;
503514
// Wait for all outstanding queues. There can be a delay switching out
504515
// the queues when an endpoint is removed, and the queue has access to
505516
// the vport which is being stopped here.
506517
if self.queue_tracker.0.load(Ordering::Acquire) > 0 {
507-
self.queue_tracker.1.wait().await;
518+
self.queue_tracker
519+
.1
520+
.wait()
521+
.instrument(tracing::info_span!(
522+
"waiting for outstanding queues to stop",
523+
vport_id = self.vport.id()
524+
))
525+
.await;
508526
}
509527
}
510528

vm/devices/net/netvsp/src/lib.rs

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1335,7 +1335,15 @@ impl VmbusDevice for Nic {
13351335
}
13361336

13371337
// Note that this await is not restartable.
1338-
self.coordinator.task_mut().endpoint.stop().await;
1338+
self.coordinator
1339+
.task_mut()
1340+
.endpoint
1341+
.stop()
1342+
.instrument(tracing::info_span!(
1343+
"stopping coordinator endpoint",
1344+
instance_id = %self.instance_id,
1345+
))
1346+
.await;
13391347

13401348
// Keep any VF's added to the guest. This is required to keep guest compat as
13411349
// some apps (such as DPDK) relies on the VF sticking around even after vmbus

0 commit comments

Comments
 (0)