Skip to content

Commit fffdb4f

Browse files
committed
mana_driver: VF reconfiguration sets HWC failure and fast-revokes the VTL0 VF
1 parent 2fbb389 commit fffdb4f

4 files changed

Lines changed: 73 additions & 5 deletions

File tree

openhcl/underhill_core/src/emuplat/netvsp.rs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1079,10 +1079,14 @@ impl HclNetworkVFManagerWorker {
10791079
vtl0_vfid = vtl0_vfid_from_bus_control(&self.vtl0_bus_control),
10801080
"VTL0 VF being removed as a result of VF Reconfiguration."
10811081
);
1082-
self.try_notify_guest_and_revoke_vtl0_vf(&Vtl0Bus::NotPresent)
1083-
.await;
1082+
self.remove_vtl0_vf().await;
10841083
}
10851084

1085+
// Set MAC filters to None.
1086+
// Force the restarted device to resend HWC commands to update filters.
1087+
// Safety measure for NICs without `cap_filter_state_query` support.
1088+
self.save_state.direction_to_vtl0.lock().fill(None);
1089+
10861090
// Don't 'keep alive'. VTL2 is reconfigured when in a bad state.
10871091
let keep_vf_alive = false;
10881092
self.shutdown_vtl2_device(keep_vf_alive).await;

vm/devices/net/mana_driver/src/gdma_driver.rs

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1021,6 +1021,8 @@ impl<T: DeviceBacking> GdmaDriver<T> {
10211021
// No data is supplied for VF reconfiguration events.
10221022
tracing::info!("HWC VF reconfiguration event");
10231023
self.vf_reconfiguration_pending = true;
1024+
// HWC will no longer respond after this event.
1025+
self.hwc_failure = true;
10241026
}
10251027
ty => tracing::error!(ty, "unknown eq event"),
10261028
}
@@ -1137,8 +1139,16 @@ impl<T: DeviceBacking> GdmaDriver<T> {
11371139
_ => "response received with delay",
11381140
}
11391141
);
1140-
self.report_hwc_timeout(wait_failed, interrupt_loss, eqe_wait_result.elapsed as u32)
1142+
// Don't report the timeout once VF reconfiguration is pending,
1143+
// since the SoC will not respond.
1144+
if !self.vf_reconfiguration_pending {
1145+
self.report_hwc_timeout(
1146+
wait_failed,
1147+
interrupt_loss,
1148+
eqe_wait_result.elapsed as u32,
1149+
)
11411150
.await;
1151+
}
11421152
if !wait_failed && eqe_wait_result.elapsed > self.hwc_warning_time_in_ms as u128 {
11431153
// Increase warning threshold after each delay warning occurrence.
11441154
self.hwc_warning_time_in_ms += HWC_WARNING_INCREASE_IN_MS;
@@ -1162,7 +1172,11 @@ impl<T: DeviceBacking> GdmaDriver<T> {
11621172
));
11631173
}
11641174
}
1165-
self.hwc_failure = false;
1175+
// If the wait successfully found an EQE, clear any hwc_failure set by a prior timeout.
1176+
// Don't clear hwc_failure during VF reconfiguration (EQE 135); the device is gone.
1177+
if !self.vf_reconfiguration_pending {
1178+
self.hwc_failure = false;
1179+
}
11661180
Ok(())
11671181
}
11681182

@@ -1478,4 +1492,9 @@ impl<T: DeviceBacking> GdmaDriver<T> {
14781492
)
14791493
.await
14801494
}
1495+
1496+
/// Returns true if the HWC has failed and no further requests will succeed.
1497+
pub(crate) fn hwc_failure(&self) -> bool {
1498+
self.hwc_failure
1499+
}
14811500
}

vm/devices/net/mana_driver/src/resources.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,12 +63,23 @@ impl ResourceArena {
6363
}
6464

6565
pub(crate) async fn destroy<T: DeviceBacking>(mut self, gdma: &mut GdmaDriver<T>) {
66+
let skip_hwc = gdma.hwc_failure();
67+
if skip_hwc {
68+
tracing::info!(
69+
count = self.resources.len(),
70+
"skipping HWC resource teardown after hardware failure"
71+
);
72+
}
6673
for resource in self.resources.drain(..).rev() {
6774
let r = match resource {
6875
Resource::MemoryBlock(mem) => {
6976
drop(ManuallyDrop::into_inner(mem));
7077
Ok(())
7178
}
79+
// When HWC has already failed, skip sending teardown commands for HWC resources:
80+
// DmaRegion, Eq, BnicQueue. Every HWC request would fail with "Previous hardware failure".
81+
// The device is expected to reclaim these resources when it resets.
82+
_ if skip_hwc => continue,
7283
Resource::DmaRegion {
7384
dev_id,
7485
gdma_region,

vm/devices/net/mana_driver/src/tests.rs

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -231,12 +231,46 @@ async fn test_gdma_reconfig_vf(driver: DefaultDriver) {
231231
!gdma.get_vf_reconfiguration_pending(),
232232
"vf_reconfiguration_pending should be false"
233233
);
234+
assert!(
235+
!gdma.hwc_failure(),
236+
"hwc_failure should be false before EQE 135"
237+
);
238+
239+
// Get the device ID while HWC is still alive (needed for deregister later).
240+
let dev_id = gdma
241+
.list_devices()
242+
.await
243+
.unwrap()
244+
.iter()
245+
.copied()
246+
.find(|dev_id| dev_id.ty == GdmaDevType::GDMA_DEVICE_MANA)
247+
.unwrap();
234248

235249
// Trigger the reconfig event (EQE 135).
250+
// The in-flight HWC request should still complete because the CQE is
251+
// delivered in the same batch as the reconfig EQE. Only future requests
252+
// should fail.
236253
gdma.generate_reconfig_vf_event().await.unwrap();
237-
gdma.process_all_eqs();
254+
238255
assert!(
239256
gdma.get_vf_reconfiguration_pending(),
240257
"vf_reconfiguration_pending should be true after reconfig event"
241258
);
259+
assert!(
260+
gdma.hwc_failure(),
261+
"hwc_failure should be true after EQE 135"
262+
);
263+
264+
// Deregister should fail immediately because hwc_failure is set.
265+
let deregister_result = gdma.deregister_device(dev_id).await;
266+
let err = deregister_result.expect_err("deregister_device should fail after EQE 135");
267+
let err_msg = format!("{err:#}");
268+
assert!(
269+
err_msg.contains("Previous hardware failure"),
270+
"unexpected error: {err_msg}"
271+
);
272+
assert!(
273+
gdma.hwc_failure(),
274+
"hwc_failure should remain true after deregister_device"
275+
);
242276
}

0 commit comments

Comments
 (0)