Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 14 additions & 4 deletions vm/devices/net/mana_driver/src/gdma_driver.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,6 @@ use gdma_defs::SmcProtoHdr;
use inspect::Inspect;
use pal_async::driver::Driver;
use std::collections::HashMap;
use std::mem;
use std::mem::ManuallyDrop;
use std::sync::Arc;
use std::time::Duration;
Expand Down Expand Up @@ -211,7 +210,11 @@ impl<T: DeviceBacking> GdmaDriver<T> {

impl<T: DeviceBacking> Drop for GdmaDriver<T> {
fn drop(&mut self) {
tracing::info!(?self.state_saved, ?self.hwc_failure, "dropping gdma driver");
tracing::info!(?self.state_saved, ?self.hwc_failure, ?self.vf_reconfiguration_pending, "dropping gdma driver");

if self.vf_reconfiguration_pending {
return;
}

// Don't destroy anything if we're saving its state for restoration.
if self.state_saved {
Expand Down Expand Up @@ -689,6 +692,10 @@ impl<T: DeviceBacking> GdmaDriver<T> {
interrupt_loss: bool,
ms_elapsed: u32,
) {
// Don't report timeout once VF reconfiguration is pending, SoC will not respond.
if self.vf_reconfiguration_pending {
return;
}
Comment thread
erfrimod marked this conversation as resolved.
// Perform initial check for ownership, failing without wait if device
// is not present or owns shmem region
let data = self
Expand Down Expand Up @@ -783,8 +790,8 @@ impl<T: DeviceBacking> GdmaDriver<T> {
self.link_toggle.drain(..).collect()
}

pub fn get_vf_reconfiguration_pending(&mut self) -> bool {
mem::take(&mut self.vf_reconfiguration_pending)
pub fn get_vf_reconfiguration_pending(&self) -> bool {
self.vf_reconfiguration_pending
Comment thread
erfrimod marked this conversation as resolved.
}

Comment thread
erfrimod marked this conversation as resolved.
Comment thread
erfrimod marked this conversation as resolved.
pub fn device(&self) -> &T {
Expand Down Expand Up @@ -839,6 +846,9 @@ impl<T: DeviceBacking> GdmaDriver<T> {
dev_id: GdmaDevId,
req: Req,
) -> anyhow::Result<(Resp, u32)> {
if self.vf_reconfiguration_pending {
anyhow::bail!("VF reconfiguration pending");
}
if self.hwc_failure {
anyhow::bail!("Previous hardware failure");
}
Expand Down
14 changes: 14 additions & 0 deletions vm/devices/net/mana_driver/src/resources.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,26 @@ impl ResourceArena {
}

pub(crate) async fn destroy<T: DeviceBacking>(mut self, gdma: &mut GdmaDriver<T>) {
let skip_hwc = gdma.get_vf_reconfiguration_pending();
if skip_hwc {
tracing::info!(
count = self.resources.len(),
Comment thread
erfrimod marked this conversation as resolved.
"skipping HWC resource teardown during VF reconfiguration"
);
}
Comment thread
erfrimod marked this conversation as resolved.
for resource in self.resources.drain(..).rev() {
let r = match resource {
Resource::MemoryBlock(mem) => {
drop(ManuallyDrop::into_inner(mem));
Ok(())
}
// During VF reconfiguration, skip sending teardown commands for HWC resources.
// HWC requests will fail and the device reclaims resources on its own reset.
Resource::DmaRegion { .. } | Resource::Eq { .. } | Resource::BnicQueue { .. }
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My (slight) preference would be to not check for all of these resource types again, and instead just return early when gdma.get_vf_reconfiguration_pending() is true, to reduce redundancy.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Leaving this in place, because we still need to free MemoryBlock resources, so we're stuck iterating over all resources anyway.

if skip_hwc =>
{
continue;
}
Resource::DmaRegion {
dev_id,
gdma_region,
Expand Down
25 changes: 24 additions & 1 deletion vm/devices/net/mana_driver/src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -232,11 +232,34 @@ async fn test_gdma_reconfig_vf(driver: DefaultDriver) {
"vf_reconfiguration_pending should be false"
);

// Get the device ID while HWC is still alive (needed for deregister later).
let dev_id = gdma
.list_devices()
.await
.unwrap()
.iter()
.copied()
.find(|dev_id| dev_id.ty == GdmaDevType::GDMA_DEVICE_MANA)
.unwrap();

// Trigger the reconfig event (EQE 135).
gdma.generate_reconfig_vf_event().await.unwrap();
Comment thread
erfrimod marked this conversation as resolved.
gdma.process_all_eqs();

assert!(
gdma.get_vf_reconfiguration_pending(),
"vf_reconfiguration_pending should be true after reconfig event"
);
Comment thread
erfrimod marked this conversation as resolved.

// Deregister should fail immediately because vf_reconfiguration_pending is set.
let deregister_result = gdma.deregister_device(dev_id).await;
let err = deregister_result.expect_err("deregister_device should fail after EQE 135");
Comment thread
erfrimod marked this conversation as resolved.
let err_msg = format!("{err:#}");
assert!(
err_msg.contains("VF reconfiguration pending"),
"unexpected error: {err_msg}"
);
assert!(
Comment thread
erfrimod marked this conversation as resolved.
gdma.get_vf_reconfiguration_pending(),
"vf_reconfiguration_pending should remain true after deregister_device"
);
}
Loading