Skip to content

Commit 27c79b3

Browse files
shayshyi
authored and gregkh committed
net/mlx5: Fix health error state handling
[ Upstream commit 51d138c ] Currently, when we discover a fatal error, we queue a work item that will wait for a lock in order to put the device into the error state. Meanwhile, FW commands are still being processed and get timeouts. This can block the driver for a few minutes before the work item manages to acquire the lock and enter the error state. Set the device to the error state before queueing the health work, in order to avoid FW commands being processed while the work item is waiting for the lock. Fixes: c1d4d2e ("net/mlx5: Avoid calling sleeping function by the health poll thread") Signed-off-by: Shay Drory <shayd@nvidia.com> Reviewed-by: Moshe Shemesh <moshe@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com> Signed-off-by: Sasha Levin <sashal@kernel.org>
1 parent ae624d4 commit 27c79b3

1 file changed

Lines changed: 14 additions & 8 deletions

File tree

  • drivers/net/ethernet/mellanox/mlx5/core

drivers/net/ethernet/mellanox/mlx5/core/health.c

Lines changed: 14 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -190,6 +190,16 @@ static bool reset_fw_if_needed(struct mlx5_core_dev *dev)
190190
return true;
191191
}
192192

193+
static void enter_error_state(struct mlx5_core_dev *dev, bool force)
194+
{
195+
if (mlx5_health_check_fatal_sensors(dev) || force) { /* protected state setting */
196+
dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
197+
mlx5_cmd_flush(dev);
198+
}
199+
200+
mlx5_notifier_call_chain(dev->priv.events, MLX5_DEV_EVENT_SYS_ERROR, (void *)1);
201+
}
202+
193203
void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
194204
{
195205
bool err_detected = false;
@@ -208,12 +218,7 @@ void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
208218
goto unlock;
209219
}
210220

211-
if (mlx5_health_check_fatal_sensors(dev) || force) { /* protected state setting */
212-
dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
213-
mlx5_cmd_flush(dev);
214-
}
215-
216-
mlx5_notifier_call_chain(dev->priv.events, MLX5_DEV_EVENT_SYS_ERROR, (void *)1);
221+
enter_error_state(dev, force);
217222
unlock:
218223
mutex_unlock(&dev->intf_state_mutex);
219224
}
@@ -613,7 +618,7 @@ static void mlx5_fw_fatal_reporter_err_work(struct work_struct *work)
613618
priv = container_of(health, struct mlx5_priv, health);
614619
dev = container_of(priv, struct mlx5_core_dev, priv);
615620

616-
mlx5_enter_error_state(dev, false);
621+
enter_error_state(dev, false);
617622
if (IS_ERR_OR_NULL(health->fw_fatal_reporter)) {
618623
if (mlx5_health_try_recover(dev))
619624
mlx5_core_err(dev, "health recovery failed\n");
@@ -707,8 +712,9 @@ static void poll_health(struct timer_list *t)
707712
mlx5_core_err(dev, "Fatal error %u detected\n", fatal_error);
708713
dev->priv.health.fatal_error = fatal_error;
709714
print_health_info(dev);
715+
dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
710716
mlx5_trigger_health_work(dev);
711-
goto out;
717+
return;
712718
}
713719

714720
count = ioread32be(health->health_counter);

0 commit comments

Comments
 (0)