Skip to content

Commit 022add1

Browse files
committed
Merge branch 'eth-bnxt-handle-invalid-tx-completions-more-gracefully'
Jakub Kicinski says: ==================== eth: bnxt: handle invalid Tx completions more gracefully bnxt trusts the events generated by the device which may lead to kernel crashes. These are extremely rare but they do happen. For a while I thought crashing may be intentional, because device reporting invalid completions should never happen, and having a core dump could be useful if it does. But in practice I haven't found any clues in the core dumps, and panic_on_warn exists. Series was tested by forcing the recovery path manually. Because of how rare the real crashes are I can't confirm it works for the actual device errors until it's been widely deployed. v1: https://lore.kernel.org/all/20230710205611.1198878-1-kuba@kernel.org/ ==================== Link: https://lore.kernel.org/r/20230720010440.1967136-1-kuba@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2 parents 59be3ba + 2b56b3d commit 022add1

3 files changed

Lines changed: 91 additions & 70 deletions

File tree

drivers/net/ethernet/broadcom/bnxt/bnxt.c

Lines changed: 84 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,60 @@ static void bnxt_db_cq(struct bnxt *bp, struct bnxt_db_info *db, u32 idx)
293293
BNXT_DB_CQ(db, idx);
294294
}
295295

296+
static void bnxt_queue_fw_reset_work(struct bnxt *bp, unsigned long delay)
297+
{
298+
if (!(test_bit(BNXT_STATE_IN_FW_RESET, &bp->state)))
299+
return;
300+
301+
if (BNXT_PF(bp))
302+
queue_delayed_work(bnxt_pf_wq, &bp->fw_reset_task, delay);
303+
else
304+
schedule_delayed_work(&bp->fw_reset_task, delay);
305+
}
306+
307+
static void __bnxt_queue_sp_work(struct bnxt *bp)
308+
{
309+
if (BNXT_PF(bp))
310+
queue_work(bnxt_pf_wq, &bp->sp_task);
311+
else
312+
schedule_work(&bp->sp_task);
313+
}
314+
315+
static void bnxt_queue_sp_work(struct bnxt *bp, unsigned int event)
316+
{
317+
set_bit(event, &bp->sp_event);
318+
__bnxt_queue_sp_work(bp);
319+
}
320+
321+
static void bnxt_sched_reset_rxr(struct bnxt *bp, struct bnxt_rx_ring_info *rxr)
322+
{
323+
if (!rxr->bnapi->in_reset) {
324+
rxr->bnapi->in_reset = true;
325+
if (bp->flags & BNXT_FLAG_CHIP_P5)
326+
set_bit(BNXT_RESET_TASK_SP_EVENT, &bp->sp_event);
327+
else
328+
set_bit(BNXT_RST_RING_SP_EVENT, &bp->sp_event);
329+
__bnxt_queue_sp_work(bp);
330+
}
331+
rxr->rx_next_cons = 0xffff;
332+
}
333+
334+
void bnxt_sched_reset_txr(struct bnxt *bp, struct bnxt_tx_ring_info *txr,
335+
int idx)
336+
{
337+
struct bnxt_napi *bnapi = txr->bnapi;
338+
339+
if (bnapi->tx_fault)
340+
return;
341+
342+
netdev_err(bp->dev, "Invalid Tx completion (ring:%d tx_pkts:%d cons:%u prod:%u i:%d)",
343+
txr->txq_index, bnapi->tx_pkts,
344+
txr->tx_cons, txr->tx_prod, idx);
345+
WARN_ON_ONCE(1);
346+
bnapi->tx_fault = 1;
347+
bnxt_queue_sp_work(bp, BNXT_RESET_TASK_SP_EVENT);
348+
}
349+
296350
const u16 bnxt_lhint_arr[] = {
297351
TX_BD_FLAGS_LHINT_512_AND_SMALLER,
298352
TX_BD_FLAGS_LHINT_512_TO_1023,
@@ -652,6 +706,11 @@ static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi *bnapi, int nr_pkts)
652706
skb = tx_buf->skb;
653707
tx_buf->skb = NULL;
654708

709+
if (unlikely(!skb)) {
710+
bnxt_sched_reset_txr(bp, txr, i);
711+
return;
712+
}
713+
655714
tx_bytes += skb->len;
656715

657716
if (tx_buf->is_push) {
@@ -1234,38 +1293,6 @@ static int bnxt_discard_rx(struct bnxt *bp, struct bnxt_cp_ring_info *cpr,
12341293
return 0;
12351294
}
12361295

1237-
static void bnxt_queue_fw_reset_work(struct bnxt *bp, unsigned long delay)
1238-
{
1239-
if (!(test_bit(BNXT_STATE_IN_FW_RESET, &bp->state)))
1240-
return;
1241-
1242-
if (BNXT_PF(bp))
1243-
queue_delayed_work(bnxt_pf_wq, &bp->fw_reset_task, delay);
1244-
else
1245-
schedule_delayed_work(&bp->fw_reset_task, delay);
1246-
}
1247-
1248-
static void bnxt_queue_sp_work(struct bnxt *bp)
1249-
{
1250-
if (BNXT_PF(bp))
1251-
queue_work(bnxt_pf_wq, &bp->sp_task);
1252-
else
1253-
schedule_work(&bp->sp_task);
1254-
}
1255-
1256-
static void bnxt_sched_reset(struct bnxt *bp, struct bnxt_rx_ring_info *rxr)
1257-
{
1258-
if (!rxr->bnapi->in_reset) {
1259-
rxr->bnapi->in_reset = true;
1260-
if (bp->flags & BNXT_FLAG_CHIP_P5)
1261-
set_bit(BNXT_RESET_TASK_SP_EVENT, &bp->sp_event);
1262-
else
1263-
set_bit(BNXT_RST_RING_SP_EVENT, &bp->sp_event);
1264-
bnxt_queue_sp_work(bp);
1265-
}
1266-
rxr->rx_next_cons = 0xffff;
1267-
}
1268-
12691296
static u16 bnxt_alloc_agg_idx(struct bnxt_rx_ring_info *rxr, u16 agg_id)
12701297
{
12711298
struct bnxt_tpa_idx_map *map = rxr->rx_tpa_idx_map;
@@ -1320,7 +1347,7 @@ static void bnxt_tpa_start(struct bnxt *bp, struct bnxt_rx_ring_info *rxr,
13201347
netdev_warn(bp->dev, "TPA cons %x, expected cons %x, error code %x\n",
13211348
cons, rxr->rx_next_cons,
13221349
TPA_START_ERROR_CODE(tpa_start1));
1323-
bnxt_sched_reset(bp, rxr);
1350+
bnxt_sched_reset_rxr(bp, rxr);
13241351
return;
13251352
}
13261353
/* Store cfa_code in tpa_info to use in tpa_end
@@ -1844,7 +1871,7 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_cp_ring_info *cpr,
18441871
if (rxr->rx_next_cons != 0xffff)
18451872
netdev_warn(bp->dev, "RX cons %x != expected cons %x\n",
18461873
cons, rxr->rx_next_cons);
1847-
bnxt_sched_reset(bp, rxr);
1874+
bnxt_sched_reset_rxr(bp, rxr);
18481875
if (rc1)
18491876
return rc1;
18501877
goto next_rx_no_prod_no_len;
@@ -1882,7 +1909,7 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_cp_ring_info *cpr,
18821909
!(bp->fw_cap & BNXT_FW_CAP_RING_MONITOR)) {
18831910
netdev_warn_once(bp->dev, "RX buffer error %x\n",
18841911
rx_err);
1885-
bnxt_sched_reset(bp, rxr);
1912+
bnxt_sched_reset_rxr(bp, rxr);
18861913
}
18871914
}
18881915
goto next_rx_no_len;
@@ -2329,7 +2356,7 @@ static int bnxt_async_event_process(struct bnxt *bp,
23292356
goto async_event_process_exit;
23302357
}
23312358
rxr = bp->bnapi[grp_idx]->rx_ring;
2332-
bnxt_sched_reset(bp, rxr);
2359+
bnxt_sched_reset_rxr(bp, rxr);
23332360
goto async_event_process_exit;
23342361
}
23352362
case ASYNC_EVENT_CMPL_EVENT_ID_ECHO_REQUEST: {
@@ -2384,7 +2411,7 @@ static int bnxt_async_event_process(struct bnxt *bp,
23842411
default:
23852412
goto async_event_process_exit;
23862413
}
2387-
bnxt_queue_sp_work(bp);
2414+
__bnxt_queue_sp_work(bp);
23882415
async_event_process_exit:
23892416
return 0;
23902417
}
@@ -2413,8 +2440,7 @@ static int bnxt_hwrm_handler(struct bnxt *bp, struct tx_cmp *txcmp)
24132440
}
24142441

24152442
set_bit(vf_id - bp->pf.first_vf_id, bp->pf.vf_event_bmap);
2416-
set_bit(BNXT_HWRM_EXEC_FWD_REQ_SP_EVENT, &bp->sp_event);
2417-
bnxt_queue_sp_work(bp);
2443+
bnxt_queue_sp_work(bp, BNXT_HWRM_EXEC_FWD_REQ_SP_EVENT);
24182444
break;
24192445

24202446
case CMPL_BASE_TYPE_HWRM_ASYNC_EVENT:
@@ -2571,7 +2597,7 @@ static int __bnxt_poll_work(struct bnxt *bp, struct bnxt_cp_ring_info *cpr,
25712597

25722598
static void __bnxt_poll_work_done(struct bnxt *bp, struct bnxt_napi *bnapi)
25732599
{
2574-
if (bnapi->tx_pkts) {
2600+
if (bnapi->tx_pkts && !bnapi->tx_fault) {
25752601
bnapi->tx_int(bp, bnapi, bnapi->tx_pkts);
25762602
bnapi->tx_pkts = 0;
25772603
}
@@ -9424,6 +9450,8 @@ static void bnxt_enable_napi(struct bnxt *bp)
94249450
struct bnxt_napi *bnapi = bp->bnapi[i];
94259451
struct bnxt_cp_ring_info *cpr;
94269452

9453+
bnapi->tx_fault = 0;
9454+
94279455
cpr = &bnapi->cp_ring;
94289456
if (bnapi->in_reset)
94299457
cpr->sw_stats.rx.rx_resets++;
@@ -11031,8 +11059,7 @@ static void bnxt_set_rx_mode(struct net_device *dev)
1103111059
if (mask != vnic->rx_mask || uc_update || mc_update) {
1103211060
vnic->rx_mask = mask;
1103311061

11034-
set_bit(BNXT_RX_MASK_SP_EVENT, &bp->sp_event);
11035-
bnxt_queue_sp_work(bp);
11062+
bnxt_queue_sp_work(bp, BNXT_RX_MASK_SP_EVENT);
1103611063
}
1103711064
}
1103811065

@@ -11597,8 +11624,7 @@ static void bnxt_tx_timeout(struct net_device *dev, unsigned int txqueue)
1159711624
struct bnxt *bp = netdev_priv(dev);
1159811625

1159911626
netdev_err(bp->dev, "TX timeout detected, starting reset task!\n");
11600-
set_bit(BNXT_RESET_TASK_SP_EVENT, &bp->sp_event);
11601-
bnxt_queue_sp_work(bp);
11627+
bnxt_queue_sp_work(bp, BNXT_RESET_TASK_SP_EVENT);
1160211628
}
1160311629

1160411630
static void bnxt_fw_health_check(struct bnxt *bp)
@@ -11635,8 +11661,7 @@ static void bnxt_fw_health_check(struct bnxt *bp)
1163511661
return;
1163611662

1163711663
fw_reset:
11638-
set_bit(BNXT_FW_EXCEPTION_SP_EVENT, &bp->sp_event);
11639-
bnxt_queue_sp_work(bp);
11664+
bnxt_queue_sp_work(bp, BNXT_FW_EXCEPTION_SP_EVENT);
1164011665
}
1164111666

1164211667
static void bnxt_timer(struct timer_list *t)
@@ -11653,43 +11678,33 @@ static void bnxt_timer(struct timer_list *t)
1165311678
if (bp->fw_cap & BNXT_FW_CAP_ERROR_RECOVERY)
1165411679
bnxt_fw_health_check(bp);
1165511680

11656-
if (BNXT_LINK_IS_UP(bp) && bp->stats_coal_ticks) {
11657-
set_bit(BNXT_PERIODIC_STATS_SP_EVENT, &bp->sp_event);
11658-
bnxt_queue_sp_work(bp);
11659-
}
11681+
if (BNXT_LINK_IS_UP(bp) && bp->stats_coal_ticks)
11682+
bnxt_queue_sp_work(bp, BNXT_PERIODIC_STATS_SP_EVENT);
1166011683

11661-
if (bnxt_tc_flower_enabled(bp)) {
11662-
set_bit(BNXT_FLOW_STATS_SP_EVENT, &bp->sp_event);
11663-
bnxt_queue_sp_work(bp);
11664-
}
11684+
if (bnxt_tc_flower_enabled(bp))
11685+
bnxt_queue_sp_work(bp, BNXT_FLOW_STATS_SP_EVENT);
1166511686

1166611687
#ifdef CONFIG_RFS_ACCEL
11667-
if ((bp->flags & BNXT_FLAG_RFS) && bp->ntp_fltr_count) {
11668-
set_bit(BNXT_RX_NTP_FLTR_SP_EVENT, &bp->sp_event);
11669-
bnxt_queue_sp_work(bp);
11670-
}
11688+
if ((bp->flags & BNXT_FLAG_RFS) && bp->ntp_fltr_count)
11689+
bnxt_queue_sp_work(bp, BNXT_RX_NTP_FLTR_SP_EVENT);
1167111690
#endif /*CONFIG_RFS_ACCEL*/
1167211691

1167311692
if (bp->link_info.phy_retry) {
1167411693
if (time_after(jiffies, bp->link_info.phy_retry_expires)) {
1167511694
bp->link_info.phy_retry = false;
1167611695
netdev_warn(bp->dev, "failed to update phy settings after maximum retries.\n");
1167711696
} else {
11678-
set_bit(BNXT_UPDATE_PHY_SP_EVENT, &bp->sp_event);
11679-
bnxt_queue_sp_work(bp);
11697+
bnxt_queue_sp_work(bp, BNXT_UPDATE_PHY_SP_EVENT);
1168011698
}
1168111699
}
1168211700

11683-
if (test_bit(BNXT_STATE_L2_FILTER_RETRY, &bp->state)) {
11684-
set_bit(BNXT_RX_MASK_SP_EVENT, &bp->sp_event);
11685-
bnxt_queue_sp_work(bp);
11686-
}
11701+
if (test_bit(BNXT_STATE_L2_FILTER_RETRY, &bp->state))
11702+
bnxt_queue_sp_work(bp, BNXT_RX_MASK_SP_EVENT);
1168711703

1168811704
if ((bp->flags & BNXT_FLAG_CHIP_P5) && !bp->chip_rev &&
11689-
netif_carrier_ok(dev)) {
11690-
set_bit(BNXT_RING_COAL_NOW_SP_EVENT, &bp->sp_event);
11691-
bnxt_queue_sp_work(bp);
11692-
}
11705+
netif_carrier_ok(dev))
11706+
bnxt_queue_sp_work(bp, BNXT_RING_COAL_NOW_SP_EVENT);
11707+
1169311708
bnxt_restart_timer:
1169411709
mod_timer(&bp->timer, jiffies + bp->current_interval);
1169511710
}
@@ -12968,8 +12983,7 @@ static int bnxt_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb,
1296812983
bp->ntp_fltr_count++;
1296912984
spin_unlock_bh(&bp->ntp_fltr_lock);
1297012985

12971-
set_bit(BNXT_RX_NTP_FLTR_SP_EVENT, &bp->sp_event);
12972-
bnxt_queue_sp_work(bp);
12986+
bnxt_queue_sp_work(bp, BNXT_RX_NTP_FLTR_SP_EVENT);
1297312987

1297412988
return new_fltr->sw_id;
1297512989

drivers/net/ethernet/broadcom/bnxt/bnxt.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1008,6 +1008,7 @@ struct bnxt_napi {
10081008
int);
10091009
int tx_pkts;
10101010
u8 events;
1011+
u8 tx_fault:1;
10111012

10121013
u32 flags;
10131014
#define BNXT_NAPI_FLAG_XDP 0x1
@@ -2329,6 +2330,8 @@ int bnxt_get_avail_msix(struct bnxt *bp, int num);
23292330
int bnxt_reserve_rings(struct bnxt *bp, bool irq_re_init);
23302331
void bnxt_tx_disable(struct bnxt *bp);
23312332
void bnxt_tx_enable(struct bnxt *bp);
2333+
void bnxt_sched_reset_txr(struct bnxt *bp, struct bnxt_tx_ring_info *txr,
2334+
int idx);
23322335
void bnxt_report_link(struct bnxt *bp);
23332336
int bnxt_update_link(struct bnxt *bp, bool chng_link_state);
23342337
int bnxt_hwrm_set_pause(struct bnxt *);

drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,7 @@ void bnxt_tx_int_xdp(struct bnxt *bp, struct bnxt_napi *bnapi, int nr_pkts)
149149
tx_buf->action = 0;
150150
tx_buf->xdpf = NULL;
151151
} else if (tx_buf->action == XDP_TX) {
152+
tx_buf->action = 0;
152153
rx_doorbell_needed = true;
153154
last_tx_cons = tx_cons;
154155

@@ -158,6 +159,9 @@ void bnxt_tx_int_xdp(struct bnxt *bp, struct bnxt_napi *bnapi, int nr_pkts)
158159
tx_buf = &txr->tx_buf_ring[tx_cons];
159160
page_pool_recycle_direct(rxr->page_pool, tx_buf->page);
160161
}
162+
} else {
163+
bnxt_sched_reset_txr(bp, txr, i);
164+
return;
161165
}
162166
tx_cons = NEXT_TX(tx_cons);
163167
}

0 commit comments

Comments (0)