Skip to content

Commit 6f6f0ac

Browse files
committed
Merge tag 'mlx5-fixes-2021-12-22' of git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux
Saeed Mahameed says:

====================
mlx5 fixes 2021-12-22

This series provides bug fixes to mlx5 driver.
====================

* tag 'mlx5-fixes-2021-12-22' of git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux:
  net/mlx5: Fix some error handling paths in 'mlx5e_tc_add_fdb_flow()'
  net/mlx5e: Delete forward rule for ct or sample action
  net/mlx5e: Fix ICOSQ recovery flow for XSK
  net/mlx5e: Fix interoperability between XSK and ICOSQ recovery flow
  net/mlx5e: Fix skb memory leak when TC classifier action offloads are disabled
  net/mlx5e: Wrap the tx reporter dump callback to extract the sq
  net/mlx5: Fix tc max supported prio for nic mode
  net/mlx5: Fix SF health recovery flow
  net/mlx5: Fix error print in case of IRQ request failed
  net/mlx5: Use first online CPU instead of hard coded CPU
  net/mlx5: DR, Fix querying eswitch manager vport for ECPF
  net/mlx5: DR, Fix NULL vs IS_ERR checking in dr_domain_init_resources

Link: https://lore.kernel.org/r/20211223190441.153012-1-saeed@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2 parents 95b4011 + 4390c6e commit 6f6f0ac

12 files changed

Lines changed: 121 additions & 46 deletions

File tree

drivers/net/ethernet/mellanox/mlx5/core/en.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -783,6 +783,8 @@ struct mlx5e_channel {
783783
DECLARE_BITMAP(state, MLX5E_CHANNEL_NUM_STATES);
784784
int ix;
785785
int cpu;
786+
/* Sync between icosq recovery and XSK enable/disable. */
787+
struct mutex icosq_recovery_lock;
786788
};
787789

788790
struct mlx5e_ptp;
@@ -1014,9 +1016,6 @@ int mlx5e_create_rq(struct mlx5e_rq *rq, struct mlx5e_rq_param *param);
10141016
void mlx5e_destroy_rq(struct mlx5e_rq *rq);
10151017

10161018
struct mlx5e_sq_param;
1017-
int mlx5e_open_icosq(struct mlx5e_channel *c, struct mlx5e_params *params,
1018-
struct mlx5e_sq_param *param, struct mlx5e_icosq *sq);
1019-
void mlx5e_close_icosq(struct mlx5e_icosq *sq);
10201019
int mlx5e_open_xdpsq(struct mlx5e_channel *c, struct mlx5e_params *params,
10211020
struct mlx5e_sq_param *param, struct xsk_buff_pool *xsk_pool,
10221021
struct mlx5e_xdpsq *sq, bool is_redirect);

drivers/net/ethernet/mellanox/mlx5/core/en/health.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ void mlx5e_reporter_rx_destroy(struct mlx5e_priv *priv);
3030
void mlx5e_reporter_icosq_cqe_err(struct mlx5e_icosq *icosq);
3131
void mlx5e_reporter_rq_cqe_err(struct mlx5e_rq *rq);
3232
void mlx5e_reporter_rx_timeout(struct mlx5e_rq *rq);
33+
void mlx5e_reporter_icosq_suspend_recovery(struct mlx5e_channel *c);
34+
void mlx5e_reporter_icosq_resume_recovery(struct mlx5e_channel *c);
3335

3436
#define MLX5E_REPORTER_PER_Q_MAX_LEN 256
3537

drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ mlx5e_rep_setup_tc(struct net_device *dev, enum tc_setup_type type,
6666

6767
static inline void
6868
mlx5e_rep_tc_receive(struct mlx5_cqe64 *cqe, struct mlx5e_rq *rq,
69-
struct sk_buff *skb) {}
69+
struct sk_buff *skb) { napi_gro_receive(rq->cq.napi, skb); }
7070

7171
#endif /* CONFIG_MLX5_CLS_ACT */
7272

drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ static void mlx5e_reset_icosq_cc_pc(struct mlx5e_icosq *icosq)
6262

6363
static int mlx5e_rx_reporter_err_icosq_cqe_recover(void *ctx)
6464
{
65+
struct mlx5e_rq *xskrq = NULL;
6566
struct mlx5_core_dev *mdev;
6667
struct mlx5e_icosq *icosq;
6768
struct net_device *dev;
@@ -70,7 +71,13 @@ static int mlx5e_rx_reporter_err_icosq_cqe_recover(void *ctx)
7071
int err;
7172

7273
icosq = ctx;
74+
75+
mutex_lock(&icosq->channel->icosq_recovery_lock);
76+
77+
/* mlx5e_close_rq cancels this work before RQ and ICOSQ are killed. */
7378
rq = &icosq->channel->rq;
79+
if (test_bit(MLX5E_RQ_STATE_ENABLED, &icosq->channel->xskrq.state))
80+
xskrq = &icosq->channel->xskrq;
7481
mdev = icosq->channel->mdev;
7582
dev = icosq->channel->netdev;
7683
err = mlx5_core_query_sq_state(mdev, icosq->sqn, &state);
@@ -84,6 +91,9 @@ static int mlx5e_rx_reporter_err_icosq_cqe_recover(void *ctx)
8491
goto out;
8592

8693
mlx5e_deactivate_rq(rq);
94+
if (xskrq)
95+
mlx5e_deactivate_rq(xskrq);
96+
8797
err = mlx5e_wait_for_icosq_flush(icosq);
8898
if (err)
8999
goto out;
@@ -97,15 +107,28 @@ static int mlx5e_rx_reporter_err_icosq_cqe_recover(void *ctx)
97107
goto out;
98108

99109
mlx5e_reset_icosq_cc_pc(icosq);
110+
100111
mlx5e_free_rx_in_progress_descs(rq);
112+
if (xskrq)
113+
mlx5e_free_rx_in_progress_descs(xskrq);
114+
101115
clear_bit(MLX5E_SQ_STATE_RECOVERING, &icosq->state);
102116
mlx5e_activate_icosq(icosq);
103-
mlx5e_activate_rq(rq);
104117

118+
mlx5e_activate_rq(rq);
105119
rq->stats->recover++;
120+
121+
if (xskrq) {
122+
mlx5e_activate_rq(xskrq);
123+
xskrq->stats->recover++;
124+
}
125+
126+
mutex_unlock(&icosq->channel->icosq_recovery_lock);
127+
106128
return 0;
107129
out:
108130
clear_bit(MLX5E_SQ_STATE_RECOVERING, &icosq->state);
131+
mutex_unlock(&icosq->channel->icosq_recovery_lock);
109132
return err;
110133
}
111134

@@ -706,6 +729,16 @@ void mlx5e_reporter_icosq_cqe_err(struct mlx5e_icosq *icosq)
706729
mlx5e_health_report(priv, priv->rx_reporter, err_str, &err_ctx);
707730
}
708731

732+
void mlx5e_reporter_icosq_suspend_recovery(struct mlx5e_channel *c)
733+
{
734+
mutex_lock(&c->icosq_recovery_lock);
735+
}
736+
737+
void mlx5e_reporter_icosq_resume_recovery(struct mlx5e_channel *c)
738+
{
739+
mutex_unlock(&c->icosq_recovery_lock);
740+
}
741+
709742
static const struct devlink_health_reporter_ops mlx5_rx_reporter_ops = {
710743
.name = "rx",
711744
.recover = mlx5e_rx_reporter_recover,

drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -466,6 +466,14 @@ static int mlx5e_tx_reporter_dump_sq(struct mlx5e_priv *priv, struct devlink_fms
466466
return mlx5e_health_fmsg_named_obj_nest_end(fmsg);
467467
}
468468

469+
static int mlx5e_tx_reporter_timeout_dump(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg,
470+
void *ctx)
471+
{
472+
struct mlx5e_tx_timeout_ctx *to_ctx = ctx;
473+
474+
return mlx5e_tx_reporter_dump_sq(priv, fmsg, to_ctx->sq);
475+
}
476+
469477
static int mlx5e_tx_reporter_dump_all_sqs(struct mlx5e_priv *priv,
470478
struct devlink_fmsg *fmsg)
471479
{
@@ -561,7 +569,7 @@ int mlx5e_reporter_tx_timeout(struct mlx5e_txqsq *sq)
561569
to_ctx.sq = sq;
562570
err_ctx.ctx = &to_ctx;
563571
err_ctx.recover = mlx5e_tx_reporter_timeout_recover;
564-
err_ctx.dump = mlx5e_tx_reporter_dump_sq;
572+
err_ctx.dump = mlx5e_tx_reporter_timeout_dump;
565573
snprintf(err_str, sizeof(err_str),
566574
"TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u",
567575
sq->ch_ix, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc,

drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include "setup.h"
55
#include "en/params.h"
66
#include "en/txrx.h"
7+
#include "en/health.h"
78

89
/* It matches XDP_UMEM_MIN_CHUNK_SIZE, but as this constant is private and may
910
* change unexpectedly, and mlx5e has a minimum valid stride size for striding
@@ -170,7 +171,13 @@ void mlx5e_close_xsk(struct mlx5e_channel *c)
170171

171172
void mlx5e_activate_xsk(struct mlx5e_channel *c)
172173
{
174+
/* ICOSQ recovery deactivates RQs. Suspend the recovery to avoid
175+
* activating XSKRQ in the middle of recovery.
176+
*/
177+
mlx5e_reporter_icosq_suspend_recovery(c);
173178
set_bit(MLX5E_RQ_STATE_ENABLED, &c->xskrq.state);
179+
mlx5e_reporter_icosq_resume_recovery(c);
180+
174181
/* TX queue is created active. */
175182

176183
spin_lock_bh(&c->async_icosq_lock);
@@ -180,6 +187,13 @@ void mlx5e_activate_xsk(struct mlx5e_channel *c)
180187

181188
void mlx5e_deactivate_xsk(struct mlx5e_channel *c)
182189
{
183-
mlx5e_deactivate_rq(&c->xskrq);
190+
/* ICOSQ recovery may reactivate XSKRQ if clear_bit is called in the
191+
* middle of recovery. Suspend the recovery to avoid it.
192+
*/
193+
mlx5e_reporter_icosq_suspend_recovery(c);
194+
clear_bit(MLX5E_RQ_STATE_ENABLED, &c->xskrq.state);
195+
mlx5e_reporter_icosq_resume_recovery(c);
196+
synchronize_net(); /* Sync with NAPI to prevent mlx5e_post_rx_wqes. */
197+
184198
/* TX queue is disabled on close. */
185199
}

drivers/net/ethernet/mellanox/mlx5/core/en_main.c

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1087,8 +1087,6 @@ void mlx5e_deactivate_rq(struct mlx5e_rq *rq)
10871087
void mlx5e_close_rq(struct mlx5e_rq *rq)
10881088
{
10891089
cancel_work_sync(&rq->dim.work);
1090-
if (rq->icosq)
1091-
cancel_work_sync(&rq->icosq->recover_work);
10921090
cancel_work_sync(&rq->recover_work);
10931091
mlx5e_destroy_rq(rq);
10941092
mlx5e_free_rx_descs(rq);
@@ -1216,9 +1214,20 @@ static void mlx5e_icosq_err_cqe_work(struct work_struct *recover_work)
12161214
mlx5e_reporter_icosq_cqe_err(sq);
12171215
}
12181216

1217+
static void mlx5e_async_icosq_err_cqe_work(struct work_struct *recover_work)
1218+
{
1219+
struct mlx5e_icosq *sq = container_of(recover_work, struct mlx5e_icosq,
1220+
recover_work);
1221+
1222+
/* Not implemented yet. */
1223+
1224+
netdev_warn(sq->channel->netdev, "async_icosq recovery is not implemented\n");
1225+
}
1226+
12191227
static int mlx5e_alloc_icosq(struct mlx5e_channel *c,
12201228
struct mlx5e_sq_param *param,
1221-
struct mlx5e_icosq *sq)
1229+
struct mlx5e_icosq *sq,
1230+
work_func_t recover_work_func)
12221231
{
12231232
void *sqc_wq = MLX5_ADDR_OF(sqc, param->sqc, wq);
12241233
struct mlx5_core_dev *mdev = c->mdev;
@@ -1239,7 +1248,7 @@ static int mlx5e_alloc_icosq(struct mlx5e_channel *c,
12391248
if (err)
12401249
goto err_sq_wq_destroy;
12411250

1242-
INIT_WORK(&sq->recover_work, mlx5e_icosq_err_cqe_work);
1251+
INIT_WORK(&sq->recover_work, recover_work_func);
12431252

12441253
return 0;
12451254

@@ -1575,13 +1584,14 @@ void mlx5e_tx_err_cqe_work(struct work_struct *recover_work)
15751584
mlx5e_reporter_tx_err_cqe(sq);
15761585
}
15771586

1578-
int mlx5e_open_icosq(struct mlx5e_channel *c, struct mlx5e_params *params,
1579-
struct mlx5e_sq_param *param, struct mlx5e_icosq *sq)
1587+
static int mlx5e_open_icosq(struct mlx5e_channel *c, struct mlx5e_params *params,
1588+
struct mlx5e_sq_param *param, struct mlx5e_icosq *sq,
1589+
work_func_t recover_work_func)
15801590
{
15811591
struct mlx5e_create_sq_param csp = {};
15821592
int err;
15831593

1584-
err = mlx5e_alloc_icosq(c, param, sq);
1594+
err = mlx5e_alloc_icosq(c, param, sq, recover_work_func);
15851595
if (err)
15861596
return err;
15871597

@@ -1620,7 +1630,7 @@ void mlx5e_deactivate_icosq(struct mlx5e_icosq *icosq)
16201630
synchronize_net(); /* Sync with NAPI. */
16211631
}
16221632

1623-
void mlx5e_close_icosq(struct mlx5e_icosq *sq)
1633+
static void mlx5e_close_icosq(struct mlx5e_icosq *sq)
16241634
{
16251635
struct mlx5e_channel *c = sq->channel;
16261636

@@ -2084,11 +2094,15 @@ static int mlx5e_open_queues(struct mlx5e_channel *c,
20842094

20852095
spin_lock_init(&c->async_icosq_lock);
20862096

2087-
err = mlx5e_open_icosq(c, params, &cparam->async_icosq, &c->async_icosq);
2097+
err = mlx5e_open_icosq(c, params, &cparam->async_icosq, &c->async_icosq,
2098+
mlx5e_async_icosq_err_cqe_work);
20882099
if (err)
20892100
goto err_close_xdpsq_cq;
20902101

2091-
err = mlx5e_open_icosq(c, params, &cparam->icosq, &c->icosq);
2102+
mutex_init(&c->icosq_recovery_lock);
2103+
2104+
err = mlx5e_open_icosq(c, params, &cparam->icosq, &c->icosq,
2105+
mlx5e_icosq_err_cqe_work);
20922106
if (err)
20932107
goto err_close_async_icosq;
20942108

@@ -2156,9 +2170,12 @@ static void mlx5e_close_queues(struct mlx5e_channel *c)
21562170
mlx5e_close_xdpsq(&c->xdpsq);
21572171
if (c->xdp)
21582172
mlx5e_close_xdpsq(&c->rq_xdpsq);
2173+
/* The same ICOSQ is used for UMRs for both RQ and XSKRQ. */
2174+
cancel_work_sync(&c->icosq.recover_work);
21592175
mlx5e_close_rq(&c->rq);
21602176
mlx5e_close_sqs(c);
21612177
mlx5e_close_icosq(&c->icosq);
2178+
mutex_destroy(&c->icosq_recovery_lock);
21622179
mlx5e_close_icosq(&c->async_icosq);
21632180
if (c->xdp)
21642181
mlx5e_close_cq(&c->rq_xdpsq.cq);

drivers/net/ethernet/mellanox/mlx5/core/en_tc.c

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1196,21 +1196,16 @@ void mlx5e_tc_unoffload_fdb_rules(struct mlx5_eswitch *esw,
11961196
if (attr->flags & MLX5_ESW_ATTR_FLAG_SLOW_PATH)
11971197
goto offload_rule_0;
11981198

1199-
if (flow_flag_test(flow, CT)) {
1200-
mlx5_tc_ct_delete_flow(get_ct_priv(flow->priv), flow, attr);
1201-
return;
1202-
}
1203-
1204-
if (flow_flag_test(flow, SAMPLE)) {
1205-
mlx5e_tc_sample_unoffload(get_sample_priv(flow->priv), flow->rule[0], attr);
1206-
return;
1207-
}
1208-
12091199
if (attr->esw_attr->split_count)
12101200
mlx5_eswitch_del_fwd_rule(esw, flow->rule[1], attr);
12111201

1202+
if (flow_flag_test(flow, CT))
1203+
mlx5_tc_ct_delete_flow(get_ct_priv(flow->priv), flow, attr);
1204+
else if (flow_flag_test(flow, SAMPLE))
1205+
mlx5e_tc_sample_unoffload(get_sample_priv(flow->priv), flow->rule[0], attr);
1206+
else
12121207
offload_rule_0:
1213-
mlx5_eswitch_del_offloaded_rule(esw, flow->rule[0], attr);
1208+
mlx5_eswitch_del_offloaded_rule(esw, flow->rule[0], attr);
12141209
}
12151210

12161211
struct mlx5_flow_handle *
@@ -1445,7 +1440,7 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
14451440
MLX5_FLOW_NAMESPACE_FDB, VPORT_TO_REG,
14461441
metadata);
14471442
if (err)
1448-
return err;
1443+
goto err_out;
14491444
}
14501445
}
14511446

@@ -1461,22 +1456,26 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
14611456
if (attr->chain) {
14621457
NL_SET_ERR_MSG_MOD(extack,
14631458
"Internal port rule is only supported on chain 0");
1464-
return -EOPNOTSUPP;
1459+
err = -EOPNOTSUPP;
1460+
goto err_out;
14651461
}
14661462

14671463
if (attr->dest_chain) {
14681464
NL_SET_ERR_MSG_MOD(extack,
14691465
"Internal port rule offload doesn't support goto action");
1470-
return -EOPNOTSUPP;
1466+
err = -EOPNOTSUPP;
1467+
goto err_out;
14711468
}
14721469

14731470
int_port = mlx5e_tc_int_port_get(mlx5e_get_int_port_priv(priv),
14741471
parse_attr->filter_dev->ifindex,
14751472
flow_flag_test(flow, EGRESS) ?
14761473
MLX5E_TC_INT_PORT_EGRESS :
14771474
MLX5E_TC_INT_PORT_INGRESS);
1478-
if (IS_ERR(int_port))
1479-
return PTR_ERR(int_port);
1475+
if (IS_ERR(int_port)) {
1476+
err = PTR_ERR(int_port);
1477+
goto err_out;
1478+
}
14801479

14811480
esw_attr->int_port = int_port;
14821481
}

drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,9 @@ u32 mlx5_chains_get_nf_ft_chain(struct mlx5_fs_chains *chains)
121121

122122
u32 mlx5_chains_get_prio_range(struct mlx5_fs_chains *chains)
123123
{
124+
if (!mlx5_chains_prios_supported(chains))
125+
return 1;
126+
124127
if (mlx5_chains_ignore_flow_level_supported(chains))
125128
return UINT_MAX;
126129

drivers/net/ethernet/mellanox/mlx5/core/main.c

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1809,12 +1809,13 @@ void mlx5_disable_device(struct mlx5_core_dev *dev)
18091809

18101810
int mlx5_recover_device(struct mlx5_core_dev *dev)
18111811
{
1812-
int ret = -EIO;
1812+
if (!mlx5_core_is_sf(dev)) {
1813+
mlx5_pci_disable_device(dev);
1814+
if (mlx5_pci_slot_reset(dev->pdev) != PCI_ERS_RESULT_RECOVERED)
1815+
return -EIO;
1816+
}
18131817

1814-
mlx5_pci_disable_device(dev);
1815-
if (mlx5_pci_slot_reset(dev->pdev) == PCI_ERS_RESULT_RECOVERED)
1816-
ret = mlx5_load_one(dev);
1817-
return ret;
1818+
return mlx5_load_one(dev);
18181819
}
18191820

18201821
static struct pci_driver mlx5_core_driver = {

0 commit comments

Comments
 (0)