1 /* SPDX-License-Identifier: GPL-2.0 */
2 /* Copyright (c) 2019 Mellanox Technologies. */
3
4 #include "health.h"
5 #include "en/ptp.h"
6 #include "en/devlink.h"
7
mlx5e_wait_for_sq_flush(struct mlx5e_txqsq * sq)8 static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq)
9 {
10 unsigned long exp_time = jiffies +
11 msecs_to_jiffies(MLX5E_REPORTER_FLUSH_TIMEOUT_MSEC);
12
13 while (time_before(jiffies, exp_time)) {
14 if (sq->cc == sq->pc)
15 return 0;
16
17 msleep(20);
18 }
19
20 netdev_err(sq->netdev,
21 "Wait for SQ 0x%x flush timeout (sq cc = 0x%x, sq pc = 0x%x)\n",
22 sq->sqn, sq->cc, sq->pc);
23
24 return -ETIMEDOUT;
25 }
26
mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq * sq)27 static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq)
28 {
29 WARN_ONCE(sq->cc != sq->pc,
30 "SQ 0x%x: cc (0x%x) != pc (0x%x)\n",
31 sq->sqn, sq->cc, sq->pc);
32 sq->cc = 0;
33 sq->dma_fifo_cc = 0;
34 sq->pc = 0;
35 }
36
mlx5e_tx_reporter_err_cqe_recover(void * ctx)37 static int mlx5e_tx_reporter_err_cqe_recover(void *ctx)
38 {
39 struct mlx5_core_dev *mdev;
40 struct net_device *dev;
41 struct mlx5e_txqsq *sq;
42 u8 state;
43 int err;
44
45 sq = ctx;
46 mdev = sq->mdev;
47 dev = sq->netdev;
48
49 if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state))
50 return 0;
51
52 err = mlx5_core_query_sq_state(mdev, sq->sqn, &state);
53 if (err) {
54 netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n",
55 sq->sqn, err);
56 goto out;
57 }
58
59 if (state != MLX5_SQC_STATE_ERR)
60 goto out;
61
62 mlx5e_tx_disable_queue(sq->txq);
63
64 err = mlx5e_wait_for_sq_flush(sq);
65 if (err)
66 goto out;
67
68 /* At this point, no new packets will arrive from the stack as TXQ is
69 * marked with QUEUE_STATE_DRV_XOFF. In addition, NAPI cleared all
70 * pending WQEs. SQ can safely reset the SQ.
71 */
72
73 err = mlx5e_health_sq_to_ready(mdev, dev, sq->sqn);
74 if (err)
75 goto out;
76
77 mlx5e_reset_txqsq_cc_pc(sq);
78 sq->stats->recover++;
79 clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state);
80 mlx5e_activate_txqsq(sq);
81
82 return 0;
83 out:
84 clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state);
85 return err;
86 }
87
/*
 * Context handed to the TX timeout recover callback.
 *
 * @sq:     the SQ that timed out.
 * @status: outcome written by the recover callback: 0 when only this SQ was
 *          recovered, 1 when all channels were reopened, or the error code
 *          when recovery failed.
 */
struct mlx5e_tx_timeout_ctx {
	struct mlx5e_txqsq *sq;
	int status;
};
92
mlx5e_tx_reporter_timeout_recover(void * ctx)93 static int mlx5e_tx_reporter_timeout_recover(void *ctx)
94 {
95 struct mlx5e_tx_timeout_ctx *to_ctx;
96 struct mlx5e_priv *priv;
97 struct mlx5_eq_comp *eq;
98 struct mlx5e_txqsq *sq;
99 int err;
100
101 to_ctx = ctx;
102 sq = to_ctx->sq;
103 eq = sq->cq.mcq.eq;
104 priv = sq->priv;
105 err = mlx5e_health_channel_eq_recover(sq->netdev, eq, sq->cq.ch_stats);
106 if (!err) {
107 to_ctx->status = 0; /* this sq recovered */
108 return err;
109 }
110
111 err = mlx5e_safe_reopen_channels(priv);
112 if (!err) {
113 to_ctx->status = 1; /* all channels recovered */
114 return err;
115 }
116
117 to_ctx->status = err;
118 clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
119 netdev_err(priv->netdev,
120 "mlx5e_safe_reopen_channels failed recovering from a tx_timeout, err(%d).\n",
121 err);
122
123 return err;
124 }
125
126 /* state lock cannot be grabbed within this function.
127 * It can cause a dead lock or a read-after-free.
128 */
mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_err_ctx * err_ctx)129 static int mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_err_ctx *err_ctx)
130 {
131 return err_ctx->recover(err_ctx->ctx);
132 }
133
/*
 * devlink health .recover callback of the TX reporter.
 *
 * When an error context is supplied, its own recover callback is run;
 * without a context, all channels are recovered.
 */
static int mlx5e_tx_reporter_recover(struct devlink_health_reporter *reporter,
				     void *context,
				     struct netlink_ext_ack *extack)
{
	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
	struct mlx5e_err_ctx *err_ctx = context;

	if (!err_ctx)
		return mlx5e_health_recover_channels(priv);

	return mlx5e_tx_reporter_recover_from_ctx(err_ctx);
}
144
145 static int
mlx5e_tx_reporter_build_diagnose_output_sq_common(struct devlink_fmsg * fmsg,struct mlx5e_txqsq * sq,int tc)146 mlx5e_tx_reporter_build_diagnose_output_sq_common(struct devlink_fmsg *fmsg,
147 struct mlx5e_txqsq *sq, int tc)
148 {
149 bool stopped = netif_xmit_stopped(sq->txq);
150 struct mlx5e_priv *priv = sq->priv;
151 u8 state;
152 int err;
153
154 err = mlx5_core_query_sq_state(priv->mdev, sq->sqn, &state);
155 if (err)
156 return err;
157
158 err = devlink_fmsg_u32_pair_put(fmsg, "tc", tc);
159 if (err)
160 return err;
161
162 err = devlink_fmsg_u32_pair_put(fmsg, "txq ix", sq->txq_ix);
163 if (err)
164 return err;
165
166 err = devlink_fmsg_u32_pair_put(fmsg, "sqn", sq->sqn);
167 if (err)
168 return err;
169
170 err = devlink_fmsg_u8_pair_put(fmsg, "HW state", state);
171 if (err)
172 return err;
173
174 err = devlink_fmsg_bool_pair_put(fmsg, "stopped", stopped);
175 if (err)
176 return err;
177
178 err = devlink_fmsg_u32_pair_put(fmsg, "cc", sq->cc);
179 if (err)
180 return err;
181
182 err = devlink_fmsg_u32_pair_put(fmsg, "pc", sq->pc);
183 if (err)
184 return err;
185
186 err = mlx5e_health_cq_diag_fmsg(&sq->cq, fmsg);
187 if (err)
188 return err;
189
190 return mlx5e_health_eq_diag_fmsg(sq->cq.mcq.eq, fmsg);
191 }
192
193 static int
mlx5e_tx_reporter_build_diagnose_output(struct devlink_fmsg * fmsg,struct mlx5e_txqsq * sq,int tc)194 mlx5e_tx_reporter_build_diagnose_output(struct devlink_fmsg *fmsg,
195 struct mlx5e_txqsq *sq, int tc)
196 {
197 int err;
198
199 err = devlink_fmsg_obj_nest_start(fmsg);
200 if (err)
201 return err;
202
203 err = devlink_fmsg_u32_pair_put(fmsg, "channel ix", sq->ch_ix);
204 if (err)
205 return err;
206
207 err = mlx5e_tx_reporter_build_diagnose_output_sq_common(fmsg, sq, tc);
208 if (err)
209 return err;
210
211 err = devlink_fmsg_obj_nest_end(fmsg);
212 if (err)
213 return err;
214
215 return 0;
216 }
217
218 static int
mlx5e_tx_reporter_build_diagnose_output_ptpsq(struct devlink_fmsg * fmsg,struct mlx5e_ptpsq * ptpsq,int tc)219 mlx5e_tx_reporter_build_diagnose_output_ptpsq(struct devlink_fmsg *fmsg,
220 struct mlx5e_ptpsq *ptpsq, int tc)
221 {
222 int err;
223
224 err = devlink_fmsg_obj_nest_start(fmsg);
225 if (err)
226 return err;
227
228 err = devlink_fmsg_string_pair_put(fmsg, "channel", "ptp");
229 if (err)
230 return err;
231
232 err = mlx5e_tx_reporter_build_diagnose_output_sq_common(fmsg, &ptpsq->txqsq, tc);
233 if (err)
234 return err;
235
236 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Port TS");
237 if (err)
238 return err;
239
240 err = mlx5e_health_cq_diag_fmsg(&ptpsq->ts_cq, fmsg);
241 if (err)
242 return err;
243
244 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
245 if (err)
246 return err;
247
248 err = devlink_fmsg_obj_nest_end(fmsg);
249 if (err)
250 return err;
251
252 return 0;
253 }
254
255 static int
mlx5e_tx_reporter_diagnose_generic_txqsq(struct devlink_fmsg * fmsg,struct mlx5e_txqsq * txqsq)256 mlx5e_tx_reporter_diagnose_generic_txqsq(struct devlink_fmsg *fmsg,
257 struct mlx5e_txqsq *txqsq)
258 {
259 u32 sq_stride, sq_sz;
260 bool real_time;
261 int err;
262
263 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ");
264 if (err)
265 return err;
266
267 real_time = mlx5_is_real_time_sq(txqsq->mdev);
268 sq_sz = mlx5_wq_cyc_get_size(&txqsq->wq);
269 sq_stride = MLX5_SEND_WQE_BB;
270
271 err = devlink_fmsg_u64_pair_put(fmsg, "stride size", sq_stride);
272 if (err)
273 return err;
274
275 err = devlink_fmsg_u32_pair_put(fmsg, "size", sq_sz);
276 if (err)
277 return err;
278
279 err = devlink_fmsg_string_pair_put(fmsg, "ts_format", real_time ? "RT" : "FRC");
280 if (err)
281 return err;
282
283 err = mlx5e_health_cq_common_diag_fmsg(&txqsq->cq, fmsg);
284 if (err)
285 return err;
286
287 return mlx5e_health_fmsg_named_obj_nest_end(fmsg);
288 }
289
290 static int
mlx5e_tx_reporter_diagnose_generic_tx_port_ts(struct devlink_fmsg * fmsg,struct mlx5e_ptpsq * ptpsq)291 mlx5e_tx_reporter_diagnose_generic_tx_port_ts(struct devlink_fmsg *fmsg,
292 struct mlx5e_ptpsq *ptpsq)
293 {
294 int err;
295
296 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Port TS");
297 if (err)
298 return err;
299
300 err = mlx5e_health_cq_common_diag_fmsg(&ptpsq->ts_cq, fmsg);
301 if (err)
302 return err;
303
304 return mlx5e_health_fmsg_named_obj_nest_end(fmsg);
305 }
306
307 static int
mlx5e_tx_reporter_diagnose_common_config(struct devlink_health_reporter * reporter,struct devlink_fmsg * fmsg)308 mlx5e_tx_reporter_diagnose_common_config(struct devlink_health_reporter *reporter,
309 struct devlink_fmsg *fmsg)
310 {
311 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
312 struct mlx5e_txqsq *generic_sq = priv->txq2sq[0];
313 struct mlx5e_ptp *ptp_ch = priv->channels.ptp;
314 struct mlx5e_ptpsq *generic_ptpsq;
315 int err;
316
317 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Common Config");
318 if (err)
319 return err;
320
321 err = mlx5e_tx_reporter_diagnose_generic_txqsq(fmsg, generic_sq);
322 if (err)
323 return err;
324
325 if (!ptp_ch || !test_bit(MLX5E_PTP_STATE_TX, ptp_ch->state))
326 goto out;
327
328 generic_ptpsq = &ptp_ch->ptpsq[0];
329
330 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "PTP");
331 if (err)
332 return err;
333
334 err = mlx5e_tx_reporter_diagnose_generic_txqsq(fmsg, &generic_ptpsq->txqsq);
335 if (err)
336 return err;
337
338 err = mlx5e_tx_reporter_diagnose_generic_tx_port_ts(fmsg, generic_ptpsq);
339 if (err)
340 return err;
341
342 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
343 if (err)
344 return err;
345
346 out:
347 return mlx5e_health_fmsg_named_obj_nest_end(fmsg);
348 }
349
mlx5e_tx_reporter_diagnose(struct devlink_health_reporter * reporter,struct devlink_fmsg * fmsg,struct netlink_ext_ack * extack)350 static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter,
351 struct devlink_fmsg *fmsg,
352 struct netlink_ext_ack *extack)
353 {
354 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
355 struct mlx5e_ptp *ptp_ch = priv->channels.ptp;
356
357 int i, tc, err = 0;
358
359 mutex_lock(&priv->state_lock);
360
361 if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
362 goto unlock;
363
364 err = mlx5e_tx_reporter_diagnose_common_config(reporter, fmsg);
365 if (err)
366 goto unlock;
367
368 err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
369 if (err)
370 goto unlock;
371
372 for (i = 0; i < priv->channels.num; i++) {
373 struct mlx5e_channel *c = priv->channels.c[i];
374
375 for (tc = 0; tc < mlx5e_get_dcb_num_tc(&priv->channels.params); tc++) {
376 struct mlx5e_txqsq *sq = &c->sq[tc];
377
378 err = mlx5e_tx_reporter_build_diagnose_output(fmsg, sq, tc);
379 if (err)
380 goto unlock;
381 }
382 }
383
384 if (!ptp_ch || !test_bit(MLX5E_PTP_STATE_TX, ptp_ch->state))
385 goto close_sqs_nest;
386
387 for (tc = 0; tc < mlx5e_get_dcb_num_tc(&priv->channels.params); tc++) {
388 err = mlx5e_tx_reporter_build_diagnose_output_ptpsq(fmsg,
389 &ptp_ch->ptpsq[tc],
390 tc);
391 if (err)
392 goto unlock;
393 }
394
395 close_sqs_nest:
396 err = devlink_fmsg_arr_pair_nest_end(fmsg);
397 if (err)
398 goto unlock;
399
400 unlock:
401 mutex_unlock(&priv->state_lock);
402 return err;
403 }
404
mlx5e_tx_reporter_dump_sq(struct mlx5e_priv * priv,struct devlink_fmsg * fmsg,void * ctx)405 static int mlx5e_tx_reporter_dump_sq(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg,
406 void *ctx)
407 {
408 struct mlx5_rsc_key key = {};
409 struct mlx5e_txqsq *sq = ctx;
410 int err;
411
412 if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
413 return 0;
414
415 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice");
416 if (err)
417 return err;
418
419 key.size = PAGE_SIZE;
420 key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL;
421 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
422 if (err)
423 return err;
424
425 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
426 if (err)
427 return err;
428
429 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ");
430 if (err)
431 return err;
432
433 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "QPC");
434 if (err)
435 return err;
436
437 key.rsc = MLX5_SGMT_TYPE_FULL_QPC;
438 key.index1 = sq->sqn;
439 key.num_of_obj1 = 1;
440
441 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
442 if (err)
443 return err;
444
445 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
446 if (err)
447 return err;
448
449 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "send_buff");
450 if (err)
451 return err;
452
453 key.rsc = MLX5_SGMT_TYPE_SND_BUFF;
454 key.num_of_obj2 = MLX5_RSC_DUMP_ALL;
455 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
456 if (err)
457 return err;
458
459 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
460 if (err)
461 return err;
462
463 return mlx5e_health_fmsg_named_obj_nest_end(fmsg);
464 }
465
mlx5e_tx_reporter_dump_all_sqs(struct mlx5e_priv * priv,struct devlink_fmsg * fmsg)466 static int mlx5e_tx_reporter_dump_all_sqs(struct mlx5e_priv *priv,
467 struct devlink_fmsg *fmsg)
468 {
469 struct mlx5e_ptp *ptp_ch = priv->channels.ptp;
470 struct mlx5_rsc_key key = {};
471 int i, tc, err;
472
473 if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
474 return 0;
475
476 err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice");
477 if (err)
478 return err;
479
480 key.size = PAGE_SIZE;
481 key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL;
482 err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
483 if (err)
484 return err;
485
486 err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
487 if (err)
488 return err;
489
490 err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
491 if (err)
492 return err;
493
494 for (i = 0; i < priv->channels.num; i++) {
495 struct mlx5e_channel *c = priv->channels.c[i];
496
497 for (tc = 0; tc < mlx5e_get_dcb_num_tc(&priv->channels.params); tc++) {
498 struct mlx5e_txqsq *sq = &c->sq[tc];
499
500 err = mlx5e_health_queue_dump(priv, fmsg, sq->sqn, "SQ");
501 if (err)
502 return err;
503 }
504 }
505
506 if (ptp_ch && test_bit(MLX5E_PTP_STATE_TX, ptp_ch->state)) {
507 for (tc = 0; tc < mlx5e_get_dcb_num_tc(&priv->channels.params); tc++) {
508 struct mlx5e_txqsq *sq = &ptp_ch->ptpsq[tc].txqsq;
509
510 err = mlx5e_health_queue_dump(priv, fmsg, sq->sqn, "PTP SQ");
511 if (err)
512 return err;
513 }
514 }
515
516 return devlink_fmsg_arr_pair_nest_end(fmsg);
517 }
518
mlx5e_tx_reporter_dump_from_ctx(struct mlx5e_priv * priv,struct mlx5e_err_ctx * err_ctx,struct devlink_fmsg * fmsg)519 static int mlx5e_tx_reporter_dump_from_ctx(struct mlx5e_priv *priv,
520 struct mlx5e_err_ctx *err_ctx,
521 struct devlink_fmsg *fmsg)
522 {
523 return err_ctx->dump(priv, fmsg, err_ctx->ctx);
524 }
525
/*
 * devlink health .dump callback of the TX reporter.
 *
 * When an error context is supplied, its own dump callback is run; without a
 * context, all SQs are dumped.
 */
static int mlx5e_tx_reporter_dump(struct devlink_health_reporter *reporter,
				  struct devlink_fmsg *fmsg, void *context,
				  struct netlink_ext_ack *extack)
{
	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
	struct mlx5e_err_ctx *err_ctx = context;

	if (!err_ctx)
		return mlx5e_tx_reporter_dump_all_sqs(priv, fmsg);

	return mlx5e_tx_reporter_dump_from_ctx(priv, err_ctx, fmsg);
}
536
mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq * sq)537 void mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq *sq)
538 {
539 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN];
540 struct mlx5e_priv *priv = sq->priv;
541 struct mlx5e_err_ctx err_ctx = {};
542
543 err_ctx.ctx = sq;
544 err_ctx.recover = mlx5e_tx_reporter_err_cqe_recover;
545 err_ctx.dump = mlx5e_tx_reporter_dump_sq;
546 snprintf(err_str, sizeof(err_str), "ERR CQE on SQ: 0x%x", sq->sqn);
547
548 mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx);
549 }
550
mlx5e_reporter_tx_timeout(struct mlx5e_txqsq * sq)551 int mlx5e_reporter_tx_timeout(struct mlx5e_txqsq *sq)
552 {
553 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN];
554 struct mlx5e_tx_timeout_ctx to_ctx = {};
555 struct mlx5e_priv *priv = sq->priv;
556 struct mlx5e_err_ctx err_ctx = {};
557
558 to_ctx.sq = sq;
559 err_ctx.ctx = &to_ctx;
560 err_ctx.recover = mlx5e_tx_reporter_timeout_recover;
561 err_ctx.dump = mlx5e_tx_reporter_dump_sq;
562 snprintf(err_str, sizeof(err_str),
563 "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u",
564 sq->ch_ix, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc,
565 jiffies_to_usecs(jiffies - sq->txq->trans_start));
566
567 mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx);
568 return to_ctx.status;
569 }
570
571 static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = {
572 .name = "tx",
573 .recover = mlx5e_tx_reporter_recover,
574 .diagnose = mlx5e_tx_reporter_diagnose,
575 .dump = mlx5e_tx_reporter_dump,
576 };
577
578 #define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500
579
mlx5e_reporter_tx_create(struct mlx5e_priv * priv)580 void mlx5e_reporter_tx_create(struct mlx5e_priv *priv)
581 {
582 struct devlink_port *dl_port = mlx5e_devlink_get_dl_port(priv);
583 struct devlink_health_reporter *reporter;
584
585 reporter = devlink_port_health_reporter_create(dl_port, &mlx5_tx_reporter_ops,
586 MLX5_REPORTER_TX_GRACEFUL_PERIOD, priv);
587 if (IS_ERR(reporter)) {
588 netdev_warn(priv->netdev,
589 "Failed to create tx reporter, err = %ld\n",
590 PTR_ERR(reporter));
591 return;
592 }
593 priv->tx_reporter = reporter;
594 }
595
mlx5e_reporter_tx_destroy(struct mlx5e_priv * priv)596 void mlx5e_reporter_tx_destroy(struct mlx5e_priv *priv)
597 {
598 if (!priv->tx_reporter)
599 return;
600
601 devlink_port_health_reporter_destroy(priv->tx_reporter);
602 priv->tx_reporter = NULL;
603 }
604