1 /* SPDX-License-Identifier: GPL-2.0 */
2 /* Copyright (c) 2019 Mellanox Technologies. */
3 
4 #include "health.h"
5 
mlx5e_wait_for_sq_flush(struct mlx5e_txqsq * sq)6 static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq)
7 {
8 	unsigned long exp_time = jiffies +
9 				 msecs_to_jiffies(MLX5E_REPORTER_FLUSH_TIMEOUT_MSEC);
10 
11 	while (time_before(jiffies, exp_time)) {
12 		if (sq->cc == sq->pc)
13 			return 0;
14 
15 		msleep(20);
16 	}
17 
18 	netdev_err(sq->channel->netdev,
19 		   "Wait for SQ 0x%x flush timeout (sq cc = 0x%x, sq pc = 0x%x)\n",
20 		   sq->sqn, sq->cc, sq->pc);
21 
22 	return -ETIMEDOUT;
23 }
24 
mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq * sq)25 static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq)
26 {
27 	WARN_ONCE(sq->cc != sq->pc,
28 		  "SQ 0x%x: cc (0x%x) != pc (0x%x)\n",
29 		  sq->sqn, sq->cc, sq->pc);
30 	sq->cc = 0;
31 	sq->dma_fifo_cc = 0;
32 	sq->pc = 0;
33 }
34 
mlx5e_tx_reporter_err_cqe_recover(void * ctx)35 static int mlx5e_tx_reporter_err_cqe_recover(void *ctx)
36 {
37 	struct mlx5_core_dev *mdev;
38 	struct net_device *dev;
39 	struct mlx5e_txqsq *sq;
40 	u8 state;
41 	int err;
42 
43 	sq = ctx;
44 	mdev = sq->channel->mdev;
45 	dev = sq->channel->netdev;
46 
47 	if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state))
48 		return 0;
49 
50 	err = mlx5_core_query_sq_state(mdev, sq->sqn, &state);
51 	if (err) {
52 		netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n",
53 			   sq->sqn, err);
54 		goto out;
55 	}
56 
57 	if (state != MLX5_SQC_STATE_ERR)
58 		goto out;
59 
60 	mlx5e_tx_disable_queue(sq->txq);
61 
62 	err = mlx5e_wait_for_sq_flush(sq);
63 	if (err)
64 		goto out;
65 
66 	/* At this point, no new packets will arrive from the stack as TXQ is
67 	 * marked with QUEUE_STATE_DRV_XOFF. In addition, NAPI cleared all
68 	 * pending WQEs. SQ can safely reset the SQ.
69 	 */
70 
71 	err = mlx5e_health_sq_to_ready(sq->channel, sq->sqn);
72 	if (err)
73 		goto out;
74 
75 	mlx5e_reset_txqsq_cc_pc(sq);
76 	sq->stats->recover++;
77 	clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state);
78 	mlx5e_activate_txqsq(sq);
79 
80 	return 0;
81 out:
82 	clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state);
83 	return err;
84 }
85 
/* Context passed to the TX timeout recovery callback. */
struct mlx5e_tx_timeout_ctx {
	/* SQ on which the timeout was detected */
	struct mlx5e_txqsq *sq;
	/* Recovery outcome: 0 - this SQ recovered, 1 - all channels were
	 * reopened, otherwise the error returned by the failed recovery.
	 */
	int status;
};
90 
mlx5e_tx_reporter_timeout_recover(void * ctx)91 static int mlx5e_tx_reporter_timeout_recover(void *ctx)
92 {
93 	struct mlx5e_tx_timeout_ctx *to_ctx;
94 	struct mlx5e_priv *priv;
95 	struct mlx5_eq_comp *eq;
96 	struct mlx5e_txqsq *sq;
97 	int err;
98 
99 	to_ctx = ctx;
100 	sq = to_ctx->sq;
101 	eq = sq->cq.mcq.eq;
102 	priv = sq->channel->priv;
103 	err = mlx5e_health_channel_eq_recover(eq, sq->channel);
104 	if (!err) {
105 		to_ctx->status = 0; /* this sq recovered */
106 		return err;
107 	}
108 
109 	err = mlx5e_safe_reopen_channels(priv);
110 	if (!err) {
111 		to_ctx->status = 1; /* all channels recovered */
112 		return err;
113 	}
114 
115 	to_ctx->status = err;
116 	clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
117 	netdev_err(priv->netdev,
118 		   "mlx5e_safe_reopen_channels failed recovering from a tx_timeout, err(%d).\n",
119 		   err);
120 
121 	return err;
122 }
123 
124 /* state lock cannot be grabbed within this function.
125  * It can cause a dead lock or a read-after-free.
126  */
mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_err_ctx * err_ctx)127 static int mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_err_ctx *err_ctx)
128 {
129 	return err_ctx->recover(err_ctx->ctx);
130 }
131 
/* devlink .recover callback of the TX reporter.
 *
 * With an error context (@context != NULL) the context's own recovery
 * callback is run; without one, all channels are recovered through
 * mlx5e_health_recover_channels(). @extack is unused.
 */
static int mlx5e_tx_reporter_recover(struct devlink_health_reporter *reporter,
				     void *context,
				     struct netlink_ext_ack *extack)
{
	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
	struct mlx5e_err_ctx *err_ctx = context;

	if (!err_ctx)
		return mlx5e_health_recover_channels(priv);

	return mlx5e_tx_reporter_recover_from_ctx(err_ctx);
}
142 
143 static int
mlx5e_tx_reporter_build_diagnose_output(struct devlink_fmsg * fmsg,struct mlx5e_txqsq * sq,int tc)144 mlx5e_tx_reporter_build_diagnose_output(struct devlink_fmsg *fmsg,
145 					struct mlx5e_txqsq *sq, int tc)
146 {
147 	struct mlx5e_priv *priv = sq->channel->priv;
148 	bool stopped = netif_xmit_stopped(sq->txq);
149 	u8 state;
150 	int err;
151 
152 	err = mlx5_core_query_sq_state(priv->mdev, sq->sqn, &state);
153 	if (err)
154 		return err;
155 
156 	err = devlink_fmsg_obj_nest_start(fmsg);
157 	if (err)
158 		return err;
159 
160 	err = devlink_fmsg_u32_pair_put(fmsg, "channel ix", sq->ch_ix);
161 	if (err)
162 		return err;
163 
164 	err = devlink_fmsg_u32_pair_put(fmsg, "tc", tc);
165 	if (err)
166 		return err;
167 
168 	err = devlink_fmsg_u32_pair_put(fmsg, "txq ix", sq->txq_ix);
169 	if (err)
170 		return err;
171 
172 	err = devlink_fmsg_u32_pair_put(fmsg, "sqn", sq->sqn);
173 	if (err)
174 		return err;
175 
176 	err = devlink_fmsg_u8_pair_put(fmsg, "HW state", state);
177 	if (err)
178 		return err;
179 
180 	err = devlink_fmsg_bool_pair_put(fmsg, "stopped", stopped);
181 	if (err)
182 		return err;
183 
184 	err = devlink_fmsg_u32_pair_put(fmsg, "cc", sq->cc);
185 	if (err)
186 		return err;
187 
188 	err = devlink_fmsg_u32_pair_put(fmsg, "pc", sq->pc);
189 	if (err)
190 		return err;
191 
192 	err = mlx5e_health_cq_diag_fmsg(&sq->cq, fmsg);
193 	if (err)
194 		return err;
195 
196 	err = mlx5e_health_eq_diag_fmsg(sq->cq.mcq.eq, fmsg);
197 	if (err)
198 		return err;
199 
200 	err = devlink_fmsg_obj_nest_end(fmsg);
201 	if (err)
202 		return err;
203 
204 	return 0;
205 }
206 
mlx5e_tx_reporter_diagnose(struct devlink_health_reporter * reporter,struct devlink_fmsg * fmsg,struct netlink_ext_ack * extack)207 static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter,
208 				      struct devlink_fmsg *fmsg,
209 				      struct netlink_ext_ack *extack)
210 {
211 	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
212 	struct mlx5e_txqsq *generic_sq = priv->txq2sq[0];
213 	u32 sq_stride, sq_sz;
214 
215 	int i, tc, err = 0;
216 
217 	mutex_lock(&priv->state_lock);
218 
219 	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
220 		goto unlock;
221 
222 	sq_sz = mlx5_wq_cyc_get_size(&generic_sq->wq);
223 	sq_stride = MLX5_SEND_WQE_BB;
224 
225 	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "Common Config");
226 	if (err)
227 		goto unlock;
228 
229 	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ");
230 	if (err)
231 		goto unlock;
232 
233 	err = devlink_fmsg_u64_pair_put(fmsg, "stride size", sq_stride);
234 	if (err)
235 		goto unlock;
236 
237 	err = devlink_fmsg_u32_pair_put(fmsg, "size", sq_sz);
238 	if (err)
239 		goto unlock;
240 
241 	err = mlx5e_health_cq_common_diag_fmsg(&generic_sq->cq, fmsg);
242 	if (err)
243 		goto unlock;
244 
245 	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
246 	if (err)
247 		goto unlock;
248 
249 	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
250 	if (err)
251 		goto unlock;
252 
253 	err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
254 	if (err)
255 		goto unlock;
256 
257 	for (i = 0; i < priv->channels.num; i++) {
258 		struct mlx5e_channel *c = priv->channels.c[i];
259 
260 		for (tc = 0; tc < priv->channels.params.num_tc; tc++) {
261 			struct mlx5e_txqsq *sq = &c->sq[tc];
262 
263 			err = mlx5e_tx_reporter_build_diagnose_output(fmsg, sq, tc);
264 			if (err)
265 				goto unlock;
266 		}
267 	}
268 	err = devlink_fmsg_arr_pair_nest_end(fmsg);
269 	if (err)
270 		goto unlock;
271 
272 unlock:
273 	mutex_unlock(&priv->state_lock);
274 	return err;
275 }
276 
mlx5e_tx_reporter_dump_sq(struct mlx5e_priv * priv,struct devlink_fmsg * fmsg,void * ctx)277 static int mlx5e_tx_reporter_dump_sq(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg,
278 				     void *ctx)
279 {
280 	struct mlx5_rsc_key key = {};
281 	struct mlx5e_txqsq *sq = ctx;
282 	int err;
283 
284 	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
285 		return 0;
286 
287 	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice");
288 	if (err)
289 		return err;
290 
291 	key.size = PAGE_SIZE;
292 	key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL;
293 	err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
294 	if (err)
295 		return err;
296 
297 	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
298 	if (err)
299 		return err;
300 
301 	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SQ");
302 	if (err)
303 		return err;
304 
305 	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "QPC");
306 	if (err)
307 		return err;
308 
309 	key.rsc = MLX5_SGMT_TYPE_FULL_QPC;
310 	key.index1 = sq->sqn;
311 	key.num_of_obj1 = 1;
312 
313 	err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
314 	if (err)
315 		return err;
316 
317 	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
318 	if (err)
319 		return err;
320 
321 	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "send_buff");
322 	if (err)
323 		return err;
324 
325 	key.rsc = MLX5_SGMT_TYPE_SND_BUFF;
326 	key.num_of_obj2 = MLX5_RSC_DUMP_ALL;
327 	err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
328 	if (err)
329 		return err;
330 
331 	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
332 	if (err)
333 		return err;
334 
335 	return mlx5e_health_fmsg_named_obj_nest_end(fmsg);
336 }
337 
mlx5e_tx_reporter_dump_all_sqs(struct mlx5e_priv * priv,struct devlink_fmsg * fmsg)338 static int mlx5e_tx_reporter_dump_all_sqs(struct mlx5e_priv *priv,
339 					  struct devlink_fmsg *fmsg)
340 {
341 	struct mlx5_rsc_key key = {};
342 	int i, tc, err;
343 
344 	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
345 		return 0;
346 
347 	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "SX Slice");
348 	if (err)
349 		return err;
350 
351 	key.size = PAGE_SIZE;
352 	key.rsc = MLX5_SGMT_TYPE_SX_SLICE_ALL;
353 	err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
354 	if (err)
355 		return err;
356 
357 	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
358 	if (err)
359 		return err;
360 
361 	err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
362 	if (err)
363 		return err;
364 
365 	for (i = 0; i < priv->channels.num; i++) {
366 		struct mlx5e_channel *c = priv->channels.c[i];
367 
368 		for (tc = 0; tc < priv->channels.params.num_tc; tc++) {
369 			struct mlx5e_txqsq *sq = &c->sq[tc];
370 
371 			err = mlx5e_health_queue_dump(priv, fmsg, sq->sqn, "SQ");
372 			if (err)
373 				return err;
374 		}
375 	}
376 	return devlink_fmsg_arr_pair_nest_end(fmsg);
377 }
378 
mlx5e_tx_reporter_dump_from_ctx(struct mlx5e_priv * priv,struct mlx5e_err_ctx * err_ctx,struct devlink_fmsg * fmsg)379 static int mlx5e_tx_reporter_dump_from_ctx(struct mlx5e_priv *priv,
380 					   struct mlx5e_err_ctx *err_ctx,
381 					   struct devlink_fmsg *fmsg)
382 {
383 	return err_ctx->dump(priv, fmsg, err_ctx->ctx);
384 }
385 
/* devlink .dump callback of the TX reporter.
 *
 * With an error context (@context != NULL) the context's own dump callback
 * is used; without one, all SQs are dumped. @extack is unused.
 */
static int mlx5e_tx_reporter_dump(struct devlink_health_reporter *reporter,
				  struct devlink_fmsg *fmsg, void *context,
				  struct netlink_ext_ack *extack)
{
	struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
	struct mlx5e_err_ctx *err_ctx = context;

	if (!err_ctx)
		return mlx5e_tx_reporter_dump_all_sqs(priv, fmsg);

	return mlx5e_tx_reporter_dump_from_ctx(priv, err_ctx, fmsg);
}
396 
mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq * sq)397 void mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq *sq)
398 {
399 	struct mlx5e_priv *priv = sq->channel->priv;
400 	char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN];
401 	struct mlx5e_err_ctx err_ctx = {};
402 
403 	err_ctx.ctx = sq;
404 	err_ctx.recover = mlx5e_tx_reporter_err_cqe_recover;
405 	err_ctx.dump = mlx5e_tx_reporter_dump_sq;
406 	snprintf(err_str, sizeof(err_str), "ERR CQE on SQ: 0x%x", sq->sqn);
407 
408 	mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx);
409 }
410 
mlx5e_reporter_tx_timeout(struct mlx5e_txqsq * sq)411 int mlx5e_reporter_tx_timeout(struct mlx5e_txqsq *sq)
412 {
413 	struct mlx5e_priv *priv = sq->channel->priv;
414 	char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN];
415 	struct mlx5e_tx_timeout_ctx to_ctx = {};
416 	struct mlx5e_err_ctx err_ctx = {};
417 
418 	to_ctx.sq = sq;
419 	err_ctx.ctx = &to_ctx;
420 	err_ctx.recover = mlx5e_tx_reporter_timeout_recover;
421 	err_ctx.dump = mlx5e_tx_reporter_dump_sq;
422 	snprintf(err_str, sizeof(err_str),
423 		 "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u",
424 		 sq->channel->ix, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc,
425 		 jiffies_to_usecs(jiffies - sq->txq->trans_start));
426 
427 	mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx);
428 	return to_ctx.status;
429 }
430 
431 static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = {
432 		.name = "tx",
433 		.recover = mlx5e_tx_reporter_recover,
434 		.diagnose = mlx5e_tx_reporter_diagnose,
435 		.dump = mlx5e_tx_reporter_dump,
436 };
437 
438 #define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500
439 
mlx5e_reporter_tx_create(struct mlx5e_priv * priv)440 void mlx5e_reporter_tx_create(struct mlx5e_priv *priv)
441 {
442 	struct devlink_health_reporter *reporter;
443 
444 	reporter = devlink_port_health_reporter_create(&priv->dl_port, &mlx5_tx_reporter_ops,
445 						       MLX5_REPORTER_TX_GRACEFUL_PERIOD, priv);
446 	if (IS_ERR(reporter)) {
447 		netdev_warn(priv->netdev,
448 			    "Failed to create tx reporter, err = %ld\n",
449 			    PTR_ERR(reporter));
450 		return;
451 	}
452 	priv->tx_reporter = reporter;
453 }
454 
mlx5e_reporter_tx_destroy(struct mlx5e_priv * priv)455 void mlx5e_reporter_tx_destroy(struct mlx5e_priv *priv)
456 {
457 	if (!priv->tx_reporter)
458 		return;
459 
460 	devlink_port_health_reporter_destroy(priv->tx_reporter);
461 }
462