1 // SPDX-License-Identifier: GPL-2.0
2 // Copyright (c) 2019 Mellanox Technologies.
3 
4 #include "health.h"
5 #include "lib/eq.h"
6 #include "lib/mlx5.h"
7 
/*
 * Open a named nested object in @fmsg: a pair nest keyed by @name is started
 * first, followed by an object nest inside it.
 *
 * Returns 0 on success, otherwise the first non-zero code returned by the
 * devlink fmsg helpers.
 */
int mlx5e_health_fmsg_named_obj_nest_start(struct devlink_fmsg *fmsg, char *name)
{
	int ret = devlink_fmsg_pair_nest_start(fmsg, name);

	/* Only open the object nest once the enclosing pair nest is open. */
	if (!ret)
		ret = devlink_fmsg_obj_nest_start(fmsg);

	return ret;
}
22 
/*
 * Close a named nested object in @fmsg: the object nest is ended first,
 * then the pair nest that wraps it.
 *
 * Returns 0 on success, otherwise the first non-zero code returned by the
 * devlink fmsg helpers.
 */
int mlx5e_health_fmsg_named_obj_nest_end(struct devlink_fmsg *fmsg)
{
	int ret = devlink_fmsg_obj_nest_end(fmsg);

	/* The pair nest is only closed when the object nest closed cleanly. */
	if (!ret)
		ret = devlink_fmsg_pair_nest_end(fmsg);

	return ret;
}
37 
mlx5e_health_cq_diag_fmsg(struct mlx5e_cq * cq,struct devlink_fmsg * fmsg)38 int mlx5e_health_cq_diag_fmsg(struct mlx5e_cq *cq, struct devlink_fmsg *fmsg)
39 {
40 	struct mlx5e_priv *priv = cq->channel->priv;
41 	u32 out[MLX5_ST_SZ_DW(query_cq_out)] = {};
42 	u8 hw_status;
43 	void *cqc;
44 	int err;
45 
46 	err = mlx5_core_query_cq(priv->mdev, &cq->mcq, out);
47 	if (err)
48 		return err;
49 
50 	cqc = MLX5_ADDR_OF(query_cq_out, out, cq_context);
51 	hw_status = MLX5_GET(cqc, cqc, status);
52 
53 	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "CQ");
54 	if (err)
55 		return err;
56 
57 	err = devlink_fmsg_u32_pair_put(fmsg, "cqn", cq->mcq.cqn);
58 	if (err)
59 		return err;
60 
61 	err = devlink_fmsg_u8_pair_put(fmsg, "HW status", hw_status);
62 	if (err)
63 		return err;
64 
65 	err = devlink_fmsg_u32_pair_put(fmsg, "ci", mlx5_cqwq_get_ci(&cq->wq));
66 	if (err)
67 		return err;
68 
69 	err = devlink_fmsg_u32_pair_put(fmsg, "size", mlx5_cqwq_get_size(&cq->wq));
70 	if (err)
71 		return err;
72 
73 	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
74 	if (err)
75 		return err;
76 
77 	return 0;
78 }
79 
mlx5e_health_cq_common_diag_fmsg(struct mlx5e_cq * cq,struct devlink_fmsg * fmsg)80 int mlx5e_health_cq_common_diag_fmsg(struct mlx5e_cq *cq, struct devlink_fmsg *fmsg)
81 {
82 	u8 cq_log_stride;
83 	u32 cq_sz;
84 	int err;
85 
86 	cq_sz = mlx5_cqwq_get_size(&cq->wq);
87 	cq_log_stride = mlx5_cqwq_get_log_stride_size(&cq->wq);
88 
89 	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "CQ");
90 	if (err)
91 		return err;
92 
93 	err = devlink_fmsg_u64_pair_put(fmsg, "stride size", BIT(cq_log_stride));
94 	if (err)
95 		return err;
96 
97 	err = devlink_fmsg_u32_pair_put(fmsg, "size", cq_sz);
98 	if (err)
99 		return err;
100 
101 	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
102 	if (err)
103 		return err;
104 
105 	return 0;
106 }
107 
mlx5e_health_eq_diag_fmsg(struct mlx5_eq_comp * eq,struct devlink_fmsg * fmsg)108 int mlx5e_health_eq_diag_fmsg(struct mlx5_eq_comp *eq, struct devlink_fmsg *fmsg)
109 {
110 	int err;
111 
112 	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, "EQ");
113 	if (err)
114 		return err;
115 
116 	err = devlink_fmsg_u8_pair_put(fmsg, "eqn", eq->core.eqn);
117 	if (err)
118 		return err;
119 
120 	err = devlink_fmsg_u32_pair_put(fmsg, "irqn", eq->core.irqn);
121 	if (err)
122 		return err;
123 
124 	err = devlink_fmsg_u32_pair_put(fmsg, "vecidx", eq->core.vecidx);
125 	if (err)
126 		return err;
127 
128 	err = devlink_fmsg_u32_pair_put(fmsg, "ci", eq->core.cons_index);
129 	if (err)
130 		return err;
131 
132 	err = devlink_fmsg_u32_pair_put(fmsg, "size", eq->core.nent);
133 	if (err)
134 		return err;
135 
136 	return mlx5e_health_fmsg_named_obj_nest_end(fmsg);
137 }
138 
/*
 * Create the health reporters for @priv: the TX reporter first, then the
 * RX reporter. No status is returned to the caller.
 */
void mlx5e_health_create_reporters(struct mlx5e_priv *priv)
{
	mlx5e_reporter_tx_create(priv);
	mlx5e_reporter_rx_create(priv);
}
144 
/*
 * Destroy the health reporters of @priv: the RX reporter first, then the
 * TX reporter.
 */
void mlx5e_health_destroy_reporters(struct mlx5e_priv *priv)
{
	mlx5e_reporter_rx_destroy(priv);
	mlx5e_reporter_tx_destroy(priv);
}
150 
mlx5e_health_channels_update(struct mlx5e_priv * priv)151 void mlx5e_health_channels_update(struct mlx5e_priv *priv)
152 {
153 	if (priv->tx_reporter)
154 		devlink_health_reporter_state_update(priv->tx_reporter,
155 						     DEVLINK_HEALTH_REPORTER_STATE_HEALTHY);
156 	if (priv->rx_reporter)
157 		devlink_health_reporter_state_update(priv->rx_reporter,
158 						     DEVLINK_HEALTH_REPORTER_STATE_HEALTHY);
159 }
160 
/*
 * Drive send queue @sqn through two state changes using mlx5e_modify_sq():
 * ERR -> RST, then RST -> RDY.
 *
 * A failed transition is logged on the channel's netdev and its error code
 * is returned; the second transition is not attempted if the first fails.
 * Returns 0 when both transitions succeed.
 */
int mlx5e_health_sq_to_ready(struct mlx5e_channel *channel, u32 sqn)
{
	struct mlx5e_modify_sq_param to_reset = {
		.curr_state = MLX5_SQC_STATE_ERR,
		.next_state = MLX5_SQC_STATE_RST,
	};
	struct mlx5e_modify_sq_param to_ready = {
		.curr_state = MLX5_SQC_STATE_RST,
		.next_state = MLX5_SQC_STATE_RDY,
	};
	struct net_device *netdev = channel->netdev;
	struct mlx5_core_dev *mdev = channel->mdev;
	int ret;

	ret = mlx5e_modify_sq(mdev, sqn, &to_reset);
	if (ret) {
		netdev_err(netdev, "Failed to move sq 0x%x to reset\n", sqn);
		return ret;
	}

	ret = mlx5e_modify_sq(mdev, sqn, &to_ready);
	if (ret) {
		netdev_err(netdev, "Failed to move sq 0x%x to ready\n", sqn);
		return ret;
	}

	return 0;
}
189 
mlx5e_health_recover_channels(struct mlx5e_priv * priv)190 int mlx5e_health_recover_channels(struct mlx5e_priv *priv)
191 {
192 	int err = 0;
193 
194 	rtnl_lock();
195 	mutex_lock(&priv->state_lock);
196 
197 	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
198 		goto out;
199 
200 	err = mlx5e_safe_reopen_channels(priv);
201 
202 out:
203 	mutex_unlock(&priv->state_lock);
204 	rtnl_unlock();
205 
206 	return err;
207 }
208 
mlx5e_health_channel_eq_recover(struct mlx5_eq_comp * eq,struct mlx5e_channel * channel)209 int mlx5e_health_channel_eq_recover(struct mlx5_eq_comp *eq, struct mlx5e_channel *channel)
210 {
211 	u32 eqe_count;
212 
213 	netdev_err(channel->netdev, "EQ 0x%x: Cons = 0x%x, irqn = 0x%x\n",
214 		   eq->core.eqn, eq->core.cons_index, eq->core.irqn);
215 
216 	eqe_count = mlx5_eq_poll_irq_disabled(eq);
217 	if (!eqe_count)
218 		return -EIO;
219 
220 	netdev_err(channel->netdev, "Recovered %d eqes on EQ 0x%x\n",
221 		   eqe_count, eq->core.eqn);
222 
223 	channel->stats->eq_rearm++;
224 	return 0;
225 }
226 
mlx5e_health_report(struct mlx5e_priv * priv,struct devlink_health_reporter * reporter,char * err_str,struct mlx5e_err_ctx * err_ctx)227 int mlx5e_health_report(struct mlx5e_priv *priv,
228 			struct devlink_health_reporter *reporter, char *err_str,
229 			struct mlx5e_err_ctx *err_ctx)
230 {
231 	netdev_err(priv->netdev, "%s\n", err_str);
232 
233 	if (!reporter)
234 		return err_ctx->recover(err_ctx->ctx);
235 
236 	return devlink_health_report(reporter, err_str, err_ctx);
237 }
238 
239 #define MLX5_HEALTH_DEVLINK_MAX_SIZE 1024
mlx5e_health_rsc_fmsg_binary(struct devlink_fmsg * fmsg,const void * value,u32 value_len)240 static int mlx5e_health_rsc_fmsg_binary(struct devlink_fmsg *fmsg,
241 					const void *value, u32 value_len)
242 
243 {
244 	u32 data_size;
245 	int err = 0;
246 	u32 offset;
247 
248 	for (offset = 0; offset < value_len; offset += data_size) {
249 		data_size = value_len - offset;
250 		if (data_size > MLX5_HEALTH_DEVLINK_MAX_SIZE)
251 			data_size = MLX5_HEALTH_DEVLINK_MAX_SIZE;
252 		err = devlink_fmsg_binary_put(fmsg, value + offset, data_size);
253 		if (err)
254 			break;
255 	}
256 	return err;
257 }
258 
mlx5e_health_rsc_fmsg_dump(struct mlx5e_priv * priv,struct mlx5_rsc_key * key,struct devlink_fmsg * fmsg)259 int mlx5e_health_rsc_fmsg_dump(struct mlx5e_priv *priv, struct mlx5_rsc_key *key,
260 			       struct devlink_fmsg *fmsg)
261 {
262 	struct mlx5_core_dev *mdev = priv->mdev;
263 	struct mlx5_rsc_dump_cmd *cmd;
264 	struct page *page;
265 	int cmd_err, err;
266 	int end_err;
267 	int size;
268 
269 	if (IS_ERR_OR_NULL(mdev->rsc_dump))
270 		return -EOPNOTSUPP;
271 
272 	page = alloc_page(GFP_KERNEL);
273 	if (!page)
274 		return -ENOMEM;
275 
276 	err = devlink_fmsg_binary_pair_nest_start(fmsg, "data");
277 	if (err)
278 		return err;
279 
280 	cmd = mlx5_rsc_dump_cmd_create(mdev, key);
281 	if (IS_ERR(cmd)) {
282 		err = PTR_ERR(cmd);
283 		goto free_page;
284 	}
285 
286 	do {
287 		cmd_err = mlx5_rsc_dump_next(mdev, cmd, page, &size);
288 		if (cmd_err < 0) {
289 			err = cmd_err;
290 			goto destroy_cmd;
291 		}
292 
293 		err = mlx5e_health_rsc_fmsg_binary(fmsg, page_address(page), size);
294 		if (err)
295 			goto destroy_cmd;
296 
297 	} while (cmd_err > 0);
298 
299 destroy_cmd:
300 	mlx5_rsc_dump_cmd_destroy(cmd);
301 	end_err = devlink_fmsg_binary_pair_nest_end(fmsg);
302 	if (end_err)
303 		err = end_err;
304 free_page:
305 	__free_page(page);
306 	return err;
307 }
308 
mlx5e_health_queue_dump(struct mlx5e_priv * priv,struct devlink_fmsg * fmsg,int queue_idx,char * lbl)309 int mlx5e_health_queue_dump(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg,
310 			    int queue_idx, char *lbl)
311 {
312 	struct mlx5_rsc_key key = {};
313 	int err;
314 
315 	key.rsc = MLX5_SGMT_TYPE_FULL_QPC;
316 	key.index1 = queue_idx;
317 	key.size = PAGE_SIZE;
318 	key.num_of_obj1 = 1;
319 
320 	err = devlink_fmsg_obj_nest_start(fmsg);
321 	if (err)
322 		return err;
323 
324 	err = mlx5e_health_fmsg_named_obj_nest_start(fmsg, lbl);
325 	if (err)
326 		return err;
327 
328 	err = devlink_fmsg_u32_pair_put(fmsg, "index", queue_idx);
329 	if (err)
330 		return err;
331 
332 	err = mlx5e_health_rsc_fmsg_dump(priv, &key, fmsg);
333 	if (err)
334 		return err;
335 
336 	err = mlx5e_health_fmsg_named_obj_nest_end(fmsg);
337 	if (err)
338 		return err;
339 
340 	return devlink_fmsg_obj_nest_end(fmsg);
341 }
342