/*
 * SPDX-License-Identifier: Apache-2.0
 * Copyright (c) 2022 Intel Corp.
 */

#include <zephyr/logging/log.h>
LOG_MODULE_DECLARE(nvme, CONFIG_NVME_LOG_LEVEL);

#include <zephyr/kernel.h>
#include <zephyr/cache.h>
#include <zephyr/sys/byteorder.h>

#include <string.h>

#include "nvme.h"
#include "nvme_helpers.h"

static struct nvme_prp_list prp_list_pool[CONFIG_NVME_PRP_LIST_AMOUNT];
static sys_dlist_t free_prp_list;

static struct nvme_request request_pool[NVME_REQUEST_AMOUNT];
static sys_dlist_t free_request;
static sys_dlist_t pending_request;

static void request_timeout(struct k_work *work);

static K_WORK_DELAYABLE_DEFINE(request_timer, request_timeout);

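/*
 * Initialize the shared pools: every request and PRP list entry starts on
 * its free list, and the pending request list starts empty.
 */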
void nvme_cmd_init(void)
{
	int idx;

	sys_dlist_init(&free_request);
	sys_dlist_init(&pending_request);
	sys_dlist_init(&free_prp_list);

	for (idx = 0; idx < NVME_REQUEST_AMOUNT; idx++) {
		sys_dlist_append(&free_request, &request_pool[idx].node);
	}

	for (idx = 0; idx < CONFIG_NVME_PRP_LIST_AMOUNT; idx++) {
		sys_dlist_append(&free_prp_list, &prp_list_pool[idx].node);
	}
}

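/* Take a PRP list off the free pool; returns NULL if the pool is exhausted. */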
static struct nvme_prp_list *nvme_prp_list_alloc(void)
{
	sys_dnode_t *node;

	node = sys_dlist_peek_head(&free_prp_list);
	if (!node) {
		LOG_ERR("Could not allocate PRP list");
		return NULL;
	}

	sys_dlist_remove(node);

	return CONTAINER_OF(node, struct nvme_prp_list, node);
}

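/* Zero a PRP list and put it back on the free pool. */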
static void nvme_prp_list_free(struct nvme_prp_list *prp_list)
{
	memset(prp_list, 0, sizeof(struct nvme_prp_list));
	sys_dlist_append(&free_prp_list, &prp_list->node);
}

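/*
 * Release a request: unlink it from whichever list it is on, free its PRP
 * list if one was allocated, then zero it and return it to the free pool.
 */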
void nvme_cmd_request_free(struct nvme_request *request)
{
	if (sys_dnode_is_linked(&request->node)) {
		sys_dlist_remove(&request->node);
	}

	if (request->prp_list != NULL) {
		nvme_prp_list_free(request->prp_list);
	}

	memset(request, 0, sizeof(struct nvme_request));
	sys_dlist_append(&free_request, &request->node);
}

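/* Take a request off the free pool; returns NULL if none are available. */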
struct nvme_request *nvme_cmd_request_alloc(void)
{
	sys_dnode_t *node;

	node = sys_dlist_peek_head(&free_request);
	if (!node) {
		LOG_ERR("Could not allocate request");
		return NULL;
	}

	sys_dlist_remove(node);

	return CONTAINER_OF(node, struct nvme_request, node);
}

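/*
 * Track a submitted request: append it to the pending list, timestamp it,
 * and start the timeout work item if it is not already scheduled.
 */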
static void nvme_cmd_register_request(struct nvme_request *request)
{
	sys_dlist_append(&pending_request, &request->node);

	request->req_start = k_uptime_get_32();

	if (!k_work_delayable_remaining_get(&request_timer)) {
		k_work_reschedule(&request_timer,
				  K_SECONDS(CONFIG_NVME_REQUEST_TIMEOUT));
	}
}

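/*
 * Timeout work handler: walk the pending list (oldest first, since requests
 * are appended at submission time) and expire every request that has
 * exceeded the configured timeout, invoking its callback with a NULL
 * completion before freeing it. If a not-yet-expired request remains,
 * reschedule the work item for that request's deadline.
 */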
static void request_timeout(struct k_work *work)
{
	uint32_t current = k_uptime_get_32();
	struct nvme_request *request, *next;

	ARG_UNUSED(work);

	SYS_DLIST_FOR_EACH_CONTAINER_SAFE(&pending_request,
					  request, next, node) {
		if ((int32_t)(request->req_start +
			      CONFIG_NVME_REQUEST_TIMEOUT - current) > 0) {
			break;
		}

		LOG_WRN("Request %p CID %u timed-out",
			request, request->cmd.cdw0.cid);

		/* ToDo:
		 * - check CSTS for a fatal fault
		 * - reset the controller if that is the case
		 * - or check the completion queue for a missed interrupt
		 */

		if (request->cb_fn) {
			request->cb_fn(request->cb_arg, NULL);
		}

		nvme_cmd_request_free(request);
	}

	if (request) {
		k_work_reschedule(&request_timer,
				  K_SECONDS(request->req_start +
					    CONFIG_NVME_REQUEST_TIMEOUT -
					    current));
	}
}

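/*
 * Decide whether a failed command is worth resubmitting, based on the
 * completion's status code type (SCT), status code (SC) and the
 * Do Not Retry (DNR) bit.
 */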
static bool nvme_completion_is_retry(const struct nvme_completion *cpl)
{
	uint8_t sct, sc, dnr;

	sct = NVME_STATUS_GET_SCT(cpl->status);
	sc = NVME_STATUS_GET_SC(cpl->status);
	dnr = NVME_STATUS_GET_DNR(cpl->status);

	/*
	 * TODO: spec is not clear how commands that are aborted due
	 *  to TLER will be marked.  So for now, it seems
	 *  NAMESPACE_NOT_READY is the only case where we should
	 *  look at the DNR bit. Requests failed with ABORTED_BY_REQUEST
	 *  set the DNR bit correctly since the driver controls that.
	 */
	switch (sct) {
	case NVME_SCT_GENERIC:
		switch (sc) {
		case NVME_SC_ABORTED_BY_REQUEST:
		case NVME_SC_NAMESPACE_NOT_READY:
			if (dnr) {
				return false;
			}

			return true;
		case NVME_SC_INVALID_OPCODE:
		case NVME_SC_INVALID_FIELD:
		case NVME_SC_COMMAND_ID_CONFLICT:
		case NVME_SC_DATA_TRANSFER_ERROR:
		case NVME_SC_ABORTED_POWER_LOSS:
		case NVME_SC_INTERNAL_DEVICE_ERROR:
		case NVME_SC_ABORTED_SQ_DELETION:
		case NVME_SC_ABORTED_FAILED_FUSED:
		case NVME_SC_ABORTED_MISSING_FUSED:
		case NVME_SC_INVALID_NAMESPACE_OR_FORMAT:
		case NVME_SC_COMMAND_SEQUENCE_ERROR:
		case NVME_SC_LBA_OUT_OF_RANGE:
		case NVME_SC_CAPACITY_EXCEEDED:
		default:
			return false;
		}
	case NVME_SCT_COMMAND_SPECIFIC:
	case NVME_SCT_MEDIA_ERROR:
		return false;
	case NVME_SCT_PATH_RELATED:
		switch (sc) {
		case NVME_SC_INTERNAL_PATH_ERROR:
			if (dnr) {
				return false;
			}

			return true;
		default:
			return false;
		}
	case NVME_SCT_VENDOR_SPECIFIC:
	default:
		return false;
	}
}

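/*
 * Handle one completion entry: update qpair statistics, resubmit the
 * command if the error is retriable and the retry budget is not exhausted,
 * otherwise invoke the request callback with the completion and free the
 * request.
 */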
static void nvme_cmd_request_complete(struct nvme_request *request,
				      struct nvme_completion *cpl)
{
	bool error, retriable, retry;

	error = nvme_completion_is_error(cpl);
	retriable = nvme_completion_is_retry(cpl);
	retry = error && retriable &&
		request->retries < CONFIG_NVME_RETRY_COUNT;

	if (retry) {
		LOG_DBG("CMD will be retried");
		request->qpair->num_retries++;
	}

	if (error &&
	    (!retriable || (request->retries >= CONFIG_NVME_RETRY_COUNT))) {
		LOG_DBG("CMD error");
		request->qpair->num_failures++;
	}

	if (cpl->cid != request->cmd.cdw0.cid) {
		LOG_ERR("cpl cid != cmd cid");
	}

	if (retry) {
		LOG_DBG("Retrying CMD");
		/* Let's remove it from pending... */
		sys_dlist_remove(&request->node);
		/* ...and re-submit, thus re-adding to pending */
		nvme_cmd_qpair_submit_request(request->qpair, request);
		request->retries++;
	} else {
		LOG_DBG("Request %p CMD complete on %p/%p",
			request, request->cb_fn, request->cb_arg);

		if (request->cb_fn) {
			request->cb_fn(request->cb_arg, cpl);
		}

		nvme_cmd_request_free(request);
	}
}

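/*
 * Drain the completion queue: consume entries whose phase bit matches the
 * expected phase, map each one back to its request through the CID (which
 * indexes request_pool), complete it, and ring the CQ head doorbell once
 * if anything was consumed.
 */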
static void nvme_cmd_qpair_process_completion(struct nvme_cmd_qpair *qpair)
{
	struct nvme_request *request;
	struct nvme_completion cpl;
	int done = 0;

	if (qpair->num_intr_handler_calls == 0 && qpair->phase == 0) {
		LOG_WRN("Phase wrong for first interrupt call.");
	}

	qpair->num_intr_handler_calls++;

	while (1) {
		uint16_t status;

		status = sys_le16_to_cpu(qpair->cpl[qpair->cq_head].status);
		if (NVME_STATUS_GET_P(status) != qpair->phase) {
			break;
		}

		cpl = qpair->cpl[qpair->cq_head];
		nvme_completion_swapbytes(&cpl);

		if (NVME_STATUS_GET_P(status) != NVME_STATUS_GET_P(cpl.status)) {
			LOG_WRN("Phase unexpectedly inconsistent");
		}

		if (cpl.cid < NVME_REQUEST_AMOUNT) {
			request = &request_pool[cpl.cid];
		} else {
			request = NULL;
		}

		done++;
		if (request != NULL) {
			nvme_cmd_request_complete(request, &cpl);
			qpair->sq_head = cpl.sqhd;
		} else {
			LOG_ERR("cpl (cid = %u) does not map to cmd", cpl.cid);
		}

		qpair->cq_head++;
		if (qpair->cq_head == qpair->num_entries) {
			qpair->cq_head = 0;
			qpair->phase = !qpair->phase;
		}
	}

	if (done != 0) {
		mm_reg_t regs = DEVICE_MMIO_GET(qpair->ctrlr->dev);

		sys_write32(qpair->cq_head, regs + qpair->cq_hdbl_off);
	}
}

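/* MSI/MSI-X handler: process completions for the qpair passed as argument. */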
static void nvme_cmd_qpair_msi_handler(const void *arg)
{
	const struct nvme_cmd_qpair *qpair = arg;

	nvme_cmd_qpair_process_completion((struct nvme_cmd_qpair *)qpair);
}

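/*
 * One-time qpair setup: record the owning controller and ID, reset the
 * statistics counters, record the bus addresses of the SQ/CQ rings, compute
 * the submission/completion doorbell offsets from the controller's doorbell
 * stride, and connect the qpair's MSI(-X) vector to the completion handler.
 */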
int nvme_cmd_qpair_setup(struct nvme_cmd_qpair *qpair,
			 struct nvme_controller *ctrlr,
			 uint32_t id)
{
	const struct nvme_controller_config *nvme_ctrlr_cfg =
		ctrlr->dev->config;

	qpair->ctrlr = ctrlr;
	qpair->id = id;
	qpair->vector = qpair->id;

	qpair->num_cmds = 0;
	qpair->num_intr_handler_calls = 0;
	qpair->num_retries = 0;
	qpair->num_failures = 0;
	qpair->num_ignored = 0;

	qpair->cmd_bus_addr = (uintptr_t)qpair->cmd;
	qpair->cpl_bus_addr = (uintptr_t)qpair->cpl;

	qpair->sq_tdbl_off = nvme_mmio_offsetof(doorbell) +
		(qpair->id << (ctrlr->dstrd + 1));
	qpair->cq_hdbl_off = nvme_mmio_offsetof(doorbell) +
		(qpair->id << (ctrlr->dstrd + 1)) + (1 << ctrlr->dstrd);

	if (!pcie_msi_vector_connect(nvme_ctrlr_cfg->pcie->bdf,
				     &ctrlr->vectors[qpair->vector],
				     nvme_cmd_qpair_msi_handler, qpair, 0)) {
		LOG_ERR("Failed to connect MSI-X vector %u", qpair->id);
		return -EIO;
	}

	LOG_DBG("CMD Qpair created ID %u, %u entries - cmd/cpl addr "
		"0x%lx/0x%lx - sq/cq offsets %u/%u",
		qpair->id, qpair->num_entries, qpair->cmd_bus_addr,
		qpair->cpl_bus_addr, qpair->sq_tdbl_off, qpair->cq_hdbl_off);

	return 0;
}

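/* Reset ring state: head/tail indices, expected phase bit and ring memory. */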
void nvme_cmd_qpair_reset(struct nvme_cmd_qpair *qpair)
{
	qpair->sq_head = qpair->sq_tail = qpair->cq_head = 0;

	/*
	 * First time through the completion queue, HW will set the phase
	 * bit on completions to 1. So set this to 1 here, indicating
	 * we're looking for a 1 to know which entries have completed.
	 * We'll toggle the bit each time the completion queue rolls over.
	 */
	qpair->phase = 1;

	memset(qpair->cmd, 0,
	       qpair->num_entries * sizeof(struct nvme_command));
	memset(qpair->cpl, 0,
	       qpair->num_entries * sizeof(struct nvme_completion));
}

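/*
 * Build a PRP list for payloads needing more than two PRP entries: PRP1
 * points at the start of the payload, PRP2 at the allocated PRP list, and
 * the list entries cover the payload's remaining pages.
 */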
static int nvme_cmd_qpair_fill_prp_list(struct nvme_cmd_qpair *qpair,
					struct nvme_request *request,
					int n_prp)
{
	struct nvme_prp_list *prp_list;
	uintptr_t p_addr;
	int idx;

	prp_list = nvme_prp_list_alloc();
	if (prp_list == NULL) {
		return -ENOMEM;
	}

	p_addr = (uintptr_t)request->payload;
	request->cmd.dptr.prp1 =
		(uint64_t)sys_cpu_to_le64(p_addr);
	request->cmd.dptr.prp2 =
		(uint64_t)sys_cpu_to_le64(&prp_list->prp);
	p_addr = NVME_PRP_NEXT_PAGE(p_addr);

	for (idx = 0; idx < n_prp; idx++) {
		prp_list->prp[idx] = (uint64_t)sys_cpu_to_le64(p_addr);
		p_addr = NVME_PRP_NEXT_PAGE(p_addr);
	}

	request->prp_list = prp_list;

	return 0;
}

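/*
 * Fill the command's data pointer (DPTR): nothing to do for NULL requests;
 * for VADDR requests use PRP1/PRP2 directly when the payload spans at most
 * two pages, otherwise fall back to a PRP list.
 */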
static int nvme_cmd_qpair_fill_dptr(struct nvme_cmd_qpair *qpair,
				    struct nvme_request *request)
{
	switch (request->type) {
	case NVME_REQUEST_NULL:
		break;
	case NVME_REQUEST_VADDR:
		int n_prp;

		if (request->payload_size > qpair->ctrlr->max_xfer_size) {
			LOG_ERR("VADDR request's payload too big");
			return -EINVAL;
		}

		n_prp = request->payload_size / qpair->ctrlr->page_size;
		if ((request->payload_size % qpair->ctrlr->page_size) ||
		    ((uintptr_t)request->payload & NVME_PBAO_MASK)) {
			n_prp++;
		}

		if (n_prp <= 2) {
			request->cmd.dptr.prp1 =
				(uint64_t)sys_cpu_to_le64(request->payload);
			if ((uintptr_t)request->payload & NVME_PBAO_MASK) {
				request->cmd.dptr.prp2 =
					NVME_PRP_NEXT_PAGE(
						(uintptr_t)request->payload);
			} else {
				request->cmd.dptr.prp2 = 0;
			}

			break;
		}

		return nvme_cmd_qpair_fill_prp_list(qpair, request, n_prp);
	default:
		break;
	}

	return 0;
}

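/*
 * Submit a request on the qpair: the CID is the request's index in the
 * request pool, so completions can be mapped back to their request. The
 * DPTR is filled, the request is registered as pending, the command is
 * copied into the next submission queue slot and the SQ tail doorbell is
 * rung.
 */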
int nvme_cmd_qpair_submit_request(struct nvme_cmd_qpair *qpair,
				  struct nvme_request *request)
{
	mm_reg_t regs = DEVICE_MMIO_GET(qpair->ctrlr->dev);
	int ret;

	request->qpair = qpair;

	request->cmd.cdw0.cid = sys_cpu_to_le16((uint16_t)(request -
							   request_pool));

	ret = nvme_cmd_qpair_fill_dptr(qpair, request);
	if (ret != 0) {
		nvme_cmd_request_free(request);
		return ret;
	}

	nvme_cmd_register_request(request);

	memcpy(&qpair->cmd[qpair->sq_tail],
	       &request->cmd, sizeof(request->cmd));

	qpair->sq_tail++;
	if (qpair->sq_tail == qpair->num_entries) {
		qpair->sq_tail = 0;
	}

	sys_write32(qpair->sq_tail, regs + qpair->sq_tdbl_off);
	qpair->num_cmds++;

	LOG_DBG("Request %p %llu submitted: CID %u - sq_tail %u",
		request, qpair->num_cmds, request->cmd.cdw0.cid,
		qpair->sq_tail - 1);
	return 0;
}

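/*
 * Generic completion callback for synchronous callers: copy the completion
 * (or flag a timeout when cpl is NULL) into the poll status structure and
 * give its semaphore.
 */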
void
nvme_completion_poll_cb(void *arg, const struct nvme_completion *cpl)
{
	struct nvme_completion_poll_status *status = arg;

	if (cpl != NULL) {
		memcpy(&status->cpl, cpl, sizeof(*cpl));
	} else {
		status->status = -ETIMEDOUT;
	}

	k_sem_give(&status->sem);
}