/*
 * SPDX-License-Identifier: Apache-2.0
 * Copyright (c) 2022 Intel Corp.
 */

#include <zephyr/logging/log.h>
LOG_MODULE_DECLARE(nvme, CONFIG_NVME_LOG_LEVEL);

#include <zephyr/kernel.h>
#include <zephyr/cache.h>
#include <zephyr/sys/byteorder.h>

#include <string.h>

#include "nvme.h"
#include "nvme_helpers.h"

static struct nvme_prp_list prp_list_pool[CONFIG_NVME_PRP_LIST_AMOUNT];
static sys_dlist_t free_prp_list;

static struct nvme_request request_pool[NVME_REQUEST_AMOUNT];
static sys_dlist_t free_request;
static sys_dlist_t pending_request;

static void request_timeout(struct k_work *work);

static K_WORK_DELAYABLE_DEFINE(request_timer, request_timeout);

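/*
 * Initialize the request and PRP list pools: all entries start out on their
 * respective free lists, and the pending request list starts out empty.
 */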
void nvme_cmd_init(void)
{
	int idx;

	sys_dlist_init(&free_request);
	sys_dlist_init(&pending_request);
	sys_dlist_init(&free_prp_list);

	for (idx = 0; idx < NVME_REQUEST_AMOUNT; idx++) {
		sys_dlist_append(&free_request, &request_pool[idx].node);
	}

	for (idx = 0; idx < CONFIG_NVME_PRP_LIST_AMOUNT; idx++) {
		sys_dlist_append(&free_prp_list, &prp_list_pool[idx].node);
	}
}

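/* Take a PRP list from the free pool, or return NULL if the pool is empty. */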
static struct nvme_prp_list *nvme_prp_list_alloc(void)
{
	sys_dnode_t *node;

	node = sys_dlist_peek_head(&free_prp_list);
	if (!node) {
		LOG_ERR("Could not allocate PRP list");
		return NULL;
	}

	sys_dlist_remove(node);

	return CONTAINER_OF(node, struct nvme_prp_list, node);
}

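/* Zero a PRP list and put it back on the free pool. */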
static void nvme_prp_list_free(struct nvme_prp_list *prp_list)
{
	memset(prp_list, 0, sizeof(struct nvme_prp_list));
	sys_dlist_append(&free_prp_list, &prp_list->node);
}

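/*
 * Release a request: unlink it from whichever list it is on (typically the
 * pending list), free its PRP list if one was attached, and return the
 * zeroed request to the free pool.
 */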
void nvme_cmd_request_free(struct nvme_request *request)
{
	if (sys_dnode_is_linked(&request->node)) {
		sys_dlist_remove(&request->node);
	}

	if (request->prp_list != NULL) {
		nvme_prp_list_free(request->prp_list);
	}

	memset(request, 0, sizeof(struct nvme_request));
	sys_dlist_append(&free_request, &request->node);
}

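/* Take a request from the free pool, or return NULL if the pool is empty. */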
struct nvme_request *nvme_cmd_request_alloc(void)
{
	sys_dnode_t *node;

	node = sys_dlist_peek_head(&free_request);
	if (!node) {
		LOG_ERR("Could not allocate request");
		return NULL;
	}

	sys_dlist_remove(node);

	return CONTAINER_OF(node, struct nvme_request, node);
}

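/*
 * Add a submitted request to the pending list, timestamp it and make sure
 * the timeout work item is scheduled.
 */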
static void nvme_cmd_register_request(struct nvme_request *request)
{
	sys_dlist_append(&pending_request, &request->node);

	request->req_start = k_uptime_get_32();

	if (!k_work_delayable_remaining_get(&request_timer)) {
		k_work_reschedule(&request_timer,
				  K_SECONDS(CONFIG_NVME_REQUEST_TIMEOUT));
	}
}

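/*
 * Delayable work handler: walk the pending list (oldest first), fail every
 * request whose timeout has expired by calling its callback with a NULL
 * completion, and reschedule the timer for the oldest remaining request.
 */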
static void request_timeout(struct k_work *work)
{
	uint32_t current = k_uptime_get_32();
	struct nvme_request *request, *next;

	ARG_UNUSED(work);

	SYS_DLIST_FOR_EACH_CONTAINER_SAFE(&pending_request,
					  request, next, node) {
		if ((int32_t)(request->req_start +
			      CONFIG_NVME_REQUEST_TIMEOUT - current) > 0) {
			break;
		}

		LOG_WRN("Request %p CID %u timed-out",
			request, request->cmd.cdw0.cid);

		/* ToDo:
		 * - check CSTS for fatal fault
		 * - reset hw otherwise if it's the case
		 * - or check completion for missed interruption
		 */

		if (request->cb_fn) {
			request->cb_fn(request->cb_arg, NULL);
		}

		nvme_cmd_request_free(request);
	}

	if (request) {
		k_work_reschedule(&request_timer,
				  K_SECONDS(request->req_start +
					    CONFIG_NVME_REQUEST_TIMEOUT -
					    current));
	}
}

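/*
 * Decide whether a failed completion is worth retrying. Only a few generic
 * and path-related status codes qualify, and only when the Do Not Retry
 * (DNR) bit is clear; everything else is treated as a final error.
 */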
static bool nvme_completion_is_retry(const struct nvme_completion *cpl)
{
	uint8_t sct, sc, dnr;

	sct = NVME_STATUS_GET_SCT(cpl->status);
	sc = NVME_STATUS_GET_SC(cpl->status);
	dnr = NVME_STATUS_GET_DNR(cpl->status);

	/*
	 * TODO: spec is not clear how commands that are aborted due
	 * to TLER will be marked. So for now, it seems
	 * NAMESPACE_NOT_READY is the only case where we should
	 * look at the DNR bit. Requests failed with ABORTED_BY_REQUEST
	 * set the DNR bit correctly since the driver controls that.
	 */
	switch (sct) {
	case NVME_SCT_GENERIC:
		switch (sc) {
		case NVME_SC_ABORTED_BY_REQUEST:
		case NVME_SC_NAMESPACE_NOT_READY:
			if (dnr) {
				return false;
			}

			return true;
		case NVME_SC_INVALID_OPCODE:
		case NVME_SC_INVALID_FIELD:
		case NVME_SC_COMMAND_ID_CONFLICT:
		case NVME_SC_DATA_TRANSFER_ERROR:
		case NVME_SC_ABORTED_POWER_LOSS:
		case NVME_SC_INTERNAL_DEVICE_ERROR:
		case NVME_SC_ABORTED_SQ_DELETION:
		case NVME_SC_ABORTED_FAILED_FUSED:
		case NVME_SC_ABORTED_MISSING_FUSED:
		case NVME_SC_INVALID_NAMESPACE_OR_FORMAT:
		case NVME_SC_COMMAND_SEQUENCE_ERROR:
		case NVME_SC_LBA_OUT_OF_RANGE:
		case NVME_SC_CAPACITY_EXCEEDED:
		default:
			return false;
		}
	case NVME_SCT_COMMAND_SPECIFIC:
	case NVME_SCT_MEDIA_ERROR:
		return false;
	case NVME_SCT_PATH_RELATED:
		switch (sc) {
		case NVME_SC_INTERNAL_PATH_ERROR:
			if (dnr) {
				return false;
			}

			return true;
		default:
			return false;
		}
	case NVME_SCT_VENDOR_SPECIFIC:
	default:
		return false;
	}
}

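/*
 * Handle one completion entry: update the per-qpair retry/failure counters,
 * re-submit the command if the error is retriable and the retry budget is
 * not exhausted, otherwise invoke the caller's callback and release the
 * request.
 */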
static void nvme_cmd_request_complete(struct nvme_request *request,
				      struct nvme_completion *cpl)
{
	bool error, retriable, retry;

	error = nvme_completion_is_error(cpl);
	retriable = nvme_completion_is_retry(cpl);
	retry = error && retriable &&
		request->retries < CONFIG_NVME_RETRY_COUNT;

	if (retry) {
		LOG_DBG("CMD will be retried");
		request->qpair->num_retries++;
	}

	if (error &&
	    (!retriable || (request->retries >= CONFIG_NVME_RETRY_COUNT))) {
		LOG_DBG("CMD error");
		request->qpair->num_failures++;
	}

	if (cpl->cid != request->cmd.cdw0.cid) {
		LOG_ERR("cpl cid != cmd cid");
	}

	if (retry) {
		LOG_DBG("Retrying CMD");
		/* Let's remove it from pending... */
		sys_dlist_remove(&request->node);
		/* ...and re-submit, thus re-adding to pending */
		nvme_cmd_qpair_submit_request(request->qpair, request);
		request->retries++;
	} else {
		LOG_DBG("Request %p CMD complete on %p/%p",
			request, request->cb_fn, request->cb_arg);

		if (request->cb_fn) {
			request->cb_fn(request->cb_arg, cpl);
		}

		nvme_cmd_request_free(request);
	}
}

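/*
 * Drain the completion queue: consume entries whose phase tag matches the
 * qpair's current phase, dispatch each one to its originating request (the
 * CID is the request's index in request_pool), toggle the phase when the
 * queue wraps around, and finally ring the CQ head doorbell.
 */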
static void nvme_cmd_qpair_process_completion(struct nvme_cmd_qpair *qpair)
{
	struct nvme_request *request;
	struct nvme_completion cpl;
	int done = 0;

	if (qpair->num_intr_handler_calls == 0 && qpair->phase == 0) {
		LOG_WRN("Phase wrong for first interrupt call.");
	}

	qpair->num_intr_handler_calls++;

	while (1) {
		uint16_t status;

		status = sys_le16_to_cpu(qpair->cpl[qpair->cq_head].status);
		if (NVME_STATUS_GET_P(status) != qpair->phase) {
			break;
		}

		cpl = qpair->cpl[qpair->cq_head];
		nvme_completion_swapbytes(&cpl);

		if (NVME_STATUS_GET_P(status) != NVME_STATUS_GET_P(cpl.status)) {
			LOG_WRN("Phase unexpectedly inconsistent");
		}

		if (cpl.cid < NVME_REQUEST_AMOUNT) {
			request = &request_pool[cpl.cid];
		} else {
			request = NULL;
		}

		done++;
		if (request != NULL) {
			nvme_cmd_request_complete(request, &cpl);
			qpair->sq_head = cpl.sqhd;
		} else {
			LOG_ERR("cpl (cid = %u) does not map to cmd", cpl.cid);
		}

		qpair->cq_head++;
		if (qpair->cq_head == qpair->num_entries) {
			qpair->cq_head = 0;
			qpair->phase = !qpair->phase;
		}
	}

	if (done != 0) {
		mm_reg_t regs = DEVICE_MMIO_GET(qpair->ctrlr->dev);

		sys_write32(qpair->cq_head, regs + qpair->cq_hdbl_off);
	}
}

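/* MSI/MSI-X handler connected at qpair setup; arg is the owning qpair. */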
static void nvme_cmd_qpair_msi_handler(const void *arg)
{
	const struct nvme_cmd_qpair *qpair = arg;

	nvme_cmd_qpair_process_completion((struct nvme_cmd_qpair *)qpair);
}

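/*
 * Set up a command queue pair: reset its statistics, record the bus
 * addresses of its submission/completion queues, compute its doorbell
 * register offsets from the controller's doorbell stride, and connect its
 * MSI/MSI-X vector to the completion handler.
 */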
int nvme_cmd_qpair_setup(struct nvme_cmd_qpair *qpair,
			 struct nvme_controller *ctrlr,
			 uint32_t id)
{
	const struct nvme_controller_config *nvme_ctrlr_cfg =
		ctrlr->dev->config;

	qpair->ctrlr = ctrlr;
	qpair->id = id;
	qpair->vector = qpair->id;

	qpair->num_cmds = 0;
	qpair->num_intr_handler_calls = 0;
	qpair->num_retries = 0;
	qpair->num_failures = 0;
	qpair->num_ignored = 0;

	qpair->cmd_bus_addr = (uintptr_t)qpair->cmd;
	qpair->cpl_bus_addr = (uintptr_t)qpair->cpl;

	qpair->sq_tdbl_off = nvme_mmio_offsetof(doorbell) +
		(qpair->id << (ctrlr->dstrd + 1));
	qpair->cq_hdbl_off = nvme_mmio_offsetof(doorbell) +
		(qpair->id << (ctrlr->dstrd + 1)) + (1 << ctrlr->dstrd);

	if (!pcie_msi_vector_connect(nvme_ctrlr_cfg->pcie->bdf,
				     &ctrlr->vectors[qpair->vector],
				     nvme_cmd_qpair_msi_handler, qpair, 0)) {
		LOG_ERR("Failed to connect MSI-X vector %u", qpair->id);
		return -EIO;
	}

	LOG_DBG("CMD Qpair created ID %u, %u entries - cmd/cpl addr "
		"0x%lx/0x%lx - sq/cq offsets %u/%u",
		qpair->id, qpair->num_entries, qpair->cmd_bus_addr,
		qpair->cpl_bus_addr, qpair->sq_tdbl_off, qpair->cq_hdbl_off);

	return 0;
}

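/* Reset a qpair to its initial state: head/tail indexes, phase and queues. */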
void nvme_cmd_qpair_reset(struct nvme_cmd_qpair *qpair)
{
	qpair->sq_head = qpair->sq_tail = qpair->cq_head = 0;

	/*
	 * First time through the completion queue, HW will set the phase
	 * bit on completions to 1. So set this to 1 here, indicating
	 * we're looking for a 1 to know which entries have completed.
	 * We'll toggle the bit each time the completion queue rolls over.
	 */
	qpair->phase = 1;

	memset(qpair->cmd, 0,
	       qpair->num_entries * sizeof(struct nvme_command));
	memset(qpair->cpl, 0,
	       qpair->num_entries * sizeof(struct nvme_completion));
}

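/*
 * Build a PRP list for a transfer spanning more than two pages: PRP1 points
 * to the payload, PRP2 to the PRP list, and each list entry covers one
 * subsequent page of the payload.
 */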
static int nvme_cmd_qpair_fill_prp_list(struct nvme_cmd_qpair *qpair,
					struct nvme_request *request,
					int n_prp)
{
	struct nvme_prp_list *prp_list;
	uintptr_t p_addr;
	int idx;

	prp_list = nvme_prp_list_alloc();
	if (prp_list == NULL) {
		return -ENOMEM;
	}

	p_addr = (uintptr_t)request->payload;
	request->cmd.dptr.prp1 =
		(uint64_t)sys_cpu_to_le64(p_addr);
	request->cmd.dptr.prp2 =
		(uint64_t)sys_cpu_to_le64(&prp_list->prp);
	p_addr = NVME_PRP_NEXT_PAGE(p_addr);

	for (idx = 0; idx < n_prp; idx++) {
		prp_list->prp[idx] = (uint64_t)sys_cpu_to_le64(p_addr);
		p_addr = NVME_PRP_NEXT_PAGE(p_addr);
	}

	request->prp_list = prp_list;

	return 0;
}

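/*
 * Fill the command's data pointer (DPTR) from the request payload. Transfers
 * that fit in at most two pages use PRP1/PRP2 directly, larger ones get a
 * PRP list; NVME_REQUEST_NULL commands carry no data at all.
 */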
static int nvme_cmd_qpair_fill_dptr(struct nvme_cmd_qpair *qpair,
				    struct nvme_request *request)
{
	switch (request->type) {
	case NVME_REQUEST_NULL:
		break;
	case NVME_REQUEST_VADDR:
		int n_prp;

		if (request->payload_size > qpair->ctrlr->max_xfer_size) {
			LOG_ERR("VADDR request's payload too big");
			return -EINVAL;
		}

		n_prp = request->payload_size / qpair->ctrlr->page_size;
		if ((request->payload_size % qpair->ctrlr->page_size) ||
		    ((uintptr_t)request->payload & NVME_PBAO_MASK)) {
			n_prp++;
		}

		if (n_prp <= 2) {
			request->cmd.dptr.prp1 =
				(uint64_t)sys_cpu_to_le64(request->payload);
			if ((uintptr_t)request->payload & NVME_PBAO_MASK) {
				request->cmd.dptr.prp2 =
					NVME_PRP_NEXT_PAGE(
						(uintptr_t)request->payload);
			} else {
				request->cmd.dptr.prp2 = 0;
			}

			break;
		}

		return nvme_cmd_qpair_fill_prp_list(qpair, request, n_prp);
	default:
		break;
	}

	return 0;
}

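/*
 * Submit a request on a qpair: assign its CID (the request's index in
 * request_pool), fill the data pointer, register it on the pending list,
 * copy the command into the submission queue and ring the SQ tail doorbell.
 */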
int nvme_cmd_qpair_submit_request(struct nvme_cmd_qpair *qpair,
				  struct nvme_request *request)
{
	mm_reg_t regs = DEVICE_MMIO_GET(qpair->ctrlr->dev);
	int ret;

	request->qpair = qpair;

	request->cmd.cdw0.cid = sys_cpu_to_le16((uint16_t)(request -
							   request_pool));

	ret = nvme_cmd_qpair_fill_dptr(qpair, request);
	if (ret != 0) {
		nvme_cmd_request_free(request);
		return ret;
	}

	nvme_cmd_register_request(request);

	memcpy(&qpair->cmd[qpair->sq_tail],
	       &request->cmd, sizeof(request->cmd));

	qpair->sq_tail++;
	if (qpair->sq_tail == qpair->num_entries) {
		qpair->sq_tail = 0;
	}

	sys_write32(qpair->sq_tail, regs + qpair->sq_tdbl_off);
	qpair->num_cmds++;

	LOG_DBG("Request %p %llu submitted: CID %u - sq_tail %u",
		request, qpair->num_cmds, request->cmd.cdw0.cid,
		qpair->sq_tail - 1);
	return 0;
}

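/*
 * Generic completion callback for synchronous callers: copy the completion
 * (or flag a timeout when it is NULL) into the poll status and wake up the
 * waiter blocked on its semaphore.
 */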
void
nvme_completion_poll_cb(void *arg, const struct nvme_completion *cpl)
{
	struct nvme_completion_poll_status *status = arg;

	if (cpl != NULL) {
		memcpy(&status->cpl, cpl, sizeof(*cpl));
	} else {
		status->status = -ETIMEDOUT;
	}

	k_sem_give(&status->sem);
}