1 /*
2 * The Kyber I/O scheduler. Controls latency by throttling queue depths using
3 * scalable techniques.
4 *
5 * Copyright (C) 2017 Facebook
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public
9 * License v2 as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program. If not, see <https://www.gnu.org/licenses/>.
18 */
19
20 #include <linux/kernel.h>
21 #include <linux/blkdev.h>
22 #include <linux/blk-mq.h>
23 #include <linux/elevator.h>
24 #include <linux/module.h>
25 #include <linux/sbitmap.h>
26
27 #include "blk.h"
28 #include "blk-mq.h"
29 #include "blk-mq-debugfs.h"
30 #include "blk-mq-sched.h"
31 #include "blk-mq-tag.h"
32 #include "blk-stat.h"
33
34 /* Scheduling domains. */
35 enum {
36 KYBER_READ,
37 KYBER_SYNC_WRITE,
38 KYBER_OTHER, /* Async writes, discard, etc. */
39 KYBER_NUM_DOMAINS,
40 };
41
42 enum {
43 KYBER_MIN_DEPTH = 256,
44
45 /*
46 * In order to prevent starvation of synchronous requests by a flood of
47 * asynchronous requests, we reserve 25% of requests for synchronous
48 * operations.
49 */
50 KYBER_ASYNC_PERCENT = 75,
51 };
52
53 /*
54 * Initial device-wide depths for each scheduling domain.
55 *
56 * Even for fast devices with lots of tags like NVMe, you can saturate
57 * the device with only a fraction of the maximum possible queue depth.
58 * So, we cap these to a reasonable value.
59 */
60 static const unsigned int kyber_depth[] = {
61 [KYBER_READ] = 256,
62 [KYBER_SYNC_WRITE] = 128,
63 [KYBER_OTHER] = 64,
64 };
65
66 /*
67 * Scheduling domain batch sizes. We favor reads.
68 */
69 static const unsigned int kyber_batch_size[] = {
70 [KYBER_READ] = 16,
71 [KYBER_SYNC_WRITE] = 8,
72 [KYBER_OTHER] = 8,
73 };
74
75 /*
76 * There is a same mapping between ctx & hctx and kcq & khd,
77 * we use request->mq_ctx->index_hw to index the kcq in khd.
78 */
79 struct kyber_ctx_queue {
80 /*
81 * Used to ensure operations on rq_list and kcq_map to be an atmoic one.
82 * Also protect the rqs on rq_list when merge.
83 */
84 spinlock_t lock;
85 struct list_head rq_list[KYBER_NUM_DOMAINS];
86 } ____cacheline_aligned_in_smp;
87
88 struct kyber_queue_data {
89 struct request_queue *q;
90
91 struct blk_stat_callback *cb;
92
93 /*
94 * The device is divided into multiple scheduling domains based on the
95 * request type. Each domain has a fixed number of in-flight requests of
96 * that type device-wide, limited by these tokens.
97 */
98 struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
99
100 /*
101 * Async request percentage, converted to per-word depth for
102 * sbitmap_get_shallow().
103 */
104 unsigned int async_depth;
105
106 /* Target latencies in nanoseconds. */
107 u64 read_lat_nsec, write_lat_nsec;
108 };
109
110 struct kyber_hctx_data {
111 spinlock_t lock;
112 struct list_head rqs[KYBER_NUM_DOMAINS];
113 unsigned int cur_domain;
114 unsigned int batching;
115 struct kyber_ctx_queue *kcqs;
116 struct sbitmap kcq_map[KYBER_NUM_DOMAINS];
117 wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
118 struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
119 atomic_t wait_index[KYBER_NUM_DOMAINS];
120 };
121
122 static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
123 void *key);
124
kyber_sched_domain(unsigned int op)125 static unsigned int kyber_sched_domain(unsigned int op)
126 {
127 if ((op & REQ_OP_MASK) == REQ_OP_READ)
128 return KYBER_READ;
129 else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
130 return KYBER_SYNC_WRITE;
131 else
132 return KYBER_OTHER;
133 }
134
135 enum {
136 NONE = 0,
137 GOOD = 1,
138 GREAT = 2,
139 BAD = -1,
140 AWFUL = -2,
141 };
142
143 #define IS_GOOD(status) ((status) > 0)
144 #define IS_BAD(status) ((status) < 0)
145
kyber_lat_status(struct blk_stat_callback * cb,unsigned int sched_domain,u64 target)146 static int kyber_lat_status(struct blk_stat_callback *cb,
147 unsigned int sched_domain, u64 target)
148 {
149 u64 latency;
150
151 if (!cb->stat[sched_domain].nr_samples)
152 return NONE;
153
154 latency = cb->stat[sched_domain].mean;
155 if (latency >= 2 * target)
156 return AWFUL;
157 else if (latency > target)
158 return BAD;
159 else if (latency <= target / 2)
160 return GREAT;
161 else /* (latency <= target) */
162 return GOOD;
163 }
164
165 /*
166 * Adjust the read or synchronous write depth given the status of reads and
167 * writes. The goal is that the latencies of the two domains are fair (i.e., if
168 * one is good, then the other is good).
169 */
kyber_adjust_rw_depth(struct kyber_queue_data * kqd,unsigned int sched_domain,int this_status,int other_status)170 static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd,
171 unsigned int sched_domain, int this_status,
172 int other_status)
173 {
174 unsigned int orig_depth, depth;
175
176 /*
177 * If this domain had no samples, or reads and writes are both good or
178 * both bad, don't adjust the depth.
179 */
180 if (this_status == NONE ||
181 (IS_GOOD(this_status) && IS_GOOD(other_status)) ||
182 (IS_BAD(this_status) && IS_BAD(other_status)))
183 return;
184
185 orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth;
186
187 if (other_status == NONE) {
188 depth++;
189 } else {
190 switch (this_status) {
191 case GOOD:
192 if (other_status == AWFUL)
193 depth -= max(depth / 4, 1U);
194 else
195 depth -= max(depth / 8, 1U);
196 break;
197 case GREAT:
198 if (other_status == AWFUL)
199 depth /= 2;
200 else
201 depth -= max(depth / 4, 1U);
202 break;
203 case BAD:
204 depth++;
205 break;
206 case AWFUL:
207 if (other_status == GREAT)
208 depth += 2;
209 else
210 depth++;
211 break;
212 }
213 }
214
215 depth = clamp(depth, 1U, kyber_depth[sched_domain]);
216 if (depth != orig_depth)
217 sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
218 }
219
220 /*
221 * Adjust the depth of other requests given the status of reads and synchronous
222 * writes. As long as either domain is doing fine, we don't throttle, but if
223 * both domains are doing badly, we throttle heavily.
224 */
kyber_adjust_other_depth(struct kyber_queue_data * kqd,int read_status,int write_status,bool have_samples)225 static void kyber_adjust_other_depth(struct kyber_queue_data *kqd,
226 int read_status, int write_status,
227 bool have_samples)
228 {
229 unsigned int orig_depth, depth;
230 int status;
231
232 orig_depth = depth = kqd->domain_tokens[KYBER_OTHER].sb.depth;
233
234 if (read_status == NONE && write_status == NONE) {
235 depth += 2;
236 } else if (have_samples) {
237 if (read_status == NONE)
238 status = write_status;
239 else if (write_status == NONE)
240 status = read_status;
241 else
242 status = max(read_status, write_status);
243 switch (status) {
244 case GREAT:
245 depth += 2;
246 break;
247 case GOOD:
248 depth++;
249 break;
250 case BAD:
251 depth -= max(depth / 4, 1U);
252 break;
253 case AWFUL:
254 depth /= 2;
255 break;
256 }
257 }
258
259 depth = clamp(depth, 1U, kyber_depth[KYBER_OTHER]);
260 if (depth != orig_depth)
261 sbitmap_queue_resize(&kqd->domain_tokens[KYBER_OTHER], depth);
262 }
263
264 /*
265 * Apply heuristics for limiting queue depths based on gathered latency
266 * statistics.
267 */
kyber_stat_timer_fn(struct blk_stat_callback * cb)268 static void kyber_stat_timer_fn(struct blk_stat_callback *cb)
269 {
270 struct kyber_queue_data *kqd = cb->data;
271 int read_status, write_status;
272
273 read_status = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec);
274 write_status = kyber_lat_status(cb, KYBER_SYNC_WRITE, kqd->write_lat_nsec);
275
276 kyber_adjust_rw_depth(kqd, KYBER_READ, read_status, write_status);
277 kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, write_status, read_status);
278 kyber_adjust_other_depth(kqd, read_status, write_status,
279 cb->stat[KYBER_OTHER].nr_samples != 0);
280
281 /*
282 * Continue monitoring latencies if we aren't hitting the targets or
283 * we're still throttling other requests.
284 */
285 if (!blk_stat_is_active(kqd->cb) &&
286 ((IS_BAD(read_status) || IS_BAD(write_status) ||
287 kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER])))
288 blk_stat_activate_msecs(kqd->cb, 100);
289 }
290
kyber_sched_tags_shift(struct kyber_queue_data * kqd)291 static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
292 {
293 /*
294 * All of the hardware queues have the same depth, so we can just grab
295 * the shift of the first one.
296 */
297 return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
298 }
299
kyber_bucket_fn(const struct request * rq)300 static int kyber_bucket_fn(const struct request *rq)
301 {
302 return kyber_sched_domain(rq->cmd_flags);
303 }
304
kyber_queue_data_alloc(struct request_queue * q)305 static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
306 {
307 struct kyber_queue_data *kqd;
308 unsigned int max_tokens;
309 unsigned int shift;
310 int ret = -ENOMEM;
311 int i;
312
313 kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
314 if (!kqd)
315 goto err;
316 kqd->q = q;
317
318 kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, kyber_bucket_fn,
319 KYBER_NUM_DOMAINS, kqd);
320 if (!kqd->cb)
321 goto err_kqd;
322
323 /*
324 * The maximum number of tokens for any scheduling domain is at least
325 * the queue depth of a single hardware queue. If the hardware doesn't
326 * have many tags, still provide a reasonable number.
327 */
328 max_tokens = max_t(unsigned int, q->tag_set->queue_depth,
329 KYBER_MIN_DEPTH);
330 for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
331 WARN_ON(!kyber_depth[i]);
332 WARN_ON(!kyber_batch_size[i]);
333 ret = sbitmap_queue_init_node(&kqd->domain_tokens[i],
334 max_tokens, -1, false, GFP_KERNEL,
335 q->node);
336 if (ret) {
337 while (--i >= 0)
338 sbitmap_queue_free(&kqd->domain_tokens[i]);
339 goto err_cb;
340 }
341 sbitmap_queue_resize(&kqd->domain_tokens[i], kyber_depth[i]);
342 }
343
344 shift = kyber_sched_tags_shift(kqd);
345 kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
346
347 kqd->read_lat_nsec = 2000000ULL;
348 kqd->write_lat_nsec = 10000000ULL;
349
350 return kqd;
351
352 err_cb:
353 blk_stat_free_callback(kqd->cb);
354 err_kqd:
355 kfree(kqd);
356 err:
357 return ERR_PTR(ret);
358 }
359
kyber_init_sched(struct request_queue * q,struct elevator_type * e)360 static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
361 {
362 struct kyber_queue_data *kqd;
363 struct elevator_queue *eq;
364
365 eq = elevator_alloc(q, e);
366 if (!eq)
367 return -ENOMEM;
368
369 kqd = kyber_queue_data_alloc(q);
370 if (IS_ERR(kqd)) {
371 kobject_put(&eq->kobj);
372 return PTR_ERR(kqd);
373 }
374
375 eq->elevator_data = kqd;
376 q->elevator = eq;
377
378 blk_stat_add_callback(q, kqd->cb);
379
380 return 0;
381 }
382
kyber_exit_sched(struct elevator_queue * e)383 static void kyber_exit_sched(struct elevator_queue *e)
384 {
385 struct kyber_queue_data *kqd = e->elevator_data;
386 struct request_queue *q = kqd->q;
387 int i;
388
389 blk_stat_remove_callback(q, kqd->cb);
390
391 for (i = 0; i < KYBER_NUM_DOMAINS; i++)
392 sbitmap_queue_free(&kqd->domain_tokens[i]);
393 blk_stat_free_callback(kqd->cb);
394 kfree(kqd);
395 }
396
kyber_ctx_queue_init(struct kyber_ctx_queue * kcq)397 static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq)
398 {
399 unsigned int i;
400
401 spin_lock_init(&kcq->lock);
402 for (i = 0; i < KYBER_NUM_DOMAINS; i++)
403 INIT_LIST_HEAD(&kcq->rq_list[i]);
404 }
405
kyber_init_hctx(struct blk_mq_hw_ctx * hctx,unsigned int hctx_idx)406 static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
407 {
408 struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
409 struct kyber_hctx_data *khd;
410 int i;
411
412 khd = kmalloc_node(sizeof(*khd), GFP_KERNEL, hctx->numa_node);
413 if (!khd)
414 return -ENOMEM;
415
416 khd->kcqs = kmalloc_array_node(hctx->nr_ctx,
417 sizeof(struct kyber_ctx_queue),
418 GFP_KERNEL, hctx->numa_node);
419 if (!khd->kcqs)
420 goto err_khd;
421
422 for (i = 0; i < hctx->nr_ctx; i++)
423 kyber_ctx_queue_init(&khd->kcqs[i]);
424
425 for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
426 if (sbitmap_init_node(&khd->kcq_map[i], hctx->nr_ctx,
427 ilog2(8), GFP_KERNEL, hctx->numa_node)) {
428 while (--i >= 0)
429 sbitmap_free(&khd->kcq_map[i]);
430 goto err_kcqs;
431 }
432 }
433
434 spin_lock_init(&khd->lock);
435
436 for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
437 INIT_LIST_HEAD(&khd->rqs[i]);
438 init_waitqueue_func_entry(&khd->domain_wait[i],
439 kyber_domain_wake);
440 khd->domain_wait[i].private = hctx;
441 INIT_LIST_HEAD(&khd->domain_wait[i].entry);
442 atomic_set(&khd->wait_index[i], 0);
443 }
444
445 khd->cur_domain = 0;
446 khd->batching = 0;
447
448 hctx->sched_data = khd;
449 sbitmap_queue_min_shallow_depth(&hctx->sched_tags->bitmap_tags,
450 kqd->async_depth);
451
452 return 0;
453
454 err_kcqs:
455 kfree(khd->kcqs);
456 err_khd:
457 kfree(khd);
458 return -ENOMEM;
459 }
460
kyber_exit_hctx(struct blk_mq_hw_ctx * hctx,unsigned int hctx_idx)461 static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
462 {
463 struct kyber_hctx_data *khd = hctx->sched_data;
464 int i;
465
466 for (i = 0; i < KYBER_NUM_DOMAINS; i++)
467 sbitmap_free(&khd->kcq_map[i]);
468 kfree(khd->kcqs);
469 kfree(hctx->sched_data);
470 }
471
rq_get_domain_token(struct request * rq)472 static int rq_get_domain_token(struct request *rq)
473 {
474 return (long)rq->elv.priv[0];
475 }
476
rq_set_domain_token(struct request * rq,int token)477 static void rq_set_domain_token(struct request *rq, int token)
478 {
479 rq->elv.priv[0] = (void *)(long)token;
480 }
481
rq_clear_domain_token(struct kyber_queue_data * kqd,struct request * rq)482 static void rq_clear_domain_token(struct kyber_queue_data *kqd,
483 struct request *rq)
484 {
485 unsigned int sched_domain;
486 int nr;
487
488 nr = rq_get_domain_token(rq);
489 if (nr != -1) {
490 sched_domain = kyber_sched_domain(rq->cmd_flags);
491 sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr,
492 rq->mq_ctx->cpu);
493 }
494 }
495
kyber_limit_depth(unsigned int op,struct blk_mq_alloc_data * data)496 static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
497 {
498 /*
499 * We use the scheduler tags as per-hardware queue queueing tokens.
500 * Async requests can be limited at this stage.
501 */
502 if (!op_is_sync(op)) {
503 struct kyber_queue_data *kqd = data->q->elevator->elevator_data;
504
505 data->shallow_depth = kqd->async_depth;
506 }
507 }
508
kyber_bio_merge(struct blk_mq_hw_ctx * hctx,struct bio * bio)509 static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
510 {
511 struct kyber_hctx_data *khd = hctx->sched_data;
512 struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue);
513 struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw];
514 unsigned int sched_domain = kyber_sched_domain(bio->bi_opf);
515 struct list_head *rq_list = &kcq->rq_list[sched_domain];
516 bool merged;
517
518 spin_lock(&kcq->lock);
519 merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio);
520 spin_unlock(&kcq->lock);
521 blk_mq_put_ctx(ctx);
522
523 return merged;
524 }
525
kyber_prepare_request(struct request * rq,struct bio * bio)526 static void kyber_prepare_request(struct request *rq, struct bio *bio)
527 {
528 rq_set_domain_token(rq, -1);
529 }
530
kyber_insert_requests(struct blk_mq_hw_ctx * hctx,struct list_head * rq_list,bool at_head)531 static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
532 struct list_head *rq_list, bool at_head)
533 {
534 struct kyber_hctx_data *khd = hctx->sched_data;
535 struct request *rq, *next;
536
537 list_for_each_entry_safe(rq, next, rq_list, queuelist) {
538 unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags);
539 struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw];
540 struct list_head *head = &kcq->rq_list[sched_domain];
541
542 spin_lock(&kcq->lock);
543 if (at_head)
544 list_move(&rq->queuelist, head);
545 else
546 list_move_tail(&rq->queuelist, head);
547 sbitmap_set_bit(&khd->kcq_map[sched_domain],
548 rq->mq_ctx->index_hw);
549 blk_mq_sched_request_inserted(rq);
550 spin_unlock(&kcq->lock);
551 }
552 }
553
kyber_finish_request(struct request * rq)554 static void kyber_finish_request(struct request *rq)
555 {
556 struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
557
558 rq_clear_domain_token(kqd, rq);
559 }
560
kyber_completed_request(struct request * rq)561 static void kyber_completed_request(struct request *rq)
562 {
563 struct request_queue *q = rq->q;
564 struct kyber_queue_data *kqd = q->elevator->elevator_data;
565 unsigned int sched_domain;
566 u64 now, latency, target;
567
568 /*
569 * Check if this request met our latency goal. If not, quickly gather
570 * some statistics and start throttling.
571 */
572 sched_domain = kyber_sched_domain(rq->cmd_flags);
573 switch (sched_domain) {
574 case KYBER_READ:
575 target = kqd->read_lat_nsec;
576 break;
577 case KYBER_SYNC_WRITE:
578 target = kqd->write_lat_nsec;
579 break;
580 default:
581 return;
582 }
583
584 /* If we are already monitoring latencies, don't check again. */
585 if (blk_stat_is_active(kqd->cb))
586 return;
587
588 now = ktime_get_ns();
589 if (now < rq->io_start_time_ns)
590 return;
591
592 latency = now - rq->io_start_time_ns;
593
594 if (latency > target)
595 blk_stat_activate_msecs(kqd->cb, 10);
596 }
597
598 struct flush_kcq_data {
599 struct kyber_hctx_data *khd;
600 unsigned int sched_domain;
601 struct list_head *list;
602 };
603
flush_busy_kcq(struct sbitmap * sb,unsigned int bitnr,void * data)604 static bool flush_busy_kcq(struct sbitmap *sb, unsigned int bitnr, void *data)
605 {
606 struct flush_kcq_data *flush_data = data;
607 struct kyber_ctx_queue *kcq = &flush_data->khd->kcqs[bitnr];
608
609 spin_lock(&kcq->lock);
610 list_splice_tail_init(&kcq->rq_list[flush_data->sched_domain],
611 flush_data->list);
612 sbitmap_clear_bit(sb, bitnr);
613 spin_unlock(&kcq->lock);
614
615 return true;
616 }
617
kyber_flush_busy_kcqs(struct kyber_hctx_data * khd,unsigned int sched_domain,struct list_head * list)618 static void kyber_flush_busy_kcqs(struct kyber_hctx_data *khd,
619 unsigned int sched_domain,
620 struct list_head *list)
621 {
622 struct flush_kcq_data data = {
623 .khd = khd,
624 .sched_domain = sched_domain,
625 .list = list,
626 };
627
628 sbitmap_for_each_set(&khd->kcq_map[sched_domain],
629 flush_busy_kcq, &data);
630 }
631
kyber_domain_wake(wait_queue_entry_t * wait,unsigned mode,int flags,void * key)632 static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
633 void *key)
634 {
635 struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private);
636
637 list_del_init(&wait->entry);
638 blk_mq_run_hw_queue(hctx, true);
639 return 1;
640 }
641
kyber_get_domain_token(struct kyber_queue_data * kqd,struct kyber_hctx_data * khd,struct blk_mq_hw_ctx * hctx)642 static int kyber_get_domain_token(struct kyber_queue_data *kqd,
643 struct kyber_hctx_data *khd,
644 struct blk_mq_hw_ctx *hctx)
645 {
646 unsigned int sched_domain = khd->cur_domain;
647 struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
648 wait_queue_entry_t *wait = &khd->domain_wait[sched_domain];
649 struct sbq_wait_state *ws;
650 int nr;
651
652 nr = __sbitmap_queue_get(domain_tokens);
653
654 /*
655 * If we failed to get a domain token, make sure the hardware queue is
656 * run when one becomes available. Note that this is serialized on
657 * khd->lock, but we still need to be careful about the waker.
658 */
659 if (nr < 0 && list_empty_careful(&wait->entry)) {
660 ws = sbq_wait_ptr(domain_tokens,
661 &khd->wait_index[sched_domain]);
662 khd->domain_ws[sched_domain] = ws;
663 add_wait_queue(&ws->wait, wait);
664
665 /*
666 * Try again in case a token was freed before we got on the wait
667 * queue.
668 */
669 nr = __sbitmap_queue_get(domain_tokens);
670 }
671
672 /*
673 * If we got a token while we were on the wait queue, remove ourselves
674 * from the wait queue to ensure that all wake ups make forward
675 * progress. It's possible that the waker already deleted the entry
676 * between the !list_empty_careful() check and us grabbing the lock, but
677 * list_del_init() is okay with that.
678 */
679 if (nr >= 0 && !list_empty_careful(&wait->entry)) {
680 ws = khd->domain_ws[sched_domain];
681 spin_lock_irq(&ws->wait.lock);
682 list_del_init(&wait->entry);
683 spin_unlock_irq(&ws->wait.lock);
684 }
685
686 return nr;
687 }
688
689 static struct request *
kyber_dispatch_cur_domain(struct kyber_queue_data * kqd,struct kyber_hctx_data * khd,struct blk_mq_hw_ctx * hctx)690 kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
691 struct kyber_hctx_data *khd,
692 struct blk_mq_hw_ctx *hctx)
693 {
694 struct list_head *rqs;
695 struct request *rq;
696 int nr;
697
698 rqs = &khd->rqs[khd->cur_domain];
699
700 /*
701 * If we already have a flushed request, then we just need to get a
702 * token for it. Otherwise, if there are pending requests in the kcqs,
703 * flush the kcqs, but only if we can get a token. If not, we should
704 * leave the requests in the kcqs so that they can be merged. Note that
705 * khd->lock serializes the flushes, so if we observed any bit set in
706 * the kcq_map, we will always get a request.
707 */
708 rq = list_first_entry_or_null(rqs, struct request, queuelist);
709 if (rq) {
710 nr = kyber_get_domain_token(kqd, khd, hctx);
711 if (nr >= 0) {
712 khd->batching++;
713 rq_set_domain_token(rq, nr);
714 list_del_init(&rq->queuelist);
715 return rq;
716 }
717 } else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) {
718 nr = kyber_get_domain_token(kqd, khd, hctx);
719 if (nr >= 0) {
720 kyber_flush_busy_kcqs(khd, khd->cur_domain, rqs);
721 rq = list_first_entry(rqs, struct request, queuelist);
722 khd->batching++;
723 rq_set_domain_token(rq, nr);
724 list_del_init(&rq->queuelist);
725 return rq;
726 }
727 }
728
729 /* There were either no pending requests or no tokens. */
730 return NULL;
731 }
732
kyber_dispatch_request(struct blk_mq_hw_ctx * hctx)733 static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
734 {
735 struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
736 struct kyber_hctx_data *khd = hctx->sched_data;
737 struct request *rq;
738 int i;
739
740 spin_lock(&khd->lock);
741
742 /*
743 * First, if we are still entitled to batch, try to dispatch a request
744 * from the batch.
745 */
746 if (khd->batching < kyber_batch_size[khd->cur_domain]) {
747 rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
748 if (rq)
749 goto out;
750 }
751
752 /*
753 * Either,
754 * 1. We were no longer entitled to a batch.
755 * 2. The domain we were batching didn't have any requests.
756 * 3. The domain we were batching was out of tokens.
757 *
758 * Start another batch. Note that this wraps back around to the original
759 * domain if no other domains have requests or tokens.
760 */
761 khd->batching = 0;
762 for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
763 if (khd->cur_domain == KYBER_NUM_DOMAINS - 1)
764 khd->cur_domain = 0;
765 else
766 khd->cur_domain++;
767
768 rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
769 if (rq)
770 goto out;
771 }
772
773 rq = NULL;
774 out:
775 spin_unlock(&khd->lock);
776 return rq;
777 }
778
kyber_has_work(struct blk_mq_hw_ctx * hctx)779 static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
780 {
781 struct kyber_hctx_data *khd = hctx->sched_data;
782 int i;
783
784 for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
785 if (!list_empty_careful(&khd->rqs[i]) ||
786 sbitmap_any_bit_set(&khd->kcq_map[i]))
787 return true;
788 }
789
790 return false;
791 }
792
793 #define KYBER_LAT_SHOW_STORE(op) \
794 static ssize_t kyber_##op##_lat_show(struct elevator_queue *e, \
795 char *page) \
796 { \
797 struct kyber_queue_data *kqd = e->elevator_data; \
798 \
799 return sprintf(page, "%llu\n", kqd->op##_lat_nsec); \
800 } \
801 \
802 static ssize_t kyber_##op##_lat_store(struct elevator_queue *e, \
803 const char *page, size_t count) \
804 { \
805 struct kyber_queue_data *kqd = e->elevator_data; \
806 unsigned long long nsec; \
807 int ret; \
808 \
809 ret = kstrtoull(page, 10, &nsec); \
810 if (ret) \
811 return ret; \
812 \
813 kqd->op##_lat_nsec = nsec; \
814 \
815 return count; \
816 }
817 KYBER_LAT_SHOW_STORE(read);
818 KYBER_LAT_SHOW_STORE(write);
819 #undef KYBER_LAT_SHOW_STORE
820
821 #define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
822 static struct elv_fs_entry kyber_sched_attrs[] = {
823 KYBER_LAT_ATTR(read),
824 KYBER_LAT_ATTR(write),
825 __ATTR_NULL
826 };
827 #undef KYBER_LAT_ATTR
828
829 #ifdef CONFIG_BLK_DEBUG_FS
830 #define KYBER_DEBUGFS_DOMAIN_ATTRS(domain, name) \
831 static int kyber_##name##_tokens_show(void *data, struct seq_file *m) \
832 { \
833 struct request_queue *q = data; \
834 struct kyber_queue_data *kqd = q->elevator->elevator_data; \
835 \
836 sbitmap_queue_show(&kqd->domain_tokens[domain], m); \
837 return 0; \
838 } \
839 \
840 static void *kyber_##name##_rqs_start(struct seq_file *m, loff_t *pos) \
841 __acquires(&khd->lock) \
842 { \
843 struct blk_mq_hw_ctx *hctx = m->private; \
844 struct kyber_hctx_data *khd = hctx->sched_data; \
845 \
846 spin_lock(&khd->lock); \
847 return seq_list_start(&khd->rqs[domain], *pos); \
848 } \
849 \
850 static void *kyber_##name##_rqs_next(struct seq_file *m, void *v, \
851 loff_t *pos) \
852 { \
853 struct blk_mq_hw_ctx *hctx = m->private; \
854 struct kyber_hctx_data *khd = hctx->sched_data; \
855 \
856 return seq_list_next(v, &khd->rqs[domain], pos); \
857 } \
858 \
859 static void kyber_##name##_rqs_stop(struct seq_file *m, void *v) \
860 __releases(&khd->lock) \
861 { \
862 struct blk_mq_hw_ctx *hctx = m->private; \
863 struct kyber_hctx_data *khd = hctx->sched_data; \
864 \
865 spin_unlock(&khd->lock); \
866 } \
867 \
868 static const struct seq_operations kyber_##name##_rqs_seq_ops = { \
869 .start = kyber_##name##_rqs_start, \
870 .next = kyber_##name##_rqs_next, \
871 .stop = kyber_##name##_rqs_stop, \
872 .show = blk_mq_debugfs_rq_show, \
873 }; \
874 \
875 static int kyber_##name##_waiting_show(void *data, struct seq_file *m) \
876 { \
877 struct blk_mq_hw_ctx *hctx = data; \
878 struct kyber_hctx_data *khd = hctx->sched_data; \
879 wait_queue_entry_t *wait = &khd->domain_wait[domain]; \
880 \
881 seq_printf(m, "%d\n", !list_empty_careful(&wait->entry)); \
882 return 0; \
883 }
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ,read)884 KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
885 KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_SYNC_WRITE, sync_write)
886 KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other)
887 #undef KYBER_DEBUGFS_DOMAIN_ATTRS
888
889 static int kyber_async_depth_show(void *data, struct seq_file *m)
890 {
891 struct request_queue *q = data;
892 struct kyber_queue_data *kqd = q->elevator->elevator_data;
893
894 seq_printf(m, "%u\n", kqd->async_depth);
895 return 0;
896 }
897
kyber_cur_domain_show(void * data,struct seq_file * m)898 static int kyber_cur_domain_show(void *data, struct seq_file *m)
899 {
900 struct blk_mq_hw_ctx *hctx = data;
901 struct kyber_hctx_data *khd = hctx->sched_data;
902
903 switch (khd->cur_domain) {
904 case KYBER_READ:
905 seq_puts(m, "READ\n");
906 break;
907 case KYBER_SYNC_WRITE:
908 seq_puts(m, "SYNC_WRITE\n");
909 break;
910 case KYBER_OTHER:
911 seq_puts(m, "OTHER\n");
912 break;
913 default:
914 seq_printf(m, "%u\n", khd->cur_domain);
915 break;
916 }
917 return 0;
918 }
919
kyber_batching_show(void * data,struct seq_file * m)920 static int kyber_batching_show(void *data, struct seq_file *m)
921 {
922 struct blk_mq_hw_ctx *hctx = data;
923 struct kyber_hctx_data *khd = hctx->sched_data;
924
925 seq_printf(m, "%u\n", khd->batching);
926 return 0;
927 }
928
929 #define KYBER_QUEUE_DOMAIN_ATTRS(name) \
930 {#name "_tokens", 0400, kyber_##name##_tokens_show}
931 static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
932 KYBER_QUEUE_DOMAIN_ATTRS(read),
933 KYBER_QUEUE_DOMAIN_ATTRS(sync_write),
934 KYBER_QUEUE_DOMAIN_ATTRS(other),
935 {"async_depth", 0400, kyber_async_depth_show},
936 {},
937 };
938 #undef KYBER_QUEUE_DOMAIN_ATTRS
939
940 #define KYBER_HCTX_DOMAIN_ATTRS(name) \
941 {#name "_rqs", 0400, .seq_ops = &kyber_##name##_rqs_seq_ops}, \
942 {#name "_waiting", 0400, kyber_##name##_waiting_show}
943 static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = {
944 KYBER_HCTX_DOMAIN_ATTRS(read),
945 KYBER_HCTX_DOMAIN_ATTRS(sync_write),
946 KYBER_HCTX_DOMAIN_ATTRS(other),
947 {"cur_domain", 0400, kyber_cur_domain_show},
948 {"batching", 0400, kyber_batching_show},
949 {},
950 };
951 #undef KYBER_HCTX_DOMAIN_ATTRS
952 #endif
953
954 static struct elevator_type kyber_sched = {
955 .ops.mq = {
956 .init_sched = kyber_init_sched,
957 .exit_sched = kyber_exit_sched,
958 .init_hctx = kyber_init_hctx,
959 .exit_hctx = kyber_exit_hctx,
960 .limit_depth = kyber_limit_depth,
961 .bio_merge = kyber_bio_merge,
962 .prepare_request = kyber_prepare_request,
963 .insert_requests = kyber_insert_requests,
964 .finish_request = kyber_finish_request,
965 .requeue_request = kyber_finish_request,
966 .completed_request = kyber_completed_request,
967 .dispatch_request = kyber_dispatch_request,
968 .has_work = kyber_has_work,
969 },
970 .uses_mq = true,
971 #ifdef CONFIG_BLK_DEBUG_FS
972 .queue_debugfs_attrs = kyber_queue_debugfs_attrs,
973 .hctx_debugfs_attrs = kyber_hctx_debugfs_attrs,
974 #endif
975 .elevator_attrs = kyber_sched_attrs,
976 .elevator_name = "kyber",
977 .elevator_owner = THIS_MODULE,
978 };
979
kyber_init(void)980 static int __init kyber_init(void)
981 {
982 return elv_register(&kyber_sched);
983 }
984
kyber_exit(void)985 static void __exit kyber_exit(void)
986 {
987 elv_unregister(&kyber_sched);
988 }
989
990 module_init(kyber_init);
991 module_exit(kyber_exit);
992
993 MODULE_AUTHOR("Omar Sandoval");
994 MODULE_LICENSE("GPL");
995 MODULE_DESCRIPTION("Kyber I/O scheduler");
996