Lines Matching +full:entry +full:- +full:latency

1 // SPDX-License-Identifier: GPL-2.0
3 * The Kyber I/O scheduler. Controls latency by throttling queue depths using
11 #include <linux/blk-mq.h>
19 #include "blk-mq.h"
20 #include "blk-mq-debugfs.h"
21 #include "blk-mq-sched.h"
22 #include "blk-mq-tag.h"
56 * Maximum device-wide depth for each scheduling domain.
70 * Default latency targets for each scheduling domain.
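The two fragments above point at per-domain tables: a depth cap and a latency target for each scheduling domain. For reference, a minimal freestanding C sketch of how such tables are laid out, indexed by domain; the domain names follow the mainline driver, but the numeric values are recalled defaults and should be checked against the actual source rather than taken as authoritative.

/*
 * Hedged sketch of the per-domain tables the two comments above refer to,
 * as freestanding C.  The domain names mirror the mainline driver; the
 * numeric values are recalled defaults and may differ between kernel
 * versions.
 */
enum sketch_domain {
	SKETCH_READ,
	SKETCH_WRITE,
	SKETCH_DISCARD,
	SKETCH_OTHER,
	SKETCH_NUM_DOMAINS,
};

/* Maximum device-wide token count (queue depth) per scheduling domain. */
static const unsigned int sketch_depth[SKETCH_NUM_DOMAINS] = {
	[SKETCH_READ]		= 256,
	[SKETCH_WRITE]		= 128,
	[SKETCH_DISCARD]	= 64,
	[SKETCH_OTHER]		= 16,
};

/* Latency targets in nanoseconds; "other" has no target. */
static const unsigned long long sketch_latency_target_ns[SKETCH_NUM_DOMAINS] = {
	[SKETCH_READ]		= 2ULL * 1000 * 1000,		/* ~2 ms  */
	[SKETCH_WRITE]		= 10ULL * 1000 * 1000,		/* ~10 ms */
	[SKETCH_DISCARD]	= 5ULL * 1000 * 1000 * 1000,	/* ~5 s   */
};

In mainline, the "other" domain (requests that are neither reads, writes, nor discards) has no latency target and is not throttled by the timer.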
91 * to the target latency:
93 * <= 1/4 * target latency
94 * <= 1/2 * target latency
95 * <= 3/4 * target latency
96 * <= target latency
97 * <= 1 1/4 * target latency
98 * <= 1 1/2 * target latency
99 * <= 1 3/4 * target latency
100 * > 1 3/4 * target latency
104 * The width of the latency histogram buckets is
105 * 1 / (1 << KYBER_LATENCY_SHIFT) * target latency.
109 * The first (1 << KYBER_LATENCY_SHIFT) buckets are <= target latency,
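The bucket layout above reduces to a small piece of arithmetic: divide the sample by target >> KYBER_LATENCY_SHIFT and clamp to the last bucket. A self-contained sketch of that mapping, assuming a shift of 2 (eight buckets); it mirrors the add_latency_sample() fragments further down, rewritten as plain userspace C.

#include <stdio.h>

/*
 * Hedged sketch of the bucket mapping described above: with a shift of 2
 * there are eight buckets, each a quarter of the target wide, and anything
 * past 1 3/4 * target falls into the last bucket.
 */
#define SKETCH_LATENCY_SHIFT	2
#define SKETCH_LATENCY_BUCKETS	(1 << (SKETCH_LATENCY_SHIFT + 1))

static unsigned int sketch_latency_bucket(unsigned long long target_ns,
					  unsigned long long latency_ns)
{
	unsigned long long divisor = target_ns >> SKETCH_LATENCY_SHIFT;

	if (divisor == 0)
		divisor = 1;
	if (latency_ns == 0)
		return 0;
	if ((latency_ns - 1) / divisor >= SKETCH_LATENCY_BUCKETS - 1)
		return SKETCH_LATENCY_BUCKETS - 1;
	return (unsigned int)((latency_ns - 1) / divisor);
}

int main(void)
{
	/* 2 ms target: a 1.4 ms sample lands in bucket 2, "<= 3/4 * target". */
	printf("%u\n", sketch_latency_bucket(2000000ULL, 1400000ULL));
	return 0;
}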
118 * We measure both the total latency and the I/O latency (i.e., latency after
132 * Per-cpu latency histograms: total latency and I/O latency for each scheduling
141 * we use request->mq_ctx->index_hw to index the kcq in khd.
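The comment fragments above describe two latency measurements (total and I/O) kept per CPU for every scheduling domain. A rough structural sketch of that layout, using C11 atomics in place of the kernel's atomic_t; the shapes are an approximation, not a copy of the driver's structs.

#include <stdatomic.h>

/*
 * Hedged sketch of the per-CPU histogram layout implied above: one bucket
 * array per (scheduling domain, latency type) pair, bumped locklessly from
 * the completion path and drained by the timer.  C11 atomics stand in for
 * the kernel's atomic_t; the exact field shapes in the driver may differ.
 */
#define SKETCH_NUM_DOMAINS	4
#define SKETCH_LATENCY_BUCKETS	8

enum sketch_latency_type {
	SKETCH_TOTAL_LATENCY,	/* measured from the start of the request */
	SKETCH_IO_LATENCY,	/* measured from dispatch to the device */
	SKETCH_NUM_LATENCY_TYPES,
};

struct sketch_cpu_latency {
	_Atomic unsigned int buckets[SKETCH_NUM_DOMAINS]
				    [SKETCH_NUM_LATENCY_TYPES]
				    [SKETCH_LATENCY_BUCKETS];
};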
157 * Each scheduling domain has a limited number of in-flight requests
158 * device-wide, limited by these tokens.
163 * Async request percentage, converted to per-word depth for
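The per-word depth mentioned above is one line of arithmetic, applied again in the kyber_depth_updated() fragment further down: take the number of tag slots per sbitmap word implied by the shift and let asynchronous requests use only a percentage of it. A sketch of that conversion; the 75% figure is the recalled default for KYBER_ASYNC_PERCENT and is an assumption here.

#include <stdio.h>

/*
 * Hedged sketch of the async-depth conversion: asynchronous requests may
 * consume at most SKETCH_ASYNC_PERCENT of each sbitmap word, leaving the
 * rest for synchronous requests.  75 is the recalled default for
 * KYBER_ASYNC_PERCENT and is an assumption here.
 */
#define SKETCH_ASYNC_PERCENT	75U

static unsigned int sketch_async_depth(unsigned int sb_shift)
{
	/* (1 << shift) is the number of tag slots tracked per sbitmap word. */
	return (1U << sb_shift) * SKETCH_ASYNC_PERCENT / 100U;
}

int main(void)
{
	/* With 64-bit words (shift == 6), async requests get 48 of 64 slots. */
	printf("%u\n", sketch_async_depth(6));
	return 0;
}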
216 unsigned int *buckets = kqd->latency_buckets[sched_domain][type]; in flush_latency_buckets()
217 atomic_t *cpu_buckets = cpu_latency->buckets[sched_domain][type]; in flush_latency_buckets()
225 * Calculate the histogram bucket with the given percentile rank, or -1 if there
232 unsigned int *buckets = kqd->latency_buckets[sched_domain][type]; in calculate_percentile()
239 return -1; in calculate_percentile()
245 if (!kqd->latency_timeout[sched_domain]) in calculate_percentile()
246 kqd->latency_timeout[sched_domain] = max(jiffies + HZ, 1UL); in calculate_percentile()
248 time_is_after_jiffies(kqd->latency_timeout[sched_domain])) { in calculate_percentile()
249 return -1; in calculate_percentile()
251 kqd->latency_timeout[sched_domain] = 0; in calculate_percentile()
254 for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS - 1; bucket++) { in calculate_percentile()
257 percentile_samples -= buckets[bucket]; in calculate_percentile()
259 memset(buckets, 0, sizeof(kqd->latency_buckets[sched_domain][type])); in calculate_percentile()
261 trace_kyber_latency(kqd->dev, kyber_domain_names[sched_domain], in calculate_percentile()
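The calculate_percentile() fragments above walk the histogram, subtracting bucket counts until the current bucket covers the requested rank, and return -1 when there are no samples (or, via the latency_timeout lines, too few samples for too short a time). A freestanding sketch of the walk itself, with the timeout handling omitted.

/*
 * Hedged sketch of the percentile walk in calculate_percentile(): work out
 * how many samples must fall at or below the percentile, then subtract
 * bucket counts until the current bucket covers the remainder.  Returns the
 * bucket index, or -1 if there are no samples; the "wait for more samples"
 * logic from the driver is left out.
 */
#define SKETCH_LATENCY_BUCKETS	8

static int sketch_percentile_bucket(const unsigned int buckets[SKETCH_LATENCY_BUCKETS],
				    unsigned int percentile)
{
	unsigned long long samples = 0;
	unsigned long long needed;
	unsigned int bucket;

	for (bucket = 0; bucket < SKETCH_LATENCY_BUCKETS; bucket++)
		samples += buckets[bucket];
	if (!samples)
		return -1;

	/* Round up, so a p99 over 150 samples requires 149 of them covered. */
	needed = (samples * percentile + 99) / 100;

	for (bucket = 0; bucket < SKETCH_LATENCY_BUCKETS - 1; bucket++) {
		if (buckets[bucket] >= needed)
			break;
		needed -= buckets[bucket];
	}
	return (int)bucket;
}

For example, with bucket counts {10, 20, 40, 20, 5, 3, 1, 1} (100 samples in total), the p90 lands in bucket 3, i.e. at or below the target latency.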
272 if (depth != kqd->domain_tokens[sched_domain].sb.depth) { in kyber_resize_domain()
273 sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth); in kyber_resize_domain()
274 trace_kyber_adjust(kqd->dev, kyber_domain_names[sched_domain], in kyber_resize_domain()
286 /* Sum all of the per-cpu latency histograms. */ in kyber_timer_fn()
290 cpu_latency = per_cpu_ptr(kqd->cpu_latency, cpu); in kyber_timer_fn()
300 * Check if any domains have a high I/O latency, which might indicate in kyber_timer_fn()
326 * necessarily have enough samples to calculate the latency in kyber_timer_fn()
330 * reset it to -1. in kyber_timer_fn()
334 p99 = kqd->domain_p99[sched_domain]; in kyber_timer_fn()
335 kqd->domain_p99[sched_domain] = -1; in kyber_timer_fn()
337 kqd->domain_p99[sched_domain] = p99; in kyber_timer_fn()
343 * If this domain has bad latency, throttle less. Otherwise, in kyber_timer_fn()
346 * The new depth is scaled linearly with the p99 latency vs the in kyber_timer_fn()
347 * latency target. E.g., if the p99 is 3/4 of the target, then in kyber_timer_fn()
352 orig_depth = kqd->domain_tokens[sched_domain].sb.depth; in kyber_timer_fn()
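The "scaled linearly" comment above corresponds to a shift on the p99 bucket index: with a shift of 2, bucket p scales the depth by (p + 1) / 4. A sketch of that adjustment, recalled from the driver and therefore an approximation; the clamp mirrors what the kyber_resize_domain() fragment above suggests.

/*
 * Hedged sketch of the depth adjustment described above.  The p99 is a
 * histogram bucket index (0..7 with a shift of 2), so (p99 + 1) >> 2 is the
 * fraction of the target that the p99 latency reached: bucket 2, "<= 3/4 *
 * target latency", scales the depth to 3/4.  The clamp bounds are an
 * assumption based on the resize fragment.
 */
#define SKETCH_LATENCY_SHIFT	2

static unsigned int sketch_scale_depth(unsigned int cur_depth, int p99_bucket,
				       unsigned int max_depth)
{
	unsigned int depth;

	depth = (cur_depth * (unsigned int)(p99_bucket + 1)) >> SKETCH_LATENCY_SHIFT;
	if (depth < 1)
		depth = 1;
	if (depth > max_depth)
		depth = max_depth;
	return depth;
}

With a current depth of 256 and a p99 in bucket 2 ("<= 3/4 * target latency"), sketch_scale_depth() returns 192, three quarters of the depth, matching the example in the comment above.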
362 int ret = -ENOMEM; in kyber_queue_data_alloc()
365 kqd = kzalloc_node(sizeof(*kqd), GFP_KERNEL, q->node); in kyber_queue_data_alloc()
369 kqd->q = q; in kyber_queue_data_alloc()
370 kqd->dev = disk_devt(q->disk); in kyber_queue_data_alloc()
372 kqd->cpu_latency = alloc_percpu_gfp(struct kyber_cpu_latency, in kyber_queue_data_alloc()
374 if (!kqd->cpu_latency) in kyber_queue_data_alloc()
377 timer_setup(&kqd->timer, kyber_timer_fn, 0); in kyber_queue_data_alloc()
382 ret = sbitmap_queue_init_node(&kqd->domain_tokens[i], in kyber_queue_data_alloc()
383 kyber_depth[i], -1, false, in kyber_queue_data_alloc()
384 GFP_KERNEL, q->node); in kyber_queue_data_alloc()
386 while (--i >= 0) in kyber_queue_data_alloc()
387 sbitmap_queue_free(&kqd->domain_tokens[i]); in kyber_queue_data_alloc()
393 kqd->domain_p99[i] = -1; in kyber_queue_data_alloc()
394 kqd->latency_targets[i] = kyber_latency_targets[i]; in kyber_queue_data_alloc()
400 free_percpu(kqd->cpu_latency); in kyber_queue_data_alloc()
414 return -ENOMEM; in kyber_init_sched()
418 kobject_put(&eq->kobj); in kyber_init_sched()
424 eq->elevator_data = kqd; in kyber_init_sched()
425 q->elevator = eq; in kyber_init_sched()
432 struct kyber_queue_data *kqd = e->elevator_data; in kyber_exit_sched()
435 del_timer_sync(&kqd->timer); in kyber_exit_sched()
438 sbitmap_queue_free(&kqd->domain_tokens[i]); in kyber_exit_sched()
439 free_percpu(kqd->cpu_latency); in kyber_exit_sched()
447 spin_lock_init(&kcq->lock); in kyber_ctx_queue_init()
449 INIT_LIST_HEAD(&kcq->rq_list[i]); in kyber_ctx_queue_init()
454 struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data; in kyber_depth_updated()
455 struct blk_mq_tags *tags = hctx->sched_tags; in kyber_depth_updated()
456 unsigned int shift = tags->bitmap_tags->sb.shift; in kyber_depth_updated()
458 kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U; in kyber_depth_updated()
460 sbitmap_queue_min_shallow_depth(tags->bitmap_tags, kqd->async_depth); in kyber_depth_updated()
468 khd = kmalloc_node(sizeof(*khd), GFP_KERNEL, hctx->numa_node); in kyber_init_hctx()
470 return -ENOMEM; in kyber_init_hctx()
472 khd->kcqs = kmalloc_array_node(hctx->nr_ctx, in kyber_init_hctx()
474 GFP_KERNEL, hctx->numa_node); in kyber_init_hctx()
475 if (!khd->kcqs) in kyber_init_hctx()
478 for (i = 0; i < hctx->nr_ctx; i++) in kyber_init_hctx()
479 kyber_ctx_queue_init(&khd->kcqs[i]); in kyber_init_hctx()
482 if (sbitmap_init_node(&khd->kcq_map[i], hctx->nr_ctx, in kyber_init_hctx()
483 ilog2(8), GFP_KERNEL, hctx->numa_node, in kyber_init_hctx()
485 while (--i >= 0) in kyber_init_hctx()
486 sbitmap_free(&khd->kcq_map[i]); in kyber_init_hctx()
491 spin_lock_init(&khd->lock); in kyber_init_hctx()
494 INIT_LIST_HEAD(&khd->rqs[i]); in kyber_init_hctx()
495 khd->domain_wait[i].sbq = NULL; in kyber_init_hctx()
496 init_waitqueue_func_entry(&khd->domain_wait[i].wait, in kyber_init_hctx()
498 khd->domain_wait[i].wait.private = hctx; in kyber_init_hctx()
499 INIT_LIST_HEAD(&khd->domain_wait[i].wait.entry); in kyber_init_hctx()
500 atomic_set(&khd->wait_index[i], 0); in kyber_init_hctx()
503 khd->cur_domain = 0; in kyber_init_hctx()
504 khd->batching = 0; in kyber_init_hctx()
506 hctx->sched_data = khd; in kyber_init_hctx()
512 kfree(khd->kcqs); in kyber_init_hctx()
515 return -ENOMEM; in kyber_init_hctx()
520 struct kyber_hctx_data *khd = hctx->sched_data; in kyber_exit_hctx()
524 sbitmap_free(&khd->kcq_map[i]); in kyber_exit_hctx()
525 kfree(khd->kcqs); in kyber_exit_hctx()
526 kfree(hctx->sched_data); in kyber_exit_hctx()
531 return (long)rq->elv.priv[0]; in rq_get_domain_token()
536 rq->elv.priv[0] = (void *)(long)token; in rq_set_domain_token()
546 if (nr != -1) { in rq_clear_domain_token()
547 sched_domain = kyber_sched_domain(rq->cmd_flags); in rq_clear_domain_token()
548 sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr, in rq_clear_domain_token()
549 rq->mq_ctx->cpu); in rq_clear_domain_token()
556 * We use the scheduler tags as per-hardware queue queueing tokens. in kyber_limit_depth()
560 struct kyber_queue_data *kqd = data->q->elevator->elevator_data; in kyber_limit_depth()
562 data->shallow_depth = kqd->async_depth; in kyber_limit_depth()
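The kyber_limit_depth() fragments above apply the async_depth computed earlier as a shallow allocation limit, so synchronous requests always have scheduler tags left over. A toy model of that idea in plain C; in the driver the limit is enforced per sbitmap word through the shallow-get path, not by a linear scan.

#include <stdbool.h>

/*
 * Hedged toy model of the shallow-depth limit applied above: asynchronous
 * callers may only claim slots below async_depth, while synchronous callers
 * can use the whole tag space.  A byte-per-slot array stands in for the
 * real sbitmap word; the linear scan is only to make the limit visible.
 */
static int sketch_get_tag(unsigned char *slots, unsigned int depth,
			  unsigned int async_depth, bool is_sync)
{
	unsigned int limit = is_sync ? depth : async_depth;
	unsigned int i;

	for (i = 0; i < limit; i++) {
		if (!slots[i]) {
			slots[i] = 1;
			return (int)i;
		}
	}
	return -1;	/* no tag available within this caller's limit */
}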
570 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx); in kyber_bio_merge()
571 struct kyber_hctx_data *khd = hctx->sched_data; in kyber_bio_merge()
572 struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw[hctx->type]]; in kyber_bio_merge()
573 unsigned int sched_domain = kyber_sched_domain(bio->bi_opf); in kyber_bio_merge()
574 struct list_head *rq_list = &kcq->rq_list[sched_domain]; in kyber_bio_merge()
577 spin_lock(&kcq->lock); in kyber_bio_merge()
578 merged = blk_bio_list_merge(hctx->queue, rq_list, bio, nr_segs); in kyber_bio_merge()
579 spin_unlock(&kcq->lock); in kyber_bio_merge()
586 rq_set_domain_token(rq, -1); in kyber_prepare_request()
592 struct kyber_hctx_data *khd = hctx->sched_data; in kyber_insert_requests()
596 unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags); in kyber_insert_requests()
597 struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw[hctx->type]]; in kyber_insert_requests()
598 struct list_head *head = &kcq->rq_list[sched_domain]; in kyber_insert_requests()
600 spin_lock(&kcq->lock); in kyber_insert_requests()
603 list_move(&rq->queuelist, head); in kyber_insert_requests()
605 list_move_tail(&rq->queuelist, head); in kyber_insert_requests()
606 sbitmap_set_bit(&khd->kcq_map[sched_domain], in kyber_insert_requests()
607 rq->mq_ctx->index_hw[hctx->type]); in kyber_insert_requests()
608 spin_unlock(&kcq->lock); in kyber_insert_requests()
614 struct kyber_queue_data *kqd = rq->q->elevator->elevator_data; in kyber_finish_request()
621 u64 target, u64 latency) in add_latency_sample() argument
626 if (latency > 0) { in add_latency_sample()
628 bucket = min_t(unsigned int, div64_u64(latency - 1, divisor), in add_latency_sample()
629 KYBER_LATENCY_BUCKETS - 1); in add_latency_sample()
634 atomic_inc(&cpu_latency->buckets[sched_domain][type][bucket]); in add_latency_sample()
639 struct kyber_queue_data *kqd = rq->q->elevator->elevator_data; in kyber_completed_request()
644 sched_domain = kyber_sched_domain(rq->cmd_flags); in kyber_completed_request()
648 cpu_latency = get_cpu_ptr(kqd->cpu_latency); in kyber_completed_request()
649 target = kqd->latency_targets[sched_domain]; in kyber_completed_request()
651 target, now - rq->start_time_ns); in kyber_completed_request()
653 now - rq->io_start_time_ns); in kyber_completed_request()
654 put_cpu_ptr(kqd->cpu_latency); in kyber_completed_request()
656 timer_reduce(&kqd->timer, jiffies + HZ / 10); in kyber_completed_request()
668 struct kyber_ctx_queue *kcq = &flush_data->khd->kcqs[bitnr]; in flush_busy_kcq()
670 spin_lock(&kcq->lock); in flush_busy_kcq()
671 list_splice_tail_init(&kcq->rq_list[flush_data->sched_domain], in flush_busy_kcq()
672 flush_data->list); in flush_busy_kcq()
674 spin_unlock(&kcq->lock); in flush_busy_kcq()
689 sbitmap_for_each_set(&khd->kcq_map[sched_domain], in kyber_flush_busy_kcqs()
696 struct blk_mq_hw_ctx *hctx = READ_ONCE(wqe->private); in kyber_domain_wake()
708 unsigned int sched_domain = khd->cur_domain; in kyber_get_domain_token()
709 struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain]; in kyber_get_domain_token()
710 struct sbq_wait *wait = &khd->domain_wait[sched_domain]; in kyber_get_domain_token()
719 * khd->lock, but we still need to be careful about the waker. in kyber_get_domain_token()
721 if (nr < 0 && list_empty_careful(&wait->wait.entry)) { in kyber_get_domain_token()
723 &khd->wait_index[sched_domain]); in kyber_get_domain_token()
724 khd->domain_ws[sched_domain] = ws; in kyber_get_domain_token()
737 * progress. It's possible that the waker already deleted the entry in kyber_get_domain_token()
741 if (nr >= 0 && !list_empty_careful(&wait->wait.entry)) { in kyber_get_domain_token()
742 ws = khd->domain_ws[sched_domain]; in kyber_get_domain_token()
743 spin_lock_irq(&ws->wait.lock); in kyber_get_domain_token()
745 spin_unlock_irq(&ws->wait.lock); in kyber_get_domain_token()
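The comments above describe a lost-wakeup avoidance pattern: park on the sbitmap wait queue only after a failed attempt, retry once after parking, and unpark carefully if the retry succeeded, since the waker may already have removed the entry. A heavily simplified model of that ordering using C11 atomics; the wait-queue callbacks here are hypothetical placeholders, and none of the sbitmap or waitqueue locking subtleties are reproduced.

#include <stdatomic.h>
#include <stdbool.h>

/*
 * Heavily simplified model of the "register, then retry" ordering the
 * comments above describe, with a plain C11 atomic counter in place of the
 * sbitmap token pool.  The add/remove callbacks are hypothetical
 * placeholders for the sbitmap wait-queue hooks.
 */
static _Atomic int sketch_tokens = 4;

static bool sketch_try_get_token(void)
{
	int old = atomic_load(&sketch_tokens);

	while (old > 0) {
		if (atomic_compare_exchange_weak(&sketch_tokens, &old, old - 1))
			return true;
	}
	return false;
}

static bool sketch_get_token_or_wait(void (*add_to_waitqueue)(void),
				     void (*remove_from_waitqueue)(void))
{
	if (sketch_try_get_token())
		return true;

	/* Ask to be woken when a token is freed, then try once more so a
	 * token freed in the meantime is not missed. */
	add_to_waitqueue();
	if (sketch_try_get_token()) {
		/* Got one after all: unpark so wake-ups keep making progress. */
		remove_from_waitqueue();
		return true;
	}

	/* Still no token: stay parked and rely on the wake-up to retry. */
	return false;
}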
760 rqs = &khd->rqs[khd->cur_domain]; in kyber_dispatch_cur_domain()
767 * khd->lock serializes the flushes, so if we observed any bit set in in kyber_dispatch_cur_domain()
774 khd->batching++; in kyber_dispatch_cur_domain()
776 list_del_init(&rq->queuelist); in kyber_dispatch_cur_domain()
779 trace_kyber_throttled(kqd->dev, in kyber_dispatch_cur_domain()
780 kyber_domain_names[khd->cur_domain]); in kyber_dispatch_cur_domain()
782 } else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) { in kyber_dispatch_cur_domain()
785 kyber_flush_busy_kcqs(khd, khd->cur_domain, rqs); in kyber_dispatch_cur_domain()
787 khd->batching++; in kyber_dispatch_cur_domain()
789 list_del_init(&rq->queuelist); in kyber_dispatch_cur_domain()
792 trace_kyber_throttled(kqd->dev, in kyber_dispatch_cur_domain()
793 kyber_domain_names[khd->cur_domain]); in kyber_dispatch_cur_domain()
803 struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data; in kyber_dispatch_request()
804 struct kyber_hctx_data *khd = hctx->sched_data; in kyber_dispatch_request()
808 spin_lock(&khd->lock); in kyber_dispatch_request()
814 if (khd->batching < kyber_batch_size[khd->cur_domain]) { in kyber_dispatch_request()
829 khd->batching = 0; in kyber_dispatch_request()
831 if (khd->cur_domain == KYBER_NUM_DOMAINS - 1) in kyber_dispatch_request()
832 khd->cur_domain = 0; in kyber_dispatch_request()
834 khd->cur_domain++; in kyber_dispatch_request()
843 spin_unlock(&khd->lock); in kyber_dispatch_request()
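The dispatch fragments above implement a batched round-robin: keep pulling from the current domain until its batch quota is spent or it runs out of requests or tokens, then rotate through the remaining domains, wrapping back to the original one last. A compact model of that rotation; request and token availability is abstracted into a callback, and the batch sizes are illustrative rather than the in-tree values.

#include <stdbool.h>

/*
 * Hedged model of the batched round-robin in kyber_dispatch_request().
 * try_dispatch() stands in for kyber_dispatch_cur_domain(): it returns true
 * if the given domain had a request and a free token.  The batch sizes are
 * illustrative, not necessarily the in-tree values.
 */
#define SKETCH_NUM_DOMAINS	4

static const unsigned int sketch_batch_size[SKETCH_NUM_DOMAINS] = { 16, 8, 1, 1 };

struct sketch_hctx {
	unsigned int cur_domain;
	unsigned int batching;
};

static bool sketch_dispatch(struct sketch_hctx *h,
			    bool (*try_dispatch)(unsigned int domain))
{
	unsigned int i;

	/* Keep batching from the current domain while it is still entitled. */
	if (h->batching < sketch_batch_size[h->cur_domain] &&
	    try_dispatch(h->cur_domain)) {
		h->batching++;
		return true;
	}

	/* Otherwise start a new batch, rotating through every domain once
	 * (which wraps back around to the original domain last). */
	h->batching = 0;
	for (i = 0; i < SKETCH_NUM_DOMAINS; i++) {
		h->cur_domain = (h->cur_domain + 1) % SKETCH_NUM_DOMAINS;
		if (try_dispatch(h->cur_domain)) {
			h->batching = 1;
			return true;
		}
	}
	return false;
}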
849 struct kyber_hctx_data *khd = hctx->sched_data; in kyber_has_work()
853 if (!list_empty_careful(&khd->rqs[i]) || in kyber_has_work()
854 sbitmap_any_bit_set(&khd->kcq_map[i])) in kyber_has_work()
865 struct kyber_queue_data *kqd = e->elevator_data; \
867 return sprintf(page, "%llu\n", kqd->latency_targets[domain]); \
873 struct kyber_queue_data *kqd = e->elevator_data; \
881 kqd->latency_targets[domain] = nsec; \
902 struct kyber_queue_data *kqd = q->elevator->elevator_data; \
904 sbitmap_queue_show(&kqd->domain_tokens[domain], m); \
909 __acquires(&khd->lock) \
911 struct blk_mq_hw_ctx *hctx = m->private; \
912 struct kyber_hctx_data *khd = hctx->sched_data; \
914 spin_lock(&khd->lock); \
915 return seq_list_start(&khd->rqs[domain], *pos); \
921 struct blk_mq_hw_ctx *hctx = m->private; \
922 struct kyber_hctx_data *khd = hctx->sched_data; \
924 return seq_list_next(v, &khd->rqs[domain], pos); \
928 __releases(&khd->lock) \
930 struct blk_mq_hw_ctx *hctx = m->private; \
931 struct kyber_hctx_data *khd = hctx->sched_data; \
933 spin_unlock(&khd->lock); \
946 struct kyber_hctx_data *khd = hctx->sched_data; \
947 wait_queue_entry_t *wait = &khd->domain_wait[domain].wait; \
949 seq_printf(m, "%d\n", !list_empty_careful(&wait->entry)); \
961 struct kyber_queue_data *kqd = q->elevator->elevator_data; in KYBER_DEBUGFS_DOMAIN_ATTRS()
963 seq_printf(m, "%u\n", kqd->async_depth); in KYBER_DEBUGFS_DOMAIN_ATTRS()
970 struct kyber_hctx_data *khd = hctx->sched_data; in kyber_cur_domain_show()
972 seq_printf(m, "%s\n", kyber_domain_names[khd->cur_domain]); in kyber_cur_domain_show()
979 struct kyber_hctx_data *khd = hctx->sched_data; in kyber_batching_show()
981 seq_printf(m, "%u\n", khd->batching); in kyber_batching_show()