Lines Matching +full:idle +full:- +full:wait +full:- +full:delay

1 /* SPDX-License-Identifier: GPL-2.0
17 * useless for the purpose of IO capacity distribution. While on-device
19 * non-queued rotational devices, this is no longer viable with modern
27 * implement a reasonable work-conserving proportional IO resource
37 * Currently, there's only one builtin cost model - linear. Each IO is
47 * device-specific coefficients.
54 * 2-1. Vtime Distribution
66 * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
75 * against the device vtime - an IO which takes 10ms on the underlying
84 * 2-2. Vrate Adjustment
97 * To slow down, we lower the vrate - the rate at which the device vtime
100 * 750ms worth of IOs per second, and vice-versa for speeding up.
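As a rough illustration of the vrate relationship described above, here is a minimal userspace sketch (not kernel code). The 2^37-per-second vtime unit comes from the comment further down in the file; the 75% figure is just an example of a saturated device.

#include <stdint.h>
#include <stdio.h>

/* Sketch only: how much device vtime is released over a wall-clock second
 * scales linearly with vrate. */
int main(void)
{
	const uint64_t VTIME_PER_SEC  = 1ULL << 37;	/* 1s of vtime, per the comment below */
	const uint64_t VTIME_PER_USEC = VTIME_PER_SEC / 1000000;

	uint64_t vrate   = VTIME_PER_USEC * 75 / 100;	/* device saturating at 75% */
	uint64_t wall_us = 1000000;			/* one wall-clock second */

	uint64_t vtime = wall_us * vrate;		/* vtime handed out */
	uint64_t full  = wall_us * VTIME_PER_USEC;	/* a full-speed second */

	/* ~75%: issuers only get ~750ms worth of device time per second */
	printf("released %.0f%% of a second's worth of vtime\n",
	       vtime * 100.0 / full);
	return 0;
}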
102 * Device busyness is determined using two criteria - rq wait and
105 * When a device gets saturated, the on-device and then the request queues
106 * fill up and a bio which is ready to be issued has to wait for a request
107 * to become available. When this delay becomes noticeable, it's a clear
114 * are executed, solely depending on rq wait may not result in satisfactory
121 * service. There is an inherent trade-off - the tighter the latency QoS,
125 * 2-3. Work Conservation
133 * compared to free-for-all competition. This is too high a cost to pay
156 * controller uses a drgn based monitoring script -
161 * active weight hweight% inflt% dbt delay usages%
165 * - per : Timer period
166 * - cur_per : Internal wall and device vtime clock
167 * - vrate : Device virtual time rate against wall clock
168 * - weight : Surplus-adjusted and configured weights
169 * - hweight : Surplus-adjusted and configured hierarchical weights
170 * - inflt : The percentage of in-flight IO cost at the end of last period
171 * - del_ms : Deferred issuer delay induction level and duration
172 * - usages : Usage history
183 #include "blk-rq-qos.h"
184 #include "blk-stat.h"
185 #include "blk-wbt.h"
186 #include "blk-cgroup.h"
190 /* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
200 cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup, \
220 * iocg->vtime is targeted at 50% behind the device vtime, which
244 * 1s worth of vtime is 2^37. This gives us both sub-nanosecond
245 * granularity and days of wrap-around time even at extreme vrates.
266 * The effect of delay is indirect and non-linear and a huge amount of
268 * up delay as debt is going up and then let it decay exponentially.
269 * This gives us quick ramp ups while delay is accumulating and long
273 * The delay mechanism provides adequate protection and behavior in many
281 * cache, the kernel doesn't have well-defined back-pressure propagation
302 * size-proportional components of cost calculation in closer
303 * numbers of digits to per-IO cost components.
307 IOC_SECT_TO_PAGE_SHIFT = IOC_PAGE_SHIFT - SECTOR_SHIFT,
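The enum fragment above counts IO size in 4k iocost pages so the size-proportional cost terms stay in the same order of magnitude as the per-IO terms. A small sketch of the sector-to-page conversion it implies, assuming the usual 512-byte sector (SECTOR_SHIFT = 9) and a 4k iocost page:

#include <stdint.h>
#include <stdio.h>

/* Sketch: 512-byte sectors (shift 9), 4k iocost pages (shift 12),
 * so sectors-to-pages is a right shift by 12 - 9 = 3. */
enum {
	SECTOR_SHIFT		= 9,
	IOC_PAGE_SHIFT		= 12,
	IOC_SECT_TO_PAGE_SHIFT	= IOC_PAGE_SHIFT - SECTOR_SHIFT,
};

int main(void)
{
	uint64_t nr_sectors = 256;	/* a 128k IO */
	uint64_t pages = nr_sectors >> IOC_SECT_TO_PAGE_SHIFT;

	printf("%llu sectors -> %llu 4k pages\n",
	       (unsigned long long)nr_sectors,
	       (unsigned long long)pages);	/* 32 */
	return 0;
}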
319 /* io.cost.qos controls including per-dev enable of the whole controller */
457 /* per device-cgroup pair */
463 * An iocg can get its weight from two sources - an explicit
464 * per-device-cgroup configuration or the default weight of the
465 * cgroup. `cfg_weight` is the explicit per-device-cgroup
469 * When an idle cgroup becomes active its `active` goes from 0 to
474 * `last_inuse` remembers `inuse` while an iocg is idle to persist
498 * currently in-flight IOs.
504 /* current delay in effect and when it started */
505 u64 delay; member
563 struct wait_queue_entry wait; member
645 * vrate adjust percentages indexed by ioc->busy_level. We adjust up on
669 struct gendisk *disk = ioc->rqos.q->disk; in ioc_name()
673 return disk->disk_name; in ioc_name()
688 return pd_to_blkg(&iocg->pd); in iocg_to_blkg()
719 bio->bi_iocost_cost = cost; in iocg_commit_bio()
720 atomic64_add(cost, &iocg->vtime); in iocg_commit_bio()
722 gcs = get_cpu_ptr(iocg->pcpu_stat); in iocg_commit_bio()
723 local64_add(abs_cost, &gcs->abs_vusage); in iocg_commit_bio()
730 spin_lock_irqsave(&iocg->ioc->lock, *flags); in iocg_lock()
731 spin_lock(&iocg->waitq.lock); in iocg_lock()
733 spin_lock_irqsave(&iocg->waitq.lock, *flags); in iocg_lock()
740 spin_unlock(&iocg->waitq.lock); in iocg_unlock()
741 spin_unlock_irqrestore(&iocg->ioc->lock, *flags); in iocg_unlock()
743 spin_unlock_irqrestore(&iocg->waitq.lock, *flags); in iocg_unlock()
752 struct ioc_margins *margins = &ioc->margins; in ioc_refresh_margins()
753 u32 period_us = ioc->period_us; in ioc_refresh_margins()
754 u64 vrate = ioc->vtime_base_rate; in ioc_refresh_margins()
756 margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate; in ioc_refresh_margins()
757 margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate; in ioc_refresh_margins()
758 margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate; in ioc_refresh_margins()
766 lockdep_assert_held(&ioc->lock); in ioc_refresh_period_us()
769 if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) { in ioc_refresh_period_us()
770 ppm = ioc->params.qos[QOS_RPPM]; in ioc_refresh_period_us()
771 lat = ioc->params.qos[QOS_RLAT]; in ioc_refresh_period_us()
773 ppm = ioc->params.qos[QOS_WPPM]; in ioc_refresh_period_us()
774 lat = ioc->params.qos[QOS_WLAT]; in ioc_refresh_period_us()
786 multi = max_t(u32, (MILLION - ppm) / 50000, 2); in ioc_refresh_period_us()
793 ioc->period_us = period_us; in ioc_refresh_period_us()
794 ioc->timer_slack_ns = div64_u64( in ioc_refresh_period_us()
802 int idx = ioc->autop_idx; in ioc_autop_idx()
808 if (!blk_queue_nonrot(ioc->rqos.q)) in ioc_autop_idx()
812 if (blk_queue_depth(ioc->rqos.q) == 1) in ioc_autop_idx()
820 if (ioc->user_qos_params || ioc->user_cost_model) in ioc_autop_idx()
824 vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC); in ioc_autop_idx()
827 if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) { in ioc_autop_idx()
828 if (!ioc->autop_too_fast_at) in ioc_autop_idx()
829 ioc->autop_too_fast_at = now_ns; in ioc_autop_idx()
830 if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC) in ioc_autop_idx()
833 ioc->autop_too_fast_at = 0; in ioc_autop_idx()
836 if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) { in ioc_autop_idx()
837 if (!ioc->autop_too_slow_at) in ioc_autop_idx()
838 ioc->autop_too_slow_at = now_ns; in ioc_autop_idx()
839 if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC) in ioc_autop_idx()
840 return idx - 1; in ioc_autop_idx()
842 ioc->autop_too_slow_at = 0; in ioc_autop_idx()
857 * *@page per-page cost 1s / (@bps / 4096)
858 * *@seqio base cost of a seq IO max((1s / @seqiops) - *@page, 0)
859 * *@randio base cost of a rand IO max((1s / @randiops) - *@page, 0)
875 *seqio = v - *page; in calc_lcoefs()
881 *randio = v - *page; in calc_lcoefs()
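The three formulas above turn the user-supplied bps/seqiops/randiops numbers into the linear-model coefficients. A simplified userspace rendition, assuming the 2^37-per-second vtime unit and 4k pages, and omitting the kernel's rounding helpers and zero-input handling:

#include <stdint.h>
#include <stdio.h>

#define VTIME_PER_SEC	(1ULL << 37)	/* 1s of vtime, per the comment above */
#define IOC_PAGE_SIZE	4096ULL

/* Sketch of the linear cost coefficients:
 *   page   = cost of moving one 4k page at the max sequential bps
 *   seqio  = per-IO overhead of a sequential IO on top of its pages
 *   randio = per-IO overhead of a random IO on top of its pages
 */
static void calc_lcoefs_sketch(uint64_t bps, uint64_t seqiops, uint64_t randiops,
			       uint64_t *page, uint64_t *seqio, uint64_t *randio)
{
	*page = VTIME_PER_SEC / (bps / IOC_PAGE_SIZE);	/* 1s / (@bps / 4096) */

	uint64_t v = VTIME_PER_SEC / seqiops;		/* 1s / @seqiops */
	*seqio = v > *page ? v - *page : 0;		/* max(... - *page, 0) */

	v = VTIME_PER_SEC / randiops;			/* 1s / @randiops */
	*randio = v > *page ? v - *page : 0;		/* max(... - *page, 0) */
}

int main(void)
{
	uint64_t page, seqio, randio;

	/* hypothetical SSD: 500MB/s sequential, 100k seq IOPS, 50k rand IOPS
	 * -> page=1073741 seqio=300648 randio=1675038 */
	calc_lcoefs_sketch(500ULL << 20, 100000, 50000, &page, &seqio, &randio);
	printf("page=%llu seqio=%llu randio=%llu\n",
	       (unsigned long long)page, (unsigned long long)seqio,
	       (unsigned long long)randio);
	return 0;
}

The per-IO coefficients come out as whatever an IO costs beyond the pure data transfer, which is why they are clamped at zero.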
887 u64 *u = ioc->params.i_lcoefs; in ioc_refresh_lcoefs()
888 u64 *c = ioc->params.lcoefs; in ioc_refresh_lcoefs()
901 lockdep_assert_held(&ioc->lock); in ioc_refresh_params()
906 if (idx == ioc->autop_idx && !force) in ioc_refresh_params()
909 if (idx != ioc->autop_idx) in ioc_refresh_params()
910 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); in ioc_refresh_params()
912 ioc->autop_idx = idx; in ioc_refresh_params()
913 ioc->autop_too_fast_at = 0; in ioc_refresh_params()
914 ioc->autop_too_slow_at = 0; in ioc_refresh_params()
916 if (!ioc->user_qos_params) in ioc_refresh_params()
917 memcpy(ioc->params.qos, p->qos, sizeof(p->qos)); in ioc_refresh_params()
918 if (!ioc->user_cost_model) in ioc_refresh_params()
919 memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs)); in ioc_refresh_params()
924 ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] * in ioc_refresh_params()
926 ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] * in ioc_refresh_params()
941 s64 pleft = ioc->period_at + ioc->period_us - now->now; in ioc_refresh_vrate()
942 s64 vperiod = ioc->period_us * ioc->vtime_base_rate; in ioc_refresh_vrate()
945 lockdep_assert_held(&ioc->lock); in ioc_refresh_vrate()
956 vcomp = -div64_s64(ioc->vtime_err, pleft); in ioc_refresh_vrate()
957 vcomp_min = -(ioc->vtime_base_rate >> 1); in ioc_refresh_vrate()
958 vcomp_max = ioc->vtime_base_rate; in ioc_refresh_vrate()
961 ioc->vtime_err += vcomp * pleft; in ioc_refresh_vrate()
963 atomic64_set(&ioc->vtime_rate, ioc->vtime_base_rate + vcomp); in ioc_refresh_vrate()
966 ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod); in ioc_refresh_vrate()
973 u64 vrate = ioc->vtime_base_rate; in ioc_adjust_base_vrate()
974 u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max; in ioc_adjust_base_vrate()
976 if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) { in ioc_adjust_base_vrate()
977 if (ioc->busy_level != prev_busy_level || nr_lagging) in ioc_adjust_base_vrate()
978 trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate), in ioc_adjust_base_vrate()
994 vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), 100); in ioc_adjust_base_vrate()
997 int idx = min_t(int, abs(ioc->busy_level), in ioc_adjust_base_vrate()
998 ARRAY_SIZE(vrate_adj_pct) - 1); in ioc_adjust_base_vrate()
1001 if (ioc->busy_level > 0) in ioc_adjust_base_vrate()
1002 adj_pct = 100 - adj_pct; in ioc_adjust_base_vrate()
1013 ioc->vtime_base_rate = vrate; in ioc_adjust_base_vrate()
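Putting the ioc_adjust_base_vrate() fragments above together: the busy level indexes a percentage table and that percentage is applied multiplicatively, shrinking vrate when the device is too busy and, in this sketch, growing it symmetrically otherwise. A hedged sketch with a made-up adjustment table (the kernel's table, rounding, and clamping details differ):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical table - only the shape matters here:
 * larger |busy_level| -> larger step. */
static const uint32_t vrate_adj_pct[] = { 0, 2, 2, 4, 4, 8 };

/* One adjustment step, clamped to [vrate_min, vrate_max]. */
static uint64_t adjust_vrate(uint64_t vrate, int busy_level,
			     uint64_t vrate_min, uint64_t vrate_max)
{
	size_t nr = sizeof(vrate_adj_pct) / sizeof(vrate_adj_pct[0]);
	size_t idx = abs(busy_level);
	uint32_t adj_pct;

	if (idx > nr - 1)
		idx = nr - 1;
	adj_pct = vrate_adj_pct[idx];
	adj_pct = busy_level > 0 ? 100 - adj_pct : 100 + adj_pct;

	vrate = vrate * adj_pct / 100;
	if (vrate < vrate_min)
		vrate = vrate_min;
	if (vrate > vrate_max)
		vrate = vrate_max;
	return vrate;
}

int main(void)
{
	uint64_t vrate = 100000;

	printf("busy +3: %llu\n",
	       (unsigned long long)adjust_vrate(vrate, 3, 10000, 1000000));	/* 96000 */
	printf("busy -3: %llu\n",
	       (unsigned long long)adjust_vrate(vrate, -3, 10000, 1000000));	/* 104000 */
	return 0;
}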
1022 now->now_ns = ktime_get(); in ioc_now()
1023 now->now = ktime_to_us(now->now_ns); in ioc_now()
1024 now->vrate = atomic64_read(&ioc->vtime_rate); in ioc_now()
1035 seq = read_seqcount_begin(&ioc->period_seqcount); in ioc_now()
1036 now->vnow = ioc->period_at_vtime + in ioc_now()
1037 (now->now - ioc->period_at) * now->vrate; in ioc_now()
1038 } while (read_seqcount_retry(&ioc->period_seqcount, seq)); in ioc_now()
1043 WARN_ON_ONCE(ioc->running != IOC_RUNNING); in ioc_start_period()
1045 write_seqcount_begin(&ioc->period_seqcount); in ioc_start_period()
1046 ioc->period_at = now->now; in ioc_start_period()
1047 ioc->period_at_vtime = now->vnow; in ioc_start_period()
1048 write_seqcount_end(&ioc->period_seqcount); in ioc_start_period()
1050 ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us); in ioc_start_period()
1051 add_timer(&ioc->timer); in ioc_start_period()
1057 * is saved to be used as reference for later inuse in-period adjustments.
1062 struct ioc *ioc = iocg->ioc; in __propagate_weights()
1065 lockdep_assert_held(&ioc->lock); in __propagate_weights()
1072 if (list_empty(&iocg->active_list) && iocg->child_active_sum) { in __propagate_weights()
1073 inuse = DIV64_U64_ROUND_UP(active * iocg->child_inuse_sum, in __propagate_weights()
1074 iocg->child_active_sum); in __propagate_weights()
1079 iocg->last_inuse = iocg->inuse; in __propagate_weights()
1081 iocg->saved_margin = now->vnow - atomic64_read(&iocg->vtime); in __propagate_weights()
1083 if (active == iocg->active && inuse == iocg->inuse) in __propagate_weights()
1086 for (lvl = iocg->level - 1; lvl >= 0; lvl--) { in __propagate_weights()
1087 struct ioc_gq *parent = iocg->ancestors[lvl]; in __propagate_weights()
1088 struct ioc_gq *child = iocg->ancestors[lvl + 1]; in __propagate_weights()
1092 parent->child_active_sum += (s32)(active - child->active); in __propagate_weights()
1093 parent->child_inuse_sum += (s32)(inuse - child->inuse); in __propagate_weights()
1095 child->active = active; in __propagate_weights()
1096 child->inuse = inuse; in __propagate_weights()
1103 if (parent->child_active_sum) { in __propagate_weights()
1104 parent_active = parent->weight; in __propagate_weights()
1106 parent_active * parent->child_inuse_sum, in __propagate_weights()
1107 parent->child_active_sum); in __propagate_weights()
1111 if (parent_active == parent->active && in __propagate_weights()
1112 parent_inuse == parent->inuse) in __propagate_weights()
1119 ioc->weights_updated = true; in __propagate_weights()
1124 lockdep_assert_held(&ioc->lock); in commit_weights()
1126 if (ioc->weights_updated) { in commit_weights()
1129 atomic_inc(&ioc->hweight_gen); in commit_weights()
1130 ioc->weights_updated = false; in commit_weights()
1138 commit_weights(iocg->ioc); in propagate_weights()
1143 struct ioc *ioc = iocg->ioc; in current_hweight()
1148 /* hot path - if uptodate, use cached */ in current_hweight()
1149 ioc_gen = atomic_read(&ioc->hweight_gen); in current_hweight()
1150 if (ioc_gen == iocg->hweight_gen) in current_hweight()
1166 for (lvl = 0; lvl <= iocg->level - 1; lvl++) { in current_hweight()
1167 struct ioc_gq *parent = iocg->ancestors[lvl]; in current_hweight()
1168 struct ioc_gq *child = iocg->ancestors[lvl + 1]; in current_hweight()
1169 u64 active_sum = READ_ONCE(parent->child_active_sum); in current_hweight()
1170 u64 inuse_sum = READ_ONCE(parent->child_inuse_sum); in current_hweight()
1171 u32 active = READ_ONCE(child->active); in current_hweight()
1172 u32 inuse = READ_ONCE(child->inuse); in current_hweight()
1185 iocg->hweight_active = max_t(u32, hwa, 1); in current_hweight()
1186 iocg->hweight_inuse = max_t(u32, hwi, 1); in current_hweight()
1187 iocg->hweight_gen = ioc_gen; in current_hweight()
1190 *hw_activep = iocg->hweight_active; in current_hweight()
1192 *hw_inusep = iocg->hweight_inuse; in current_hweight()
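current_hweight() above walks from the root toward the iocg, multiplying in each level's share of its parent at every step. A standalone sketch of that product, using the A/B hierarchy from the header comment with hypothetical weights (A and A0 at 100, B at 300):

#include <stdint.h>
#include <stdio.h>

#define WEIGHT_ONE	(1U << 16)	/* assumed fixed-point 100% */

/* Sketch: hierarchical weight is the product, root-down, of each level's
 * (weight / sum of sibling weights). */
static uint32_t hweight_sketch(const uint32_t *weights,
			       const uint32_t *sibling_sums, int levels)
{
	uint64_t hw = WEIGHT_ONE;
	int lvl;

	for (lvl = 0; lvl < levels; lvl++)
		hw = hw * weights[lvl] / sibling_sums[lvl];
	return (uint32_t)hw;
}

int main(void)
{
	/* A0 (w:100) under A (w:100), with A competing against B (w:300):
	 * A gets 100/400 of the device, A0 gets 100/200 of A's share. */
	uint32_t weights[]      = { 100, 100 };
	uint32_t sibling_sums[] = { 400, 200 };
	uint32_t hw = hweight_sketch(weights, sibling_sums, 2);

	printf("hweight = %.2f%%\n", hw * 100.0 / WEIGHT_ONE);	/* 12.50% */
	return 0;
}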
1202 u32 inuse = iocg->active; in current_hweight_max()
1206 lockdep_assert_held(&iocg->ioc->lock); in current_hweight_max()
1208 for (lvl = iocg->level - 1; lvl >= 0; lvl--) { in current_hweight_max()
1209 struct ioc_gq *parent = iocg->ancestors[lvl]; in current_hweight_max()
1210 struct ioc_gq *child = iocg->ancestors[lvl + 1]; in current_hweight_max()
1212 child_inuse_sum = parent->child_inuse_sum + inuse - child->inuse; in current_hweight_max()
1214 inuse = DIV64_U64_ROUND_UP(parent->active * child_inuse_sum, in current_hweight_max()
1215 parent->child_active_sum); in current_hweight_max()
1223 struct ioc *ioc = iocg->ioc; in weight_updated()
1225 struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg); in weight_updated()
1228 lockdep_assert_held(&ioc->lock); in weight_updated()
1230 weight = iocg->cfg_weight ?: iocc->dfl_weight; in weight_updated()
1231 if (weight != iocg->weight && iocg->active) in weight_updated()
1232 propagate_weights(iocg, weight, iocg->inuse, true, now); in weight_updated()
1233 iocg->weight = weight; in weight_updated()
1238 struct ioc *ioc = iocg->ioc; in iocg_activate()
1247 if (!list_empty(&iocg->active_list)) { in iocg_activate()
1249 cur_period = atomic64_read(&ioc->cur_period); in iocg_activate()
1250 if (atomic64_read(&iocg->active_period) != cur_period) in iocg_activate()
1251 atomic64_set(&iocg->active_period, cur_period); in iocg_activate()
1256 if (iocg->child_active_sum) in iocg_activate()
1259 spin_lock_irq(&ioc->lock); in iocg_activate()
1264 cur_period = atomic64_read(&ioc->cur_period); in iocg_activate()
1265 last_period = atomic64_read(&iocg->active_period); in iocg_activate()
1266 atomic64_set(&iocg->active_period, cur_period); in iocg_activate()
1268 /* already activated or breaking leaf-only constraint? */ in iocg_activate()
1269 if (!list_empty(&iocg->active_list)) in iocg_activate()
1271 for (i = iocg->level - 1; i > 0; i--) in iocg_activate()
1272 if (!list_empty(&iocg->ancestors[i]->active_list)) in iocg_activate()
1275 if (iocg->child_active_sum) in iocg_activate()
1282 vtarget = now->vnow - ioc->margins.target; in iocg_activate()
1283 vtime = atomic64_read(&iocg->vtime); in iocg_activate()
1285 atomic64_add(vtarget - vtime, &iocg->vtime); in iocg_activate()
1286 atomic64_add(vtarget - vtime, &iocg->done_vtime); in iocg_activate()
1294 iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1; in iocg_activate()
1295 list_add(&iocg->active_list, &ioc->active_iocgs); in iocg_activate()
1297 propagate_weights(iocg, iocg->weight, in iocg_activate()
1298 iocg->last_inuse ?: iocg->weight, true, now); in iocg_activate()
1303 iocg->activated_at = now->now; in iocg_activate()
1305 if (ioc->running == IOC_IDLE) { in iocg_activate()
1306 ioc->running = IOC_RUNNING; in iocg_activate()
1307 ioc->dfgv_period_at = now->now; in iocg_activate()
1308 ioc->dfgv_period_rem = 0; in iocg_activate()
1313 spin_unlock_irq(&ioc->lock); in iocg_activate()
1317 spin_unlock_irq(&ioc->lock); in iocg_activate()
1323 struct ioc *ioc = iocg->ioc; in iocg_kick_delay()
1325 u64 tdelta, delay, new_delay; in iocg_kick_delay() local
1329 lockdep_assert_held(&iocg->waitq.lock); in iocg_kick_delay()
1331 /* calculate the current delay in effect - 1/2 every second */ in iocg_kick_delay()
1332 tdelta = now->now - iocg->delay_at; in iocg_kick_delay()
1333 if (iocg->delay) in iocg_kick_delay()
1334 delay = iocg->delay >> div64_u64(tdelta, USEC_PER_SEC); in iocg_kick_delay()
1336 delay = 0; in iocg_kick_delay()
1338 /* calculate the new delay from the debt amount */ in iocg_kick_delay()
1340 vover = atomic64_read(&iocg->vtime) + in iocg_kick_delay()
1341 abs_cost_to_cost(iocg->abs_vdebt, hwa) - now->vnow; in iocg_kick_delay()
1343 ioc->period_us * ioc->vtime_base_rate); in iocg_kick_delay()
1351 div_u64((MAX_DELAY - MIN_DELAY) * in iocg_kick_delay()
1352 (vover_pct - MIN_DELAY_THR_PCT), in iocg_kick_delay()
1353 MAX_DELAY_THR_PCT - MIN_DELAY_THR_PCT); in iocg_kick_delay()
1356 if (new_delay > delay) { in iocg_kick_delay()
1357 iocg->delay = new_delay; in iocg_kick_delay()
1358 iocg->delay_at = now->now; in iocg_kick_delay()
1359 delay = new_delay; in iocg_kick_delay()
1362 if (delay >= MIN_DELAY) { in iocg_kick_delay()
1363 if (!iocg->indelay_since) in iocg_kick_delay()
1364 iocg->indelay_since = now->now; in iocg_kick_delay()
1365 blkcg_set_delay(blkg, delay * NSEC_PER_USEC); in iocg_kick_delay()
1368 if (iocg->indelay_since) { in iocg_kick_delay()
1369 iocg->stat.indelay_us += now->now - iocg->indelay_since; in iocg_kick_delay()
1370 iocg->indelay_since = 0; in iocg_kick_delay()
1372 iocg->delay = 0; in iocg_kick_delay()
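The iocg_kick_delay() fragments above combine two pieces: the delay currently in effect halves for every elapsed second, and a new candidate delay is interpolated linearly from how far the cgroup has overrun its budget. A sketch with illustrative constants (the kernel's MIN/MAX_DELAY values and threshold percentages differ):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical thresholds, for illustration only. */
#define MIN_DELAY_THR_PCT	25
#define MAX_DELAY_THR_PCT	50
#define MIN_DELAY		250		/* usecs */
#define MAX_DELAY		250000		/* usecs */
#define USEC_PER_SEC		1000000ULL

/* Existing delay decays by half for every elapsed second. */
static uint64_t decayed_delay(uint64_t delay, uint64_t tdelta_us)
{
	return delay >> (tdelta_us / USEC_PER_SEC);
}

/* New delay grows linearly with the overage percentage between the
 * two thresholds. */
static uint64_t delay_from_overage(int64_t vover_pct)
{
	if (vover_pct <= MIN_DELAY_THR_PCT)
		return 0;
	if (vover_pct >= MAX_DELAY_THR_PCT)
		return MAX_DELAY;
	return MIN_DELAY + (MAX_DELAY - MIN_DELAY) *
		(vover_pct - MIN_DELAY_THR_PCT) /
		(MAX_DELAY_THR_PCT - MIN_DELAY_THR_PCT);
}

int main(void)
{
	printf("100us delay, 2s later: %llu us\n",
	       (unsigned long long)decayed_delay(100, 2 * USEC_PER_SEC));	/* 25 */
	printf("37%% overage -> %llu us of new delay\n",
	       (unsigned long long)delay_from_overage(37));
	return 0;
}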
1383 lockdep_assert_held(&iocg->ioc->lock); in iocg_incur_debt()
1384 lockdep_assert_held(&iocg->waitq.lock); in iocg_incur_debt()
1385 WARN_ON_ONCE(list_empty(&iocg->active_list)); in iocg_incur_debt()
1391 if (!iocg->abs_vdebt && abs_cost) { in iocg_incur_debt()
1392 iocg->indebt_since = now->now; in iocg_incur_debt()
1393 propagate_weights(iocg, iocg->active, 0, false, now); in iocg_incur_debt()
1396 iocg->abs_vdebt += abs_cost; in iocg_incur_debt()
1398 gcs = get_cpu_ptr(iocg->pcpu_stat); in iocg_incur_debt()
1399 local64_add(abs_cost, &gcs->abs_vusage); in iocg_incur_debt()
1406 lockdep_assert_held(&iocg->ioc->lock); in iocg_pay_debt()
1407 lockdep_assert_held(&iocg->waitq.lock); in iocg_pay_debt()
1410 WARN_ON_ONCE(list_empty(&iocg->active_list)); in iocg_pay_debt()
1411 WARN_ON_ONCE(iocg->inuse > 1); in iocg_pay_debt()
1413 iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt); in iocg_pay_debt()
1416 if (!iocg->abs_vdebt) { in iocg_pay_debt()
1417 iocg->stat.indebt_us += now->now - iocg->indebt_since; in iocg_pay_debt()
1418 iocg->indebt_since = 0; in iocg_pay_debt()
1420 propagate_weights(iocg, iocg->active, iocg->last_inuse, in iocg_pay_debt()
1428 struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait); in iocg_wake_fn() local
1430 u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse); in iocg_wake_fn()
1432 ctx->vbudget -= cost; in iocg_wake_fn()
1434 if (ctx->vbudget < 0) in iocg_wake_fn()
1435 return -1; in iocg_wake_fn()
1437 iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost); in iocg_wake_fn()
1438 wait->committed = true; in iocg_wake_fn()
1441 * autoremove_wake_function() removes the wait entry only when it in iocg_wake_fn()
1442 * actually changed the task state. We want the wait always removed. in iocg_wake_fn()
1448 list_del_init_careful(&wq_entry->entry); in iocg_wake_fn()
1454 * accordingly. When @pay_debt is %true, the caller must be holding ioc->lock in
1455 * addition to iocg->waitq.lock.
1460 struct ioc *ioc = iocg->ioc; in iocg_kick_waitq()
1466 lockdep_assert_held(&iocg->waitq.lock); in iocg_kick_waitq()
1469 vbudget = now->vnow - atomic64_read(&iocg->vtime); in iocg_kick_waitq()
1472 if (pay_debt && iocg->abs_vdebt && vbudget > 0) { in iocg_kick_waitq()
1474 u64 abs_vpay = min_t(u64, abs_vbudget, iocg->abs_vdebt); in iocg_kick_waitq()
1477 lockdep_assert_held(&ioc->lock); in iocg_kick_waitq()
1479 atomic64_add(vpay, &iocg->vtime); in iocg_kick_waitq()
1480 atomic64_add(vpay, &iocg->done_vtime); in iocg_kick_waitq()
1482 vbudget -= vpay; in iocg_kick_waitq()
1485 if (iocg->abs_vdebt || iocg->delay) in iocg_kick_waitq()
1494 if (iocg->abs_vdebt) { in iocg_kick_waitq()
1495 s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hwa); in iocg_kick_waitq()
1496 vbudget = min_t(s64, 0, vbudget - vdebt); in iocg_kick_waitq()
1507 __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx); in iocg_kick_waitq()
1509 if (!waitqueue_active(&iocg->waitq)) { in iocg_kick_waitq()
1510 if (iocg->wait_since) { in iocg_kick_waitq()
1511 iocg->stat.wait_us += now->now - iocg->wait_since; in iocg_kick_waitq()
1512 iocg->wait_since = 0; in iocg_kick_waitq()
1517 if (!iocg->wait_since) in iocg_kick_waitq()
1518 iocg->wait_since = now->now; in iocg_kick_waitq()
1524 vshortage = -ctx.vbudget; in iocg_kick_waitq()
1525 expires = now->now_ns + in iocg_kick_waitq()
1526 DIV64_U64_ROUND_UP(vshortage, ioc->vtime_base_rate) * in iocg_kick_waitq()
1528 expires += ioc->timer_slack_ns; in iocg_kick_waitq()
1531 oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer)); in iocg_kick_waitq()
1532 if (hrtimer_is_queued(&iocg->waitq_timer) && in iocg_kick_waitq()
1533 abs(oexpires - expires) <= ioc->timer_slack_ns) in iocg_kick_waitq()
1536 hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires), in iocg_kick_waitq()
1537 ioc->timer_slack_ns, HRTIMER_MODE_ABS); in iocg_kick_waitq()
1543 bool pay_debt = READ_ONCE(iocg->abs_vdebt); in iocg_waitq_timer_fn()
1547 ioc_now(iocg->ioc, &now); in iocg_waitq_timer_fn()
1564 struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu); in ioc_lat_stat()
1568 u32 this_met = local_read(&stat->missed[rw].nr_met); in ioc_lat_stat()
1569 u32 this_missed = local_read(&stat->missed[rw].nr_missed); in ioc_lat_stat()
1571 nr_met[rw] += this_met - stat->missed[rw].last_met; in ioc_lat_stat()
1572 nr_missed[rw] += this_missed - stat->missed[rw].last_missed; in ioc_lat_stat()
1573 stat->missed[rw].last_met = this_met; in ioc_lat_stat()
1574 stat->missed[rw].last_missed = this_missed; in ioc_lat_stat()
1577 this_rq_wait_ns = local64_read(&stat->rq_wait_ns); in ioc_lat_stat()
1578 rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns; in ioc_lat_stat()
1579 stat->last_rq_wait_ns = this_rq_wait_ns; in ioc_lat_stat()
1592 ioc->period_us * NSEC_PER_USEC); in ioc_lat_stat()
1595 /* was iocg idle this period? */
1598 struct ioc *ioc = iocg->ioc; in iocg_is_idle()
1601 if (atomic64_read(&iocg->active_period) == in iocg_is_idle()
1602 atomic64_read(&ioc->cur_period)) in iocg_is_idle()
1606 if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime)) in iocg_is_idle()
1613 * Call this function on the target leaf @iocg's to build pre-order traversal
1615 * ->walk_list and the caller is responsible for dissolving the list after use.
1622 WARN_ON_ONCE(!list_empty(&iocg->walk_list)); in iocg_build_inner_walk()
1625 for (lvl = iocg->level - 1; lvl >= 0; lvl--) { in iocg_build_inner_walk()
1626 if (!list_empty(&iocg->ancestors[lvl]->walk_list)) in iocg_build_inner_walk()
1630 /* walk down and visit the inner nodes to get pre-order traversal */ in iocg_build_inner_walk()
1631 while (++lvl <= iocg->level - 1) { in iocg_build_inner_walk()
1632 struct ioc_gq *inner = iocg->ancestors[lvl]; in iocg_build_inner_walk()
1635 list_add_tail(&inner->walk_list, inner_walk); in iocg_build_inner_walk()
1642 if (iocg->level > 0) { in iocg_flush_stat_upward()
1644 &iocg->ancestors[iocg->level - 1]->stat; in iocg_flush_stat_upward()
1646 parent_stat->usage_us += in iocg_flush_stat_upward()
1647 iocg->stat.usage_us - iocg->last_stat.usage_us; in iocg_flush_stat_upward()
1648 parent_stat->wait_us += in iocg_flush_stat_upward()
1649 iocg->stat.wait_us - iocg->last_stat.wait_us; in iocg_flush_stat_upward()
1650 parent_stat->indebt_us += in iocg_flush_stat_upward()
1651 iocg->stat.indebt_us - iocg->last_stat.indebt_us; in iocg_flush_stat_upward()
1652 parent_stat->indelay_us += in iocg_flush_stat_upward()
1653 iocg->stat.indelay_us - iocg->last_stat.indelay_us; in iocg_flush_stat_upward()
1656 iocg->last_stat = iocg->stat; in iocg_flush_stat_upward()
1659 /* collect per-cpu counters and propagate the deltas to the parent */
1662 struct ioc *ioc = iocg->ioc; in iocg_flush_stat_leaf()
1667 lockdep_assert_held(&iocg->ioc->lock); in iocg_flush_stat_leaf()
1669 /* collect per-cpu counters */ in iocg_flush_stat_leaf()
1672 per_cpu_ptr(&iocg->pcpu_stat->abs_vusage, cpu)); in iocg_flush_stat_leaf()
1674 vusage_delta = abs_vusage - iocg->last_stat_abs_vusage; in iocg_flush_stat_leaf()
1675 iocg->last_stat_abs_vusage = abs_vusage; in iocg_flush_stat_leaf()
1677 iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate); in iocg_flush_stat_leaf()
1678 iocg->stat.usage_us += iocg->usage_delta_us; in iocg_flush_stat_leaf()
1698 list_del_init(&iocg->walk_list); in iocg_flush_stat()
1710 struct ioc *ioc = iocg->ioc; in hweight_after_donation()
1711 u64 vtime = atomic64_read(&iocg->vtime); in hweight_after_donation()
1715 if (iocg->abs_vdebt) in hweight_after_donation()
1719 if (waitqueue_active(&iocg->waitq) || in hweight_after_donation()
1720 time_after64(vtime, now->vnow - ioc->margins.min)) in hweight_after_donation()
1724 excess = now->vnow - vtime - ioc->margins.target; in hweight_after_donation()
1726 atomic64_add(excess, &iocg->vtime); in hweight_after_donation()
1727 atomic64_add(excess, &iocg->done_vtime); in hweight_after_donation()
1729 ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE); in hweight_after_donation()
1739 * new budget (1 - MARGIN_TARGET) and the leftover from the last period in hweight_after_donation()
1742 * usage = (1 - MARGIN_TARGET + delta) * new_hwi in hweight_after_donation()
1746 * new_hwi = usage / (1 - MARGIN_TARGET + delta) in hweight_after_donation()
1748 delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime), in hweight_after_donation()
1749 now->vnow - ioc->period_at_vtime); in hweight_after_donation()
1751 new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta); in hweight_after_donation()
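A quick numeric reading of the two formulas above, with assumed inputs and an assumed 50% target margin for concreteness:

#include <stdint.h>
#include <stdio.h>

#define WEIGHT_ONE	(1U << 16)	/* assumed fixed-point 100% */

int main(void)
{
	/* Assumed inputs: the iocg consumed 20% of the device last period
	 * (usage) and enters the next period with surplus budget worth 30%
	 * of a period on top of the fresh (1 - MARGIN_TARGET) allotment. */
	uint32_t target = WEIGHT_ONE / 2;	/* MARGIN_TARGET ~ 50%, an assumption */
	uint32_t usage  = WEIGHT_ONE / 5;	/* 20% */
	uint32_t delta  = WEIGHT_ONE * 3 / 10;	/* 30% */

	/* new_hwi = usage / (1 - MARGIN_TARGET + delta) = 0.2 / 0.8 */
	uint32_t new_hwi = (uint64_t)WEIGHT_ONE * usage /
			   (WEIGHT_ONE - target + delta);

	printf("new_hwi = %.1f%%\n", new_hwi * 100.0 / WEIGHT_ONE);	/* 25.0% */
	return 0;
}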
1757 * For work-conservation, an iocg which isn't using all of its share should
1758 * donate the leftover to other iocgs. There are two ways to achieve this - 1.
1762 * global hweight_inuse updates when idle iocgs get activated or inuse weights
1768 * other nodes as the impacts will be inherently correct. This also makes idle
1779 * Given the weights and target after-donation hweight_inuse values, Andy's
1781 * sibling level to maintain the relative relationship between all non-donating
1783 * non-donating parts, calculates global donation rate which is used to
1784 * determine the target hweight_inuse for each node, and then derives per-level
1791 * https://drive.google.com/file/d/1vONz1-fzVO7oY5DXXsLjSxEtYYQbOvsE
1801 * f is the sum of the absolute budgets of non-donating nodes in the subtree.
1804 * w_f is the non-donating portion of w. w_f = w * f / b
1807 * s_f and s_t are the non-donating and donating portions of s.
1809 * Subscript p denotes the parent's counterpart and ' the adjusted value - e.g.
1833 after_sum += iocg->hweight_after_donation; in transfer_surpluses()
1835 if (iocg->hweight_after_donation > hwa) { in transfer_surpluses()
1836 over_sum += iocg->hweight_after_donation; in transfer_surpluses()
1837 list_add(&iocg->walk_list, &over_hwa); in transfer_surpluses()
1846 u32 over_delta = after_sum - (WEIGHT_ONE - 1); in transfer_surpluses()
1848 over_target = over_sum - over_delta; in transfer_surpluses()
1855 iocg->hweight_after_donation = in transfer_surpluses()
1856 div_u64((u64)iocg->hweight_after_donation * in transfer_surpluses()
1858 list_del_init(&iocg->walk_list); in transfer_surpluses()
1862 * Build pre-order inner node walk list and prepare for donation in transfer_surpluses()
1870 WARN_ON_ONCE(root_iocg->level > 0); in transfer_surpluses()
1873 iocg->child_adjusted_sum = 0; in transfer_surpluses()
1874 iocg->hweight_donating = 0; in transfer_surpluses()
1875 iocg->hweight_after_donation = 0; in transfer_surpluses()
1883 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; in transfer_surpluses()
1885 parent->hweight_donating += iocg->hweight_donating; in transfer_surpluses()
1886 parent->hweight_after_donation += iocg->hweight_after_donation; in transfer_surpluses()
1890 if (iocg->level > 0) { in transfer_surpluses()
1891 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; in transfer_surpluses()
1893 parent->hweight_donating += iocg->hweight_donating; in transfer_surpluses()
1894 parent->hweight_after_donation += iocg->hweight_after_donation; in transfer_surpluses()
1904 if (iocg->level) { in transfer_surpluses()
1905 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; in transfer_surpluses()
1907 iocg->hweight_active = DIV64_U64_ROUND_UP( in transfer_surpluses()
1908 (u64)parent->hweight_active * iocg->active, in transfer_surpluses()
1909 parent->child_active_sum); in transfer_surpluses()
1913 iocg->hweight_donating = min(iocg->hweight_donating, in transfer_surpluses()
1914 iocg->hweight_active); in transfer_surpluses()
1915 iocg->hweight_after_donation = min(iocg->hweight_after_donation, in transfer_surpluses()
1916 iocg->hweight_donating - 1); in transfer_surpluses()
1917 if (WARN_ON_ONCE(iocg->hweight_active <= 1 || in transfer_surpluses()
1918 iocg->hweight_donating <= 1 || in transfer_surpluses()
1919 iocg->hweight_after_donation == 0)) { in transfer_surpluses()
1921 pr_cont_cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup); in transfer_surpluses()
1923 iocg->hweight_active, iocg->hweight_donating, in transfer_surpluses()
1924 iocg->hweight_after_donation); in transfer_surpluses()
1929 * Calculate the global donation rate (gamma) - the rate to adjust in transfer_surpluses()
1930 * non-donating budgets by. in transfer_surpluses()
1936 * hweights can't be whole; however, due to the round-ups during hweight in transfer_surpluses()
1937 * calculations, root_iocg->hweight_donating might still end up equal to in transfer_surpluses()
1940 * gamma = (1 - t_r') / (1 - t_r) in transfer_surpluses()
1943 (WEIGHT_ONE - root_iocg->hweight_after_donation) * WEIGHT_ONE, in transfer_surpluses()
1944 WEIGHT_ONE - min_t(u32, root_iocg->hweight_donating, WEIGHT_ONE - 1)); in transfer_surpluses()
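As a worked example of the gamma formula above with made-up numbers: if the donating iocgs together hold 40% of root-level hweight (t_r) and should hold 10% after donation (t_r'), then gamma = (1 - 0.1) / (1 - 0.4) = 1.5, i.e. every non-donating budget in the tree is scaled up by 50% to absorb the capacity the donors gave up.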
1955 if (iocg->level == 0) { in transfer_surpluses()
1957 iocg->child_adjusted_sum = DIV64_U64_ROUND_UP( in transfer_surpluses()
1958 iocg->child_active_sum * (WEIGHT_ONE - iocg->hweight_donating), in transfer_surpluses()
1959 WEIGHT_ONE - iocg->hweight_after_donation); in transfer_surpluses()
1963 parent = iocg->ancestors[iocg->level - 1]; in transfer_surpluses()
1966 iocg->hweight_inuse = DIV64_U64_ROUND_UP( in transfer_surpluses()
1967 (u64)gamma * (iocg->hweight_active - iocg->hweight_donating), in transfer_surpluses()
1968 WEIGHT_ONE) + iocg->hweight_after_donation; in transfer_surpluses()
1972 (u64)parent->child_adjusted_sum * iocg->hweight_inuse, in transfer_surpluses()
1973 parent->hweight_inuse); in transfer_surpluses()
1977 iocg->child_active_sum * iocg->hweight_donating, in transfer_surpluses()
1978 iocg->hweight_active); in transfer_surpluses()
1979 sf = iocg->child_active_sum - st; in transfer_surpluses()
1981 (u64)iocg->active * iocg->hweight_donating, in transfer_surpluses()
1982 iocg->hweight_active); in transfer_surpluses()
1984 (u64)inuse * iocg->hweight_after_donation, in transfer_surpluses()
1985 iocg->hweight_inuse); in transfer_surpluses()
1987 iocg->child_adjusted_sum = sf + DIV64_U64_ROUND_UP(st * wptp, wpt); in transfer_surpluses()
1991 * All inner nodes now have ->hweight_inuse and ->child_adjusted_sum and in transfer_surpluses()
1995 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; in transfer_surpluses()
1999 * In-debt iocgs participated in the donation calculation with in transfer_surpluses()
2002 * @iocg->inuse to stay at the minimum and we don't want to in transfer_surpluses()
2005 if (iocg->abs_vdebt) { in transfer_surpluses()
2006 WARN_ON_ONCE(iocg->inuse > 1); in transfer_surpluses()
2012 parent->child_adjusted_sum * iocg->hweight_after_donation, in transfer_surpluses()
2013 parent->hweight_inuse); in transfer_surpluses()
2016 iocg->inuse, inuse, in transfer_surpluses()
2017 iocg->hweight_inuse, in transfer_surpluses()
2018 iocg->hweight_after_donation); in transfer_surpluses()
2020 __propagate_weights(iocg, iocg->active, inuse, true, now); in transfer_surpluses()
2025 list_del_init(&iocg->walk_list); in transfer_surpluses()
2032 * more. If there are no other subsequent IO issuers, the in-debt iocg may end
2033 * up blocked paying its debt while the IO device is idle.
2036 * sufficiently idle for a while, the debts are halved and delays are
2047 ioc->dfgv_period_at = now->now; in ioc_forgive_debts()
2048 ioc->dfgv_period_rem = 0; in ioc_forgive_debts()
2049 ioc->dfgv_usage_us_sum = 0; in ioc_forgive_debts()
2059 if (ioc->busy_level > 0) in ioc_forgive_debts()
2060 usage_us_sum = max_t(u64, usage_us_sum, ioc->period_us); in ioc_forgive_debts()
2062 ioc->dfgv_usage_us_sum += usage_us_sum; in ioc_forgive_debts()
2063 if (time_before64(now->now, ioc->dfgv_period_at + DFGV_PERIOD)) in ioc_forgive_debts()
2070 dur = now->now - ioc->dfgv_period_at; in ioc_forgive_debts()
2071 usage_pct = div64_u64(100 * ioc->dfgv_usage_us_sum, dur); in ioc_forgive_debts()
2073 ioc->dfgv_period_at = now->now; in ioc_forgive_debts()
2074 ioc->dfgv_usage_us_sum = 0; in ioc_forgive_debts()
2078 ioc->dfgv_period_rem = 0; in ioc_forgive_debts()
2087 * run and carrying over the left-over duration in @ioc->dfgv_period_rem in ioc_forgive_debts()
2088 * - if ioc period is 75% of DFGV_PERIOD, one out of three consecutive in ioc_forgive_debts()
2091 nr_cycles = dur + ioc->dfgv_period_rem; in ioc_forgive_debts()
2092 ioc->dfgv_period_rem = do_div(nr_cycles, DFGV_PERIOD); in ioc_forgive_debts()
2094 list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { in ioc_forgive_debts()
2097 if (!iocg->abs_vdebt && !iocg->delay) in ioc_forgive_debts()
2100 spin_lock(&iocg->waitq.lock); in ioc_forgive_debts()
2102 old_debt = iocg->abs_vdebt; in ioc_forgive_debts()
2103 old_delay = iocg->delay; in ioc_forgive_debts()
2105 if (iocg->abs_vdebt) in ioc_forgive_debts()
2106 iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles ?: 1; in ioc_forgive_debts()
2107 if (iocg->delay) in ioc_forgive_debts()
2108 iocg->delay = iocg->delay >> nr_cycles ?: 1; in ioc_forgive_debts()
2113 old_debt, iocg->abs_vdebt, in ioc_forgive_debts()
2114 old_delay, iocg->delay); in ioc_forgive_debts()
2116 spin_unlock(&iocg->waitq.lock); in ioc_forgive_debts()
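The forgiveness loop above halves both debt and delay once per elapsed forgiveness cycle, flooring non-zero values at 1 rather than clearing them outright. A tiny sketch of that step (using the same GNU ?: shorthand the kernel code above uses):

#include <stdint.h>
#include <stdio.h>

/* Sketch: halve once per cycle; a non-zero value bottoms out at 1
 * instead of dropping straight to zero here. */
static uint64_t forgive(uint64_t val, unsigned int nr_cycles)
{
	if (!val)
		return 0;
	val >>= nr_cycles;
	return val ?: 1;	/* GNU shorthand for val ? val : 1 */
}

int main(void)
{
	printf("%llu\n", (unsigned long long)forgive(1000, 3));	/* 125 */
	printf("%llu\n", (unsigned long long)forgive(1000, 12));	/* floored at 1 */
	return 0;
}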
2122 * idle iocgs.
2127 * which should have woken up in the last period and expire idle
2135 list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) { in ioc_check_iocgs()
2136 if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && in ioc_check_iocgs()
2137 !iocg->delay && !iocg_is_idle(iocg)) in ioc_check_iocgs()
2140 spin_lock(&iocg->waitq.lock); in ioc_check_iocgs()
2142 /* flush wait and indebt stat deltas */ in ioc_check_iocgs()
2143 if (iocg->wait_since) { in ioc_check_iocgs()
2144 iocg->stat.wait_us += now->now - iocg->wait_since; in ioc_check_iocgs()
2145 iocg->wait_since = now->now; in ioc_check_iocgs()
2147 if (iocg->indebt_since) { in ioc_check_iocgs()
2148 iocg->stat.indebt_us += in ioc_check_iocgs()
2149 now->now - iocg->indebt_since; in ioc_check_iocgs()
2150 iocg->indebt_since = now->now; in ioc_check_iocgs()
2152 if (iocg->indelay_since) { in ioc_check_iocgs()
2153 iocg->stat.indelay_us += in ioc_check_iocgs()
2154 now->now - iocg->indelay_since; in ioc_check_iocgs()
2155 iocg->indelay_since = now->now; in ioc_check_iocgs()
2158 if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt || in ioc_check_iocgs()
2159 iocg->delay) { in ioc_check_iocgs()
2162 if (iocg->abs_vdebt || iocg->delay) in ioc_check_iocgs()
2165 /* no waiter and idle, deactivate */ in ioc_check_iocgs()
2166 u64 vtime = atomic64_read(&iocg->vtime); in ioc_check_iocgs()
2175 excess = now->vnow - vtime - ioc->margins.target; in ioc_check_iocgs()
2180 ioc->vtime_err -= div64_u64(excess * old_hwi, in ioc_check_iocgs()
2185 atomic64_read(&iocg->active_period), in ioc_check_iocgs()
2186 atomic64_read(&ioc->cur_period), vtime); in ioc_check_iocgs()
2188 list_del_init(&iocg->active_list); in ioc_check_iocgs()
2191 spin_unlock(&iocg->waitq.lock); in ioc_check_iocgs()
2206 u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM]; in ioc_timer_fn()
2207 u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; in ioc_timer_fn()
2216 spin_lock_irq(&ioc->lock); in ioc_timer_fn()
2220 period_vtime = now.vnow - ioc->period_at_vtime; in ioc_timer_fn()
2222 spin_unlock_irq(&ioc->lock); in ioc_timer_fn()
2229 * Wait and indebt stats are flushed above and the donation calculation in ioc_timer_fn()
2230 * below needs updated usage stat. Let's bring stat up-to-date. in ioc_timer_fn()
2232 iocg_flush_stat(&ioc->active_iocgs, &now); in ioc_timer_fn()
2235 list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { in ioc_timer_fn()
2243 vdone = atomic64_read(&iocg->done_vtime); in ioc_timer_fn()
2244 vtime = atomic64_read(&iocg->vtime); in ioc_timer_fn()
2249 * in-flight for longer than a period. Detect them by in ioc_timer_fn()
2254 !atomic_read(&iocg_to_blkg(iocg)->use_delay) && in ioc_timer_fn()
2256 time_after64(vtime, now.vnow - in ioc_timer_fn()
2258 time_before64(vdone, now.vnow - period_vtime)) in ioc_timer_fn()
2262 * Determine absolute usage factoring in in-flight IOs to avoid in ioc_timer_fn()
2263 * high-latency completions appearing as idle. in ioc_timer_fn()
2265 usage_us = iocg->usage_delta_us; in ioc_timer_fn()
2269 WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); in ioc_timer_fn()
2271 (!waitqueue_active(&iocg->waitq) && in ioc_timer_fn()
2272 time_before64(vtime, now.vnow - ioc->margins.low))) { in ioc_timer_fn()
2278 cost_to_abs_cost(vtime - vdone, hw_inuse), in ioc_timer_fn()
2279 ioc->vtime_base_rate); in ioc_timer_fn()
2285 if (time_after64(iocg->activated_at, ioc->period_at)) in ioc_timer_fn()
2286 usage_dur = max_t(u64, now.now - iocg->activated_at, 1); in ioc_timer_fn()
2288 usage_dur = max_t(u64, now.now - ioc->period_at, 1); in ioc_timer_fn()
2311 iocg->hweight_donating = hwa; in ioc_timer_fn()
2312 iocg->hweight_after_donation = new_hwi; in ioc_timer_fn()
2313 list_add(&iocg->surplus_list, &surpluses); in ioc_timer_fn()
2314 } else if (!iocg->abs_vdebt) { in ioc_timer_fn()
2326 iocg->inuse, iocg->active, in ioc_timer_fn()
2327 iocg->hweight_inuse, new_hwi); in ioc_timer_fn()
2329 __propagate_weights(iocg, iocg->active, in ioc_timer_fn()
2330 iocg->active, true, &now); in ioc_timer_fn()
2346 list_del_init(&iocg->surplus_list); in ioc_timer_fn()
2354 prev_busy_level = ioc->busy_level; in ioc_timer_fn()
2359 ioc->busy_level = max(ioc->busy_level, 0); in ioc_timer_fn()
2360 ioc->busy_level++; in ioc_timer_fn()
2370 ioc->busy_level = min(ioc->busy_level, 0); in ioc_timer_fn()
2373 * If there are IOs spanning multiple periods, wait in ioc_timer_fn()
2377 ioc->busy_level--; in ioc_timer_fn()
2385 ioc->busy_level = 0; in ioc_timer_fn()
2389 ioc->busy_level = 0; in ioc_timer_fn()
2392 ioc->busy_level = clamp(ioc->busy_level, -1000, 1000); in ioc_timer_fn()
2405 atomic64_inc(&ioc->cur_period); in ioc_timer_fn()
2407 if (ioc->running != IOC_STOP) { in ioc_timer_fn()
2408 if (!list_empty(&ioc->active_iocgs)) { in ioc_timer_fn()
2411 ioc->busy_level = 0; in ioc_timer_fn()
2412 ioc->vtime_err = 0; in ioc_timer_fn()
2413 ioc->running = IOC_IDLE; in ioc_timer_fn()
2419 spin_unlock_irq(&ioc->lock); in ioc_timer_fn()
2425 struct ioc *ioc = iocg->ioc; in adjust_inuse_and_calc_cost()
2426 struct ioc_margins *margins = &ioc->margins; in adjust_inuse_and_calc_cost()
2427 u32 __maybe_unused old_inuse = iocg->inuse, __maybe_unused old_hwi; in adjust_inuse_and_calc_cost()
2435 margin = now->vnow - vtime - cost; in adjust_inuse_and_calc_cost()
2438 if (iocg->abs_vdebt) in adjust_inuse_and_calc_cost()
2445 if (margin >= iocg->saved_margin || margin >= margins->low || in adjust_inuse_and_calc_cost()
2446 iocg->inuse == iocg->active) in adjust_inuse_and_calc_cost()
2449 spin_lock_irq(&ioc->lock); in adjust_inuse_and_calc_cost()
2452 if (iocg->abs_vdebt || list_empty(&iocg->active_list)) { in adjust_inuse_and_calc_cost()
2453 spin_unlock_irq(&ioc->lock); in adjust_inuse_and_calc_cost()
2459 * adj_step must be determined after acquiring ioc->lock - we might in adjust_inuse_and_calc_cost()
2461 * be reading iocg->active as 0 before ioc->lock which will lead to in adjust_inuse_and_calc_cost()
2464 new_inuse = iocg->inuse; in adjust_inuse_and_calc_cost()
2465 adj_step = DIV_ROUND_UP(iocg->active * INUSE_ADJ_STEP_PCT, 100); in adjust_inuse_and_calc_cost()
2468 propagate_weights(iocg, iocg->active, new_inuse, true, now); in adjust_inuse_and_calc_cost()
2471 } while (time_after64(vtime + cost, now->vnow) && in adjust_inuse_and_calc_cost()
2472 iocg->inuse != iocg->active); in adjust_inuse_and_calc_cost()
2474 spin_unlock_irq(&ioc->lock); in adjust_inuse_and_calc_cost()
2477 old_inuse, iocg->inuse, old_hwi, hwi); in adjust_inuse_and_calc_cost()
2485 struct ioc *ioc = iocg->ioc; in calc_vtime_cost_builtin()
2493 coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO]; in calc_vtime_cost_builtin()
2494 coef_randio = ioc->params.lcoefs[LCOEF_RRANDIO]; in calc_vtime_cost_builtin()
2495 coef_page = ioc->params.lcoefs[LCOEF_RPAGE]; in calc_vtime_cost_builtin()
2498 coef_seqio = ioc->params.lcoefs[LCOEF_WSEQIO]; in calc_vtime_cost_builtin()
2499 coef_randio = ioc->params.lcoefs[LCOEF_WRANDIO]; in calc_vtime_cost_builtin()
2500 coef_page = ioc->params.lcoefs[LCOEF_WPAGE]; in calc_vtime_cost_builtin()
2506 if (iocg->cursor) { in calc_vtime_cost_builtin()
2507 seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor); in calc_vtime_cost_builtin()
2538 *costp = pages * ioc->params.lcoefs[LCOEF_RPAGE]; in calc_size_vtime_cost_builtin()
2541 *costp = pages * ioc->params.lcoefs[LCOEF_WPAGE]; in calc_size_vtime_cost_builtin()
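Tying the cost-model fragments together: an IO is charged a per-IO base cost, chosen by whether it seeks far enough from the previous IO's end (iocg->cursor) to count as random, plus a per-page cost for its size. A hedged sketch reusing the coefficients from the calc_lcoefs() sketch earlier; the seek threshold here is hypothetical:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical threshold: IOs seeking further than this many 4k pages from
 * the previous IO's end are charged as random. */
#define RANDIO_PAGES_THR	4096

/* Sketch of the builtin linear model: seq-or-rand base cost + size cost. */
static uint64_t vtime_cost_sketch(uint64_t coef_seqio, uint64_t coef_randio,
				  uint64_t coef_page, uint64_t seek_pages,
				  uint64_t pages)
{
	uint64_t cost;

	cost = seek_pages > RANDIO_PAGES_THR ? coef_randio : coef_seqio;
	cost += pages * coef_page;
	return cost;
}

int main(void)
{
	/* coefficients as printed by the calc_lcoefs() sketch earlier */
	uint64_t page = 1073741, seqio = 300648, randio = 1675038;

	/* a 64k (16-page) read right after the previous one vs. far away */
	printf("seq  64k: %llu\n", (unsigned long long)
	       vtime_cost_sketch(seqio, randio, page, 0, 16));
	printf("rand 64k: %llu\n", (unsigned long long)
	       vtime_cost_sketch(seqio, randio, page, 100000, 16));
	return 0;
}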
2558 struct blkcg_gq *blkg = bio->bi_blkg; in ioc_rqos_throttle()
2562 struct iocg_wait wait; in ioc_rqos_throttle() local
2568 if (!ioc->enabled || !iocg || !iocg->level) in ioc_rqos_throttle()
2579 iocg->cursor = bio_end_sector(bio); in ioc_rqos_throttle()
2580 vtime = atomic64_read(&iocg->vtime); in ioc_rqos_throttle()
2585 * tests are racy but the races aren't systemic - we only miss once in ioc_rqos_throttle()
2588 if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && in ioc_rqos_throttle()
2596 * cause priority inversions are punted to @ioc->aux_iocg and charged as in ioc_rqos_throttle()
2597 * debt. Otherwise, the issuer is blocked on @iocg->waitq. Debt handling in ioc_rqos_throttle()
2598 * requires @ioc->lock, waitq handling @iocg->waitq.lock. Determine in ioc_rqos_throttle()
2602 ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt); in ioc_rqos_throttle()
2608 * is synchronized against both ioc->lock and waitq.lock and we won't in ioc_rqos_throttle()
2613 if (unlikely(list_empty(&iocg->active_list))) { in ioc_rqos_throttle()
2639 blkcg_schedule_throttle(rqos->q->disk, in ioc_rqos_throttle()
2640 (bio->bi_opf & REQ_SWAP) == REQ_SWAP); in ioc_rqos_throttle()
2646 if (!iocg->abs_vdebt && iocg->inuse != iocg->active) { in ioc_rqos_throttle()
2652 propagate_weights(iocg, iocg->active, iocg->active, true, in ioc_rqos_throttle()
2660 * or too long. Each wait entry records the absolute cost it's in ioc_rqos_throttle()
2661 * waiting for to allow re-evaluation using a custom wait entry. in ioc_rqos_throttle()
2666 * All waiters are on iocg->waitq and the wait states are in ioc_rqos_throttle()
2669 init_waitqueue_func_entry(&wait.wait, iocg_wake_fn); in ioc_rqos_throttle()
2670 wait.wait.private = current; in ioc_rqos_throttle()
2671 wait.bio = bio; in ioc_rqos_throttle()
2672 wait.abs_cost = abs_cost; in ioc_rqos_throttle()
2673 wait.committed = false; /* will be set true by waker */ in ioc_rqos_throttle()
2675 __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait); in ioc_rqos_throttle()
2682 if (wait.committed) in ioc_rqos_throttle()
2688 finish_wait(&iocg->waitq, &wait.wait); in ioc_rqos_throttle()
2694 struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg); in ioc_rqos_merge()
2702 if (!ioc->enabled || !iocg || !iocg->level) in ioc_rqos_merge()
2711 vtime = atomic64_read(&iocg->vtime); in ioc_rqos_merge()
2716 blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor) in ioc_rqos_merge()
2717 iocg->cursor = bio_end; in ioc_rqos_merge()
2723 if (rq->bio && rq->bio->bi_iocost_cost && in ioc_rqos_merge()
2724 time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) { in ioc_rqos_merge()
2734 spin_lock_irqsave(&ioc->lock, flags); in ioc_rqos_merge()
2735 spin_lock(&iocg->waitq.lock); in ioc_rqos_merge()
2737 if (likely(!list_empty(&iocg->active_list))) { in ioc_rqos_merge()
2740 blkcg_schedule_throttle(rqos->q->disk, in ioc_rqos_merge()
2741 (bio->bi_opf & REQ_SWAP) == REQ_SWAP); in ioc_rqos_merge()
2746 spin_unlock(&iocg->waitq.lock); in ioc_rqos_merge()
2747 spin_unlock_irqrestore(&ioc->lock, flags); in ioc_rqos_merge()
2752 struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg); in ioc_rqos_done_bio()
2754 if (iocg && bio->bi_iocost_cost) in ioc_rqos_done_bio()
2755 atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime); in ioc_rqos_done_bio()
2765 if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns) in ioc_rqos_done()
2781 on_q_ns = ktime_get_ns() - rq->alloc_time_ns; in ioc_rqos_done()
2782 rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns; in ioc_rqos_done()
2785 ccs = get_cpu_ptr(ioc->pcpu_stat); in ioc_rqos_done()
2788 on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC) in ioc_rqos_done()
2789 local_inc(&ccs->missed[rw].nr_met); in ioc_rqos_done()
2791 local_inc(&ccs->missed[rw].nr_missed); in ioc_rqos_done()
2793 local64_add(rq_wait_ns, &ccs->rq_wait_ns); in ioc_rqos_done()
2802 spin_lock_irq(&ioc->lock); in ioc_rqos_queue_depth_changed()
2804 spin_unlock_irq(&ioc->lock); in ioc_rqos_queue_depth_changed()
2811 blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost); in ioc_rqos_exit()
2813 spin_lock_irq(&ioc->lock); in ioc_rqos_exit()
2814 ioc->running = IOC_STOP; in ioc_rqos_exit()
2815 spin_unlock_irq(&ioc->lock); in ioc_rqos_exit()
2817 del_timer_sync(&ioc->timer); in ioc_rqos_exit()
2818 free_percpu(ioc->pcpu_stat); in ioc_rqos_exit()
2833 struct request_queue *q = disk->queue; in blk_iocost_init()
2840 return -ENOMEM; in blk_iocost_init()
2842 ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat); in blk_iocost_init()
2843 if (!ioc->pcpu_stat) { in blk_iocost_init()
2845 return -ENOMEM; in blk_iocost_init()
2849 struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu); in blk_iocost_init()
2851 for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) { in blk_iocost_init()
2852 local_set(&ccs->missed[i].nr_met, 0); in blk_iocost_init()
2853 local_set(&ccs->missed[i].nr_missed, 0); in blk_iocost_init()
2855 local64_set(&ccs->rq_wait_ns, 0); in blk_iocost_init()
2858 rqos = &ioc->rqos; in blk_iocost_init()
2859 rqos->id = RQ_QOS_COST; in blk_iocost_init()
2860 rqos->ops = &ioc_rqos_ops; in blk_iocost_init()
2861 rqos->q = q; in blk_iocost_init()
2863 spin_lock_init(&ioc->lock); in blk_iocost_init()
2864 timer_setup(&ioc->timer, ioc_timer_fn, 0); in blk_iocost_init()
2865 INIT_LIST_HEAD(&ioc->active_iocgs); in blk_iocost_init()
2867 ioc->running = IOC_IDLE; in blk_iocost_init()
2868 ioc->vtime_base_rate = VTIME_PER_USEC; in blk_iocost_init()
2869 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); in blk_iocost_init()
2870 seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock); in blk_iocost_init()
2871 ioc->period_at = ktime_to_us(ktime_get()); in blk_iocost_init()
2872 atomic64_set(&ioc->cur_period, 0); in blk_iocost_init()
2873 atomic_set(&ioc->hweight_gen, 0); in blk_iocost_init()
2875 spin_lock_irq(&ioc->lock); in blk_iocost_init()
2876 ioc->autop_idx = AUTOP_INVALID; in blk_iocost_init()
2878 spin_unlock_irq(&ioc->lock); in blk_iocost_init()
2898 free_percpu(ioc->pcpu_stat); in blk_iocost_init()
2911 iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE; in ioc_cpd_alloc()
2912 return &iocc->cpd; in ioc_cpd_alloc()
2923 int levels = blkcg->css.cgroup->level + 1; in ioc_pd_alloc()
2926 iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, q->node); in ioc_pd_alloc()
2930 iocg->pcpu_stat = alloc_percpu_gfp(struct iocg_pcpu_stat, gfp); in ioc_pd_alloc()
2931 if (!iocg->pcpu_stat) { in ioc_pd_alloc()
2936 return &iocg->pd; in ioc_pd_alloc()
2942 struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd); in ioc_pd_init()
2943 struct ioc *ioc = q_to_ioc(blkg->q); in ioc_pd_init()
2950 iocg->ioc = ioc; in ioc_pd_init()
2951 atomic64_set(&iocg->vtime, now.vnow); in ioc_pd_init()
2952 atomic64_set(&iocg->done_vtime, now.vnow); in ioc_pd_init()
2953 atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period)); in ioc_pd_init()
2954 INIT_LIST_HEAD(&iocg->active_list); in ioc_pd_init()
2955 INIT_LIST_HEAD(&iocg->walk_list); in ioc_pd_init()
2956 INIT_LIST_HEAD(&iocg->surplus_list); in ioc_pd_init()
2957 iocg->hweight_active = WEIGHT_ONE; in ioc_pd_init()
2958 iocg->hweight_inuse = WEIGHT_ONE; in ioc_pd_init()
2960 init_waitqueue_head(&iocg->waitq); in ioc_pd_init()
2961 hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); in ioc_pd_init()
2962 iocg->waitq_timer.function = iocg_waitq_timer_fn; in ioc_pd_init()
2964 iocg->level = blkg->blkcg->css.cgroup->level; in ioc_pd_init()
2966 for (tblkg = blkg; tblkg; tblkg = tblkg->parent) { in ioc_pd_init()
2968 iocg->ancestors[tiocg->level] = tiocg; in ioc_pd_init()
2971 spin_lock_irqsave(&ioc->lock, flags); in ioc_pd_init()
2973 spin_unlock_irqrestore(&ioc->lock, flags); in ioc_pd_init()
2979 struct ioc *ioc = iocg->ioc; in ioc_pd_free()
2983 spin_lock_irqsave(&ioc->lock, flags); in ioc_pd_free()
2985 if (!list_empty(&iocg->active_list)) { in ioc_pd_free()
2990 list_del_init(&iocg->active_list); in ioc_pd_free()
2993 WARN_ON_ONCE(!list_empty(&iocg->walk_list)); in ioc_pd_free()
2994 WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); in ioc_pd_free()
2996 spin_unlock_irqrestore(&ioc->lock, flags); in ioc_pd_free()
2998 hrtimer_cancel(&iocg->waitq_timer); in ioc_pd_free()
3000 free_percpu(iocg->pcpu_stat); in ioc_pd_free()
3007 struct ioc *ioc = iocg->ioc; in ioc_pd_stat()
3009 if (!ioc->enabled) in ioc_pd_stat()
3012 if (iocg->level == 0) { in ioc_pd_stat()
3014 ioc->vtime_base_rate * 10000, in ioc_pd_stat()
3019 seq_printf(s, " cost.usage=%llu", iocg->last_stat.usage_us); in ioc_pd_stat()
3022 seq_printf(s, " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu", in ioc_pd_stat()
3023 iocg->last_stat.wait_us, in ioc_pd_stat()
3024 iocg->last_stat.indebt_us, in ioc_pd_stat()
3025 iocg->last_stat.indelay_us); in ioc_pd_stat()
3031 const char *dname = blkg_dev_name(pd->blkg); in ioc_weight_prfill()
3034 if (dname && iocg->cfg_weight) in ioc_weight_prfill()
3035 seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE); in ioc_weight_prfill()
3045 seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE); in ioc_weight_show()
3047 &blkcg_policy_iocost, seq_cft(sf)->private, false); in ioc_weight_show()
3066 return -EINVAL; in ioc_weight_write()
3069 return -EINVAL; in ioc_weight_write()
3071 spin_lock_irq(&blkcg->lock); in ioc_weight_write()
3072 iocc->dfl_weight = v * WEIGHT_ONE; in ioc_weight_write()
3073 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { in ioc_weight_write()
3077 spin_lock(&iocg->ioc->lock); in ioc_weight_write()
3078 ioc_now(iocg->ioc, &now); in ioc_weight_write()
3080 spin_unlock(&iocg->ioc->lock); in ioc_weight_write()
3083 spin_unlock_irq(&blkcg->lock); in ioc_weight_write()
3103 spin_lock(&iocg->ioc->lock); in ioc_weight_write()
3104 iocg->cfg_weight = v * WEIGHT_ONE; in ioc_weight_write()
3105 ioc_now(iocg->ioc, &now); in ioc_weight_write()
3107 spin_unlock(&iocg->ioc->lock); in ioc_weight_write()
3114 return -EINVAL; in ioc_weight_write()
3120 const char *dname = blkg_dev_name(pd->blkg); in ioc_qos_prfill()
3121 struct ioc *ioc = pd_to_iocg(pd)->ioc; in ioc_qos_prfill()
3127 dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto", in ioc_qos_prfill()
3128 ioc->params.qos[QOS_RPPM] / 10000, in ioc_qos_prfill()
3129 ioc->params.qos[QOS_RPPM] % 10000 / 100, in ioc_qos_prfill()
3130 ioc->params.qos[QOS_RLAT], in ioc_qos_prfill()
3131 ioc->params.qos[QOS_WPPM] / 10000, in ioc_qos_prfill()
3132 ioc->params.qos[QOS_WPPM] % 10000 / 100, in ioc_qos_prfill()
3133 ioc->params.qos[QOS_WLAT], in ioc_qos_prfill()
3134 ioc->params.qos[QOS_MIN] / 10000, in ioc_qos_prfill()
3135 ioc->params.qos[QOS_MIN] % 10000 / 100, in ioc_qos_prfill()
3136 ioc->params.qos[QOS_MAX] / 10000, in ioc_qos_prfill()
3137 ioc->params.qos[QOS_MAX] % 10000 / 100); in ioc_qos_prfill()
3146 &blkcg_policy_iocost, seq_cft(sf)->private, false); in ioc_qos_show()
3181 disk = bdev->bd_disk; in ioc_qos_write()
3182 ioc = q_to_ioc(disk->queue); in ioc_qos_write()
3187 ioc = q_to_ioc(disk->queue); in ioc_qos_write()
3190 spin_lock_irq(&ioc->lock); in ioc_qos_write()
3191 memcpy(qos, ioc->params.qos, sizeof(qos)); in ioc_qos_write()
3192 enable = ioc->enabled; in ioc_qos_write()
3193 user = ioc->user_qos_params; in ioc_qos_write()
3194 spin_unlock_irq(&ioc->lock); in ioc_qos_write()
3261 spin_lock_irq(&ioc->lock); in ioc_qos_write()
3264 blk_stat_enable_accounting(disk->queue); in ioc_qos_write()
3265 blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); in ioc_qos_write()
3266 ioc->enabled = true; in ioc_qos_write()
3268 blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); in ioc_qos_write()
3269 ioc->enabled = false; in ioc_qos_write()
3273 memcpy(ioc->params.qos, qos, sizeof(qos)); in ioc_qos_write()
3274 ioc->user_qos_params = true; in ioc_qos_write()
3276 ioc->user_qos_params = false; in ioc_qos_write()
3280 spin_unlock_irq(&ioc->lock); in ioc_qos_write()
3285 ret = -EINVAL; in ioc_qos_write()
3294 const char *dname = blkg_dev_name(pd->blkg); in ioc_cost_model_prfill()
3295 struct ioc *ioc = pd_to_iocg(pd)->ioc; in ioc_cost_model_prfill()
3296 u64 *u = ioc->params.i_lcoefs; in ioc_cost_model_prfill()
3304 dname, ioc->user_cost_model ? "user" : "auto", in ioc_cost_model_prfill()
3315 &blkcg_policy_iocost, seq_cft(sf)->private, false); in ioc_cost_model_show()
3351 ret = blk_iocost_init(bdev->bd_disk); in ioc_cost_model_write()
3357 spin_lock_irq(&ioc->lock); in ioc_cost_model_write()
3358 memcpy(u, ioc->params.i_lcoefs, sizeof(u)); in ioc_cost_model_write()
3359 user = ioc->user_cost_model; in ioc_cost_model_write()
3360 spin_unlock_irq(&ioc->lock); in ioc_cost_model_write()
3397 spin_lock_irq(&ioc->lock); in ioc_cost_model_write()
3399 memcpy(ioc->params.i_lcoefs, u, sizeof(u)); in ioc_cost_model_write()
3400 ioc->user_cost_model = true; in ioc_cost_model_write()
3402 ioc->user_cost_model = false; in ioc_cost_model_write()
3405 spin_unlock_irq(&ioc->lock); in ioc_cost_model_write()
3411 ret = -EINVAL; in ioc_cost_model_write()