Lines Matching +full:wake +full:- +full:on +full:- +full:motion
1 // SPDX-License-Identifier: GPL-2.0-only
7 * Copyright (C) 1991-2002 Linus Torvalds
70 # include <linux/entry-common.h>
95 #include "../../io_uring/io-wq.h"
122 * at compile time and compiler optimization based on features default.
157 if (p->sched_class == &stop_sched_class) /* trumps deadline */ in __task_prio()
158 return -2; in __task_prio()
160 if (rt_prio(p->prio)) /* includes deadline */ in __task_prio()
161 return p->prio; /* [-1, 99] */ in __task_prio()
163 if (p->sched_class == &idle_sched_class) in __task_prio()
182 if (-pa < -pb) in prio_less()
185 if (-pb < -pa) in prio_less()
188 if (pa == -1) /* dl_prio() doesn't work because of stop_class above */ in prio_less()
189 return !dl_time_before(a->dl.deadline, b->dl.deadline); in prio_less()
199 if (a->core_cookie < b->core_cookie) in __sched_core_less()
202 if (a->core_cookie > b->core_cookie) in __sched_core_less()
206 if (prio_less(b, a, !!task_rq(a)->core->core_forceidle_count)) in __sched_core_less()
224 if (cookie < p->core_cookie) in rb_sched_core_cmp()
225 return -1; in rb_sched_core_cmp()
227 if (cookie > p->core_cookie) in rb_sched_core_cmp()
235 rq->core->core_task_seq++; in sched_core_enqueue()
237 if (!p->core_cookie) in sched_core_enqueue()
240 rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less); in sched_core_enqueue()
245 rq->core->core_task_seq++; in sched_core_dequeue()
248 rb_erase(&p->core_node, &rq->core_tree); in sched_core_dequeue()
249 RB_CLEAR_NODE(&p->core_node); in sched_core_dequeue()
255 * and re-examine whether the core is still in forced idle state. in sched_core_dequeue()
257 if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 && in sched_core_dequeue()
258 rq->core->core_forceidle_count && rq->curr == rq->idle) in sched_core_dequeue()
263 * Find left-most (aka, highest priority) task matching @cookie.
269 node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp); in sched_core_find()
281 struct rb_node *node = &p->core_node; in sched_core_next()
288 if (p->core_cookie != cookie) in sched_core_next()
302 * always agree on what rq has what lock.
318 raw_spin_lock_nested(&cpu_rq(t)->__lock, i++); in sched_core_lock()
327 raw_spin_unlock(&cpu_rq(t)->__lock); in sched_core_unlock()
348 cpu_rq(t)->core_enabled = enabled; in __sched_core_flip()
350 cpu_rq(cpu)->core->core_forceidle_start = 0; in __sched_core_flip()
361 cpu_rq(cpu)->core_enabled = enabled; in __sched_core_flip()
371 WARN_ON_ONCE(!RB_EMPTY_ROOT(&cpu_rq(cpu)->core_tree)); in sched_core_assert_empty()
423 * 'work'. If it is the last *again*, we rely on in sched_core_put()
426 if (!atomic_add_unless(&sched_core_count, -1, 1)) in sched_core_put()
443 * p->pi_lock
444 * rq->lock
445 * hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls)
447 * rq1->lock
448 * rq2->lock where: rq1 < rq2
452 * Normal scheduling state is serialized by rq->lock. __schedule() takes the
453 * local CPU's rq->lock, it optionally removes the task from the runqueue and
457 * Task enqueue is also under rq->lock, possibly taken from another CPU.
463 * complicated to avoid having to take two rq->locks.
467 * System-calls and anything external will use task_rq_lock() which acquires
468 * both p->pi_lock and rq->lock. As a consequence the state they change is
471 * - sched_setaffinity()/
472 * set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed
473 * - set_user_nice(): p->se.load, p->*prio
474 * - __sched_setscheduler(): p->sched_class, p->policy, p->*prio,
475 * p->se.load, p->rt_priority,
476 * p->dl.dl_{runtime, deadline, period, flags, bw, density}
477 * - sched_setnuma(): p->numa_preferred_nid
478 * - sched_move_task(): p->sched_task_group
479 * - uclamp_update_active() p->uclamp*
481 * p->state <- TASK_*:
485 * try_to_wake_up(). This latter uses p->pi_lock to serialize against
488 * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
491 * rq->lock. Non-zero indicates the task is runnable, the special
493 * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
495 * p->on_cpu <- { 0, 1 }:
498 * set before p is scheduled-in and cleared after p is scheduled-out, both
499 * under rq->lock. Non-zero indicates the task is running on its CPU.
501 * [ The astute reader will observe that it is possible for two tasks on one
502 * CPU to have ->on_cpu = 1 at the same time. ]
506 * - Don't call set_task_cpu() on a blocked task:
508 * We don't care what CPU we're not running on, this simplifies hotplug,
511 * - for try_to_wake_up(), called under p->pi_lock:
513 * This allows try_to_wake_up() to only take one rq->lock, see its comment.
515 * - for migration called under rq->lock:
521 * - for migration called under double_rq_lock():
537 raw_spin_lock_nested(&rq->__lock, subclass); in raw_spin_rq_lock_nested()
563 ret = raw_spin_trylock(&rq->__lock); in raw_spin_rq_trylock()
586 * double_rq_lock - safely lock two runqueues
604 * __task_rq_lock - lock the rq @p resides on.
607 __acquires(rq->lock) in __task_rq_lock()
611 lockdep_assert_held(&p->pi_lock); in __task_rq_lock()
628 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
631 __acquires(p->pi_lock) in task_rq_lock()
632 __acquires(rq->lock) in task_rq_lock()
637 raw_spin_lock_irqsave(&p->pi_lock, rf->flags); in task_rq_lock()
643 * ACQUIRE (rq->lock) in task_rq_lock()
644 * [S] ->on_rq = MIGRATING [L] rq = task_rq() in task_rq_lock()
645 * WMB (__set_task_cpu()) ACQUIRE (rq->lock); in task_rq_lock()
646 * [S] ->cpu = new_cpu [L] task_rq() in task_rq_lock()
647 * [L] ->on_rq in task_rq_lock()
648 * RELEASE (rq->lock) in task_rq_lock()
651 * the old rq->lock will fully serialize against the stores. in task_rq_lock()
662 raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); in task_rq_lock()
670 * RQ-clock updating methods:
682 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; in update_rq_clock_task()
685 * Since irq_time is only updated on {soft,}irq_exit, we might run into in update_rq_clock_task()
689 * When this happens, we stop ->clock_task and only update the in update_rq_clock_task()
691 * update will consume the rest. This ensures ->clock_task is in update_rq_clock_task()
694 * It does however cause some slight miss-attribution of {soft,}irq in update_rq_clock_task()
696 * the current rq->clock timestamp, except that would require using in update_rq_clock_task()
702 rq->prev_irq_time += irq_delta; in update_rq_clock_task()
703 delta -= irq_delta; in update_rq_clock_task()
704 psi_account_irqtime(rq->curr, irq_delta); in update_rq_clock_task()
709 steal -= rq->prev_steal_time_rq; in update_rq_clock_task()
714 rq->prev_steal_time_rq += steal; in update_rq_clock_task()
715 delta -= steal; in update_rq_clock_task()
719 rq->clock_task += delta; in update_rq_clock_task()
734 if (rq->clock_update_flags & RQCF_ACT_SKIP) in update_rq_clock()
739 SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED); in update_rq_clock()
740 rq->clock_update_flags |= RQCF_UPDATED; in update_rq_clock()
743 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; in update_rq_clock()
746 rq->clock += delta; in update_rq_clock()
752 * Use HR-timers to deliver accurate preemption points.
757 if (hrtimer_active(&rq->hrtick_timer)) in hrtick_clear()
758 hrtimer_cancel(&rq->hrtick_timer); in hrtick_clear()
762 * High-resolution timer tick.
774 rq->curr->sched_class->task_tick(rq, rq->curr, 1); in hrtick()
784 struct hrtimer *timer = &rq->hrtick_timer; in __hrtick_restart()
785 ktime_t time = rq->hrtick_time; in __hrtick_restart()
806 * called with rq->lock held and irqs disabled
810 struct hrtimer *timer = &rq->hrtick_timer; in hrtick_start()
818 rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta); in hrtick_start()
823 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); in hrtick_start()
830 * called with rq->lock held and irqs disabled
836 * doesn't make sense. Rely on vruntime for fairness. in hrtick_start()
839 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), in hrtick_start()
848 INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq); in hrtick_rq_init()
850 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); in hrtick_rq_init()
851 rq->hrtick_timer.function = hrtick; in hrtick_rq_init()
886 return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); in set_nr_and_not_polling()
898 typeof(ti->flags) val = READ_ONCE(ti->flags); in set_nr_if_polling()
905 if (try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED)) in set_nr_if_polling()
928 struct wake_q_node *node = &task->wake_q; in __wake_q_add()
931 * Atomically grab the task, if ->wake_q is !nil already it means in __wake_q_add()
939 if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) in __wake_q_add()
945 *head->lastp = node; in __wake_q_add()
946 head->lastp = &node->next; in __wake_q_add()
951 * wake_q_add() - queue a wakeup for 'later' waking.
959 * This function must be used as-if it were wake_up_process(); IOW the task
969 * wake_q_add_safe() - safely queue a wakeup for 'later' waking.
977 * This function must be used as-if it were wake_up_process(); IOW the task
980 * This function is essentially a task-safe equivalent to wake_q_add(). Callers
993 struct wake_q_node *node = head->first; in wake_up_q()
999 /* Task can safely be re-inserted now: */ in wake_up_q()
1000 node = node->next; in wake_up_q()
1001 task->wake_q.next = NULL; in wake_up_q()
1013 * resched_curr - mark rq's current task 'to be rescheduled now'.
1015 * On UP this means the setting of the need_resched flag, on SMP it
1016 * might also involve a cross-CPU call to trigger the scheduler on
1021 struct task_struct *curr = rq->curr; in resched_curr()
1058 * from an idle CPU. This is good for power-savings.
1066 int i, cpu = smp_processor_id(), default_cpu = -1; in get_nohz_timer_target()
1091 if (default_cpu == -1) in get_nohz_timer_target()
1102 * which is scheduled to wake up that CPU. In case of a completely
1116 if (set_nr_and_not_polling(rq->idle)) in wake_up_idle_cpu()
1125 * We just need the target to call irq_exit() and re-evaluate in wake_up_full_nohz_cpu()
1131 return true; /* Don't try to wake offline CPUs. */ in wake_up_full_nohz_cpu()
1143 * Wake up the specified CPU. If the CPU is going offline, it is the
1165 rq->idle_balance = idle_cpu(cpu); in nohz_csd_func()
1166 if (rq->idle_balance && !need_resched()) { in nohz_csd_func()
1167 rq->nohz_idle_balance = flags; in nohz_csd_func()
1180 if (rq->dl.dl_nr_running) in sched_can_stop_tick()
1187 if (rq->rt.rr_nr_running) { in sched_can_stop_tick()
1188 if (rq->rt.rr_nr_running == 1) in sched_can_stop_tick()
1198 fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running; in sched_can_stop_tick()
1207 if (rq->nr_running > 1) in sched_can_stop_tick()
1235 list_for_each_entry_rcu(child, &parent->children, siblings) { in walk_tg_tree_from()
1247 parent = parent->parent; in walk_tg_tree_from()
1262 int prio = p->static_prio - MAX_RT_PRIO; in set_load_weight()
1263 struct load_weight *load = &p->se.load; in set_load_weight()
1269 load->weight = scale_load(WEIGHT_IDLEPRIO); in set_load_weight()
1270 load->inv_weight = WMULT_IDLEPRIO; in set_load_weight()
1278 if (update_load && p->sched_class == &fair_sched_class) { in set_load_weight()
1281 load->weight = scale_load(sched_prio_to_weight[prio]); in set_load_weight()
1282 load->inv_weight = sched_prio_to_wmult[prio]; in set_load_weight()
1290 * The (slow-path) user-space triggers utilization clamp value updates which
1291 * can require updates on (fast-path) scheduler's data structures used to
1293 * While the per-CPU rq lock protects fast-path update operations, user-space
1315 * This knob only affects RT tasks that their uclamp_se->user_defined == false.
1353 return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1); in uclamp_bucket_id()
1366 uc_se->value = value; in uclamp_se_set()
1367 uc_se->bucket_id = uclamp_bucket_id(value); in uclamp_se_set()
1368 uc_se->user_defined = user_defined; in uclamp_se_set()
1377 * idle (which drops the max-clamp) by retaining the last known in uclamp_idle_value()
1378 * max-clamp. in uclamp_idle_value()
1381 rq->uclamp_flags |= UCLAMP_FLAG_IDLE; in uclamp_idle_value()
1391 /* Reset max-clamp retention only on idle exit */ in uclamp_idle_reset()
1392 if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE)) in uclamp_idle_reset()
1395 WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value); in uclamp_idle_reset()
1402 struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket; in uclamp_rq_max_value()
1403 int bucket_id = UCLAMP_BUCKETS - 1; in uclamp_rq_max_value()
1409 for ( ; bucket_id >= 0; bucket_id--) { in uclamp_rq_max_value()
1415 /* No tasks -- default clamp values */ in uclamp_rq_max_value()
1424 lockdep_assert_held(&p->pi_lock); in __uclamp_update_util_min_rt_default()
1426 uc_se = &p->uclamp_req[UCLAMP_MIN]; in __uclamp_update_util_min_rt_default()
1429 if (uc_se->user_defined) in __uclamp_update_util_min_rt_default()
1444 /* Protect updates to p->uclamp_* */ in uclamp_update_util_min_rt_default()
1454 struct uclamp_se uc_req = p->uclamp_req[clamp_id]; in uclamp_tg_restrict()
1467 tg_min = task_group(p)->uclamp[UCLAMP_MIN].value; in uclamp_tg_restrict()
1468 tg_max = task_group(p)->uclamp[UCLAMP_MAX].value; in uclamp_tg_restrict()
1478 * The effective clamp bucket index of a task depends on, by increasing
1480 * - the task specific clamp value, when explicitly requested from userspace
1481 * - the task group effective clamp value, for tasks not either in the root
1483 * - the system default clamp value, defined by the sysadmin
1502 /* Task currently refcounted: use back-annotated (effective) value */ in uclamp_eff_value()
1503 if (p->uclamp[clamp_id].active) in uclamp_eff_value()
1504 return (unsigned long)p->uclamp[clamp_id].value; in uclamp_eff_value()
1512 * When a task is enqueued on a rq, the clamp bucket currently defined by the
1513 * task's uclamp::bucket_id is refcounted on that rq. This also immediately
1516 * Tasks can have a task-specific value requested from user-space, track
1524 struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id]; in uclamp_rq_inc_id()
1525 struct uclamp_se *uc_se = &p->uclamp[clamp_id]; in uclamp_rq_inc_id()
1531 p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id); in uclamp_rq_inc_id()
1533 bucket = &uc_rq->bucket[uc_se->bucket_id]; in uclamp_rq_inc_id()
1534 bucket->tasks++; in uclamp_rq_inc_id()
1535 uc_se->active = true; in uclamp_rq_inc_id()
1537 uclamp_idle_reset(rq, clamp_id, uc_se->value); in uclamp_rq_inc_id()
1543 if (bucket->tasks == 1 || uc_se->value > bucket->value) in uclamp_rq_inc_id()
1544 bucket->value = uc_se->value; in uclamp_rq_inc_id()
1546 if (uc_se->value > READ_ONCE(uc_rq->value)) in uclamp_rq_inc_id()
1547 WRITE_ONCE(uc_rq->value, uc_se->value); in uclamp_rq_inc_id()
1562 struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id]; in uclamp_rq_dec_id()
1563 struct uclamp_se *uc_se = &p->uclamp[clamp_id]; in uclamp_rq_dec_id()
1574 * In this case the uc_se->active flag should be false since no uclamp in uclamp_rq_dec_id()
1585 * // Must not decrement bucket->tasks here in uclamp_rq_dec_id()
1589 * bucket[uc_se->bucket_id]. in uclamp_rq_dec_id()
1593 if (unlikely(!uc_se->active)) in uclamp_rq_dec_id()
1596 bucket = &uc_rq->bucket[uc_se->bucket_id]; in uclamp_rq_dec_id()
1598 SCHED_WARN_ON(!bucket->tasks); in uclamp_rq_dec_id()
1599 if (likely(bucket->tasks)) in uclamp_rq_dec_id()
1600 bucket->tasks--; in uclamp_rq_dec_id()
1602 uc_se->active = false; in uclamp_rq_dec_id()
1610 if (likely(bucket->tasks)) in uclamp_rq_dec_id()
1613 rq_clamp = READ_ONCE(uc_rq->value); in uclamp_rq_dec_id()
1618 SCHED_WARN_ON(bucket->value > rq_clamp); in uclamp_rq_dec_id()
1619 if (bucket->value >= rq_clamp) { in uclamp_rq_dec_id()
1620 bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value); in uclamp_rq_dec_id()
1621 WRITE_ONCE(uc_rq->value, bkt_clamp); in uclamp_rq_dec_id()
1638 if (unlikely(!p->sched_class->uclamp_enabled)) in uclamp_rq_inc()
1645 if (rq->uclamp_flags & UCLAMP_FLAG_IDLE) in uclamp_rq_inc()
1646 rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE; in uclamp_rq_inc()
1662 if (unlikely(!p->sched_class->uclamp_enabled)) in uclamp_rq_dec()
1672 if (!p->uclamp[clamp_id].active) in uclamp_rq_reinc_id()
1680 * active tasks on rq. in uclamp_rq_reinc_id()
1682 if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE)) in uclamp_rq_reinc_id()
1683 rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE; in uclamp_rq_reinc_id()
1738 uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN], in uclamp_update_root_tg()
1740 uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX], in uclamp_update_root_tg()
1800 result = -EINVAL; in sysctl_sched_uclamp_handler()
1848 int util_min = p->uclamp_req[UCLAMP_MIN].value; in uclamp_validate()
1849 int util_max = p->uclamp_req[UCLAMP_MAX].value; in uclamp_validate()
1851 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) { in uclamp_validate()
1852 util_min = attr->sched_util_min; in uclamp_validate()
1855 return -EINVAL; in uclamp_validate()
1858 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) { in uclamp_validate()
1859 util_max = attr->sched_util_max; in uclamp_validate()
1862 return -EINVAL; in uclamp_validate()
1865 if (util_min != -1 && util_max != -1 && util_min > util_max) in uclamp_validate()
1866 return -EINVAL; in uclamp_validate()
1884 /* Reset on sched class change for a non user-defined clamp value. */ in uclamp_reset()
1885 if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) && in uclamp_reset()
1886 !uc_se->user_defined) in uclamp_reset()
1889 /* Reset on sched_util_{min,max} == -1. */ in uclamp_reset()
1891 attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN && in uclamp_reset()
1892 attr->sched_util_min == -1) { in uclamp_reset()
1897 attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX && in uclamp_reset()
1898 attr->sched_util_max == -1) { in uclamp_reset()
1911 struct uclamp_se *uc_se = &p->uclamp_req[clamp_id]; in __setscheduler_uclamp()
1930 if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))) in __setscheduler_uclamp()
1933 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN && in __setscheduler_uclamp()
1934 attr->sched_util_min != -1) { in __setscheduler_uclamp()
1935 uclamp_se_set(&p->uclamp_req[UCLAMP_MIN], in __setscheduler_uclamp()
1936 attr->sched_util_min, true); in __setscheduler_uclamp()
1939 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX && in __setscheduler_uclamp()
1940 attr->sched_util_max != -1) { in __setscheduler_uclamp()
1941 uclamp_se_set(&p->uclamp_req[UCLAMP_MAX], in __setscheduler_uclamp()
1942 attr->sched_util_max, true); in __setscheduler_uclamp()
1951 * We don't need to hold task_rq_lock() when updating p->uclamp_* here in uclamp_fork()
1955 p->uclamp[clamp_id].active = false; in uclamp_fork()
1957 if (likely(!p->sched_reset_on_fork)) in uclamp_fork()
1961 uclamp_se_set(&p->uclamp_req[clamp_id], in uclamp_fork()
1974 struct uclamp_rq *uc_rq = rq->uclamp; in init_uclamp_rq()
1982 rq->uclamp_flags = UCLAMP_FLAG_IDLE; in init_uclamp_rq()
2016 return -EOPNOTSUPP; in uclamp_validate()
2039 raw_spin_lock_irq(&p->pi_lock); in get_wchan()
2040 state = READ_ONCE(p->__state); in get_wchan()
2042 if (state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq) in get_wchan()
2044 raw_spin_unlock_irq(&p->pi_lock); in get_wchan()
2060 p->sched_class->enqueue_task(rq, p, flags); in enqueue_task()
2080 p->sched_class->dequeue_task(rq, p, flags); in dequeue_task()
2087 p->on_rq = TASK_ON_RQ_QUEUED; in activate_task()
2092 p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING; in deactivate_task()
2102 prio = MAX_DL_PRIO - 1; in __normal_prio()
2104 prio = MAX_RT_PRIO - 1 - rt_prio; in __normal_prio()
2113 * without taking RT-inheritance into account. Might be
2120 return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio)); in normal_prio()
2128 * RT-boosted. If not then it returns p->normal_prio.
2132 p->normal_prio = normal_prio(p); in effective_prio()
2138 if (!rt_prio(p->prio)) in effective_prio()
2139 return p->normal_prio; in effective_prio()
2140 return p->prio; in effective_prio()
2144 * task_curr - is this task currently executing on a CPU?
2155 * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
2165 if (prev_class != p->sched_class) { in check_class_changed()
2166 if (prev_class->switched_from) in check_class_changed()
2167 prev_class->switched_from(rq, p); in check_class_changed()
2169 p->sched_class->switched_to(rq, p); in check_class_changed()
2170 } else if (oldprio != p->prio || dl_task(p)) in check_class_changed()
2171 p->sched_class->prio_changed(rq, p, oldprio); in check_class_changed()
2176 if (p->sched_class == rq->curr->sched_class) in check_preempt_curr()
2177 rq->curr->sched_class->check_preempt_curr(rq, p, flags); in check_preempt_curr()
2178 else if (sched_class_above(p->sched_class, rq->curr->sched_class)) in check_preempt_curr()
2185 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) in check_preempt_curr()
2200 if (likely(!p->migration_disabled)) in migrate_disable_switch()
2203 if (p->cpus_ptr != &p->cpus_mask) in migrate_disable_switch()
2209 __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE); in migrate_disable_switch()
2216 if (p->migration_disabled) { in migrate_disable()
2217 p->migration_disabled++; in migrate_disable()
2222 this_rq()->nr_pinned++; in migrate_disable()
2223 p->migration_disabled = 1; in migrate_disable()
2232 if (p->migration_disabled > 1) { in migrate_enable()
2233 p->migration_disabled--; in migrate_enable()
2237 if (WARN_ON_ONCE(!p->migration_disabled)) in migrate_enable()
2245 if (p->cpus_ptr != &p->cpus_mask) in migrate_enable()
2246 __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE); in migrate_enable()
2253 p->migration_disabled = 0; in migrate_enable()
2254 this_rq()->nr_pinned--; in migrate_enable()
2261 return rq->nr_pinned; in rq_has_pinned_tasks()
2265 * Per-CPU kthreads are allowed to run on !active && online CPUs, see
2271 if (!cpumask_test_cpu(cpu, p->cpus_ptr)) in is_cpu_allowed()
2279 if (!(p->flags & PF_KTHREAD)) in is_cpu_allowed()
2297 * 1) we invoke migration_cpu_stop() on the target CPU using
2309 * move_queued_task - move a queued task to new rq.
2354 * attempting to rebalance this task on exec (sched_exec).
2357 * as the task is no longer on this CPU.
2373 * migration_cpu_stop - this will be executed by a highprio stopper thread
2380 struct set_affinity_pending *pending = arg->pending; in migration_cpu_stop()
2381 struct task_struct *p = arg->task; in migration_cpu_stop()
2388 * be on another CPU but it doesn't matter. in migration_cpu_stop()
2392 * We need to explicitly wake pending tasks before running in migration_cpu_stop()
2398 raw_spin_lock(&p->pi_lock); in migration_cpu_stop()
2402 * If we were passed a pending, then ->stop_pending was set, thus in migration_cpu_stop()
2403 * p->migration_pending must have remained stable. in migration_cpu_stop()
2405 WARN_ON_ONCE(pending && pending != p->migration_pending); in migration_cpu_stop()
2409 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because in migration_cpu_stop()
2410 * we're holding p->pi_lock. in migration_cpu_stop()
2417 p->migration_pending = NULL; in migration_cpu_stop()
2420 if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) in migration_cpu_stop()
2425 rq = __migrate_task(rq, &rf, p, arg->dest_cpu); in migration_cpu_stop()
2427 p->wake_cpu = arg->dest_cpu; in migration_cpu_stop()
2431 * up running on a dodgy CPU, AFAICT this can only happen in migration_cpu_stop()
2448 * ->pi_lock, so the allowed mask is stable - if it got in migration_cpu_stop()
2451 if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) { in migration_cpu_stop()
2452 p->migration_pending = NULL; in migration_cpu_stop()
2458 * When migrate_enable() hits a rq mis-match we can't reliably in migration_cpu_stop()
2462 WARN_ON_ONCE(!pending->stop_pending); in migration_cpu_stop()
2465 &pending->arg, &pending->stop_work); in migration_cpu_stop()
2470 pending->stop_pending = false; in migration_cpu_stop()
2474 complete_all(&pending->done); in migration_cpu_stop()
2484 raw_spin_lock_irq(&p->pi_lock); in push_cpu_stop()
2491 p->migration_flags |= MDF_PUSH; in push_cpu_stop()
2495 p->migration_flags &= ~MDF_PUSH; in push_cpu_stop()
2497 if (p->sched_class->find_lock_rq) in push_cpu_stop()
2498 lowest_rq = p->sched_class->find_lock_rq(p, rq); in push_cpu_stop()
2506 set_task_cpu(p, lowest_rq->cpu); in push_cpu_stop()
2514 rq->push_busy = false; in push_cpu_stop()
2516 raw_spin_unlock_irq(&p->pi_lock); in push_cpu_stop()
2529 p->cpus_ptr = new_mask; in set_cpus_allowed_common()
2533 cpumask_copy(&p->cpus_mask, new_mask); in set_cpus_allowed_common()
2534 p->nr_cpus_allowed = cpumask_weight(new_mask); in set_cpus_allowed_common()
2545 * supposed to change these variables while holding both rq->lock and in __do_set_cpus_allowed()
2546 * p->pi_lock. in __do_set_cpus_allowed()
2549 * accesses these variables under p->pi_lock and only does so after in __do_set_cpus_allowed()
2550 * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule() in __do_set_cpus_allowed()
2556 SCHED_WARN_ON(!p->on_cpu); in __do_set_cpus_allowed()
2558 lockdep_assert_held(&p->pi_lock); in __do_set_cpus_allowed()
2565 * Because __kthread_bind() calls this on blocked tasks without in __do_set_cpus_allowed()
2566 * holding rq->lock. in __do_set_cpus_allowed()
2574 p->sched_class->set_cpus_allowed(p, new_mask, flags); in __do_set_cpus_allowed()
2590 if (!src->user_cpus_ptr) in dup_user_cpus_ptr()
2593 dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node); in dup_user_cpus_ptr()
2594 if (!dst->user_cpus_ptr) in dup_user_cpus_ptr()
2595 return -ENOMEM; in dup_user_cpus_ptr()
2597 cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr); in dup_user_cpus_ptr()
2605 swap(p->user_cpus_ptr, user_mask); in clear_user_cpus_ptr()
2620 * designated task is enqueued on an allowed CPU. If that task is currently
2623 * Migrate-Disable comes along and tramples all over our nice sandcastle.
2626 * Initial conditions: P0->cpus_mask = [0, 1]
2635 * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region).
2648 * `--> <woken on migration completion>
2650 * Now the fun stuff: there may be several P1-like tasks, i.e. multiple
2652 * task p are serialized by p->pi_lock, which we can leverage: the one that
2653 * should come into effect at the end of the Migrate-Disable region is the last
2654 * one. This means we only need to track a single cpumask (i.e. p->cpus_mask),
2659 * __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will
2660 * setup an instance of that struct and install it on the targeted task_struct.
2663 * on the end of the Migrate-Disable region (i.e. outermost migrate_enable()).
2669 * Migrate-Disable. Consider:
2671 * Initial conditions: P0->cpus_mask = [0, 1]
2689 * p->migration_pending done with p->pi_lock held.
2697 /* Can the task run on the task's current CPU? If so, we're done */ in affine_move_task()
2698 if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) { in affine_move_task()
2702 (p->migration_flags & MDF_PUSH) && !rq->push_busy) { in affine_move_task()
2703 rq->push_busy = true; in affine_move_task()
2711 pending = p->migration_pending; in affine_move_task()
2712 if (pending && !pending->stop_pending) { in affine_move_task()
2713 p->migration_pending = NULL; in affine_move_task()
2720 stop_one_cpu_nowait(rq->cpu, push_cpu_stop, in affine_move_task()
2721 p, &rq->push_work); in affine_move_task()
2725 complete_all(&pending->done); in affine_move_task()
2731 /* serialized by p->pi_lock */ in affine_move_task()
2732 if (!p->migration_pending) { in affine_move_task()
2742 p->migration_pending = &my_pending; in affine_move_task()
2744 pending = p->migration_pending; in affine_move_task()
2745 refcount_inc(&pending->refs); in affine_move_task()
2750 * task on a disallowed CPU. in affine_move_task()
2752 * Serialized by p->pi_lock, so this is safe. in affine_move_task()
2754 pending->arg.dest_cpu = dest_cpu; in affine_move_task()
2757 pending = p->migration_pending; in affine_move_task()
2759 * - !MIGRATE_ENABLE: in affine_move_task()
2762 * - MIGRATE_ENABLE: in affine_move_task()
2772 return -EINVAL; in affine_move_task()
2775 if (task_on_cpu(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) { in affine_move_task()
2779 * and have the stopper function handle it all race-free. in affine_move_task()
2781 stop_pending = pending->stop_pending; in affine_move_task()
2783 pending->stop_pending = true; in affine_move_task()
2786 p->migration_flags &= ~MDF_PUSH; in affine_move_task()
2792 &pending->arg, &pending->stop_work); in affine_move_task()
2803 if (!pending->stop_pending) { in affine_move_task()
2804 p->migration_pending = NULL; in affine_move_task()
2811 complete_all(&pending->done); in affine_move_task()
2814 wait_for_completion(&pending->done); in affine_move_task()
2816 if (refcount_dec_and_test(&pending->refs)) in affine_move_task()
2817 wake_up_var(&pending->refs); /* No UaF, just an address */ in affine_move_task()
2832 * Called with both p->pi_lock and rq->lock held; drops both before returning.
2839 __releases(rq->lock) in __set_cpus_allowed_ptr_locked()
2840 __releases(p->pi_lock) in __set_cpus_allowed_ptr_locked()
2844 bool kthread = p->flags & PF_KTHREAD; in __set_cpus_allowed_ptr_locked()
2853 * Kernel threads are allowed on online && !active CPUs, in __set_cpus_allowed_ptr_locked()
2854 * however, during cpu-hot-unplug, even these might get pushed in __set_cpus_allowed_ptr_locked()
2858 * cpumask_any_and_distribute() pick below, esp. so on in __set_cpus_allowed_ptr_locked()
2860 * set_cpus_allowed_common() and actually reset p->cpus_ptr. in __set_cpus_allowed_ptr_locked()
2866 ret = -EINVAL; in __set_cpus_allowed_ptr_locked()
2871 * Must re-check here, to close a race against __kthread_bind(), in __set_cpus_allowed_ptr_locked()
2874 if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) { in __set_cpus_allowed_ptr_locked()
2875 ret = -EINVAL; in __set_cpus_allowed_ptr_locked()
2880 if (cpumask_equal(&p->cpus_mask, new_mask)) in __set_cpus_allowed_ptr_locked()
2886 ret = -EBUSY; in __set_cpus_allowed_ptr_locked()
2898 ret = -EINVAL; in __set_cpus_allowed_ptr_locked()
2921 * proper CPU and schedule it away if the CPU it's executing on
2947 * and pointing @p->user_cpus_ptr to a copy of the old mask.
2949 * -EINVAL.
2960 if (!p->user_cpus_ptr) { in restrict_cpus_allowed_ptr()
2963 return -ENOMEM; in restrict_cpus_allowed_ptr()
2974 err = -EPERM; in restrict_cpus_allowed_ptr()
2978 if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) { in restrict_cpus_allowed_ptr()
2979 err = -EINVAL; in restrict_cpus_allowed_ptr()
2985 * the user asked for in case we're able to restore it later on. in restrict_cpus_allowed_ptr()
2988 cpumask_copy(user_mask, p->cpus_ptr); in restrict_cpus_allowed_ptr()
2989 p->user_cpus_ptr = user_mask; in restrict_cpus_allowed_ptr()
3002 * task_cpu_possible_mask() and point @p->user_cpu_ptr to a copy of the
3027 * task, so override it based on its cpuset hierarchy. in force_compatible_cpus_allowed_ptr()
3035 task_pid_nr(p), p->comm, in force_compatible_cpus_allowed_ptr()
3051 * @p->user_cpus_ptr.
3058 struct cpumask *user_mask = p->user_cpus_ptr; in relax_compatible_cpus_allowed_ptr()
3069 raw_spin_lock_irqsave(&p->pi_lock, flags); in relax_compatible_cpus_allowed_ptr()
3071 raw_spin_unlock_irqrestore(&p->pi_lock, flags); in relax_compatible_cpus_allowed_ptr()
3079 unsigned int state = READ_ONCE(p->__state); in set_task_cpu()
3082 * We should never call set_task_cpu() on a blocked task, in set_task_cpu()
3085 WARN_ON_ONCE(state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq); in set_task_cpu()
3088 * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING, in set_task_cpu()
3090 * time relying on p->on_rq. in set_task_cpu()
3093 p->sched_class == &fair_sched_class && in set_task_cpu()
3094 (p->on_rq && !task_on_rq_migrating(p))); in set_task_cpu()
3098 * The caller should hold either p->pi_lock or rq->lock, when changing in set_task_cpu()
3099 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. in set_task_cpu()
3107 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || in set_task_cpu()
3121 if (p->sched_class->migrate_task_rq) in set_task_cpu()
3122 p->sched_class->migrate_task_rq(p, new_cpu); in set_task_cpu()
3123 p->se.nr_migrations++; in set_task_cpu()
3155 * it before it went to sleep. This means on wakeup we make the in __migrate_swap_task()
3158 p->wake_cpu = cpu; in __migrate_swap_task()
3171 int ret = -EAGAIN; in migrate_swap_stop()
3173 if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu)) in migrate_swap_stop()
3174 return -EAGAIN; in migrate_swap_stop()
3176 src_rq = cpu_rq(arg->src_cpu); in migrate_swap_stop()
3177 dst_rq = cpu_rq(arg->dst_cpu); in migrate_swap_stop()
3179 double_raw_lock(&arg->src_task->pi_lock, in migrate_swap_stop()
3180 &arg->dst_task->pi_lock); in migrate_swap_stop()
3183 if (task_cpu(arg->dst_task) != arg->dst_cpu) in migrate_swap_stop()
3186 if (task_cpu(arg->src_task) != arg->src_cpu) in migrate_swap_stop()
3189 if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr)) in migrate_swap_stop()
3192 if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr)) in migrate_swap_stop()
3195 __migrate_swap_task(arg->src_task, arg->dst_cpu); in migrate_swap_stop()
3196 __migrate_swap_task(arg->dst_task, arg->src_cpu); in migrate_swap_stop()
3202 raw_spin_unlock(&arg->dst_task->pi_lock); in migrate_swap_stop()
3203 raw_spin_unlock(&arg->src_task->pi_lock); in migrate_swap_stop()
3215 int ret = -EINVAL; in migrate_swap()
3229 * will be re-checked with proper locks held further down the line. in migrate_swap()
3234 if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr)) in migrate_swap()
3237 if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr)) in migrate_swap()
3249 * wait_task_inactive - wait for a thread to unschedule.
3274 * any task-queue locks at all. We'll only try to get in wait_task_inactive()
3281 * If the task is actively running on another CPU in wait_task_inactive()
3282 * still, just relax and busy-wait without holding in wait_task_inactive()
3292 if (!(READ_ONCE(p->__state) & match_state)) in wait_task_inactive()
3307 if (READ_ONCE(p->__state) & match_state) in wait_task_inactive()
3308 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ in wait_task_inactive()
3335 * yield - it could be a while. in wait_task_inactive()
3357 * kick_process - kick a running thread to enter/exit the kernel
3358 * @p: the to-be-kicked thread
3360 * Cause a process which is running on another CPU to enter
3361 * kernel-mode, without any delay. (to get signals handled.)
3382 * ->cpus_ptr is protected by both rq->lock and p->pi_lock
3384 * A few notes on cpu_active vs cpu_online:
3386 * - cpu_active must be a subset of cpu_online
3388 * - on CPU-up we allow per-CPU kthreads on the online && !active CPU,
3393 * - on CPU-down we clear cpu_active() to mask the sched domains and
3394 * avoid the load balancer to place new tasks on the to be removed
3411 * If the node that the CPU is on has been offlined, cpu_to_node() in select_fallback_rq()
3412 * will return -1. There is no CPU on the node, and we should in select_fallback_rq()
3413 * select the CPU on the other node. in select_fallback_rq()
3415 if (nid != -1) { in select_fallback_rq()
3427 for_each_cpu(dest_cpu, p->cpus_ptr) { in select_fallback_rq()
3445 * hold p->pi_lock and again violate locking order. in select_fallback_rq()
3465 if (p->mm && printk_ratelimit()) { in select_fallback_rq()
3467 task_pid_nr(p), p->comm, cpu); in select_fallback_rq()
3475 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
3480 lockdep_assert_held(&p->pi_lock); in select_task_rq()
3482 if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) in select_task_rq()
3483 cpu = p->sched_class->select_task_rq(p, cpu, wake_flags); in select_task_rq()
3485 cpu = cpumask_any(p->cpus_ptr); in select_task_rq()
3488 * In order not to call set_task_cpu() on a blocking task we need in select_task_rq()
3489 * to rely on ttwu() to place the task on a valid ->cpus_ptr in select_task_rq()
3494 * [ this allows ->select_task() to simply return task_cpu(p) and in select_task_rq()
3506 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; in sched_set_stop_task()
3507 struct task_struct *old_stop = cpu_rq(cpu)->stop; in sched_set_stop_task()
3515 * much confusion -- but then, stop work should not in sched_set_stop_task()
3516 * rely on PI working anyway. in sched_set_stop_task()
3520 stop->sched_class = &stop_sched_class; in sched_set_stop_task()
3523 * The PI code calls rt_mutex_setprio() with ->pi_lock held to in sched_set_stop_task()
3529 * The stop task itself will never be part of the PI-chain, it in sched_set_stop_task()
3530 * never blocks, therefore that ->pi_lock recursion is safe. in sched_set_stop_task()
3531 * Tell lockdep about this by placing the stop->pi_lock in its in sched_set_stop_task()
3534 lockdep_set_class(&stop->pi_lock, &stop_pi_lock); in sched_set_stop_task()
3537 cpu_rq(cpu)->stop = stop; in sched_set_stop_task()
3544 old_stop->sched_class = &rt_sched_class; in sched_set_stop_task()
3577 if (cpu == rq->cpu) { in ttwu_stat()
3578 __schedstat_inc(rq->ttwu_local); in ttwu_stat()
3579 __schedstat_inc(p->stats.nr_wakeups_local); in ttwu_stat()
3583 __schedstat_inc(p->stats.nr_wakeups_remote); in ttwu_stat()
3585 for_each_domain(rq->cpu, sd) { in ttwu_stat()
3587 __schedstat_inc(sd->ttwu_wake_remote); in ttwu_stat()
3595 __schedstat_inc(p->stats.nr_wakeups_migrate); in ttwu_stat()
3598 __schedstat_inc(rq->ttwu_count); in ttwu_stat()
3599 __schedstat_inc(p->stats.nr_wakeups); in ttwu_stat()
3602 __schedstat_inc(p->stats.nr_wakeups_sync); in ttwu_stat()
3606 * Mark the task runnable and perform wakeup-preemption.
3612 WRITE_ONCE(p->__state, TASK_RUNNING); in ttwu_do_wakeup()
3616 if (p->sched_class->task_woken) { in ttwu_do_wakeup()
3619 * drop the rq->lock, hereafter rq is only used for statistics. in ttwu_do_wakeup()
3622 p->sched_class->task_woken(rq, p); in ttwu_do_wakeup()
3626 if (rq->idle_stamp) { in ttwu_do_wakeup()
3627 u64 delta = rq_clock(rq) - rq->idle_stamp; in ttwu_do_wakeup()
3628 u64 max = 2*rq->max_idle_balance_cost; in ttwu_do_wakeup()
3630 update_avg(&rq->avg_idle, delta); in ttwu_do_wakeup()
3632 if (rq->avg_idle > max) in ttwu_do_wakeup()
3633 rq->avg_idle = max; in ttwu_do_wakeup()
3635 rq->wake_stamp = jiffies; in ttwu_do_wakeup()
3636 rq->wake_avg_idle = rq->avg_idle / 2; in ttwu_do_wakeup()
3638 rq->idle_stamp = 0; in ttwu_do_wakeup()
3651 if (p->sched_contributes_to_load) in ttwu_do_activate()
3652 rq->nr_uninterruptible--; in ttwu_do_activate()
3659 if (p->in_iowait) { in ttwu_do_activate()
3661 atomic_dec(&task_rq(p)->nr_iowait); in ttwu_do_activate()
3682 * runnable, so all that needs doing is change p->state back to TASK_RUNNING in
3685 * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq
3686 * then schedule() must still happen and p->state can be changed to
3723 * rq::ttwu_pending racy indication of out-standing wakeups. in sched_ttwu_pending()
3724 * Races such that false-negatives are possible, since they in sched_ttwu_pending()
3725 * are shorter lived than false-positives would be. in sched_ttwu_pending()
3727 WRITE_ONCE(rq->ttwu_pending, 0); in sched_ttwu_pending()
3733 if (WARN_ON_ONCE(p->on_cpu)) in sched_ttwu_pending()
3734 smp_cond_load_acquire(&p->on_cpu, !VAL); in sched_ttwu_pending()
3739 ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf); in sched_ttwu_pending()
3749 if (!set_nr_if_polling(rq->idle)) in send_call_function_single_ipi()
3756 * Queue a task on the target CPUs wake_list and wake the CPU via IPI if
3757 * necessary. The wakee CPU on receipt of the IPI will queue the task
3765 p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); in __ttwu_queue_wakelist()
3767 WRITE_ONCE(rq->ttwu_pending, 1); in __ttwu_queue_wakelist()
3768 __smp_call_single_queue(cpu, &p->wake_entry.llist); in __ttwu_queue_wakelist()
3778 if (!is_idle_task(rcu_dereference(rq->curr))) in wake_up_if_idle()
3782 if (is_idle_task(rq->curr)) in wake_up_if_idle()
3808 /* Ensure the task will still be allowed to run on the CPU. */ in ttwu_queue_cond()
3809 if (!cpumask_test_cpu(cpu, p->cpus_ptr)) in ttwu_queue_cond()
3813 * If the CPU does not share cache, then queue the task on the in ttwu_queue_cond()
3824 * only running task on the CPU, then use the wakelist to offload in ttwu_queue_cond()
3825 * the task activation to the idle (or soon-to-be-idle) CPU as in ttwu_queue_cond()
3829 * Note that we can only get here with (wakee) p->on_rq=0, in ttwu_queue_cond()
3830 * p->on_cpu can be whatever, we've done the dequeue, so in ttwu_queue_cond()
3831 * the wakee has been accounted out of ->nr_running. in ttwu_queue_cond()
3833 if (!cpu_rq(cpu)->nr_running) in ttwu_queue_cond()
3895 if (READ_ONCE(p->__state) & state) { in ttwu_state_match()
3902 * Saved state preserves the task state across blocking on in ttwu_state_match()
3904 * TASK_RUNNING, but do not wake the task because it waits in ttwu_state_match()
3914 if (p->saved_state & state) { in ttwu_state_match()
3915 p->saved_state = TASK_RUNNING; in ttwu_state_match()
3923 * Notes on Program-Order guarantees on SMP systems.
3927 * The basic program-order guarantee on SMP systems is that when a task [t]
3928 * migrates, all its activity on its old CPU [c0] happens-before any subsequent
3929 * execution on its new CPU [c1].
3933 * A) UNLOCK of the rq(c0)->lock scheduling out task t
3934 * B) migration for t is required to synchronize *both* rq(c0)->lock and
3935 * rq(c1)->lock (if not at the same time, then in that order).
3936 * C) LOCK of the rq(c1)->lock scheduling in task
3945 * LOCK rq(0)->lock
3946 * sched-out X
3947 * sched-in Y
3948 * UNLOCK rq(0)->lock
3950 * LOCK rq(0)->lock // orders against CPU0
3952 * UNLOCK rq(0)->lock
3954 * LOCK rq(1)->lock
3956 * UNLOCK rq(1)->lock
3958 * LOCK rq(1)->lock // orders against CPU2
3959 * sched-out Z
3960 * sched-in X
3961 * UNLOCK rq(1)->lock
3964 * BLOCKING -- aka. SLEEP + WAKEUP
3970 * 1) smp_store_release(X->on_cpu, 0) -- finish_task()
3971 * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up()
3977 * LOCK rq(0)->lock LOCK X->pi_lock
3979 * sched-out X
3980 * smp_store_release(X->on_cpu, 0);
3982 * smp_cond_load_acquire(&X->on_cpu, !VAL);
3983 * X->state = WAKING
3986 * LOCK rq(2)->lock
3988 * X->state = RUNNING
3989 * UNLOCK rq(2)->lock
3991 * LOCK rq(2)->lock // orders against CPU1
3992 * sched-out Z
3993 * sched-in X
3994 * UNLOCK rq(2)->lock
3996 * UNLOCK X->pi_lock
3997 * UNLOCK rq(0)->lock
4006 * try_to_wake_up - wake up a thread
4009 * @wake_flags: wake modifier flags (WF_*)
4013 * If (@state & @p->state) @p->state = TASK_RUNNING.
4015 * If the task was not queued/runnable, also place it back on a runqueue.
4019 * It issues a full memory barrier before accessing @p->state, see the comment
4022 * Uses p->pi_lock to serialize against concurrent wake-ups.
4024 * Relies on p->pi_lock stabilizing:
4025 * - p->sched_class
4026 * - p->cpus_ptr
4027 * - p->sched_task_group
4030 * Tries really hard to only take one task_rq(p)->lock for performance.
4031 * Takes rq->lock in:
4032 * - ttwu_runnable() -- old rq, unavoidable, see comment there;
4033 * - ttwu_queue() -- new rq, for enqueue of the task;
4034 * - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us.
4039 * Return: %true if @p->state changes (an actual wakeup was done),
4051 * We're waking current, this means 'p->on_rq' and 'task_cpu(p) in try_to_wake_up()
4053 * case the whole 'p->on_rq && ttwu_runnable()' case below in try_to_wake_up()
4057 * - we rely on Program-Order guarantees for all the ordering, in try_to_wake_up()
4058 * - we're serialized against set_special_state() by virtue of in try_to_wake_up()
4059 * it disabling IRQs (this allows not taking ->pi_lock). in try_to_wake_up()
4065 WRITE_ONCE(p->__state, TASK_RUNNING); in try_to_wake_up()
4071 * If we are going to wake up a thread waiting for CONDITION we in try_to_wake_up()
4073 * reordered with p->state check below. This pairs with smp_store_mb() in try_to_wake_up()
4076 raw_spin_lock_irqsave(&p->pi_lock, flags); in try_to_wake_up()
4084 * Ensure we load p->on_rq _after_ p->state, otherwise it would in try_to_wake_up()
4085 * be possible to, falsely, observe p->on_rq == 0 and get stuck in try_to_wake_up()
4089 * STORE p->on_rq = 1 LOAD p->state in try_to_wake_up()
4090 * UNLOCK rq->lock in try_to_wake_up()
4093 * LOCK rq->lock smp_rmb(); in try_to_wake_up()
4095 * UNLOCK rq->lock in try_to_wake_up()
4098 * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq in try_to_wake_up()
4100 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in in try_to_wake_up()
4106 if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) in try_to_wake_up()
4111 * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be in try_to_wake_up()
4112 * possible to, falsely, observe p->on_cpu == 0. in try_to_wake_up()
4114 * One must be running (->on_cpu == 1) in order to remove oneself in try_to_wake_up()
4118 * STORE p->on_cpu = 1 LOAD p->on_rq in try_to_wake_up()
4119 * UNLOCK rq->lock in try_to_wake_up()
4122 * LOCK rq->lock smp_rmb(); in try_to_wake_up()
4124 * STORE p->on_rq = 0 LOAD p->on_cpu in try_to_wake_up()
4126 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in in try_to_wake_up()
4129 * Form a control-dep-acquire with p->on_rq == 0 above, to ensure in try_to_wake_up()
4131 * care about its own p->state. See the comment in __schedule(). in try_to_wake_up()
4136 * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq in try_to_wake_up()
4137 * == 0), which means we need to do an enqueue, change p->state to in try_to_wake_up()
4138 * TASK_WAKING such that we can unlock p->pi_lock before doing the in try_to_wake_up()
4141 WRITE_ONCE(p->__state, TASK_WAKING); in try_to_wake_up()
4145 * this task as prev, considering queueing p on the remote CPUs wake_list in try_to_wake_up()
4146 * which potentially sends an IPI instead of spinning on p->on_cpu to in try_to_wake_up()
4150 * Ensure we load task_cpu(p) after p->on_cpu: in try_to_wake_up()
4153 * STORE p->cpu = @cpu in try_to_wake_up()
4155 * LOCK rq->lock in try_to_wake_up()
4156 * smp_mb__after_spinlock() smp_cond_load_acquire(&p->on_cpu) in try_to_wake_up()
4157 * STORE p->on_cpu = 1 LOAD p->cpu in try_to_wake_up()
4159 * to ensure we observe the correct CPU on which the task is currently in try_to_wake_up()
4162 if (smp_load_acquire(&p->on_cpu) && in try_to_wake_up()
4175 smp_cond_load_acquire(&p->on_cpu, !VAL); in try_to_wake_up()
4177 cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU); in try_to_wake_up()
4179 if (p->in_iowait) { in try_to_wake_up()
4181 atomic_dec(&task_rq(p)->nr_iowait); in try_to_wake_up()
4194 raw_spin_unlock_irqrestore(&p->pi_lock, flags); in try_to_wake_up()
4205 unsigned int state = READ_ONCE(p->__state); in __task_needs_rq_lock()
4208 * Since pi->lock blocks try_to_wake_up(), we don't need rq->lock when in __task_needs_rq_lock()
4216 * Ensure we load p->on_rq after p->__state, otherwise it would be in __task_needs_rq_lock()
4217 * possible to, falsely, observe p->on_rq == 0. in __task_needs_rq_lock()
4222 if (p->on_rq) in __task_needs_rq_lock()
4231 smp_cond_load_acquire(&p->on_cpu, !VAL); in __task_needs_rq_lock()
4238 * task_call_func - Invoke a function on task in fixed state
4244 * and call @func(@arg) on it. This function can use ->on_rq and task_curr()
4257 raw_spin_lock_irqsave(&p->pi_lock, rf.flags); in task_call_func()
4264 * - blocked and we're holding off wakeups (pi->lock) in task_call_func()
4265 * - woken, and we're holding off enqueue (rq->lock) in task_call_func()
4266 * - queued, and we're holding off schedule (rq->lock) in task_call_func()
4267 * - running, and we're holding off de-schedule (rq->lock) in task_call_func()
4269 * The called function (@func) can use: task_curr(), p->on_rq and in task_call_func()
4270 * p->__state to differentiate between these states. in task_call_func()
4277 raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); in task_call_func()
4282 * cpu_curr_snapshot - Return a snapshot of the currently running task
4283 * @cpu: The CPU on which to snapshot the task.
4285 * Returns the task_struct pointer of the task "currently" running on
4286 * the specified CPU. If the same task is running on that CPU throughout,
4290 * task_struct structure of a randomly chosen task that was running on
4314 * wake_up_process - Wake up a specific process
4317 * Attempt to wake up the nominated process and move it to the set of runnable
4343 p->on_rq = 0; in __sched_fork()
4345 p->se.on_rq = 0; in __sched_fork()
4346 p->se.exec_start = 0; in __sched_fork()
4347 p->se.sum_exec_runtime = 0; in __sched_fork()
4348 p->se.prev_sum_exec_runtime = 0; in __sched_fork()
4349 p->se.nr_migrations = 0; in __sched_fork()
4350 p->se.vruntime = 0; in __sched_fork()
4351 INIT_LIST_HEAD(&p->se.group_node); in __sched_fork()
4354 p->se.cfs_rq = NULL; in __sched_fork()
4359 memset(&p->stats, 0, sizeof(p->stats)); in __sched_fork()
4362 RB_CLEAR_NODE(&p->dl.rb_node); in __sched_fork()
4363 init_dl_task_timer(&p->dl); in __sched_fork()
4364 init_dl_inactive_task_timer(&p->dl); in __sched_fork()
4367 INIT_LIST_HEAD(&p->rt.run_list); in __sched_fork()
4368 p->rt.timeout = 0; in __sched_fork()
4369 p->rt.time_slice = sched_rr_timeslice; in __sched_fork()
4370 p->rt.on_rq = 0; in __sched_fork()
4371 p->rt.on_list = 0; in __sched_fork()
4374 INIT_HLIST_HEAD(&p->preempt_notifiers); in __sched_fork()
4378 p->capture_control = NULL; in __sched_fork()
4382 p->wake_entry.u_flags = CSD_TYPE_TTWU; in __sched_fork()
4383 p->migration_pending = NULL; in __sched_fork()
4416 pgdat->nbp_threshold = 0; in reset_memory_tiering()
4417 pgdat->nbp_th_nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE); in reset_memory_tiering()
4418 pgdat->nbp_th_start = jiffies_to_msecs(jiffies); in reset_memory_tiering()
4430 return -EPERM; in sysctl_numa_balancing()
4499 return -EPERM; in sysctl_schedstats()
4560 * fork()/clone()-time setup:
4568 * event cannot wake it up and insert it on the runqueue either. in sched_fork()
4570 p->__state = TASK_NEW; in sched_fork()
4575 p->prio = current->normal_prio; in sched_fork()
4580 * Revert to default priority/policy on fork if requested. in sched_fork()
4582 if (unlikely(p->sched_reset_on_fork)) { in sched_fork()
4584 p->policy = SCHED_NORMAL; in sched_fork()
4585 p->static_prio = NICE_TO_PRIO(0); in sched_fork()
4586 p->rt_priority = 0; in sched_fork()
4587 } else if (PRIO_TO_NICE(p->static_prio) < 0) in sched_fork()
4588 p->static_prio = NICE_TO_PRIO(0); in sched_fork()
4590 p->prio = p->normal_prio = p->static_prio; in sched_fork()
4597 p->sched_reset_on_fork = 0; in sched_fork()
4600 if (dl_prio(p->prio)) in sched_fork()
4601 return -EAGAIN; in sched_fork()
4602 else if (rt_prio(p->prio)) in sched_fork()
4603 p->sched_class = &rt_sched_class; in sched_fork()
4605 p->sched_class = &fair_sched_class; in sched_fork()
4607 init_entity_runnable_average(&p->se); in sched_fork()
4612 memset(&p->sched_info, 0, sizeof(p->sched_info)); in sched_fork()
4615 p->on_cpu = 0; in sched_fork()
4619 plist_node_init(&p->pushable_tasks, MAX_PRIO); in sched_fork()
4620 RB_CLEAR_NODE(&p->pushable_dl_tasks); in sched_fork()
4630 * Because we're not yet on the pid-hash, p->pi_lock isn't strictly in sched_cgroup_fork()
4633 raw_spin_lock_irqsave(&p->pi_lock, flags); in sched_cgroup_fork()
4637 tg = container_of(kargs->cset->subsys[cpu_cgrp_id], in sched_cgroup_fork()
4640 p->sched_task_group = tg; in sched_cgroup_fork()
4649 if (p->sched_class->task_fork) in sched_cgroup_fork()
4650 p->sched_class->task_fork(p); in sched_cgroup_fork()
4651 raw_spin_unlock_irqrestore(&p->pi_lock, flags); in sched_cgroup_fork()
4676 * wake_up_new_task - wake up a newly created task for the first time.
4680 * on the runqueue and wakes it.
4687 raw_spin_lock_irqsave(&p->pi_lock, rf.flags); in wake_up_new_task()
4688 WRITE_ONCE(p->__state, TASK_RUNNING); in wake_up_new_task()
4692 * - cpus_ptr can change in the fork path in wake_up_new_task()
4693 * - any previously selected CPU might disappear through hotplug in wake_up_new_task()
4696 * as we're not fully set-up yet. in wake_up_new_task()
4698 p->recent_used_cpu = task_cpu(p); in wake_up_new_task()
4710 if (p->sched_class->task_woken) { in wake_up_new_task()
4712 * Nothing relies on rq->lock after this, so it's fine to in wake_up_new_task()
4716 p->sched_class->task_woken(rq, p); in wake_up_new_task()
4740 * preempt_notifier_register - tell me when current is being preempted & rescheduled
4748 hlist_add_head(&notifier->link, &current->preempt_notifiers); in preempt_notifier_register()
4753 * preempt_notifier_unregister - no longer interested in preemption notifications
4760 hlist_del(&notifier->link); in preempt_notifier_unregister()
4768 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) in __fire_sched_in_preempt_notifiers()
4769 notifier->ops->sched_in(notifier, raw_smp_processor_id()); in __fire_sched_in_preempt_notifiers()
4784 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) in __fire_sched_out_preempt_notifiers()
4785 notifier->ops->sched_out(notifier, next); in __fire_sched_out_preempt_notifiers()
4817 * See the smp_load_acquire(&p->on_cpu) case in ttwu() and in prepare_task()
4820 WRITE_ONCE(next->on_cpu, 1); in prepare_task()
4829 * p->on_cpu is cleared, the task can be moved to a different CPU. We in finish_task()
4833 * In particular, the load of prev->state in finish_task_switch() must in finish_task()
4838 smp_store_release(&prev->on_cpu, 0); in finish_task()
4852 func = (void (*)(struct rq *))head->func; in do_balance_callbacks()
4853 next = head->next; in do_balance_callbacks()
4854 head->next = NULL; in do_balance_callbacks()
4868 * that queued it (only later, when it's safe to drop rq->lock again),
4872 * a single test, namely: rq->balance_callback == NULL.
4882 struct balance_callback *head = rq->balance_callback; in __splice_balance_callbacks()
4891 * in the same rq->lock section. in __splice_balance_callbacks()
4899 rq->balance_callback = NULL; in __splice_balance_callbacks()
4948 * of the scheduler it's an obvious special-case), so we in prepare_lock_switch()
4952 spin_release(&__rq_lockp(rq)->dep_map, _THIS_IP_); in prepare_lock_switch()
4955 rq_lockp(rq)->owner = next; in prepare_lock_switch()
4963 * fix up the runqueue lock - which gets 'carried over' from in finish_lock_switch()
4966 spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_); in finish_lock_switch()
4986 if (unlikely(current->kmap_ctrl.idx)) in kmap_local_sched_out()
4994 if (unlikely(current->kmap_ctrl.idx)) in kmap_local_sched_in()
5000 * prepare_task_switch - prepare to switch tasks
5027 * finish_task_switch - clean up after a task-switch
5033 * and do any other architecture-specific cleanup actions.
5046 __releases(rq->lock) in finish_task_switch()
5049 struct mm_struct *mm = rq->prev_mm; in finish_task_switch()
5059 * raw_spin_lock_irq(&rq->lock) // 2 in finish_task_switch()
5065 current->comm, current->pid, preempt_count())) in finish_task_switch()
5068 rq->prev_mm = NULL; in finish_task_switch()
5072 * If a task dies, then it sets TASK_DEAD in tsk->state and calls in finish_task_switch()
5076 * We must observe prev->state before clearing prev->on_cpu (in in finish_task_switch()
5078 * running on another CPU and we could race with its RUNNING -> DEAD in finish_task_switch()
5081 prev_state = READ_ONCE(prev->__state); in finish_task_switch()
5093 * Restoring the maps on sched in does not require interrupts being in finish_task_switch()
5103 * schedule between user->kernel->user threads without passing through in finish_task_switch()
5105 * rq->curr, before returning to userspace, so provide them here: in finish_task_switch()
5107 * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly in finish_task_switch()
5109 * - a sync_core for SYNC_CORE. in finish_task_switch()
5116 if (prev->sched_class->task_dead) in finish_task_switch()
5117 prev->sched_class->task_dead(prev); in finish_task_switch()
5129 * schedule_tail - first thing a freshly forked thread must call.
5133 __releases(rq->lock) in schedule_tail()
5139 * finish_task_switch() will drop rq->lock() and lower preempt_count in schedule_tail()
5140 * and the preempt_enable() will end up enabling preemption (on in schedule_tail()
5147 if (current->set_child_tid) in schedule_tail()
5148 put_user(task_pid_vnr(current), current->set_child_tid); in schedule_tail()
5154 * context_switch - switch to the new MM and the new thread's register state.
5170 * kernel -> kernel lazy + transfer active in context_switch()
5171 * user -> kernel lazy + mmgrab() active in context_switch()
5173 * kernel -> user switch + mmdrop() active in context_switch()
5174 * user -> user switch in context_switch()
5176 if (!next->mm) { // to kernel in context_switch()
5177 enter_lazy_tlb(prev->active_mm, next); in context_switch()
5179 next->active_mm = prev->active_mm; in context_switch()
5180 if (prev->mm) // from user in context_switch()
5181 mmgrab(prev->active_mm); in context_switch()
5183 prev->active_mm = NULL; in context_switch()
5185 membarrier_switch_mm(rq, prev->active_mm, next->mm); in context_switch()
5188 * rq->curr / membarrier_switch_mm() and returning to userspace. in context_switch()
5191 * case 'prev->active_mm == next->mm' through in context_switch()
5194 switch_mm_irqs_off(prev->active_mm, next->mm, next); in context_switch()
5195 lru_gen_use_mm(next->mm); in context_switch()
5197 if (!prev->mm) { // from kernel in context_switch()
5199 rq->prev_mm = prev->active_mm; in context_switch()
5200 prev->active_mm = NULL; in context_switch()
5204 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); in context_switch()
5226 sum += cpu_rq(i)->nr_running; in nr_running()
5232 * Check if only the current task is running on the CPU.
5235 * preemption, thus the result might have a time-of-check-to-time-of-use
5238 * - from a non-preemptible section (of course)
5240 * - from a thread that is bound to a single CPU
5242 * - in a loop with very short iterations (e.g. a polling loop)
5246 return raw_rq()->nr_running == 1; in single_task_running()
5256 sum += cpu_rq(i)->nr_switches; in nr_context_switches()
5264 * for a CPU that has IO-wait which might not even end up running the task when
5270 return atomic_read(&cpu_rq(cpu)->nr_iowait); in nr_iowait_cpu()
5274 * IO-wait accounting, and how it's mostly bollocks (on SMP).
5276 * The idea behind IO-wait account is to account the idle time that we could
5278 * storage performance, we'd have a proportional reduction in IO-wait time.
5280 * This all works nicely on UP, where, when a task blocks on IO, we account
5281 * idle time as IO-wait, because if the storage were faster, it could've been
5287 * Imagine for instance the case where two tasks block on one CPU, only the one
5288 * CPU will have IO-wait accounted, while the other has regular idle. Even
5292 * This means, that when looking globally, the current IO-wait accounting on
5297 * associated with any one particular CPU, it can wake to another CPU than it
5298 * blocked on. This means the per CPU IO-wait number is meaningless.
5316 * sched_exec - execve() is a valuable balancing opportunity, because at
5325 raw_spin_lock_irqsave(&p->pi_lock, flags); in sched_exec()
5326 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC); in sched_exec()
5333 raw_spin_unlock_irqrestore(&p->pi_lock, flags); in sched_exec()
5338 raw_spin_unlock_irqrestore(&p->pi_lock, flags); in sched_exec()
5351 * and its field curr->exec_start; when called from task_sched_runtime(),
5358 struct sched_entity *curr = (&p->se)->cfs_rq->curr; in prefetch_curr_exec_start()
5360 struct sched_entity *curr = (&task_rq(p)->cfs)->curr; in prefetch_curr_exec_start()
5363 prefetch(&curr->exec_start); in prefetch_curr_exec_start()
5379 * 64-bit doesn't need locks to atomically read a 64-bit value. in task_sched_runtime()
5381 * Reading ->on_cpu is racy, but this is ok. in task_sched_runtime()
5386 * If we see ->on_cpu without ->on_rq, the task is leaving, and has in task_sched_runtime()
5389 if (!p->on_cpu || !task_on_rq_queued(p)) in task_sched_runtime()
5390 return p->se.sum_exec_runtime; in task_sched_runtime()
5395 * Must be ->curr _and_ ->on_rq. If dequeued, we would in task_sched_runtime()
5402 p->sched_class->update_curr(rq); in task_sched_runtime()
5404 ns = p->se.sum_exec_runtime; in task_sched_runtime()
5426 if (!rq->last_seen_need_resched_ns) { in cpu_resched_latency()
5427 rq->last_seen_need_resched_ns = now; in cpu_resched_latency()
5428 rq->ticks_without_resched = 0; in cpu_resched_latency()
5432 rq->ticks_without_resched++; in cpu_resched_latency()
5433 resched_latency = now - rq->last_seen_need_resched_ns; in cpu_resched_latency()
5467 struct task_struct *curr = rq->curr; in scheduler_tick()
5480 curr->sched_class->task_tick(rq, curr, 0); in scheduler_tick()
5494 rq->idle_balance = idle_cpu(cpu); in scheduler_tick()
5506 /* Values for ->state, see diagram below. */
5512 * State diagram for ->state:
5521 * +--TICK_SCHED_REMOTE_OFFLINING
5540 int cpu = twork->cpu; in sched_tick_remote()
5551 * statistics and checks timeslices in a time-independent way, regardless in sched_tick_remote()
5558 curr = rq->curr; in sched_tick_remote()
5569 delta = rq_clock_task(rq) - curr->se.exec_start; in sched_tick_remote()
5572 curr->sched_class->task_tick(rq, curr, 0); in sched_tick_remote()
5585 os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); in sched_tick_remote()
5602 os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); in sched_tick_start()
5605 twork->cpu = cpu; in sched_tick_start()
5606 INIT_DELAYED_WORK(&twork->work, sched_tick_remote); in sched_tick_start()
5607 queue_delayed_work(system_unbound_wq, &twork->work, HZ); in sched_tick_start()
5623 /* There cannot be competing actions, but don't rely on stop-machine. */ in sched_tick_stop()
5624 os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING); in sched_tick_stop()
5653 current->preempt_disable_ip = ip; in preempt_latency_start()
5674 PREEMPT_MASK - 10); in preempt_count_add()
5721 return p->preempt_disable_ip; in get_preempt_disable_ip()
5739 prev->comm, prev->pid, preempt_count()); in __schedule_bug()
5758 * Various schedule()-time debugging checks and statistics:
5771 if (!preempt && READ_ONCE(prev->__state) && prev->non_block_count) { in schedule_debug()
5772 printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", in schedule_debug()
5773 prev->comm, prev->pid, prev->non_block_count); in schedule_debug()
5788 schedstat_inc(this_rq()->sched_count); in schedule_debug()
5798 * that when we release the rq->lock the task is in the same in put_prev_task_balance()
5799 * state as before we took rq->lock. in put_prev_task_balance()
5804 for_class_range(class, prev->sched_class, &idle_sched_class) { in put_prev_task_balance()
5805 if (class->balance(rq, prev, rf)) in put_prev_task_balance()
5814 * Pick up the highest-prio task:
5828 if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) && in __pick_next_task()
5829 rq->nr_running == rq->cfs.h_nr_running)) { in __pick_next_task()
5848 p = class->pick_next_task(rq); in __pick_next_task()
5859 return (task_rq(t)->idle == t); in is_task_rq_idle()
5864 return is_task_rq_idle(a) || (a->core_cookie == cookie); in cookie_equals()
5872 return a->core_cookie == b->core_cookie; in cookie_match()
5881 p = class->pick_task(rq); in pick_task()
5899 bool core_clock_updated = (rq == rq->core); in pick_next_task()
5910 /* Stopper task is switching into idle, no need for core-wide selection. */ in pick_next_task()
5917 rq->core_pick = NULL; in pick_next_task()
5926 * rq->core_pick can be NULL if no selection was made for a CPU because in pick_next_task()
5927 * it was either offline or went offline during a sibling's core-wide in pick_next_task()
5928 * selection. In this case, do a core-wide selection. in pick_next_task()
5930 if (rq->core->core_pick_seq == rq->core->core_task_seq && in pick_next_task()
5931 rq->core->core_pick_seq != rq->core_sched_seq && in pick_next_task()
5932 rq->core_pick) { in pick_next_task()
5933 WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq); in pick_next_task()
5935 next = rq->core_pick; in pick_next_task()
5941 rq->core_pick = NULL; in pick_next_task()
5948 need_sync = !!rq->core->core_cookie; in pick_next_task()
5951 rq->core->core_cookie = 0UL; in pick_next_task()
5952 if (rq->core->core_forceidle_count) { in pick_next_task()
5954 update_rq_clock(rq->core); in pick_next_task()
5959 rq->core->core_forceidle_start = 0; in pick_next_task()
5960 rq->core->core_forceidle_count = 0; in pick_next_task()
5961 rq->core->core_forceidle_occupation = 0; in pick_next_task()
5967 * core->core_task_seq, core->core_pick_seq, rq->core_sched_seq in pick_next_task()
5970 * @pick_seq is the @task_seq we did a selection on in pick_next_task()
5973 * However, preemptions can cause multiple picks on the same task set. in pick_next_task()
5976 rq->core->core_task_seq++; in pick_next_task()
5980 * and there are no cookied tasks running on siblings. in pick_next_task()
5984 if (!next->core_cookie) { in pick_next_task()
5985 rq->core_pick = NULL; in pick_next_task()
6000 * Tie-break prio towards the current CPU in pick_next_task()
6006 * Current cpu always has its clock updated on entrance to in pick_next_task()
6010 if (i != cpu && (rq_i != rq->core || !core_clock_updated)) in pick_next_task()
6013 p = rq_i->core_pick = pick_task(rq_i); in pick_next_task()
6018 cookie = rq->core->core_cookie = max->core_cookie; in pick_next_task()
6026 p = rq_i->core_pick; in pick_next_task()
6036 rq_i->core_pick = p; in pick_next_task()
6038 if (p == rq_i->idle) { in pick_next_task()
6039 if (rq_i->nr_running) { in pick_next_task()
6040 rq->core->core_forceidle_count++; in pick_next_task()
6042 rq->core->core_forceidle_seq++; in pick_next_task()
6049 if (schedstat_enabled() && rq->core->core_forceidle_count) { in pick_next_task()
6050 rq->core->core_forceidle_start = rq_clock(rq->core); in pick_next_task()
6051 rq->core->core_forceidle_occupation = occ; in pick_next_task()
6054 rq->core->core_pick_seq = rq->core->core_task_seq; in pick_next_task()
6055 next = rq->core_pick; in pick_next_task()
6056 rq->core_sched_seq = rq->core->core_pick_seq; in pick_next_task()
6064 * NOTE: L1TF -- at this point we're no longer running the old task and in pick_next_task()
6066 * their task. This ensures there is no inter-sibling overlap between in pick_next_task()
6067 * non-matching user state. in pick_next_task()
6076 * picked for it. That's Ok - it will pick tasks for itself, in pick_next_task()
6079 if (!rq_i->core_pick) in pick_next_task()
6083 * Update for new !FI->FI transitions, or if continuing to be in !FI: in pick_next_task()
6090 if (!(fi_before && rq->core->core_forceidle_count)) in pick_next_task()
6091 task_vruntime_update(rq_i, rq_i->core_pick, !!rq->core->core_forceidle_count); in pick_next_task()
6093 rq_i->core_pick->core_occupation = occ; in pick_next_task()
6096 rq_i->core_pick = NULL; in pick_next_task()
6101 WARN_ON_ONCE(!cookie_match(next, rq_i->core_pick)); in pick_next_task()
6103 if (rq_i->curr == rq_i->core_pick) { in pick_next_task()
6104 rq_i->core_pick = NULL; in pick_next_task()
6114 if (rq->core->core_forceidle_count && next == rq->idle) in pick_next_task()
6130 cookie = dst->core->core_cookie; in try_steal_cookie()
6134 if (dst->curr != dst->idle) in try_steal_cookie()
6138 if (p == src->idle) in try_steal_cookie()
6142 if (p == src->core_pick || p == src->curr) in try_steal_cookie()
6148 if (p->core_occupation > dst->idle->core_occupation) in try_steal_cookie()
6216 if (!rq->core->core_cookie) in queue_core_balance()
6219 if (!rq->nr_running) /* not forced idle */ in queue_core_balance()
6222 queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance); in queue_core_balance()
6234 WARN_ON_ONCE(rq->core != rq); in sched_core_cpu_starting()
6245 if (rq->core == rq) { in sched_core_cpu_starting()
6259 rq->core = core_rq; in sched_core_cpu_starting()
6261 WARN_ON_ONCE(rq->core != core_rq); in sched_core_cpu_starting()
6279 WARN_ON_ONCE(rq->core != rq); in sched_core_cpu_deactivate()
6284 if (rq->core != rq) in sched_core_cpu_deactivate()
6299 core_rq->core_task_seq = rq->core_task_seq; in sched_core_cpu_deactivate()
6300 core_rq->core_pick_seq = rq->core_pick_seq; in sched_core_cpu_deactivate()
6301 core_rq->core_cookie = rq->core_cookie; in sched_core_cpu_deactivate()
6302 core_rq->core_forceidle_count = rq->core_forceidle_count; in sched_core_cpu_deactivate()
6303 core_rq->core_forceidle_seq = rq->core_forceidle_seq; in sched_core_cpu_deactivate()
6304 core_rq->core_forceidle_occupation = rq->core_forceidle_occupation; in sched_core_cpu_deactivate()
6311 core_rq->core_forceidle_start = 0; in sched_core_cpu_deactivate()
6316 rq->core = core_rq; in sched_core_cpu_deactivate()
6327 if (rq->core != rq) in sched_core_cpu_dying()
6328 rq->core = rq; in sched_core_cpu_dying()
6349 * preemption from blocking on an 'sleeping' spin/rwlock. Note that
6370 * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
6377 * task to the run-queue and that's it.
6379 * Now, if the new task added to the run-queue preempts the current
6381 * called on the nearest possible occasion:
6383 * - If the kernel is preemptible (CONFIG_PREEMPTION=y):
6385 * - in syscall or exception context, at the next outmost
6389 * - in IRQ context, return from interrupt-handler to
6392 * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
6395 * - cond_resched() call
6396 * - explicit schedule() call
6397 * - return from syscall or exception to user-space
6398 * - return from interrupt-handler to user-space
6413 prev = rq->curr; in __schedule()
6424 * Make sure that signal_pending_state()->signal_pending() below in __schedule()
6431 * LOCK rq->lock LOCK p->pi_state in __schedule()
6433 * if (signal_pending_state()) if (p->state & @state) in __schedule()
6436 * after coming from user-space, before storing to rq->curr. in __schedule()
6442 rq->clock_update_flags <<= 1; in __schedule()
6445 switch_count = &prev->nivcsw; in __schedule()
6448 * We must load prev->state once (task_struct::state is volatile), such in __schedule()
6451 prev_state = READ_ONCE(prev->__state); in __schedule()
6454 WRITE_ONCE(prev->__state, TASK_RUNNING); in __schedule()
6456 prev->sched_contributes_to_load = in __schedule()
6461 if (prev->sched_contributes_to_load) in __schedule()
6462 rq->nr_uninterruptible++; in __schedule()
6466 * prev_state = prev->state; if (p->on_rq && ...) in __schedule()
6468 * p->on_rq = 0; smp_acquire__after_ctrl_dep(); in __schedule()
6469 * p->state = TASK_WAKING in __schedule()
6473 * After this, schedule() must not care about p->state any more. in __schedule()
6477 if (prev->in_iowait) { in __schedule()
6478 atomic_inc(&rq->nr_iowait); in __schedule()
6482 switch_count = &prev->nvcsw; in __schedule()
6489 rq->last_seen_need_resched_ns = 0; in __schedule()
6493 rq->nr_switches++; in __schedule()
6495 * RCU users of rcu_dereference(rq->curr) may not see in __schedule()
6498 RCU_INIT_POINTER(rq->curr, next); in __schedule()
6502 * rq->curr, before returning to user-space. in __schedule()
6504 * Here are the schemes providing that barrier on the in __schedule()
6506 * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. in __schedule()
6507 * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. in __schedule()
6508 * - finish_lock_switch() for weakly-ordered in __schedule()
6510 * - switch_to() for arm64 (weakly-ordered, spin_unlock in __schedule()
6523 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); in __schedule()
6537 current->flags |= PF_NOFREEZE; in do_task_dead()
6542 /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ in do_task_dead()
6554 task_flags = tsk->flags; in sched_submit_work()
6557 * wants to wake up a task to maintain concurrency. in sched_submit_work()
6571 SCHED_WARN_ON(current->__state & TASK_RTLOCK_WAIT); in sched_submit_work()
6577 blk_flush_plug(tsk->plug, true); in sched_submit_work()
6582 if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { in sched_update_worker()
6583 if (tsk->flags & PF_WQ_WORKER) in sched_update_worker()
6606 * state (have scheduled out non-voluntarily) by making sure that all
6609 * (schedule out non-voluntarily).
6623 WARN_ON_ONCE(current->__state); in schedule_idle()
6649 * schedule_preempt_disabled - called with preemption disabled
6703 * This is the entry point to schedule() from in-kernel preemption
6709 * If there is a non-zero preempt_count or interrupts are disabled, in preempt_schedule()
6741 * preempt_schedule_notrace - preempt_schedule called by tracing
6845 return try_to_wake_up(curr->private, mode, wake_flags); in default_wake_function()
6852 p->sched_class = &dl_sched_class; in __setscheduler_prio()
6854 p->sched_class = &rt_sched_class; in __setscheduler_prio()
6856 p->sched_class = &fair_sched_class; in __setscheduler_prio()
6858 p->prio = prio; in __setscheduler_prio()
6866 prio = min(prio, pi_task->prio); in __rt_effective_prio()
6879 * rt_mutex_setprio - set the current priority of a task
6884 * not touch ->normal_prio like __setscheduler().
6897 /* XXX used to be waiter->prio, not waiter->task->prio */ in rt_mutex_setprio()
6898 prio = __rt_effective_prio(pi_task, p->normal_prio); in rt_mutex_setprio()
6903 if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio)) in rt_mutex_setprio()
6909 * Set under pi_lock && rq->lock, such that the value can be used under in rt_mutex_setprio()
6914 * ensure a task is de-boosted (pi_task is set to NULL) before the in rt_mutex_setprio()
6916 * points to a blocked task -- which guarantees the task is present. in rt_mutex_setprio()
6918 p->pi_top_task = pi_task; in rt_mutex_setprio()
6923 if (prio == p->prio && !dl_prio(prio)) in rt_mutex_setprio()
6931 * the timer wheel base->lock on the CPU and another CPU wants in rt_mutex_setprio()
6938 if (unlikely(p == rq->idle)) { in rt_mutex_setprio()
6939 WARN_ON(p != rq->curr); in rt_mutex_setprio()
6940 WARN_ON(p->pi_blocked_on); in rt_mutex_setprio()
6945 oldprio = p->prio; in rt_mutex_setprio()
6950 prev_class = p->sched_class; in rt_mutex_setprio()
6960 * 1. -rt task is running and holds mutex A in rt_mutex_setprio()
6961 * --> -dl task blocks on mutex A in rt_mutex_setprio()
6963 * 2. -dl task is running and holds mutex A in rt_mutex_setprio()
6964 * --> -dl task blocks on mutex A and could preempt the in rt_mutex_setprio()
6968 if (!dl_prio(p->normal_prio) || in rt_mutex_setprio()
6969 (pi_task && dl_prio(pi_task->prio) && in rt_mutex_setprio()
6970 dl_entity_preempt(&pi_task->dl, &p->dl))) { in rt_mutex_setprio()
6971 p->dl.pi_se = pi_task->dl.pi_se; in rt_mutex_setprio()
6974 p->dl.pi_se = &p->dl; in rt_mutex_setprio()
6978 p->dl.pi_se = &p->dl; in rt_mutex_setprio()
6983 p->dl.pi_se = &p->dl; in rt_mutex_setprio()
6985 p->rt.timeout = 0; in rt_mutex_setprio()
6997 /* Avoid rq from going away on us: */ in rt_mutex_setprio()
7024 * the task might be in the middle of scheduling on another CPU. in set_user_nice()
7031 * allow the 'normal' nice value to be set - but as expected in set_user_nice()
7032 * it won't have any effect on scheduling until the task is in set_user_nice()
7036 p->static_prio = NICE_TO_PRIO(nice); in set_user_nice()
7046 p->static_prio = NICE_TO_PRIO(nice); in set_user_nice()
7048 old_prio = p->prio; in set_user_nice()
7049 p->prio = effective_prio(p); in set_user_nice()
7060 p->sched_class->prio_changed(rq, p, old_prio); in set_user_nice()
7068 * is_nice_reduction - check if nice value is an actual reduction
7077 /* Convert nice value [19,-20] to rlimit style value [1,40]: */ in is_nice_reduction()
7084 * can_nice - check if a task can reduce its nice value
7096 * sys_nice - change the priority of the current process.
7111 increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); in SYSCALL_DEFINE1()
7116 return -EPERM; in SYSCALL_DEFINE1()
7129 * task_prio - return the priority value of a given task.
7136 * normal, batch, idle [0 ... 39] [100 ... 139] 0/[-20 ... 19]
7137 * fifo, rr [-2 ... -100] [98 ... 0] [1 ... 99]
7138 * deadline -101 -1 0
7142 return p->prio - MAX_RT_PRIO; in task_prio()
7146 * idle_cpu - is a given CPU idle currently?
7155 if (rq->curr != rq->idle) in idle_cpu()
7158 if (rq->nr_running) in idle_cpu()
7162 if (rq->ttwu_pending) in idle_cpu()
7170 * available_idle_cpu - is a given CPU idle for enqueuing work.
7187 * idle_task - return the idle task for a given CPU.
7194 return cpu_rq(cpu)->idle; in idle_task()
7210 * The cfs,rt,dl utilization are the running times measured with rq->clock_task
7211 * which excludes things like IRQ and steal-time. These latter are then accrued
7215 * based on the task model parameters and gives the minimal utilization
7228 type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { in effective_cpu_util()
7234 * because of inaccuracies in how we track these -- see in effective_cpu_util()
7242 * Because the time spend on RT/DL tasks is visible as 'lost' time to in effective_cpu_util()
7247 * CFS and RT utilization can be boosted or capped, depending on in effective_cpu_util()
7261 * of this sum because we want to use cpu_bw_dl() later on, but we need in effective_cpu_util()
7266 * saturation when we should -- something for later. in effective_cpu_util()
7283 * max - irq in effective_cpu_util()
7284 * U' = irq + --------- * U in effective_cpu_util()
7313 * find_process_by_pid - find a process with a matching PID value.
7324 * sched_setparam() passes in -1 for its policy, to let the functions
7327 #define SETPARAM_POLICY -1
7332 int policy = attr->sched_policy; in __setscheduler_params()
7335 policy = p->policy; in __setscheduler_params()
7337 p->policy = policy; in __setscheduler_params()
7342 p->static_prio = NICE_TO_PRIO(attr->sched_nice); in __setscheduler_params()
7345 * __sched_setscheduler() ensures attr->sched_priority == 0 when in __setscheduler_params()
7349 p->rt_priority = attr->sched_priority; in __setscheduler_params()
7350 p->normal_prio = normal_prio(p); in __setscheduler_params()
7364 match = (uid_eq(cred->euid, pcred->euid) || in check_same_owner()
7365 uid_eq(cred->euid, pcred->uid)); in check_same_owner()
7373 * event on permitted non-privileged operations:
7380 if (attr->sched_nice < task_nice(p) && in user_check_sched_setscheduler()
7381 !is_nice_reduction(p, attr->sched_nice)) in user_check_sched_setscheduler()
7389 if (policy != p->policy && !rlim_rtprio) in user_check_sched_setscheduler()
7393 if (attr->sched_priority > p->rt_priority && in user_check_sched_setscheduler()
7394 attr->sched_priority > rlim_rtprio) in user_check_sched_setscheduler()
7421 if (p->sched_reset_on_fork && !reset_on_fork) in user_check_sched_setscheduler()
7428 return -EPERM; in user_check_sched_setscheduler()
7437 int oldpolicy = -1, policy = attr->sched_policy; in __sched_setscheduler()
7451 reset_on_fork = p->sched_reset_on_fork; in __sched_setscheduler()
7452 policy = oldpolicy = p->policy; in __sched_setscheduler()
7454 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); in __sched_setscheduler()
7457 return -EINVAL; in __sched_setscheduler()
7460 if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV)) in __sched_setscheduler()
7461 return -EINVAL; in __sched_setscheduler()
7465 * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL, in __sched_setscheduler()
7468 if (attr->sched_priority > MAX_RT_PRIO-1) in __sched_setscheduler()
7469 return -EINVAL; in __sched_setscheduler()
7471 (rt_policy(policy) != (attr->sched_priority != 0))) in __sched_setscheduler()
7472 return -EINVAL; in __sched_setscheduler()
7479 if (attr->sched_flags & SCHED_FLAG_SUGOV) in __sched_setscheduler()
7480 return -EINVAL; in __sched_setscheduler()
7488 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) { in __sched_setscheduler()
7498 * Make sure no PI-waiters arrive (or leave) while we are in __sched_setscheduler()
7501 * To be able to change p->policy safely, the appropriate in __sched_setscheduler()
7510 if (p == rq->stop) { in __sched_setscheduler()
7511 retval = -EINVAL; in __sched_setscheduler()
7519 if (unlikely(policy == p->policy)) { in __sched_setscheduler()
7520 if (fair_policy(policy) && attr->sched_nice != task_nice(p)) in __sched_setscheduler()
7522 if (rt_policy(policy) && attr->sched_priority != p->rt_priority) in __sched_setscheduler()
7526 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) in __sched_setscheduler()
7529 p->sched_reset_on_fork = reset_on_fork; in __sched_setscheduler()
7542 task_group(p)->rt_bandwidth.rt_runtime == 0 && in __sched_setscheduler()
7544 retval = -EPERM; in __sched_setscheduler()
7550 !(attr->sched_flags & SCHED_FLAG_SUGOV)) { in __sched_setscheduler()
7551 cpumask_t *span = rq->rd->span; in __sched_setscheduler()
7558 if (!cpumask_subset(span, p->cpus_ptr) || in __sched_setscheduler()
7559 rq->rd->dl_bw.bw == 0) { in __sched_setscheduler()
7560 retval = -EPERM; in __sched_setscheduler()
7567 /* Re-check policy now with rq lock held: */ in __sched_setscheduler()
7568 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { in __sched_setscheduler()
7569 policy = oldpolicy = -1; in __sched_setscheduler()
7582 retval = -EBUSY; in __sched_setscheduler()
7586 p->sched_reset_on_fork = reset_on_fork; in __sched_setscheduler()
7587 oldprio = p->prio; in __sched_setscheduler()
7589 newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice); in __sched_setscheduler()
7610 prev_class = p->sched_class; in __sched_setscheduler()
7612 if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { in __sched_setscheduler()
7623 if (oldprio < p->prio) in __sched_setscheduler()
7633 /* Avoid rq from going away on us: */ in __sched_setscheduler()
7661 .sched_priority = param->sched_priority, in _sched_setscheduler()
7662 .sched_nice = PRIO_TO_NICE(p->static_prio), in _sched_setscheduler()
7675 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
7682 * Return: 0 on success. An error code otherwise.
7704 …* sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from ke…
7714 * Return: 0 on success. An error code otherwise.
7775 return -EINVAL; in do_sched_setscheduler()
7777 return -EFAULT; in do_sched_setscheduler()
7780 retval = -ESRCH; in do_sched_setscheduler()
7805 ret = get_user(size, &uattr->size); in sched_copy_attr()
7817 if (ret == -E2BIG) in sched_copy_attr()
7822 if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) && in sched_copy_attr()
7824 return -EINVAL; in sched_copy_attr()
7828 * to be strict and return an error on out-of-bounds values? in sched_copy_attr()
7830 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); in sched_copy_attr()
7835 put_user(sizeof(*attr), &uattr->size); in sched_copy_attr()
7836 return -E2BIG; in sched_copy_attr()
7844 attr->sched_priority = p->rt_priority; in get_params()
7846 attr->sched_nice = task_nice(p); in get_params()
7850 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
7855 * Return: 0 on success. An error code otherwise.
7860 return -EINVAL; in SYSCALL_DEFINE3()
7866 * sys_sched_setparam - set/change the RT priority of a thread
7870 * Return: 0 on success. An error code otherwise.
7878 * sys_sched_setattr - same as above, but with extended sched_attr
7891 return -EINVAL; in SYSCALL_DEFINE3()
7898 return -EINVAL; in SYSCALL_DEFINE3()
7903 retval = -ESRCH; in SYSCALL_DEFINE3()
7920 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
7923 * Return: On success, the policy of the thread. Otherwise, a negative error
7932 return -EINVAL; in SYSCALL_DEFINE1()
7934 retval = -ESRCH; in SYSCALL_DEFINE1()
7940 retval = p->policy in SYSCALL_DEFINE1()
7941 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); in SYSCALL_DEFINE1()
7948 * sys_sched_getparam - get the RT priority of a thread
7952 * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
7962 return -EINVAL; in SYSCALL_DEFINE2()
7966 retval = -ESRCH; in SYSCALL_DEFINE2()
7975 lp.sched_priority = p->rt_priority; in SYSCALL_DEFINE2()
7981 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; in SYSCALL_DEFINE2()
7992 * than what user-space knows about) to user-space.
7994 * Note that all cases are valid: user-space buffer can be larger or
7995 * smaller than the kernel-space buffer. The usual case is that both
8006 return -EFAULT; in sched_attr_copy_to_user()
8011 * If usize == ksize then we just copy everything to user-space and all is good. in sched_attr_copy_to_user()
8013 * If usize < ksize then we only copy as much as user-space has space for, in sched_attr_copy_to_user()
8016 * If usize > ksize then user-space is using a newer version of the ABI, in sched_attr_copy_to_user()
8017 * which part the kernel doesn't know about. Just ignore it - tooling can in sched_attr_copy_to_user()
8018 * detect the kernel's knowledge of attributes from the attr->size value in sched_attr_copy_to_user()
8021 kattr->size = min(usize, ksize); in sched_attr_copy_to_user()
8023 if (copy_to_user(uattr, kattr, kattr->size)) in sched_attr_copy_to_user()
8024 return -EFAULT; in sched_attr_copy_to_user()
8030 * sys_sched_getattr - similar to sched_getparam, but with sched_attr
8045 return -EINVAL; in SYSCALL_DEFINE4()
8049 retval = -ESRCH; in SYSCALL_DEFINE4()
8057 kattr.sched_policy = p->policy; in SYSCALL_DEFINE4()
8058 if (p->sched_reset_on_fork) in SYSCALL_DEFINE4()
8069 kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; in SYSCALL_DEFINE4()
8070 kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; in SYSCALL_DEFINE4()
8095 * Since bandwidth control happens on root_domain basis, in dl_task_check_affinity()
8096 * if admission test is enabled, we only admit -deadline in dl_task_check_affinity()
8097 * tasks allowed to run on all the CPUs in the task's in dl_task_check_affinity()
8101 if (!cpumask_subset(task_rq(p)->rd->span, mask)) in dl_task_check_affinity()
8102 ret = -EBUSY; in dl_task_check_affinity()
8115 return -ENOMEM; in __sched_setaffinity()
8118 retval = -ENOMEM; in __sched_setaffinity()
8160 return -ESRCH; in sched_setaffinity()
8167 if (p->flags & PF_NO_SETAFFINITY) { in sched_setaffinity()
8168 retval = -EINVAL; in sched_setaffinity()
8174 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { in sched_setaffinity()
8176 retval = -EPERM; in sched_setaffinity()
8200 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; in get_user_cpu_mask()
8204 * sys_sched_setaffinity - set the CPU affinity of a process
8207 * @user_mask_ptr: user-space pointer to the new CPU mask
8209 * Return: 0 on success. An error code otherwise.
8218 return -ENOMEM; in SYSCALL_DEFINE3()
8235 retval = -ESRCH; in sched_getaffinity()
8244 raw_spin_lock_irqsave(&p->pi_lock, flags); in sched_getaffinity()
8245 cpumask_and(mask, &p->cpus_mask, cpu_active_mask); in sched_getaffinity()
8246 raw_spin_unlock_irqrestore(&p->pi_lock, flags); in sched_getaffinity()
8255 * sys_sched_getaffinity - get the CPU affinity of a process
8258 * @user_mask_ptr: user-space pointer to hold the current CPU mask
8260 * Return: size of CPU mask copied to user_mask_ptr on success. An
8270 return -EINVAL; in SYSCALL_DEFINE3()
8271 if (len & (sizeof(unsigned long)-1)) in SYSCALL_DEFINE3()
8272 return -EINVAL; in SYSCALL_DEFINE3()
8275 return -ENOMEM; in SYSCALL_DEFINE3()
8282 ret = -EFAULT; in SYSCALL_DEFINE3()
8298 schedstat_inc(rq->yld_count); in do_sched_yield()
8299 current->sched_class->yield_task(rq); in do_sched_yield()
8309 * sys_sched_yield - yield the current processor to other threads.
8312 * other threads running on this CPU then this function will return.
8330 * In preemptible kernels, ->rcu_read_lock_nesting tells the tick in __cond_resched()
8331 * whether the current CPU is in an RCU read-side critical section, in __cond_resched()
8333 * in kernel context. In contrast, in non-preemptible kernels, in __cond_resched()
8334 * RCU readers leave no in-memory hints, which means that CPU-bound in __cond_resched()
8381 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
8382 * call schedule, and on return reacquire the lock.
8384 * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
8445 #include <linux/entry-common.h>
8457 * cond_resched <- __cond_resched
8458 * might_resched <- RET0
8459 * preempt_schedule <- NOP
8460 * preempt_schedule_notrace <- NOP
8461 * irqentry_exit_cond_resched <- NOP
8464 * cond_resched <- __cond_resched
8465 * might_resched <- __cond_resched
8466 * preempt_schedule <- NOP
8467 * preempt_schedule_notrace <- NOP
8468 * irqentry_exit_cond_resched <- NOP
8471 * cond_resched <- RET0
8472 * might_resched <- RET0
8473 * preempt_schedule <- preempt_schedule
8474 * preempt_schedule_notrace <- preempt_schedule_notrace
8475 * irqentry_exit_cond_resched <- irqentry_exit_cond_resched
8479 preempt_dynamic_undefined = -1,
8498 return -EINVAL; in sched_dynamic_mode()
8514 * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in in sched_dynamic_update()
8603 * yield - yield the current processor to other threads.
8632 * yield_to - yield the current processor to another thread in
8634 * processor it's on.
8639 * can't go away on us before we can do any checks.
8644 * -ESRCH if there's no task to yield to.
8659 * If we're the only runnable task on the rq and target rq also in yield_to()
8662 if (rq->nr_running == 1 && p_rq->nr_running == 1) { in yield_to()
8663 yielded = -ESRCH; in yield_to()
8673 if (!curr->sched_class->yield_to_task) in yield_to()
8676 if (curr->sched_class != p->sched_class) in yield_to()
8682 yielded = curr->sched_class->yield_to_task(rq, p); in yield_to()
8684 schedstat_inc(rq->yld_count); in yield_to()
8707 int old_iowait = current->in_iowait; in io_schedule_prepare()
8709 current->in_iowait = 1; in io_schedule_prepare()
8710 blk_flush_plug(current->plug, true); in io_schedule_prepare()
8716 current->in_iowait = token; in io_schedule_finish()
8720 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
8747 * sys_sched_get_priority_max - return maximum RT priority.
8750 * Return: On success, this syscall returns the maximum
8752 * On failure, a negative error code is returned.
8756 int ret = -EINVAL; in SYSCALL_DEFINE1()
8761 ret = MAX_RT_PRIO-1; in SYSCALL_DEFINE1()
8774 * sys_sched_get_priority_min - return minimum RT priority.
8777 * Return: On success, this syscall returns the minimum
8779 * On failure, a negative error code is returned.
8783 int ret = -EINVAL; in SYSCALL_DEFINE1()
8808 return -EINVAL; in sched_rr_get_interval()
8810 retval = -ESRCH; in sched_rr_get_interval()
8822 if (p->sched_class->get_rr_interval) in sched_rr_get_interval()
8823 time_slice = p->sched_class->get_rr_interval(rq, p); in sched_rr_get_interval()
8836 * sys_sched_rr_get_interval - return the default timeslice of a process.
8841 * into the user-space timespec buffer. A value of '0' means infinity.
8843 * Return: On success, 0 and the timeslice is in @interval. Otherwise,
8879 pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p)); in sched_show_task()
8889 ppid = task_pid_nr(rcu_dereference(p->real_parent)); in sched_show_task()
8891 pr_cont(" stack:%-5lu pid:%-5d ppid:%-6d flags:0x%08lx\n", in sched_show_task()
8905 unsigned int state = READ_ONCE(p->__state); in state_filter_match()
8933 * reset the NMI-timeout, listing all files on a slow in show_state_filter()
8935 * Also, reset softlockup watchdogs on all CPUs, because in show_state_filter()
8958 * init_idle - set up an idle thread for a given CPU
8972 raw_spin_lock_irqsave(&idle->pi_lock, flags); in init_idle()
8975 idle->__state = TASK_RUNNING; in init_idle()
8976 idle->se.exec_start = sched_clock(); in init_idle()
8979 * look like a proper per-CPU kthread. in init_idle()
8981 idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY; in init_idle()
8986 * It's possible that init_idle() gets called multiple times on a task, in init_idle()
8995 * holding rq->lock, the CPU isn't yet set to this CPU so the in init_idle()
8999 * use task_rq_lock() here and obtain the other rq->lock. in init_idle()
9007 rq->idle = idle; in init_idle()
9008 rcu_assign_pointer(rq->curr, idle); in init_idle()
9009 idle->on_rq = TASK_ON_RQ_QUEUED; in init_idle()
9011 idle->on_cpu = 1; in init_idle()
9014 raw_spin_unlock_irqrestore(&idle->pi_lock, flags); in init_idle()
9022 idle->sched_class = &idle_sched_class; in init_idle()
9026 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); in init_idle()
9056 * success of set_cpus_allowed_ptr() on all attached tasks in task_can_attach()
9059 if (p->flags & PF_NO_SETAFFINITY) { in task_can_attach()
9060 ret = -EINVAL; in task_can_attach()
9064 if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, in task_can_attach()
9069 return -EINVAL; in task_can_attach()
9089 if (!cpumask_test_cpu(target_cpu, p->cpus_ptr)) in migrate_task_to()
9090 return -EINVAL; in migrate_task_to()
9099 * Requeue a task on a given node and accurately track the number of NUMA
9100 * tasks on the runqueues
9117 p->numa_preferred_nid = nid; in sched_setnuma()
9134 struct mm_struct *mm = current->active_mm; in idle_task_exit()
9137 BUG_ON(current != this_rq()->idle); in idle_task_exit()
9144 /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ in idle_task_exit()
9154 raw_spin_lock_irq(&p->pi_lock); in __balance_push_cpu_stop()
9160 cpu = select_fallback_rq(rq->cpu, p); in __balance_push_cpu_stop()
9165 raw_spin_unlock_irq(&p->pi_lock); in __balance_push_cpu_stop()
9175 * Ensure we only run per-cpu kthreads once the CPU goes !active.
9178 * effective when the hotplug motion is down.
9182 struct task_struct *push_task = rq->curr; in balance_push()
9187 * Ensure the thing is persistent until balance_push_set(.on = false); in balance_push()
9189 rq->balance_callback = &balance_push_callback; in balance_push()
9192 * Only active while going offline and when invoked on the outgoing in balance_push()
9195 if (!cpu_dying(rq->cpu) || rq != this_rq()) in balance_push()
9199 * Both the cpu-hotplug and stop task are in this case and are in balance_push()
9206 * If this is the idle task on the outgoing CPU try to wake in balance_push()
9209 * accurate here because the waiter is pinned on this CPU in balance_push()
9212 * On RT kernels this also has to check whether there are in balance_push()
9213 * pinned and scheduled out tasks on the runqueue. They in balance_push()
9216 if (!rq->nr_running && !rq_has_pinned_tasks(rq) && in balance_push()
9217 rcuwait_active(&rq->hotplug_wait)) { in balance_push()
9219 rcuwait_wake_up(&rq->hotplug_wait); in balance_push()
9227 * Temporarily drop rq->lock such that we can wake-up the stop task. in balance_push()
9231 stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task, in balance_push()
9241 static void balance_push_set(int cpu, bool on) in balance_push_set() argument
9247 if (on) { in balance_push_set()
9248 WARN_ON_ONCE(rq->balance_callback); in balance_push_set()
9249 rq->balance_callback = &balance_push_callback; in balance_push_set()
9250 } else if (rq->balance_callback == &balance_push_callback) { in balance_push_set()
9251 rq->balance_callback = NULL; in balance_push_set()
9259 * pushed off this CPU now via balance_push() or placed on a different CPU
9266 rcuwait_wait_event(&rq->hotplug_wait, in balance_hotplug_wait()
9267 rq->nr_running == 1 && !rq_has_pinned_tasks(rq), in balance_hotplug_wait()
9277 static inline void balance_push_set(int cpu, bool on) in balance_push_set() argument
9289 if (!rq->online) { in set_rq_online()
9292 cpumask_set_cpu(rq->cpu, rq->rd->online); in set_rq_online()
9293 rq->online = 1; in set_rq_online()
9296 if (class->rq_online) in set_rq_online()
9297 class->rq_online(rq); in set_rq_online()
9304 if (rq->online) { in set_rq_offline()
9308 if (class->rq_offline) in set_rq_offline()
9309 class->rq_offline(rq); in set_rq_offline()
9312 cpumask_clear_cpu(rq->cpu, rq->rd->online); in set_rq_offline()
9313 rq->online = 0; in set_rq_offline()
9340 if (--num_cpus_frozen) in cpuset_cpu_active()
9403 if (rq->rd) { in sched_cpu_activate()
9404 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); in sched_cpu_activate()
9436 * preempt-disabled and RCU users of this state to go away such that in sched_cpu_deactivate()
9439 * Specifically, we rely on ttwu to no longer target this CPU, see in sched_cpu_deactivate()
9447 if (rq->rd) { in sched_cpu_deactivate()
9449 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); in sched_cpu_deactivate()
9483 rq->calc_load_update = calc_load_update; in sched_rq_cpu_starting()
9502 * any of those which might be on the way out are gone.
9504 * If after this point a bound task is being woken on this CPU then the
9517 * stopper is the last running task on the CPU, so nr_active count is
9521 * Also see the comment "Global load-average calculations".
9538 printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running); in dump_rq_tasks()
9546 printk("%s\tpid: %d, name: %s\n", loglvl, p->pid, p->comm); in dump_rq_tasks()
9559 if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) { in sched_cpu_dying()
9586 /* Move init over to a non-isolated CPU */ in sched_init_smp()
9589 current->flags &= ~PF_NO_SETAFFINITY; in sched_init_smp()
9699 raw_spin_lock_init(&rq->__lock); in sched_init()
9700 rq->nr_running = 0; in sched_init()
9701 rq->calc_load_active = 0; in sched_init()
9702 rq->calc_load_update = jiffies + LOAD_FREQ; in sched_init()
9703 init_cfs_rq(&rq->cfs); in sched_init()
9704 init_rt_rq(&rq->rt); in sched_init()
9705 init_dl_rq(&rq->dl); in sched_init()
9707 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); in sched_init()
9708 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; in sched_init()
9712 * In case of task-groups formed thr' the cgroup filesystem, it in sched_init()
9715 * root_task_group and its child task-groups in a fair manner, in sched_init()
9716 * based on each entity's (task or task-group's) weight in sched_init()
9717 * (se->load.weight). in sched_init()
9726 * directly in rq->cfs (i.e root_task_group->se[] = NULL). in sched_init()
9728 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); in sched_init()
9731 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; in sched_init()
9733 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); in sched_init()
9736 rq->sd = NULL; in sched_init()
9737 rq->rd = NULL; in sched_init()
9738 rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; in sched_init()
9739 rq->balance_callback = &balance_push_callback; in sched_init()
9740 rq->active_balance = 0; in sched_init()
9741 rq->next_balance = jiffies; in sched_init()
9742 rq->push_cpu = 0; in sched_init()
9743 rq->cpu = i; in sched_init()
9744 rq->online = 0; in sched_init()
9745 rq->idle_stamp = 0; in sched_init()
9746 rq->avg_idle = 2*sysctl_sched_migration_cost; in sched_init()
9747 rq->wake_stamp = jiffies; in sched_init()
9748 rq->wake_avg_idle = rq->avg_idle; in sched_init()
9749 rq->max_idle_balance_cost = sysctl_sched_migration_cost; in sched_init()
9751 INIT_LIST_HEAD(&rq->cfs_tasks); in sched_init()
9755 rq->last_blocked_load_update_tick = jiffies; in sched_init()
9756 atomic_set(&rq->nohz_flags, 0); in sched_init()
9758 INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq); in sched_init()
9761 rcuwait_init(&rq->hotplug_wait); in sched_init()
9765 atomic_set(&rq->nr_iowait, 0); in sched_init()
9768 rq->core = rq; in sched_init()
9769 rq->core_pick = NULL; in sched_init()
9770 rq->core_enabled = 0; in sched_init()
9771 rq->core_tree = RB_ROOT; in sched_init()
9772 rq->core_forceidle_count = 0; in sched_init()
9773 rq->core_forceidle_occupation = 0; in sched_init()
9774 rq->core_forceidle_start = 0; in sched_init()
9776 rq->core_cookie = 0UL; in sched_init()
9790 * is dressed up as a per-CPU kthread and thus needs to play the part in sched_init()
9791 * if we want to avoid special-casing it in code that deals with per-CPU in sched_init()
9827 * Blocking primitives will set (and therefore destroy) current->state, in __might_sleep()
9831 WARN_ONCE(state != TASK_RUNNING && current->task_state_change, in __might_sleep()
9834 (void *)current->task_state_change, in __might_sleep()
9835 (void *)current->task_state_change); in __might_sleep()
9873 !is_idle_task(current) && !current->non_block_count) || in __might_resched()
9888 in_atomic(), irqs_disabled(), current->non_block_count, in __might_resched()
9889 current->pid, current->comm); in __might_resched()
9933 current->pid, current->comm); in __cant_sleep()
9965 current->pid, current->comm); in __cant_migrate()
9988 if (p->flags & PF_KTHREAD) in normalize_rt_tasks()
9991 p->se.exec_start = 0; in normalize_rt_tasks()
9992 schedstat_set(p->stats.wait_start, 0); in normalize_rt_tasks()
9993 schedstat_set(p->stats.sleep_start, 0); in normalize_rt_tasks()
9994 schedstat_set(p->stats.block_start, 0); in normalize_rt_tasks()
10018 * stopped - every CPU needs to be quiescent, and no scheduling
10025 * curr_task - return the current task for a given CPU.
10041 * ia64_set_curr_task - set the current task for a given CPU.
10045 * Description: This function must only be used when non-maskable interrupts
10046 * are serviced on a separate stack. It allows the architecture to switch the
10047 * notion of the current task on a CPU in a non-blocking manner. This function
10051 * re-starting the system.
10073 uclamp_se_set(&tg->uclamp_req[clamp_id], in alloc_uclamp_sched_group()
10075 tg->uclamp[clamp_id] = parent->uclamp[clamp_id]; in alloc_uclamp_sched_group()
10101 call_rcu(&tg->rcu, sched_free_group_rcu); in sched_unregister_group()
10111 return ERR_PTR(-ENOMEM); in sched_create_group()
10125 return ERR_PTR(-ENOMEM); in sched_create_group()
10133 list_add_rcu(&tg->list, &task_groups); in sched_online_group()
10138 tg->parent = parent; in sched_online_group()
10139 INIT_LIST_HEAD(&tg->children); in sched_online_group()
10140 list_add_rcu(&tg->siblings, &parent->children); in sched_online_group()
10156 call_rcu(&tg->rcu, sched_unregister_group_rcu); in sched_destroy_group()
10177 list_del_rcu(&tg->list); in sched_release_group()
10178 list_del_rcu(&tg->siblings); in sched_release_group()
10194 tsk->sched_task_group = tg; in sched_change_group()
10197 if (tsk->sched_class->task_change_group) in sched_change_group()
10198 tsk->sched_class->task_change_group(tsk); in sched_change_group()
10208 * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
10264 return ERR_PTR(-ENOMEM); in cpu_cgroup_css_alloc()
10266 return &tg->css; in cpu_cgroup_css_alloc()
10273 struct task_group *parent = css_tg(css->parent); in cpu_cgroup_css_online()
10302 * Relies on the RCU grace period between css_released() and this. in cpu_cgroup_css_free()
10315 return -EINVAL; in cpu_cgroup_can_attach()
10344 uc_parent = css_tg(css)->parent in cpu_util_update_eff()
10345 ? css_tg(css)->parent->uclamp : NULL; in cpu_util_update_eff()
10349 eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value; in cpu_util_update_eff()
10361 uc_se = css_tg(css)->uclamp; in cpu_util_update_eff()
10411 req.ret = -ERANGE; in capacity_from_percent()
10439 if (tg->uclamp_req[clamp_id].value != req.util) in cpu_uclamp_write()
10440 uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false); in cpu_uclamp_write()
10446 tg->uclamp_pct[clamp_id] = req.percent; in cpu_uclamp_write()
10481 util_clamp = tg->uclamp_req[clamp_id].value; in cpu_uclamp_print()
10489 percent = tg->uclamp_pct[clamp_id]; in cpu_uclamp_print()
10521 return (u64) scale_load_down(tg->shares); in cpu_shares_read_u64()
10538 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; in tg_set_cfs_bandwidth()
10541 return -EINVAL; in tg_set_cfs_bandwidth()
10549 return -EINVAL; in tg_set_cfs_bandwidth()
10552 * Likewise, bound things on the other side by preventing insane quota in tg_set_cfs_bandwidth()
10557 return -EINVAL; in tg_set_cfs_bandwidth()
10563 return -EINVAL; in tg_set_cfs_bandwidth()
10567 return -EINVAL; in tg_set_cfs_bandwidth()
10570 * Prevent race between setting of cfs_rq->runtime_enabled and in tg_set_cfs_bandwidth()
10580 runtime_was_enabled = cfs_b->quota != RUNTIME_INF; in tg_set_cfs_bandwidth()
10582 * If we need to toggle cfs_bandwidth_used, off->on must occur in tg_set_cfs_bandwidth()
10583 * before making related changes, and on->off must occur afterwards in tg_set_cfs_bandwidth()
10587 raw_spin_lock_irq(&cfs_b->lock); in tg_set_cfs_bandwidth()
10588 cfs_b->period = ns_to_ktime(period); in tg_set_cfs_bandwidth()
10589 cfs_b->quota = quota; in tg_set_cfs_bandwidth()
10590 cfs_b->burst = burst; in tg_set_cfs_bandwidth()
10598 raw_spin_unlock_irq(&cfs_b->lock); in tg_set_cfs_bandwidth()
10601 struct cfs_rq *cfs_rq = tg->cfs_rq[i]; in tg_set_cfs_bandwidth()
10602 struct rq *rq = cfs_rq->rq; in tg_set_cfs_bandwidth()
10606 cfs_rq->runtime_enabled = runtime_enabled; in tg_set_cfs_bandwidth()
10607 cfs_rq->runtime_remaining = 0; in tg_set_cfs_bandwidth()
10609 if (cfs_rq->throttled) in tg_set_cfs_bandwidth()
10626 period = ktime_to_ns(tg->cfs_bandwidth.period); in tg_set_cfs_quota()
10627 burst = tg->cfs_bandwidth.burst; in tg_set_cfs_quota()
10633 return -EINVAL; in tg_set_cfs_quota()
10642 if (tg->cfs_bandwidth.quota == RUNTIME_INF) in tg_get_cfs_quota()
10643 return -1; in tg_get_cfs_quota()
10645 quota_us = tg->cfs_bandwidth.quota; in tg_get_cfs_quota()
10656 return -EINVAL; in tg_set_cfs_period()
10659 quota = tg->cfs_bandwidth.quota; in tg_set_cfs_period()
10660 burst = tg->cfs_bandwidth.burst; in tg_set_cfs_period()
10669 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); in tg_get_cfs_period()
10680 return -EINVAL; in tg_set_cfs_burst()
10683 period = ktime_to_ns(tg->cfs_bandwidth.period); in tg_set_cfs_burst()
10684 quota = tg->cfs_bandwidth.quota; in tg_set_cfs_burst()
10693 burst_us = tg->cfs_bandwidth.burst; in tg_get_cfs_burst()
10749 if (tg == d->tg) { in normalize_cfs_quota()
10750 period = d->period; in normalize_cfs_quota()
10751 quota = d->quota; in normalize_cfs_quota()
10758 if (quota == RUNTIME_INF || quota == -1) in normalize_cfs_quota()
10767 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; in tg_cfs_schedulable_down()
10768 s64 quota = 0, parent_quota = -1; in tg_cfs_schedulable_down()
10770 if (!tg->parent) { in tg_cfs_schedulable_down()
10773 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; in tg_cfs_schedulable_down()
10776 parent_quota = parent_b->hierarchical_quota; in tg_cfs_schedulable_down()
10779 * Ensure max(child_quota) <= parent_quota. On cgroup2, in tg_cfs_schedulable_down()
10780 * always take the min. On cgroup1, only inherit when no in tg_cfs_schedulable_down()
10789 return -EINVAL; in tg_cfs_schedulable_down()
10792 cfs_b->hierarchical_quota = quota; in tg_cfs_schedulable_down()
10821 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; in cpu_cfs_stat_show()
10823 seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods); in cpu_cfs_stat_show()
10824 seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled); in cpu_cfs_stat_show()
10825 seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time); in cpu_cfs_stat_show()
10833 stats = __schedstats_from_se(tg->se[i]); in cpu_cfs_stat_show()
10834 ws += schedstat_val(stats->wait_sum); in cpu_cfs_stat_show()
10840 seq_printf(sf, "nr_bursts %d\n", cfs_b->nr_burst); in cpu_cfs_stat_show()
10841 seq_printf(sf, "burst_time %llu\n", cfs_b->burst_time); in cpu_cfs_stat_show()
10878 return css_tg(css)->idle; in cpu_idle_read_s64()
10957 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; in cpu_extra_stat_show()
10960 throttled_usec = cfs_b->throttled_time; in cpu_extra_stat_show()
10962 burst_usec = cfs_b->burst_time; in cpu_extra_stat_show()
10970 cfs_b->nr_periods, cfs_b->nr_throttled, in cpu_extra_stat_show()
10971 throttled_usec, cfs_b->nr_burst, burst_usec); in cpu_extra_stat_show()
10982 u64 weight = scale_load_down(tg->shares); in cpu_weight_read_u64()
10993 * a bit of range on both ends, it maps pretty well onto the shares in cpu_weight_write_u64()
10994 * value used by scheduler and the round-trip conversions preserve in cpu_weight_write_u64()
10998 return -ERANGE; in cpu_weight_write_u64()
11008 unsigned long weight = scale_load_down(css_tg(css)->shares); in cpu_weight_nice_read_s64()
11014 delta = abs(sched_prio_to_weight[prio] - weight); in cpu_weight_nice_read_s64()
11020 return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO); in cpu_weight_nice_read_s64()
11030 return -ERANGE; in cpu_weight_nice_write_s64()
11032 idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO; in cpu_weight_nice_write_s64()
11058 return -EINVAL; in cpu_period_quota_parse()
11067 return -EINVAL; in cpu_period_quota_parse()
11188 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
11189 * nice 1, it will get ~10% less CPU time than another CPU-bound task
11190 * that remained on nice 0.
11193 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
11199 /* -20 */ 88761, 71755, 56483, 46273, 36291,
11200 /* -15 */ 29154, 23254, 18705, 14949, 11916,
11201 /* -10 */ 9548, 7620, 6100, 4904, 3906,
11202 /* -5 */ 3121, 2501, 1991, 1586, 1277,
11217 /* -20 */ 48388, 59856, 76040, 92818, 118348,
11218 /* -15 */ 147320, 184698, 229616, 287308, 360437,
11219 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
11220 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,