Lines matching: "min" - "sample" - "time"
21 * The time in which a task can execute on a CPU is our baseline for
22 * productivity. Pressure expresses the amount of time in which this
33 * In the FULL state of a given resource, all non-idle tasks are
42 * The percentage of wallclock time spent in those compound stall
47 * %SOME = time(SOME) / period
48 * %FULL = time(FULL) / period
54 * unrealized due to resource contention *also* scales with non-idle
63 * threads will be contended at any given time, or about 0.4%.
66 * given time *one* of the tasks is delayed due to a lack of memory.
73 * we have to base our calculation on the number of non-idle tasks in
79 * threads = min(nr_nonidle_tasks, nr_cpus)
80 * SOME = min(nr_delayed_tasks / threads, 1)
81 * FULL = (threads - min(nr_running_tasks, threads)) / threads
85 * threads = min(257, 256)
86 * SOME = min(1 / 256, 1) = 0.4%
87 * FULL = (256 - min(257, 256)) / 256 = 0%
89 * For the 1 out of 4 memory-delayed tasks, this yields:
91 * threads = min(4, 4)
92 * SOME = min(1 / 4, 1) = 25%
93 * FULL = (4 - min(3, 4)) / 4 = 25%
96 * extension of the single-CPU model. ]
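The two worked examples above can be reproduced with a small, hypothetical userspace C program (not part of psi.c; the function and variable names are made up for illustration):

    #include <stdio.h>

    /* Evaluate the SOME/FULL model for one snapshot of task counts. */
    static void psi_model(unsigned int nr_nonidle_tasks,
                          unsigned int nr_delayed_tasks,
                          unsigned int nr_running_tasks,
                          unsigned int nr_cpus)
    {
            unsigned int threads = nr_nonidle_tasks < nr_cpus ?
                                   nr_nonidle_tasks : nr_cpus;
            double some = (double)nr_delayed_tasks / threads;
            double full = (double)(threads - (nr_running_tasks < threads ?
                                   nr_running_tasks : threads)) / threads;

            if (some > 1.0)
                    some = 1.0;
            printf("SOME = %.1f%%  FULL = %.1f%%\n", some * 100, full * 100);
    }

    int main(void)
    {
            psi_model(257, 1, 256, 256);  /* 256 CPUs, 257 hogs: 0.4% / 0%       */
            psi_model(4, 1, 3, 4);        /* 4 tasks, 1 in memstall: 25% / 25%   */
            return 0;
    }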
100 * To assess the precise time spent in each such state, we would have
111 * tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0)
112 * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu])
113 * tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0)
126 * cost-wise, yet way more sensitive and accurate than periodic
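The surrounding (unmatched) lines of the header combine these per-CPU times into the global figures, weighting each CPU by its non-idle time (the same idea stated again in collect_percpu_times() below). Paraphrasing rather than quoting:

    tNONIDLE = sum(tNONIDLE[i])
    tSOME    = sum(tSOME[i] * tNONIDLE[i]) / tNONIDLE
    tFULL    = sum(tFULL[i] * tNONIDLE[i]) / tNONIDLE

    %SOME    = tSOME / period
    %FULL    = tFULL / period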
160 /* Running averages - we need to be higher-res than loadavg */
162 #define EXP_10s 1677 /* 1/exp(2s/10s) as fixed-point */
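The 1677 constant can be checked by hand, assuming the usual loadavg fixed-point base FIXED_1 = 1 << 11 = 2048 (the scale calc_load() works in): exp(-2s/10s) = exp(-0.2) ≈ 0.8187, and 0.8187 * 2048 ≈ 1677. The companion EXP_60s and EXP_300s constants (elided from this listing) follow the same pattern for the 60s and 300s averages.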
167 #define WINDOW_MIN_US 500000 /* Min window size is 500ms */
174 /* System-level pressure and stall tracking */
187 seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); in group_init()
188 group->avg_last_update = sched_clock(); in group_init()
189 group->avg_next_update = group->avg_last_update + psi_period; in group_init()
190 INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); in group_init()
191 mutex_init(&group->avgs_lock); in group_init()
192 /* Init trigger-related members */ in group_init()
193 mutex_init(&group->trigger_lock); in group_init()
194 INIT_LIST_HEAD(&group->triggers); in group_init()
195 memset(group->nr_triggers, 0, sizeof(group->nr_triggers)); in group_init()
196 group->poll_states = 0; in group_init()
197 group->poll_min_period = U32_MAX; in group_init()
198 memset(group->polling_total, 0, sizeof(group->polling_total)); in group_init()
199 group->polling_next_update = ULLONG_MAX; in group_init()
200 group->polling_until = 0; in group_init()
201 rcu_assign_pointer(group->poll_task, NULL); in group_init()
240 struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); in get_recent_times()
250 seq = read_seqcount_begin(&groupc->seq); in get_recent_times()
252 memcpy(times, groupc->times, sizeof(groupc->times)); in get_recent_times()
253 state_mask = groupc->state_mask; in get_recent_times()
254 state_start = groupc->state_start; in get_recent_times()
255 } while (read_seqcount_retry(&groupc->seq, seq)); in get_recent_times()
257 /* Calculate state time deltas against the previous snapshot */ in get_recent_times()
270 times[s] += now - state_start; in get_recent_times()
272 delta = times[s] - groupc->times_prev[aggregator][s]; in get_recent_times()
273 groupc->times_prev[aggregator][s] = times[s]; in get_recent_times()
282 u64 time, u64 period) in calc_avgs() argument
293 /* Sample the most recent active period */ in calc_avgs()
294 pct = div_u64(time * 100, period); in calc_avgs()
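calc_avgs() then folds this percentage into exponentially weighted moving averages, reusing the loadavg machinery (hence the EXP_* constants above). A simplified standalone sketch of one such update step, assuming FIXED_1 = 2048; this is illustrative, not the kernel's calc_load():

    #define FIXED_1 2048UL                  /* 1.0 in 11-bit fixed point */
    #define EXP_10s 1677UL                  /* exp(-2s/10s), same scale  */

    /* avg and pct_fp are FIXED_1-scaled percentages. */
    static unsigned long ewma_step(unsigned long avg, unsigned long exp,
                                   unsigned long pct_fp)
    {
            /* new = old * w + sample * (1 - w) */
            return (avg * exp + pct_fp * (FIXED_1 - exp)) / FIXED_1;
    }

The averages stay in this fixed-point form internally and are only converted back to a human-readable x.yz percentage when psi_show() prints them.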
305 u64 deltas[NR_PSI_STATES - 1] = { 0, }; in collect_percpu_times()
312 * Collect the per-cpu time buckets and average them into a in collect_percpu_times()
313 * single time sample that is normalized to wallclock time. in collect_percpu_times()
315 * For averaging, each CPU is weighted by its non-idle time in in collect_percpu_times()
336 * Integrate the sample into the running statistics that are in collect_percpu_times()
348 for (s = 0; s < NR_PSI_STATES - 1; s++) in collect_percpu_times()
349 group->total[aggregator][s] += in collect_percpu_times()
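A self-contained sketch of that non-idle weighting (illustrative only; the real collect_percpu_times() does this per PSI state on nanosecond buckets):

    /* Average per-CPU stall deltas, weighting each CPU by its non-idle time. */
    static unsigned long long weighted_stall(const unsigned long long *stall,
                                             const unsigned long long *nonidle,
                                             int nr_cpus)
    {
            unsigned long long num = 0, den = 0;
            int cpu;

            for (cpu = 0; cpu < nr_cpus; cpu++) {
                    num += stall[cpu] * nonidle[cpu];
                    den += nonidle[cpu];
            }
            return den ? num / den : 0;
    }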
364 expires = group->avg_next_update; in update_averages()
365 if (now - expires >= psi_period) in update_averages()
366 missed_periods = div_u64(now - expires, psi_period); in update_averages()
372 * But the deltas we sample out of the per-cpu buckets above in update_averages()
373 * are based on the actual time elapsing between clock ticks. in update_averages()
376 period = now - (group->avg_last_update + (missed_periods * psi_period)); in update_averages()
377 group->avg_last_update = now; in update_averages()
379 for (s = 0; s < NR_PSI_STATES - 1; s++) { in update_averages()
380 u32 sample; in update_averages() local
382 sample = group->total[PSI_AVGS][s] - group->avg_total[s]; in update_averages()
384 * Due to the lockless sampling of the time buckets, in update_averages()
385 * recorded time deltas can slip into the next period, in update_averages()
389 * We don't want to report nonsensical pressures in in update_averages()
398 * it frees up its time T in P. in update_averages()
400 if (sample > period) in update_averages()
401 sample = period; in update_averages()
402 group->avg_total[s] += sample; in update_averages()
403 calc_avgs(group->avg[s], missed_periods, sample, period); in update_averages()
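With hypothetical numbers: if the averaging period is 2s but the lockless read picks up 2.3s of stall (0.3s having slipped over from the next period), the sample is clamped to 2s here. Since avg_total[] only advances by the clamped amount, the extra 0.3s remains in total[PSI_AVGS] and is reported in the following period rather than being lost.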
420 mutex_lock(&group->avgs_lock); in psi_avgs_work()
427 * If there is task activity, periodically fold the per-cpu in psi_avgs_work()
431 * go - see calc_avgs() and missed_periods. in psi_avgs_work()
433 if (now >= group->avg_next_update) in psi_avgs_work()
434 group->avg_next_update = update_averages(group, now); in psi_avgs_work()
438 group->avg_next_update - now) + 1); in psi_avgs_work()
441 mutex_unlock(&group->avgs_lock); in psi_avgs_work()
448 win->start_time = now; in window_reset()
449 win->start_value = value; in window_reset()
450 win->prev_growth = prev_growth; in window_reset()
469 elapsed = now - win->start_time; in window_update()
470 growth = value - win->start_value; in window_update()
472 * After each tracking window passes win->start_value and in window_update()
473 * win->start_time get reset and win->prev_growth stores in window_update()
474 * the average per-window growth of the previous window. in window_update()
475 * win->prev_growth is then used to interpolate additional in window_update()
478 if (elapsed > win->size) in window_update()
483 remaining = win->size - elapsed; in window_update()
484 growth += div64_u64(win->prev_growth * remaining, win->size); in window_update()
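A worked example with hypothetical numbers: for a 1s window with 400ms elapsed, 30ms of new stall observed so far, and 50ms of growth in the previous window, the reported growth is 30ms + 50ms * 600/1000 = 60ms; the 60% of the window that has not yet elapsed is filled in at the previous window's rate.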
494 list_for_each_entry(t, &group->triggers, node) in init_triggers()
495 window_reset(&t->win, now, in init_triggers()
496 group->total[PSI_POLL][t->state], 0); in init_triggers()
497 memcpy(group->polling_total, group->total[PSI_POLL], in init_triggers()
498 sizeof(group->polling_total)); in init_triggers()
499 group->polling_next_update = now + group->poll_min_period; in init_triggers()
506 u64 *total = group->total[PSI_POLL]; in update_triggers()
512 list_for_each_entry(t, &group->triggers, node) { in update_triggers()
516 if (group->polling_total[t->state] == total[t->state]) in update_triggers()
521 * remember to update group->polling_total[] once we've in update_triggers()
523 * polling time if we see new stall activity. in update_triggers()
528 growth = window_update(&t->win, now, total[t->state]); in update_triggers()
529 if (growth < t->threshold) in update_triggers()
533 if (now < t->last_event_time + t->win.size) in update_triggers()
537 if (cmpxchg(&t->event, 0, 1) == 0) in update_triggers()
538 wake_up_interruptible(&t->event_wait); in update_triggers()
539 t->last_event_time = now; in update_triggers()
543 memcpy(group->polling_total, total, in update_triggers()
544 sizeof(group->polling_total)); in update_triggers()
546 return now + group->poll_min_period; in update_triggers()
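Putting the pieces together with hypothetical numbers: a trigger configured with a 150ms threshold over a 1s window fires once window_update() reports at least 150ms of stall growth within the tracking window, and the t->last_event_time check above rate-limits it to at most one wakeup per window even if the stall keeps growing.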
557 * mod_timer below can be tolerated because group->polling_next_update in psi_schedule_poll_work()
560 if (timer_pending(&group->poll_timer)) in psi_schedule_poll_work()
565 task = rcu_dereference(group->poll_task); in psi_schedule_poll_work()
571 mod_timer(&group->poll_timer, jiffies + delay); in psi_schedule_poll_work()
581 mutex_lock(&group->trigger_lock); in psi_poll_work()
587 if (changed_states & group->poll_states) { in psi_poll_work()
589 if (now > group->polling_until) in psi_poll_work()
597 group->polling_until = now + in psi_poll_work()
598 group->poll_min_period * UPDATES_PER_WINDOW; in psi_poll_work()
601 if (now > group->polling_until) { in psi_poll_work()
602 group->polling_next_update = ULLONG_MAX; in psi_poll_work()
606 if (now >= group->polling_next_update) in psi_poll_work()
607 group->polling_next_update = update_triggers(group, now); in psi_poll_work()
610 nsecs_to_jiffies(group->polling_next_update - now) + 1); in psi_poll_work()
613 mutex_unlock(&group->trigger_lock); in psi_poll_work()
623 wait_event_interruptible(group->poll_wait, in psi_poll_worker()
624 atomic_cmpxchg(&group->poll_wakeup, 1, 0) || in psi_poll_worker()
638 atomic_set(&group->poll_wakeup, 1); in poll_timer_fn()
639 wake_up_interruptible(&group->poll_wait); in poll_timer_fn()
649 delta = now - groupc->state_start; in record_times()
650 groupc->state_start = now; in record_times()
652 if (groupc->state_mask & (1 << PSI_IO_SOME)) { in record_times()
653 groupc->times[PSI_IO_SOME] += delta; in record_times()
654 if (groupc->state_mask & (1 << PSI_IO_FULL)) in record_times()
655 groupc->times[PSI_IO_FULL] += delta; in record_times()
658 if (groupc->state_mask & (1 << PSI_MEM_SOME)) { in record_times()
659 groupc->times[PSI_MEM_SOME] += delta; in record_times()
660 if (groupc->state_mask & (1 << PSI_MEM_FULL)) in record_times()
661 groupc->times[PSI_MEM_FULL] += delta; in record_times()
663 u32 sample; in record_times() local
672 * regardless of runnable tasks, sample a FULL in record_times()
676 sample = min(delta, (u32)jiffies_to_nsecs(1)); in record_times()
677 groupc->times[PSI_MEM_FULL] += sample; in record_times()
681 if (groupc->state_mask & (1 << PSI_CPU_SOME)) in record_times()
682 groupc->times[PSI_CPU_SOME] += delta; in record_times()
684 if (groupc->state_mask & (1 << PSI_NONIDLE)) in record_times()
685 groupc->times[PSI_NONIDLE] += delta; in record_times()
697 groupc = per_cpu_ptr(group->pcpu, cpu); in psi_group_change()
702 * SOME and FULL time these may have resulted in. in psi_group_change()
707 write_seqcount_begin(&groupc->seq); in psi_group_change()
714 if (groupc->tasks[t] == 0 && !psi_bug) { in psi_group_change()
716 cpu, t, groupc->tasks[0], in psi_group_change()
717 groupc->tasks[1], groupc->tasks[2], in psi_group_change()
718 groupc->tasks[3], clear, set); in psi_group_change()
721 groupc->tasks[t]--; in psi_group_change()
726 groupc->tasks[t]++; in psi_group_change()
730 if (test_state(groupc->tasks, s)) in psi_group_change()
733 groupc->state_mask = state_mask; in psi_group_change()
735 write_seqcount_end(&groupc->seq); in psi_group_change()
737 if (state_mask & group->poll_states) in psi_group_change()
740 if (wake_clock && !delayed_work_pending(&group->avgs_work)) in psi_group_change()
741 schedule_delayed_work(&group->avgs_work, PSI_FREQ); in psi_group_change()
750 cgroup = task->cgroups->dfl_cgrp; in iterate_groups()
770 if (((task->psi_flags & set) || in psi_flags_change()
771 (task->psi_flags & clear) != clear) && in psi_flags_change()
774 task->pid, task->comm, task_cpu(task), in psi_flags_change()
775 task->psi_flags, clear, set); in psi_flags_change()
779 task->psi_flags &= ~clear; in psi_flags_change()
780 task->psi_flags |= set; in psi_flags_change()
790 if (!task->pid) in psi_task_change()
799 * itself going to sleep, or we'll ping-pong forever. in psi_task_change()
802 (task->flags & PF_WQ_WORKER) && in psi_task_change()
817 if (next->pid) { in psi_task_switch()
828 if (per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) { in psi_task_switch()
845 if (prev->pid) { in psi_task_switch()
862 groupc = per_cpu_ptr(group->pcpu, cpu); in psi_memstall_tick()
863 write_seqcount_begin(&groupc->seq); in psi_memstall_tick()
865 write_seqcount_end(&groupc->seq); in psi_memstall_tick()
870 * psi_memstall_enter - mark the beginning of a memory stall section
884 *flags = current->in_memstall; in psi_memstall_enter()
894 current->in_memstall = 1; in psi_memstall_enter()
901 * psi_memstall_leave - mark the end of a memory stall section
923 current->in_memstall = 0; in psi_memstall_leave()
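For reference, callers bracket the blocking operation with this pair; a minimal sketch of the calling convention (the middle step is a placeholder, not real code):

    unsigned long pflags;

    psi_memstall_enter(&pflags);
    /* ... stall on memory here: direct reclaim, thrashing refaults, ... */
    psi_memstall_leave(&pflags);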
935 cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu); in psi_cgroup_alloc()
936 if (!cgroup->psi.pcpu) in psi_cgroup_alloc()
937 return -ENOMEM; in psi_cgroup_alloc()
938 group_init(&cgroup->psi); in psi_cgroup_alloc()
947 cancel_delayed_work_sync(&cgroup->psi.avgs_work); in psi_cgroup_free()
948 free_percpu(cgroup->psi.pcpu); in psi_cgroup_free()
950 WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n"); in psi_cgroup_free()
954 * cgroup_move_task - move task to a different cgroup
962 * changes to the task's scheduling state and - in case the task is
963 * running - concurrent changes to its stall state.
976 rcu_assign_pointer(task->cgroups, to); in cgroup_move_task()
986 } else if (task->in_iowait) in cgroup_move_task()
989 if (task->in_memstall) in cgroup_move_task()
996 rcu_assign_pointer(task->cgroups, to); in cgroup_move_task()
1011 return -EOPNOTSUPP; in psi_show()
1014 mutex_lock(&group->avgs_lock); in psi_show()
1017 if (now >= group->avg_next_update) in psi_show()
1018 group->avg_next_update = update_averages(group, now); in psi_show()
1019 mutex_unlock(&group->avgs_lock); in psi_show()
1021 for (full = 0; full < 2 - (res == PSI_CPU); full++) { in psi_show()
1027 avg[w] = group->avg[res * 2 + full][w]; in psi_show()
1028 total = div_u64(group->total[PSI_AVGS][res * 2 + full], in psi_show()
1081 return ERR_PTR(-EOPNOTSUPP); in psi_trigger_create()
1088 return ERR_PTR(-EINVAL); in psi_trigger_create()
1091 return ERR_PTR(-EINVAL); in psi_trigger_create()
1095 return ERR_PTR(-EINVAL); in psi_trigger_create()
1099 return ERR_PTR(-EINVAL); in psi_trigger_create()
1103 return ERR_PTR(-ENOMEM); in psi_trigger_create()
1105 t->group = group; in psi_trigger_create()
1106 t->state = state; in psi_trigger_create()
1107 t->threshold = threshold_us * NSEC_PER_USEC; in psi_trigger_create()
1108 t->win.size = window_us * NSEC_PER_USEC; in psi_trigger_create()
1109 window_reset(&t->win, 0, 0, 0); in psi_trigger_create()
1111 t->event = 0; in psi_trigger_create()
1112 t->last_event_time = 0; in psi_trigger_create()
1113 init_waitqueue_head(&t->event_wait); in psi_trigger_create()
1114 kref_init(&t->refcount); in psi_trigger_create()
1116 mutex_lock(&group->trigger_lock); in psi_trigger_create()
1118 if (!rcu_access_pointer(group->poll_task)) { in psi_trigger_create()
1124 mutex_unlock(&group->trigger_lock); in psi_trigger_create()
1127 atomic_set(&group->poll_wakeup, 0); in psi_trigger_create()
1128 init_waitqueue_head(&group->poll_wait); in psi_trigger_create()
1130 timer_setup(&group->poll_timer, poll_timer_fn, 0); in psi_trigger_create()
1131 rcu_assign_pointer(group->poll_task, task); in psi_trigger_create()
1134 list_add(&t->node, &group->triggers); in psi_trigger_create()
1135 group->poll_min_period = min(group->poll_min_period, in psi_trigger_create()
1136 div_u64(t->win.size, UPDATES_PER_WINDOW)); in psi_trigger_create()
1137 group->nr_triggers[t->state]++; in psi_trigger_create()
1138 group->poll_states |= (1 << t->state); in psi_trigger_create()
1140 mutex_unlock(&group->trigger_lock); in psi_trigger_create()
1148 struct psi_group *group = t->group; in psi_trigger_destroy()
1158 wake_up_interruptible(&t->event_wait); in psi_trigger_destroy()
1160 mutex_lock(&group->trigger_lock); in psi_trigger_destroy()
1162 if (!list_empty(&t->node)) { in psi_trigger_destroy()
1166 list_del(&t->node); in psi_trigger_destroy()
1167 group->nr_triggers[t->state]--; in psi_trigger_destroy()
1168 if (!group->nr_triggers[t->state]) in psi_trigger_destroy()
1169 group->poll_states &= ~(1 << t->state); in psi_trigger_destroy()
1170 /* reset min update period for the remaining triggers */ in psi_trigger_destroy()
1171 list_for_each_entry(tmp, &group->triggers, node) in psi_trigger_destroy()
1172 period = min(period, div_u64(tmp->win.size, in psi_trigger_destroy()
1174 group->poll_min_period = period; in psi_trigger_destroy()
1176 if (group->poll_states == 0) { in psi_trigger_destroy()
1177 group->polling_until = 0; in psi_trigger_destroy()
1179 group->poll_task, in psi_trigger_destroy()
1180 lockdep_is_held(&group->trigger_lock)); in psi_trigger_destroy()
1181 rcu_assign_pointer(group->poll_task, NULL); in psi_trigger_destroy()
1185 mutex_unlock(&group->trigger_lock); in psi_trigger_destroy()
1189 * poll_task RCUs to complete their read-side critical sections in psi_trigger_destroy()
1200 * can no longer be found through group->poll_task. in psi_trigger_destroy()
1202 * that - deschedule it cleanly before destroying it. in psi_trigger_destroy()
1204 del_timer_sync(&group->poll_timer); in psi_trigger_destroy()
1219 kref_put(&old->refcount, psi_trigger_destroy); in psi_trigger_replace()
1238 kref_get(&t->refcount); in psi_trigger_poll()
1242 poll_wait(file, &t->event_wait, wait); in psi_trigger_poll()
1244 if (cmpxchg(&t->event, 1, 0) == 1) in psi_trigger_poll()
1247 kref_put(&t->refcount, psi_trigger_destroy); in psi_trigger_poll()
1261 return -EOPNOTSUPP; in psi_write()
1264 return -EINVAL; in psi_write()
1266 buf_size = min(nbytes, sizeof(buf)); in psi_write()
1268 return -EFAULT; in psi_write()
1270 buf[buf_size - 1] = '\0'; in psi_write()
1276 seq = file->private_data; in psi_write()
1277 /* Take seq->lock to protect seq->private from concurrent writes */ in psi_write()
1278 mutex_lock(&seq->lock); in psi_write()
1279 psi_trigger_replace(&seq->private, new); in psi_write()
1280 mutex_unlock(&seq->lock); in psi_write()
1305 struct seq_file *seq = file->private_data; in psi_fop_poll()
1307 return psi_trigger_poll(&seq->private, file, wait); in psi_fop_poll()
1312 struct seq_file *seq = file->private_data; in psi_fop_release()
1314 psi_trigger_replace(&seq->private, NULL); in psi_fop_release()
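To tie the trigger file operations above together, here is a small userspace sketch in the spirit of the example in Documentation/accounting/psi.rst: it registers a trigger for 150ms of SOME memory stall within any 1s window and waits for events with poll() (error handling trimmed):

    #include <fcntl.h>
    #include <poll.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            const char trig[] = "some 150000 1000000";  /* <threshold us> <window us> */
            struct pollfd fds;

            fds.fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
            if (fds.fd < 0)
                    return 1;
            if (write(fds.fd, trig, strlen(trig) + 1) < 0)
                    return 1;
            fds.events = POLLPRI;

            for (;;) {
                    if (poll(&fds, 1, -1) < 0)
                            break;
                    if (fds.revents & POLLERR)
                            break;                      /* monitor went away */
                    if (fds.revents & POLLPRI)
                            printf("memory pressure event\n");
            }
            return 0;
    }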