Lines Matching +full:avg +full:- +full:samples
33 * In the FULL state of a given resource, all non-idle tasks are
38 * system level, but exist at the cgroup level, means all non-idle tasks
57 * unrealized due to resource contention *also* scales with non-idle
76 * we have to base our calculation on the number of non-idle tasks in
84 * FULL = (threads - min(nr_running_tasks, threads)) / threads
90 * FULL = (256 - min(257, 256)) / 256 = 0%
92 * For the 1 out of 4 memory-delayed tasks, this yields:
96 * FULL = (4 - min(3, 4)) / 4 = 25%
99 * extension of the single-CPU model. ]
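A minimal userspace sketch of the multi-CPU SOME/FULL model quoted above; the helper and its argument names are mine, and the two calls reproduce the 257-on-256 and 1-of-4 examples from the comment:

	#include <stdio.h>

	/* Illustrative only -- not kernel code. Computes the documented model:
	 *   threads = min(nr_nonidle, nr_cpus)
	 *   SOME    = min(nr_delayed / threads, 1)
	 *   FULL    = (threads - min(nr_running, threads)) / threads
	 */
	static void psi_model(double nr_cpus, double nr_nonidle,
			      double nr_running, double nr_delayed)
	{
		double threads = nr_nonidle < nr_cpus ? nr_nonidle : nr_cpus;
		double some = nr_delayed / threads;
		double full = (threads - (nr_running < threads ? nr_running : threads)) / threads;

		if (some > 1.0)
			some = 1.0;
		printf("SOME = %.1f%%  FULL = %.1f%%\n", some * 100, full * 100);
	}

	int main(void)
	{
		psi_model(256, 257, 256, 1);	/* 257 number crunchers on 256 CPUs: 0.4% / 0% */
		psi_model(4, 4, 3, 1);		/* 1 of 4 tasks delayed on memory: 25% / 25% */
		return 0;
	}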
129 * cost-wise, yet way more sensitive and accurate than periodic
164 /* Running averages - we need to be higher-res than loadavg */
166 #define EXP_10s 1677 /* 1/exp(2s/10s) as fixed-point */
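These constants are the per-period decay factor e^(-2s/tau) expressed in loadavg's 1/2048 fixed point (FIXED_1 = 1 << 11). A quick userspace check of the values, not kernel code (compile with -lm):

	#include <math.h>
	#include <stdio.h>

	int main(void)
	{
		double fixed_1 = 1 << 11;	/* loadavg fixed-point base, 2048 */

		/* 2s sampling period decayed against the 10s/60s/300s horizons */
		printf("EXP_10s  ~ %.0f\n", fixed_1 * exp(-2.0 / 10));	/* ~1677 */
		printf("EXP_60s  ~ %.0f\n", fixed_1 * exp(-2.0 / 60));	/* ~1981 */
		printf("EXP_300s ~ %.0f\n", fixed_1 * exp(-2.0 / 300));	/* ~2034 */
		return 0;
	}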
178 /* System-level pressure and stall tracking */
193 seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); in group_init()
194 group->avg_last_update = sched_clock(); in group_init()
195 group->avg_next_update = group->avg_last_update + psi_period; in group_init()
196 INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); in group_init()
197 mutex_init(&group->avgs_lock); in group_init()
198 /* Init trigger-related members */ in group_init()
199 mutex_init(&group->trigger_lock); in group_init()
200 INIT_LIST_HEAD(&group->triggers); in group_init()
201 memset(group->nr_triggers, 0, sizeof(group->nr_triggers)); in group_init()
202 group->poll_states = 0; in group_init()
203 group->poll_min_period = U32_MAX; in group_init()
204 memset(group->polling_total, 0, sizeof(group->polling_total)); in group_init()
205 group->polling_next_update = ULLONG_MAX; in group_init()
206 group->polling_until = 0; in group_init()
207 init_waitqueue_head(&group->poll_wait); in group_init()
208 timer_setup(&group->poll_timer, poll_timer_fn, 0); in group_init()
209 rcu_assign_pointer(group->poll_task, NULL); in group_init()
253 struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); in get_recent_times()
263 seq = read_seqcount_begin(&groupc->seq); in get_recent_times()
265 memcpy(times, groupc->times, sizeof(groupc->times)); in get_recent_times()
266 state_mask = groupc->state_mask; in get_recent_times()
267 state_start = groupc->state_start; in get_recent_times()
268 } while (read_seqcount_retry(&groupc->seq, seq)); in get_recent_times()
283 times[s] += now - state_start; in get_recent_times()
285 delta = times[s] - groupc->times_prev[aggregator][s]; in get_recent_times()
286 groupc->times_prev[aggregator][s] = times[s]; in get_recent_times()
294 static void calc_avgs(unsigned long avg[3], int missed_periods, in calc_avgs()
301 avg[0] = calc_load_n(avg[0], EXP_10s, 0, missed_periods); in calc_avgs()
302 avg[1] = calc_load_n(avg[1], EXP_60s, 0, missed_periods); in calc_avgs()
303 avg[2] = calc_load_n(avg[2], EXP_300s, 0, missed_periods); in calc_avgs()
309 avg[0] = calc_load(avg[0], EXP_10s, pct); in calc_avgs()
310 avg[1] = calc_load(avg[1], EXP_60s, pct); in calc_avgs()
311 avg[2] = calc_load(avg[2], EXP_300s, pct); in calc_avgs()
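calc_load() (from include/linux/sched/loadavg.h) applies one step of a fixed-point exponential moving average to the period's stall percentage. A simplified userspace sketch of that update, and of how the 10s average converges under a steady 50% pressure; the helper name is mine and the kernel's round-up tweak is omitted:

	#include <stdio.h>

	#define FSHIFT	11
	#define FIXED_1	(1UL << FSHIFT)
	#define EXP_10s	1677

	/* Illustrative only: avg' = avg*e + sample*(1 - e), in 1/2048 fixed point */
	static unsigned long ema_step(unsigned long avg, unsigned long e,
				      unsigned long sample)
	{
		return (avg * e + sample * (FIXED_1 - e)) / FIXED_1;
	}

	int main(void)
	{
		unsigned long avg = 0, pct = 50 * FIXED_1;	/* steady 50% pressure */
		int i;

		for (i = 0; i < 10; i++)			/* ten 2s periods = 20 seconds */
			avg = ema_step(avg, EXP_10s, pct);
		printf("avg10 after 20s: %lu.%02lu%%\n",
		       avg >> FSHIFT, ((avg & (FIXED_1 - 1)) * 100) >> FSHIFT);
		return 0;
	}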
318 u64 deltas[NR_PSI_STATES - 1] = { 0, }; in collect_percpu_times()
325 * Collect the per-cpu time buckets and average them into a in collect_percpu_times()
328 * For averaging, each CPU is weighted by its non-idle time in in collect_percpu_times()
361 for (s = 0; s < NR_PSI_STATES - 1; s++) in collect_percpu_times()
362 group->total[aggregator][s] += in collect_percpu_times()
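Per the comment above, the fold is a non-idle-time weighted average across CPUs, roughly total[s] += sum_cpu(delta_cpu[s] * nonidle_cpu) / sum_cpu(nonidle_cpu) (sketched from mainline psi.c; the division by the summed non-idle time sits on a continuation line not shown here), so a CPU that was idle for the whole period contributes nothing to the group's pressure.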
377 expires = group->avg_next_update; in update_averages()
378 if (now - expires >= psi_period) in update_averages()
379 missed_periods = div_u64(now - expires, psi_period); in update_averages()
385 * But the deltas we sample out of the per-cpu buckets above in update_averages()
389 period = now - (group->avg_last_update + (missed_periods * psi_period)); in update_averages()
390 group->avg_last_update = now; in update_averages()
392 for (s = 0; s < NR_PSI_STATES - 1; s++) { in update_averages()
395 sample = group->total[PSI_AVGS][s] - group->avg_total[s]; in update_averages()
399 * which under full pressure can result in samples in in update_averages()
402 * We don't want to report nonsensical pressures in in update_averages()
415 group->avg_total[s] += sample; in update_averages()
416 calc_avgs(group->avg[s], missed_periods, sample, period); in update_averages()
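For example, if lockless sampling lets 2.05 s of stall slip into a nominal 2 s period, the sample is clamped to the period length so the reported pressure tops out at 100% rather than ~102%.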
433 mutex_lock(&group->avgs_lock); in psi_avgs_work()
440 * If there is task activity, periodically fold the per-cpu in psi_avgs_work()
441 * times and feed samples into the running averages. If things in psi_avgs_work()
444 * go - see calc_avgs() and missed_periods. in psi_avgs_work()
446 if (now >= group->avg_next_update) in psi_avgs_work()
447 group->avg_next_update = update_averages(group, now); in psi_avgs_work()
451 group->avg_next_update - now) + 1); in psi_avgs_work()
454 mutex_unlock(&group->avgs_lock); in psi_avgs_work()
461 win->start_time = now; in window_reset()
462 win->start_value = value; in window_reset()
463 win->prev_growth = prev_growth; in window_reset()
482 elapsed = now - win->start_time; in window_update()
483 growth = value - win->start_value; in window_update()
485 * After each tracking window passes win->start_value and in window_update()
486 * win->start_time get reset and win->prev_growth stores in window_update()
487 * the average per-window growth of the previous window. in window_update()
488 * win->prev_growth is then used to interpolate additional in window_update()
491 if (elapsed > win->size) in window_update()
496 remaining = win->size - elapsed; in window_update()
497 growth += div64_u64(win->prev_growth * remaining, win->size); in window_update()
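As an illustrative example: with win->size = 1 s, prev_growth = 200 ms, 400 ms elapsed in the current window and 100 ms of new stall in it, the reported growth is 100 ms + 200 ms * 600/1000 = 220 ms; the not-yet-elapsed 60% of the window is filled in at the previous window's rate.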
507 list_for_each_entry(t, &group->triggers, node) in init_triggers()
508 window_reset(&t->win, now, in init_triggers()
509 group->total[PSI_POLL][t->state], 0); in init_triggers()
510 memcpy(group->polling_total, group->total[PSI_POLL], in init_triggers()
511 sizeof(group->polling_total)); in init_triggers()
512 group->polling_next_update = now + group->poll_min_period; in init_triggers()
519 u64 *total = group->total[PSI_POLL]; in update_triggers()
525 list_for_each_entry(t, &group->triggers, node) { in update_triggers()
529 if (group->polling_total[t->state] == total[t->state]) in update_triggers()
534 * remember to update group->polling_total[] once we've in update_triggers()
541 growth = window_update(&t->win, now, total[t->state]); in update_triggers()
542 if (growth < t->threshold) in update_triggers()
546 if (now < t->last_event_time + t->win.size) in update_triggers()
550 if (cmpxchg(&t->event, 0, 1) == 0) in update_triggers()
551 wake_up_interruptible(&t->event_wait); in update_triggers()
552 t->last_event_time = now; in update_triggers()
556 memcpy(group->polling_total, total, in update_triggers()
557 sizeof(group->polling_total)); in update_triggers()
559 return now + group->poll_min_period; in update_triggers()
570 * mod_timer below can be tolerated because group->polling_next_update in psi_schedule_poll_work()
573 if (timer_pending(&group->poll_timer)) in psi_schedule_poll_work()
578 task = rcu_dereference(group->poll_task); in psi_schedule_poll_work()
584 mod_timer(&group->poll_timer, jiffies + delay); in psi_schedule_poll_work()
594 mutex_lock(&group->trigger_lock); in psi_poll_work()
600 if (changed_states & group->poll_states) { in psi_poll_work()
602 if (now > group->polling_until) in psi_poll_work()
610 group->polling_until = now + in psi_poll_work()
611 group->poll_min_period * UPDATES_PER_WINDOW; in psi_poll_work()
614 if (now > group->polling_until) { in psi_poll_work()
615 group->polling_next_update = ULLONG_MAX; in psi_poll_work()
619 if (now >= group->polling_next_update) in psi_poll_work()
620 group->polling_next_update = update_triggers(group, now); in psi_poll_work()
623 nsecs_to_jiffies(group->polling_next_update - now) + 1); in psi_poll_work()
626 mutex_unlock(&group->trigger_lock); in psi_poll_work()
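So, assuming UPDATES_PER_WINDOW is 10 as in mainline psi.c, a trigger with a 1 s window gives poll_min_period = 100 ms, and the poller keeps re-arming itself for one full window (10 x 100 ms) after the last matching state change before going idle again.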
636 wait_event_interruptible(group->poll_wait, in psi_poll_worker()
637 atomic_cmpxchg(&group->poll_wakeup, 1, 0) || in psi_poll_worker()
651 atomic_set(&group->poll_wakeup, 1); in poll_timer_fn()
652 wake_up_interruptible(&group->poll_wait); in poll_timer_fn()
659 delta = now - groupc->state_start; in record_times()
660 groupc->state_start = now; in record_times()
662 if (groupc->state_mask & (1 << PSI_IO_SOME)) { in record_times()
663 groupc->times[PSI_IO_SOME] += delta; in record_times()
664 if (groupc->state_mask & (1 << PSI_IO_FULL)) in record_times()
665 groupc->times[PSI_IO_FULL] += delta; in record_times()
668 if (groupc->state_mask & (1 << PSI_MEM_SOME)) { in record_times()
669 groupc->times[PSI_MEM_SOME] += delta; in record_times()
670 if (groupc->state_mask & (1 << PSI_MEM_FULL)) in record_times()
671 groupc->times[PSI_MEM_FULL] += delta; in record_times()
674 if (groupc->state_mask & (1 << PSI_CPU_SOME)) { in record_times()
675 groupc->times[PSI_CPU_SOME] += delta; in record_times()
676 if (groupc->state_mask & (1 << PSI_CPU_FULL)) in record_times()
677 groupc->times[PSI_CPU_FULL] += delta; in record_times()
680 if (groupc->state_mask & (1 << PSI_NONIDLE)) in record_times()
681 groupc->times[PSI_NONIDLE] += delta; in record_times()
693 groupc = per_cpu_ptr(group->pcpu, cpu); in psi_group_change()
703 write_seqcount_begin(&groupc->seq); in psi_group_change()
710 if (groupc->tasks[t]) { in psi_group_change()
711 groupc->tasks[t]--; in psi_group_change()
714 cpu, t, groupc->tasks[0], in psi_group_change()
715 groupc->tasks[1], groupc->tasks[2], in psi_group_change()
716 groupc->tasks[3], clear, set); in psi_group_change()
723 groupc->tasks[t]++; in psi_group_change()
727 if (test_state(groupc->tasks, s)) in psi_group_change()
739 if (unlikely(groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall)) in psi_group_change()
742 groupc->state_mask = state_mask; in psi_group_change()
744 write_seqcount_end(&groupc->seq); in psi_group_change()
746 if (state_mask & group->poll_states) in psi_group_change()
749 if (wake_clock && !delayed_work_pending(&group->avgs_work)) in psi_group_change()
750 schedule_delayed_work(&group->avgs_work, PSI_FREQ); in psi_group_change()
763 cgroup = task->cgroups->dfl_cgrp; in iterate_groups()
779 if (((task->psi_flags & set) || in psi_flags_change()
780 (task->psi_flags & clear) != clear) && in psi_flags_change()
783 task->pid, task->comm, task_cpu(task), in psi_flags_change()
784 task->psi_flags, clear, set); in psi_flags_change()
788 task->psi_flags &= ~clear; in psi_flags_change()
789 task->psi_flags |= set; in psi_flags_change()
800 if (!task->pid) in psi_task_change()
810 * itself going to sleep, or we'll ping-pong forever. in psi_task_change()
813 (task->flags & PF_WQ_WORKER) && in psi_task_change()
829 if (next->pid) { in psi_task_switch()
840 identical_state = prev->psi_flags == next->psi_flags; in psi_task_switch()
844 per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) { in psi_task_switch()
853 if (prev->pid) { in psi_task_switch()
863 if (prev->in_iowait) in psi_task_switch()
886 * psi_memstall_enter - mark the beginning of a memory stall section
900 *flags = current->in_memstall; in psi_memstall_enter()
910 current->in_memstall = 1; in psi_memstall_enter()
917 * psi_memstall_leave - mark the end of a memory stall section in psi_memstall_leave()
939 current->in_memstall = 0; in psi_memstall_leave()
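Kernel callers bracket a blocking memory-stall section with this pair, stashing the previous state in a local; a sketch of the usage pattern found throughout mm/ and block code:

	unsigned long pflags;

	psi_memstall_enter(&pflags);
	/* ... block on reclaim, compaction, swap-in or thrashing readahead ... */
	psi_memstall_leave(&pflags);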
951 cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu); in psi_cgroup_alloc()
952 if (!cgroup->psi.pcpu) in psi_cgroup_alloc()
953 return -ENOMEM; in psi_cgroup_alloc()
954 group_init(&cgroup->psi); in psi_cgroup_alloc()
963 cancel_delayed_work_sync(&cgroup->psi.avgs_work); in psi_cgroup_free()
964 free_percpu(cgroup->psi.pcpu); in psi_cgroup_free()
966 WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n"); in psi_cgroup_free()
970 * cgroup_move_task - move task to a different cgroup
978 * changes to the task's scheduling state and - in case the task is
979 * running - concurrent changes to its stall state.
992 rcu_assign_pointer(task->cgroups, to); in cgroup_move_task()
1008 * p->on_rq = 0 in cgroup_move_task()
1014 * task->cgroups = to in cgroup_move_task()
1022 task_flags = task->psi_flags; in cgroup_move_task()
1028 rcu_assign_pointer(task->cgroups, to); in cgroup_move_task()
1043 return -EOPNOTSUPP; in psi_show()
1046 mutex_lock(&group->avgs_lock); in psi_show()
1049 if (now >= group->avg_next_update) in psi_show()
1050 group->avg_next_update = update_averages(group, now); in psi_show()
1051 mutex_unlock(&group->avgs_lock); in psi_show()
1054 unsigned long avg[3]; in psi_show() local
1059 avg[w] = group->avg[res * 2 + full][w]; in psi_show()
1060 total = div_u64(group->total[PSI_AVGS][res * 2 + full], in psi_show()
1065 LOAD_INT(avg[0]), LOAD_FRAC(avg[0]), in psi_show()
1066 LOAD_INT(avg[1]), LOAD_FRAC(avg[1]), in psi_show()
1067 LOAD_INT(avg[2]), LOAD_FRAC(avg[2]), in psi_show()
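The resulting output, e.g. in /proc/pressure/memory or a cgroup's memory.pressure file, looks like the following (values illustrative); total is the cumulative stall time in microseconds:

	some avg10=0.12 avg60=0.08 avg300=0.02 total=1234567
	full avg10=0.00 avg60=0.00 avg300=0.00 total=345678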
1091 if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE)) in psi_open()
1092 return -EPERM; in psi_open()
1121 return ERR_PTR(-EOPNOTSUPP); in psi_trigger_create()
1128 return ERR_PTR(-EINVAL); in psi_trigger_create()
1131 return ERR_PTR(-EINVAL); in psi_trigger_create()
1135 return ERR_PTR(-EINVAL); in psi_trigger_create()
1139 return ERR_PTR(-EINVAL); in psi_trigger_create()
1143 return ERR_PTR(-ENOMEM); in psi_trigger_create()
1145 t->group = group; in psi_trigger_create()
1146 t->state = state; in psi_trigger_create()
1147 t->threshold = threshold_us * NSEC_PER_USEC; in psi_trigger_create()
1148 t->win.size = window_us * NSEC_PER_USEC; in psi_trigger_create()
1149 window_reset(&t->win, 0, 0, 0); in psi_trigger_create()
1151 t->event = 0; in psi_trigger_create()
1152 t->last_event_time = 0; in psi_trigger_create()
1153 init_waitqueue_head(&t->event_wait); in psi_trigger_create()
1154 kref_init(&t->refcount); in psi_trigger_create()
1156 mutex_lock(&group->trigger_lock); in psi_trigger_create()
1158 if (!rcu_access_pointer(group->poll_task)) { in psi_trigger_create()
1164 mutex_unlock(&group->trigger_lock); in psi_trigger_create()
1167 atomic_set(&group->poll_wakeup, 0); in psi_trigger_create()
1169 rcu_assign_pointer(group->poll_task, task); in psi_trigger_create()
1172 list_add(&t->node, &group->triggers); in psi_trigger_create()
1173 group->poll_min_period = min(group->poll_min_period, in psi_trigger_create()
1174 div_u64(t->win.size, UPDATES_PER_WINDOW)); in psi_trigger_create()
1175 group->nr_triggers[t->state]++; in psi_trigger_create()
1176 group->poll_states |= (1 << t->state); in psi_trigger_create()
1178 mutex_unlock(&group->trigger_lock); in psi_trigger_create()
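Userspace arms such a trigger by writing "<some|full> <threshold_us> <window_us>" to a pressure file opened read-write and then poll()ing it for POLLPRI, along the lines of the example in Documentation/accounting/psi.rst; a minimal monitor (error reporting trimmed) might look like:

	#include <fcntl.h>
	#include <poll.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		/* 150 ms of SOME memory stall within any 1 s window */
		const char trig[] = "some 150000 1000000";
		struct pollfd fds;

		fds.fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
		if (fds.fd < 0)
			return 1;
		if (write(fds.fd, trig, strlen(trig) + 1) < 0)
			return 1;
		fds.events = POLLPRI;

		for (;;) {
			if (poll(&fds, 1, -1) < 0)
				return 1;
			if (fds.revents & POLLERR)
				return 1;	/* monitored trigger went away */
			if (fds.revents & POLLPRI)
				printf("PSI threshold event\n");
		}
	}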
1186 struct psi_group *group = t->group; in psi_trigger_destroy()
1196 wake_up_interruptible(&t->event_wait); in psi_trigger_destroy()
1198 mutex_lock(&group->trigger_lock); in psi_trigger_destroy()
1200 if (!list_empty(&t->node)) { in psi_trigger_destroy()
1204 list_del(&t->node); in psi_trigger_destroy()
1205 group->nr_triggers[t->state]--; in psi_trigger_destroy()
1206 if (!group->nr_triggers[t->state]) in psi_trigger_destroy()
1207 group->poll_states &= ~(1 << t->state); in psi_trigger_destroy()
1209 list_for_each_entry(tmp, &group->triggers, node) in psi_trigger_destroy()
1210 period = min(period, div_u64(tmp->win.size, in psi_trigger_destroy()
1212 group->poll_min_period = period; in psi_trigger_destroy()
1214 if (group->poll_states == 0) { in psi_trigger_destroy()
1215 group->polling_until = 0; in psi_trigger_destroy()
1217 group->poll_task, in psi_trigger_destroy()
1218 lockdep_is_held(&group->trigger_lock)); in psi_trigger_destroy()
1219 rcu_assign_pointer(group->poll_task, NULL); in psi_trigger_destroy()
1220 del_timer(&group->poll_timer); in psi_trigger_destroy()
1224 mutex_unlock(&group->trigger_lock); in psi_trigger_destroy()
1228 * poll_task RCUs to complete their read-side critical sections in psi_trigger_destroy()
1239 * can no longer be found through group->poll_task. in psi_trigger_destroy()
1255 kref_put(&old->refcount, psi_trigger_destroy); in psi_trigger_replace()
1274 kref_get(&t->refcount); in psi_trigger_poll()
1278 poll_wait(file, &t->event_wait, wait); in psi_trigger_poll()
1280 if (cmpxchg(&t->event, 1, 0) == 1) in psi_trigger_poll()
1283 kref_put(&t->refcount, psi_trigger_destroy); in psi_trigger_poll()
1297 return -EOPNOTSUPP; in psi_write()
1300 return -EINVAL; in psi_write()
1304 return -EFAULT; in psi_write()
1306 buf[buf_size - 1] = '\0'; in psi_write()
1312 seq = file->private_data; in psi_write()
1313 /* Take seq->lock to protect seq->private from concurrent writes */ in psi_write()
1314 mutex_lock(&seq->lock); in psi_write()
1315 psi_trigger_replace(&seq->private, new); in psi_write()
1316 mutex_unlock(&seq->lock); in psi_write()
1341 struct seq_file *seq = file->private_data; in psi_fop_poll()
1343 return psi_trigger_poll(&seq->private, file, wait); in psi_fop_poll()
1348 struct seq_file *seq = file->private_data; in psi_fop_release()
1350 psi_trigger_replace(&seq->private, NULL); in psi_fop_release()