/* SPDX-License-Identifier: GPL-2.0 */
2  #ifndef _LINUX_PSI_TYPES_H
3  #define _LINUX_PSI_TYPES_H
4  
5  #include <linux/kthread.h>
6  #include <linux/seqlock.h>
7  #include <linux/types.h>
8  #include <linux/kref.h>
9  #include <linux/wait.h>
10  
11  #ifdef CONFIG_PSI
12  
/*
 * Tracked task states.
 *
 * Each enumerator is the bit position of the matching TSK_* mask, and
 * NR_PSI_TASK_COUNTS sizes the per-CPU tasks[] counter array.
 */
enum psi_task_count {
	NR_IOWAIT,
	NR_MEMSTALL,
	NR_RUNNING,
	/*
	 * For IO and CPU stalls the presence of running/oncpu tasks
	 * in the domain means a partial rather than a full stall.
	 * For memory it's not so simple because of page reclaimers:
	 * they are running/oncpu while representing a stall. To tell
	 * whether a domain has productivity left or not, we need to
	 * distinguish between regular running (i.e. productive)
	 * threads and memstall ones.
	 */
	NR_MEMSTALL_RUNNING,
	/*
	 * Number of tracked states (currently 4). Must remain the last
	 * enumerator; it is derived from the entries above rather than
	 * hard-coded so it cannot collide with a newly added state.
	 */
	NR_PSI_TASK_COUNTS,
};
30  
/*
 * Task state bitmasks: one bit per enum psi_task_count entry, so that
 * several states can be combined in a single word.
 */
#define TSK_IOWAIT	(1 << NR_IOWAIT)
#define TSK_MEMSTALL	(1 << NR_MEMSTALL)
#define TSK_RUNNING	(1 << NR_RUNNING)
#define TSK_MEMSTALL_RUNNING	(1 << NR_MEMSTALL_RUNNING)

/*
 * Only one task can be scheduled, no corresponding task count.
 * The flag takes the first bit above the counted task states.
 */
#define TSK_ONCPU	(1 << NR_PSI_TASK_COUNTS)
39  
/*
 * Resources that workloads could be stalled on.
 *
 * PSI_IRQ only exists when CONFIG_IRQ_TIME_ACCOUNTING is set, so the
 * value of NR_PSI_RESOURCES depends on the kernel configuration.
 */
enum psi_res {
	PSI_IO,
	PSI_MEM,
	PSI_CPU,
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	PSI_IRQ,
#endif
	/* Number of resources above; must remain the last enumerator */
	NR_PSI_RESOURCES,
};
50  
/*
 * Pressure states for each resource:
 *
 * SOME: Stalled tasks & working tasks
 * FULL: Stalled tasks & no working tasks
 *
 * PSI_IRQ_FULL only exists when CONFIG_IRQ_TIME_ACCOUNTING is set, so
 * the values of PSI_NONIDLE and NR_PSI_STATES depend on the kernel
 * configuration.
 */
enum psi_states {
	PSI_IO_SOME,
	PSI_IO_FULL,
	PSI_MEM_SOME,
	PSI_MEM_FULL,
	PSI_CPU_SOME,
	PSI_CPU_FULL,
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	PSI_IRQ_FULL,
#endif
	/*
	 * Only per-CPU, to weigh the CPU in the global average.
	 * NOTE(review): this is presumably why the group-level arrays
	 * are sized NR_PSI_STATES - 1 -- confirm before reordering.
	 */
	PSI_NONIDLE,
	/* Number of states above; must remain the last enumerator */
	NR_PSI_STATES,
};
71  
/*
 * Use one bit in the state mask to track TSK_ONCPU. The bit position is
 * NR_PSI_STATES, i.e. the first one past every enum psi_states value.
 */
#define PSI_ONCPU	(1 << NR_PSI_STATES)
74  
/*
 * Aggregator identifiers. NR_PSI_AGGREGATORS sizes the first dimension
 * of psi_group_cpu::times_prev[][] and psi_group::total[][].
 */
enum psi_aggregators {
	PSI_AVGS = 0,
	PSI_POLL,
	/* Number of aggregators above; must remain the last enumerator */
	NR_PSI_AGGREGATORS,
};
80  
81  struct psi_group_cpu {
82  	/* 1st cacheline updated by the scheduler */
83  
84  	/* Aggregator needs to know of concurrent changes */
85  	seqcount_t seq ____cacheline_aligned_in_smp;
86  
87  	/* States of the tasks belonging to this group */
88  	unsigned int tasks[NR_PSI_TASK_COUNTS];
89  
90  	/* Aggregate pressure state derived from the tasks */
91  	u32 state_mask;
92  
93  	/* Period time sampling buckets for each state of interest (ns) */
94  	u32 times[NR_PSI_STATES];
95  
96  	/* Time of last task change in this group (rq_clock) */
97  	u64 state_start;
98  
99  	/* 2nd cacheline updated by the aggregator */
100  
101  	/* Delta detection against the sampling buckets */
102  	u32 times_prev[NR_PSI_AGGREGATORS][NR_PSI_STATES]
103  			____cacheline_aligned_in_smp;
104  };
105  
106  /* PSI growth tracking window */
107  struct psi_window {
108  	/* Window size in ns */
109  	u64 size;
110  
111  	/* Start time of the current window in ns */
112  	u64 start_time;
113  
114  	/* Value at the start of the window */
115  	u64 start_value;
116  
117  	/* Value growth in the previous window */
118  	u64 prev_growth;
119  };
120  
121  struct psi_trigger {
122  	/* PSI state being monitored by the trigger */
123  	enum psi_states state;
124  
125  	/* User-spacified threshold in ns */
126  	u64 threshold;
127  
128  	/* List node inside triggers list */
129  	struct list_head node;
130  
131  	/* Backpointer needed during trigger destruction */
132  	struct psi_group *group;
133  
134  	/* Wait queue for polling */
135  	wait_queue_head_t event_wait;
136  
137  	/* Pending event flag */
138  	int event;
139  
140  	/* Tracking window */
141  	struct psi_window win;
142  
143  	/*
144  	 * Time last event was generated. Used for rate-limiting
145  	 * events to one per window
146  	 */
147  	u64 last_event_time;
148  
149  	/* Deferred event(s) from previous ratelimit window */
150  	bool pending_event;
151  };
152  
153  struct psi_group {
154  	struct psi_group *parent;
155  	bool enabled;
156  
157  	/* Protects data used by the aggregator */
158  	struct mutex avgs_lock;
159  
160  	/* Per-cpu task state & time tracking */
161  	struct psi_group_cpu __percpu *pcpu;
162  
163  	/* Running pressure averages */
164  	u64 avg_total[NR_PSI_STATES - 1];
165  	u64 avg_last_update;
166  	u64 avg_next_update;
167  
168  	/* Aggregator work control */
169  	struct delayed_work avgs_work;
170  
171  	/* Total stall times and sampled pressure averages */
172  	u64 total[NR_PSI_AGGREGATORS][NR_PSI_STATES - 1];
173  	unsigned long avg[NR_PSI_STATES - 1][3];
174  
175  	/* Monitor work control */
176  	struct task_struct __rcu *poll_task;
177  	struct timer_list poll_timer;
178  	wait_queue_head_t poll_wait;
179  	atomic_t poll_wakeup;
180  
181  	/* Protects data used by the monitor */
182  	struct mutex trigger_lock;
183  
184  	/* Configured polling triggers */
185  	struct list_head triggers;
186  	u32 nr_triggers[NR_PSI_STATES - 1];
187  	u32 poll_states;
188  	u64 poll_min_period;
189  
190  	/* Total stall times at the start of monitor activation */
191  	u64 polling_total[NR_PSI_STATES - 1];
192  	u64 polling_next_update;
193  	u64 polling_until;
194  };
195  
196  #else /* CONFIG_PSI */
197  
/*
 * CONFIG_PSI disabled: there are no tracked resources, and psi_group is
 * kept as an empty type so code embedding or pointing to it still builds.
 */
#define NR_PSI_RESOURCES	0

struct psi_group { };
201  
202  #endif /* CONFIG_PSI */
203  
204  #endif /* _LINUX_PSI_TYPES_H */
205