// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2022 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

/* task->flags for off-cpu analysis */
#define PF_KTHREAD	0x00200000	/* I am a kernel thread */

/* task->state for off-cpu analysis */
#define TASK_INTERRUPTIBLE	0x0001
#define TASK_UNINTERRUPTIBLE	0x0002

/* create a new thread */
#define CLONE_THREAD	0x10000

#define MAX_STACKS	32
#define MAX_ENTRIES	102400

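/* per-task data recorded when a task is scheduled out */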
struct tstamp_data {
	__u32 stack_id;
	__u32 state;
	__u64 timestamp;
};

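/* aggregation key: off-cpu periods sharing the same key are summed up */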
struct offcpu_key {
	__u32 pid;
	__u32 tgid;
	__u32 stack_id;
	__u32 state;
	__u64 cgroup_id;
};

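/* user stack traces, indexed by the id returned from bpf_get_stackid() */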
struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, MAX_STACKS * sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} stacks SEC(".maps");

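/* task local storage holding the tstamp_data for each scheduled-out task */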
struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct tstamp_data);
} tstamp SEC(".maps");

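/* accumulated off-cpu time in nanoseconds, keyed by struct offcpu_key */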
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(struct offcpu_key));
	__uint(value_size, sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} off_cpu SEC(".maps");

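/*
 * allowed CPUs; declared with a single entry here, presumably resized and
 * filled by the loading tool when a CPU filter is in effect
 */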
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cpu_filter SEC(".maps");

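/* allowed tasks, keyed by pid or tgid depending on uses_tgid below */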
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} task_filter SEC(".maps");

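/* allowed cgroups, keyed by cgroup id */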
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cgroup_filter SEC(".maps");

/* new kernel task_struct definition */
struct task_struct___new {
	long __state;
} __attribute__((preserve_access_index));

/* old kernel task_struct definition */
struct task_struct___old {
	long state;
} __attribute__((preserve_access_index));

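/* filter switches, expected to be set by the loading tool before attach */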
int enabled = 0;
int has_cpu = 0;
int has_task = 0;
int has_cgroup = 0;
int uses_tgid = 0;

const volatile bool has_prev_state = false;
const volatile bool needs_cgroup = false;
const volatile bool uses_cgroup_v1 = false;

int perf_subsys_id = -1;

/*
 * Old kernels used to call it task_struct->state; newer kernels call it
 * '__state'.  Use the BPF CO-RE "ignored suffix rule" to deal with it,
 * as described in:
 *
 * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes
 */
static inline int get_task_state(struct task_struct *t)
{
	/* recast pointer to capture new type for compiler */
	struct task_struct___new *t_new = (void *)t;

	if (bpf_core_field_exists(t_new->__state)) {
		return BPF_CORE_READ(t_new, __state);
	} else {
		/* recast pointer to capture old type for compiler */
		struct task_struct___old *t_old = (void *)t;

		return BPF_CORE_READ(t_old, state);
	}
}

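/*
 * cgroup v2 reads the kernfs id of the task's default cgroup; cgroup v1
 * goes through the perf_event subsystem state instead
 */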
static inline __u64 get_cgroup_id(struct task_struct *t)
{
	struct cgroup *cgrp;

	if (!uses_cgroup_v1)
		return BPF_CORE_READ(t, cgroups, dfl_cgrp, kn, id);

	if (perf_subsys_id == -1) {
#if __has_builtin(__builtin_preserve_enum_value)
		perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
						     perf_event_cgrp_id);
#else
		perf_subsys_id = perf_event_cgrp_id;
#endif
	}

	cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_subsys_id], cgroup);
	return BPF_CORE_READ(cgrp, kn, id);
}

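/*
 * apply the CPU, task and cgroup filters; returns 1 if this sched-out
 * event should be recorded
 */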
static inline int can_record(struct task_struct *t, int state)
{
	/* kernel threads don't have user stack */
	if (t->flags & PF_KTHREAD)
		return 0;

	if (state != TASK_INTERRUPTIBLE &&
	    state != TASK_UNINTERRUPTIBLE)
		return 0;

	if (has_cpu) {
		__u32 cpu = bpf_get_smp_processor_id();
		__u8 *ok;

		ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
		if (!ok)
			return 0;
	}

	if (has_task) {
		__u8 *ok;
		__u32 pid;

		if (uses_tgid)
			pid = t->tgid;
		else
			pid = t->pid;

		ok = bpf_map_lookup_elem(&task_filter, &pid);
		if (!ok)
			return 0;
	}

	if (has_cgroup) {
		__u8 *ok;
		__u64 cgrp_id = get_cgroup_id(t);

		ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id);
		if (!ok)
			return 0;
	}

	return 1;
}

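/*
 * Called on every context switch: stamp @prev as it goes off-cpu, then
 * account the elapsed off-cpu time for @next as it comes back on-cpu.
 */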
static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
			struct task_struct *next, int state)
{
	__u64 ts;
	__u32 stack_id;
	struct tstamp_data *pelem;

	ts = bpf_ktime_get_ns();

	if (!can_record(prev, state))
		goto next;

	stack_id = bpf_get_stackid(ctx, &stacks,
				   BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK);

	pelem = bpf_task_storage_get(&tstamp, prev, NULL,
				     BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (!pelem)
		goto next;

	pelem->timestamp = ts;
	pelem->state = state;
	pelem->stack_id = stack_id;

next:
	pelem = bpf_task_storage_get(&tstamp, next, NULL, 0);

	if (pelem && pelem->timestamp) {
		struct offcpu_key key = {
			.pid = next->pid,
			.tgid = next->tgid,
			.stack_id = pelem->stack_id,
			.state = pelem->state,
			.cgroup_id = needs_cgroup ? get_cgroup_id(next) : 0,
		};
		__u64 delta = ts - pelem->timestamp;
		__u64 *total;

		total = bpf_map_lookup_elem(&off_cpu, &key);
		if (total)
			*total += delta;
		else
			bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);

		/* prevent the timestamp from being reused later */
		pelem->timestamp = 0;
	}

	return 0;
}

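/*
 * With tgid-based filtering, propagate the filter to new processes forked
 * from an already-filtered task; threads share the tgid and need no entry.
 */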
SEC("tp_btf/task_newtask")
int on_newtask(u64 *ctx)
{
	struct task_struct *task;
	u64 clone_flags;
	u32 pid;
	u8 val = 1;

	if (!uses_tgid)
		return 0;

	task = (struct task_struct *)bpf_get_current_task();

	pid = BPF_CORE_READ(task, tgid);
	if (!bpf_map_lookup_elem(&task_filter, &pid))
		return 0;

	task = (struct task_struct *)ctx[0];
	clone_flags = ctx[1];

	pid = task->tgid;
	if (!(clone_flags & CLONE_THREAD))
		bpf_map_update_elem(&task_filter, &pid, &val, BPF_NOEXIST);

	return 0;
}

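/*
 * On kernels where the sched_switch tracepoint passes the prev task's state
 * as an extra argument, has_prev_state is set and ctx[3] is used; otherwise
 * the state is read from the task_struct itself.
 */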
SEC("tp_btf/sched_switch")
int on_switch(u64 *ctx)
{
	struct task_struct *prev, *next;
	int prev_state;

	if (!enabled)
		return 0;

	prev = (struct task_struct *)ctx[1];
	next = (struct task_struct *)ctx[2];

	if (has_prev_state)
		prev_state = (int)ctx[3];
	else
		prev_state = get_task_state(prev);

	return off_cpu_stat(ctx, prev, next, prev_state);
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";