1 // SPDX-License-Identifier: GPL-2.0
2 // Copyright (c) 2019 Facebook
3
4 #include <stdint.h>
5 #include <stddef.h>
6 #include <stdbool.h>
7 #include <linux/bpf.h>
8 #include <linux/ptrace.h>
9 #include <linux/sched.h>
10 #include <linux/types.h>
11 #include "bpf_helpers.h"
12
/* Local stand-in for the process ID type used as key/field type below. */
typedef uint32_t pid_t;

/*
 * Opaque placeholder: this file only passes pointers to it around and never
 * looks at any member.
 */
struct task_struct {};
15
/* Size of the comm buffer filled by bpf_get_current_comm() in on_event(). */
#define TASK_COMM_LEN 16
/* Number of 64-bit slots in each value of the stack trace maps. */
#define PERF_MAX_STACK_DEPTH 127

/* Metavariable type tags. */
#define STROBE_TYPE_INVALID 0
#define STROBE_TYPE_INT 1
#define STROBE_TYPE_STR 2
#define STROBE_TYPE_MAP 3

/* Timestamp bit (of the ktime in ns) that picks stacks_0 or stacks_1. */
#define STACK_TABLE_EPOCH_SHIFT 20
/* Size limit passed to bpf_probe_read_str() for every string read. */
#define STROBE_MAX_STR_LEN 1
/* Number of entries in the strobemeta_cfgs map. */
#define STROBE_MAX_CFGS 32
/*
 * Worst-case size of strobemeta_payload.payload: one string per str
 * variable, plus, for every map, one tag string and a key and a value string
 * per entry.
 *
 * STROBE_MAX_STRS, STROBE_MAX_MAPS and STROBE_MAX_MAP_ENTRIES (and
 * STROBE_MAX_INTS, used further down) are not defined in this file; they
 * must be provided before this point.
 */
#define STROBE_MAX_PAYLOAD						\
	(STROBE_MAX_STRS * STROBE_MAX_STR_LEN +				\
	 STROBE_MAX_MAPS * (1 + STROBE_MAX_MAP_ENTRIES * 2) * STROBE_MAX_STR_LEN)
30
/* Common 8-byte prefix shared by all strobe_value_* layouts. */
struct strobe_value_header {
	/*
	 * Interpretation depends on the value type:
	 *   int: 0 when the value is not set, 1 otherwise;
	 *   str: always 1; presence is decided by the pointer instead;
	 *   map: always 1; the pointer leads to a separate struct holding
	 *        the entry count (at most STROBE_MAX_MAP_ENTRIES).
	 */
	uint16_t len;
	/*
	 * Room for future fields/flags. It also pads the header to exactly
	 * 8 bytes, so that header plus value can be fetched by BPF with a
	 * single 16-byte read.
	 */
	uint8_t _reserved[6];
};
47
48 /*
49 * strobe_value_generic is used from BPF probe only, but needs to be a union
50 * of strobe_value_int/strobe_value_str/strobe_value_map
51 */
52 struct strobe_value_generic {
53 struct strobe_value_header header;
54 union {
55 int64_t val;
56 void *ptr;
57 };
58 };
59
60 struct strobe_value_int {
61 struct strobe_value_header header;
62 int64_t value;
63 };
64
65 struct strobe_value_str {
66 struct strobe_value_header header;
67 const char* value;
68 };
69
70 struct strobe_value_map {
71 struct strobe_value_header header;
72 const struct strobe_map_raw* value;
73 };
74
/* One key/value pair of a strobe map; both sides are C-string pointers. */
struct strobe_map_entry {
	const char *key;
	const char *val;
};
79
80 /*
81 * Map of C-string key/value pairs with fixed maximum capacity. Each map has
82 * corresponding int64 ID, which application can use (or ignore) in whatever
83 * way appropriate. Map is "write-only", there is no way to get data out of
84 * map. Map is intended to be used to provide metadata for profilers and is
85 * not to be used for internal in-app communication. All methods are
86 * thread-safe.
87 */
88 struct strobe_map_raw {
89 /*
90 * general purpose unique ID that's up to application to decide
91 * whether and how to use; for request metadata use case id is unique
92 * request ID that's used to match metadata with stack traces on
93 * Strobelight backend side
94 */
95 int64_t id;
96 /* number of used entries in map */
97 int64_t cnt;
98 /*
99 * having volatile doesn't change anything on BPF side, but clang
100 * emits warnings for passing `volatile const char *` into
101 * bpf_probe_read_str that expects just `const char *`
102 */
103 const char* tag;
104 /*
105 * key/value entries, each consisting of 2 pointers to key and value
106 * C strings
107 */
108 struct strobe_map_entry entries[STROBE_MAX_MAP_ENTRIES];
109 };
110
/*
 * Supported values of strobe_value_loc.tls_mode. calc_location() relies on
 * the numeric ordering: everything <= TLS_LOCAL_EXEC takes the simple
 * offset-from-base path, and TLS_NOT_SET must be exactly -1.
 */
#define TLS_NOT_SET -1
#define TLS_LOCAL_EXEC 0
#define TLS_IMM_EXEC 1
#define TLS_GENERAL_DYN 2
116
/*
 * Universal description of where a TLS variable lives, covering both static
 * executables and shared libraries.
 */
struct strobe_value_loc {
	/*
	 * TLS mode used for this metavariable:
	 *   -1 (TLS_NOT_SET)     - no metavariable in this slot;
	 *    0 (TLS_LOCAL_EXEC)  - Local Executable mode;
	 *    1 (TLS_IMM_EXEC)    - Immediate Executable mode;
	 *    2 (TLS_GENERAL_DYN) - General Dynamic mode.
	 * Local Dynamic mode is not supported yet because it has never been
	 * seen in practice. The mode decides how `offset` is interpreted;
	 * see calc_location() below for details.
	 */
	int64_t tls_mode;
	/*
	 * TLS_LOCAL_EXEC:  offset from the thread pointer (fs:0 on x86-64,
	 *                  tpidr_el0 on aarch64);
	 * TLS_IMM_EXEC:    absolute address of the GOT entry that contains
	 *                  the offset from the thread pointer;
	 * TLS_GENERAL_DYN: absolute address of the double GOT entry that
	 *                  contains a tls_index_t struct.
	 */
	int64_t offset;
};
143
144 struct strobemeta_cfg {
145 int64_t req_meta_idx;
146 struct strobe_value_loc int_locs[STROBE_MAX_INTS];
147 struct strobe_value_loc str_locs[STROBE_MAX_STRS];
148 struct strobe_value_loc map_locs[STROBE_MAX_MAPS];
149 };
150
151 struct strobe_map_descr {
152 uint64_t id;
153 int16_t tag_len;
154 /*
155 * cnt <0 - map value isn't set;
156 * 0 - map has id set, but no key/value entries
157 */
158 int16_t cnt;
159 /*
160 * both key_lens[i] and val_lens[i] should be >0 for present key/value
161 * entry
162 */
163 uint16_t key_lens[STROBE_MAX_MAP_ENTRIES];
164 uint16_t val_lens[STROBE_MAX_MAP_ENTRIES];
165 };
166
167 struct strobemeta_payload {
168 /* req_id has valid request ID, if req_meta_valid == 1 */
169 int64_t req_id;
170 uint8_t req_meta_valid;
171 /*
172 * mask has Nth bit set to 1, if Nth metavar was present and
173 * successfully read
174 */
175 uint64_t int_vals_set_mask;
176 int64_t int_vals[STROBE_MAX_INTS];
177 /* len is >0 for present values */
178 uint16_t str_lens[STROBE_MAX_STRS];
179 /* if map_descrs[i].cnt == -1, metavar is not present/set */
180 struct strobe_map_descr map_descrs[STROBE_MAX_MAPS];
181 /*
182 * payload has compactly packed values of str and map variables in the
183 * form: strval1\0strval2\0map1key1\0map1val1\0map2key1\0map2val1\0
184 * (and so on); str_lens[i], key_lens[i] and val_lens[i] determines
185 * value length
186 */
187 char payload[STROBE_MAX_PAYLOAD];
188 };
189
190 struct strobelight_bpf_sample {
191 uint64_t ktime;
192 char comm[TASK_COMM_LEN];
193 pid_t pid;
194 int user_stack_id;
195 int kernel_stack_id;
196 int has_meta;
197 struct strobemeta_payload metadata;
198 /*
199 * makes it possible to pass (<real payload size> + 1) as data size to
200 * perf_submit() to avoid perf_submit's paranoia about passing zero as
201 * size, as it deduces that <real payload size> might be
202 * **theoretically** zero
203 */
204 char dummy_safeguard;
205 };
206
207 struct {
208 __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
209 __uint(max_entries, 32);
210 __uint(key_size, sizeof(int));
211 __uint(value_size, sizeof(int));
212 } samples SEC(".maps");
213
214 struct {
215 __uint(type, BPF_MAP_TYPE_STACK_TRACE);
216 __uint(max_entries, 16);
217 __uint(key_size, sizeof(uint32_t));
218 __uint(value_size, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH);
219 } stacks_0 SEC(".maps");
220
221 struct {
222 __uint(type, BPF_MAP_TYPE_STACK_TRACE);
223 __uint(max_entries, 16);
224 __uint(key_size, sizeof(uint32_t));
225 __uint(value_size, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH);
226 } stacks_1 SEC(".maps");
227
228 struct {
229 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
230 __uint(max_entries, 1);
231 __type(key, uint32_t);
232 __type(value, struct strobelight_bpf_sample);
233 } sample_heap SEC(".maps");
234
235 struct {
236 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
237 __uint(max_entries, STROBE_MAX_CFGS);
238 __type(key, pid_t);
239 __type(value, struct strobemeta_cfg);
240 } strobemeta_cfgs SEC(".maps");
241
/*
 * Element type of the dtv array; calc_location() reads the leading pointer
 * of one element.
 * https://github.com/lattera/glibc/blob/master/nptl/sysdeps/x86_64/tls.h#L34
 */
typedef union dtv {
	size_t counter;
	struct {
		void *val;
		bool is_static;
	} pointer;
} dtv_t;
251
252 /* Partial definition for tcbhead_t */
253 /* https://github.com/bminor/glibc/blob/master/sysdeps/x86_64/nptl/tls.h#L42 */
254 struct tcbhead {
255 void* tcb;
256 dtv_t* dtv;
257 };
258
/*
 * TLS module/offset pair for the shared-library case.
 * On x86-64 it is laid over two GOT entries.
 * On aarch64 the second GOT entry points to it.
 */
struct tls_index {
	uint64_t module;	/* index into the dtv array */
	uint64_t offset;	/* offset inside that module's TLS block */
};
268
calc_location(struct strobe_value_loc * loc,void * tls_base)269 static __always_inline void *calc_location(struct strobe_value_loc *loc,
270 void *tls_base)
271 {
272 /*
273 * tls_mode value is:
274 * - -1 (TLS_NOT_SET), if no metavar is present;
275 * - 0 (TLS_LOCAL_EXEC), if metavar uses Local Executable mode of TLS
276 * (offset from fs:0 for x86-64 or tpidr_el0 for aarch64);
277 * - 1 (TLS_IMM_EXEC), if metavar uses Immediate Executable mode of TLS;
278 * - 2 (TLS_GENERAL_DYN), if metavar uses General Dynamic mode of TLS;
279 * This schema allows to use something like:
280 * (tls_mode + 1) * (tls_base + offset)
281 * to get NULL for "no metavar" location, or correct pointer for local
282 * executable mode without doing extra ifs.
283 */
284 if (loc->tls_mode <= TLS_LOCAL_EXEC) {
285 /* static executable is simple, we just have offset from
286 * tls_base */
287 void *addr = tls_base + loc->offset;
288 /* multiply by (tls_mode + 1) to get NULL, if we have no
289 * metavar in this slot */
290 return (void *)((loc->tls_mode + 1) * (int64_t)addr);
291 }
292 /*
293 * Other modes are more complicated, we need to jump through few hoops.
294 *
295 * For immediate executable mode (currently supported only for aarch64):
296 * - loc->offset is pointing to a GOT entry containing fixed offset
297 * relative to tls_base;
298 *
299 * For general dynamic mode:
300 * - loc->offset is pointing to a beginning of double GOT entries;
301 * - (for aarch64 only) second entry points to tls_index_t struct;
302 * - (for x86-64 only) two GOT entries are already tls_index_t;
303 * - tls_index_t->module is used to find start of TLS section in
304 * which variable resides;
305 * - tls_index_t->offset provides offset within that TLS section,
306 * pointing to value of variable.
307 */
308 struct tls_index tls_index;
309 dtv_t *dtv;
310 void *tls_ptr;
311
312 bpf_probe_read(&tls_index, sizeof(struct tls_index),
313 (void *)loc->offset);
314 /* valid module index is always positive */
315 if (tls_index.module > 0) {
316 /* dtv = ((struct tcbhead *)tls_base)->dtv[tls_index.module] */
317 bpf_probe_read(&dtv, sizeof(dtv),
318 &((struct tcbhead *)tls_base)->dtv);
319 dtv += tls_index.module;
320 } else {
321 dtv = NULL;
322 }
323 bpf_probe_read(&tls_ptr, sizeof(void *), dtv);
324 /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */
325 return tls_ptr && tls_ptr != (void *)-1
326 ? tls_ptr + tls_index.offset
327 : NULL;
328 }
329
read_int_var(struct strobemeta_cfg * cfg,size_t idx,void * tls_base,struct strobe_value_generic * value,struct strobemeta_payload * data)330 static __always_inline void read_int_var(struct strobemeta_cfg *cfg,
331 size_t idx, void *tls_base,
332 struct strobe_value_generic *value,
333 struct strobemeta_payload *data)
334 {
335 void *location = calc_location(&cfg->int_locs[idx], tls_base);
336 if (!location)
337 return;
338
339 bpf_probe_read(value, sizeof(struct strobe_value_generic), location);
340 data->int_vals[idx] = value->val;
341 if (value->header.len)
342 data->int_vals_set_mask |= (1 << idx);
343 }
344
read_str_var(struct strobemeta_cfg * cfg,size_t idx,void * tls_base,struct strobe_value_generic * value,struct strobemeta_payload * data,void * payload)345 static __always_inline uint64_t read_str_var(struct strobemeta_cfg *cfg,
346 size_t idx, void *tls_base,
347 struct strobe_value_generic *value,
348 struct strobemeta_payload *data,
349 void *payload)
350 {
351 void *location;
352 uint32_t len;
353
354 data->str_lens[idx] = 0;
355 location = calc_location(&cfg->str_locs[idx], tls_base);
356 if (!location)
357 return 0;
358
359 bpf_probe_read(value, sizeof(struct strobe_value_generic), location);
360 len = bpf_probe_read_str(payload, STROBE_MAX_STR_LEN, value->ptr);
361 /*
362 * if bpf_probe_read_str returns error (<0), due to casting to
363 * unsinged int, it will become big number, so next check is
364 * sufficient to check for errors AND prove to BPF verifier, that
365 * bpf_probe_read_str won't return anything bigger than
366 * STROBE_MAX_STR_LEN
367 */
368 if (len > STROBE_MAX_STR_LEN)
369 return 0;
370
371 data->str_lens[idx] = len;
372 return len;
373 }
374
read_map_var(struct strobemeta_cfg * cfg,size_t idx,void * tls_base,struct strobe_value_generic * value,struct strobemeta_payload * data,void * payload)375 static __always_inline void *read_map_var(struct strobemeta_cfg *cfg,
376 size_t idx, void *tls_base,
377 struct strobe_value_generic *value,
378 struct strobemeta_payload *data,
379 void *payload)
380 {
381 struct strobe_map_descr* descr = &data->map_descrs[idx];
382 struct strobe_map_raw map;
383 void *location;
384 uint32_t len;
385 int i;
386
387 descr->tag_len = 0; /* presume no tag is set */
388 descr->cnt = -1; /* presume no value is set */
389
390 location = calc_location(&cfg->map_locs[idx], tls_base);
391 if (!location)
392 return payload;
393
394 bpf_probe_read(value, sizeof(struct strobe_value_generic), location);
395 if (bpf_probe_read(&map, sizeof(struct strobe_map_raw), value->ptr))
396 return payload;
397
398 descr->id = map.id;
399 descr->cnt = map.cnt;
400 if (cfg->req_meta_idx == idx) {
401 data->req_id = map.id;
402 data->req_meta_valid = 1;
403 }
404
405 len = bpf_probe_read_str(payload, STROBE_MAX_STR_LEN, map.tag);
406 if (len <= STROBE_MAX_STR_LEN) {
407 descr->tag_len = len;
408 payload += len;
409 }
410
411 #ifdef NO_UNROLL
412 #pragma clang loop unroll(disable)
413 #else
414 #pragma unroll
415 #endif
416 for (int i = 0; i < STROBE_MAX_MAP_ENTRIES; ++i) {
417 if (i >= map.cnt)
418 break;
419
420 descr->key_lens[i] = 0;
421 len = bpf_probe_read_str(payload, STROBE_MAX_STR_LEN,
422 map.entries[i].key);
423 if (len <= STROBE_MAX_STR_LEN) {
424 descr->key_lens[i] = len;
425 payload += len;
426 }
427 descr->val_lens[i] = 0;
428 len = bpf_probe_read_str(payload, STROBE_MAX_STR_LEN,
429 map.entries[i].val);
430 if (len <= STROBE_MAX_STR_LEN) {
431 descr->val_lens[i] = len;
432 payload += len;
433 }
434 }
435
436 return payload;
437 }
438
439 /*
440 * read_strobe_meta returns NULL, if no metadata was read; otherwise returns
441 * pointer to *right after* payload ends
442 */
read_strobe_meta(struct task_struct * task,struct strobemeta_payload * data)443 static __always_inline void *read_strobe_meta(struct task_struct *task,
444 struct strobemeta_payload *data)
445 {
446 pid_t pid = bpf_get_current_pid_tgid() >> 32;
447 struct strobe_value_generic value = {0};
448 struct strobemeta_cfg *cfg;
449 void *tls_base, *payload;
450
451 cfg = bpf_map_lookup_elem(&strobemeta_cfgs, &pid);
452 if (!cfg)
453 return NULL;
454
455 data->int_vals_set_mask = 0;
456 data->req_meta_valid = 0;
457 payload = data->payload;
458 /*
459 * we don't have struct task_struct definition, it should be:
460 * tls_base = (void *)task->thread.fsbase;
461 */
462 tls_base = (void *)task;
463
464 #ifdef NO_UNROLL
465 #pragma clang loop unroll(disable)
466 #else
467 #pragma unroll
468 #endif
469 for (int i = 0; i < STROBE_MAX_INTS; ++i) {
470 read_int_var(cfg, i, tls_base, &value, data);
471 }
472 #ifdef NO_UNROLL
473 #pragma clang loop unroll(disable)
474 #else
475 #pragma unroll
476 #endif
477 for (int i = 0; i < STROBE_MAX_STRS; ++i) {
478 payload += read_str_var(cfg, i, tls_base, &value, data, payload);
479 }
480 #ifdef NO_UNROLL
481 #pragma clang loop unroll(disable)
482 #else
483 #pragma unroll
484 #endif
485 for (int i = 0; i < STROBE_MAX_MAPS; ++i) {
486 payload = read_map_var(cfg, i, tls_base, &value, data, payload);
487 }
488 /*
489 * return pointer right after end of payload, so it's possible to
490 * calculate exact amount of useful data that needs to be sent
491 */
492 return payload;
493 }
494
495 SEC("raw_tracepoint/kfree_skb")
on_event(struct pt_regs * ctx)496 int on_event(struct pt_regs *ctx) {
497 pid_t pid = bpf_get_current_pid_tgid() >> 32;
498 struct strobelight_bpf_sample* sample;
499 struct task_struct *task;
500 uint32_t zero = 0;
501 uint64_t ktime_ns;
502 void *sample_end;
503
504 sample = bpf_map_lookup_elem(&sample_heap, &zero);
505 if (!sample)
506 return 0; /* this will never happen */
507
508 sample->pid = pid;
509 bpf_get_current_comm(&sample->comm, TASK_COMM_LEN);
510 ktime_ns = bpf_ktime_get_ns();
511 sample->ktime = ktime_ns;
512
513 task = (struct task_struct *)bpf_get_current_task();
514 sample_end = read_strobe_meta(task, &sample->metadata);
515 sample->has_meta = sample_end != NULL;
516 sample_end = sample_end ? : &sample->metadata;
517
518 if ((ktime_ns >> STACK_TABLE_EPOCH_SHIFT) & 1) {
519 sample->kernel_stack_id = bpf_get_stackid(ctx, &stacks_1, 0);
520 sample->user_stack_id = bpf_get_stackid(ctx, &stacks_1, BPF_F_USER_STACK);
521 } else {
522 sample->kernel_stack_id = bpf_get_stackid(ctx, &stacks_0, 0);
523 sample->user_stack_id = bpf_get_stackid(ctx, &stacks_0, BPF_F_USER_STACK);
524 }
525
526 uint64_t sample_size = sample_end - (void *)sample;
527 /* should always be true */
528 if (sample_size < sizeof(struct strobelight_bpf_sample))
529 bpf_perf_event_output(ctx, &samples, 0, sample, 1 + sample_size);
530 return 0;
531 }
532
533 char _license[] SEC("license") = "GPL";
534