/*
 * builtin-trace.c
 *
 * Builtin 'trace' command:
 *
 * Display a continuously updated trace of any workload, CPU, specific PID,
 * system wide, etc. Default format is loosely strace-like, but any other
 * event may be specified using --event.
 *
 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
 *
 * Initially based on the 'trace' prototype by Thomas Gleixner:
 *
 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
 *
 * Released under the GPL v2. (and only v2, not any later version)
 */

#include <traceevent/event-parse.h>
#include <api/fs/tracing_path.h>
#include "builtin.h"
#include "util/cgroup.h"
#include "util/color.h"
#include "util/debug.h"
#include "util/env.h"
#include "util/event.h"
#include "util/evlist.h"
#include <subcmd/exec-cmd.h>
#include "util/machine.h"
#include "util/path.h"
#include "util/session.h"
#include "util/thread.h"
#include <subcmd/parse-options.h>
#include "util/strlist.h"
#include "util/intlist.h"
#include "util/thread_map.h"
#include "util/stat.h"
#include "trace/beauty/beauty.h"
#include "trace-event.h"
#include "util/parse-events.h"
#include "util/bpf-loader.h"
#include "callchain.h"
#include "print_binary.h"
#include "string2.h"
#include "syscalltbl.h"
#include "rb_resort.h"

#include <errno.h>
#include <inttypes.h>
#include <poll.h>
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <linux/err.h>
#include <linux/filter.h>
#include <linux/kernel.h>
#include <linux/random.h>
#include <linux/stringify.h>
#include <linux/time64.h>
#include <fcntl.h>

#include "sane_ctype.h"

#ifndef O_CLOEXEC
# define O_CLOEXEC 02000000
#endif

#ifndef F_LINUX_SPECIFIC_BASE
# define F_LINUX_SPECIFIC_BASE 1024
#endif

struct trace {
	struct perf_tool tool;
	struct syscalltbl *sctbl;
	struct {
		int max;
		struct syscall *table;
		struct {
			struct perf_evsel *sys_enter,
					  *sys_exit,
					  *augmented;
		} events;
	} syscalls;
	struct record_opts opts;
	struct perf_evlist *evlist;
	struct machine *host;
	struct thread *current;
	struct cgroup *cgroup;
	u64 base_time;
	FILE *output;
	unsigned long nr_events;
	struct strlist *ev_qualifier;
	struct {
		size_t nr;
		int *entries;
	} ev_qualifier_ids;
	struct {
		size_t nr;
		pid_t *entries;
	} filter_pids;
	double duration_filter;
	double runtime_ms;
	struct {
		u64 vfs_getname,
		    proc_getname;
	} stats;
	unsigned int max_stack;
	unsigned int min_stack;
	bool not_ev_qualifier;
	bool live;
	bool full_time;
	bool sched;
	bool multiple_threads;
	bool summary;
	bool summary_only;
	bool failure_only;
	bool show_comm;
	bool print_sample;
	bool show_tool_stats;
	bool trace_syscalls;
	bool kernel_syscallchains;
	bool force;
	bool vfs_getname;
	int trace_pgfaults;
};

struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};

#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value; \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);

#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
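
/*
 * For illustration, TP_UINT_FIELD(32) expands to:
 *
 *   static u64 tp_field__u32(struct tp_field *field, struct perf_sample *sample)
 *   {
 *           u32 value;
 *           memcpy(&value, sample->raw_data + field->offset, sizeof(value));
 *           return value;
 *   }
 *
 * memcpy() is used instead of a cast + dereference because fields in
 * sample->raw_data are not guaranteed to be naturally aligned; the
 * __SWAPPED variants additionally byte-swap the value when reading
 * perf.data files recorded on a machine with the opposite endianness.
 */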

static int __tp_field__init_uint(struct tp_field *field, int size, int offset, bool needs_swap)
{
	field->offset = offset;

	switch (size) {
	case 1:
		field->integer = tp_field__u8;
		break;
	case 2:
		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
		break;
	case 4:
		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
		break;
	case 8:
		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
		break;
	default:
		return -1;
	}

	return 0;
}

static int tp_field__init_uint(struct tp_field *field, struct format_field *format_field, bool needs_swap)
{
	return __tp_field__init_uint(field, format_field->size, format_field->offset, needs_swap);
}

static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
	return sample->raw_data + field->offset;
}

static int __tp_field__init_ptr(struct tp_field *field, int offset)
{
	field->offset = offset;
	field->pointer = tp_field__ptr;
	return 0;
}

static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
{
	return __tp_field__init_ptr(field, format_field->offset);
}

struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
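
/*
 * 'args' and 'ret' can share storage because a given evsel handles only
 * one direction: raw_syscalls:sys_enter payloads carry the syscall id
 * followed by the argument array, while raw_syscalls:sys_exit payloads
 * carry the id followed by the return value.
 */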

static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
					  struct tp_field *field,
					  const char *name)
{
	struct format_field *format_field = perf_evsel__field(evsel, name);

	if (format_field == NULL)
		return -1;

	return tp_field__init_uint(field, format_field, evsel->needs_swap);
}

#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
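
/*
 * E.g. perf_evsel__init_sc_tp_uint_field(evsel, id) expands to
 * perf_evsel__init_tp_uint_field(evsel, &sc->id, "id"): the same token
 * names both the struct syscall_tp member and the tracepoint format
 * field it is resolved from.
 */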

static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *format_field = perf_evsel__field(evsel, name);

	if (format_field == NULL)
		return -1;

	return tp_field__init_ptr(field, format_field);
}

#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })

static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}

static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel)
{
	struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));

	if (evsel->priv != NULL) {
		if (perf_evsel__init_tp_uint_field(evsel, &sc->id, "__syscall_nr"))
			goto out_delete;
		return 0;
	}

	return -ENOMEM;
out_delete:
	zfree(&evsel->priv);
	return -ENOENT;
}

static int perf_evsel__init_augmented_syscall_tp(struct perf_evsel *evsel)
{
	struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));

	if (evsel->priv != NULL) { /* field, sizeof_field, offsetof_field */
		if (__tp_field__init_uint(&sc->id, sizeof(long), sizeof(long long), evsel->needs_swap))
			goto out_delete;

		return 0;
	}

	return -ENOMEM;
out_delete:
	zfree(&evsel->priv);
	return -EINVAL;
}

static int perf_evsel__init_augmented_syscall_tp_args(struct perf_evsel *evsel)
{
	struct syscall_tp *sc = evsel->priv;

	return __tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64));
}

static int perf_evsel__init_raw_syscall_tp(struct perf_evsel *evsel, void *handler)
{
	evsel->priv = malloc(sizeof(struct syscall_tp));
	if (evsel->priv != NULL) {
		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
			goto out_delete;

		evsel->handler = handler;
		return 0;
	}

	return -ENOMEM;

out_delete:
	zfree(&evsel->priv);
	return -ENOENT;
}

static struct perf_evsel *perf_evsel__raw_syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	/* older kernels (e.g., RHEL6) use syscalls:{enter,exit} */
	if (IS_ERR(evsel))
		evsel = perf_evsel__newtp("syscalls", direction);

	if (IS_ERR(evsel))
		return NULL;

	if (perf_evsel__init_raw_syscall_tp(evsel, handler))
		goto out_delete;

	return evsel;

out_delete:
	perf_evsel__delete_priv(evsel);
	return NULL;
}

#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
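
/*
 * These accessors are how the sample handlers below read tracepoint
 * payloads: e.g. perf_evsel__sc_tp_uint(evsel, id, sample) in
 * trace__sys_enter() fetches the syscall number, and
 * perf_evsel__sc_tp_ptr(evsel, args, sample) yields a pointer to the
 * raw argument array, using the offsets resolved at evsel init time.
 */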

size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
{
	int idx = val - sa->offset;

	if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL)
		return scnprintf(bf, size, intfmt, val);

	return scnprintf(bf, size, "%s", sa->entries[idx]);
}
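
/*
 * For example, with the itimers strarray defined below ("REAL",
 * "VIRTUAL", "PROF", offset 0), val == 1 prints "VIRTUAL", while an
 * out-of-range or unnamed value falls back to intfmt, typically "%d".
 */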

static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
						const char *intfmt,
						struct syscall_arg *arg)
{
	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
}

static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray

struct strarrays {
	int nr_entries;
	struct strarray **entries;
};

#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}

size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
					struct syscall_arg *arg)
{
	struct strarrays *sas = arg->parm;
	int i;

	for (i = 0; i < sas->nr_entries; ++i) {
		struct strarray *sa = sas->entries[i];
		int idx = arg->val - sa->offset;

		if (idx >= 0 && idx < sa->nr_entries) {
			if (sa->entries[idx] == NULL)
				break;
			return scnprintf(bf, size, "%s", sa->entries[idx]);
		}
	}

	return scnprintf(bf, size, "%d", arg->val);
}

#ifndef AT_FDCWD
#define AT_FDCWD -100
#endif

static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
					   struct syscall_arg *arg)
{
	int fd = arg->val;

	if (fd == AT_FDCWD)
		return scnprintf(bf, size, "CWD");

	return syscall_arg__scnprintf_fd(bf, size, arg);
}

#define SCA_FDAT syscall_arg__scnprintf_fd_at

static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd

size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%d", arg->val);
}

size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%ld", arg->val);
}

static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
	"GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

static const char *fcntl_linux_specific_cmds[] = {
	"SETLEASE", "GETLEASE", "NOTIFY", [5] = "CANCELLK", "DUPFD_CLOEXEC",
	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
};

static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);

static struct strarray *fcntl_cmds_arrays[] = {
	&strarray__fcntl_cmds,
	&strarray__fcntl_linux_specific_cmds,
};

static DEFINE_STRARRAYS(fcntl_cmds_arrays);
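
/*
 * fcntl commands thus resolve against two ranges: the plain commands
 * starting at 0 in fcntl_cmds[] and the Linux-specific ones starting at
 * F_LINUX_SPECIFIC_BASE (1024) in fcntl_linux_specific_cmds[];
 * syscall_arg__scnprintf_strarrays() walks both and falls back to the
 * numeric value for anything in neither range.
 */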

static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);

static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	size_t printed = 0;
	int mode = arg->val;

	if (mode == F_OK) /* 0 */
		return scnprintf(bf, size, "F");
#define P_MODE(n) \
	if (mode & n##_OK) { \
		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
		mode &= ~n##_OK; \
	}

	P_MODE(R);
	P_MODE(W);
	P_MODE(X);
#undef P_MODE

	if (mode)
		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);

	return printed;
}
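
/*
 * E.g. an access() mode of R_OK|W_OK prints as "RW"; any bits left after
 * the known ones are consumed are appended in hex, so unexpected flags
 * remain visible.
 */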

#define SCA_ACCMODE syscall_arg__scnprintf_access_mode

static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename

static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
						struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define P_FLAG(n) \
	if (flags & O_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~O_##n; \
	}

	P_FLAG(CLOEXEC);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags

#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK 0x0001
#endif
#ifndef GRND_RANDOM
#define GRND_RANDOM 0x0002
#endif

static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
						     struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define P_FLAG(n) \
	if (flags & GRND_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~GRND_##n; \
	}

	P_FLAG(RANDOM);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags

#define STRARRAY(name, array) \
	{ .scnprintf = SCA_STRARRAY, \
	  .parm = &strarray__##array, }
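
/*
 * STRARRAY is a struct syscall_arg_fmt initializer, used in the
 * syscall_fmts[] table below as, e.g.:
 *
 *   { .name = "bpf", .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
 *
 * which makes the first bpf() argument print as "MAP_CREATE",
 * "PROG_LOAD", etc. instead of a raw integer.
 */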

#include "trace/beauty/arch_errno_names.c"
#include "trace/beauty/eventfd.c"
#include "trace/beauty/futex_op.c"
#include "trace/beauty/futex_val3.c"
#include "trace/beauty/mmap.c"
#include "trace/beauty/mode_t.c"
#include "trace/beauty/msg_flags.c"
#include "trace/beauty/open_flags.c"
#include "trace/beauty/perf_event_open.c"
#include "trace/beauty/pid.c"
#include "trace/beauty/sched_policy.c"
#include "trace/beauty/seccomp.c"
#include "trace/beauty/signum.c"
#include "trace/beauty/socket_type.c"
#include "trace/beauty/waitid_options.c"

struct syscall_arg_fmt {
	size_t (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void *parm;
	const char *name;
	bool show_zero;
};

static struct syscall_fmt {
	const char *name;
	const char *alias;
	struct syscall_arg_fmt arg[6];
	u8 nr_args;
	bool errpid;
	bool timeout;
	bool hexret;
} syscall_fmts[] = {
	{ .name = "access",
	  .arg = { [1] = { .scnprintf = SCA_ACCMODE, /* mode */ }, }, },
	{ .name = "bpf",
	  .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
	{ .name = "brk", .hexret = true,
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
	{ .name = "clock_gettime",
	  .arg = { [0] = STRARRAY(clk_id, clockid), }, },
	{ .name = "clone", .errpid = true, .nr_args = 5,
	  .arg = { [0] = { .name = "flags", .scnprintf = SCA_CLONE_FLAGS, },
		   [1] = { .name = "child_stack", .scnprintf = SCA_HEX, },
		   [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
		   [3] = { .name = "child_tidptr", .scnprintf = SCA_HEX, },
		   [4] = { .name = "tls", .scnprintf = SCA_HEX, }, }, },
	{ .name = "close",
	  .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
	{ .name = "epoll_ctl",
	  .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
	{ .name = "eventfd2",
	  .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
	{ .name = "fchmodat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name = "fchownat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name = "fcntl",
	  .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
			   .parm = &strarrays__fcntl_cmds_arrays,
			   .show_zero = true, },
		   [2] = { .scnprintf = SCA_FCNTL_ARG, /* arg */ }, }, },
	{ .name = "flock",
	  .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
	{ .name = "fstat", .alias = "newfstat", },
	{ .name = "fstatat", .alias = "newfstatat", },
	{ .name = "futex",
	  .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
		   [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
	{ .name = "futimesat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name = "getitimer",
	  .arg = { [0] = STRARRAY(which, itimers), }, },
	{ .name = "getpid", .errpid = true, },
	{ .name = "getpgid", .errpid = true, },
	{ .name = "getppid", .errpid = true, },
	{ .name = "getrandom",
	  .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
	{ .name = "getrlimit",
	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
	{ .name = "gettid", .errpid = true, },
	{ .name = "ioctl",
	  .arg = {
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
		   [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
#else
		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
#endif
	{ .name = "kcmp", .nr_args = 5,
	  .arg = { [0] = { .name = "pid1", .scnprintf = SCA_PID, },
		   [1] = { .name = "pid2", .scnprintf = SCA_PID, },
		   [2] = { .name = "type", .scnprintf = SCA_KCMP_TYPE, },
		   [3] = { .name = "idx1", .scnprintf = SCA_KCMP_IDX, },
		   [4] = { .name = "idx2", .scnprintf = SCA_KCMP_IDX, }, }, },
	{ .name = "keyctl",
	  .arg = { [0] = STRARRAY(option, keyctl_options), }, },
	{ .name = "kill",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name = "linkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name = "lseek",
	  .arg = { [2] = STRARRAY(whence, whences), }, },
	{ .name = "lstat", .alias = "newlstat", },
	{ .name = "madvise",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
		   [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
	{ .name = "mkdirat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name = "mknodat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name = "mlock",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name = "mlockall",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name = "mmap", .hexret = true,
/* The standard mmap maps to old_mmap on s390x */
#if defined(__s390x__)
	  .alias = "old_mmap",
#endif
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ },
		   [3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */ }, }, },
	{ .name = "mprotect",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ }, }, },
	{ .name = "mq_unlink",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
	{ .name = "mremap", .hexret = true,
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ },
		   [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
		   [4] = { .scnprintf = SCA_HEX, /* new_addr */ }, }, },
	{ .name = "munlock",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name = "munmap",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name = "name_to_handle_at",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name = "newfstatat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name = "open",
	  .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name = "open_by_handle_at",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name = "openat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name = "perf_event_open",
	  .arg = { [2] = { .scnprintf = SCA_INT, /* cpu */ },
		   [3] = { .scnprintf = SCA_FD, /* group_fd */ },
		   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
	{ .name = "pipe2",
	  .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
	{ .name = "pkey_alloc",
	  .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS, /* access_rights */ }, }, },
	{ .name = "pkey_free",
	  .arg = { [0] = { .scnprintf = SCA_INT, /* key */ }, }, },
	{ .name = "pkey_mprotect",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ },
		   [3] = { .scnprintf = SCA_INT, /* pkey */ }, }, },
	{ .name = "poll", .timeout = true, },
	{ .name = "ppoll", .timeout = true, },
	{ .name = "prctl", .alias = "arch_prctl",
	  .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
		   [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
		   [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
	{ .name = "pread", .alias = "pread64", },
	{ .name = "preadv", .alias = "pread", },
	{ .name = "prlimit64",
	  .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
	{ .name = "pwrite", .alias = "pwrite64", },
	{ .name = "readlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name = "recvfrom",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name = "recvmmsg",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name = "recvmsg",
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name = "renameat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name = "rt_sigaction",
	  .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name = "rt_sigprocmask",
	  .arg = { [0] = STRARRAY(how, sighow), }, },
	{ .name = "rt_sigqueueinfo",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name = "rt_tgsigqueueinfo",
	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name = "sched_setscheduler",
	  .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
	{ .name = "seccomp",
	  .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP, /* op */ },
		   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
	{ .name = "select", .timeout = true, },
	{ .name = "sendmmsg",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name = "sendmsg",
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name = "sendto",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name = "set_tid_address", .errpid = true, },
	{ .name = "setitimer",
	  .arg = { [0] = STRARRAY(which, itimers), }, },
	{ .name = "setrlimit",
	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
	{ .name = "socket",
	  .arg = { [0] = STRARRAY(family, socket_families),
		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
		   [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
	{ .name = "socketpair",
	  .arg = { [0] = STRARRAY(family, socket_families),
		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
		   [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
	{ .name = "stat", .alias = "newstat", },
	{ .name = "statx",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fdat */ },
		   [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ },
		   [3] = { .scnprintf = SCA_STATX_MASK, /* mask */ }, }, },
	{ .name = "swapoff",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
	{ .name = "swapon",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
	{ .name = "symlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name = "tgkill",
	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name = "tkill",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name = "uname", .alias = "newuname", },
	{ .name = "unlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name = "utimensat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
	{ .name = "wait4", .errpid = true,
	  .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
	{ .name = "waitid", .errpid = true,
	  .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
};

static int syscall_fmt__cmp(const void *name, const void *fmtp)
{
	const struct syscall_fmt *fmt = fmtp;
	return strcmp(name, fmt->name);
}

static struct syscall_fmt *syscall_fmt__find(const char *name)
{
	const int nmemb = ARRAY_SIZE(syscall_fmts);
	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
}
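
/*
 * Since the lookup is a bsearch() keyed on strcmp(), syscall_fmts[]
 * must be kept sorted alphabetically by ->name or entries become
 * unreachable.
 */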

/*
 * is_exit: is this "exit" or "exit_group"?
 * is_open: is this "open" or "openat"? To associate the fd returned in sys_exit with the pathname in sys_enter.
 */
struct syscall {
	struct event_format *tp_format;
	int nr_args;
	bool is_exit;
	bool is_open;
	struct format_field *args;
	const char *name;
	struct syscall_fmt *fmt;
	struct syscall_arg_fmt *arg_fmt;
};

/*
 * We need this 'calculated' boolean because in some cases we really
 * don't know the duration of a syscall, for instance, when we start a
 * session and some threads are waiting for a syscall to finish, say
 * 'poll', in which case all we can do is to print "( ? )" for the
 * duration and the start timestamp.
 */
static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
{
	double duration = (double)t / NSEC_PER_MSEC;
	size_t printed = fprintf(fp, "(");

	if (!calculated)
		printed += fprintf(fp, "         ");
	else if (duration >= 1.0)
		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
	else if (duration >= 0.01)
		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
	else
		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
	return printed + fprintf(fp, "): ");
}

/**
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 * ret_scnprintf: syscall args may set this to a different syscall return
 *                formatter, for instance, fcntl may return fds, file flags, etc.
 */
struct thread_trace {
	u64 entry_time;
	bool entry_pending;
	unsigned long nr_events;
	unsigned long pfmaj, pfmin;
	char *entry_str;
	double runtime_ms;
	size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	struct {
		unsigned long ptr;
		short int entry_str_pos;
		bool pending_open;
		unsigned int namelen;
		char *name;
	} filename;
	struct {
		int max;
		char **table;
	} paths;

	struct intlist *syscall_stats;
};

static struct thread_trace *thread_trace__new(void)
{
	struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));

	if (ttrace) {
		ttrace->paths.max = -1;
		ttrace->syscall_stats = intlist__new(NULL);
	}

	return ttrace;
}

static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
{
	struct thread_trace *ttrace;

	if (thread == NULL)
		goto fail;

	if (thread__priv(thread) == NULL)
		thread__set_priv(thread, thread_trace__new());

	if (thread__priv(thread) == NULL)
		goto fail;

	ttrace = thread__priv(thread);
	++ttrace->nr_events;

	return ttrace;
fail:
	color_fprintf(fp, PERF_COLOR_RED,
		      "WARNING: not enough memory, dropping samples!\n");
	return NULL;
}


void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
				    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
{
	struct thread_trace *ttrace = thread__priv(arg->thread);

	ttrace->ret_scnprintf = ret_scnprintf;
}

#define TRACE_PFMAJ (1 << 0)
#define TRACE_PFMIN (1 << 1)

static const size_t trace__entry_str_size = 2048;

static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (fd > ttrace->paths.max) {
		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));

		if (npath == NULL)
			return -1;

		if (ttrace->paths.max != -1) {
			memset(npath + ttrace->paths.max + 1, 0,
			       (fd - ttrace->paths.max) * sizeof(char *));
		} else {
			memset(npath, 0, (fd + 1) * sizeof(char *));
		}

		ttrace->paths.table = npath;
		ttrace->paths.max = fd;
	}

	ttrace->paths.table[fd] = strdup(pathname);

	return ttrace->paths.table[fd] != NULL ? 0 : -1;
}

static int thread__read_fd_path(struct thread *thread, int fd)
{
	char linkname[PATH_MAX], pathname[PATH_MAX];
	struct stat st;
	int ret;

	if (thread->pid_ == thread->tid) {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/fd/%d", thread->pid_, fd);
	} else {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
	}

	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
		return -1;

	ret = readlink(linkname, pathname, sizeof(pathname));

	if (ret < 0 || ret > st.st_size)
		return -1;

	pathname[ret] = '\0';
	return trace__set_fd_pathname(thread, fd, pathname);
}
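
/*
 * The fd is resolved through the thread's /proc entry, using the
 * /proc/<pid>/task/<tid>/fd path for non-group-leader threads. Note
 * that readlink() does not NUL-terminate, so the result is terminated
 * by hand before being cached via trace__set_fd_pathname().
 */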

static const char *thread__fd_path(struct thread *thread, int fd,
				   struct trace *trace)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (ttrace == NULL)
		return NULL;

	if (fd < 0)
		return NULL;

	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
		if (!trace->live)
			return NULL;
		++trace->stats.proc_getname;
		if (thread__read_fd_path(thread, fd))
			return NULL;
	}

	return ttrace->paths.table[fd];
}

size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
{
	int fd = arg->val;
	size_t printed = scnprintf(bf, size, "%d", fd);
	const char *path = thread__fd_path(arg->thread, fd, arg->trace);

	if (path)
		printed += scnprintf(bf + printed, size - printed, "<%s>", path);

	return printed;
}

size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
{
	size_t printed = scnprintf(bf, size, "%d", fd);
	struct thread *thread = machine__find_thread(trace->host, pid, pid);

	if (thread) {
		const char *path = thread__fd_path(thread, fd, trace);

		if (path)
			printed += scnprintf(bf + printed, size - printed, "<%s>", path);

		thread__put(thread);
	}

	return printed;
}

static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	int fd = arg->val;
	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
	struct thread_trace *ttrace = thread__priv(arg->thread);

	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
		zfree(&ttrace->paths.table[fd]);

	return printed;
}

static void thread__set_filename_pos(struct thread *thread, const char *bf,
				     unsigned long ptr)
{
	struct thread_trace *ttrace = thread__priv(thread);

	ttrace->filename.ptr = ptr;
	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
}
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	unsigned long ptr = arg->val;

	if (!arg->trace->vfs_getname)
		return scnprintf(bf, size, "%#lx", ptr);

	thread__set_filename_pos(arg->thread, bf, ptr);
	return 0;
}

static bool trace__filter_duration(struct trace *trace, double t)
{
	return t < (trace->duration_filter * NSEC_PER_MSEC);
}

static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
{
	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;

	return fprintf(fp, "%10.3f ", ts);
}
/*
 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
 * using ttrace->entry_time for a thread that receives a sys_exit without
 * first having received a sys_enter ("poll" issued before the tracing
 * session starts, or a sys_enter lost due to ring buffer overflow).
 */
static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
{
	if (tstamp > 0)
		return __trace__fprintf_tstamp(trace, tstamp, fp);

	return fprintf(fp, "         ? ");
}

static bool done = false;
static bool interrupted = false;

static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}

static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
{
	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
	printed += fprintf_duration(duration, duration_calculated, fp);

	if (trace->multiple_threads) {
		if (trace->show_comm)
			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
		printed += fprintf(fp, "%d ", thread->tid);
	}

	return printed;
}

static int trace__process_event(struct trace *trace, struct machine *machine,
				union perf_event *event, struct perf_sample *sample)
{
	int ret = 0;

	switch (event->header.type) {
	case PERF_RECORD_LOST:
		color_fprintf(trace->output, PERF_COLOR_RED,
			      "LOST %" PRIu64 " events!\n", event->lost.lost);
		ret = machine__process_lost_event(machine, event, sample);
		break;
	default:
		ret = machine__process_event(machine, event, sample);
		break;
	}

	return ret;
}

static int trace__tool_process(struct perf_tool *tool,
			       union perf_event *event,
			       struct perf_sample *sample,
			       struct machine *machine)
{
	struct trace *trace = container_of(tool, struct trace, tool);
	return trace__process_event(trace, machine, event, sample);
}

static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
{
	struct machine *machine = vmachine;

	if (machine->kptr_restrict_warned)
		return NULL;

	if (symbol_conf.kptr_restrict) {
		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
			   "Kernel samples will not be resolved.\n");
		machine->kptr_restrict_warned = true;
		return NULL;
	}

	return machine__resolve_kernel_addr(vmachine, addrp, modp);
}

static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
	int err = symbol__init(NULL);

	if (err)
		return err;

	trace->host = machine__new_host();
	if (trace->host == NULL)
		return -ENOMEM;

	err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
	if (err < 0)
		goto out;

	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
					    evlist->threads, trace__tool_process, false,
					    trace->opts.proc_map_timeout, 1);
out:
	if (err)
		symbol__exit();

	return err;
}

static void trace__symbols__exit(struct trace *trace)
{
	machine__exit(trace->host);
	trace->host = NULL;

	symbol__exit();
}

static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
{
	int idx;

	if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
		nr_args = sc->fmt->nr_args;

	sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
	if (sc->arg_fmt == NULL)
		return -1;

	for (idx = 0; idx < nr_args; ++idx) {
		if (sc->fmt)
			sc->arg_fmt[idx] = sc->fmt->arg[idx];
	}

	sc->nr_args = nr_args;
	return 0;
}

static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field;
	int idx = 0, len;

	for (field = sc->args; field; field = field->next, ++idx) {
		if (sc->fmt && sc->fmt->arg[idx].scnprintf)
			continue;

		if (strcmp(field->type, "const char *") == 0 &&
		    (strcmp(field->name, "filename") == 0 ||
		     strcmp(field->name, "path") == 0 ||
		     strcmp(field->name, "pathname") == 0))
			sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 *  7 unsigned long
			 */
			sc->arg_fmt[idx].scnprintf = SCA_FD;
		}
	}

	return 0;
}

static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		if (trace->syscalls.max != -1) {
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
		return -1;

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	/*
	 * We need to check and discard the first field, '__syscall_nr' or
	 * 'nr', which holds the syscall number: it is redundant here. Newer
	 * kernels call it '__syscall_nr', older ones just 'nr'.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
	sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat");

	return syscall__set_arg_fmts(sc);
}

static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	size_t nr_allocated;
	struct str_node *pos;

	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		      trace->output);
		err = -EINVAL;
		goto out;
	}

	nr_allocated = trace->ev_qualifier_ids.nr;
	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;

		if (id < 0) {
			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
			if (id >= 0)
				goto matches;

			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}
matches:
		trace->ev_qualifier_ids.entries[i++] = id;
		if (match_next == -1)
			continue;

		while (1) {
			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
			if (id < 0)
				break;
			if (nr_allocated == trace->ev_qualifier_ids.nr) {
				void *entries;

				nr_allocated += 8;
				entries = realloc(trace->ev_qualifier_ids.entries,
						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
				if (entries == NULL) {
					err = -ENOMEM;
					fputs("\nError:\t Not enough memory for parsing\n", trace->output);
					goto out_free;
				}
				trace->ev_qualifier_ids.entries = entries;
			}
			trace->ev_qualifier_ids.nr++;
			trace->ev_qualifier_ids.entries[i++] = id;
		}
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
out_free:
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}

/*
 * args is to be interpreted as a series of longs but we need to handle
 * 8-byte unaligned accesses. args points to raw_data within the event
 * and raw_data is guaranteed to be 8-byte unaligned because it is
 * preceded by raw_size which is a u32. So we need to copy args to a temp
 * variable to read it. Most notably this avoids extended load instructions
 * on unaligned addresses.
 */
unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
{
	unsigned long val;
	unsigned char *p = arg->args + sizeof(unsigned long) * idx;

	memcpy(&val, p, sizeof(val));
	return val;
}

static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
				      struct syscall_arg *arg)
{
	if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
		return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);

	return scnprintf(bf, size, "arg%d: ", arg->idx);
}

static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
				     struct syscall_arg *arg, unsigned long val)
{
	if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
		arg->val = val;
		if (sc->arg_fmt[arg->idx].parm)
			arg->parm = sc->arg_fmt[arg->idx].parm;
		return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
	}
	return scnprintf(bf, size, "%ld", val);
}

static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned long val;
	u8 bit = 1;
	struct syscall_arg arg = {
		.args = args,
		.idx = 0,
		.mask = 0,
		.trace = trace,
		.thread = thread,
	};
	struct thread_trace *ttrace = thread__priv(thread);

	/*
	 * Things like fcntl will set this in its 'cmd' formatter to pick the
	 * right formatter for the return value (an fd? file flags?), which is
	 * not needed for syscalls that always return a given type, say an fd.
	 */
	ttrace->ret_scnprintf = NULL;

	if (sc->args != NULL) {
		struct format_field *field;

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			if (arg.mask & bit)
				continue;

			val = syscall_arg__val(&arg, arg.idx);

			/*
			 * Suppress this argument if its value is zero and
			 * we don't have a string associated in a strarray
			 * for it.
			 */
			if (val == 0 &&
			    !(sc->arg_fmt &&
			      (sc->arg_fmt[arg.idx].show_zero ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
			      sc->arg_fmt[arg.idx].parm))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		while (arg.idx < sc->nr_args) {
			if (arg.mask & bit)
				goto next_arg;
			val = syscall_arg__val(&arg, arg.idx);
			if (printed)
				printed += scnprintf(bf + printed, size - printed, ", ");
			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
next_arg:
			++arg.idx;
			bit <<= 1;
		}
	}

	return printed;
}
1546
1547 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1548 union perf_event *event,
1549 struct perf_sample *sample);
1550
trace__syscall_info(struct trace * trace,struct perf_evsel * evsel,int id)1551 static struct syscall *trace__syscall_info(struct trace *trace,
1552 struct perf_evsel *evsel, int id)
1553 {
1554
1555 if (id < 0) {
1556
1557 /*
1558 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1559 * before that, leaving at a higher verbosity level till that is
1560 * explained. Reproduced with plain ftrace with:
1561 *
1562 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1563 * grep "NR -1 " /t/trace_pipe
1564 *
1565 * After generating some load on the machine.
1566 */
1567 if (verbose > 1) {
1568 static u64 n;
1569 fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1570 id, perf_evsel__name(evsel), ++n);
1571 }
1572 return NULL;
1573 }
1574
1575 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1576 trace__read_syscall_info(trace, id))
1577 goto out_cant_read;
1578
1579 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1580 goto out_cant_read;
1581
1582 return &trace->syscalls.table[id];
1583
1584 out_cant_read:
1585 if (verbose > 0) {
1586 fprintf(trace->output, "Problems reading syscall %d", id);
1587 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1588 fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1589 fputs(" information\n", trace->output);
1590 }
1591 return NULL;
1592 }
1593
thread__update_stats(struct thread_trace * ttrace,int id,struct perf_sample * sample)1594 static void thread__update_stats(struct thread_trace *ttrace,
1595 int id, struct perf_sample *sample)
1596 {
1597 struct int_node *inode;
1598 struct stats *stats;
1599 u64 duration = 0;
1600
1601 inode = intlist__findnew(ttrace->syscall_stats, id);
1602 if (inode == NULL)
1603 return;
1604
1605 stats = inode->priv;
1606 if (stats == NULL) {
1607 stats = malloc(sizeof(struct stats));
1608 if (stats == NULL)
1609 return;
1610 init_stats(stats);
1611 inode->priv = stats;
1612 }
1613
1614 if (ttrace->entry_time && sample->time > ttrace->entry_time)
1615 duration = sample->time - ttrace->entry_time;
1616
1617 update_stats(stats, duration);
1618 }
1619
trace__printf_interrupted_entry(struct trace * trace)1620 static int trace__printf_interrupted_entry(struct trace *trace)
1621 {
1622 struct thread_trace *ttrace;
1623 size_t printed;
1624
1625 if (trace->failure_only || trace->current == NULL)
1626 return 0;
1627
1628 ttrace = thread__priv(trace->current);
1629
1630 if (!ttrace->entry_pending)
1631 return 0;
1632
1633 printed = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
1634 printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1635 ttrace->entry_pending = false;
1636
1637 return printed;
1638 }
1639
trace__fprintf_sample(struct trace * trace,struct perf_evsel * evsel,struct perf_sample * sample,struct thread * thread)1640 static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
1641 struct perf_sample *sample, struct thread *thread)
1642 {
1643 int printed = 0;
1644
1645 if (trace->print_sample) {
1646 double ts = (double)sample->time / NSEC_PER_MSEC;
1647
1648 printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
1649 perf_evsel__name(evsel), ts,
1650 thread__comm_str(thread),
1651 sample->pid, sample->tid, sample->cpu);
1652 }
1653
1654 return printed;
1655 }
1656
trace__sys_enter(struct trace * trace,struct perf_evsel * evsel,union perf_event * event __maybe_unused,struct perf_sample * sample)1657 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1658 union perf_event *event __maybe_unused,
1659 struct perf_sample *sample)
1660 {
1661 char *msg;
1662 void *args;
1663 size_t printed = 0;
1664 struct thread *thread;
1665 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1666 struct syscall *sc = trace__syscall_info(trace, evsel, id);
1667 struct thread_trace *ttrace;
1668
1669 if (sc == NULL)
1670 return -1;
1671
1672 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1673 ttrace = thread__trace(thread, trace->output);
1674 if (ttrace == NULL)
1675 goto out_put;
1676
1677 trace__fprintf_sample(trace, evsel, sample, thread);
1678
1679 args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1680
1681 if (ttrace->entry_str == NULL) {
1682 ttrace->entry_str = malloc(trace__entry_str_size);
1683 if (!ttrace->entry_str)
1684 goto out_put;
1685 }
1686
1687 if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1688 trace__printf_interrupted_entry(trace);
1689
1690 ttrace->entry_time = sample->time;
1691 msg = ttrace->entry_str;
1692 printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1693
1694 printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1695 args, trace, thread);
1696
1697 if (sc->is_exit) {
1698 if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
1699 trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
1700 fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
1701 }
1702 } else {
1703 ttrace->entry_pending = true;
1704 /* See trace__vfs_getname & trace__sys_exit */
1705 ttrace->filename.pending_open = false;
1706 }
1707
1708 if (trace->current != thread) {
1709 thread__put(trace->current);
1710 trace->current = thread__get(thread);
1711 }
1712 err = 0;
1713 out_put:
1714 thread__put(thread);
1715 return err;
1716 }
1717
static int trace__fprintf_sys_enter(struct trace *trace, struct perf_evsel *evsel,
1719 struct perf_sample *sample)
1720 {
1721 struct thread_trace *ttrace;
1722 struct thread *thread;
1723 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1724 struct syscall *sc = trace__syscall_info(trace, evsel, id);
1725 char msg[1024];
1726 void *args;
1727
1728 if (sc == NULL)
1729 return -1;
1730
1731 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1732 ttrace = thread__trace(thread, trace->output);
	/*
	 * We need to get ttrace just to make sure it is there when
	 * syscall__scnprintf_args() and the rest of the beautifiers
	 * access it via struct syscall_arg.
	 */
1737 if (ttrace == NULL)
1738 goto out_put;
1739
1740 args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1741 syscall__scnprintf_args(sc, msg, sizeof(msg), args, trace, thread);
1742 fprintf(trace->output, "%s", msg);
1743 err = 0;
1744 out_put:
1745 thread__put(thread);
1746 return err;
1747 }
1748
static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1750 struct perf_sample *sample,
1751 struct callchain_cursor *cursor)
1752 {
1753 struct addr_location al;
1754 int max_stack = evsel->attr.sample_max_stack ?
1755 evsel->attr.sample_max_stack :
1756 trace->max_stack;
1757
1758 if (machine__resolve(trace->host, &al, sample) < 0 ||
1759 thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack))
1760 return -1;
1761
1762 return 0;
1763 }
1764
static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1766 {
1767 /* TODO: user-configurable print_opts */
1768 const unsigned int print_opts = EVSEL__PRINT_SYM |
1769 EVSEL__PRINT_DSO |
1770 EVSEL__PRINT_UNKNOWN_AS_ADDR;
1771
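	/* 38: assumed alignment column, lining callchain entries up under the syscall output */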
1772 return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1773 }
1774
static const char *errno_to_name(struct perf_evsel *evsel, int err)
1776 {
1777 struct perf_env *env = perf_evsel__env(evsel);
1778 const char *arch_name = perf_env__arch(env);
1779
1780 return arch_syscalls__strerrno(arch_name, err);
1781 }
1782
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1784 union perf_event *event __maybe_unused,
1785 struct perf_sample *sample)
1786 {
1787 long ret;
1788 u64 duration = 0;
1789 bool duration_calculated = false;
1790 struct thread *thread;
1791 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
1792 struct syscall *sc = trace__syscall_info(trace, evsel, id);
1793 struct thread_trace *ttrace;
1794
1795 if (sc == NULL)
1796 return -1;
1797
1798 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1799 ttrace = thread__trace(thread, trace->output);
1800 if (ttrace == NULL)
1801 goto out_put;
1802
1803 trace__fprintf_sample(trace, evsel, sample, thread);
1804
1805 if (trace->summary)
1806 thread__update_stats(ttrace, id, sample);
1807
1808 ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1809
1810 if (sc->is_open && ret >= 0 && ttrace->filename.pending_open) {
1811 trace__set_fd_pathname(thread, ret, ttrace->filename.name);
1812 ttrace->filename.pending_open = false;
1813 ++trace->stats.vfs_getname;
1814 }
1815
1816 if (ttrace->entry_time) {
1817 duration = sample->time - ttrace->entry_time;
1818 if (trace__filter_duration(trace, duration))
1819 goto out;
1820 duration_calculated = true;
1821 } else if (trace->duration_filter)
1822 goto out;
1823
1824 if (sample->callchain) {
1825 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1826 if (callchain_ret == 0) {
1827 if (callchain_cursor.nr < trace->min_stack)
1828 goto out;
1829 callchain_ret = 1;
1830 }
1831 }
1832
1833 if (trace->summary_only || (ret >= 0 && trace->failure_only))
1834 goto out;
1835
1836 trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
1837
1838 if (ttrace->entry_pending) {
1839 fprintf(trace->output, "%-70s", ttrace->entry_str);
1840 } else {
1841 fprintf(trace->output, " ... [");
1842 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1843 fprintf(trace->output, "]: %s()", sc->name);
1844 }
1845
1846 if (sc->fmt == NULL) {
1847 if (ret < 0)
1848 goto errno_print;
1849 signed_print:
1850 fprintf(trace->output, ") = %ld", ret);
1851 } else if (ret < 0) {
1852 errno_print: {
1853 char bf[STRERR_BUFSIZE];
1854 const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
1855 *e = errno_to_name(evsel, -ret);
1856
1857 fprintf(trace->output, ") = -1 %s %s", e, emsg);
1858 }
1859 } else if (ret == 0 && sc->fmt->timeout)
1860 fprintf(trace->output, ") = 0 Timeout");
1861 else if (ttrace->ret_scnprintf) {
1862 char bf[1024];
1863 struct syscall_arg arg = {
1864 .val = ret,
1865 .thread = thread,
1866 .trace = trace,
1867 };
1868 ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
1869 ttrace->ret_scnprintf = NULL;
1870 fprintf(trace->output, ") = %s", bf);
1871 } else if (sc->fmt->hexret)
1872 fprintf(trace->output, ") = %#lx", ret);
1873 else if (sc->fmt->errpid) {
1874 struct thread *child = machine__find_thread(trace->host, ret, ret);
1875
1876 if (child != NULL) {
1877 fprintf(trace->output, ") = %ld", ret);
1878 if (child->comm_set)
1879 fprintf(trace->output, " (%s)", thread__comm_str(child));
1880 thread__put(child);
1881 }
1882 } else
1883 goto signed_print;
1884
1885 fputc('\n', trace->output);
1886
1887 if (callchain_ret > 0)
1888 trace__fprintf_callchain(trace, sample);
1889 else if (callchain_ret < 0)
1890 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1891 out:
1892 ttrace->entry_pending = false;
1893 err = 0;
1894 out_put:
1895 thread__put(thread);
1896 return err;
1897 }
1898
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1900 union perf_event *event __maybe_unused,
1901 struct perf_sample *sample)
1902 {
1903 struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1904 struct thread_trace *ttrace;
1905 size_t filename_len, entry_str_len, to_move;
1906 ssize_t remaining_space;
1907 char *pos;
1908 const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1909
1910 if (!thread)
1911 goto out;
1912
1913 ttrace = thread__priv(thread);
1914 if (!ttrace)
1915 goto out_put;
1916
1917 filename_len = strlen(filename);
1918 if (filename_len == 0)
1919 goto out_put;
1920
1921 if (ttrace->filename.namelen < filename_len) {
1922 char *f = realloc(ttrace->filename.name, filename_len + 1);
1923
1924 if (f == NULL)
1925 goto out_put;
1926
1927 ttrace->filename.namelen = filename_len;
1928 ttrace->filename.name = f;
1929 }
1930
1931 strcpy(ttrace->filename.name, filename);
1932 ttrace->filename.pending_open = true;
1933
1934 if (!ttrace->filename.ptr)
1935 goto out_put;
1936
1937 entry_str_len = strlen(ttrace->entry_str);
1938 remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1939 if (remaining_space <= 0)
1940 goto out_put;
1941
1942 if (filename_len > (size_t)remaining_space) {
1943 filename += filename_len - remaining_space;
1944 filename_len = remaining_space;
1945 }
1946
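	/*
	 * Splice the pathname into the pending syscall entry string at the
	 * position recorded when the pointer arg was beautified, e.g. (an
	 * illustration, not verbatim output):
	 *
	 *	"openat(AT_FDCWD, "  ->  "openat(AT_FDCWD, /etc/passwd"
	 *
	 * with the rest of the formatted entry moved past the inserted name.
	 */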
1947 to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1948 pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1949 memmove(pos + filename_len, pos, to_move);
1950 memcpy(pos, filename, filename_len);
1951
1952 ttrace->filename.ptr = 0;
1953 ttrace->filename.entry_str_pos = 0;
1954 out_put:
1955 thread__put(thread);
1956 out:
1957 return 0;
1958 }
1959
static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1961 union perf_event *event __maybe_unused,
1962 struct perf_sample *sample)
1963 {
1964 u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1965 double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1966 struct thread *thread = machine__findnew_thread(trace->host,
1967 sample->pid,
1968 sample->tid);
1969 struct thread_trace *ttrace = thread__trace(thread, trace->output);
1970
1971 if (ttrace == NULL)
1972 goto out_dump;
1973
1974 ttrace->runtime_ms += runtime_ms;
1975 trace->runtime_ms += runtime_ms;
1976 out_put:
1977 thread__put(thread);
1978 return 0;
1979
1980 out_dump:
	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 "\n",
1982 evsel->name,
1983 perf_evsel__strval(evsel, sample, "comm"),
1984 (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1985 runtime,
1986 perf_evsel__intval(evsel, sample, "vruntime"));
1987 goto out_put;
1988 }
1989
static int bpf_output__printer(enum binary_printer_ops op,
1991 unsigned int val, void *extra __maybe_unused, FILE *fp)
1992 {
1993 unsigned char ch = (unsigned char)val;
1994
1995 switch (op) {
1996 case BINARY_PRINT_CHAR_DATA:
1997 return fprintf(fp, "%c", isprint(ch) ? ch : '.');
1998 case BINARY_PRINT_DATA_BEGIN:
1999 case BINARY_PRINT_LINE_BEGIN:
2000 case BINARY_PRINT_ADDR:
2001 case BINARY_PRINT_NUM_DATA:
2002 case BINARY_PRINT_NUM_PAD:
2003 case BINARY_PRINT_SEP:
2004 case BINARY_PRINT_CHAR_PAD:
2005 case BINARY_PRINT_LINE_END:
2006 case BINARY_PRINT_DATA_END:
2007 default:
2008 break;
2009 }
2010
2011 return 0;
2012 }
2013
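/*
 * Print the raw BPF output sample payload. binary__fprintf() is assumed to
 * walk the buffer in 8-byte lines, invoking the printer above once per op
 * (address, hex bytes, separators, then BINARY_PRINT_CHAR_DATA per byte);
 * since that printer only renders CHAR_DATA, what gets emitted is just the
 * printable characters, with '.' standing in for everything else.
 */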
static void bpf_output__fprintf(struct trace *trace,
2015 struct perf_sample *sample)
2016 {
2017 binary__fprintf(sample->raw_data, sample->raw_size, 8,
2018 bpf_output__printer, NULL, trace->output);
2019 }
2020
static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
2022 union perf_event *event __maybe_unused,
2023 struct perf_sample *sample)
2024 {
2025 int callchain_ret = 0;
2026
2027 if (sample->callchain) {
2028 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2029 if (callchain_ret == 0) {
2030 if (callchain_cursor.nr < trace->min_stack)
2031 goto out;
2032 callchain_ret = 1;
2033 }
2034 }
2035
2036 trace__printf_interrupted_entry(trace);
2037 trace__fprintf_tstamp(trace, sample->time, trace->output);
2038
2039 if (trace->trace_syscalls)
2040 fprintf(trace->output, "( ): ");
2041
2042 fprintf(trace->output, "%s:", evsel->name);
2043
2044 if (perf_evsel__is_bpf_output(evsel)) {
2045 if (evsel == trace->syscalls.events.augmented)
2046 trace__fprintf_sys_enter(trace, evsel, sample);
2047 else
2048 bpf_output__fprintf(trace, sample);
2049 } else if (evsel->tp_format) {
2050 if (strncmp(evsel->tp_format->name, "sys_enter_", 10) ||
2051 trace__fprintf_sys_enter(trace, evsel, sample)) {
2052 event_format__fprintf(evsel->tp_format, sample->cpu,
2053 sample->raw_data, sample->raw_size,
2054 trace->output);
2055 }
2056 }
2057
2058 fprintf(trace->output, "\n");
2059
2060 if (callchain_ret > 0)
2061 trace__fprintf_callchain(trace, sample);
2062 else if (callchain_ret < 0)
2063 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2064 out:
2065 return 0;
2066 }
2067
static void print_location(FILE *f, struct perf_sample *sample,
2069 struct addr_location *al,
2070 bool print_dso, bool print_sym)
2071 {
2072
2073 if ((verbose > 0 || print_dso) && al->map)
2074 fprintf(f, "%s@", al->map->dso->long_name);
2075
2076 if ((verbose > 0 || print_sym) && al->sym)
2077 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
2078 al->addr - al->sym->start);
2079 else if (al->map)
2080 fprintf(f, "0x%" PRIx64, al->addr);
2081 else
2082 fprintf(f, "0x%" PRIx64, sample->addr);
2083 }
2084
static int trace__pgfault(struct trace *trace,
2086 struct perf_evsel *evsel,
2087 union perf_event *event __maybe_unused,
2088 struct perf_sample *sample)
2089 {
2090 struct thread *thread;
2091 struct addr_location al;
2092 char map_type = 'd';
2093 struct thread_trace *ttrace;
2094 int err = -1;
2095 int callchain_ret = 0;
2096
2097 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2098
2099 if (sample->callchain) {
2100 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2101 if (callchain_ret == 0) {
2102 if (callchain_cursor.nr < trace->min_stack)
2103 goto out_put;
2104 callchain_ret = 1;
2105 }
2106 }
2107
2108 ttrace = thread__trace(thread, trace->output);
2109 if (ttrace == NULL)
2110 goto out_put;
2111
2112 if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2113 ttrace->pfmaj++;
2114 else
2115 ttrace->pfmin++;
2116
2117 if (trace->summary_only)
2118 goto out;
2119
2120 thread__find_symbol(thread, sample->cpumode, sample->ip, &al);
2121
2122 trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
2123
2124 fprintf(trace->output, "%sfault [",
2125 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2126 "maj" : "min");
2127
2128 print_location(trace->output, sample, &al, false, true);
2129
2130 fprintf(trace->output, "] => ");
2131
	thread__find_symbol(thread, sample->cpumode, sample->addr, &al);

	if (!al.map) /* no map for the target address: unknown */
		map_type = '?';
2142
2143 print_location(trace->output, sample, &al, true, false);
2144
2145 fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2146
2147 if (callchain_ret > 0)
2148 trace__fprintf_callchain(trace, sample);
2149 else if (callchain_ret < 0)
2150 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2151 out:
2152 err = 0;
2153 out_put:
2154 thread__put(thread);
2155 return err;
2156 }
2157
static void trace__set_base_time(struct trace *trace,
2159 struct perf_evsel *evsel,
2160 struct perf_sample *sample)
2161 {
	/*
	 * BPF events were not setting PERF_SAMPLE_TIME, so be robust and
	 * don't use sample->time unconditionally: we may end up having
	 * some other event in the future without PERF_SAMPLE_TIME for a
	 * good reason, i.e. we may not be interested in its timestamps,
	 * just in the fact that it took place, picking some piece of
	 * information when it appears in our event stream (vfs_getname
	 * comes to mind).
	 */
2170 if (trace->base_time == 0 && !trace->full_time &&
2171 (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2172 trace->base_time = sample->time;
2173 }
2174
static int trace__process_sample(struct perf_tool *tool,
2176 union perf_event *event,
2177 struct perf_sample *sample,
2178 struct perf_evsel *evsel,
2179 struct machine *machine __maybe_unused)
2180 {
2181 struct trace *trace = container_of(tool, struct trace, tool);
2182 struct thread *thread;
2183 int err = 0;
2184
2185 tracepoint_handler handler = evsel->handler;
2186
2187 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2188 if (thread && thread__is_filtered(thread))
2189 goto out;
2190
2191 trace__set_base_time(trace, evsel, sample);
2192
2193 if (handler) {
2194 ++trace->nr_events;
2195 handler(trace, evsel, event, sample);
2196 }
2197 out:
2198 thread__put(thread);
2199 return err;
2200 }
2201
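/*
 * 'perf trace record' is a thin wrapper around 'perf record'. For the
 * default syscall tracing case the argv assembled below is assumed to
 * expand to something like:
 *
 *	perf record -R -m 1024 -c 1 \
 *		-e raw_syscalls:sys_enter,raw_syscalls:sys_exit <workload>
 *
 * with the major/minor fault events appended when -F/--pf was given.
 */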
static int trace__record(struct trace *trace, int argc, const char **argv)
2203 {
2204 unsigned int rec_argc, i, j;
2205 const char **rec_argv;
2206 const char * const record_args[] = {
2207 "record",
2208 "-R",
2209 "-m", "1024",
2210 "-c", "1",
2211 };
2212
2213 const char * const sc_args[] = { "-e", };
2214 unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2215 const char * const majpf_args[] = { "-e", "major-faults" };
2216 unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2217 const char * const minpf_args[] = { "-e", "minor-faults" };
2218 unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2219
2220 /* +1 is for the event string below */
2221 rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2222 majpf_args_nr + minpf_args_nr + argc;
2223 rec_argv = calloc(rec_argc + 1, sizeof(char *));
2224
2225 if (rec_argv == NULL)
2226 return -ENOMEM;
2227
2228 j = 0;
2229 for (i = 0; i < ARRAY_SIZE(record_args); i++)
2230 rec_argv[j++] = record_args[i];
2231
2232 if (trace->trace_syscalls) {
2233 for (i = 0; i < sc_args_nr; i++)
2234 rec_argv[j++] = sc_args[i];
2235
2236 /* event string may be different for older kernels - e.g., RHEL6 */
2237 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2238 rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2239 else if (is_valid_tracepoint("syscalls:sys_enter"))
2240 rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2241 else {
2242 pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2243 free(rec_argv);
2244 return -1;
2245 }
2246 }
2247
2248 if (trace->trace_pgfaults & TRACE_PFMAJ)
2249 for (i = 0; i < majpf_args_nr; i++)
2250 rec_argv[j++] = majpf_args[i];
2251
2252 if (trace->trace_pgfaults & TRACE_PFMIN)
2253 for (i = 0; i < minpf_args_nr; i++)
2254 rec_argv[j++] = minpf_args[i];
2255
2256 for (i = 0; i < (unsigned int)argc; i++)
2257 rec_argv[j++] = argv[i];
2258
2259 return cmd_record(j, rec_argv);
2260 }
2261
2262 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2263
static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2265 {
2266 struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2267
2268 if (IS_ERR(evsel))
2269 return false;
2270
2271 if (perf_evsel__field(evsel, "pathname") == NULL) {
2272 perf_evsel__delete(evsel);
2273 return false;
2274 }
2275
2276 evsel->handler = trace__vfs_getname;
2277 perf_evlist__add(evlist, evsel);
2278 return true;
2279 }
2280
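/*
 * Page faults are software events: sample_period = 1 asks for every single
 * fault, and mmap_data is assumed to make the kernel also emit mmap records
 * for data mappings, so the faulting addresses can be resolved to a map in
 * trace__pgfault().
 */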
static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2282 {
2283 struct perf_evsel *evsel;
2284 struct perf_event_attr attr = {
2285 .type = PERF_TYPE_SOFTWARE,
2286 .mmap_data = 1,
2287 };
2288
2289 attr.config = config;
2290 attr.sample_period = 1;
2291
2292 event_attr_init(&attr);
2293
2294 evsel = perf_evsel__new(&attr);
2295 if (evsel)
2296 evsel->handler = trace__pgfault;
2297
2298 return evsel;
2299 }
2300
static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2302 {
2303 const u32 type = event->header.type;
2304 struct perf_evsel *evsel;
2305
2306 if (type != PERF_RECORD_SAMPLE) {
2307 trace__process_event(trace, trace->host, event, sample);
2308 return;
2309 }
2310
2311 evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2312 if (evsel == NULL) {
2313 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2314 return;
2315 }
2316
2317 trace__set_base_time(trace, evsel, sample);
2318
2319 if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2320 sample->raw_data == NULL) {
2321 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2322 perf_evsel__name(evsel), sample->tid,
2323 sample->cpu, sample->raw_size);
2324 } else {
2325 tracepoint_handler handler = evsel->handler;
2326 handler(trace, evsel, event, sample);
2327 }
2328 }
2329
static int trace__add_syscall_newtp(struct trace *trace)
2331 {
2332 int ret = -1;
2333 struct perf_evlist *evlist = trace->evlist;
2334 struct perf_evsel *sys_enter, *sys_exit;
2335
2336 sys_enter = perf_evsel__raw_syscall_newtp("sys_enter", trace__sys_enter);
2337 if (sys_enter == NULL)
2338 goto out;
2339
2340 if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2341 goto out_delete_sys_enter;
2342
2343 sys_exit = perf_evsel__raw_syscall_newtp("sys_exit", trace__sys_exit);
2344 if (sys_exit == NULL)
2345 goto out_delete_sys_enter;
2346
2347 if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2348 goto out_delete_sys_exit;
2349
2350 perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
2351 perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);
2352
2353 perf_evlist__add(evlist, sys_enter);
2354 perf_evlist__add(evlist, sys_exit);
2355
2356 if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain leading
		 * to the syscall; allow overriding that for debugging
		 * reasons using --kernel-syscall-graph.
		 */
2362 sys_exit->attr.exclude_callchain_kernel = 1;
2363 }
2364
2365 trace->syscalls.events.sys_enter = sys_enter;
2366 trace->syscalls.events.sys_exit = sys_exit;
2367
2368 ret = 0;
2369 out:
2370 return ret;
2371
2372 out_delete_sys_exit:
2373 perf_evsel__delete_priv(sys_exit);
2374 out_delete_sys_enter:
2375 perf_evsel__delete_priv(sys_enter);
2376 goto out;
2377 }
2378
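/*
 * Turn the syscall id qualifier list into a tracepoint filter. With, say,
 * '-e open,close' on x86_64 the expression built by
 * asprintf_expr_inout_ints() is assumed to look like "id == 2 || id == 3",
 * and with '-e !open,close' like "id != 2 && id != 3" (exact ids and
 * spacing are illustrative), attached to both sys_enter and sys_exit.
 */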
static int trace__set_ev_qualifier_filter(struct trace *trace)
2380 {
2381 int err = -1;
2382 struct perf_evsel *sys_exit;
2383 char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2384 trace->ev_qualifier_ids.nr,
2385 trace->ev_qualifier_ids.entries);
2386
2387 if (filter == NULL)
2388 goto out_enomem;
2389
2390 if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2391 filter)) {
2392 sys_exit = trace->syscalls.events.sys_exit;
2393 err = perf_evsel__append_tp_filter(sys_exit, filter);
2394 }
2395
2396 free(filter);
2397 out:
2398 return err;
2399 out_enomem:
2400 errno = ENOMEM;
2401 goto out;
2402 }
2403
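/*
 * When tracing system wide, events generated by perf itself and by the
 * terminal or ssh session displaying its output feed back into the trace,
 * generating yet more output, ad infinitum. Break the loop by filtering
 * out our own pid and, walking up the parents, the first sshd found.
 */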
static int trace__set_filter_loop_pids(struct trace *trace)
2405 {
2406 unsigned int nr = 1;
2407 pid_t pids[32] = {
2408 getpid(),
2409 };
2410 struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
2411
2412 while (thread && nr < ARRAY_SIZE(pids)) {
2413 struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);
2414
2415 if (parent == NULL)
2416 break;
2417
2418 if (!strcmp(thread__comm_str(parent), "sshd")) {
2419 pids[nr++] = parent->tid;
2420 break;
2421 }
2422 thread = parent;
2423 }
2424
2425 return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
2426 }
2427
static int trace__run(struct trace *trace, int argc, const char **argv)
2429 {
2430 struct perf_evlist *evlist = trace->evlist;
2431 struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2432 int err = -1, i;
2433 unsigned long before;
2434 const bool forks = argc > 0;
2435 bool draining = false;
2436
2437 trace->live = true;
2438
2439 if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2440 goto out_error_raw_syscalls;
2441
2442 if (trace->trace_syscalls)
2443 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2444
2445 if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2446 pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2447 if (pgfault_maj == NULL)
2448 goto out_error_mem;
2449 perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2450 perf_evlist__add(evlist, pgfault_maj);
2451 }
2452
2453 if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2454 pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2455 if (pgfault_min == NULL)
2456 goto out_error_mem;
2457 perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2458 perf_evlist__add(evlist, pgfault_min);
2459 }
2460
2461 if (trace->sched &&
2462 perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2463 trace__sched_stat_runtime))
2464 goto out_error_sched_stat_runtime;
2465
2466 /*
2467 * If a global cgroup was set, apply it to all the events without an
2468 * explicit cgroup. I.e.:
2469 *
2470 * trace -G A -e sched:*switch
2471 *
2472 * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
2473 * _and_ sched:sched_switch to the 'A' cgroup, while:
2474 *
2475 * trace -e sched:*switch -G A
2476 *
2477 * will only set the sched:sched_switch event to the 'A' cgroup, all the
	 * other events (raw_syscalls:sys_{enter,exit}, etc) are left "without"
2479 * a cgroup (on the root cgroup, sys wide, etc).
2480 *
2481 * Multiple cgroups:
2482 *
2483 * trace -G A -e sched:*switch -G B
2484 *
2485 * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
2486 * to the 'B' cgroup.
2487 *
2488 * evlist__set_default_cgroup() grabs a reference of the passed cgroup
2489 * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
2490 */
2491 if (trace->cgroup)
2492 evlist__set_default_cgroup(trace->evlist, trace->cgroup);
2493
2494 err = perf_evlist__create_maps(evlist, &trace->opts.target);
2495 if (err < 0) {
2496 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2497 goto out_delete_evlist;
2498 }
2499
2500 err = trace__symbols_init(trace, evlist);
2501 if (err < 0) {
2502 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2503 goto out_delete_evlist;
2504 }
2505
2506 perf_evlist__config(evlist, &trace->opts, &callchain_param);
2507
2508 signal(SIGCHLD, sig_handler);
2509 signal(SIGINT, sig_handler);
2510
2511 if (forks) {
2512 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2513 argv, false, NULL);
2514 if (err < 0) {
2515 fprintf(trace->output, "Couldn't run the workload!\n");
2516 goto out_delete_evlist;
2517 }
2518 }
2519
2520 err = perf_evlist__open(evlist);
2521 if (err < 0)
2522 goto out_error_open;
2523
2524 err = bpf__apply_obj_config();
2525 if (err) {
2526 char errbuf[BUFSIZ];
2527
2528 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2529 pr_err("ERROR: Apply config to BPF failed: %s\n",
2530 errbuf);
2531 goto out_error_open;
2532 }
2533
2534 /*
2535 * Better not use !target__has_task() here because we need to cover the
2536 * case where no threads were specified in the command line, but a
2537 * workload was, and in that case we will fill in the thread_map when
2538 * we fork the workload in perf_evlist__prepare_workload.
2539 */
2540 if (trace->filter_pids.nr > 0)
2541 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2542 else if (thread_map__pid(evlist->threads, 0) == -1)
2543 err = trace__set_filter_loop_pids(trace);
2544
2545 if (err < 0)
2546 goto out_error_mem;
2547
2548 if (trace->ev_qualifier_ids.nr > 0) {
2549 err = trace__set_ev_qualifier_filter(trace);
2550 if (err < 0)
2551 goto out_errno;
2552
2553 pr_debug("event qualifier tracepoint filter: %s\n",
2554 trace->syscalls.events.sys_exit->filter);
2555 }
2556
2557 err = perf_evlist__apply_filters(evlist, &evsel);
2558 if (err < 0)
2559 goto out_error_apply_filters;
2560
2561 err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
2562 if (err < 0)
2563 goto out_error_mmap;
2564
2565 if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
2566 perf_evlist__enable(evlist);
2567
2568 if (forks)
2569 perf_evlist__start_workload(evlist);
2570
2571 if (trace->opts.initial_delay) {
2572 usleep(trace->opts.initial_delay * 1000);
2573 perf_evlist__enable(evlist);
2574 }
2575
2576 trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2577 evlist->threads->nr > 1 ||
2578 perf_evlist__first(evlist)->attr.inherit;
2579
	/*
	 * Now that we already used evsel->attr to ask the kernel to setup the
	 * events, let's reuse evsel->attr.sample_max_stack as the limit in
	 * trace__resolve_callchain(), allowing per-event max-stack settings
	 * to override an explicitly set --max-stack global setting.
	 */
2586 evlist__for_each_entry(evlist, evsel) {
2587 if (evsel__has_callchain(evsel) &&
2588 evsel->attr.sample_max_stack == 0)
2589 evsel->attr.sample_max_stack = trace->max_stack;
2590 }
2591 again:
2592 before = trace->nr_events;
2593
2594 for (i = 0; i < evlist->nr_mmaps; i++) {
2595 union perf_event *event;
2596 struct perf_mmap *md;
2597
2598 md = &evlist->mmap[i];
2599 if (perf_mmap__read_init(md) < 0)
2600 continue;
2601
2602 while ((event = perf_mmap__read_event(md)) != NULL) {
2603 struct perf_sample sample;
2604
2605 ++trace->nr_events;
2606
2607 err = perf_evlist__parse_sample(evlist, event, &sample);
2608 if (err) {
2609 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2610 goto next_event;
2611 }
2612
2613 trace__handle_event(trace, event, &sample);
2614 next_event:
2615 perf_mmap__consume(md);
2616
2617 if (interrupted)
2618 goto out_disable;
2619
2620 if (done && !draining) {
2621 perf_evlist__disable(evlist);
2622 draining = true;
2623 }
2624 }
2625 perf_mmap__read_done(md);
2626 }
2627
2628 if (trace->nr_events == before) {
2629 int timeout = done ? 100 : -1;
2630
2631 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2632 if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2633 draining = true;
2634
2635 goto again;
2636 }
2637 } else {
2638 goto again;
2639 }
2640
2641 out_disable:
2642 thread__zput(trace->current);
2643
2644 perf_evlist__disable(evlist);
2645
2646 if (!err) {
2647 if (trace->summary)
2648 trace__fprintf_thread_summary(trace, trace->output);
2649
2650 if (trace->show_tool_stats) {
2651 fprintf(trace->output, "Stats:\n "
2652 " vfs_getname : %" PRIu64 "\n"
2653 " proc_getname: %" PRIu64 "\n",
2654 trace->stats.vfs_getname,
2655 trace->stats.proc_getname);
2656 }
2657 }
2658
2659 out_delete_evlist:
2660 trace__symbols__exit(trace);
2661
2662 perf_evlist__delete(evlist);
2663 cgroup__put(trace->cgroup);
2664 trace->evlist = NULL;
2665 trace->live = false;
2666 return err;
2667 {
2668 char errbuf[BUFSIZ];
2669
2670 out_error_sched_stat_runtime:
2671 tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2672 goto out_error;
2673
2674 out_error_raw_syscalls:
2675 tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2676 goto out_error;
2677
2678 out_error_mmap:
2679 perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2680 goto out_error;
2681
2682 out_error_open:
2683 perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2684
2685 out_error:
2686 fprintf(trace->output, "%s\n", errbuf);
2687 goto out_delete_evlist;
2688
2689 out_error_apply_filters:
2690 fprintf(trace->output,
2691 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2692 evsel->filter, perf_evsel__name(evsel), errno,
2693 str_error_r(errno, errbuf, sizeof(errbuf)));
2694 goto out_delete_evlist;
2695 }
2696 out_error_mem:
2697 fprintf(trace->output, "Not enough memory to run!\n");
2698 goto out_delete_evlist;
2699
2700 out_errno:
2701 fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2702 goto out_delete_evlist;
2703 }
2704
static int trace__replay(struct trace *trace)
2706 {
2707 const struct perf_evsel_str_handler handlers[] = {
2708 { "probe:vfs_getname", trace__vfs_getname, },
2709 };
2710 struct perf_data data = {
2711 .file = {
2712 .path = input_name,
2713 },
2714 .mode = PERF_DATA_MODE_READ,
2715 .force = trace->force,
2716 };
2717 struct perf_session *session;
2718 struct perf_evsel *evsel;
2719 int err = -1;
2720
2721 trace->tool.sample = trace__process_sample;
2722 trace->tool.mmap = perf_event__process_mmap;
2723 trace->tool.mmap2 = perf_event__process_mmap2;
2724 trace->tool.comm = perf_event__process_comm;
2725 trace->tool.exit = perf_event__process_exit;
2726 trace->tool.fork = perf_event__process_fork;
2727 trace->tool.attr = perf_event__process_attr;
2728 trace->tool.tracing_data = perf_event__process_tracing_data;
2729 trace->tool.build_id = perf_event__process_build_id;
2730 trace->tool.namespaces = perf_event__process_namespaces;
2731
2732 trace->tool.ordered_events = true;
2733 trace->tool.ordering_requires_timestamps = true;
2734
2735 /* add tid to output */
2736 trace->multiple_threads = true;
2737
2738 session = perf_session__new(&data, false, &trace->tool);
2739 if (session == NULL)
2740 return -1;
2741
2742 if (trace->opts.target.pid)
2743 symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
2744
2745 if (trace->opts.target.tid)
2746 symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
2747
2748 if (symbol__init(&session->header.env) < 0)
2749 goto out;
2750
2751 trace->host = &session->machines.host;
2752
2753 err = perf_session__set_tracepoints_handlers(session, handlers);
2754 if (err)
2755 goto out;
2756
2757 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2758 "raw_syscalls:sys_enter");
2759 /* older kernels have syscalls tp versus raw_syscalls */
2760 if (evsel == NULL)
2761 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2762 "syscalls:sys_enter");
2763
2764 if (evsel &&
2765 (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_enter) < 0 ||
2766 perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error initializing raw_syscalls:sys_enter event\n");
2768 goto out;
2769 }
2770
2771 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2772 "raw_syscalls:sys_exit");
2773 if (evsel == NULL)
2774 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2775 "syscalls:sys_exit");
2776 if (evsel &&
2777 (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_exit) < 0 ||
2778 perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error initializing raw_syscalls:sys_exit event\n");
2780 goto out;
2781 }
2782
2783 evlist__for_each_entry(session->evlist, evsel) {
2784 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2785 (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2786 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2787 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2788 evsel->handler = trace__pgfault;
2789 }
2790
2791 setup_pager();
2792
	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d\n", err);
	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);
2799
2800 out:
2801 perf_session__delete(session);
2802
2803 return err;
2804 }
2805
static size_t trace__fprintf_threads_header(FILE *fp)
2807 {
2808 size_t printed;
2809
2810 printed = fprintf(fp, "\n Summary of events:\n\n");
2811
2812 return printed;
2813 }
2814
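/*
 * rb_resort helper: re-sort the per-thread syscall stats intlist into an
 * rb tree keyed by the total time spent in each syscall (msecs), the order
 * in which thread__dump_stats() walks the summary table. The block that
 * follows the macro is the per-entry constructor, copying out of each
 * intlist node what the summary needs.
 */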
2815 DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
2816 struct stats *stats;
2817 double msecs;
2818 int syscall;
2819 )
2820 {
2821 struct int_node *source = rb_entry(nd, struct int_node, rb_node);
2822 struct stats *stats = source->priv;
2823
2824 entry->syscall = source->i;
2825 entry->stats = stats;
2826 entry->msecs = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
2827 }
2828
static size_t thread__dump_stats(struct thread_trace *ttrace,
2830 struct trace *trace, FILE *fp)
2831 {
2832 size_t printed = 0;
2833 struct syscall *sc;
2834 struct rb_node *nd;
2835 DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
2836
2837 if (syscall_stats == NULL)
2838 return 0;
2839
2840 printed += fprintf(fp, "\n");
2841
2842 printed += fprintf(fp, " syscall calls total min avg max stddev\n");
2843 printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
2844 printed += fprintf(fp, " --------------- -------- --------- --------- --------- --------- ------\n");
2845
2846 resort_rb__for_each_entry(nd, syscall_stats) {
2847 struct stats *stats = syscall_stats_entry->stats;
2848 if (stats) {
2849 double min = (double)(stats->min) / NSEC_PER_MSEC;
2850 double max = (double)(stats->max) / NSEC_PER_MSEC;
2851 double avg = avg_stats(stats);
2852 double pct;
2853 u64 n = (u64) stats->n;
2854
2855 pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2856 avg /= NSEC_PER_MSEC;
2857
2858 sc = &trace->syscalls.table[syscall_stats_entry->syscall];
2859 printed += fprintf(fp, " %-15s", sc->name);
2860 printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2861 n, syscall_stats_entry->msecs, min, avg);
2862 printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2863 }
2864 }
2865
2866 resort_rb__delete(syscall_stats);
2867 printed += fprintf(fp, "\n\n");
2868
2869 return printed;
2870 }
2871
static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2873 {
2874 size_t printed = 0;
2875 struct thread_trace *ttrace = thread__priv(thread);
2876 double ratio;
2877
2878 if (ttrace == NULL)
2879 return 0;
2880
2881 ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2882
2883 printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2884 printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2885 printed += fprintf(fp, "%.1f%%", ratio);
2886 if (ttrace->pfmaj)
2887 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2888 if (ttrace->pfmin)
2889 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2890 if (trace->sched)
2891 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2892 else if (fputc('\n', fp) != EOF)
2893 ++printed;
2894
2895 printed += thread__dump_stats(ttrace, trace, fp);
2896
2897 return printed;
2898 }
2899
static unsigned long thread__nr_events(struct thread_trace *ttrace)
2901 {
2902 return ttrace ? ttrace->nr_events : 0;
2903 }
2904
2905 DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
2906 struct thread *thread;
2907 )
2908 {
2909 entry->thread = rb_entry(nd, struct thread, rb_node);
2910 }
2911
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2913 {
2914 size_t printed = trace__fprintf_threads_header(fp);
2915 struct rb_node *nd;
2916 int i;
2917
2918 for (i = 0; i < THREADS__TABLE_SIZE; i++) {
2919 DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);
2920
2921 if (threads == NULL) {
2922 fprintf(fp, "%s", "Error sorting output by nr_events!\n");
2923 return 0;
2924 }
2925
2926 resort_rb__for_each_entry(nd, threads)
2927 printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
2928
2929 resort_rb__delete(threads);
2930 }
2931 return printed;
2932 }
2933
static int trace__set_duration(const struct option *opt, const char *str,
2935 int unset __maybe_unused)
2936 {
2937 struct trace *trace = opt->value;
2938
2939 trace->duration_filter = atof(str);
2940 return 0;
2941 }
2942
static int trace__set_filter_pids(const struct option *opt, const char *str,
2944 int unset __maybe_unused)
2945 {
2946 int ret = -1;
2947 size_t i;
2948 struct trace *trace = opt->value;
	/*
	 * FIXME: introduce an intarray class, plainly parse the csv and
	 * create a { int nr, int entries[] } struct...
	 */
2953 struct intlist *list = intlist__new(str);
2954
2955 if (list == NULL)
2956 return -1;
2957
	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
	trace->filter_pids.entries = calloc(i, sizeof(pid_t));

	if (trace->filter_pids.entries == NULL)
		goto out;

	trace->filter_pids.entries[0] = getpid();

	for (i = 1; i < trace->filter_pids.nr; ++i)
		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;

	ret = 0;
out:
	intlist__delete(list); /* don't leak the parsed list on the error path */
	return ret;
2973 }
2974
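/*
 * Open the -o/--output file, first rotating a pre-existing non-empty file
 * of the same name to "<name>.old", much like perf.data gets rotated to
 * perf.data.old by the recording tools.
 */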
static int trace__open_output(struct trace *trace, const char *filename)
2976 {
2977 struct stat st;
2978
2979 if (!stat(filename, &st) && st.st_size) {
2980 char oldname[PATH_MAX];
2981
2982 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2983 unlink(oldname);
2984 rename(filename, oldname);
2985 }
2986
2987 trace->output = fopen(filename, "w");
2988
2989 return trace->output == NULL ? -errno : 0;
2990 }
2991
static int parse_pagefaults(const struct option *opt, const char *str,
2993 int unset __maybe_unused)
2994 {
2995 int *trace_pgfaults = opt->value;
2996
2997 if (strcmp(str, "all") == 0)
2998 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2999 else if (strcmp(str, "maj") == 0)
3000 *trace_pgfaults |= TRACE_PFMAJ;
3001 else if (strcmp(str, "min") == 0)
3002 *trace_pgfaults |= TRACE_PFMIN;
3003 else
3004 return -1;
3005
3006 return 0;
3007 }
3008
static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
3010 {
3011 struct perf_evsel *evsel;
3012
3013 evlist__for_each_entry(evlist, evsel)
3014 evsel->handler = handler;
3015 }
3016
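/*
 * Hook up the id/args/ret accessors for per-syscall tracepoints. The
 * syscalls:sys_enter_NAME/sys_exit_NAME payloads are assumed to carry the
 * syscall number right after the common fields, with the six arguments
 * (sys_enter_*) or the return value (sys_exit_*) starting one u64 past it,
 * padding included, which is what the offset arithmetic below relies on.
 */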
static int evlist__set_syscall_tp_fields(struct perf_evlist *evlist)
3018 {
3019 struct perf_evsel *evsel;
3020
3021 evlist__for_each_entry(evlist, evsel) {
3022 if (evsel->priv || !evsel->tp_format)
3023 continue;
3024
3025 if (strcmp(evsel->tp_format->system, "syscalls"))
3026 continue;
3027
3028 if (perf_evsel__init_syscall_tp(evsel))
3029 return -1;
3030
3031 if (!strncmp(evsel->tp_format->name, "sys_enter_", 10)) {
3032 struct syscall_tp *sc = evsel->priv;
3033
3034 if (__tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64)))
3035 return -1;
3036 } else if (!strncmp(evsel->tp_format->name, "sys_exit_", 9)) {
3037 struct syscall_tp *sc = evsel->priv;
3038
3039 if (__tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap))
3040 return -1;
3041 }
3042 }
3043
3044 return 0;
3045 }
3046
/*
 * XXX: Hackish, just splitting the combined -e+--event (syscalls
 * (raw_syscalls:sys_{enter,exit}) + events (tracepoints, HW, SW, etc))
 * to use the existing facilities unchanged (trace->ev_qualifier +
 * parse_options()).
 *
 * It'd be better to introduce a parse_options() variant that would
 * return a list with the terms it didn't match to an event...
 */
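/*
 * Example: with '-e open*,sched:sched_switch' the loop below is assumed to
 * put "open*" (a syscall glob) in lists[1], which becomes the ev_qualifier,
 * and "sched:sched_switch" in lists[0], which is handed unchanged to
 * parse_events_option().
 */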
static int trace__parse_events_option(const struct option *opt, const char *str,
3056 int unset __maybe_unused)
3057 {
3058 struct trace *trace = (struct trace *)opt->value;
3059 const char *s = str;
3060 char *sep = NULL, *lists[2] = { NULL, NULL, };
3061 int len = strlen(str) + 1, err = -1, list, idx;
3062 char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
3063 char group_name[PATH_MAX];
3064
3065 if (strace_groups_dir == NULL)
3066 return -1;
3067
3068 if (*s == '!') {
3069 ++s;
3070 trace->not_ev_qualifier = true;
3071 }
3072
3073 while (1) {
3074 if ((sep = strchr(s, ',')) != NULL)
3075 *sep = '\0';
3076
3077 list = 0;
3078 if (syscalltbl__id(trace->sctbl, s) >= 0 ||
3079 syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
3080 list = 1;
3081 } else {
3082 path__join(group_name, sizeof(group_name), strace_groups_dir, s);
3083 if (access(group_name, R_OK) == 0)
3084 list = 1;
3085 }
3086
3087 if (lists[list]) {
3088 sprintf(lists[list] + strlen(lists[list]), ",%s", s);
3089 } else {
3090 lists[list] = malloc(len);
3091 if (lists[list] == NULL)
3092 goto out;
3093 strcpy(lists[list], s);
3094 }
3095
3096 if (!sep)
3097 break;
3098
3099 *sep = ',';
3100 s = sep + 1;
3101 }
3102
3103 if (lists[1] != NULL) {
3104 struct strlist_config slist_config = {
3105 .dirname = strace_groups_dir,
3106 };
3107
3108 trace->ev_qualifier = strlist__new(lists[1], &slist_config);
3109 if (trace->ev_qualifier == NULL) {
			fputs("Not enough memory to parse event qualifier\n", trace->output);
3111 goto out;
3112 }
3113
3114 if (trace__validate_ev_qualifier(trace))
3115 goto out;
3116 trace->trace_syscalls = true;
3117 }
3118
3119 err = 0;
3120
3121 if (lists[0]) {
3122 struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
3123 "event selector. use 'perf list' to list available events",
3124 parse_events_option);
3125 err = parse_events_option(&o, lists[0], 0);
3126 }
3127 out:
3128 if (sep)
3129 *sep = ',';
3130
3131 return err;
3132 }
3133
static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
3135 {
3136 struct trace *trace = opt->value;
3137
3138 if (!list_empty(&trace->evlist->entries))
3139 return parse_cgroups(opt, str, unset);
3140
3141 trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);
3142
3143 return 0;
3144 }
3145
int cmd_trace(int argc, const char **argv)
3147 {
3148 const char *trace_usage[] = {
3149 "perf trace [<options>] [<command>]",
3150 "perf trace [<options>] -- <command> [<options>]",
3151 "perf trace record [<options>] [<command>]",
3152 "perf trace record [<options>] -- <command> [<options>]",
3153 NULL
3154 };
3155 struct trace trace = {
3156 .syscalls = {
		.max = -1,
3158 },
3159 .opts = {
3160 .target = {
3161 .uid = UINT_MAX,
3162 .uses_mmap = true,
3163 },
3164 .user_freq = UINT_MAX,
3165 .user_interval = ULLONG_MAX,
3166 .no_buffering = true,
3167 .mmap_pages = UINT_MAX,
3168 .proc_map_timeout = 500,
3169 },
3170 .output = stderr,
3171 .show_comm = true,
3172 .trace_syscalls = false,
3173 .kernel_syscallchains = false,
3174 .max_stack = UINT_MAX,
3175 };
3176 const char *output_name = NULL;
3177 const struct option trace_options[] = {
3178 OPT_CALLBACK('e', "event", &trace, "event",
3179 "event/syscall selector. use 'perf list' to list available events",
3180 trace__parse_events_option),
3181 OPT_BOOLEAN(0, "comm", &trace.show_comm,
3182 "show the thread COMM next to its id"),
3183 OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
3184 OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
3185 trace__parse_events_option),
3186 OPT_STRING('o', "output", &output_name, "file", "output file name"),
3187 OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
3188 OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
3189 "trace events on existing process id"),
3190 OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
3191 "trace events on existing thread id"),
3192 OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
3193 "pids to filter (by the kernel)", trace__set_filter_pids),
3194 OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
3195 "system-wide collection from all CPUs"),
3196 OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
3197 "list of cpus to monitor"),
3198 OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
3199 "child tasks do not inherit counters"),
3200 OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
3201 "number of mmap data pages",
3202 perf_evlist__parse_mmap_pages),
3203 OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
3204 "user to profile"),
3205 OPT_CALLBACK(0, "duration", &trace, "float",
3206 "show only events with duration > N.M ms",
3207 trace__set_duration),
3208 OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
3209 OPT_INCR('v', "verbose", &verbose, "be more verbose"),
3210 OPT_BOOLEAN('T', "time", &trace.full_time,
3211 "Show full timestamp, not time relative to first start"),
3212 OPT_BOOLEAN(0, "failure", &trace.failure_only,
3213 "Show only syscalls that failed"),
3214 OPT_BOOLEAN('s', "summary", &trace.summary_only,
3215 "Show only syscall summary with statistics"),
3216 OPT_BOOLEAN('S', "with-summary", &trace.summary,
3217 "Show all syscalls and summary with statistics"),
3218 OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
3219 "Trace pagefaults", parse_pagefaults, "maj"),
3220 OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
3221 OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
3222 OPT_CALLBACK(0, "call-graph", &trace.opts,
3223 "record_mode[,record_size]", record_callchain_help,
3224 &record_parse_callchain_opt),
3225 OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
3226 "Show the kernel callchains on the syscall exit path"),
3227 OPT_UINTEGER(0, "min-stack", &trace.min_stack,
3228 "Set the minimum stack depth when parsing the callchain, "
3229 "anything below the specified depth will be ignored."),
3230 OPT_UINTEGER(0, "max-stack", &trace.max_stack,
3231 "Set the maximum stack depth when parsing the callchain, "
3232 "anything beyond the specified depth will be ignored. "
3233 "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
3234 OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
3235 "print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
3236 OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
3237 "per thread proc mmap processing timeout in ms"),
3238 OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
3239 trace__parse_cgroups),
3240 OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
3241 "ms to wait before starting measurement after program "
3242 "start"),
3243 OPT_END()
3244 };
3245 bool __maybe_unused max_stack_user_set = true;
3246 bool mmap_pages_user_set = true;
3247 struct perf_evsel *evsel;
3248 const char * const trace_subcommands[] = { "record", NULL };
3249 int err = -1;
3250 char bf[BUFSIZ];
3251
3252 signal(SIGSEGV, sighandler_dump_stack);
3253 signal(SIGFPE, sighandler_dump_stack);
3254
3255 trace.evlist = perf_evlist__new();
3256 trace.sctbl = syscalltbl__new();
3257
3258 if (trace.evlist == NULL || trace.sctbl == NULL) {
3259 pr_err("Not enough memory to run!\n");
3260 err = -ENOMEM;
3261 goto out;
3262 }
3263
3264 argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3265 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3266
3267 if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
3268 usage_with_options_msg(trace_usage, trace_options,
3269 "cgroup monitoring only available in system-wide mode");
3270 }
3271
3272 evsel = bpf__setup_output_event(trace.evlist, "__augmented_syscalls__");
3273 if (IS_ERR(evsel)) {
3274 bpf__strerror_setup_output_event(trace.evlist, PTR_ERR(evsel), bf, sizeof(bf));
3275 pr_err("ERROR: Setup trace syscalls enter failed: %s\n", bf);
3276 goto out;
3277 }
3278
3279 if (evsel) {
3280 if (perf_evsel__init_augmented_syscall_tp(evsel) ||
3281 perf_evsel__init_augmented_syscall_tp_args(evsel))
3282 goto out;
3283 trace.syscalls.events.augmented = evsel;
3284 }
3285
3286 err = bpf__setup_stdout(trace.evlist);
3287 if (err) {
3288 bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
3289 pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
3290 goto out;
3291 }
3292
3293 err = -1;
3294
3295 if (trace.trace_pgfaults) {
3296 trace.opts.sample_address = true;
3297 trace.opts.sample_time = true;
3298 }
3299
3300 if (trace.opts.mmap_pages == UINT_MAX)
3301 mmap_pages_user_set = false;
3302
3303 if (trace.max_stack == UINT_MAX) {
3304 trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl__max_stack();
3305 max_stack_user_set = false;
3306 }
3307
3308 #ifdef HAVE_DWARF_UNWIND_SUPPORT
3309 if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
3310 record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
3311 }
3312 #endif
3313
3314 if (callchain_param.enabled) {
3315 if (!mmap_pages_user_set && geteuid() == 0)
3316 trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
3317
3318 symbol_conf.use_callchain = true;
3319 }
3320
3321 if (trace.evlist->nr_entries > 0) {
3322 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3323 if (evlist__set_syscall_tp_fields(trace.evlist)) {
3324 perror("failed to set syscalls:* tracepoint fields");
3325 goto out;
3326 }
3327 }
3328
3329 if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
3330 return trace__record(&trace, argc-1, &argv[1]);
3331
3332 /* summary_only implies summary option, but don't overwrite summary if set */
3333 if (trace.summary_only)
3334 trace.summary = trace.summary_only;
3335
3336 if (!trace.trace_syscalls && !trace.trace_pgfaults &&
3337 trace.evlist->nr_entries == 0 /* Was --events used? */) {
3338 trace.trace_syscalls = true;
3339 }
3340
3341 if (output_name != NULL) {
3342 err = trace__open_output(&trace, output_name);
3343 if (err < 0) {
3344 perror("failed to create output file");
3345 goto out;
3346 }
3347 }
3348
3349 err = target__validate(&trace.opts.target);
3350 if (err) {
3351 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3352 fprintf(trace.output, "%s", bf);
3353 goto out_close;
3354 }
3355
3356 err = target__parse_uid(&trace.opts.target);
3357 if (err) {
3358 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3359 fprintf(trace.output, "%s", bf);
3360 goto out_close;
3361 }
3362
3363 if (!argc && target__none(&trace.opts.target))
3364 trace.opts.target.system_wide = true;
3365
3366 if (input_name)
3367 err = trace__replay(&trace);
3368 else
3369 err = trace__run(&trace, argc, argv);
3370
3371 out_close:
3372 if (output_name != NULL)
3373 fclose(trace.output);
3374 out:
3375 return err;
3376 }
3377