1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include "util/parse-events.h"
14 #include "util/config.h"
15 
16 #include "util/callchain.h"
17 #include "util/cgroup.h"
18 #include "util/header.h"
19 #include "util/event.h"
20 #include "util/evlist.h"
21 #include "util/evsel.h"
22 #include "util/debug.h"
23 #include "util/mmap.h"
24 #include "util/target.h"
25 #include "util/session.h"
26 #include "util/tool.h"
27 #include "util/symbol.h"
28 #include "util/record.h"
29 #include "util/cpumap.h"
30 #include "util/thread_map.h"
31 #include "util/data.h"
32 #include "util/perf_regs.h"
33 #include "util/auxtrace.h"
34 #include "util/tsc.h"
35 #include "util/parse-branch-options.h"
36 #include "util/parse-regs-options.h"
37 #include "util/perf_api_probe.h"
38 #include "util/llvm-utils.h"
39 #include "util/bpf-loader.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/cpu-set-sched.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
48 #include "util/pfm.h"
49 #include "util/clockid.h"
50 #include "asm/bug.h"
51 #include "perf.h"
52 
53 #include <errno.h>
54 #include <inttypes.h>
55 #include <locale.h>
56 #include <poll.h>
57 #include <pthread.h>
58 #include <unistd.h>
59 #include <sched.h>
60 #include <signal.h>
61 #ifdef HAVE_EVENTFD_SUPPORT
62 #include <sys/eventfd.h>
63 #endif
64 #include <sys/mman.h>
65 #include <sys/wait.h>
66 #include <sys/types.h>
67 #include <sys/stat.h>
68 #include <fcntl.h>
69 #include <linux/err.h>
70 #include <linux/string.h>
71 #include <linux/time64.h>
72 #include <linux/zalloc.h>
73 #include <linux/bitmap.h>
74 #include <sys/time.h>
75 
76 struct switch_output {
77 	bool		 enabled;
78 	bool		 signal;
79 	unsigned long	 size;
80 	unsigned long	 time;
81 	const char	*str;
82 	bool		 set;
83 	char		 **filenames;
84 	int		 num_files;
85 	int		 cur_file;
86 };
87 
88 struct record {
89 	struct perf_tool	tool;
90 	struct record_opts	opts;
91 	u64			bytes_written;
92 	struct perf_data	data;
93 	struct auxtrace_record	*itr;
94 	struct evlist	*evlist;
95 	struct perf_session	*session;
96 	struct evlist		*sb_evlist;
97 	pthread_t		thread_id;
98 	int			realtime_prio;
99 	bool			switch_output_event_set;
100 	bool			no_buildid;
101 	bool			no_buildid_set;
102 	bool			no_buildid_cache;
103 	bool			no_buildid_cache_set;
104 	bool			buildid_all;
105 	bool			timestamp_filename;
106 	bool			timestamp_boundary;
107 	struct switch_output	switch_output;
108 	unsigned long long	samples;
109 	struct mmap_cpu_mask	affinity_mask;
110 	unsigned long		output_max_size;	/* = 0: unlimited */
111 };
112 
113 static volatile int done;
114 
115 static volatile int auxtrace_record__snapshot_started;
116 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
117 static DEFINE_TRIGGER(switch_output_trigger);
118 
119 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
120 	"SYS", "NODE", "CPU"
121 };
122 
switch_output_signal(struct record * rec)123 static bool switch_output_signal(struct record *rec)
124 {
125 	return rec->switch_output.signal &&
126 	       trigger_is_ready(&switch_output_trigger);
127 }
128 
switch_output_size(struct record * rec)129 static bool switch_output_size(struct record *rec)
130 {
131 	return rec->switch_output.size &&
132 	       trigger_is_ready(&switch_output_trigger) &&
133 	       (rec->bytes_written >= rec->switch_output.size);
134 }
135 
switch_output_time(struct record * rec)136 static bool switch_output_time(struct record *rec)
137 {
138 	return rec->switch_output.time &&
139 	       trigger_is_ready(&switch_output_trigger);
140 }
141 
record__output_max_size_exceeded(struct record * rec)142 static bool record__output_max_size_exceeded(struct record *rec)
143 {
144 	return rec->output_max_size &&
145 	       (rec->bytes_written >= rec->output_max_size);
146 }
147 
record__write(struct record * rec,struct mmap * map __maybe_unused,void * bf,size_t size)148 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
149 			 void *bf, size_t size)
150 {
151 	struct perf_data_file *file = &rec->session->data->file;
152 
153 	if (perf_data_file__write(file, bf, size) < 0) {
154 		pr_err("failed to write perf data, error: %m\n");
155 		return -1;
156 	}
157 
158 	rec->bytes_written += size;
159 
160 	if (record__output_max_size_exceeded(rec) && !done) {
161 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
162 				" stopping session ]\n",
163 				rec->bytes_written >> 10);
164 		done = 1;
165 	}
166 
167 	if (switch_output_size(rec))
168 		trigger_hit(&switch_output_trigger);
169 
170 	return 0;
171 }
172 
173 static int record__aio_enabled(struct record *rec);
174 static int record__comp_enabled(struct record *rec);
175 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
176 			    void *src, size_t src_size);
177 
178 #ifdef HAVE_AIO_SUPPORT
record__aio_write(struct aiocb * cblock,int trace_fd,void * buf,size_t size,off_t off)179 static int record__aio_write(struct aiocb *cblock, int trace_fd,
180 		void *buf, size_t size, off_t off)
181 {
182 	int rc;
183 
184 	cblock->aio_fildes = trace_fd;
185 	cblock->aio_buf    = buf;
186 	cblock->aio_nbytes = size;
187 	cblock->aio_offset = off;
188 	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
189 
190 	do {
191 		rc = aio_write(cblock);
192 		if (rc == 0) {
193 			break;
194 		} else if (errno != EAGAIN) {
195 			cblock->aio_fildes = -1;
196 			pr_err("failed to queue perf data, error: %m\n");
197 			break;
198 		}
199 	} while (1);
200 
201 	return rc;
202 }
203 
record__aio_complete(struct mmap * md,struct aiocb * cblock)204 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
205 {
206 	void *rem_buf;
207 	off_t rem_off;
208 	size_t rem_size;
209 	int rc, aio_errno;
210 	ssize_t aio_ret, written;
211 
212 	aio_errno = aio_error(cblock);
213 	if (aio_errno == EINPROGRESS)
214 		return 0;
215 
216 	written = aio_ret = aio_return(cblock);
217 	if (aio_ret < 0) {
218 		if (aio_errno != EINTR)
219 			pr_err("failed to write perf data, error: %m\n");
220 		written = 0;
221 	}
222 
223 	rem_size = cblock->aio_nbytes - written;
224 
225 	if (rem_size == 0) {
226 		cblock->aio_fildes = -1;
227 		/*
228 		 * md->refcount is incremented in record__aio_pushfn() for
229 		 * every aio write request started in record__aio_push() so
230 		 * decrement it because the request is now complete.
231 		 */
232 		perf_mmap__put(&md->core);
233 		rc = 1;
234 	} else {
235 		/*
236 		 * aio write request may require restart with the
237 		 * reminder if the kernel didn't write whole
238 		 * chunk at once.
239 		 */
240 		rem_off = cblock->aio_offset + written;
241 		rem_buf = (void *)(cblock->aio_buf + written);
242 		record__aio_write(cblock, cblock->aio_fildes,
243 				rem_buf, rem_size, rem_off);
244 		rc = 0;
245 	}
246 
247 	return rc;
248 }
249 
record__aio_sync(struct mmap * md,bool sync_all)250 static int record__aio_sync(struct mmap *md, bool sync_all)
251 {
252 	struct aiocb **aiocb = md->aio.aiocb;
253 	struct aiocb *cblocks = md->aio.cblocks;
254 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
255 	int i, do_suspend;
256 
257 	do {
258 		do_suspend = 0;
259 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
260 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
261 				if (sync_all)
262 					aiocb[i] = NULL;
263 				else
264 					return i;
265 			} else {
266 				/*
267 				 * Started aio write is not complete yet
268 				 * so it has to be waited before the
269 				 * next allocation.
270 				 */
271 				aiocb[i] = &cblocks[i];
272 				do_suspend = 1;
273 			}
274 		}
275 		if (!do_suspend)
276 			return -1;
277 
278 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
279 			if (!(errno == EAGAIN || errno == EINTR))
280 				pr_err("failed to sync perf data, error: %m\n");
281 		}
282 	} while (1);
283 }
284 
285 struct record_aio {
286 	struct record	*rec;
287 	void		*data;
288 	size_t		size;
289 };
290 
record__aio_pushfn(struct mmap * map,void * to,void * buf,size_t size)291 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
292 {
293 	struct record_aio *aio = to;
294 
295 	/*
296 	 * map->core.base data pointed by buf is copied into free map->aio.data[] buffer
297 	 * to release space in the kernel buffer as fast as possible, calling
298 	 * perf_mmap__consume() from perf_mmap__push() function.
299 	 *
300 	 * That lets the kernel to proceed with storing more profiling data into
301 	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
302 	 *
303 	 * Coping can be done in two steps in case the chunk of profiling data
304 	 * crosses the upper bound of the kernel buffer. In this case we first move
305 	 * part of data from map->start till the upper bound and then the reminder
306 	 * from the beginning of the kernel buffer till the end of the data chunk.
307 	 */
308 
309 	if (record__comp_enabled(aio->rec)) {
310 		size = zstd_compress(aio->rec->session, aio->data + aio->size,
311 				     mmap__mmap_len(map) - aio->size,
312 				     buf, size);
313 	} else {
314 		memcpy(aio->data + aio->size, buf, size);
315 	}
316 
317 	if (!aio->size) {
318 		/*
319 		 * Increment map->refcount to guard map->aio.data[] buffer
320 		 * from premature deallocation because map object can be
321 		 * released earlier than aio write request started on
322 		 * map->aio.data[] buffer is complete.
323 		 *
324 		 * perf_mmap__put() is done at record__aio_complete()
325 		 * after started aio request completion or at record__aio_push()
326 		 * if the request failed to start.
327 		 */
328 		perf_mmap__get(&map->core);
329 	}
330 
331 	aio->size += size;
332 
333 	return size;
334 }
335 
record__aio_push(struct record * rec,struct mmap * map,off_t * off)336 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
337 {
338 	int ret, idx;
339 	int trace_fd = rec->session->data->file.fd;
340 	struct record_aio aio = { .rec = rec, .size = 0 };
341 
342 	/*
343 	 * Call record__aio_sync() to wait till map->aio.data[] buffer
344 	 * becomes available after previous aio write operation.
345 	 */
346 
347 	idx = record__aio_sync(map, false);
348 	aio.data = map->aio.data[idx];
349 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
350 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
351 		return ret;
352 
353 	rec->samples++;
354 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
355 	if (!ret) {
356 		*off += aio.size;
357 		rec->bytes_written += aio.size;
358 		if (switch_output_size(rec))
359 			trigger_hit(&switch_output_trigger);
360 	} else {
361 		/*
362 		 * Decrement map->refcount incremented in record__aio_pushfn()
363 		 * back if record__aio_write() operation failed to start, otherwise
364 		 * map->refcount is decremented in record__aio_complete() after
365 		 * aio write operation finishes successfully.
366 		 */
367 		perf_mmap__put(&map->core);
368 	}
369 
370 	return ret;
371 }
372 
record__aio_get_pos(int trace_fd)373 static off_t record__aio_get_pos(int trace_fd)
374 {
375 	return lseek(trace_fd, 0, SEEK_CUR);
376 }
377 
record__aio_set_pos(int trace_fd,off_t pos)378 static void record__aio_set_pos(int trace_fd, off_t pos)
379 {
380 	lseek(trace_fd, pos, SEEK_SET);
381 }
382 
record__aio_mmap_read_sync(struct record * rec)383 static void record__aio_mmap_read_sync(struct record *rec)
384 {
385 	int i;
386 	struct evlist *evlist = rec->evlist;
387 	struct mmap *maps = evlist->mmap;
388 
389 	if (!record__aio_enabled(rec))
390 		return;
391 
392 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
393 		struct mmap *map = &maps[i];
394 
395 		if (map->core.base)
396 			record__aio_sync(map, true);
397 	}
398 }
399 
400 static int nr_cblocks_default = 1;
401 static int nr_cblocks_max = 4;
402 
record__aio_parse(const struct option * opt,const char * str,int unset)403 static int record__aio_parse(const struct option *opt,
404 			     const char *str,
405 			     int unset)
406 {
407 	struct record_opts *opts = (struct record_opts *)opt->value;
408 
409 	if (unset) {
410 		opts->nr_cblocks = 0;
411 	} else {
412 		if (str)
413 			opts->nr_cblocks = strtol(str, NULL, 0);
414 		if (!opts->nr_cblocks)
415 			opts->nr_cblocks = nr_cblocks_default;
416 	}
417 
418 	return 0;
419 }
420 #else /* HAVE_AIO_SUPPORT */
421 static int nr_cblocks_max = 0;
422 
record__aio_push(struct record * rec __maybe_unused,struct mmap * map __maybe_unused,off_t * off __maybe_unused)423 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
424 			    off_t *off __maybe_unused)
425 {
426 	return -1;
427 }
428 
record__aio_get_pos(int trace_fd __maybe_unused)429 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
430 {
431 	return -1;
432 }
433 
record__aio_set_pos(int trace_fd __maybe_unused,off_t pos __maybe_unused)434 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
435 {
436 }
437 
record__aio_mmap_read_sync(struct record * rec __maybe_unused)438 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
439 {
440 }
441 #endif
442 
record__aio_enabled(struct record * rec)443 static int record__aio_enabled(struct record *rec)
444 {
445 	return rec->opts.nr_cblocks > 0;
446 }
447 
448 #define MMAP_FLUSH_DEFAULT 1
record__mmap_flush_parse(const struct option * opt,const char * str,int unset)449 static int record__mmap_flush_parse(const struct option *opt,
450 				    const char *str,
451 				    int unset)
452 {
453 	int flush_max;
454 	struct record_opts *opts = (struct record_opts *)opt->value;
455 	static struct parse_tag tags[] = {
456 			{ .tag  = 'B', .mult = 1       },
457 			{ .tag  = 'K', .mult = 1 << 10 },
458 			{ .tag  = 'M', .mult = 1 << 20 },
459 			{ .tag  = 'G', .mult = 1 << 30 },
460 			{ .tag  = 0 },
461 	};
462 
463 	if (unset)
464 		return 0;
465 
466 	if (str) {
467 		opts->mmap_flush = parse_tag_value(str, tags);
468 		if (opts->mmap_flush == (int)-1)
469 			opts->mmap_flush = strtol(str, NULL, 0);
470 	}
471 
472 	if (!opts->mmap_flush)
473 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
474 
475 	flush_max = evlist__mmap_size(opts->mmap_pages);
476 	flush_max /= 4;
477 	if (opts->mmap_flush > flush_max)
478 		opts->mmap_flush = flush_max;
479 
480 	return 0;
481 }
482 
483 #ifdef HAVE_ZSTD_SUPPORT
484 static unsigned int comp_level_default = 1;
485 
record__parse_comp_level(const struct option * opt,const char * str,int unset)486 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
487 {
488 	struct record_opts *opts = opt->value;
489 
490 	if (unset) {
491 		opts->comp_level = 0;
492 	} else {
493 		if (str)
494 			opts->comp_level = strtol(str, NULL, 0);
495 		if (!opts->comp_level)
496 			opts->comp_level = comp_level_default;
497 	}
498 
499 	return 0;
500 }
501 #endif
502 static unsigned int comp_level_max = 22;
503 
record__comp_enabled(struct record * rec)504 static int record__comp_enabled(struct record *rec)
505 {
506 	return rec->opts.comp_level > 0;
507 }
508 
process_synthesized_event(struct perf_tool * tool,union perf_event * event,struct perf_sample * sample __maybe_unused,struct machine * machine __maybe_unused)509 static int process_synthesized_event(struct perf_tool *tool,
510 				     union perf_event *event,
511 				     struct perf_sample *sample __maybe_unused,
512 				     struct machine *machine __maybe_unused)
513 {
514 	struct record *rec = container_of(tool, struct record, tool);
515 	return record__write(rec, NULL, event, event->header.size);
516 }
517 
process_locked_synthesized_event(struct perf_tool * tool,union perf_event * event,struct perf_sample * sample __maybe_unused,struct machine * machine __maybe_unused)518 static int process_locked_synthesized_event(struct perf_tool *tool,
519 				     union perf_event *event,
520 				     struct perf_sample *sample __maybe_unused,
521 				     struct machine *machine __maybe_unused)
522 {
523 	static pthread_mutex_t synth_lock = PTHREAD_MUTEX_INITIALIZER;
524 	int ret;
525 
526 	pthread_mutex_lock(&synth_lock);
527 	ret = process_synthesized_event(tool, event, sample, machine);
528 	pthread_mutex_unlock(&synth_lock);
529 	return ret;
530 }
531 
record__pushfn(struct mmap * map,void * to,void * bf,size_t size)532 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
533 {
534 	struct record *rec = to;
535 
536 	if (record__comp_enabled(rec)) {
537 		size = zstd_compress(rec->session, map->data, mmap__mmap_len(map), bf, size);
538 		bf   = map->data;
539 	}
540 
541 	rec->samples++;
542 	return record__write(rec, map, bf, size);
543 }
544 
545 static volatile int signr = -1;
546 static volatile int child_finished;
547 #ifdef HAVE_EVENTFD_SUPPORT
548 static int done_fd = -1;
549 #endif
550 
sig_handler(int sig)551 static void sig_handler(int sig)
552 {
553 	if (sig == SIGCHLD)
554 		child_finished = 1;
555 	else
556 		signr = sig;
557 
558 	done = 1;
559 #ifdef HAVE_EVENTFD_SUPPORT
560 {
561 	u64 tmp = 1;
562 	/*
563 	 * It is possible for this signal handler to run after done is checked
564 	 * in the main loop, but before the perf counter fds are polled. If this
565 	 * happens, the poll() will continue to wait even though done is set,
566 	 * and will only break out if either another signal is received, or the
567 	 * counters are ready for read. To ensure the poll() doesn't sleep when
568 	 * done is set, use an eventfd (done_fd) to wake up the poll().
569 	 */
570 	if (write(done_fd, &tmp, sizeof(tmp)) < 0)
571 		pr_err("failed to signal wakeup fd, error: %m\n");
572 }
573 #endif // HAVE_EVENTFD_SUPPORT
574 }
575 
sigsegv_handler(int sig)576 static void sigsegv_handler(int sig)
577 {
578 	perf_hooks__recover();
579 	sighandler_dump_stack(sig);
580 }
581 
record__sig_exit(void)582 static void record__sig_exit(void)
583 {
584 	if (signr == -1)
585 		return;
586 
587 	signal(signr, SIG_DFL);
588 	raise(signr);
589 }
590 
591 #ifdef HAVE_AUXTRACE_SUPPORT
592 
record__process_auxtrace(struct perf_tool * tool,struct mmap * map,union perf_event * event,void * data1,size_t len1,void * data2,size_t len2)593 static int record__process_auxtrace(struct perf_tool *tool,
594 				    struct mmap *map,
595 				    union perf_event *event, void *data1,
596 				    size_t len1, void *data2, size_t len2)
597 {
598 	struct record *rec = container_of(tool, struct record, tool);
599 	struct perf_data *data = &rec->data;
600 	size_t padding;
601 	u8 pad[8] = {0};
602 
603 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
604 		off_t file_offset;
605 		int fd = perf_data__fd(data);
606 		int err;
607 
608 		file_offset = lseek(fd, 0, SEEK_CUR);
609 		if (file_offset == -1)
610 			return -1;
611 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
612 						     event, file_offset);
613 		if (err)
614 			return err;
615 	}
616 
617 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
618 	padding = (len1 + len2) & 7;
619 	if (padding)
620 		padding = 8 - padding;
621 
622 	record__write(rec, map, event, event->header.size);
623 	record__write(rec, map, data1, len1);
624 	if (len2)
625 		record__write(rec, map, data2, len2);
626 	record__write(rec, map, &pad, padding);
627 
628 	return 0;
629 }
630 
record__auxtrace_mmap_read(struct record * rec,struct mmap * map)631 static int record__auxtrace_mmap_read(struct record *rec,
632 				      struct mmap *map)
633 {
634 	int ret;
635 
636 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
637 				  record__process_auxtrace);
638 	if (ret < 0)
639 		return ret;
640 
641 	if (ret)
642 		rec->samples++;
643 
644 	return 0;
645 }
646 
record__auxtrace_mmap_read_snapshot(struct record * rec,struct mmap * map)647 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
648 					       struct mmap *map)
649 {
650 	int ret;
651 
652 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
653 					   record__process_auxtrace,
654 					   rec->opts.auxtrace_snapshot_size);
655 	if (ret < 0)
656 		return ret;
657 
658 	if (ret)
659 		rec->samples++;
660 
661 	return 0;
662 }
663 
record__auxtrace_read_snapshot_all(struct record * rec)664 static int record__auxtrace_read_snapshot_all(struct record *rec)
665 {
666 	int i;
667 	int rc = 0;
668 
669 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
670 		struct mmap *map = &rec->evlist->mmap[i];
671 
672 		if (!map->auxtrace_mmap.base)
673 			continue;
674 
675 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
676 			rc = -1;
677 			goto out;
678 		}
679 	}
680 out:
681 	return rc;
682 }
683 
record__read_auxtrace_snapshot(struct record * rec,bool on_exit)684 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
685 {
686 	pr_debug("Recording AUX area tracing snapshot\n");
687 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
688 		trigger_error(&auxtrace_snapshot_trigger);
689 	} else {
690 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
691 			trigger_error(&auxtrace_snapshot_trigger);
692 		else
693 			trigger_ready(&auxtrace_snapshot_trigger);
694 	}
695 }
696 
record__auxtrace_snapshot_exit(struct record * rec)697 static int record__auxtrace_snapshot_exit(struct record *rec)
698 {
699 	if (trigger_is_error(&auxtrace_snapshot_trigger))
700 		return 0;
701 
702 	if (!auxtrace_record__snapshot_started &&
703 	    auxtrace_record__snapshot_start(rec->itr))
704 		return -1;
705 
706 	record__read_auxtrace_snapshot(rec, true);
707 	if (trigger_is_error(&auxtrace_snapshot_trigger))
708 		return -1;
709 
710 	return 0;
711 }
712 
record__auxtrace_init(struct record * rec)713 static int record__auxtrace_init(struct record *rec)
714 {
715 	int err;
716 
717 	if (!rec->itr) {
718 		rec->itr = auxtrace_record__init(rec->evlist, &err);
719 		if (err)
720 			return err;
721 	}
722 
723 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
724 					      rec->opts.auxtrace_snapshot_opts);
725 	if (err)
726 		return err;
727 
728 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
729 					    rec->opts.auxtrace_sample_opts);
730 	if (err)
731 		return err;
732 
733 	return auxtrace_parse_filters(rec->evlist);
734 }
735 
736 #else
737 
738 static inline
record__auxtrace_mmap_read(struct record * rec __maybe_unused,struct mmap * map __maybe_unused)739 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
740 			       struct mmap *map __maybe_unused)
741 {
742 	return 0;
743 }
744 
745 static inline
record__read_auxtrace_snapshot(struct record * rec __maybe_unused,bool on_exit __maybe_unused)746 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
747 				    bool on_exit __maybe_unused)
748 {
749 }
750 
751 static inline
auxtrace_record__snapshot_start(struct auxtrace_record * itr __maybe_unused)752 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
753 {
754 	return 0;
755 }
756 
757 static inline
record__auxtrace_snapshot_exit(struct record * rec __maybe_unused)758 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
759 {
760 	return 0;
761 }
762 
record__auxtrace_init(struct record * rec __maybe_unused)763 static int record__auxtrace_init(struct record *rec __maybe_unused)
764 {
765 	return 0;
766 }
767 
768 #endif
769 
record__config_text_poke(struct evlist * evlist)770 static int record__config_text_poke(struct evlist *evlist)
771 {
772 	struct evsel *evsel;
773 	int err;
774 
775 	/* Nothing to do if text poke is already configured */
776 	evlist__for_each_entry(evlist, evsel) {
777 		if (evsel->core.attr.text_poke)
778 			return 0;
779 	}
780 
781 	err = parse_events(evlist, "dummy:u", NULL);
782 	if (err)
783 		return err;
784 
785 	evsel = evlist__last(evlist);
786 
787 	evsel->core.attr.freq = 0;
788 	evsel->core.attr.sample_period = 1;
789 	evsel->core.attr.text_poke = 1;
790 	evsel->core.attr.ksymbol = 1;
791 
792 	evsel->core.system_wide = true;
793 	evsel->no_aux_samples = true;
794 	evsel->immediate = true;
795 
796 	/* Text poke must be collected on all CPUs */
797 	perf_cpu_map__put(evsel->core.own_cpus);
798 	evsel->core.own_cpus = perf_cpu_map__new(NULL);
799 	perf_cpu_map__put(evsel->core.cpus);
800 	evsel->core.cpus = perf_cpu_map__get(evsel->core.own_cpus);
801 
802 	evsel__set_sample_bit(evsel, TIME);
803 
804 	return 0;
805 }
806 
record__kcore_readable(struct machine * machine)807 static bool record__kcore_readable(struct machine *machine)
808 {
809 	char kcore[PATH_MAX];
810 	int fd;
811 
812 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
813 
814 	fd = open(kcore, O_RDONLY);
815 	if (fd < 0)
816 		return false;
817 
818 	close(fd);
819 
820 	return true;
821 }
822 
record__kcore_copy(struct machine * machine,struct perf_data * data)823 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
824 {
825 	char from_dir[PATH_MAX];
826 	char kcore_dir[PATH_MAX];
827 	int ret;
828 
829 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
830 
831 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
832 	if (ret)
833 		return ret;
834 
835 	return kcore_copy(from_dir, kcore_dir);
836 }
837 
record__mmap_evlist(struct record * rec,struct evlist * evlist)838 static int record__mmap_evlist(struct record *rec,
839 			       struct evlist *evlist)
840 {
841 	struct record_opts *opts = &rec->opts;
842 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
843 				  opts->auxtrace_sample_mode;
844 	char msg[512];
845 
846 	if (opts->affinity != PERF_AFFINITY_SYS)
847 		cpu__setup_cpunode_map();
848 
849 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
850 				 opts->auxtrace_mmap_pages,
851 				 auxtrace_overwrite,
852 				 opts->nr_cblocks, opts->affinity,
853 				 opts->mmap_flush, opts->comp_level) < 0) {
854 		if (errno == EPERM) {
855 			pr_err("Permission error mapping pages.\n"
856 			       "Consider increasing "
857 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
858 			       "or try again with a smaller value of -m/--mmap_pages.\n"
859 			       "(current value: %u,%u)\n",
860 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
861 			return -errno;
862 		} else {
863 			pr_err("failed to mmap with %d (%s)\n", errno,
864 				str_error_r(errno, msg, sizeof(msg)));
865 			if (errno)
866 				return -errno;
867 			else
868 				return -EINVAL;
869 		}
870 	}
871 	return 0;
872 }
873 
record__mmap(struct record * rec)874 static int record__mmap(struct record *rec)
875 {
876 	return record__mmap_evlist(rec, rec->evlist);
877 }
878 
record__open(struct record * rec)879 static int record__open(struct record *rec)
880 {
881 	char msg[BUFSIZ];
882 	struct evsel *pos;
883 	struct evlist *evlist = rec->evlist;
884 	struct perf_session *session = rec->session;
885 	struct record_opts *opts = &rec->opts;
886 	int rc = 0;
887 
888 	/*
889 	 * For initial_delay or system wide, we need to add a dummy event so
890 	 * that we can track PERF_RECORD_MMAP to cover the delay of waiting or
891 	 * event synthesis.
892 	 */
893 	if (opts->initial_delay || target__has_cpu(&opts->target)) {
894 		pos = perf_evlist__get_tracking_event(evlist);
895 		if (!evsel__is_dummy_event(pos)) {
896 			/* Set up dummy event. */
897 			if (evlist__add_dummy(evlist))
898 				return -ENOMEM;
899 			pos = evlist__last(evlist);
900 			perf_evlist__set_tracking_event(evlist, pos);
901 		}
902 
903 		/*
904 		 * Enable the dummy event when the process is forked for
905 		 * initial_delay, immediately for system wide.
906 		 */
907 		if (opts->initial_delay && !pos->immediate)
908 			pos->core.attr.enable_on_exec = 1;
909 		else
910 			pos->immediate = 1;
911 	}
912 
913 	perf_evlist__config(evlist, opts, &callchain_param);
914 
915 	evlist__for_each_entry(evlist, pos) {
916 try_again:
917 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
918 			if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
919 				if (verbose > 0)
920 					ui__warning("%s\n", msg);
921 				goto try_again;
922 			}
923 			if ((errno == EINVAL || errno == EBADF) &&
924 			    pos->leader != pos &&
925 			    pos->weak_group) {
926 			        pos = perf_evlist__reset_weak_group(evlist, pos, true);
927 				goto try_again;
928 			}
929 			rc = -errno;
930 			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
931 			ui__error("%s\n", msg);
932 			goto out;
933 		}
934 
935 		pos->supported = true;
936 	}
937 
938 	if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(evlist)) {
939 		pr_warning(
940 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
941 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
942 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
943 "file is not found in the buildid cache or in the vmlinux path.\n\n"
944 "Samples in kernel modules won't be resolved at all.\n\n"
945 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
946 "even with a suitable vmlinux or kallsyms file.\n\n");
947 	}
948 
949 	if (perf_evlist__apply_filters(evlist, &pos)) {
950 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
951 			pos->filter, evsel__name(pos), errno,
952 			str_error_r(errno, msg, sizeof(msg)));
953 		rc = -1;
954 		goto out;
955 	}
956 
957 	rc = record__mmap(rec);
958 	if (rc)
959 		goto out;
960 
961 	session->evlist = evlist;
962 	perf_session__set_id_hdr_size(session);
963 out:
964 	return rc;
965 }
966 
process_sample_event(struct perf_tool * tool,union perf_event * event,struct perf_sample * sample,struct evsel * evsel,struct machine * machine)967 static int process_sample_event(struct perf_tool *tool,
968 				union perf_event *event,
969 				struct perf_sample *sample,
970 				struct evsel *evsel,
971 				struct machine *machine)
972 {
973 	struct record *rec = container_of(tool, struct record, tool);
974 
975 	if (rec->evlist->first_sample_time == 0)
976 		rec->evlist->first_sample_time = sample->time;
977 
978 	rec->evlist->last_sample_time = sample->time;
979 
980 	if (rec->buildid_all)
981 		return 0;
982 
983 	rec->samples++;
984 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
985 }
986 
process_buildids(struct record * rec)987 static int process_buildids(struct record *rec)
988 {
989 	struct perf_session *session = rec->session;
990 
991 	if (perf_data__size(&rec->data) == 0)
992 		return 0;
993 
994 	/*
995 	 * During this process, it'll load kernel map and replace the
996 	 * dso->long_name to a real pathname it found.  In this case
997 	 * we prefer the vmlinux path like
998 	 *   /lib/modules/3.16.4/build/vmlinux
999 	 *
1000 	 * rather than build-id path (in debug directory).
1001 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1002 	 */
1003 	symbol_conf.ignore_vmlinux_buildid = true;
1004 
1005 	/*
1006 	 * If --buildid-all is given, it marks all DSO regardless of hits,
1007 	 * so no need to process samples. But if timestamp_boundary is enabled,
1008 	 * it still needs to walk on all samples to get the timestamps of
1009 	 * first/last samples.
1010 	 */
1011 	if (rec->buildid_all && !rec->timestamp_boundary)
1012 		rec->tool.sample = NULL;
1013 
1014 	return perf_session__process_events(session);
1015 }
1016 
perf_event__synthesize_guest_os(struct machine * machine,void * data)1017 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1018 {
1019 	int err;
1020 	struct perf_tool *tool = data;
1021 	/*
1022 	 *As for guest kernel when processing subcommand record&report,
1023 	 *we arrange module mmap prior to guest kernel mmap and trigger
1024 	 *a preload dso because default guest module symbols are loaded
1025 	 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
1026 	 *method is used to avoid symbol missing when the first addr is
1027 	 *in module instead of in guest kernel.
1028 	 */
1029 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1030 					     machine);
1031 	if (err < 0)
1032 		pr_err("Couldn't record guest kernel [%d]'s reference"
1033 		       " relocation symbol.\n", machine->pid);
1034 
1035 	/*
1036 	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
1037 	 * have no _text sometimes.
1038 	 */
1039 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1040 						 machine);
1041 	if (err < 0)
1042 		pr_err("Couldn't record guest kernel [%d]'s reference"
1043 		       " relocation symbol.\n", machine->pid);
1044 }
1045 
1046 static struct perf_event_header finished_round_event = {
1047 	.size = sizeof(struct perf_event_header),
1048 	.type = PERF_RECORD_FINISHED_ROUND,
1049 };
1050 
record__adjust_affinity(struct record * rec,struct mmap * map)1051 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1052 {
1053 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1054 	    !bitmap_equal(rec->affinity_mask.bits, map->affinity_mask.bits,
1055 			  rec->affinity_mask.nbits)) {
1056 		bitmap_zero(rec->affinity_mask.bits, rec->affinity_mask.nbits);
1057 		bitmap_or(rec->affinity_mask.bits, rec->affinity_mask.bits,
1058 			  map->affinity_mask.bits, rec->affinity_mask.nbits);
1059 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&rec->affinity_mask),
1060 				  (cpu_set_t *)rec->affinity_mask.bits);
1061 		if (verbose == 2)
1062 			mmap_cpu_mask__scnprintf(&rec->affinity_mask, "thread");
1063 	}
1064 }
1065 
process_comp_header(void * record,size_t increment)1066 static size_t process_comp_header(void *record, size_t increment)
1067 {
1068 	struct perf_record_compressed *event = record;
1069 	size_t size = sizeof(*event);
1070 
1071 	if (increment) {
1072 		event->header.size += increment;
1073 		return increment;
1074 	}
1075 
1076 	event->header.type = PERF_RECORD_COMPRESSED;
1077 	event->header.size = size;
1078 
1079 	return size;
1080 }
1081 
zstd_compress(struct perf_session * session,void * dst,size_t dst_size,void * src,size_t src_size)1082 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
1083 			    void *src, size_t src_size)
1084 {
1085 	size_t compressed;
1086 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1087 
1088 	compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
1089 						     max_record_size, process_comp_header);
1090 
1091 	session->bytes_transferred += src_size;
1092 	session->bytes_compressed  += compressed;
1093 
1094 	return compressed;
1095 }
1096 
record__mmap_read_evlist(struct record * rec,struct evlist * evlist,bool overwrite,bool synch)1097 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1098 				    bool overwrite, bool synch)
1099 {
1100 	u64 bytes_written = rec->bytes_written;
1101 	int i;
1102 	int rc = 0;
1103 	struct mmap *maps;
1104 	int trace_fd = rec->data.file.fd;
1105 	off_t off = 0;
1106 
1107 	if (!evlist)
1108 		return 0;
1109 
1110 	maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
1111 	if (!maps)
1112 		return 0;
1113 
1114 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1115 		return 0;
1116 
1117 	if (record__aio_enabled(rec))
1118 		off = record__aio_get_pos(trace_fd);
1119 
1120 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
1121 		u64 flush = 0;
1122 		struct mmap *map = &maps[i];
1123 
1124 		if (map->core.base) {
1125 			record__adjust_affinity(rec, map);
1126 			if (synch) {
1127 				flush = map->core.flush;
1128 				map->core.flush = 1;
1129 			}
1130 			if (!record__aio_enabled(rec)) {
1131 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1132 					if (synch)
1133 						map->core.flush = flush;
1134 					rc = -1;
1135 					goto out;
1136 				}
1137 			} else {
1138 				if (record__aio_push(rec, map, &off) < 0) {
1139 					record__aio_set_pos(trace_fd, off);
1140 					if (synch)
1141 						map->core.flush = flush;
1142 					rc = -1;
1143 					goto out;
1144 				}
1145 			}
1146 			if (synch)
1147 				map->core.flush = flush;
1148 		}
1149 
1150 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1151 		    !rec->opts.auxtrace_sample_mode &&
1152 		    record__auxtrace_mmap_read(rec, map) != 0) {
1153 			rc = -1;
1154 			goto out;
1155 		}
1156 	}
1157 
1158 	if (record__aio_enabled(rec))
1159 		record__aio_set_pos(trace_fd, off);
1160 
1161 	/*
1162 	 * Mark the round finished in case we wrote
1163 	 * at least one event.
1164 	 */
1165 	if (bytes_written != rec->bytes_written)
1166 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1167 
1168 	if (overwrite)
1169 		perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1170 out:
1171 	return rc;
1172 }
1173 
record__mmap_read_all(struct record * rec,bool synch)1174 static int record__mmap_read_all(struct record *rec, bool synch)
1175 {
1176 	int err;
1177 
1178 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1179 	if (err)
1180 		return err;
1181 
1182 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1183 }
1184 
record__init_features(struct record * rec)1185 static void record__init_features(struct record *rec)
1186 {
1187 	struct perf_session *session = rec->session;
1188 	int feat;
1189 
1190 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1191 		perf_header__set_feat(&session->header, feat);
1192 
1193 	if (rec->no_buildid)
1194 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1195 
1196 	if (!have_tracepoints(&rec->evlist->core.entries))
1197 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1198 
1199 	if (!rec->opts.branch_stack)
1200 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1201 
1202 	if (!rec->opts.full_auxtrace)
1203 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1204 
1205 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1206 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1207 
1208 	if (!rec->opts.use_clockid)
1209 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1210 
1211 	perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1212 	if (!record__comp_enabled(rec))
1213 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1214 
1215 	perf_header__clear_feat(&session->header, HEADER_STAT);
1216 }
1217 
1218 static void
record__finish_output(struct record * rec)1219 record__finish_output(struct record *rec)
1220 {
1221 	struct perf_data *data = &rec->data;
1222 	int fd = perf_data__fd(data);
1223 
1224 	if (data->is_pipe)
1225 		return;
1226 
1227 	rec->session->header.data_size += rec->bytes_written;
1228 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1229 
1230 	if (!rec->no_buildid) {
1231 		process_buildids(rec);
1232 
1233 		if (rec->buildid_all)
1234 			dsos__hit_all(rec->session);
1235 	}
1236 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1237 
1238 	return;
1239 }
1240 
record__synthesize_workload(struct record * rec,bool tail)1241 static int record__synthesize_workload(struct record *rec, bool tail)
1242 {
1243 	int err;
1244 	struct perf_thread_map *thread_map;
1245 
1246 	if (rec->opts.tail_synthesize != tail)
1247 		return 0;
1248 
1249 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1250 	if (thread_map == NULL)
1251 		return -1;
1252 
1253 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1254 						 process_synthesized_event,
1255 						 &rec->session->machines.host,
1256 						 rec->opts.sample_address);
1257 	perf_thread_map__put(thread_map);
1258 	return err;
1259 }
1260 
1261 static int record__synthesize(struct record *rec, bool tail);
1262 
1263 static int
record__switch_output(struct record * rec,bool at_exit)1264 record__switch_output(struct record *rec, bool at_exit)
1265 {
1266 	struct perf_data *data = &rec->data;
1267 	int fd, err;
1268 	char *new_filename;
1269 
1270 	/* Same Size:      "2015122520103046"*/
1271 	char timestamp[] = "InvalidTimestamp";
1272 
1273 	record__aio_mmap_read_sync(rec);
1274 
1275 	record__synthesize(rec, true);
1276 	if (target__none(&rec->opts.target))
1277 		record__synthesize_workload(rec, true);
1278 
1279 	rec->samples = 0;
1280 	record__finish_output(rec);
1281 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1282 	if (err) {
1283 		pr_err("Failed to get current timestamp\n");
1284 		return -EINVAL;
1285 	}
1286 
1287 	fd = perf_data__switch(data, timestamp,
1288 				    rec->session->header.data_offset,
1289 				    at_exit, &new_filename);
1290 	if (fd >= 0 && !at_exit) {
1291 		rec->bytes_written = 0;
1292 		rec->session->header.data_size = 0;
1293 	}
1294 
1295 	if (!quiet)
1296 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1297 			data->path, timestamp);
1298 
1299 	if (rec->switch_output.num_files) {
1300 		int n = rec->switch_output.cur_file + 1;
1301 
1302 		if (n >= rec->switch_output.num_files)
1303 			n = 0;
1304 		rec->switch_output.cur_file = n;
1305 		if (rec->switch_output.filenames[n]) {
1306 			remove(rec->switch_output.filenames[n]);
1307 			zfree(&rec->switch_output.filenames[n]);
1308 		}
1309 		rec->switch_output.filenames[n] = new_filename;
1310 	} else {
1311 		free(new_filename);
1312 	}
1313 
1314 	/* Output tracking events */
1315 	if (!at_exit) {
1316 		record__synthesize(rec, false);
1317 
1318 		/*
1319 		 * In 'perf record --switch-output' without -a,
1320 		 * record__synthesize() in record__switch_output() won't
1321 		 * generate tracking events because there's no thread_map
1322 		 * in evlist. Which causes newly created perf.data doesn't
1323 		 * contain map and comm information.
1324 		 * Create a fake thread_map and directly call
1325 		 * perf_event__synthesize_thread_map() for those events.
1326 		 */
1327 		if (target__none(&rec->opts.target))
1328 			record__synthesize_workload(rec, false);
1329 	}
1330 	return fd;
1331 }
1332 
1333 static volatile int workload_exec_errno;
1334 
1335 /*
1336  * perf_evlist__prepare_workload will send a SIGUSR1
1337  * if the fork fails, since we asked by setting its
1338  * want_signal to true.
1339  */
workload_exec_failed_signal(int signo __maybe_unused,siginfo_t * info,void * ucontext __maybe_unused)1340 static void workload_exec_failed_signal(int signo __maybe_unused,
1341 					siginfo_t *info,
1342 					void *ucontext __maybe_unused)
1343 {
1344 	workload_exec_errno = info->si_value.sival_int;
1345 	done = 1;
1346 	child_finished = 1;
1347 }
1348 
1349 static void snapshot_sig_handler(int sig);
1350 static void alarm_sig_handler(int sig);
1351 
1352 static const struct perf_event_mmap_page *
perf_evlist__pick_pc(struct evlist * evlist)1353 perf_evlist__pick_pc(struct evlist *evlist)
1354 {
1355 	if (evlist) {
1356 		if (evlist->mmap && evlist->mmap[0].core.base)
1357 			return evlist->mmap[0].core.base;
1358 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1359 			return evlist->overwrite_mmap[0].core.base;
1360 	}
1361 	return NULL;
1362 }
1363 
record__pick_pc(struct record * rec)1364 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1365 {
1366 	const struct perf_event_mmap_page *pc;
1367 
1368 	pc = perf_evlist__pick_pc(rec->evlist);
1369 	if (pc)
1370 		return pc;
1371 	return NULL;
1372 }
1373 
record__synthesize(struct record * rec,bool tail)1374 static int record__synthesize(struct record *rec, bool tail)
1375 {
1376 	struct perf_session *session = rec->session;
1377 	struct machine *machine = &session->machines.host;
1378 	struct perf_data *data = &rec->data;
1379 	struct record_opts *opts = &rec->opts;
1380 	struct perf_tool *tool = &rec->tool;
1381 	int fd = perf_data__fd(data);
1382 	int err = 0;
1383 	event_op f = process_synthesized_event;
1384 
1385 	if (rec->opts.tail_synthesize != tail)
1386 		return 0;
1387 
1388 	if (data->is_pipe) {
1389 		/*
1390 		 * We need to synthesize events first, because some
1391 		 * features works on top of them (on report side).
1392 		 */
1393 		err = perf_event__synthesize_attrs(tool, rec->evlist,
1394 						   process_synthesized_event);
1395 		if (err < 0) {
1396 			pr_err("Couldn't synthesize attrs.\n");
1397 			goto out;
1398 		}
1399 
1400 		err = perf_event__synthesize_features(tool, session, rec->evlist,
1401 						      process_synthesized_event);
1402 		if (err < 0) {
1403 			pr_err("Couldn't synthesize features.\n");
1404 			return err;
1405 		}
1406 
1407 		if (have_tracepoints(&rec->evlist->core.entries)) {
1408 			/*
1409 			 * FIXME err <= 0 here actually means that
1410 			 * there were no tracepoints so its not really
1411 			 * an error, just that we don't need to
1412 			 * synthesize anything.  We really have to
1413 			 * return this more properly and also
1414 			 * propagate errors that now are calling die()
1415 			 */
1416 			err = perf_event__synthesize_tracing_data(tool,	fd, rec->evlist,
1417 								  process_synthesized_event);
1418 			if (err <= 0) {
1419 				pr_err("Couldn't record tracing data.\n");
1420 				goto out;
1421 			}
1422 			rec->bytes_written += err;
1423 		}
1424 	}
1425 
1426 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1427 					  process_synthesized_event, machine);
1428 	if (err)
1429 		goto out;
1430 
1431 	/* Synthesize id_index before auxtrace_info */
1432 	if (rec->opts.auxtrace_sample_mode) {
1433 		err = perf_event__synthesize_id_index(tool,
1434 						      process_synthesized_event,
1435 						      session->evlist, machine);
1436 		if (err)
1437 			goto out;
1438 	}
1439 
1440 	if (rec->opts.full_auxtrace) {
1441 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1442 					session, process_synthesized_event);
1443 		if (err)
1444 			goto out;
1445 	}
1446 
1447 	if (!perf_evlist__exclude_kernel(rec->evlist)) {
1448 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1449 							 machine);
1450 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1451 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1452 				   "Check /proc/kallsyms permission or run as root.\n");
1453 
1454 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
1455 						     machine);
1456 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1457 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1458 				   "Check /proc/modules permission or run as root.\n");
1459 	}
1460 
1461 	if (perf_guest) {
1462 		machines__process_guests(&session->machines,
1463 					 perf_event__synthesize_guest_os, tool);
1464 	}
1465 
1466 	err = perf_event__synthesize_extra_attr(&rec->tool,
1467 						rec->evlist,
1468 						process_synthesized_event,
1469 						data->is_pipe);
1470 	if (err)
1471 		goto out;
1472 
1473 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
1474 						 process_synthesized_event,
1475 						NULL);
1476 	if (err < 0) {
1477 		pr_err("Couldn't synthesize thread map.\n");
1478 		return err;
1479 	}
1480 
1481 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
1482 					     process_synthesized_event, NULL);
1483 	if (err < 0) {
1484 		pr_err("Couldn't synthesize cpu map.\n");
1485 		return err;
1486 	}
1487 
1488 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1489 						machine, opts);
1490 	if (err < 0)
1491 		pr_warning("Couldn't synthesize bpf events.\n");
1492 
1493 	err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
1494 					     machine);
1495 	if (err < 0)
1496 		pr_warning("Couldn't synthesize cgroup events.\n");
1497 
1498 	if (rec->opts.nr_threads_synthesize > 1) {
1499 		perf_set_multithreaded();
1500 		f = process_locked_synthesized_event;
1501 	}
1502 
1503 	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
1504 					    f, opts->sample_address,
1505 					    rec->opts.nr_threads_synthesize);
1506 
1507 	if (rec->opts.nr_threads_synthesize > 1)
1508 		perf_set_singlethreaded();
1509 
1510 out:
1511 	return err;
1512 }
1513 
record__process_signal_event(union perf_event * event __maybe_unused,void * data)1514 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
1515 {
1516 	struct record *rec = data;
1517 	pthread_kill(rec->thread_id, SIGUSR2);
1518 	return 0;
1519 }
1520 
record__setup_sb_evlist(struct record * rec)1521 static int record__setup_sb_evlist(struct record *rec)
1522 {
1523 	struct record_opts *opts = &rec->opts;
1524 
1525 	if (rec->sb_evlist != NULL) {
1526 		/*
1527 		 * We get here if --switch-output-event populated the
1528 		 * sb_evlist, so associate a callback that will send a SIGUSR2
1529 		 * to the main thread.
1530 		 */
1531 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
1532 		rec->thread_id = pthread_self();
1533 	}
1534 #ifdef HAVE_LIBBPF_SUPPORT
1535 	if (!opts->no_bpf_event) {
1536 		if (rec->sb_evlist == NULL) {
1537 			rec->sb_evlist = evlist__new();
1538 
1539 			if (rec->sb_evlist == NULL) {
1540 				pr_err("Couldn't create side band evlist.\n.");
1541 				return -1;
1542 			}
1543 		}
1544 
1545 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
1546 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n.");
1547 			return -1;
1548 		}
1549 	}
1550 #endif
1551 	if (perf_evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
1552 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1553 		opts->no_bpf_event = true;
1554 	}
1555 
1556 	return 0;
1557 }
1558 
record__init_clock(struct record * rec)1559 static int record__init_clock(struct record *rec)
1560 {
1561 	struct perf_session *session = rec->session;
1562 	struct timespec ref_clockid;
1563 	struct timeval ref_tod;
1564 	u64 ref;
1565 
1566 	if (!rec->opts.use_clockid)
1567 		return 0;
1568 
1569 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1570 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
1571 
1572 	session->header.env.clock.clockid = rec->opts.clockid;
1573 
1574 	if (gettimeofday(&ref_tod, NULL) != 0) {
1575 		pr_err("gettimeofday failed, cannot set reference time.\n");
1576 		return -1;
1577 	}
1578 
1579 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
1580 		pr_err("clock_gettime failed, cannot set reference time.\n");
1581 		return -1;
1582 	}
1583 
1584 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
1585 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
1586 
1587 	session->header.env.clock.tod_ns = ref;
1588 
1589 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
1590 	      (u64) ref_clockid.tv_nsec;
1591 
1592 	session->header.env.clock.clockid_ns = ref;
1593 	return 0;
1594 }
1595 
hit_auxtrace_snapshot_trigger(struct record * rec)1596 static void hit_auxtrace_snapshot_trigger(struct record *rec)
1597 {
1598 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
1599 		trigger_hit(&auxtrace_snapshot_trigger);
1600 		auxtrace_record__snapshot_started = 1;
1601 		if (auxtrace_record__snapshot_start(rec->itr))
1602 			trigger_error(&auxtrace_snapshot_trigger);
1603 	}
1604 }
1605 
__cmd_record(struct record * rec,int argc,const char ** argv)1606 static int __cmd_record(struct record *rec, int argc, const char **argv)
1607 {
1608 	int err;
1609 	int status = 0;
1610 	unsigned long waking = 0;
1611 	const bool forks = argc > 0;
1612 	struct perf_tool *tool = &rec->tool;
1613 	struct record_opts *opts = &rec->opts;
1614 	struct perf_data *data = &rec->data;
1615 	struct perf_session *session;
1616 	bool disabled = false, draining = false;
1617 	int fd;
1618 	float ratio = 0;
1619 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
1620 
1621 	atexit(record__sig_exit);
1622 	signal(SIGCHLD, sig_handler);
1623 	signal(SIGINT, sig_handler);
1624 	signal(SIGTERM, sig_handler);
1625 	signal(SIGSEGV, sigsegv_handler);
1626 
1627 	if (rec->opts.record_namespaces)
1628 		tool->namespace_events = true;
1629 
1630 	if (rec->opts.record_cgroup) {
1631 #ifdef HAVE_FILE_HANDLE
1632 		tool->cgroup_events = true;
1633 #else
1634 		pr_err("cgroup tracking is not supported\n");
1635 		return -1;
1636 #endif
1637 	}
1638 
1639 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1640 		signal(SIGUSR2, snapshot_sig_handler);
1641 		if (rec->opts.auxtrace_snapshot_mode)
1642 			trigger_on(&auxtrace_snapshot_trigger);
1643 		if (rec->switch_output.enabled)
1644 			trigger_on(&switch_output_trigger);
1645 	} else {
1646 		signal(SIGUSR2, SIG_IGN);
1647 	}
1648 
1649 	session = perf_session__new(data, false, tool);
1650 	if (IS_ERR(session)) {
1651 		pr_err("Perf session creation failed.\n");
1652 		return PTR_ERR(session);
1653 	}
1654 
1655 	fd = perf_data__fd(data);
1656 	rec->session = session;
1657 
1658 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
1659 		pr_err("Compression initialization failed.\n");
1660 		return -1;
1661 	}
1662 #ifdef HAVE_EVENTFD_SUPPORT
1663 	done_fd = eventfd(0, EFD_NONBLOCK);
1664 	if (done_fd < 0) {
1665 		pr_err("Failed to create wakeup eventfd, error: %m\n");
1666 		status = -1;
1667 		goto out_delete_session;
1668 	}
1669 	err = evlist__add_pollfd(rec->evlist, done_fd);
1670 	if (err < 0) {
1671 		pr_err("Failed to add wakeup eventfd to poll list\n");
1672 		status = err;
1673 		goto out_delete_session;
1674 	}
1675 #endif // HAVE_EVENTFD_SUPPORT
1676 
1677 	session->header.env.comp_type  = PERF_COMP_ZSTD;
1678 	session->header.env.comp_level = rec->opts.comp_level;
1679 
1680 	if (rec->opts.kcore &&
1681 	    !record__kcore_readable(&session->machines.host)) {
1682 		pr_err("ERROR: kcore is not readable.\n");
1683 		return -1;
1684 	}
1685 
1686 	if (record__init_clock(rec))
1687 		return -1;
1688 
1689 	record__init_features(rec);
1690 
1691 	if (forks) {
1692 		err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
1693 						    argv, data->is_pipe,
1694 						    workload_exec_failed_signal);
1695 		if (err < 0) {
1696 			pr_err("Couldn't run the workload!\n");
1697 			status = err;
1698 			goto out_delete_session;
1699 		}
1700 	}
1701 
1702 	/*
1703 	 * If we have just single event and are sending data
1704 	 * through pipe, we need to force the ids allocation,
1705 	 * because we synthesize event name through the pipe
1706 	 * and need the id for that.
1707 	 */
1708 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
1709 		rec->opts.sample_id = true;
1710 
1711 	if (record__open(rec) != 0) {
1712 		err = -1;
1713 		goto out_child;
1714 	}
1715 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
1716 
1717 	if (rec->opts.kcore) {
1718 		err = record__kcore_copy(&session->machines.host, data);
1719 		if (err) {
1720 			pr_err("ERROR: Failed to copy kcore\n");
1721 			goto out_child;
1722 		}
1723 	}
1724 
1725 	err = bpf__apply_obj_config();
1726 	if (err) {
1727 		char errbuf[BUFSIZ];
1728 
1729 		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1730 		pr_err("ERROR: Apply config to BPF failed: %s\n",
1731 			 errbuf);
1732 		goto out_child;
1733 	}
1734 
1735 	/*
1736 	 * Normally perf_session__new would do this, but it doesn't have the
1737 	 * evlist.
1738 	 */
1739 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
1740 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
1741 		rec->tool.ordered_events = false;
1742 	}
1743 
1744 	if (!rec->evlist->nr_groups)
1745 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
1746 
1747 	if (data->is_pipe) {
1748 		err = perf_header__write_pipe(fd);
1749 		if (err < 0)
1750 			goto out_child;
1751 	} else {
1752 		err = perf_session__write_header(session, rec->evlist, fd, false);
1753 		if (err < 0)
1754 			goto out_child;
1755 	}
1756 
1757 	err = -1;
1758 	if (!rec->no_buildid
1759 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1760 		pr_err("Couldn't generate buildids. "
1761 		       "Use --no-buildid to profile anyway.\n");
1762 		goto out_child;
1763 	}
1764 
1765 	err = record__setup_sb_evlist(rec);
1766 	if (err)
1767 		goto out_child;
1768 
1769 	err = record__synthesize(rec, false);
1770 	if (err < 0)
1771 		goto out_child;
1772 
1773 	if (rec->realtime_prio) {
1774 		struct sched_param param;
1775 
1776 		param.sched_priority = rec->realtime_prio;
1777 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1778 			pr_err("Could not set realtime priority.\n");
1779 			err = -1;
1780 			goto out_child;
1781 		}
1782 	}
1783 
1784 	/*
1785 	 * When perf is starting the traced process, all the events
1786 	 * (apart from group members) have enable_on_exec=1 set,
1787 	 * so don't spoil it by prematurely enabling them.
1788 	 */
1789 	if (!target__none(&opts->target) && !opts->initial_delay)
1790 		evlist__enable(rec->evlist);
1791 
1792 	/*
1793 	 * Let the child rip
1794 	 */
1795 	if (forks) {
1796 		struct machine *machine = &session->machines.host;
1797 		union perf_event *event;
1798 		pid_t tgid;
1799 
1800 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1801 		if (event == NULL) {
1802 			err = -ENOMEM;
1803 			goto out_child;
1804 		}
1805 
1806 		/*
1807 		 * Some H/W events are generated before COMM event
1808 		 * which is emitted during exec(), so perf script
1809 		 * cannot see a correct process name for those events.
1810 		 * Synthesize COMM event to prevent it.
1811 		 */
1812 		tgid = perf_event__synthesize_comm(tool, event,
1813 						   rec->evlist->workload.pid,
1814 						   process_synthesized_event,
1815 						   machine);
1816 		free(event);
1817 
1818 		if (tgid == -1)
1819 			goto out_child;
1820 
1821 		event = malloc(sizeof(event->namespaces) +
1822 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1823 			       machine->id_hdr_size);
1824 		if (event == NULL) {
1825 			err = -ENOMEM;
1826 			goto out_child;
1827 		}
1828 
1829 		/*
1830 		 * Synthesize NAMESPACES event for the command specified.
1831 		 */
1832 		perf_event__synthesize_namespaces(tool, event,
1833 						  rec->evlist->workload.pid,
1834 						  tgid, process_synthesized_event,
1835 						  machine);
1836 		free(event);
1837 
1838 		perf_evlist__start_workload(rec->evlist);
1839 	}
1840 
1841 	if (evlist__initialize_ctlfd(rec->evlist, opts->ctl_fd, opts->ctl_fd_ack))
1842 		goto out_child;
1843 
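	/*
	 * Note: a negative --delay (-D -1) only prints the "disabled" message
	 * here; the events stay disabled until e.g. an 'enable' command
	 * arrives on the control fd and is handled by evlist__ctlfd_process()
	 * in the loop below.
	 */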
1844 	if (opts->initial_delay) {
1845 		pr_info(EVLIST_DISABLED_MSG);
1846 		if (opts->initial_delay > 0) {
1847 			usleep(opts->initial_delay * USEC_PER_MSEC);
1848 			evlist__enable(rec->evlist);
1849 			pr_info(EVLIST_ENABLED_MSG);
1850 		}
1851 	}
1852 
1853 	trigger_ready(&auxtrace_snapshot_trigger);
1854 	trigger_ready(&switch_output_trigger);
1855 	perf_hooks__invoke_record_start();
1856 	for (;;) {
1857 		unsigned long long hits = rec->samples;
1858 
1859 		/*
1860 		 * rec->evlist->bkw_mmap_state may be
1861 		 * BKW_MMAP_EMPTY here: when done == true and
1862 		 * hits != rec->samples in the previous round.
1863 		 *
1864 		 * perf_evlist__toggle_bkw_mmap() ensures we never
1865 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1866 		 */
1867 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
1868 			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1869 
1870 		if (record__mmap_read_all(rec, false) < 0) {
1871 			trigger_error(&auxtrace_snapshot_trigger);
1872 			trigger_error(&switch_output_trigger);
1873 			err = -1;
1874 			goto out_child;
1875 		}
1876 
1877 		if (auxtrace_record__snapshot_started) {
1878 			auxtrace_record__snapshot_started = 0;
1879 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
1880 				record__read_auxtrace_snapshot(rec, false);
1881 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1882 				pr_err("AUX area tracing snapshot failed\n");
1883 				err = -1;
1884 				goto out_child;
1885 			}
1886 		}
1887 
1888 		if (trigger_is_hit(&switch_output_trigger)) {
1889 			/*
1890 			 * If switch_output_trigger is hit, the data in
1891 			 * overwritable ring buffer should have been collected,
1892 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1893 			 *
1894 			 * If SIGUSR2 is raised after or during record__mmap_read_all(),
1895 			 * record__mmap_read_all() didn't collect data from the
1896 			 * overwritable ring buffer. Read again.
1897 			 */
1898 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1899 				continue;
1900 			trigger_ready(&switch_output_trigger);
1901 
1902 			/*
1903 			 * Reenable events in overwrite ring buffer after
1904 			 * record__mmap_read_all(): we should have collected
1905 			 * data from it.
1906 			 */
1907 			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1908 
1909 			if (!quiet)
1910 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1911 					waking);
1912 			waking = 0;
1913 			fd = record__switch_output(rec, false);
1914 			if (fd < 0) {
1915 				pr_err("Failed to switch to new file\n");
1916 				trigger_error(&switch_output_trigger);
1917 				err = fd;
1918 				goto out_child;
1919 			}
1920 
1921 			/* re-arm the alarm */
1922 			if (rec->switch_output.time)
1923 				alarm(rec->switch_output.time);
1924 		}
1925 
1926 		if (hits == rec->samples) {
1927 			if (done || draining)
1928 				break;
1929 			err = evlist__poll(rec->evlist, -1);
1930 			/*
1931 			 * Propagate the error only if there is one. Ignore a
1932 			 * positive number of returned events and interrupt (EINTR) errors.
1933 			 */
1934 			if (err > 0 || (err < 0 && errno == EINTR))
1935 				err = 0;
1936 			waking++;
1937 
1938 			if (evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1939 				draining = true;
1940 		}
1941 
1942 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
1943 			switch (cmd) {
1944 			case EVLIST_CTL_CMD_ENABLE:
1945 				pr_info(EVLIST_ENABLED_MSG);
1946 				break;
1947 			case EVLIST_CTL_CMD_DISABLE:
1948 				pr_info(EVLIST_DISABLED_MSG);
1949 				break;
1950 			case EVLIST_CTL_CMD_SNAPSHOT:
1951 				hit_auxtrace_snapshot_trigger(rec);
1952 				evlist__ctlfd_ack(rec->evlist);
1953 				break;
1954 			case EVLIST_CTL_CMD_ACK:
1955 			case EVLIST_CTL_CMD_UNSUPPORTED:
1956 			default:
1957 				break;
1958 			}
1959 		}
1960 
1961 		/*
1962 		 * When perf is starting the traced process, at the end events
1963 		 * die with the process and we wait for that. Thus no need to
1964 		 * disable events in this case.
1965 		 */
1966 		if (done && !disabled && !target__none(&opts->target)) {
1967 			trigger_off(&auxtrace_snapshot_trigger);
1968 			evlist__disable(rec->evlist);
1969 			disabled = true;
1970 		}
1971 	}
1972 
1973 	trigger_off(&auxtrace_snapshot_trigger);
1974 	trigger_off(&switch_output_trigger);
1975 
1976 	if (opts->auxtrace_snapshot_on_exit)
1977 		record__auxtrace_snapshot_exit(rec);
1978 
1979 	if (forks && workload_exec_errno) {
1980 		char msg[STRERR_BUFSIZE];
1981 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
1982 		pr_err("Workload failed: %s\n", emsg);
1983 		err = -1;
1984 		goto out_child;
1985 	}
1986 
1987 	if (!quiet)
1988 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
1989 
1990 	if (target__none(&rec->opts.target))
1991 		record__synthesize_workload(rec, true);
1992 
1993 out_child:
1994 	evlist__finalize_ctlfd(rec->evlist);
1995 	record__mmap_read_all(rec, true);
1996 	record__aio_mmap_read_sync(rec);
1997 
1998 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
1999 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
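		/* store the ratio in the header rounded to the nearest integer */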
2000 		session->header.env.comp_ratio = ratio + 0.5;
2001 	}
2002 
2003 	if (forks) {
2004 		int exit_status;
2005 
2006 		if (!child_finished)
2007 			kill(rec->evlist->workload.pid, SIGTERM);
2008 
2009 		wait(&exit_status);
2010 
2011 		if (err < 0)
2012 			status = err;
2013 		else if (WIFEXITED(exit_status))
2014 			status = WEXITSTATUS(exit_status);
2015 		else if (WIFSIGNALED(exit_status))
2016 			signr = WTERMSIG(exit_status);
2017 	} else
2018 		status = err;
2019 
2020 	record__synthesize(rec, true);
2021 	/* this will be recalculated during process_buildids() */
2022 	rec->samples = 0;
2023 
2024 	if (!err) {
2025 		if (!rec->timestamp_filename) {
2026 			record__finish_output(rec);
2027 		} else {
2028 			fd = record__switch_output(rec, true);
2029 			if (fd < 0) {
2030 				status = fd;
2031 				goto out_delete_session;
2032 			}
2033 		}
2034 	}
2035 
2036 	perf_hooks__invoke_record_end();
2037 
2038 	if (!err && !quiet) {
2039 		char samples[128];
2040 		const char *postfix = rec->timestamp_filename ?
2041 					".<timestamp>" : "";
2042 
2043 		if (rec->samples && !rec->opts.full_auxtrace)
2044 			scnprintf(samples, sizeof(samples),
2045 				  " (%" PRIu64 " samples)", rec->samples);
2046 		else
2047 			samples[0] = '\0';
2048 
2049 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2050 			perf_data__size(data) / 1024.0 / 1024.0,
2051 			data->path, postfix, samples);
2052 		if (ratio) {
2053 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2054 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2055 					ratio);
2056 		}
2057 		fprintf(stderr, " ]\n");
2058 	}
2059 
2060 out_delete_session:
2061 #ifdef HAVE_EVENTFD_SUPPORT
2062 	if (done_fd >= 0)
2063 		close(done_fd);
2064 #endif
2065 	zstd_fini(&session->zstd_data);
2066 	perf_session__delete(session);
2067 
2068 	if (!opts->no_bpf_event)
2069 		perf_evlist__stop_sb_thread(rec->sb_evlist);
2070 	return status;
2071 }
2072 
2073 static void callchain_debug(struct callchain_param *callchain)
2074 {
2075 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2076 
2077 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2078 
2079 	if (callchain->record_mode == CALLCHAIN_DWARF)
2080 		pr_debug("callchain: stack dump size %d\n",
2081 			 callchain->dump_size);
2082 }
2083 
2084 int record_opts__parse_callchain(struct record_opts *record,
2085 				 struct callchain_param *callchain,
2086 				 const char *arg, bool unset)
2087 {
2088 	int ret;
2089 	callchain->enabled = !unset;
2090 
2091 	/* --no-call-graph */
2092 	if (unset) {
2093 		callchain->record_mode = CALLCHAIN_NONE;
2094 		pr_debug("callchain: disabled\n");
2095 		return 0;
2096 	}
2097 
2098 	ret = parse_callchain_record_opt(arg, callchain);
2099 	if (!ret) {
2100 		/* Enable data address sampling for DWARF unwind. */
2101 		if (callchain->record_mode == CALLCHAIN_DWARF)
2102 			record->sample_address = true;
2103 		callchain_debug(callchain);
2104 	}
2105 
2106 	return ret;
2107 }
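
/*
 * Illustrative use: "perf record --call-graph dwarf,4096 ..." selects DWARF
 * unwinding with a 4096 byte stack dump and, per the code above, also turns
 * on sample_address (as -d/--data does).
 */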
2108 
2109 int record_parse_callchain_opt(const struct option *opt,
2110 			       const char *arg,
2111 			       int unset)
2112 {
2113 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2114 }
2115 
2116 int record_callchain_opt(const struct option *opt,
2117 			 const char *arg __maybe_unused,
2118 			 int unset __maybe_unused)
2119 {
2120 	struct callchain_param *callchain = opt->value;
2121 
2122 	callchain->enabled = true;
2123 
2124 	if (callchain->record_mode == CALLCHAIN_NONE)
2125 		callchain->record_mode = CALLCHAIN_FP;
2126 
2127 	callchain_debug(callchain);
2128 	return 0;
2129 }
2130 
2131 static int perf_record_config(const char *var, const char *value, void *cb)
2132 {
2133 	struct record *rec = cb;
2134 
2135 	if (!strcmp(var, "record.build-id")) {
2136 		if (!strcmp(value, "cache"))
2137 			rec->no_buildid_cache = false;
2138 		else if (!strcmp(value, "no-cache"))
2139 			rec->no_buildid_cache = true;
2140 		else if (!strcmp(value, "skip"))
2141 			rec->no_buildid = true;
2142 		else
2143 			return -1;
2144 		return 0;
2145 	}
2146 	if (!strcmp(var, "record.call-graph")) {
2147 		var = "call-graph.record-mode";
2148 		return perf_default_config(var, value, cb);
2149 	}
2150 #ifdef HAVE_AIO_SUPPORT
2151 	if (!strcmp(var, "record.aio")) {
2152 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
2153 		if (!rec->opts.nr_cblocks)
2154 			rec->opts.nr_cblocks = nr_cblocks_default;
2155 	}
2156 #endif
2157 
2158 	return 0;
2159 }
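
/*
 * Illustrative ~/.perfconfig snippet exercising the keys handled above:
 *
 *   [record]
 *       build-id = no-cache    # or "cache" / "skip"
 *       call-graph = dwarf     # forwarded as call-graph.record-mode
 *       aio = 2                # only in HAVE_AIO_SUPPORT builds
 */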
2160 
2161 
2162 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2163 {
2164 	struct record_opts *opts = (struct record_opts *)opt->value;
2165 
2166 	if (unset || !str)
2167 		return 0;
2168 
2169 	if (!strcasecmp(str, "node"))
2170 		opts->affinity = PERF_AFFINITY_NODE;
2171 	else if (!strcasecmp(str, "cpu"))
2172 		opts->affinity = PERF_AFFINITY_CPU;
2173 
2174 	return 0;
2175 }
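
/*
 * Any other --affinity value (or none at all) keeps the PERF_AFFINITY_SYS
 * default that cmd_record() sets before parsing the options.
 */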
2176 
2177 static int parse_output_max_size(const struct option *opt,
2178 				 const char *str, int unset)
2179 {
2180 	unsigned long *s = (unsigned long *)opt->value;
2181 	static struct parse_tag tags_size[] = {
2182 		{ .tag  = 'B', .mult = 1       },
2183 		{ .tag  = 'K', .mult = 1 << 10 },
2184 		{ .tag  = 'M', .mult = 1 << 20 },
2185 		{ .tag  = 'G', .mult = 1 << 30 },
2186 		{ .tag  = 0 },
2187 	};
2188 	unsigned long val;
2189 
2190 	if (unset) {
2191 		*s = 0;
2192 		return 0;
2193 	}
2194 
2195 	val = parse_tag_value(str, tags_size);
2196 	if (val != (unsigned long) -1) {
2197 		*s = val;
2198 		return 0;
2199 	}
2200 
2201 	return -1;
2202 }
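
/*
 * e.g. "--max-size=200M" parses to 200 * (1 << 20) bytes via the tags above;
 * leaving the option unset (0) means the output size is unlimited.
 */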
2203 
2204 static int record__parse_mmap_pages(const struct option *opt,
2205 				    const char *str,
2206 				    int unset __maybe_unused)
2207 {
2208 	struct record_opts *opts = opt->value;
2209 	char *s, *p;
2210 	unsigned int mmap_pages;
2211 	int ret;
2212 
2213 	if (!str)
2214 		return -EINVAL;
2215 
2216 	s = strdup(str);
2217 	if (!s)
2218 		return -ENOMEM;
2219 
2220 	p = strchr(s, ',');
2221 	if (p)
2222 		*p = '\0';
2223 
2224 	if (*s) {
2225 		ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
2226 		if (ret)
2227 			goto out_free;
2228 		opts->mmap_pages = mmap_pages;
2229 	}
2230 
2231 	if (!p) {
2232 		ret = 0;
2233 		goto out_free;
2234 	}
2235 
2236 	ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
2237 	if (ret)
2238 		goto out_free;
2239 
2240 	opts->auxtrace_mmap_pages = mmap_pages;
2241 
2242 out_free:
2243 	free(s);
2244 	return ret;
2245 }
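
/*
 * e.g. "-m 512,128" requests 512 data mmap pages and 128 AUX area tracing
 * mmap pages; a single value ("-m 512") leaves auxtrace_mmap_pages untouched.
 */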
2246 
2247 static int parse_control_option(const struct option *opt,
2248 				const char *str,
2249 				int unset __maybe_unused)
2250 {
2251 	struct record_opts *opts = opt->value;
2252 
2253 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
2254 }
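
/*
 * Illustrative use: "--control fd:10,11" or "--control fifo:ctl.fifo,ack.fifo";
 * the parsed descriptors are later handed to evlist__initialize_ctlfd() in
 * __cmd_record().
 */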
2255 
2256 static void switch_output_size_warn(struct record *rec)
2257 {
2258 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
2259 	struct switch_output *s = &rec->switch_output;
2260 
2261 	wakeup_size /= 2;
2262 
2263 	if (s->size < wakeup_size) {
2264 		char buf[100];
2265 
2266 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
2267 		pr_warning("WARNING: switch-output data size lower than "
2268 			   "wakeup kernel buffer size (%s), "
2269 			   "expect bigger perf.data sizes\n", buf);
2270 	}
2271 }
2272 
2273 static int switch_output_setup(struct record *rec)
2274 {
2275 	struct switch_output *s = &rec->switch_output;
2276 	static struct parse_tag tags_size[] = {
2277 		{ .tag  = 'B', .mult = 1       },
2278 		{ .tag  = 'K', .mult = 1 << 10 },
2279 		{ .tag  = 'M', .mult = 1 << 20 },
2280 		{ .tag  = 'G', .mult = 1 << 30 },
2281 		{ .tag  = 0 },
2282 	};
2283 	static struct parse_tag tags_time[] = {
2284 		{ .tag  = 's', .mult = 1        },
2285 		{ .tag  = 'm', .mult = 60       },
2286 		{ .tag  = 'h', .mult = 60*60    },
2287 		{ .tag  = 'd', .mult = 60*60*24 },
2288 		{ .tag  = 0 },
2289 	};
2290 	unsigned long val;
2291 
2292 	/*
2293 	 * If we're using --switch-output-events, then we imply
2294 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
2295 	 * thread to its parent.
2296 	 */
2297 	if (rec->switch_output_event_set)
2298 		goto do_signal;
2299 
2300 	if (!s->set)
2301 		return 0;
2302 
2303 	if (!strcmp(s->str, "signal")) {
2304 do_signal:
2305 		s->signal = true;
2306 		pr_debug("switch-output with SIGUSR2 signal\n");
2307 		goto enabled;
2308 	}
2309 
2310 	val = parse_tag_value(s->str, tags_size);
2311 	if (val != (unsigned long) -1) {
2312 		s->size = val;
2313 		pr_debug("switch-output with %s size threshold\n", s->str);
2314 		goto enabled;
2315 	}
2316 
2317 	val = parse_tag_value(s->str, tags_time);
2318 	if (val != (unsigned long) -1) {
2319 		s->time = val;
2320 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2321 			 s->str, s->time);
2322 		goto enabled;
2323 	}
2324 
2325 	return -1;
2326 
2327 enabled:
2328 	rec->timestamp_filename = true;
2329 	s->enabled              = true;
2330 
2331 	if (s->size && !rec->opts.no_buffering)
2332 		switch_output_size_warn(rec);
2333 
2334 	return 0;
2335 }
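
/*
 * Examples: "--switch-output" (defaults to the SIGUSR2 signal mode),
 * "--switch-output=100M" (size threshold) or "--switch-output=30s" (time
 * threshold). All of them imply --timestamp-filename, as set above.
 */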
2336 
2337 static const char * const __record_usage[] = {
2338 	"perf record [<options>] [<command>]",
2339 	"perf record [<options>] -- <command> [<options>]",
2340 	NULL
2341 };
2342 const char * const *record_usage = __record_usage;
2343 
2344 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
2345 				  struct perf_sample *sample, struct machine *machine)
2346 {
2347 	/*
2348 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
2349 	 * so there is no need to add them twice.
2350 	 */
2351 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
2352 		return 0;
2353 	return perf_event__process_mmap(tool, event, sample, machine);
2354 }
2355 
2356 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
2357 				   struct perf_sample *sample, struct machine *machine)
2358 {
2359 	/*
2360 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
2361 	 * so there is no need to add them twice.
2362 	 */
2363 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
2364 		return 0;
2365 
2366 	return perf_event__process_mmap2(tool, event, sample, machine);
2367 }
2368 
2369 /*
2370  * XXX Ideally would be local to cmd_record() and passed to a record__new
2371  * because we need to have access to it in record__exit, that is called
2372  * after cmd_record() exits, but since record_options need to be accessible to
2373  * builtin-script, leave it here.
2374  *
2375  * At least we don't touch it in all the other functions here directly.
2376  *
2377  * Just say no to tons of global variables, sigh.
2378  */
2379 static struct record record = {
2380 	.opts = {
2381 		.sample_time	     = true,
2382 		.mmap_pages	     = UINT_MAX,
2383 		.user_freq	     = UINT_MAX,
2384 		.user_interval	     = ULLONG_MAX,
2385 		.freq		     = 4000,
2386 		.target		     = {
2387 			.uses_mmap   = true,
2388 			.default_per_cpu = true,
2389 		},
2390 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
2391 		.nr_threads_synthesize = 1,
2392 		.ctl_fd              = -1,
2393 		.ctl_fd_ack          = -1,
2394 	},
2395 	.tool = {
2396 		.sample		= process_sample_event,
2397 		.fork		= perf_event__process_fork,
2398 		.exit		= perf_event__process_exit,
2399 		.comm		= perf_event__process_comm,
2400 		.namespaces	= perf_event__process_namespaces,
2401 		.mmap		= build_id__process_mmap,
2402 		.mmap2		= build_id__process_mmap2,
2403 		.ordered_events	= true,
2404 	},
2405 };
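
/*
 * Note the defaults above: a 4000 Hz sampling frequency when the user gives
 * neither -F nor -c, sample timestamps enabled, and UINT_MAX/ULLONG_MAX used
 * as "not set by the user" sentinels for mmap_pages, user_freq and
 * user_interval.
 */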
2406 
2407 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
2408 	"\n\t\t\t\tDefault: fp";
2409 
2410 static bool dry_run;
2411 
2412 /*
2413  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
2414  * with it and switch to use the library functions in perf_evlist that came
2415  * from builtin-record.c, i.e. use record_opts,
2416  * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
2417  * using pipes, etc.
2418  */
2419 static struct option __record_options[] = {
2420 	OPT_CALLBACK('e', "event", &record.evlist, "event",
2421 		     "event selector. use 'perf list' to list available events",
2422 		     parse_events_option),
2423 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
2424 		     "event filter", parse_filter),
2425 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
2426 			   NULL, "don't record events from perf itself",
2427 			   exclude_perf),
2428 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
2429 		    "record events on existing process id"),
2430 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
2431 		    "record events on existing thread id"),
2432 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
2433 		    "collect data with this RT SCHED_FIFO priority"),
2434 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
2435 		    "collect data without buffering"),
2436 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
2437 		    "collect raw sample records from all opened counters"),
2438 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
2439 			    "system-wide collection from all CPUs"),
2440 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
2441 		    "list of cpus to monitor"),
2442 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
2443 	OPT_STRING('o', "output", &record.data.path, "file",
2444 		    "output file name"),
2445 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
2446 			&record.opts.no_inherit_set,
2447 			"child tasks do not inherit counters"),
2448 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
2449 		    "synthesize non-sample events at the end of output"),
2450 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2451 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
2452 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
2453 		    "Fail if the specified frequency can't be used"),
2454 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
2455 		     "profile at this frequency",
2456 		      record__parse_freq),
2457 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
2458 		     "number of mmap data pages and AUX area tracing mmap pages",
2459 		     record__parse_mmap_pages),
2460 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2461 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
2462 		     record__mmap_flush_parse),
2463 	OPT_BOOLEAN(0, "group", &record.opts.group,
2464 		    "put the counters into a counter group"),
2465 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
2466 			   NULL, "enables call-graph recording" ,
2467 			   &record_callchain_opt),
2468 	OPT_CALLBACK(0, "call-graph", &record.opts,
2469 		     "record_mode[,record_size]", record_callchain_help,
2470 		     &record_parse_callchain_opt),
2471 	OPT_INCR('v', "verbose", &verbose,
2472 		    "be more verbose (show counter open errors, etc)"),
2473 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
2474 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
2475 		    "per thread counts"),
2476 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2477 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2478 		    "Record the sample physical addresses"),
2479 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2480 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2481 			&record.opts.sample_time_set,
2482 			"Record the sample timestamps"),
2483 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2484 			"Record the sample period"),
2485 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2486 		    "don't sample"),
2487 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2488 			&record.no_buildid_cache_set,
2489 			"do not update the buildid cache"),
2490 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2491 			&record.no_buildid_set,
2492 			"do not collect buildids in perf.data"),
2493 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2494 		     "monitor event in cgroup name only",
2495 		     parse_cgroups),
2496 	OPT_INTEGER('D', "delay", &record.opts.initial_delay,
2497 		  "ms to wait before starting measurement after program start (-1: start with events disabled)"),
2498 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
2499 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2500 		   "user to profile"),
2501 
2502 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2503 		     "branch any", "sample any taken branches",
2504 		     parse_branch_stack),
2505 
2506 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2507 		     "branch filter mask", "branch stack filter modes",
2508 		     parse_branch_stack),
2509 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2510 		    "sample by weight (on special events only)"),
2511 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2512 		    "sample transaction flags (special events only)"),
2513 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2514 		    "use per-thread mmaps"),
2515 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2516 		    "sample selected machine registers on interrupt,"
2517 		    " use '-I?' to list register names", parse_intr_regs),
2518 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2519 		    "sample selected machine registers on interrupt,"
2520 		    " use '--user-regs=?' to list register names", parse_user_regs),
2521 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2522 		    "Record running/enabled time of read (:S) events"),
2523 	OPT_CALLBACK('k', "clockid", &record.opts,
2524 	"clockid", "clockid to use for events, see clock_gettime()",
2525 	parse_clockid),
2526 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2527 			  "opts", "AUX area tracing Snapshot Mode", ""),
2528 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
2529 			  "opts", "sample AUX area", ""),
2530 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2531 			"per thread proc mmap processing timeout in ms"),
2532 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2533 		    "Record namespaces events"),
2534 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
2535 		    "Record cgroup events"),
2536 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
2537 			&record.opts.record_switch_events_set,
2538 			"Record context switch events"),
2539 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2540 			 "Configure all used events to run in kernel space.",
2541 			 PARSE_OPT_EXCLUSIVE),
2542 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2543 			 "Configure all used events to run in user space.",
2544 			 PARSE_OPT_EXCLUSIVE),
2545 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
2546 		    "collect kernel callchains"),
2547 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
2548 		    "collect user callchains"),
2549 	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2550 		   "clang binary to use for compiling BPF scriptlets"),
2551 	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2552 		   "options passed to clang when compiling BPF scriptlets"),
2553 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2554 		   "file", "vmlinux pathname"),
2555 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2556 		    "Record build-id of all DSOs regardless of hits"),
2557 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2558 		    "append timestamp to output filename"),
2559 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2560 		    "Record timestamp boundary (time of first/last samples)"),
2561 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2562 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2563 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
2564 			  "signal"),
2565 	OPT_CALLBACK_SET(0, "switch-output-event", &record.sb_evlist, &record.switch_output_event_set, "switch output event",
2566 			 "switch output event selector. use 'perf list' to list available events",
2567 			 parse_events_option_new_evlist),
2568 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2569 		   "Limit number of switch output generated files"),
2570 	OPT_BOOLEAN(0, "dry-run", &dry_run,
2571 		    "Parse options then exit"),
2572 #ifdef HAVE_AIO_SUPPORT
2573 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2574 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2575 		     record__aio_parse),
2576 #endif
2577 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2578 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
2579 		     record__parse_affinity),
2580 #ifdef HAVE_ZSTD_SUPPORT
2581 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
2582 			    "n", "Compress records using the specified level (default: 1 - fastest compression, 22 - greatest compression)",
2583 			    record__parse_comp_level),
2584 #endif
2585 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
2586 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
2587 	OPT_UINTEGER(0, "num-thread-synthesize",
2588 		     &record.opts.nr_threads_synthesize,
2589 		     "number of threads to run for event synthesis"),
2590 #ifdef HAVE_LIBPFM
2591 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
2592 		"libpfm4 event selector. use 'perf list' to list available events",
2593 		parse_libpfm_events_option),
2594 #endif
2595 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
2596 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
2597 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
2598 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
2599 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
2600 		      parse_control_option),
2601 	OPT_END()
2602 };
2603 
2604 struct option *record_options = __record_options;
2605 
2606 int cmd_record(int argc, const char **argv)
2607 {
2608 	int err;
2609 	struct record *rec = &record;
2610 	char errbuf[BUFSIZ];
2611 
2612 	setlocale(LC_ALL, "");
2613 
2614 #ifndef HAVE_LIBBPF_SUPPORT
2615 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2616 	set_nobuild('\0', "clang-path", true);
2617 	set_nobuild('\0', "clang-opt", true);
2618 # undef set_nobuild
2619 #endif
2620 
2621 #ifndef HAVE_BPF_PROLOGUE
2622 # if !defined (HAVE_DWARF_SUPPORT)
2623 #  define REASON  "NO_DWARF=1"
2624 # elif !defined (HAVE_LIBBPF_SUPPORT)
2625 #  define REASON  "NO_LIBBPF=1"
2626 # else
2627 #  define REASON  "this architecture doesn't support BPF prologue"
2628 # endif
2629 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2630 	set_nobuild('\0', "vmlinux", true);
2631 # undef set_nobuild
2632 # undef REASON
2633 #endif
2634 
2635 	rec->opts.affinity = PERF_AFFINITY_SYS;
2636 
2637 	rec->evlist = evlist__new();
2638 	if (rec->evlist == NULL)
2639 		return -ENOMEM;
2640 
2641 	err = perf_config(perf_record_config, rec);
2642 	if (err)
2643 		return err;
2644 
2645 	argc = parse_options(argc, argv, record_options, record_usage,
2646 			    PARSE_OPT_STOP_AT_NON_OPTION);
2647 	if (quiet)
2648 		perf_quiet_option();
2649 
2650 	/* Make system wide (-a) the default target. */
2651 	if (!argc && target__none(&rec->opts.target))
2652 		rec->opts.target.system_wide = true;
2653 
2654 	if (nr_cgroups && !rec->opts.target.system_wide) {
2655 		usage_with_options_msg(record_usage, record_options,
2656 			"cgroup monitoring only available in system-wide mode");
2657 
2658 	}
2659 
2660 	if (rec->opts.kcore)
2661 		rec->data.is_dir = true;
2662 
2663 	if (rec->opts.comp_level != 0) {
2664 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2665 		rec->no_buildid = true;
2666 	}
2667 
2668 	if (rec->opts.record_switch_events &&
2669 	    !perf_can_record_switch_events()) {
2670 		ui__error("kernel does not support recording context switch events\n");
2671 		parse_options_usage(record_usage, record_options, "switch-events", 0);
2672 		err = -EINVAL;
2673 		goto out_opts;
2674 	}
2675 
2676 	if (switch_output_setup(rec)) {
2677 		parse_options_usage(record_usage, record_options, "switch-output", 0);
2678 		err = -EINVAL;
2679 		goto out_opts;
2680 	}
2681 
2682 	if (rec->switch_output.time) {
2683 		signal(SIGALRM, alarm_sig_handler);
2684 		alarm(rec->switch_output.time);
2685 	}
2686 
2687 	if (rec->switch_output.num_files) {
2688 		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
2689 						      sizeof(char *));
2690 		if (!rec->switch_output.filenames) {
2691 			err = -EINVAL;
2692 			goto out_opts;
2693 		}
2694 	}
2695 
2696 	/*
2697 	 * Allow aliases to facilitate the lookup of symbols for address
2698 	 * filters. Refer to auxtrace_parse_filters().
2699 	 */
2700 	symbol_conf.allow_aliases = true;
2701 
2702 	symbol__init(NULL);
2703 
2704 	if (rec->opts.affinity != PERF_AFFINITY_SYS) {
2705 		rec->affinity_mask.nbits = cpu__max_cpu();
2706 		rec->affinity_mask.bits = bitmap_alloc(rec->affinity_mask.nbits);
2707 		if (!rec->affinity_mask.bits) {
2708 			pr_err("Failed to allocate thread mask for %zd cpus\n", rec->affinity_mask.nbits);
2709 			err = -ENOMEM;
2710 			goto out_opts;
2711 		}
2712 		pr_debug2("thread mask[%zd]: empty\n", rec->affinity_mask.nbits);
2713 	}
2714 
2715 	err = record__auxtrace_init(rec);
2716 	if (err)
2717 		goto out;
2718 
2719 	if (dry_run)
2720 		goto out;
2721 
2722 	err = bpf__setup_stdout(rec->evlist);
2723 	if (err) {
2724 		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2725 		pr_err("ERROR: Setup BPF stdout failed: %s\n",
2726 			 errbuf);
2727 		goto out;
2728 	}
2729 
2730 	err = -ENOMEM;
2731 
2732 	if (rec->no_buildid_cache || rec->no_buildid) {
2733 		disable_buildid_cache();
2734 	} else if (rec->switch_output.enabled) {
2735 		/*
2736 		 * In 'perf record --switch-output', disable buildid
2737 		 * generation by default to reduce data file switching
2738 		 * overhead. Still generate buildid if they are required
2739 		 * explicitly using
2740 		 *
2741 		 *  perf record --switch-output --no-no-buildid \
2742 		 *              --no-no-buildid-cache
2743 		 *
2744 		 * The following code is equivalent to:
2745 		 *
2746 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
2747 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2748 		 *         disable_buildid_cache();
2749 		 */
2750 		bool disable = true;
2751 
2752 		if (rec->no_buildid_set && !rec->no_buildid)
2753 			disable = false;
2754 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2755 			disable = false;
2756 		if (disable) {
2757 			rec->no_buildid = true;
2758 			rec->no_buildid_cache = true;
2759 			disable_buildid_cache();
2760 		}
2761 	}
2762 
2763 	if (record.opts.overwrite)
2764 		record.opts.tail_synthesize = true;
2765 
2766 	if (rec->evlist->core.nr_entries == 0 &&
2767 	    __evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
2768 		pr_err("Not enough memory for event selector list\n");
2769 		goto out;
2770 	}
2771 
2772 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
2773 		rec->opts.no_inherit = true;
2774 
2775 	err = target__validate(&rec->opts.target);
2776 	if (err) {
2777 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2778 		ui__warning("%s\n", errbuf);
2779 	}
2780 
2781 	err = target__parse_uid(&rec->opts.target);
2782 	if (err) {
2783 		int saved_errno = errno;
2784 
2785 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2786 		ui__error("%s", errbuf);
2787 
2788 		err = -saved_errno;
2789 		goto out;
2790 	}
2791 
2792 	/* Enable ignoring missing threads when -u/-p option is defined. */
2793 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
2794 
2795 	err = -ENOMEM;
2796 	if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
2797 		usage_with_options(record_usage, record_options);
2798 
2799 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
2800 	if (err)
2801 		goto out;
2802 
2803 	/*
2804 	 * We take all buildids when the file contains
2805 	 * AUX area tracing data because we do not decode the
2806 	 * trace, as decoding it would take too long.
2807 	 */
2808 	if (rec->opts.full_auxtrace)
2809 		rec->buildid_all = true;
2810 
2811 	if (rec->opts.text_poke) {
2812 		err = record__config_text_poke(rec->evlist);
2813 		if (err) {
2814 			pr_err("record__config_text_poke failed, error %d\n", err);
2815 			goto out;
2816 		}
2817 	}
2818 
2819 	if (record_opts__config(&rec->opts)) {
2820 		err = -EINVAL;
2821 		goto out;
2822 	}
2823 
2824 	if (rec->opts.nr_cblocks > nr_cblocks_max)
2825 		rec->opts.nr_cblocks = nr_cblocks_max;
2826 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2827 
2828 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2829 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2830 
2831 	if (rec->opts.comp_level > comp_level_max)
2832 		rec->opts.comp_level = comp_level_max;
2833 	pr_debug("comp level: %d\n", rec->opts.comp_level);
2834 
2835 	err = __cmd_record(&record, argc, argv);
2836 out:
2837 	bitmap_free(rec->affinity_mask.bits);
2838 	evlist__delete(rec->evlist);
2839 	symbol__exit();
2840 	auxtrace_record__free(rec->itr);
2841 out_opts:
2842 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
2843 	return err;
2844 }
2845 
2846 static void snapshot_sig_handler(int sig __maybe_unused)
2847 {
2848 	struct record *rec = &record;
2849 
2850 	hit_auxtrace_snapshot_trigger(rec);
2851 
2852 	if (switch_output_signal(rec))
2853 		trigger_hit(&switch_output_trigger);
2854 }
2855 
2856 static void alarm_sig_handler(int sig __maybe_unused)
2857 {
2858 	struct record *rec = &record;
2859 
2860 	if (switch_output_time(rec))
2861 		trigger_hit(&switch_output_trigger);
2862 }
2863