1 /* SPDX-License-Identifier: GPL-2.0 */
2 #undef TRACE_SYSTEM
3 #define TRACE_SYSTEM sched
4 
5 #if !defined(_TRACE_SCHED_H) || defined(TRACE_HEADER_MULTI_READ)
6 #define _TRACE_SCHED_H
7 
8 #include <linux/sched/numa_balancing.h>
9 #include <linux/tracepoint.h>
10 #include <linux/binfmts.h>
11 
12 /*
13  * Tracepoint for calling kthread_stop, performed to end a kthread:
14  */
15 TRACE_EVENT(sched_kthread_stop,
16 
17 	TP_PROTO(struct task_struct *t),
18 
19 	TP_ARGS(t),
20 
21 	TP_STRUCT__entry(
22 		__array(	char,	comm,	TASK_COMM_LEN	)
23 		__field(	pid_t,	pid			)
24 	),
25 
26 	TP_fast_assign(
27 		memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
28 		__entry->pid	= t->pid;
29 	),
30 
31 	TP_printk("comm=%s pid=%d", __entry->comm, __entry->pid)
32 );
33 
34 /*
35  * Tracepoint for the return value of the kthread stopping:
36  */
37 TRACE_EVENT(sched_kthread_stop_ret,
38 
39 	TP_PROTO(int ret),
40 
41 	TP_ARGS(ret),
42 
43 	TP_STRUCT__entry(
44 		__field(	int,	ret	)
45 	),
46 
47 	TP_fast_assign(
48 		__entry->ret	= ret;
49 	),
50 
51 	TP_printk("ret=%d", __entry->ret)
52 );
53 
54 /*
55  * Tracepoint for waking up a task:
56  */
57 DECLARE_EVENT_CLASS(sched_wakeup_template,
58 
59 	TP_PROTO(struct task_struct *p),
60 
61 	TP_ARGS(__perf_task(p)),
62 
63 	TP_STRUCT__entry(
64 		__array(	char,	comm,	TASK_COMM_LEN	)
65 		__field(	pid_t,	pid			)
66 		__field(	int,	prio			)
67 		__field(	int,	success			)
68 		__field(	int,	target_cpu		)
69 	),
70 
71 	TP_fast_assign(
72 		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
73 		__entry->pid		= p->pid;
74 		__entry->prio		= p->prio; /* XXX SCHED_DEADLINE */
75 		__entry->success	= 1; /* rudiment, kill when possible */
76 		__entry->target_cpu	= task_cpu(p);
77 	),
78 
79 	TP_printk("comm=%s pid=%d prio=%d target_cpu=%03d",
80 		  __entry->comm, __entry->pid, __entry->prio,
81 		  __entry->target_cpu)
82 );
83 
84 /*
85  * Tracepoint called when waking a task; this tracepoint is guaranteed to be
86  * called from the waking context.
87  */
88 DEFINE_EVENT(sched_wakeup_template, sched_waking,
89 	     TP_PROTO(struct task_struct *p),
90 	     TP_ARGS(p));
91 
92 /*
93  * Tracepoint called when the task is actually woken; p->state == TASK_RUNNNG.
94  * It it not always called from the waking context.
95  */
96 DEFINE_EVENT(sched_wakeup_template, sched_wakeup,
97 	     TP_PROTO(struct task_struct *p),
98 	     TP_ARGS(p));
99 
100 /*
101  * Tracepoint for waking up a new task:
102  */
103 DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new,
104 	     TP_PROTO(struct task_struct *p),
105 	     TP_ARGS(p));
106 
#ifdef CREATE_TRACE_POINTS
/*
 * Compute the value stored in the prev_state field of the sched_switch event.
 *
 * @preempt: true when the switch is a preemption.
 * @p:       task being switched out; with CONFIG_SCHED_DEBUG it must be
 *           'current', otherwise BUG_ON() fires.
 *
 * Returns TASK_REPORT_MAX for a preemption, 0 when task_state_index(p) is 0,
 * and otherwise the single bit 1 << (task_state_index(p) - 1).
 */
static inline long __trace_sched_switch_state(bool preempt, struct task_struct *p)
{
	unsigned int state;

#ifdef CONFIG_SCHED_DEBUG
	BUG_ON(p != current);
#endif /* CONFIG_SCHED_DEBUG */

	/*
	 * Preemption ignores task state, therefore preempted tasks are always
	 * RUNNING (we will not have dequeued if state != RUNNING).
	 */
	if (preempt)
		return TASK_REPORT_MAX;

	/*
	 * task_state_index() uses fls() and returns a value from 0-8 range.
	 * Decrement it by 1 (except TASK_RUNNING state i.e 0) before using
	 * it for left shift operation to get the correct task->state
	 * mapping.
	 */
	state = task_state_index(p);

	return state ? (1 << (state - 1)) : state;
}
#endif /* CREATE_TRACE_POINTS */

135 /*
136  * Tracepoint for task switches, performed by the scheduler:
137  */
138 TRACE_EVENT(sched_switch,
139 
140 	TP_PROTO(bool preempt,
141 		 struct task_struct *prev,
142 		 struct task_struct *next),
143 
144 	TP_ARGS(preempt, prev, next),
145 
146 	TP_STRUCT__entry(
147 		__array(	char,	prev_comm,	TASK_COMM_LEN	)
148 		__field(	pid_t,	prev_pid			)
149 		__field(	int,	prev_prio			)
150 		__field(	long,	prev_state			)
151 		__array(	char,	next_comm,	TASK_COMM_LEN	)
152 		__field(	pid_t,	next_pid			)
153 		__field(	int,	next_prio			)
154 	),
155 
156 	TP_fast_assign(
157 		memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
158 		__entry->prev_pid	= prev->pid;
159 		__entry->prev_prio	= prev->prio;
160 		__entry->prev_state	= __trace_sched_switch_state(preempt, prev);
161 		memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
162 		__entry->next_pid	= next->pid;
163 		__entry->next_prio	= next->prio;
164 		/* XXX SCHED_DEADLINE */
165 	),
166 
167 	TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d",
168 		__entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
169 
170 		(__entry->prev_state & (TASK_REPORT_MAX - 1)) ?
171 		  __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|",
172 				{ TASK_INTERRUPTIBLE, "S" },
173 				{ TASK_UNINTERRUPTIBLE, "D" },
174 				{ __TASK_STOPPED, "T" },
175 				{ __TASK_TRACED, "t" },
176 				{ EXIT_DEAD, "X" },
177 				{ EXIT_ZOMBIE, "Z" },
178 				{ TASK_PARKED, "P" },
179 				{ TASK_DEAD, "I" }) :
180 		  "R",
181 
182 		__entry->prev_state & TASK_REPORT_MAX ? "+" : "",
183 		__entry->next_comm, __entry->next_pid, __entry->next_prio)
184 );
185 
186 /*
187  * Tracepoint for a task being migrated:
188  */
189 TRACE_EVENT(sched_migrate_task,
190 
191 	TP_PROTO(struct task_struct *p, int dest_cpu),
192 
193 	TP_ARGS(p, dest_cpu),
194 
195 	TP_STRUCT__entry(
196 		__array(	char,	comm,	TASK_COMM_LEN	)
197 		__field(	pid_t,	pid			)
198 		__field(	int,	prio			)
199 		__field(	int,	orig_cpu		)
200 		__field(	int,	dest_cpu		)
201 	),
202 
203 	TP_fast_assign(
204 		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
205 		__entry->pid		= p->pid;
206 		__entry->prio		= p->prio; /* XXX SCHED_DEADLINE */
207 		__entry->orig_cpu	= task_cpu(p);
208 		__entry->dest_cpu	= dest_cpu;
209 	),
210 
211 	TP_printk("comm=%s pid=%d prio=%d orig_cpu=%d dest_cpu=%d",
212 		  __entry->comm, __entry->pid, __entry->prio,
213 		  __entry->orig_cpu, __entry->dest_cpu)
214 );
215 
216 DECLARE_EVENT_CLASS(sched_process_template,
217 
218 	TP_PROTO(struct task_struct *p),
219 
220 	TP_ARGS(p),
221 
222 	TP_STRUCT__entry(
223 		__array(	char,	comm,	TASK_COMM_LEN	)
224 		__field(	pid_t,	pid			)
225 		__field(	int,	prio			)
226 	),
227 
228 	TP_fast_assign(
229 		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
230 		__entry->pid		= p->pid;
231 		__entry->prio		= p->prio; /* XXX SCHED_DEADLINE */
232 	),
233 
234 	TP_printk("comm=%s pid=%d prio=%d",
235 		  __entry->comm, __entry->pid, __entry->prio)
236 );
237 
238 /*
239  * Tracepoint for freeing a task:
240  */
241 DEFINE_EVENT(sched_process_template, sched_process_free,
242 	     TP_PROTO(struct task_struct *p),
243 	     TP_ARGS(p));
244 
245 /*
246  * Tracepoint for a task exiting:
247  */
248 DEFINE_EVENT(sched_process_template, sched_process_exit,
249 	     TP_PROTO(struct task_struct *p),
250 	     TP_ARGS(p));
251 
252 /*
253  * Tracepoint for waiting on task to unschedule:
254  */
255 DEFINE_EVENT(sched_process_template, sched_wait_task,
256 	TP_PROTO(struct task_struct *p),
257 	TP_ARGS(p));
258 
259 /*
260  * Tracepoint for a waiting task:
261  */
262 TRACE_EVENT(sched_process_wait,
263 
264 	TP_PROTO(struct pid *pid),
265 
266 	TP_ARGS(pid),
267 
268 	TP_STRUCT__entry(
269 		__array(	char,	comm,	TASK_COMM_LEN	)
270 		__field(	pid_t,	pid			)
271 		__field(	int,	prio			)
272 	),
273 
274 	TP_fast_assign(
275 		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
276 		__entry->pid		= pid_nr(pid);
277 		__entry->prio		= current->prio; /* XXX SCHED_DEADLINE */
278 	),
279 
280 	TP_printk("comm=%s pid=%d prio=%d",
281 		  __entry->comm, __entry->pid, __entry->prio)
282 );
283 
284 /*
285  * Tracepoint for do_fork:
286  */
287 TRACE_EVENT(sched_process_fork,
288 
289 	TP_PROTO(struct task_struct *parent, struct task_struct *child),
290 
291 	TP_ARGS(parent, child),
292 
293 	TP_STRUCT__entry(
294 		__array(	char,	parent_comm,	TASK_COMM_LEN	)
295 		__field(	pid_t,	parent_pid			)
296 		__array(	char,	child_comm,	TASK_COMM_LEN	)
297 		__field(	pid_t,	child_pid			)
298 	),
299 
300 	TP_fast_assign(
301 		memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN);
302 		__entry->parent_pid	= parent->pid;
303 		memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN);
304 		__entry->child_pid	= child->pid;
305 	),
306 
307 	TP_printk("comm=%s pid=%d child_comm=%s child_pid=%d",
308 		__entry->parent_comm, __entry->parent_pid,
309 		__entry->child_comm, __entry->child_pid)
310 );
311 
312 /*
313  * Tracepoint for exec:
314  */
315 TRACE_EVENT(sched_process_exec,
316 
317 	TP_PROTO(struct task_struct *p, pid_t old_pid,
318 		 struct linux_binprm *bprm),
319 
320 	TP_ARGS(p, old_pid, bprm),
321 
322 	TP_STRUCT__entry(
323 		__string(	filename,	bprm->filename	)
324 		__field(	pid_t,		pid		)
325 		__field(	pid_t,		old_pid		)
326 	),
327 
328 	TP_fast_assign(
329 		__assign_str(filename, bprm->filename);
330 		__entry->pid		= p->pid;
331 		__entry->old_pid	= old_pid;
332 	),
333 
334 	TP_printk("filename=%s pid=%d old_pid=%d", __get_str(filename),
335 		  __entry->pid, __entry->old_pid)
336 );
337 
338 
/*
 * The *_SCHEDSTAT wrappers expand to the regular event macros when
 * CONFIG_SCHEDSTATS is set and to the _NOP variants otherwise.
 */
#ifdef CONFIG_SCHEDSTATS
#define DEFINE_EVENT_SCHEDSTAT DEFINE_EVENT
#define DECLARE_EVENT_CLASS_SCHEDSTAT DECLARE_EVENT_CLASS
#else
#define DEFINE_EVENT_SCHEDSTAT DEFINE_EVENT_NOP
#define DECLARE_EVENT_CLASS_SCHEDSTAT DECLARE_EVENT_CLASS_NOP
#endif

347 /*
348  * XXX the below sched_stat tracepoints only apply to SCHED_OTHER/BATCH/IDLE
349  *     adding sched_stat support to SCHED_FIFO/RR would be welcome.
350  */
351 DECLARE_EVENT_CLASS_SCHEDSTAT(sched_stat_template,
352 
353 	TP_PROTO(struct task_struct *tsk, u64 delay),
354 
355 	TP_ARGS(__perf_task(tsk), __perf_count(delay)),
356 
357 	TP_STRUCT__entry(
358 		__array( char,	comm,	TASK_COMM_LEN	)
359 		__field( pid_t,	pid			)
360 		__field( u64,	delay			)
361 	),
362 
363 	TP_fast_assign(
364 		memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
365 		__entry->pid	= tsk->pid;
366 		__entry->delay	= delay;
367 	),
368 
369 	TP_printk("comm=%s pid=%d delay=%Lu [ns]",
370 			__entry->comm, __entry->pid,
371 			(unsigned long long)__entry->delay)
372 );
373 
374 /*
375  * Tracepoint for accounting wait time (time the task is runnable
376  * but not actually running due to scheduler contention).
377  */
378 DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_wait,
379 	     TP_PROTO(struct task_struct *tsk, u64 delay),
380 	     TP_ARGS(tsk, delay));
381 
382 /*
383  * Tracepoint for accounting sleep time (time the task is not runnable,
384  * including iowait, see below).
385  */
386 DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_sleep,
387 	     TP_PROTO(struct task_struct *tsk, u64 delay),
388 	     TP_ARGS(tsk, delay));
389 
390 /*
391  * Tracepoint for accounting iowait time (time the task is not runnable
392  * due to waiting on IO to complete).
393  */
394 DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_iowait,
395 	     TP_PROTO(struct task_struct *tsk, u64 delay),
396 	     TP_ARGS(tsk, delay));
397 
398 /*
399  * Tracepoint for accounting blocked time (time the task is in uninterruptible).
400  */
401 DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_blocked,
402 	     TP_PROTO(struct task_struct *tsk, u64 delay),
403 	     TP_ARGS(tsk, delay));
404 
405 /*
406  * Tracepoint for accounting runtime (time the task is executing
407  * on a CPU).
408  */
409 DECLARE_EVENT_CLASS(sched_stat_runtime,
410 
411 	TP_PROTO(struct task_struct *tsk, u64 runtime, u64 vruntime),
412 
413 	TP_ARGS(tsk, __perf_count(runtime), vruntime),
414 
415 	TP_STRUCT__entry(
416 		__array( char,	comm,	TASK_COMM_LEN	)
417 		__field( pid_t,	pid			)
418 		__field( u64,	runtime			)
419 		__field( u64,	vruntime			)
420 	),
421 
422 	TP_fast_assign(
423 		memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
424 		__entry->pid		= tsk->pid;
425 		__entry->runtime	= runtime;
426 		__entry->vruntime	= vruntime;
427 	),
428 
429 	TP_printk("comm=%s pid=%d runtime=%Lu [ns] vruntime=%Lu [ns]",
430 			__entry->comm, __entry->pid,
431 			(unsigned long long)__entry->runtime,
432 			(unsigned long long)__entry->vruntime)
433 );
434 
435 DEFINE_EVENT(sched_stat_runtime, sched_stat_runtime,
436 	     TP_PROTO(struct task_struct *tsk, u64 runtime, u64 vruntime),
437 	     TP_ARGS(tsk, runtime, vruntime));
438 
439 /*
440  * Tracepoint for showing priority inheritance modifying a tasks
441  * priority.
442  */
443 TRACE_EVENT(sched_pi_setprio,
444 
445 	TP_PROTO(struct task_struct *tsk, struct task_struct *pi_task),
446 
447 	TP_ARGS(tsk, pi_task),
448 
449 	TP_STRUCT__entry(
450 		__array( char,	comm,	TASK_COMM_LEN	)
451 		__field( pid_t,	pid			)
452 		__field( int,	oldprio			)
453 		__field( int,	newprio			)
454 	),
455 
456 	TP_fast_assign(
457 		memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
458 		__entry->pid		= tsk->pid;
459 		__entry->oldprio	= tsk->prio;
460 		__entry->newprio	= pi_task ?
461 				min(tsk->normal_prio, pi_task->prio) :
462 				tsk->normal_prio;
463 		/* XXX SCHED_DEADLINE bits missing */
464 	),
465 
466 	TP_printk("comm=%s pid=%d oldprio=%d newprio=%d",
467 			__entry->comm, __entry->pid,
468 			__entry->oldprio, __entry->newprio)
469 );
470 
#ifdef CONFIG_DETECT_HUNG_TASK
/*
 * Tracepoint recording comm and pid of @tsk; only built when
 * CONFIG_DETECT_HUNG_TASK is set.
 * NOTE(review): presumably emitted when a task is detected as hung --
 * confirm against the call site.
 */
TRACE_EVENT(sched_process_hang,
	TP_PROTO(struct task_struct *tsk),
	TP_ARGS(tsk),

	TP_STRUCT__entry(
		__array( char,	comm,	TASK_COMM_LEN	)
		__field( pid_t,	pid			)
	),

	TP_fast_assign(
		memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
		__entry->pid = tsk->pid;
	),

	TP_printk("comm=%s pid=%d", __entry->comm, __entry->pid)
);
#endif /* CONFIG_DETECT_HUNG_TASK */

490 DECLARE_EVENT_CLASS(sched_move_task_template,
491 
492 	TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu),
493 
494 	TP_ARGS(tsk, src_cpu, dst_cpu),
495 
496 	TP_STRUCT__entry(
497 		__field( pid_t,	pid			)
498 		__field( pid_t,	tgid			)
499 		__field( pid_t,	ngid			)
500 		__field( int,	src_cpu			)
501 		__field( int,	src_nid			)
502 		__field( int,	dst_cpu			)
503 		__field( int,	dst_nid			)
504 	),
505 
506 	TP_fast_assign(
507 		__entry->pid		= task_pid_nr(tsk);
508 		__entry->tgid		= task_tgid_nr(tsk);
509 		__entry->ngid		= task_numa_group_id(tsk);
510 		__entry->src_cpu	= src_cpu;
511 		__entry->src_nid	= cpu_to_node(src_cpu);
512 		__entry->dst_cpu	= dst_cpu;
513 		__entry->dst_nid	= cpu_to_node(dst_cpu);
514 	),
515 
516 	TP_printk("pid=%d tgid=%d ngid=%d src_cpu=%d src_nid=%d dst_cpu=%d dst_nid=%d",
517 			__entry->pid, __entry->tgid, __entry->ngid,
518 			__entry->src_cpu, __entry->src_nid,
519 			__entry->dst_cpu, __entry->dst_nid)
520 );
521 
522 /*
523  * Tracks migration of tasks from one runqueue to another. Can be used to
524  * detect if automatic NUMA balancing is bouncing between nodes
525  */
526 DEFINE_EVENT(sched_move_task_template, sched_move_numa,
527 	TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu),
528 
529 	TP_ARGS(tsk, src_cpu, dst_cpu)
530 );
531 
532 DEFINE_EVENT(sched_move_task_template, sched_stick_numa,
533 	TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu),
534 
535 	TP_ARGS(tsk, src_cpu, dst_cpu)
536 );
537 
538 TRACE_EVENT(sched_swap_numa,
539 
540 	TP_PROTO(struct task_struct *src_tsk, int src_cpu,
541 		 struct task_struct *dst_tsk, int dst_cpu),
542 
543 	TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu),
544 
545 	TP_STRUCT__entry(
546 		__field( pid_t,	src_pid			)
547 		__field( pid_t,	src_tgid		)
548 		__field( pid_t,	src_ngid		)
549 		__field( int,	src_cpu			)
550 		__field( int,	src_nid			)
551 		__field( pid_t,	dst_pid			)
552 		__field( pid_t,	dst_tgid		)
553 		__field( pid_t,	dst_ngid		)
554 		__field( int,	dst_cpu			)
555 		__field( int,	dst_nid			)
556 	),
557 
558 	TP_fast_assign(
559 		__entry->src_pid	= task_pid_nr(src_tsk);
560 		__entry->src_tgid	= task_tgid_nr(src_tsk);
561 		__entry->src_ngid	= task_numa_group_id(src_tsk);
562 		__entry->src_cpu	= src_cpu;
563 		__entry->src_nid	= cpu_to_node(src_cpu);
564 		__entry->dst_pid	= task_pid_nr(dst_tsk);
565 		__entry->dst_tgid	= task_tgid_nr(dst_tsk);
566 		__entry->dst_ngid	= task_numa_group_id(dst_tsk);
567 		__entry->dst_cpu	= dst_cpu;
568 		__entry->dst_nid	= cpu_to_node(dst_cpu);
569 	),
570 
571 	TP_printk("src_pid=%d src_tgid=%d src_ngid=%d src_cpu=%d src_nid=%d dst_pid=%d dst_tgid=%d dst_ngid=%d dst_cpu=%d dst_nid=%d",
572 			__entry->src_pid, __entry->src_tgid, __entry->src_ngid,
573 			__entry->src_cpu, __entry->src_nid,
574 			__entry->dst_pid, __entry->dst_tgid, __entry->dst_ngid,
575 			__entry->dst_cpu, __entry->dst_nid)
576 );
577 
578 /*
579  * Tracepoint for waking a polling cpu without an IPI.
580  */
581 TRACE_EVENT(sched_wake_idle_without_ipi,
582 
583 	TP_PROTO(int cpu),
584 
585 	TP_ARGS(cpu),
586 
587 	TP_STRUCT__entry(
588 		__field(	int,	cpu	)
589 	),
590 
591 	TP_fast_assign(
592 		__entry->cpu	= cpu;
593 	),
594 
595 	TP_printk("cpu=%d", __entry->cpu)
596 );
597 
598 /*
599  * Following tracepoints are not exported in tracefs and provide hooking
600  * mechanisms only for testing and debugging purposes.
601  *
602  * Postfixed with _tp to make them easily identifiable in the code.
603  */
604 DECLARE_TRACE(pelt_cfs_tp,
605 	TP_PROTO(struct cfs_rq *cfs_rq),
606 	TP_ARGS(cfs_rq));
607 
608 DECLARE_TRACE(pelt_rt_tp,
609 	TP_PROTO(struct rq *rq),
610 	TP_ARGS(rq));
611 
612 DECLARE_TRACE(pelt_dl_tp,
613 	TP_PROTO(struct rq *rq),
614 	TP_ARGS(rq));
615 
616 DECLARE_TRACE(pelt_irq_tp,
617 	TP_PROTO(struct rq *rq),
618 	TP_ARGS(rq));
619 
620 DECLARE_TRACE(pelt_se_tp,
621 	TP_PROTO(struct sched_entity *se),
622 	TP_ARGS(se));
623 
624 DECLARE_TRACE(sched_overutilized_tp,
625 	TP_PROTO(struct root_domain *rd, bool overutilized),
626 	TP_ARGS(rd, overutilized));
627 
628 #endif /* _TRACE_SCHED_H */
629 
630 /* This part must be outside protection */
631 #include <trace/define_trace.h>
632