/*
 * Copyright (c) 2019 Intel Corporation
 *
 * SPDX-License-Identifier: Apache-2.0
 */

#include <zephyr/kernel.h>
#include <zephyr/sys/printk.h>
#include <wait_q.h>
#include <ksched.h>
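/* Note: wait_q.h and ksched.h are internal kernel headers; the
 * benchmark deliberately reaches below the public API so it can
 * exercise the scheduler primitives directly.
 */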

/* This is a scheduler microbenchmark, designed to measure latencies
 * of specific low-level scheduling primitives independent of overhead
 * from application or API abstractions.  It works very simply: a main
 * thread creates a "partner" thread at a higher priority, and the
 * partner then sleeps using z_pend_curr().  From this initial state:
 *
 * 1. The main thread calls z_unpend_first_thread()
 * 2. The main thread calls z_ready_thread()
 * 3. The main thread calls k_yield()
 *    (the kernel switches to the partner thread)
 * 4. The partner thread then runs and calls z_pend_curr() again
 *    (the kernel switches to the main thread)
 * 5. The main thread returns from k_yield()
 *
 * It then iterates this cycle many times, reporting the timestamp
 * latency between each numbered step and for the whole cycle, plus a
 * running average over all cycles run.
 */

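/* N_RUNS cycles are measured after N_SETTLE warm-up cycles, which are
 * excluded from the running average (see the loop below).
 */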
#define N_RUNS 1000
#define N_SETTLE 10

static K_THREAD_STACK_DEFINE(partner_stack, 1024);
static struct k_thread partner_thread;

#if (CONFIG_MP_MAX_NUM_CPUS > 1)
static struct k_thread busy_thread[CONFIG_MP_MAX_NUM_CPUS - 1];

#define BUSY_THREAD_STACK_SIZE  (1024 + CONFIG_TEST_EXTRA_STACK_SIZE)

static K_THREAD_STACK_ARRAY_DEFINE(busy_thread_stack, CONFIG_MP_MAX_NUM_CPUS - 1,
				   BUSY_THREAD_STACK_SIZE);
#endif /* (CONFIG_MP_MAX_NUM_CPUS > 1) */

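/* Wait queue on which the partner thread pends between cycles */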
_wait_q_t waitq;

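/* Timestamp slots, one per numbered step in the cycle described above */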
enum {
	UNPENDING,
	UNPENDED_READYING,
	READIED_YIELDING,
	PARTNER_AWAKE_PENDING,
	YIELDED,
	NUM_STAMP_STATES
};

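/* Cycle-counter snapshots taken at each state transition */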
uint32_t stamps[NUM_STAMP_STATES];

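/* Spinlock handed to z_pend_curr(), which releases it atomically
 * while the partner thread pends
 */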
static struct k_spinlock lock;

static inline int _stamp(int state)
{
	uint32_t t;

	/* In theory the TSC has much lower overhead and higher
	 * precision.  In practice it's VERY jittery in recent qemu
	 * versions and frankly too noisy to trust.
	 */
#ifdef CONFIG_X86
	__asm__ volatile("rdtsc" : "=a"(t) : : "edx");
#else
	t = k_cycle_get_32();
#endif

	stamps[state] = t;
	return t;
}

/* #define stamp(s) printk("%s @ %d\n", #s, _stamp(s)) */
#define stamp(s) _stamp(s)

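/* Partner thread: wakes when the main thread readies it, stamps the
 * wakeup, and immediately pends again.
 */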
static void partner_fn(void *arg1, void *arg2, void *arg3)
{
	ARG_UNUSED(arg1);
	ARG_UNUSED(arg2);
	ARG_UNUSED(arg3);

	printk("Running %p\n", k_current_get());

	while (true) {
		k_spinlock_key_t key = k_spin_lock(&lock);

		z_pend_curr(&lock, key, &waitq, K_FOREVER);
		stamp(PARTNER_AWAKE_PENDING);
	}
}

#if (CONFIG_MP_MAX_NUM_CPUS > 1)
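/* Spin forever: keeps the other CPUs occupied so the measured
 * main/partner interaction stays on a single CPU.
 */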
static void busy_thread_entry(void *arg1, void *arg2, void *arg3)
{
	while (true) {
	}
}
#endif /* (CONFIG_MP_MAX_NUM_CPUS > 1) */

int main(void)
{
#if (CONFIG_MP_MAX_NUM_CPUS > 1)
	/* Spawn busy threads that will execute on the other cores */
	for (uint32_t i = 0; i < CONFIG_MP_MAX_NUM_CPUS - 1; i++) {
		k_thread_create(&busy_thread[i], busy_thread_stack[i],
				BUSY_THREAD_STACK_SIZE, busy_thread_entry,
				NULL, NULL, NULL,
				K_HIGHEST_THREAD_PRIO, 0, K_NO_WAIT);
	}
#endif /* (CONFIG_MP_MAX_NUM_CPUS > 1) */

	z_waitq_init(&waitq);

	int main_prio = k_thread_priority_get(k_current_get());
	int partner_prio = main_prio - 1;

	k_tid_t th = k_thread_create(&partner_thread, partner_stack,
				     K_THREAD_STACK_SIZEOF(partner_stack),
				     partner_fn, NULL, NULL, NULL,
				     partner_prio, 0, K_NO_WAIT);

	/* Let it start running and pend */
	k_sleep(K_MSEC(100));

	uint64_t tot = 0U;
	uint32_t runs = 0U;

	unsigned int key;

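	/* N_SETTLE warm-up cycles followed by N_RUNS measured cycles */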
	for (int i = 0; i < N_RUNS + N_SETTLE; i++) {
		key = arch_irq_lock();
		stamp(UNPENDING);
		z_unpend_first_thread(&waitq);
		arch_irq_unlock(key);
		stamp(UNPENDED_READYING);
		z_ready_thread(th);
		stamp(READIED_YIELDING);

		/* z_ready_thread() does not reschedule, so this is
		 * guaranteed to be the point where we will yield to
		 * the new thread, which (being higher priority) will
		 * run immediately, and we'll wake up synchronously as
		 * soon as it pends.
		 */
		k_yield();
		stamp(YIELDED);

		uint32_t avg, whole = stamps[YIELDED] - stamps[UNPENDING];

		if (++runs > N_SETTLE) {
			/* Only compute the average after the first
			 * N_SETTLE runs, to let performance settle;
			 * cache effects in the host pollute the
			 * early data.
			 */
			tot += whole;
			avg = tot / (runs - N_SETTLE);
		} else {
			tot = 0U;
			avg = 0U;
		}

		/* For reference: on an unmodified HEAD on qemu_x86,
		 * with !USERSPACE and SCHED_DUMB, and using -icount
		 * shift=0,sleep=off,align=off, I get results of:
		 *
		 * unpend 132 ready 257 switch 278 pend 321 tot 988 (avg 900)
		 */
		printk("unpend %4d ready %4d switch %4d pend %4d tot %4d (avg %4d)\n",
		       stamps[UNPENDED_READYING] - stamps[UNPENDING],
		       stamps[READIED_YIELDING] - stamps[UNPENDED_READYING],
		       stamps[PARTNER_AWAKE_PENDING] - stamps[READIED_YIELDING],
		       stamps[YIELDED] - stamps[PARTNER_AWAKE_PENDING],
		       whole, avg);
	}
	printk("fin\n");
	return 0;
}