/*
 * Copyright (c) 2019 Intel Corporation
 *
 * SPDX-License-Identifier: Apache-2.0
 */

#include <zephyr/kernel.h>
#include <zephyr/sys/printk.h>
#include <wait_q.h>
#include <ksched.h>

/* This is a scheduler microbenchmark, designed to measure latencies
 * of specific low level scheduling primitives independent of overhead
 * from application or API abstractions. It works very simply: a main
 * thread creates a "partner" thread at a higher priority, and the
 * partner then sleeps using z_pend_curr(). From this initial state:
 *
 * 1. The main thread calls z_unpend_first_thread()
 * 2. The main thread calls z_ready_thread()
 * 3. The main thread calls k_yield()
 *    (the kernel switches to the partner thread)
 * 4. The partner thread then runs and calls z_pend_curr() again
 *    (the kernel switches to the main thread)
 * 5. The main thread returns from k_yield()
 *
 * The test then iterates this cycle many times, reporting the
 * latency between each numbered step, the latency of the whole
 * cycle, and a running average over all cycles run.
 */

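/* Number of measured iterations, plus a number of initial "settle"
 * iterations that are excluded from the running average.
 */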
#define N_RUNS 1000
#define N_SETTLE 10


static K_THREAD_STACK_DEFINE(partner_stack, 1024);
static struct k_thread partner_thread;

#if (CONFIG_MP_MAX_NUM_CPUS > 1)
static struct k_thread busy_thread[CONFIG_MP_MAX_NUM_CPUS - 1];

#define BUSY_THREAD_STACK_SIZE (1024 + CONFIG_TEST_EXTRA_STACK_SIZE)

static K_THREAD_STACK_ARRAY_DEFINE(busy_thread_stack, CONFIG_MP_MAX_NUM_CPUS - 1,
                                   BUSY_THREAD_STACK_SIZE);
#endif /* (CONFIG_MP_MAX_NUM_CPUS > 1) */

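/* Wait queue the partner thread pends on between measurement cycles */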
_wait_q_t waitq;

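/* One timestamp is taken at each step of the measured cycle, in the
 * order the steps occur; see the description at the top of the file.
 */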
enum {
        UNPENDING,
        UNPENDED_READYING,
        READIED_YIELDING,
        PARTNER_AWAKE_PENDING,
        YIELDED,
        NUM_STAMP_STATES
};

uint32_t stamps[NUM_STAMP_STATES];

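/* Lock handed to z_pend_curr(), which drops it as the partner pends */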
static struct k_spinlock lock;

static inline int _stamp(int state)
{
        uint32_t t;

        /* In theory the TSC has much lower overhead and higher
         * precision. In practice it's VERY jittery in recent qemu
         * versions and frankly too noisy to trust.
         */
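        /* Note: rdtsc returns the 64-bit TSC in EDX:EAX; only the low
         * 32 bits are kept here, which is sufficient for the small
         * deltas being measured.
         */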
#ifdef CONFIG_X86
        __asm__ volatile("rdtsc" : "=a"(t) : : "edx");
#else
        t = k_cycle_get_32();
#endif

        stamps[state] = t;
        return t;
}

/* #define stamp(s) printk("%s @ %d\n", #s, _stamp(s)) */
#define stamp(s) _stamp(s)

static void partner_fn(void *arg1, void *arg2, void *arg3)
{
        ARG_UNUSED(arg1);
        ARG_UNUSED(arg2);
        ARG_UNUSED(arg3);

        printk("Running %p\n", k_current_get());

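        /* Pend on the wait queue; when the main thread readies us and
         * yields, execution resumes here, the wakeup is timestamped,
         * and the loop immediately pends again, switching back to the
         * main thread.
         */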
        while (true) {
                k_spinlock_key_t key = k_spin_lock(&lock);

                z_pend_curr(&lock, key, &waitq, K_FOREVER);
                stamp(PARTNER_AWAKE_PENDING);
        }
}

#if (CONFIG_MP_MAX_NUM_CPUS > 1)
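/* Simple busy loop used to keep the additional CPUs occupied so that
 * they do not take part in the measured scheduling cycle.
 */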
static void busy_thread_entry(void *arg1, void *arg2, void *arg3)
{
        while (true) {
        }
}
#endif /* (CONFIG_MP_MAX_NUM_CPUS > 1) */

int main(void)
{
#if (CONFIG_MP_MAX_NUM_CPUS > 1)
        /* Spawn busy threads that will execute on the other cores */
        for (uint32_t i = 0; i < CONFIG_MP_MAX_NUM_CPUS - 1; i++) {
                k_thread_create(&busy_thread[i], busy_thread_stack[i],
                                BUSY_THREAD_STACK_SIZE, busy_thread_entry,
                                NULL, NULL, NULL,
                                K_HIGHEST_THREAD_PRIO, 0, K_NO_WAIT);
        }
#endif /* (CONFIG_MP_MAX_NUM_CPUS > 1) */

        z_waitq_init(&waitq);

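        /* The partner runs at a numerically lower (i.e. higher)
         * priority than main, so once readied it gets the CPU as soon
         * as main yields.
         */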
        int main_prio = k_thread_priority_get(k_current_get());
        int partner_prio = main_prio - 1;

        k_tid_t th = k_thread_create(&partner_thread, partner_stack,
                                     K_THREAD_STACK_SIZEOF(partner_stack),
                                     partner_fn, NULL, NULL, NULL,
                                     partner_prio, 0, K_NO_WAIT);

        /* Let it start running and pend */
        k_sleep(K_MSEC(100));

        uint64_t tot = 0U;
        uint32_t runs = 0U;

        int key;

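        /* Each iteration pulls the partner off the wait queue, marks
         * it runnable, and yields to it; the stamps bracket each of
         * those primitives plus the switch back from the partner.
         */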
        for (int i = 0; i < N_RUNS + N_SETTLE; i++) {
                key = arch_irq_lock();
                stamp(UNPENDING);
                z_unpend_first_thread(&waitq);
                arch_irq_unlock(key);
                stamp(UNPENDED_READYING);
                z_ready_thread(th);
                stamp(READIED_YIELDING);

                /* z_ready_thread() does not reschedule, so this is
                 * guaranteed to be the point where we will yield to
                 * the new thread, which (being higher priority) will
                 * run immediately, and we'll wake up synchronously as
                 * soon as it pends.
                 */
                k_yield();
                stamp(YIELDED);

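                /* stamps[] indices follow the enum above:
                 * 0 = UNPENDING ... 4 = YIELDED
                 */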
                uint32_t avg, whole = stamps[4] - stamps[0];

                if (++runs > N_SETTLE) {
                        /* Only compute averages after the first
                         * N_SETTLE runs, to let performance settle;
                         * cache effects in the host pollute the
                         * early data.
                         */
                        tot += whole;
                        avg = tot / (runs - N_SETTLE);
                } else {
                        tot = 0U;
                        avg = 0U;
                }

                /* For reference, with an unmodified HEAD on qemu_x86,
                 * built with !USERSPACE and SCHED_DUMB and run with
                 * -icount shift=0,sleep=off,align=off, I get results
                 * of:
                 *
                 * unpend 132 ready 257 switch 278 pend 321 tot 988 (avg 900)
                 */
                printk("unpend %4d ready %4d switch %4d pend %4d tot %4d (avg %4d)\n",
                       stamps[1] - stamps[0],
                       stamps[2] - stamps[1],
                       stamps[3] - stamps[2],
                       stamps[4] - stamps[3],
                       whole, avg);
        }
        printk("fin\n");
        return 0;
}