1 /*
2 * Copyright (c) 2021 Intel Corporation
3 * SPDX-License-Identifier: Apache-2.0
4 */
5
6 #include <cpuid.h> /* Header provided by the toolchain. */
7
8 #include <zephyr/init.h>
9 #include <zephyr/arch/x86/cpuid.h>
10 #include <zephyr/drivers/timer/system_timer.h>
11 #include <zephyr/sys_clock.h>
12 #include <zephyr/spinlock.h>
13 #include <zephyr/drivers/interrupt_controller/loapic.h>
14 #include <zephyr/irq.h>
15
16 /*
17 * This driver is selected when either CONFIG_APIC_TIMER_TSC or
18 * CONFIG_APIC_TSC_DEADLINE_TIMER is selected. The later is preferred over
19 * the former when the TSC deadline comparator is available.
20 */
21 BUILD_ASSERT((!IS_ENABLED(CONFIG_APIC_TIMER_TSC) &&
22 IS_ENABLED(CONFIG_APIC_TSC_DEADLINE_TIMER)) ||
23 (!IS_ENABLED(CONFIG_APIC_TSC_DEADLINE_TIMER) &&
24 IS_ENABLED(CONFIG_APIC_TIMER_TSC)),
25 "one of CONFIG_APIC_TIMER_TSC or CONFIG_APIC_TSC_DEADLINE_TIMER must be set");
26
27 /*
28 * If the TSC deadline comparator is not supported then the ICR in one-shot
29 * mode is used as a fallback method to trigger the next timeout interrupt.
30 * Those config symbols must then be defined:
31 *
32 * CONFIG_APIC_TIMER_TSC_N=<n>
33 * CONFIG_APIC_TIMER_TSC_M=<m>
34 *
35 * These are set to indicate the ratio of the TSC frequency to the local
36 * APIC timer frequency. This can be found via CPUID 0x15 (n = EBX, m = EAX)
37 * on most CPUs.
38 */
/*
 * Scaling factors applied by set_trigger() when converting TSC cycles to
 * APIC timer counts (count = cycles * M / N). They come from Kconfig when
 * the ICR fallback is in use and collapse to 1:1 otherwise.
 */
#if defined(CONFIG_APIC_TIMER_TSC)
#define APIC_TIMER_TSC_N	CONFIG_APIC_TIMER_TSC_N
#define APIC_TIMER_TSC_M	CONFIG_APIC_TIMER_TSC_M
#else /* !CONFIG_APIC_TIMER_TSC */
#define APIC_TIMER_TSC_N	1
#define APIC_TIMER_TSC_M	1
#endif /* CONFIG_APIC_TIMER_TSC */
46
/* Model-specific registers written by this driver (see wrmsr()). */
#define IA32_TSC_DEADLINE_MSR	0x6e0	/* programmed by set_trigger() */
#define IA32_TSC_ADJUST_MSR	0x03b	/* zeroed by clear_tsc_adjust() */

/*
 * Number of hardware clock cycles in one kernel tick, as a 32-bit value.
 *
 * The whole expansion is wrapped in parentheses so the macro behaves as a
 * single primary expression wherever it is used (e.g. as a sizeof operand
 * or next to a postfix operator), not as a cast followed by a
 * parenthesized expression.
 */
#define CYC_PER_TICK ((uint32_t)(CONFIG_SYS_CLOCK_HW_CYCLES_PER_SEC \
				 / CONFIG_SYS_CLOCK_TICKS_PER_SEC))
52
/*
 * Cycle deltas are narrowed to this type before being divided by
 * CYC_PER_TICK: using unsigned long keeps the division operands at the
 * native CPU register width.
 */
typedef unsigned long cycle_diff_t;

/* largest value representable in a cycle_diff_t (all bits set) */
#define CYCLE_DIFF_MAX (~(cycle_diff_t)0)
56
/*
 * Upper bound on how many cycles past last_cycle a timeout may be
 * programmed. It is built up in steps:
 *
 *   CYCLES_MAX_1  sys_clock_announce() accepts at most INT32_MAX ticks,
 *                 so this is that many ticks expressed in cycles.
 *
 *   CYCLES_MAX_2  the number of cycles between two reports must fit in a
 *                 cycle_diff_t variable before it is converted to ticks.
 *
 *   CYCLES_MAX_3  the smaller of the two limits above.
 *
 *   CYCLES_MAX_4  3/4 of that range, which leaves some room for the
 *                 unavoidable IRQ servicing latency.
 *
 * CYCLES_MAX finally adds the LSB value of CYCLES_MAX_4 to it. This clears
 * out the bunch of consecutive set bits inherited from the original max
 * values and produces a nicer literal for assembly generation.
 */
#define CYCLES_MAX_1	((uint64_t)CYC_PER_TICK * (uint64_t)INT32_MAX)
#define CYCLES_MAX_2	((uint64_t)CYCLE_DIFF_MAX)
#define CYCLES_MAX_3	MIN(CYCLES_MAX_1, CYCLES_MAX_2)
#define CYCLES_MAX_4	(CYCLES_MAX_3 / 2 + CYCLES_MAX_3 / 4)
#define CYCLES_MAX	(CYCLES_MAX_4 + LSB_GET(CYCLES_MAX_4))
81
/*
 * Bit-field view of the 32-bit value exchanged with the LOAPIC_TIMER
 * register (it is overlaid on a uint32_t in lvt_reg). The field widths
 * add up to exactly 32 bits; field order and widths must not change.
 */
struct apic_timer_lvt {
	uint8_t vector : 8;
	uint8_t unused0 : 8;
	/* cleared by sys_clock_driver_init() before the register is written */
	uint8_t masked : 1;
	/* set to TSC_DEADLINE or ONE_SHOT depending on the Kconfig choice */
	enum {
		ONE_SHOT,
		PERIODIC,
		TSC_DEADLINE
	} mode : 2;
	uint32_t unused2 : 13;
};
89
90 static struct k_spinlock lock;
91 static uint64_t last_cycle;
92 static uint64_t last_tick;
93 static uint32_t last_elapsed;
94 static union { uint32_t val; struct apic_timer_lvt lvt; } lvt_reg;
95
rdtsc(void)96 static ALWAYS_INLINE uint64_t rdtsc(void)
97 {
98 uint32_t hi, lo;
99
100 __asm__ volatile("rdtsc" : "=d"(hi), "=a"(lo));
101 return lo + (((uint64_t)hi) << 32);
102 }
103
/*
 * Write a 64-bit value to a model-specific register using WRMSR.
 *
 * @param msr index of the register, passed in ECX
 * @param val value to store; its upper half is passed in EDX and its
 *            lower half in EAX
 */
static inline void wrmsr(int32_t msr, uint64_t val)
{
	const uint32_t lower = (uint32_t)(val & 0xffffffffULL);
	const uint32_t upper = (uint32_t)(val >> 32);

	__asm__ volatile("wrmsr" :: "d"(upper), "a"(lower), "c"(msr));
}
111
set_trigger(uint64_t deadline)112 static void set_trigger(uint64_t deadline)
113 {
114 if (IS_ENABLED(CONFIG_APIC_TSC_DEADLINE_TIMER)) {
115 wrmsr(IA32_TSC_DEADLINE_MSR, deadline);
116 } else {
117 /* use the timer ICR to trigger next interrupt */
118 uint64_t curr_cycle = rdtsc();
119 uint64_t delta_cycles = deadline - MIN(deadline, curr_cycle);
120 uint64_t icr = (delta_cycles * APIC_TIMER_TSC_M) / APIC_TIMER_TSC_N;
121
122 /* cap icr to 32 bits, and not zero */
123 icr = CLAMP(icr, 1, UINT32_MAX);
124 x86_write_loapic(LOAPIC_TIMER_ICR, icr);
125 }
126 }
127
isr(const void * arg)128 static void isr(const void *arg)
129 {
130 ARG_UNUSED(arg);
131
132 k_spinlock_key_t key = k_spin_lock(&lock);
133 uint64_t curr_cycle = rdtsc();
134 uint64_t delta_cycles = curr_cycle - last_cycle;
135 uint32_t delta_ticks = (cycle_diff_t)delta_cycles / CYC_PER_TICK;
136
137 last_cycle += (cycle_diff_t)delta_ticks * CYC_PER_TICK;
138 last_tick += delta_ticks;
139 last_elapsed = 0;
140
141 if (!IS_ENABLED(CONFIG_TICKLESS_KERNEL)) {
142 uint64_t next_cycle = last_cycle + CYC_PER_TICK;
143
144 set_trigger(next_cycle);
145 }
146
147 k_spin_unlock(&lock, key);
148 sys_clock_announce(delta_ticks);
149 }
150
sys_clock_set_timeout(int32_t ticks,bool idle)151 void sys_clock_set_timeout(int32_t ticks, bool idle)
152 {
153 ARG_UNUSED(idle);
154
155 if (!IS_ENABLED(CONFIG_TICKLESS_KERNEL)) {
156 return;
157 }
158
159 k_spinlock_key_t key = k_spin_lock(&lock);
160 uint64_t next_cycle;
161
162 if (ticks == K_TICKS_FOREVER) {
163 next_cycle = last_cycle + CYCLES_MAX;
164 } else {
165 next_cycle = (last_tick + last_elapsed + ticks) * CYC_PER_TICK;
166 if ((next_cycle - last_cycle) > CYCLES_MAX) {
167 next_cycle = last_cycle + CYCLES_MAX;
168 }
169 }
170
171 /*
172 * Interpreted strictly, the IA SDM description of the
173 * TSC_DEADLINE MSR implies that it will trigger an immediate
174 * interrupt if we try to set an expiration across the 64 bit
175 * rollover. Unfortunately there's no way to test that as on
176 * real hardware it requires more than a century of uptime,
177 * but this is cheap and safe.
178 */
179 if (next_cycle < last_cycle) {
180 next_cycle = UINT64_MAX;
181 }
182 set_trigger(next_cycle);
183
184 k_spin_unlock(&lock, key);
185 }
186
sys_clock_elapsed(void)187 uint32_t sys_clock_elapsed(void)
188 {
189 if (!IS_ENABLED(CONFIG_TICKLESS_KERNEL)) {
190 return 0;
191 }
192
193 k_spinlock_key_t key = k_spin_lock(&lock);
194 uint64_t curr_cycle = rdtsc();
195 uint64_t delta_cycles = curr_cycle - last_cycle;
196 uint32_t delta_ticks = (cycle_diff_t)delta_cycles / CYC_PER_TICK;
197
198 last_elapsed = delta_ticks;
199 k_spin_unlock(&lock, key);
200 return delta_ticks;
201 }
202
/*
 * Return the current cycle counter as a 32-bit value.
 *
 * @return the low 32 bits of the TSC
 */
uint32_t sys_clock_cycle_get_32(void)
{
	uint64_t tsc = rdtsc();

	return (uint32_t)tsc;
}
207
/*
 * Return the current cycle counter at full width.
 *
 * @return the 64-bit TSC value
 */
uint64_t sys_clock_cycle_get_64(void)
{
	uint64_t tsc = rdtsc();

	return tsc;
}
212
/*
 * Return the IRQ number used for the local APIC timer.
 *
 * The Zephyr APIC API is... idiosyncratic. The timer is a "local vector
 * table" interrupt: not a system IRQ presented to the IO-APIC, but an
 * index into a register array in the local APIC. By Zephyr convention
 * those come after all the external IO-APIC interrupts, and since that
 * number changes with the device configuration it has to be fetched at
 * runtime. The timer happens to be the first entry in the table, hence
 * the base value is returned as is.
 */
static inline uint32_t timer_irq(void)
{
	uint32_t lvt_base = z_loapic_irq_base();

	return lvt_base;
}
226
227 /* The TSC_ADJUST MSR implements a synchronized offset such that
228 * multiple CPUs (within a socket, anyway) can synchronize exactly, or
229 * implement managed timing spaces for guests in a recoverable way,
230 * etc... We set it to zero on all cores for simplicity, because
231 * firmware often leaves it in an inconsistent state between cores.
232 */
clear_tsc_adjust(void)233 static void clear_tsc_adjust(void)
234 {
235 /* But don't touch it on ACRN, where an hypervisor bug
236 * confuses the APIC emulation and deadline interrupts don't
237 * arrive.
238 */
239 #ifndef CONFIG_BOARD_ACRN
240 wrmsr(IA32_TSC_ADJUST_MSR, 0);
241 #endif
242 }
243
smp_timer_init(void)244 void smp_timer_init(void)
245 {
246 /* Copy the LVT configuration from CPU0, because IRQ_CONNECT()
247 * doesn't know how to manage LVT interrupts for anything
248 * other than the calling/initial CPU. Same fence needed to
249 * prevent later MSR writes from reordering before the APIC
250 * configuration write.
251 */
252 x86_write_loapic(LOAPIC_TIMER, lvt_reg.val);
253 __asm__ volatile("mfence" ::: "memory");
254 clear_tsc_adjust();
255 irq_enable(timer_irq());
256 }
257
sys_clock_driver_init(void)258 static int sys_clock_driver_init(void)
259 {
260 #ifdef CONFIG_ASSERT
261 uint32_t eax, ebx, ecx, edx;
262
263 if (IS_ENABLED(CONFIG_APIC_TSC_DEADLINE_TIMER)) {
264 ecx = 0; /* prevent compiler warning */
265 __get_cpuid(CPUID_BASIC_INFO_1, &eax, &ebx, &ecx, &edx);
266 __ASSERT((ecx & BIT(24)) != 0, "No TSC Deadline support");
267 }
268
269 edx = 0; /* prevent compiler warning */
270 __get_cpuid(0x80000007, &eax, &ebx, &ecx, &edx);
271 __ASSERT((edx & BIT(8)) != 0, "No Invariant TSC support");
272
273 if (IS_ENABLED(CONFIG_SMP)) {
274 ebx = 0; /* prevent compiler warning */
275 __get_cpuid_count(CPUID_EXTENDED_FEATURES_LVL, 0, &eax, &ebx, &ecx, &edx);
276 __ASSERT((ebx & BIT(1)) != 0, "No TSC_ADJUST MSR support");
277 }
278 #endif
279
280 if (IS_ENABLED(CONFIG_SMP)) {
281 clear_tsc_adjust();
282 }
283
284 /* Timer interrupt number is runtime-fetched, so can't use
285 * static IRQ_CONNECT()
286 */
287 irq_connect_dynamic(timer_irq(), CONFIG_APIC_TIMER_IRQ_PRIORITY, isr, 0, 0);
288
289 if (IS_ENABLED(CONFIG_APIC_TIMER_TSC)) {
290 uint32_t timer_conf;
291
292 timer_conf = x86_read_loapic(LOAPIC_TIMER_CONFIG);
293 timer_conf &= ~0x0f; /* clear divider bits */
294 timer_conf |= 0x0b; /* divide by 1 */
295 x86_write_loapic(LOAPIC_TIMER_CONFIG, timer_conf);
296 }
297
298 lvt_reg.val = x86_read_loapic(LOAPIC_TIMER);
299 lvt_reg.lvt.mode = IS_ENABLED(CONFIG_APIC_TSC_DEADLINE_TIMER) ?
300 TSC_DEADLINE : ONE_SHOT;
301 lvt_reg.lvt.masked = 0;
302 x86_write_loapic(LOAPIC_TIMER, lvt_reg.val);
303
304 /* Per the SDM, the TSC_DEADLINE MSR is not serializing, so
305 * this fence is needed to be sure that an upcoming MSR write
306 * (i.e. a timeout we're about to set) cannot possibly reorder
307 * around the initialization we just did.
308 */
309 __asm__ volatile("mfence" ::: "memory");
310
311 last_tick = rdtsc() / CYC_PER_TICK;
312 last_cycle = last_tick * CYC_PER_TICK;
313 if (!IS_ENABLED(CONFIG_TICKLESS_KERNEL)) {
314 set_trigger(last_cycle + CYC_PER_TICK);
315 }
316 irq_enable(timer_irq());
317
318 return 0;
319 }
320
321 SYS_INIT(sys_clock_driver_init, PRE_KERNEL_2,
322 CONFIG_SYSTEM_CLOCK_INIT_PRIORITY);
323