/*
 * Copyright (c) 2021 Intel Corporation
 * SPDX-License-Identifier: Apache-2.0
 */
#include <zephyr/init.h>
#include <zephyr/drivers/timer/system_timer.h>
#include <zephyr/sys_clock.h>
#include <zephyr/spinlock.h>
#include <zephyr/drivers/interrupt_controller/loapic.h>
#include <zephyr/irq.h>

#define IA32_TSC_DEADLINE_MSR 0x6e0
#define IA32_TSC_ADJUST_MSR   0x03b

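/* TSC cycles in one kernel tick.  The 64 bit cast appears intended
 * to keep the whole expression in 64 bit arithmetic; e.g. a 1 GHz
 * TSC at 100 ticks/sec gives 10000000 cycles per tick.
 */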
#define CYC_PER_TICK (CONFIG_SYS_CLOCK_HW_CYCLES_PER_SEC \
		      / (uint64_t) CONFIG_SYS_CLOCK_TICKS_PER_SEC)

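/* Bitfield layout of the local APIC timer LVT register: vector in
 * bits 0-7, mask in bit 16, timer mode in bits 17-18.
 */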
struct apic_timer_lvt {
	uint8_t vector : 8;
	uint8_t unused0 : 8;
	uint8_t masked : 1;
	enum { ONE_SHOT, PERIODIC, TSC_DEADLINE } mode: 2;
	uint32_t unused2 : 13;
};

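/* last_announce is the TSC value at the most recent tick boundary
 * reported via sys_clock_announce(); all deadline arithmetic is done
 * relative to it, under the spinlock.
 */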
static struct k_spinlock lock;
static uint64_t last_announce;
static union { uint32_t val; struct apic_timer_lvt lvt; } lvt_reg;

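/* Read the 64 bit timestamp counter: RDTSC returns the high half in
 * EDX and the low half in EAX.
 */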
static ALWAYS_INLINE uint64_t rdtsc(void)
{
	uint32_t hi, lo;

	__asm__ volatile("rdtsc" : "=d"(hi), "=a"(lo));
	return lo + (((uint64_t)hi) << 32);
}

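/* Timer ISR: count the whole ticks elapsed since the last
 * announcement, advance last_announce by exactly that many ticks,
 * then report the count to the kernel.
 */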
static void isr(const void *arg)
{
	ARG_UNUSED(arg);
	k_spinlock_key_t key = k_spin_lock(&lock);
	uint32_t ticks = (rdtsc() - last_announce) / CYC_PER_TICK;

	last_announce += ticks * CYC_PER_TICK;
	k_spin_unlock(&lock, key);
	sys_clock_announce(ticks);

	if (!IS_ENABLED(CONFIG_TICKLESS_KERNEL)) {
		sys_clock_set_timeout(1, false);
	}
}

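/* WRMSR writes EDX:EAX to the MSR selected by ECX. */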
static inline void wrmsr(int32_t msr, uint64_t val)
{
	uint32_t hi = (uint32_t) (val >> 32);
	uint32_t lo = (uint32_t) val;

	__asm__ volatile("wrmsr" :: "d"(hi), "a"(lo), "c"(msr));
}

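/* Program the next timer interrupt: compute a deadline from the
 * current TSC, round it up to a tick boundary relative to
 * last_announce, and write it to the TSC_DEADLINE MSR.
 */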
void sys_clock_set_timeout(int32_t ticks, bool idle)
{
	ARG_UNUSED(idle);

	uint64_t now = rdtsc();
	k_spinlock_key_t key = k_spin_lock(&lock);
	uint64_t expires = now + MAX(ticks - 1, 0) * CYC_PER_TICK;

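	/* Round the deadline up to the next tick boundary measured
	 * from last_announce (a ceiling division by CYC_PER_TICK),
	 * so interrupts always land on tick alignment.
	 */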
	expires = last_announce + (((expires - last_announce + CYC_PER_TICK - 1)
				    / CYC_PER_TICK) * CYC_PER_TICK);

	/* The second condition is to catch the wraparound.
	 * Interpreted strictly, the IA SDM description of the
	 * TSC_DEADLINE MSR implies that it will trigger an immediate
	 * interrupt if we try to set an expiration across the 64 bit
	 * rollover.  Unfortunately there's no way to test that, as on
	 * real hardware it requires more than a century of uptime,
	 * but this is cheap and safe.
	 */
	if (ticks == K_TICKS_FOREVER || expires < last_announce) {
		expires = UINT64_MAX;
	}

	wrmsr(IA32_TSC_DEADLINE_MSR, expires);
	k_spin_unlock(&lock, key);
}

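/* Ticks elapsed since the last sys_clock_announce() call. */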
uint32_t sys_clock_elapsed(void)
{
	k_spinlock_key_t key = k_spin_lock(&lock);
	uint32_t ret = (rdtsc() - last_announce) / CYC_PER_TICK;

	k_spin_unlock(&lock, key);
	return ret;
}

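/* The kernel cycle counters are just the raw TSC. */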
uint32_t sys_clock_cycle_get_32(void)
{
	return (uint32_t) rdtsc();
}

uint64_t sys_clock_cycle_get_64(void)
{
	return rdtsc();
}

static inline uint32_t timer_irq(void)
{
	/* The Zephyr APIC API is... idiosyncratic.  The timer is a
	 * "local vector table" interrupt.  These aren't system IRQs
	 * presented to the IO-APIC, they're indices into a register
	 * array in the local APIC.  By Zephyr convention they come
	 * after all the external IO-APIC interrupts, but that number
	 * changes depending on device configuration, so we have to
	 * fetch it at runtime.  The timer happens to be the first
	 * entry in the table.
	 */
	return z_loapic_irq_base();
}

/* The TSC_ADJUST MSR implements a synchronized offset such that
 * multiple CPUs (within a socket, anyway) can synchronize exactly, or
 * implement managed timing spaces for guests in a recoverable way,
 * etc...  We set it to zero on all cores for simplicity, because
 * firmware often leaves it in an inconsistent state between cores.
 */
static void clear_tsc_adjust(void)
{
	/* But don't touch it on ACRN, where a hypervisor bug
	 * confuses the APIC emulation and deadline interrupts don't
	 * arrive.
	 */
#ifndef CONFIG_BOARD_ACRN
	wrmsr(IA32_TSC_ADJUST_MSR, 0);
#endif
}

void smp_timer_init(void)
{
	/* Copy the LVT configuration from CPU0, because IRQ_CONNECT()
	 * doesn't know how to manage LVT interrupts for anything
	 * other than the calling/initial CPU.  The same fence is
	 * needed to prevent later MSR writes from reordering before
	 * the APIC configuration write.
	 */
	x86_write_loapic(LOAPIC_TIMER, lvt_reg.val);
	__asm__ volatile("mfence" ::: "memory");
	clear_tsc_adjust();
	irq_enable(timer_irq());
}

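/* CPUID wrapper: pass the leaf in *eax and the subleaf in *ecx; the
 * registers CPUID returns are written back through the four pointers.
 */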
static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
{
	/* CPUID writes all four of EAX/EBX/ECX/EDX, so EAX and ECX
	 * need read/write ("+") constraints, not input-only ones.
	 */
	__asm__ volatile("cpuid"
			 : "+a"(*eax), "=b"(*ebx), "+c"(*ecx), "=d"(*edx));
}

static int sys_clock_driver_init(void)
{
#ifdef CONFIG_ASSERT
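	/* Sanity-check the CPUID feature bits this driver relies on:
	 * leaf 1 ECX[24] is TSC-deadline, leaf 0x80000007 EDX[8] is
	 * the invariant TSC, and leaf 7 EBX[1] is the IA32_TSC_ADJUST
	 * MSR.
	 */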
	uint32_t eax, ebx, ecx, edx;

	eax = 1; ecx = 0;
	cpuid(&eax, &ebx, &ecx, &edx);
	__ASSERT((ecx & BIT(24)) != 0, "No TSC Deadline support");

	eax = 0x80000007; ecx = 0;
	cpuid(&eax, &ebx, &ecx, &edx);
	__ASSERT((edx & BIT(8)) != 0, "No Invariant TSC support");

	eax = 7; ecx = 0;
	cpuid(&eax, &ebx, &ecx, &edx);
	__ASSERT((ebx & BIT(1)) != 0, "No TSC_ADJUST MSR support");
#endif

	clear_tsc_adjust();

	/* Timer interrupt number is runtime-fetched, so can't use
	 * static IRQ_CONNECT()
	 */
	irq_connect_dynamic(timer_irq(), CONFIG_APIC_TIMER_IRQ_PRIORITY, isr, 0, 0);

	lvt_reg.val = x86_read_loapic(LOAPIC_TIMER);
	lvt_reg.lvt.mode = TSC_DEADLINE;
	lvt_reg.lvt.masked = 0;
	x86_write_loapic(LOAPIC_TIMER, lvt_reg.val);

	/* Per the SDM, the TSC_DEADLINE MSR is not serializing, so
	 * this fence is needed to be sure that an upcoming MSR write
	 * (i.e. a timeout we're about to set) cannot possibly reorder
	 * around the initialization we just did.
	 */
	__asm__ volatile("mfence" ::: "memory");

	last_announce = rdtsc();
	irq_enable(timer_irq());

	if (!IS_ENABLED(CONFIG_TICKLESS_KERNEL)) {
		sys_clock_set_timeout(1, false);
	}

	return 0;
}

SYS_INIT(sys_clock_driver_init, PRE_KERNEL_2,
	 CONFIG_SYSTEM_CLOCK_INIT_PRIORITY);