1/*
2 * Copyright (c) 2017 Intel Corporation
3 *
4 * SPDX-License-Identifier: Apache-2.0
5 */
6
7#include <zephyr/toolchain.h>
8#include <zephyr/arch/cpu.h>
9#include <offsets_short.h>
10#include <zephyr/syscall.h>
11#include <zephyr/kernel/mm.h>
12
#ifdef CONFIG_X86_KPTI
/* Copy interrupt return stack context to the trampoline stack, switch back
 * to the user page table, and only then 'iretq'. We jump to this instead
 * of executing 'iretq' directly if KPTI is turned on. This must be invoked
 * with interrupts locked.
 *
 * Stack layout on entry is expected to be what 'iretq' expects, which is as
 * follows (byte offsets from RSP):
 *
 * 32 SS
 * 24 RSP
 * 16 RFLAGS
 * 8  CS
 * 0  RIP
 *
 * RDI and RAX are used as scratch registers and are restored to their
 * incoming values before the 'iretq'.
 */
.global z_x86_trampoline_to_user
z_x86_trampoline_to_user:
	/* Stash RDI, need a free register. From here on every entry of the
	 * incoming frame sits 8 bytes higher than in the table above.
	 */
	pushq	%rdi

	/* Store old stack pointer and switch to trampoline stack */
	movq	%rsp, %rdi
	movq	%gs:__x86_tss64_t_ist2_OFFSET, %rsp

	/* Copy context, highest slot first so the copy ends up in the same
	 * order 'iretq' expects
	 */
	pushq	40(%rdi)	/* SS */
	pushq	32(%rdi)	/* RSP */
	pushq	24(%rdi)	/* RFLAGS */
	pushq	16(%rdi)	/* CS */
	pushq	8(%rdi)		/* RIP */
	xchgq	%rdi, (%rdi)	/* Exchange to restore the stashed RDI; the
				 * slot it was stashed in on the old stack is
				 * overwritten with the old stack pointer
				 * value that RDI was holding
				 */

	/* Switch to thread's page table */
	pushq	%rax
	movq	%gs:__x86_tss64_t_cpu_OFFSET, %rax
	movq	___cpu_t_current_OFFSET(%rax), %rax
	movq	_thread_offset_to_ptables(%rax), %rax
	movq	%rax, %cr3
	popq	%rax
	movq	$0, -8(%rsp)	/* Delete stashed RAX data */

	/* Trampoline stack should have nothing sensitive in it at this point */
	swapgs
	iretq
#endif /* CONFIG_X86_KPTI */
59
60
/* Landing site for 'syscall' instruction
 *
 * Call id is in RAX
 * Arguments are in RDI, RSI, RDX, R10, R8, R9
 * Return address stored by CPU in RCX
 * User RFLAGS stored by CPU in R11
 * Current RFLAGS has been masked with ~X86_FMASK_MSR
 *
 * On return to user mode RAX holds the marshal function's return value;
 * RDI, RSI, RDX, R8, R9, R10, RCX, R11, the FXSAVE state and the user
 * stack pointer are restored to the values saved below.
 */
.global z_x86_syscall_entry_stub
z_x86_syscall_entry_stub:
	swapgs

	/* Save original stack pointer from user mode in memory, at the
	 * moment we have no free registers or stack to save it to. This
	 * eventually gets put on the stack before we re-enable interrupts
	 * as this is a per-cpu and not per-thread area.
	 */
	movq	%rsp, %gs:__x86_tss64_t_usp_OFFSET

#ifdef CONFIG_X86_KPTI
	/* We need to switch to the trampoline stack so that we can
	 * switch to the kernel's page table
	 */
	movq	%gs:__x86_tss64_t_ist2_OFFSET, %rsp

	/* Load kernel's page table */
	pushq	%rax

	/* NOTE: Presumes phys=virt */
	movq	$K_MEM_PHYS_ADDR(z_x86_kernel_ptables), %rax
	movq	%rax, %cr3
	popq	%rax
	movq	$0, -8(%rsp)	/* Delete stashed RAX data */
#endif /* CONFIG_X86_KPTI */

	/* Switch to the privilege mode stack pointer stored in
	 * x86_tss64.psp
	 */
	movq	%gs:__x86_tss64_t_psp_OFFSET, %rsp

	/* We're now on the privilege mode stack; push the old user stack
	 * pointer onto it
	 */
	pushq	%gs:__x86_tss64_t_usp_OFFSET
#ifdef CONFIG_X86_KPTI
	/* Don't leave the user stack pointer behind in the per-cpu area */
	movq	$0, %gs:__x86_tss64_t_usp_OFFSET
#endif

	sti			/* re-enable interrupts */

	/* call_id is in RAX. bounds-check it, must be less than
	 * K_SYSCALL_LIMIT. The comparison is unsigned ('jae').
	 */
	cmp	$K_SYSCALL_LIMIT, %rax
	jae	_bad_syscall

_id_ok:
#ifdef CONFIG_X86_BOUNDS_CHECK_BYPASS_MITIGATION
	/* Prevent speculation with bogus system call IDs */
	lfence
#endif

	/* Remaining registers not involved in the syscall operation are
	 * RBX, RBP, R12-R15, plus floating point / SIMD registers.
	 *
	 * We save caller-saved registers so we can restore to original values
	 * when we call 'sysretq' at the end.
	 *
	 * The pops after the call mirror these pushes in reverse order; any
	 * change here must be matched there.
	 */
	pushq	%rdi
	subq	$X86_FXSAVE_SIZE, %rsp
	fxsave	(%rsp)
	pushq	%rsi
	pushq	%rdx
	pushq	%r8
	pushq	%r9
	pushq	%r10
	pushq	%r11	/* RFLAGS */
	pushq	%rcx	/* Return address stored by 'syscall' */
	pushq	%rsp	/* SSF parameter: the value pushed is RSP as it was
			 * before this push, i.e. the address of the saved
			 * RCX slot just above
			 */

	/* All other args are in the right registers, except arg4 which
	 * we had to put in r10 instead of RCX
	 */
	movq	%r10, %rcx

	/* from the call ID in RAX, load R10 with the actual function pointer
	 * to call by looking it up in the system call dispatch table.
	 * R11 is zeroed so it contributes nothing as the base register;
	 * entries are 8 bytes wide.
	 */
	xorq	%r11, %r11
	movq	_k_syscall_table(%r11, %rax, 8), %r10

	/* Run the marshal function, which is some entry in _k_syscall_table */
	call	*%r10

	/* RAX now contains the return value
	 *
	 * Callee-saved registers are un-touched from original values per C
	 * calling convention, but sensitive data may lurk in caller-saved regs
	 * RDI, RSI, RDX, R8, R9, R10, XMM* after we have serviced the system
	 * call.  We saved them earlier, restore their original values when
	 * the syscall was made. This also preserves these registers if they
	 * were not used as arguments.
	 *
	 * We also can't have RCX and R11 clobbered as we need the original
	 * values to successfully 'sysretq'.
	 */
	addq	$8, %rsp	/* Discard ssf */
	popq	%rcx	/* Restore return address for 'sysretq' */
	popq	%r11	/* Restore RFLAGS for 'sysretq' */
	popq	%r10
	popq	%r9
	popq	%r8
	popq	%rdx
	popq	%rsi
	fxrstor	(%rsp)
	addq	$X86_FXSAVE_SIZE, %rsp
	popq	%rdi

#ifdef CONFIG_X86_KPTI
	/* Lock IRQs as we are using per-cpu memory areas and the
	 * trampoline stack
	 */
	cli

	/* Stash user stack pointer and switch to trampoline stack */
	popq	%gs:__x86_tss64_t_usp_OFFSET
	movq	%gs:__x86_tss64_t_ist2_OFFSET, %rsp

	/* Switch to thread's page table */
	pushq	%rax
	movq	%gs:__x86_tss64_t_cpu_OFFSET, %rax
	movq	___cpu_t_current_OFFSET(%rax), %rax
	movq	_thread_offset_to_ptables(%rax), %rax
	movq	%rax, %cr3
	popq	%rax
	movq	$0, -8(%rsp)	/* Delete stashed RAX data */

	/* Restore saved user stack pointer */
	movq	%gs:__x86_tss64_t_usp_OFFSET, %rsp
	movq	$0, %gs:__x86_tss64_t_usp_OFFSET
#else
	/* Restore user stack pointer */
	popq	%rsp

	/* Return to user mode, locking interrupts as the normal interrupt
	 * handling path will get very confused if it occurs between
	 * 'swapgs' and 'sysretq'
	 */
	cli
#endif /* CONFIG_X86_KPTI */

	swapgs
	sysretq

_bad_syscall:
	/* RAX had a bogus syscall value in it, replace with the bad syscall
	 * handler's ID, and put the bad ID as its first argument.
	 *
	 * Note that RDI is overwritten here before _id_ok saves it, so on
	 * this path the RDI slot on the stack holds the bad ID rather than
	 * the caller's original RDI.
	 *
	 * TODO: On this and all other arches, simply immediately return
	 * with -ENOSYS, once all syscalls have a return value
	 */
	movq	%rax, %rdi
	movq	$K_SYSCALL_BAD, %rax
	jmp	_id_ok
225
/*
 * size_t arch_user_string_nlen(const char *s, size_t maxsize, int *err_arg)
 *                              ^ RDI          ^ RSI           ^ RDX
 *
 * Counts the bytes of 's' that precede the first NUL, stopping once the
 * count reaches 'maxsize'. The count is returned in RAX.
 *
 * *err_arg is set to 0 when the scan finishes normally. It is set to -1
 * when execution enters at z_x86_user_string_nlen_fixup with R8D still
 * holding its initial value.
 *
 * NOTE(review): presumably the page fault handler resumes execution at
 * z_x86_user_string_nlen_fixup when the faulting instruction lies between
 * the fault_start and fault_end labels; that handler is not in this file,
 * confirm there.
 *
 * The byte at s[RAX] is read before RAX is compared with 'maxsize', so
 * s[maxsize] itself may be read.
 *
 * Clobbers RAX, R8 and flags.
 */
.global arch_user_string_nlen
arch_user_string_nlen:
	/* Initial error value, strlen_done adjusts this if we succeed */
	movl	$-1, %r8d

	/* use RAX as our length count (this function's return value) */
	xor	%rax, %rax

	/* This code might page fault */
strlen_loop:
	/* The two exported labels below bracket exactly one instruction,
	 * the only user memory access in this routine
	 */
.global z_x86_user_string_nlen_fault_start
z_x86_user_string_nlen_fault_start:
	cmpb	$0x0, (%rdi, %rax, 1)	/* *(RDI + RAX) == 0? Could fault. */

.global z_x86_user_string_nlen_fault_end
z_x86_user_string_nlen_fault_end:
	je	strlen_done
	cmp	%rsi, %rax		/* Max length reached? */
	je	strlen_done
	inc	%rax			/* RAX++ and loop again */
	jmp	strlen_loop

strlen_done:
	/* Set error value to 0 since we succeeded */
	xorl	%r8d, %r8d

	/* Reached by falling through from strlen_done with R8D == 0 */
.global z_x86_user_string_nlen_fixup
z_x86_user_string_nlen_fixup:
	/* Write error value to 32-bit integer err pointer parameter */
	movl	%r8d, (%rdx)
	retq
261
/*
 * Trampoline function to put the p3 parameter in the register expected
 * by the calling convention, we couldn't use RCX when we called 'sysret'
 * because 'sysret' takes its return address from RCX. p3 arrives in R10
 * instead.
 *
 * This label is not exported; within this file it is only referenced by
 * z_x86_userspace_enter, which loads it as the 'sysretq' target.
 */
z_x86_userspace_landing_site:
	/* Place argument 4 in the correct position */
	movq	%r10, %rcx
	/* No instruction follows this call, so a return from
	 * z_thread_entry would fall through into whatever is assembled
	 * next. NOTE(review): presumably z_thread_entry never returns;
	 * confirm against its declaration.
	 */
	call	z_thread_entry
270
/* FUNC_NORETURN void z_x86_userspace_enter(
 *		k_thread_entry_t user_entry,	<- RDI
 *		void *p1, void *p2, void *p3,	<- RSI, RDX, RCX
 *		uintptr_t stack_end,		<- R8
 *		uintptr_t stack_start)		<- R9
 *
 * A one-way trip to userspace: leaves via 'sysretq' to
 * z_x86_userspace_landing_site with RDI, RSI, RDX carrying user_entry,
 * p1, p2 and R10 carrying p3.
 */
.global z_x86_userspace_enter
z_x86_userspace_enter:
	/* RCX is sysret return address, pass along p3 in r10,
	 * z_x86_userspace_landing_site will fix this up
	 */
	movq	%rcx, %r10

	/* switch to privilege mode stack so we can erase thread stack buffer,
	 * the buffer is the page immediately before the thread stack
	 */
	movq	%r9, %rsp

	/* Preserve the argument registers we still need across the call;
	 * they are caller-saved, so the C code is free to clobber them.
	 * Then go back into C code to erase the stack buffer and set US bit
	 * in page tables for it.
	 *
	 * NOTE(review): five 8-byte pushes leave RSP at stack_start - 40
	 * when the call is made. If stack_start is 16-byte aligned, RSP is
	 * not 16-byte aligned at the call as the SysV ABI expects; confirm
	 * whether the callee depends on that.
	 */
	pushq	%rdx
	pushq	%rsi
	pushq	%rdi
	pushq	%r8
	pushq	%r10
	callq	z_x86_current_stack_perms
	popq	%r10
	popq	%r8
	popq	%rdi
	popq	%rsi
	popq	%rdx

	/* Reset to the beginning of the user stack */
	movq	%r8, %rsp

	/* set sysret entry point */
	movq	$z_x86_userspace_landing_site, %rcx

	/* Copy RFLAGS into r11, required by sysret. The pushed slot is kept
	 * on the user stack and reused below.
	 */
	pushfq
	movq	(%rsp), %r11
	movq	$0, (%rsp)	/* Now a debugger-friendly return address */

	/* cleanse other registers */
	xorq	%rbx, %rbx
	xorq	%rbp, %rbp
	xorq	%r12, %r12
	xorq	%r13, %r13
	xorq	%r14, %r14
	xorq	%r15, %r15

	/* Interrupts stay locked from here through 'swapgs' and 'sysretq' */
	cli

#ifdef CONFIG_X86_KPTI
	/* Switch to thread's page table. We have free registers so no need
	 * to involve the trampoline stack.
	 */
	movq	%gs:__x86_tss64_t_cpu_OFFSET, %rax
	movq	___cpu_t_current_OFFSET(%rax), %rax
	movq	_thread_offset_to_ptables(%rax), %rax
	movq	%rax, %cr3
#endif
	swapgs
	sysretq
338