/*
 * Copyright (c) 2017 Intel Corporation
 *
 * SPDX-License-Identifier: Apache-2.0
 */

#include <zephyr/toolchain.h>
#include <zephyr/arch/cpu.h>
#include <offsets_short.h>
#include <zephyr/syscall.h>
#include <zephyr/kernel/mm.h>

#ifdef CONFIG_X86_KPTI
/* Copy interrupt return stack context to the trampoline stack, switch back
 * to the user page table, and only then 'iretq'. We jump to this instead
 * of executing 'iretq' directly if KPTI is turned on. This must be invoked
 * with interrupts locked.
 *
 * Stack layout on entry is expected to be what 'iretq' expects, which is
 * as follows (byte offsets from RSP):
 *
 * 32 SS
 * 24 RSP
 * 16 RFLAGS
 * 8  CS
 * 0  RIP
 *
 * All general purpose registers are preserved.
 */
.global z_x86_trampoline_to_user
z_x86_trampoline_to_user:
	/* Stash RDI, need a free register */
	pushq	%rdi

	/* Store old stack pointer and switch to trampoline stack */
	movq	%rsp, %rdi
	movq	%gs:__x86_tss64_t_ist2_OFFSET, %rsp

	/* Copy context. Offsets are 8 larger than in the layout above
	 * because of the RDI stash that was just pushed.
	 */
	pushq	40(%rdi)	/* SS */
	pushq	32(%rdi)	/* RSP */
	pushq	24(%rdi)	/* RFLAGS */
	pushq	16(%rdi)	/* CS */
	pushq	8(%rdi)		/* RIP */

	/* RDI currently holds the old stack pointer, which points at the
	 * stash slot. The exchange restores RDI and overwrites the slot
	 * with the old stack pointer value, so the stashed RDI contents do
	 * not linger on the old stack.
	 */
	xchgq	%rdi, (%rdi)

	/* Switch to thread's page table */
	pushq	%rax
	movq	%gs:__x86_tss64_t_cpu_OFFSET, %rax
	movq	___cpu_t_current_OFFSET(%rax), %rax
	movq	_thread_offset_to_ptables(%rax), %rax
	movq	%rax, %cr3
	popq	%rax
	movq	$0, -8(%rsp)	/* Delete stashed RAX data */

	/* Trampoline stack should have nothing sensitive in it at this point */
	swapgs
	iretq
#endif /* CONFIG_X86_KPTI */


/* Landing site for 'syscall' instruction
 *
 * Call id is in RAX
 * Arguments are in RDI, RSI, RDX, R10, R8, R9
 * Return address stored by CPU in RCX
 * User RFLAGS stored by CPU in R11
 * Current RFLAGS has been masked with ~X86_FMASK_MSR
 *
 * On return to user mode RAX holds the system call's return value; every
 * other register saved below is restored to the value it had on entry.
 */
.global z_x86_syscall_entry_stub
z_x86_syscall_entry_stub:
	swapgs

	/* Save original stack pointer from user mode in memory, at the
	 * moment we have no free registers or stack to save it to. This
	 * eventually gets put on the stack before we re-enable interrupts
	 * as this is a per-cpu and not per-thread area.
	 */
	movq	%rsp, %gs:__x86_tss64_t_usp_OFFSET

#ifdef CONFIG_X86_KPTI
	/* We need to switch to the trampoline stack so that we can
	 * switch to the kernel's page table
	 */
	movq	%gs:__x86_tss64_t_ist2_OFFSET, %rsp

	/* Load kernel's page table */
	pushq	%rax

	/* NOTE: Presumes phys=virt */
	movq	$K_MEM_PHYS_ADDR(z_x86_kernel_ptables), %rax
	movq	%rax, %cr3
	popq	%rax
	movq	$0, -8(%rsp)	/* Delete stashed RAX data */
#endif /* CONFIG_X86_KPTI */

	/* Switch to the privilege mode stack pointer stored in
	 * x86_tss64.psp
	 */
	movq	%gs:__x86_tss64_t_psp_OFFSET, %rsp

	/* We're now on the privilege mode stack; push the old user stack
	 * pointer onto it
	 */
	pushq	%gs:__x86_tss64_t_usp_OFFSET
#ifdef CONFIG_X86_KPTI
	/* Do not leave the user stack pointer behind in the per-cpu area */
	movq	$0, %gs:__x86_tss64_t_usp_OFFSET
#endif

	sti			/* re-enable interrupts */

	/* call_id is in RAX. bounds-check it, must be less than
	 * K_SYSCALL_LIMIT. The comparison is unsigned.
	 */
	cmp	$K_SYSCALL_LIMIT, %rax
	jae	_bad_syscall

_id_ok:
#ifdef CONFIG_X86_BOUNDS_CHECK_BYPASS_MITIGATION
	/* Prevent speculation with bogus system call IDs */
	lfence
#endif

	/* Remaining registers not involved in the syscall operation are
	 * RBX, RBP, R12-R15, plus floating point / SIMD registers.
	 *
	 * We save caller-saved registers so we can restore to original values
	 * when we call 'sysretq' at the end.
	 *
	 * The pops after the call below mirror this sequence exactly, in
	 * reverse order; keep the two in sync.
	 */
	pushq	%rdi
	subq	$X86_FXSAVE_SIZE, %rsp
	fxsave	(%rsp)
	pushq	%rsi
	pushq	%rdx
	pushq	%r8
	pushq	%r9
	pushq	%r10
	pushq	%r11	/* RFLAGS */
	pushq	%rcx	/* Return address stored by 'syscall' */
	pushq	%rsp	/* SSF parameter */

	/* All other args are in the right registers, except arg4 which
	 * we had to put in r10 instead of RCX
	 */
	movq	%r10, %rcx

	/* from the call ID in RAX, load R10 with the actual function pointer
	 * to call by looking it up in the system call dispatch table
	 */
	xorq	%r11, %r11
	movq	_k_syscall_table(%r11, %rax, 8), %r10

	/* Run the marshal function, which is some entry in _k_syscall_table */
	call	*%r10

	/* RAX now contains the return value
	 *
	 * Callee-saved registers are un-touched from original values per C
	 * calling convention, but sensitive data may lurk in caller-saved regs
	 * RDI, RSI, RDX, R8, R9, R10, XMM* after we have serviced the system
	 * call. We saved them earlier, restore their original values when
	 * the syscall was made. This also preserves these registers if they
	 * were not used as arguments.
	 *
	 * We also can't have RCX and R11 clobbered as we need the original
	 * values to successfully 'sysretq'.
	 */
	addq	$8, %rsp	/* Discard ssf */
	popq	%rcx		/* Restore return address for 'sysretq' */
	popq	%r11		/* Restore RFLAGS for 'sysretq' */
	popq	%r10
	popq	%r9
	popq	%r8
	popq	%rdx
	popq	%rsi
	fxrstor	(%rsp)
	addq	$X86_FXSAVE_SIZE, %rsp
	popq	%rdi

#ifdef CONFIG_X86_KPTI
	/* Lock IRQs as we are using per-cpu memory areas and the
	 * trampoline stack
	 */
	cli

	/* Stash user stack pointer and switch to trampoline stack */
	popq	%gs:__x86_tss64_t_usp_OFFSET
	movq	%gs:__x86_tss64_t_ist2_OFFSET, %rsp

	/* Switch to thread's page table */
	pushq	%rax
	movq	%gs:__x86_tss64_t_cpu_OFFSET, %rax
	movq	___cpu_t_current_OFFSET(%rax), %rax
	movq	_thread_offset_to_ptables(%rax), %rax
	movq	%rax, %cr3
	popq	%rax
	movq	$0, -8(%rsp)	/* Delete stashed RAX data */

	/* Restore saved user stack pointer */
	movq	%gs:__x86_tss64_t_usp_OFFSET, %rsp
	movq	$0, %gs:__x86_tss64_t_usp_OFFSET
#else
	/* Restore user stack pointer */
	popq	%rsp

	/* Return to user mode, locking interrupts as the normal interrupt
	 * handling path will get very confused if it occurs between
	 * 'swapgs' and 'sysretq'
	 */
	cli
#endif /* CONFIG_X86_KPTI */

	swapgs
	sysretq

_bad_syscall:
	/* RAX had a bogus syscall value in it, replace with the bad syscall
	 * handler's ID, and put the bad ID as its first argument.
	 *
	 * TODO: On this and all other arches, simply immediately return
	 * with -ENOSYS, once all syscalls have a return value
	 */
	movq	%rax, %rdi
	movq	$K_SYSCALL_BAD, %rax
	jmp	_id_ok

/*
 * size_t arch_user_string_nlen(const char *s, size_t maxsize, int *err_arg)
 *                                          ^ RDI         ^ RSI       ^ RDX
 *
 * Returns in RAX the index of the first NUL byte in s, or maxsize if no
 * NUL byte is found in the first maxsize bytes. At most maxsize bytes of
 * s are read; in particular nothing is read when maxsize is 0.
 *
 * *err_arg is set to 0 when the scan completes. The only instruction that
 * dereferences s sits between z_x86_user_string_nlen_fault_start and
 * z_x86_user_string_nlen_fault_end. If execution is resumed at
 * z_x86_user_string_nlen_fixup instead (presumably done by the page fault
 * handler, which is not part of this file), *err_arg is set to -1 and RAX
 * holds the number of bytes successfully scanned.
 *
 * Clobbers: RAX, R8, flags
 */
.global arch_user_string_nlen
arch_user_string_nlen:
	/* Initial error value, strlen_done adjusts this if we succeed */
	movl	$-1, %r8d

	/* use RAX as our length count (this function's return value) */
	xor	%rax, %rax

strlen_loop:
	/* Check the limit before touching memory, so that the byte at
	 * s[maxsize] is never read. Reading it could fault, and report
	 * an error, on a buffer whose maxsize bytes are all accessible.
	 */
	cmp	%rsi, %rax		/* Max length reached? */
	je	strlen_done

	/* This code might page fault */
.global z_x86_user_string_nlen_fault_start
z_x86_user_string_nlen_fault_start:
	cmpb	$0x0, (%rdi, %rax, 1)	/* *(RDI + RAX) == 0? Could fault. */

.global z_x86_user_string_nlen_fault_end
z_x86_user_string_nlen_fault_end:
	je	strlen_done
	inc	%rax			/* RAX++ and loop again */
	jmp	strlen_loop

strlen_done:
	/* Set error value to 0 since we succeeded */
	xorl	%r8d, %r8d

.global z_x86_user_string_nlen_fixup
z_x86_user_string_nlen_fixup:
	/* Write error value to 32-bit integer err pointer parameter */
	movl	%r8d, (%rdx)
	retq

/*
 * Trampoline function to put the p3 parameter in the register expected
 * by the calling convention, we couldn't use RCX when we called 'sysret'
 * as RCX carries the 'sysret' return address.
 */
z_x86_userspace_landing_site:
	/* Place argument 4 in the correct position */
	movq	%r10, %rcx
	/* NOTE(review): nothing follows this call, so z_thread_entry is
	 * presumably not expected to return - confirm.
	 */
	call	z_thread_entry

/* FUNC_NORETURN void z_x86_userspace_enter(
 *		k_thread_entry_t user_entry,	<- RDI
 *		void *p1, void *p2, void *p3,	<- RSI, RDX, RCX
 *		uintptr_t stack_end,		<- R8
 *		uintptr_t stack_start)		<- R9
 *
 * A one-way trip to userspace.
 */
.global z_x86_userspace_enter
z_x86_userspace_enter:
	/* RCX is sysret return address, pass along p3 in r10,
	 * z_x86_userspace_landing_site will fix this up
	 */
	movq	%rcx, %r10

	/* switch to privilege mode stack so we can erase thread stack buffer,
	 * the buffer is the page immediately before the thread stack
	 */
	movq	%r9, %rsp

	/* Preserve the argument registers, which are caller-saved and may
	 * therefore be clobbered by the callee, and go back into C code to
	 * erase the stack buffer and set US bit in page tables for it
	 */
	pushq	%rdx
	pushq	%rsi
	pushq	%rdi
	pushq	%r8
	pushq	%r10
	callq	z_x86_current_stack_perms
	popq	%r10
	popq	%r8
	popq	%rdi
	popq	%rsi
	popq	%rdx

	/* Reset to the beginning of the user stack */
	movq	%r8, %rsp

	/* set sysret entry point */
	movq	$z_x86_userspace_landing_site, %rcx

	/* Copy RFLAGS into r11, required by sysret. The pushed slot is
	 * deliberately left on the stack and zeroed.
	 */
	pushfq
	movq	(%rsp), %r11
	movq	$0, (%rsp)	/* Now a debugger-friendly return address */

	/* cleanse other registers */
	xorq	%rbx, %rbx
	xorq	%rbp, %rbp
	xorq	%r12, %r12
	xorq	%r13, %r13
	xorq	%r14, %r14
	xorq	%r15, %r15

	cli

#ifdef CONFIG_X86_KPTI
	/* Switch to thread's page table. We have free registers so no need
	 * to involve the trampoline stack.
	 */
	movq	%gs:__x86_tss64_t_cpu_OFFSET, %rax
	movq	___cpu_t_current_OFFSET(%rax), %rax
	movq	_thread_offset_to_ptables(%rax), %rax
	movq	%rax, %cr3
#endif
	swapgs
	sysretq