1 /*
2 * Copyright (c) 2011-2014 Wind River Systems, Inc.
3 * Copyright (c) 2017-2020 Intel Corporation
4 *
5 * SPDX-License-Identifier: Apache-2.0
6 */
7
8 #include <zephyr/kernel.h>
9 #include <zephyr/arch/x86/mmustructs.h>
10 #include <zephyr/kernel/mm.h>
11 #include <zephyr/sys/__assert.h>
12 #include <zephyr/sys/check.h>
13 #include <zephyr/logging/log.h>
14 #include <errno.h>
15 #include <ctype.h>
16 #include <zephyr/spinlock.h>
17 #include <kernel_arch_func.h>
18 #include <x86_mmu.h>
19 #include <zephyr/init.h>
20 #include <kernel_internal.h>
21 #include <mmu.h>
22 #include <zephyr/drivers/interrupt_controller/loapic.h>
24 #include <zephyr/arch/x86/memmap.h>
25
26 LOG_MODULE_DECLARE(os, CONFIG_KERNEL_LOG_LEVEL);
27
28 /* We will use some ignored bits in the PTE to back up permission settings
29 * from when the mapping was made. This is used to un-apply memory domain
30 * partitions from page tables when the partitions are removed.
31 */
32 #define MMU_RW_ORIG MMU_IGNORED0
33 #define MMU_US_ORIG MMU_IGNORED1
34 #define MMU_XD_ORIG MMU_IGNORED2
35
36 /* Bits in the PTE that form the set of permission bits, when resetting */
37 #define MASK_PERM (MMU_RW | MMU_US | MMU_XD)
38
39 /* When we want to set up a new mapping, discarding any previous state */
40 #define MASK_ALL (~((pentry_t)0U))
41
42 /* Bits to set at mapping time for particular permissions. We set the actual
43 * page table bit enforcing the policy and also the backup bit.
44 */
45 #define ENTRY_RW (MMU_RW | MMU_RW_ORIG)
46 #define ENTRY_US (MMU_US | MMU_US_ORIG)
47 #define ENTRY_XD (MMU_XD | MMU_XD_ORIG)
48
49 /* Bit position which is always zero in a PTE. We'll use the PAT bit.
50 * This helps disambiguate PTEs that do not have the Present bit set (MMU_P):
51 * - If the entire entry is zero, it's an un-mapped virtual page
52 * - If PTE_ZERO is set, we flipped this page due to KPTI
53 * - Otherwise, this was a page-out
54 */
55 #define PTE_ZERO MMU_PAT
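
/* Illustrative interpretation of the above for a non-present 4K PTE
 * (MMU_PAT is bit 7 there; the values below are hypothetical):
 *   0x0                      -> virtual page was never mapped
 *   ~(kernel-only PTE)       -> flipped by KPTI; PAT was 0 in the original
 *                               entry, so the complement has PTE_ZERO set
 *   other value with P == 0  -> page was evicted; remaining bits hold OS
 *                               data such as the backing store location
 */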
56
57 /* Protects x86_domain_list and serializes instantiation of intermediate
58 * paging structures.
59 */
60 __pinned_bss
61 static struct k_spinlock x86_mmu_lock;
62
63 #if defined(CONFIG_USERSPACE) && !defined(CONFIG_X86_COMMON_PAGE_TABLE)
64 /* List of all active and initialized memory domains. This is used to make
65 * sure all memory mappings are the same across all page tables when invoking
66 * range_map()
67 */
68 __pinned_bss
69 static sys_slist_t x86_domain_list;
70 #endif
71
72 /*
73 * Definitions for building an ontology of paging levels and capabilities
74 * at each level
75 */
76
77 /* Data structure describing the characteristics of a particular paging
78 * level
79 */
80 struct paging_level {
81 /* What bits are used to store physical address */
82 pentry_t mask;
83
84 /* Number of entries in this paging structure */
85 size_t entries;
86
87 /* How many bits to right-shift a virtual address to obtain the
88 * appropriate entry within this table.
89 *
90 * The memory scope of each entry in this table is 1 << shift.
91 */
92 unsigned int shift;
93 #ifdef CONFIG_EXCEPTION_DEBUG
94 /* Name of this level, for debug purposes */
95 const char *name;
96 #endif
97 };
98
99 /* Flags for all entries in intermediate paging levels.
100 * Fortunately, the same bits are set for all intermediate levels for all
101 * three paging modes.
102 *
103 * Obviously P is set.
104 *
105 * We want RW and US bit always set; actual access control will be
106 * done at the leaf level.
107 *
108 * XD (if supported) always 0. Disabling execution done at leaf level.
109 *
110 * PCD/PWT always 0. Caching properties again done at leaf level.
111 */
112 #define INT_FLAGS (MMU_P | MMU_RW | MMU_US)
113
114 /* Paging level ontology for the selected paging mode.
115 *
116 * See Figures 4-4, 4-7, 4-11 in the Intel SDM, vol 3A
117 */
118 __pinned_rodata
119 static const struct paging_level paging_levels[] = {
120 #ifdef CONFIG_X86_64
121 /* Page Map Level 4 */
122 {
123 .mask = 0x7FFFFFFFFFFFF000ULL,
124 .entries = 512U,
125 .shift = 39U,
126 #ifdef CONFIG_EXCEPTION_DEBUG
127 .name = "PML4"
128 #endif
129 },
130 #endif /* CONFIG_X86_64 */
131 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
132 /* Page Directory Pointer Table */
133 {
134 .mask = 0x7FFFFFFFFFFFF000ULL,
135 #ifdef CONFIG_X86_64
136 .entries = 512U,
137 #else
138 /* PAE version */
139 .entries = 4U,
140 #endif
141 .shift = 30U,
142 #ifdef CONFIG_EXCEPTION_DEBUG
143 .name = "PDPT"
144 #endif
145 },
146 #endif /* CONFIG_X86_64 || CONFIG_X86_PAE */
147 /* Page Directory */
148 {
149 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
150 .mask = 0x7FFFFFFFFFFFF000ULL,
151 .entries = 512U,
152 .shift = 21U,
153 #else
154 /* 32-bit */
155 .mask = 0xFFFFF000U,
156 .entries = 1024U,
157 .shift = 22U,
158 #endif /* CONFIG_X86_64 || CONFIG_X86_PAE */
159 #ifdef CONFIG_EXCEPTION_DEBUG
160 .name = "PD"
161 #endif
162 },
163 /* Page Table */
164 {
165 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
166 .mask = 0x07FFFFFFFFFFF000ULL,
167 .entries = 512U,
168 .shift = 12U,
169 #else
170 /* 32-bit */
171 .mask = 0xFFFFF000U,
172 .entries = 1024U,
173 .shift = 12U,
174 #endif /* CONFIG_X86_64 || CONFIG_X86_PAE */
175 #ifdef CONFIG_EXCEPTION_DEBUG
176 .name = "PT"
177 #endif
178 }
179 };
180
181 #define NUM_LEVELS ARRAY_SIZE(paging_levels)
182 #define PTE_LEVEL (NUM_LEVELS - 1)
183 #define PDE_LEVEL (NUM_LEVELS - 2)
184
185 /*
186 * Macros for reserving space for page tables
187 *
188 * We need to reserve a block of memory equal in size to the page tables
189 * generated by gen_mmu.py so that memory addresses do not shift between
190 * build phases. These macros ultimately specify INITIAL_PAGETABLE_SIZE.
191 */
192 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
193 #ifdef CONFIG_X86_64
194 #define NUM_PML4_ENTRIES 512U
195 #define NUM_PDPT_ENTRIES 512U
196 #else
197 #define NUM_PDPT_ENTRIES 4U
198 #endif /* CONFIG_X86_64 */
199 #define NUM_PD_ENTRIES 512U
200 #define NUM_PT_ENTRIES 512U
201 #else
202 #define NUM_PD_ENTRIES 1024U
203 #define NUM_PT_ENTRIES 1024U
204 #endif /* !CONFIG_X86_64 && !CONFIG_X86_PAE */
205
206 /* Memory range covered by an instance of various table types */
207 #define PT_AREA ((uintptr_t)(CONFIG_MMU_PAGE_SIZE * NUM_PT_ENTRIES))
208 #define PD_AREA (PT_AREA * NUM_PD_ENTRIES)
209 #ifdef CONFIG_X86_64
210 #define PDPT_AREA (PD_AREA * NUM_PDPT_ENTRIES)
211 #endif
212
213 #define VM_ADDR CONFIG_KERNEL_VM_BASE
214 #define VM_SIZE CONFIG_KERNEL_VM_SIZE
215
216 /* Define a range [PT_START, PT_END) which is the memory range
217 * covered by all the page tables needed for the address space
218 */
219 #define PT_START ((uintptr_t)ROUND_DOWN(VM_ADDR, PT_AREA))
220 #define PT_END ((uintptr_t)ROUND_UP(VM_ADDR + VM_SIZE, PT_AREA))
221
222 /* Number of page tables needed to cover address space. Depends on the specific
223 * bounds, but roughly 1 page table per 2MB (64-bit/PAE) or 4MB (32-bit) of address space
224 */
225 #define NUM_PT ((PT_END - PT_START) / PT_AREA)
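
/* Worked example (hypothetical configuration, 64-bit or PAE paging):
 * with CONFIG_KERNEL_VM_BASE = 0x80000000 and CONFIG_KERNEL_VM_SIZE of
 * 128MB, PT_AREA = 4096 * 512 = 2MB, PT_START = 0x80000000 and
 * PT_END = 0x88000000, so NUM_PT = 0x8000000 / 0x200000 = 64 page tables.
 */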
226
227 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
228 /* Same semantics as above, but for the page directories needed to cover
229 * system RAM.
230 */
231 #define PD_START ((uintptr_t)ROUND_DOWN(VM_ADDR, PD_AREA))
232 #define PD_END ((uintptr_t)ROUND_UP(VM_ADDR + VM_SIZE, PD_AREA))
233 /* Number of page directories needed to cover the address space. Depends on the
234 * specific bounds, but roughly 1 page directory per 1GB of address space
235 */
236 #define NUM_PD ((PD_END - PD_START) / PD_AREA)
237 #else
238 /* 32-bit page tables just have one toplevel page directory */
239 #define NUM_PD 1
240 #endif
241
242 #ifdef CONFIG_X86_64
243 /* Same semantics as above, but for the page directory pointer tables needed
244 * to cover the address space. On 32-bit there is just one 4-entry PDPT.
245 */
246 #define PDPT_START ((uintptr_t)ROUND_DOWN(VM_ADDR, PDPT_AREA))
247 #define PDPT_END ((uintptr_t)ROUND_UP(VM_ADDR + VM_SIZE, PDPT_AREA))
248 /* Number of PDPTs needed to cover the address space. 1 PDPT per 512GB of VM */
249 #define NUM_PDPT ((PDPT_END - PDPT_START) / PDPT_AREA)
250
251 /* All pages needed for page tables, using computed values plus one more for
252 * the top-level PML4
253 */
254 #define NUM_TABLE_PAGES (NUM_PT + NUM_PD + NUM_PDPT + 1)
255 #else /* !CONFIG_X86_64 */
256 /* All pages needed for page tables; no extra page is needed for the top level here */
257 #define NUM_TABLE_PAGES (NUM_PT + NUM_PD)
258 #endif /* CONFIG_X86_64 */
259
260 #define INITIAL_PTABLE_PAGES \
261 (NUM_TABLE_PAGES + CONFIG_X86_EXTRA_PAGE_TABLE_PAGES)
262
263 #ifdef CONFIG_X86_PAE
264 /* The top-level PAE PDPT is not page-sized (4 entries * 8 bytes = 0x20), so add it separately */
265 #define INITIAL_PTABLE_SIZE \
266 ((INITIAL_PTABLE_PAGES * CONFIG_MMU_PAGE_SIZE) + 0x20)
267 #else
268 #define INITIAL_PTABLE_SIZE \
269 (INITIAL_PTABLE_PAGES * CONFIG_MMU_PAGE_SIZE)
270 #endif
271
272 /* "dummy" pagetables for the first-phase build. The real page tables
273 * are produced by gen_mmu.py based on data read from zephyr-prebuilt.elf,
274 * and this dummy array is discarded.
275 */
276 Z_GENERIC_SECTION(.dummy_pagetables)
277 static __used char dummy_pagetables[INITIAL_PTABLE_SIZE];
278
279 /*
280 * Utility functions
281 */
282
283 /* For a table at a particular level, get the entry index that corresponds to
284 * the provided virtual address
285 */
286 __pinned_func
287 static inline int get_index(void *virt, int level)
288 {
289 return (((uintptr_t)virt >> paging_levels[level].shift) %
290 paging_levels[level].entries);
291 }
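
/* Worked example (hypothetical, 32-bit non-PAE paging): for
 * virt = 0x00801000, the PD index is 0x00801000 >> 22 = 2 and the PT
 * index is (0x00801000 >> 12) % 1024 = 1, i.e. PD entry 2 and PT entry 1
 * select this page.
 */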
292
293 __pinned_func
294 static inline pentry_t *get_entry_ptr(pentry_t *ptables, void *virt, int level)
295 {
296 return &ptables[get_index(virt, level)];
297 }
298
299 __pinned_func
300 static inline pentry_t get_entry(pentry_t *ptables, void *virt, int level)
301 {
302 return ptables[get_index(virt, level)];
303 }
304
305 /* Get the physical memory address associated with this table entry */
306 __pinned_func
307 static inline uintptr_t get_entry_phys(pentry_t entry, int level)
308 {
309 return entry & paging_levels[level].mask;
310 }
311
312 /* Return the virtual address of a linked table stored in the provided entry */
313 __pinned_func
314 static inline pentry_t *next_table(pentry_t entry, int level)
315 {
316 return k_mem_virt_addr(get_entry_phys(entry, level));
317 }
318
319 /* Number of table entries at this level */
320 __pinned_func
321 static inline size_t get_num_entries(int level)
322 {
323 return paging_levels[level].entries;
324 }
325
326 /* 4K for everything except PAE PDPTs */
327 __pinned_func
328 static inline size_t table_size(int level)
329 {
330 return get_num_entries(level) * sizeof(pentry_t);
331 }
332
333 /* For a table at a particular level, size of the amount of virtual memory
334 * that an entry within the table covers
335 */
336 __pinned_func
337 static inline size_t get_entry_scope(int level)
338 {
339 return (1UL << paging_levels[level].shift);
340 }
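
/* For reference, from the paging_levels[] table above: a PTE entry covers
 * 1 << 12 = 4KB, a PD entry 1 << 21 = 2MB (64-bit/PAE) or 1 << 22 = 4MB
 * (32-bit), a PDPT entry 1 << 30 = 1GB, and a PML4 entry 1 << 39 = 512GB.
 */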
341
342 /* For a table at a particular level, size of the amount of virtual memory
343 * that this entire table covers
344 */
345 __pinned_func
346 static inline size_t get_table_scope(int level)
347 {
348 return get_entry_scope(level) * get_num_entries(level);
349 }
350
351 /* Must have checked Present bit first! Non-present entries may have OS data
352 * stored in any other bits
353 */
354 __pinned_func
355 static inline bool is_leaf(int level, pentry_t entry)
356 {
357 if (level == PTE_LEVEL) {
358 /* Always true for PTE */
359 return true;
360 }
361
362 return ((entry & MMU_PS) != 0U);
363 }
364
365 /* This does NOT (by design) un-flip KPTI PTEs, it's just the raw PTE value */
366 __pinned_func
367 static inline void pentry_get(int *paging_level, pentry_t *val,
368 pentry_t *ptables, void *virt)
369 {
370 pentry_t *table = ptables;
371
372 for (int level = 0; level < NUM_LEVELS; level++) {
373 pentry_t entry = get_entry(table, virt, level);
374
375 if ((entry & MMU_P) == 0 || is_leaf(level, entry)) {
376 *val = entry;
377 if (paging_level != NULL) {
378 *paging_level = level;
379 }
380 break;
381 } else {
382 table = next_table(entry, level);
383 }
384 }
385 }
386
387 __pinned_func
388 static inline void tlb_flush_page(void *addr)
389 {
390 /* Invalidate TLB entries corresponding to the page containing the
391 * specified address
392 */
393 char *page = (char *)addr;
394
395 __asm__ ("invlpg %0" :: "m" (*page));
396 }
397
398 #ifdef CONFIG_X86_KPTI
399 __pinned_func
400 static inline bool is_flipped_pte(pentry_t pte)
401 {
402 return (pte & MMU_P) == 0 && (pte & PTE_ZERO) != 0;
403 }
404 #endif
405
406 #if defined(CONFIG_SMP)
407 __pinned_func
408 void z_x86_tlb_ipi(const void *arg)
409 {
410 uintptr_t ptables_phys;
411
412 ARG_UNUSED(arg);
413
414 #ifdef CONFIG_X86_KPTI
415 /* We're always on the kernel's set of page tables in this context
416 * if KPTI is turned on
417 */
418 ptables_phys = z_x86_cr3_get();
419 __ASSERT(ptables_phys == k_mem_phys_addr(&z_x86_kernel_ptables), "");
420 #else
421 /* We might have been moved to another memory domain, so always invoke
422 * z_x86_thread_page_tables_get() instead of using current CR3 value.
423 */
424 ptables_phys = k_mem_phys_addr(z_x86_thread_page_tables_get(arch_current_thread()));
425 #endif
426 /*
427 * In the future, we can consider making this smarter, such as
428 * propagating which page tables were modified (in case they are
429 * not active on this CPU) or an address range to call
430 * tlb_flush_page() on.
431 */
432 LOG_DBG("%s on CPU %d\n", __func__, arch_curr_cpu()->id);
433
434 z_x86_cr3_set(ptables_phys);
435 }
436
437 /* NOTE: This is not synchronous; the actual flush takes place some short
438 * time after this function returns.
439 */
440 __pinned_func
441 static inline void tlb_shootdown(void)
442 {
443 z_loapic_ipi(0, LOAPIC_ICR_IPI_OTHERS, CONFIG_TLB_IPI_VECTOR);
444 }
445 #endif /* CONFIG_SMP */
446
447 __pinned_func
448 static inline void assert_addr_aligned(uintptr_t addr)
449 {
450 #if __ASSERT_ON
451 __ASSERT((addr & (CONFIG_MMU_PAGE_SIZE - 1)) == 0U,
452 "unaligned address 0x%" PRIxPTR, addr);
453 #else
454 ARG_UNUSED(addr);
455 #endif
456 }
457
458 __pinned_func
459 static inline bool is_addr_aligned(uintptr_t addr)
460 {
461 if ((addr & (CONFIG_MMU_PAGE_SIZE - 1)) == 0U) {
462 return true;
463 } else {
464 return false;
465 }
466 }
467
468 __pinned_func
469 static inline void assert_virt_addr_aligned(void *addr)
470 {
471 assert_addr_aligned((uintptr_t)addr);
472 }
473
474 __pinned_func
475 static inline bool is_virt_addr_aligned(void *addr)
476 {
477 return is_addr_aligned((uintptr_t)addr);
478 }
479
480 __pinned_func
481 static inline void assert_size_aligned(size_t size)
482 {
483 #if __ASSERT_ON
484 __ASSERT((size & (CONFIG_MMU_PAGE_SIZE - 1)) == 0U,
485 "unaligned size %zu", size);
486 #else
487 ARG_UNUSED(size);
488 #endif
489 }
490
491 __pinned_func
492 static inline bool is_size_aligned(size_t size)
493 {
494 if ((size & (CONFIG_MMU_PAGE_SIZE - 1)) == 0U) {
495 return true;
496 } else {
497 return false;
498 }
499 }
500
501 __pinned_func
502 static inline void assert_region_page_aligned(void *addr, size_t size)
503 {
504 assert_virt_addr_aligned(addr);
505 assert_size_aligned(size);
506 }
507
508 __pinned_func
509 static inline bool is_region_page_aligned(void *addr, size_t size)
510 {
511 if (!is_virt_addr_aligned(addr)) {
512 return false;
513 }
514
515 return is_size_aligned(size);
516 }
517
518 /*
519 * Debug functions. All conditionally compiled with CONFIG_EXCEPTION_DEBUG.
520 */
521 #ifdef CONFIG_EXCEPTION_DEBUG
522
523 /* Add colors to page table dumps to indicate mapping type */
524 #define COLOR_PAGE_TABLES 1
525
526 #if COLOR_PAGE_TABLES
527 #define ANSI_DEFAULT "\x1B" "[0m"
528 #define ANSI_RED "\x1B" "[1;31m"
529 #define ANSI_GREEN "\x1B" "[1;32m"
530 #define ANSI_YELLOW "\x1B" "[1;33m"
531 #define ANSI_BLUE "\x1B" "[1;34m"
532 #define ANSI_MAGENTA "\x1B" "[1;35m"
533 #define ANSI_CYAN "\x1B" "[1;36m"
534 #define ANSI_GREY "\x1B" "[1;90m"
535
536 #define COLOR(x) printk(_CONCAT(ANSI_, x))
537 #else
538 #define COLOR(x) do { } while (false)
539 #endif
540
541 __pinned_func
542 static char get_entry_code(pentry_t value)
543 {
544 char ret;
545
546 if (value == 0U) {
547 /* Unmapped entry */
548 ret = '.';
549 } else {
550 if ((value & MMU_RW) != 0U) {
551 /* Writable page */
552 if ((value & MMU_XD) != 0U) {
553 /* RW */
554 ret = 'w';
555 } else {
556 /* RWX */
557 ret = 'a';
558 }
559 } else {
560 if ((value & MMU_XD) != 0U) {
561 /* R */
562 ret = 'r';
563 } else {
564 /* RX */
565 ret = 'x';
566 }
567 }
568
569 if ((value & MMU_US) != 0U) {
570 /* Uppercase indicates user mode access */
571 ret = toupper((unsigned char)ret);
572 }
573 }
574
575 return ret;
576 }
577
578 __pinned_func
579 static void print_entries(pentry_t entries_array[], uint8_t *base, int level,
580 size_t count)
581 {
582 int column = 0;
583
584 for (int i = 0; i < count; i++) {
585 pentry_t entry = entries_array[i];
586
587 uintptr_t phys = get_entry_phys(entry, level);
588 uintptr_t virt =
589 (uintptr_t)base + (get_entry_scope(level) * i);
590
591 if ((entry & MMU_P) != 0U) {
592 if (is_leaf(level, entry)) {
593 if (phys == virt) {
594 /* Identity mappings */
595 COLOR(YELLOW);
596 } else if (phys + K_MEM_VIRT_OFFSET == virt) {
597 /* Permanent RAM mappings */
598 COLOR(GREEN);
599 } else {
600 /* General mapped pages */
601 COLOR(CYAN);
602 }
603 } else {
604 /* Intermediate entry */
605 COLOR(MAGENTA);
606 }
607 } else {
608 if (is_leaf(level, entry)) {
609 if (entry == 0U) {
610 /* Unmapped */
611 COLOR(GREY);
612 #ifdef CONFIG_X86_KPTI
613 } else if (is_flipped_pte(entry)) {
614 /* KPTI, un-flip it */
615 COLOR(BLUE);
616 entry = ~entry;
617 phys = get_entry_phys(entry, level);
618 if (phys == virt) {
619 /* Identity mapped */
620 COLOR(CYAN);
621 } else {
622 /* Non-identity mapped */
623 COLOR(BLUE);
624 }
625 #endif
626 } else {
627 /* Paged out */
628 COLOR(RED);
629 }
630 } else {
631 /* Un-mapped intermediate entry */
632 COLOR(GREY);
633 }
634 }
635
636 printk("%c", get_entry_code(entry));
637
638 column++;
639 if (column == 64) {
640 column = 0;
641 printk("\n");
642 }
643 }
644 COLOR(DEFAULT);
645
646 if (column != 0) {
647 printk("\n");
648 }
649 }
650
651 __pinned_func
652 static void dump_ptables(pentry_t *table, uint8_t *base, int level)
653 {
654 const struct paging_level *info = &paging_levels[level];
655
656 #ifdef CONFIG_X86_64
657 /* Account for the virtual memory "hole" with sign-extension */
658 if (((uintptr_t)base & BITL(47)) != 0) {
659 base = (uint8_t *)((uintptr_t)base | (0xFFFFULL << 48));
660 }
661 #endif
662
663 printk("%s at %p (0x%" PRIxPTR "): ", info->name, table,
664 k_mem_phys_addr(table));
665 if (level == 0) {
666 printk("entire address space\n");
667 } else {
668 printk("for %p - %p\n", base,
669 base + get_table_scope(level) - 1);
670 }
671
672 print_entries(table, base, level, info->entries);
673
674 /* Check if we're a page table */
675 if (level == PTE_LEVEL) {
676 return;
677 }
678
679 /* Dump all linked child tables */
680 for (int j = 0; j < info->entries; j++) {
681 pentry_t entry = table[j];
682 pentry_t *next;
683
684 if ((entry & MMU_P) == 0U ||
685 (entry & MMU_PS) != 0U) {
686 /* Not present or big page, skip */
687 continue;
688 }
689
690 next = next_table(entry, level);
691 dump_ptables(next, base + (j * get_entry_scope(level)),
692 level + 1);
693 }
694 }
695
696 __pinned_func
697 void z_x86_dump_page_tables(pentry_t *ptables)
698 {
699 dump_ptables(ptables, NULL, 0);
700 }
701
702 /* Enable to dump out the kernel's page table right before main() starts,
703 * sometimes useful for deep debugging. May overwhelm twister.
704 */
705 #define DUMP_PAGE_TABLES 0
706
707 #if DUMP_PAGE_TABLES
708 __pinned_func
709 static int dump_kernel_tables(void)
710 {
711 z_x86_dump_page_tables(z_x86_kernel_ptables);
712
713 return 0;
714 }
715
716 SYS_INIT(dump_kernel_tables, APPLICATION, CONFIG_KERNEL_INIT_PRIORITY_DEFAULT);
717 #endif
718
719 __pinned_func
720 static void str_append(char **buf, size_t *size, const char *str)
721 {
722 int ret = snprintk(*buf, *size, "%s", str);
723
724 if (ret >= *size) {
725 /* Truncated */
726 *size = 0U;
727 } else {
728 *size -= ret;
729 *buf += ret;
730 }
731
732 }
733
734 __pinned_func
735 static void dump_entry(int level, void *virt, pentry_t entry)
736 {
737 const struct paging_level *info = &paging_levels[level];
738 char buf[24] = { 0 };
739 char *pos = buf;
740 size_t sz = sizeof(buf);
741 uint8_t *virtmap = (uint8_t *)ROUND_DOWN(virt, get_entry_scope(level));
742
743 #define DUMP_BIT(bit) do { \
744 if ((entry & MMU_##bit) != 0U) { \
745 str_append(&pos, &sz, #bit " "); \
746 } \
747 } while (false)
748
749 DUMP_BIT(RW);
750 DUMP_BIT(US);
751 DUMP_BIT(PWT);
752 DUMP_BIT(PCD);
753 DUMP_BIT(A);
754 DUMP_BIT(D);
755 DUMP_BIT(G);
756 DUMP_BIT(XD);
757
758 LOG_ERR("%sE: %p -> " PRI_ENTRY ": %s", info->name,
759 virtmap, entry & info->mask, buf);
760
761 #undef DUMP_BIT
762 }
763
764 __pinned_func
765 void z_x86_pentry_get(int *paging_level, pentry_t *val, pentry_t *ptables,
766 void *virt)
767 {
768 pentry_get(paging_level, val, ptables, virt);
769 }
770
771 /*
772 * Debug function for dumping out MMU table information to the LOG for a
773 * specific virtual address, such as when we get an unexpected page fault.
774 */
775 __pinned_func
776 void z_x86_dump_mmu_flags(pentry_t *ptables, void *virt)
777 {
778 pentry_t entry = 0;
779 int level = 0;
780
781 pentry_get(&level, &entry, ptables, virt);
782
783 if ((entry & MMU_P) == 0) {
784 LOG_ERR("%sE: not present", paging_levels[level].name);
785 } else {
786 dump_entry(level, virt, entry);
787 }
788 }
789 #endif /* CONFIG_EXCEPTION_DEBUG */
790
791 /* Reset permissions on a PTE to original state when the mapping was made */
792 __pinned_func
793 static inline pentry_t reset_pte(pentry_t old_val)
794 {
795 pentry_t new_val;
796
797 /* Clear any existing state in permission bits */
798 new_val = old_val & (~K_MEM_PARTITION_PERM_MASK);
799
800 /* Now set permissions based on the stashed original values */
801 if ((old_val & MMU_RW_ORIG) != 0) {
802 new_val |= MMU_RW;
803 }
804 if ((old_val & MMU_US_ORIG) != 0) {
805 new_val |= MMU_US;
806 }
807 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
808 if ((old_val & MMU_XD_ORIG) != 0) {
809 new_val |= MMU_XD;
810 }
811 #endif
812 return new_val;
813 }
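
/* Example of the above (hypothetical entry): a page originally mapped
 * writable carries MMU_RW | MMU_RW_ORIG. If a memory domain partition later
 * write-protects it, only MMU_RW is cleared; reset_pte() sees MMU_RW_ORIG
 * and restores MMU_RW once the partition is removed.
 */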
814
815 /* Wrapper functions for some gross stuff we have to do for Kernel
816 * page table isolation. If these are User mode page tables, the user bit
817 * isn't set, and this is not the shared page, then all the bits in the PTE
818 * are flipped. This serves three purposes:
819 * - The page isn't present, implementing page table isolation
820 * - Flipping the physical address bits cheaply mitigates L1TF
821 * - State is preserved; to get original PTE, just complement again
822 */
823 __pinned_func
824 static inline pentry_t pte_finalize_value(pentry_t val, bool user_table,
825 int level)
826 {
827 #ifdef CONFIG_X86_KPTI
828 static const uintptr_t shared_phys_addr =
829 K_MEM_PHYS_ADDR(POINTER_TO_UINT(&z_shared_kernel_page_start));
830
831 if (user_table && (val & MMU_US) == 0 && (val & MMU_P) != 0 &&
832 get_entry_phys(val, level) != shared_phys_addr) {
833 val = ~val;
834 }
835 #else
836 ARG_UNUSED(user_table);
837 ARG_UNUSED(level);
838 #endif
839 return val;
840 }
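
/* Note on the flip above: a kernel-only entry normally has MMU_P set and
 * MMU_PAT clear, so its complement has MMU_P clear and MMU_PAT (PTE_ZERO)
 * set. That is the pattern is_flipped_pte() and the PTE_ZERO comment near
 * the top of this file rely on to tell flipped entries apart from unmapped
 * or paged-out ones.
 */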
841
842 /* Atomic functions for modifying PTEs. These don't map nicely to Zephyr's
843 * atomic API since the only types supported are 'int' and 'void *' and
844 * the size of pentry_t depends on other factors like PAE.
845 */
846 #ifndef CONFIG_X86_PAE
847 /* Non-PAE, pentry_t is same size as void ptr so use atomic_ptr_* APIs */
848 __pinned_func
849 static inline pentry_t atomic_pte_get(const pentry_t *target)
850 {
851 return (pentry_t)atomic_ptr_get((const atomic_ptr_t *)target);
852 }
853
854 __pinned_func
855 static inline bool atomic_pte_cas(pentry_t *target, pentry_t old_value,
856 pentry_t new_value)
857 {
858 return atomic_ptr_cas((atomic_ptr_t *)target, (void *)old_value,
859 (void *)new_value);
860 }
861 #else
862 /* Atomic builtins for 64-bit values on 32-bit x86 require floating point.
863 * Don't do this, just lock local interrupts. Needless to say, this
864 * isn't workable if someone ever adds SMP to the 32-bit x86 port.
865 */
866 BUILD_ASSERT(!IS_ENABLED(CONFIG_SMP));
867
868 __pinned_func
869 static inline pentry_t atomic_pte_get(const pentry_t *target)
870 {
871 return *target;
872 }
873
874 __pinned_func
875 static inline bool atomic_pte_cas(pentry_t *target, pentry_t old_value,
876 pentry_t new_value)
877 {
878 bool ret = false;
879 int key = arch_irq_lock();
880
881 if (*target == old_value) {
882 *target = new_value;
883 ret = true;
884 }
885 arch_irq_unlock(key);
886
887 return ret;
888 }
889 #endif /* CONFIG_X86_PAE */
890
891 /* Indicates that the target page tables will be used by user mode threads.
892 * This only has implications for CONFIG_X86_KPTI where user thread facing
893 * page tables need nearly all pages that don't have the US bit to also
894 * not be Present.
895 */
896 #define OPTION_USER BIT(0)
897
898 /* Indicates that the operation requires TLBs to be flushed as we are altering
899 * existing mappings. Not needed for establishing new mappings
900 */
901 #define OPTION_FLUSH BIT(1)
902
903 /* Indicates that each PTE's permission bits should be restored to their
904 * original state when the memory was mapped. All other bits in the PTE are
905 * preserved.
906 */
907 #define OPTION_RESET BIT(2)
908
909 /* Indicates that the mapping will need to be cleared entirely. This is
910 * mainly used for unmapping the memory region.
911 */
912 #define OPTION_CLEAR BIT(3)
913
914 /**
915 * Atomically update bits in a page table entry
916 *
917 * This is atomic with respect to modifications by other CPUs or preempted
918 * contexts, which can be very important when making decisions based on
919 * the PTE's prior "dirty" state.
920 *
921 * @param pte Pointer to page table entry to update
922 * @param update_val Updated bits to set/clear in PTE. Ignored with
923 * OPTION_RESET or OPTION_CLEAR.
924 * @param update_mask Which bits to modify in the PTE. Ignored with
925 * OPTION_RESET or OPTION_CLEAR.
926 * @param options Control flags
927 * @return Old PTE value
928 */
929 __pinned_func
930 static inline pentry_t pte_atomic_update(pentry_t *pte, pentry_t update_val,
931 pentry_t update_mask,
932 uint32_t options)
933 {
934 bool user_table = (options & OPTION_USER) != 0U;
935 bool reset = (options & OPTION_RESET) != 0U;
936 bool clear = (options & OPTION_CLEAR) != 0U;
937 pentry_t old_val, new_val;
938
939 do {
940 old_val = atomic_pte_get(pte);
941
942 new_val = old_val;
943 #ifdef CONFIG_X86_KPTI
944 if (is_flipped_pte(new_val)) {
945 /* Page was flipped for KPTI. Un-flip it */
946 new_val = ~new_val;
947 }
948 #endif /* CONFIG_X86_KPTI */
949
950 if (reset) {
951 new_val = reset_pte(new_val);
952 } else if (clear) {
953 new_val = 0;
954 } else {
955 new_val = ((new_val & ~update_mask) |
956 (update_val & update_mask));
957 }
958
959 new_val = pte_finalize_value(new_val, user_table, PTE_LEVEL);
960 } while (atomic_pte_cas(pte, old_val, new_val) == false);
961
962 #ifdef CONFIG_X86_KPTI
963 if (is_flipped_pte(old_val)) {
964 /* Page was flipped for KPTI. Un-flip it */
965 old_val = ~old_val;
966 }
967 #endif /* CONFIG_X86_KPTI */
968
969 return old_val;
970 }
971
972 /**
973 * Low level page table update function for a virtual page
974 *
975 * For the provided set of page tables, update the PTE associated with the
976 * virtual address to a new value, using the mask to control what bits
977 * need to be preserved.
978 *
979 * It is permitted to set up mappings without the Present bit set, in which
980 * case all other bits may be used for OS accounting.
981 *
982 * This function is atomic with respect to the page table entries being
983 * modified by another CPU, using atomic operations to update the requested
984 * bits and return the previous PTE value.
985 *
986 * Common mask values:
987 * MASK_ALL - Update all PTE bits. Existing state totally discarded.
988 * MASK_PERM - Only update permission bits. All other bits and physical
989 * mapping preserved.
990 *
991 * @param ptables Page tables to modify
992 * @param virt Virtual page table entry to update
993 * @param entry_val Value to update in the PTE (ignored if OPTION_RESET or
994 * OPTION_CLEAR)
995 * @param [out] old_val_ptr Filled in with previous PTE value. May be NULL.
996 * @param mask What bits to update in the PTE (ignored if OPTION_RESET or
997 * OPTION_CLEAR)
998 * @param options Control options, described above
999 *
1000 * @retval 0 if successful
1001 * @retval -EFAULT if large page encountered or missing page table level
1002 */
1003 __pinned_func
1004 static int page_map_set(pentry_t *ptables, void *virt, pentry_t entry_val,
1005 pentry_t *old_val_ptr, pentry_t mask, uint32_t options)
1006 {
1007 pentry_t *table = ptables;
1008 bool flush = (options & OPTION_FLUSH) != 0U;
1009 int ret = 0;
1010
1011 for (int level = 0; level < NUM_LEVELS; level++) {
1012 int index;
1013 pentry_t *entryp;
1014
1015 index = get_index(virt, level);
1016 entryp = &table[index];
1017
1018 /* Check if we're a PTE */
1019 if (level == PTE_LEVEL) {
1020 pentry_t old_val = pte_atomic_update(entryp, entry_val,
1021 mask, options);
1022 if (old_val_ptr != NULL) {
1023 *old_val_ptr = old_val;
1024 }
1025 break;
1026 }
1027
1028 /* We bail out early here due to no support for
1029 * splitting existing bigpage mappings.
1030 * If the PS bit is not supported at some level (like
1031 * in a PML4 entry) it is always reserved and must be 0
1032 */
1033 CHECKIF(!((*entryp & MMU_PS) == 0U)) {
1034 /* Cannot continue since we cannot split
1035 * bigpage mappings.
1036 */
1037 LOG_ERR("large page encountered");
1038 ret = -EFAULT;
1039 goto out;
1040 }
1041
1042 table = next_table(*entryp, level);
1043
1044 CHECKIF(!(table != NULL)) {
1045 /* Cannot continue since table is NULL,
1046 * and it cannot be dereferenced in next loop
1047 * iteration.
1048 */
1049 LOG_ERR("missing page table level %d when trying to map %p",
1050 level + 1, virt);
1051 ret = -EFAULT;
1052 goto out;
1053 }
1054 }
1055
1056 out:
1057 if (flush) {
1058 tlb_flush_page(virt);
1059 }
1060
1061 return ret;
1062 }
1063
1064 /**
1065 * Map a physical region in a specific set of page tables.
1066 *
1067 * See documentation for page_map_set() for additional notes about masks and
1068 * supported options.
1069 *
1070 * It is vital to remember that all virtual-to-physical mappings must be
1071 * the same with respect to supervisor mode regardless of what thread is
1072 * scheduled (and therefore, if multiple sets of page tables exist, which one
1073 * is active).
1074 *
1075 * It is permitted to set up mappings without the Present bit set.
1076 *
1077 * @param ptables Page tables to modify
1078 * @param virt Base page-aligned virtual memory address to map the region.
1079 * @param phys Base page-aligned physical memory address for the region.
1080 * Ignored if OPTION_RESET or OPTION_CLEAR. Also affected by the mask
1081 * parameter. This address is not directly examined, it will simply be
1082 * programmed into the PTE.
1083 * @param size Size of the physical region to map
1084 * @param entry_flags Non-address bits to set in every PTE. Ignored if
1085 * OPTION_RESET. Also affected by the mask parameter.
1086 * @param mask What bits to update in each PTE. Un-set bits will never be
1087 * modified. Ignored if OPTION_RESET or OPTION_CLEAR.
1088 * @param options Control options, described above
1089 *
1090 * @retval 0 if successful
1091 * @retval -EINVAL if invalid parameters are supplied
1092 * @retval -EFAULT if errors encountered when updating page tables
1093 */
1094 __pinned_func
1095 static int range_map_ptables(pentry_t *ptables, void *virt, uintptr_t phys,
1096 size_t size, pentry_t entry_flags, pentry_t mask,
1097 uint32_t options)
1098 {
1099 bool zero_entry = (options & (OPTION_RESET | OPTION_CLEAR)) != 0U;
1100 int ret = 0, ret2;
1101
1102 CHECKIF(!is_addr_aligned(phys) || !is_size_aligned(size)) {
1103 ret = -EINVAL;
1104 goto out;
1105 }
1106
1107 CHECKIF(!((entry_flags & paging_levels[0].mask) == 0U)) {
1108 LOG_ERR("entry_flags " PRI_ENTRY " overlaps address area",
1109 entry_flags);
1110 ret = -EINVAL;
1111 goto out;
1112 }
1113
1114 /* This implementation is stack-efficient but not particularly fast.
1115 * We do a full page table walk for every page we are updating.
1116 * Recursive approaches are possible, but use much more stack space.
1117 */
1118 for (size_t offset = 0; offset < size; offset += CONFIG_MMU_PAGE_SIZE) {
1119 uint8_t *dest_virt = (uint8_t *)virt + offset;
1120 pentry_t entry_val;
1121
1122 if (zero_entry) {
1123 entry_val = 0;
1124 } else {
1125 entry_val = (pentry_t)(phys + offset) | entry_flags;
1126 }
1127
1128 ret2 = page_map_set(ptables, dest_virt, entry_val, NULL, mask,
1129 options);
1130 ARG_UNUSED(ret2);
1131 CHECKIF(ret2 != 0) {
1132 ret = ret2;
1133 }
1134 }
1135
1136 out:
1137 return ret;
1138 }
1139
1140 /**
1141 * Establish or update a memory mapping for all page tables
1142 *
1143 * The physical region noted from phys to phys + size will be mapped to
1144 * an equal sized virtual region starting at virt, with the provided flags.
1145 * The mask value denotes what bits in PTEs will actually be modified.
1146 *
1147 * See range_map_ptables() for additional details.
1148 *
1149 * @param virt Page-aligned starting virtual address
1150 * @param phys Page-aligned starting physical address. Ignored if the mask
1151 * parameter does not enable address bits or OPTION_RESET used.
1152 * This region is not directly examined, it will simply be
1153 * programmed into the page tables.
1154 * @param size Size of the physical region to map
1155 * @param entry_flags Desired state of non-address PTE bits covered by mask,
1156 * ignored if OPTION_RESET
1157 * @param mask What bits in the PTE to actually modify; unset bits will
1158 * be preserved. Ignored if OPTION_RESET.
1159 * @param options Control options. Do not set OPTION_USER here. OPTION_FLUSH
1160 * will trigger a TLB shootdown after all tables are updated.
1161 *
1162 * @retval 0 if successful
1163 * @retval -EINVAL if invalid parameters are supplied
1164 * @retval -EFAULT if errors encountered when updating page tables
1165 */
1166 __pinned_func
1167 static int range_map(void *virt, uintptr_t phys, size_t size,
1168 pentry_t entry_flags, pentry_t mask, uint32_t options)
1169 {
1170 int ret = 0, ret2;
1171
1172 LOG_DBG("%s: 0x%" PRIxPTR " -> %p (%zu) flags " PRI_ENTRY " mask "
1173 PRI_ENTRY " opt 0x%x", __func__, phys, virt, size,
1174 entry_flags, mask, options);
1175
1176 #ifdef CONFIG_X86_64
1177 /* There's a gap in the "64-bit" address space, as 4-level paging
1178 * requires bits 48 to 63 to be copies of bit 47. Test this
1179 * by treating as a signed value and shifting.
1180 */
1181 __ASSERT(((((intptr_t)virt) << 16) >> 16) == (intptr_t)virt,
1182 "non-canonical virtual address mapping %p (size %zu)",
1183 virt, size);
1184 #endif /* CONFIG_X86_64 */
1185
1186 CHECKIF(!((options & OPTION_USER) == 0U)) {
1187 LOG_ERR("invalid option for mapping");
1188 ret = -EINVAL;
1189 goto out;
1190 }
1191
1192 /* All virtual-to-physical mappings are the same in all page tables.
1193 * What can differ is only access permissions, defined by the memory
1194 * domain associated with the page tables, and the threads that are
1195 * members of that domain.
1196 *
1197 * Any new mappings need to be applied to all page tables.
1198 */
1199 #if defined(CONFIG_USERSPACE) && !defined(CONFIG_X86_COMMON_PAGE_TABLE)
1200 sys_snode_t *node;
1201
1202 SYS_SLIST_FOR_EACH_NODE(&x86_domain_list, node) {
1203 struct arch_mem_domain *domain =
1204 CONTAINER_OF(node, struct arch_mem_domain, node);
1205
1206 ret2 = range_map_ptables(domain->ptables, virt, phys, size,
1207 entry_flags, mask,
1208 options | OPTION_USER);
1209 ARG_UNUSED(ret2);
1210 CHECKIF(ret2 != 0) {
1211 ret = ret2;
1212 }
1213 }
1214 #endif /* CONFIG_USERSPACE */
1215
1216 ret2 = range_map_ptables(z_x86_kernel_ptables, virt, phys, size,
1217 entry_flags, mask, options);
1218 ARG_UNUSED(ret2);
1219 CHECKIF(ret2 != 0) {
1220 ret = ret2;
1221 }
1222
1223 out:
1224 #ifdef CONFIG_SMP
1225 if ((options & OPTION_FLUSH) != 0U) {
1226 tlb_shootdown();
1227 }
1228 #endif /* CONFIG_SMP */
1229
1230 return ret;
1231 }
1232
1233 __pinned_func
1234 static inline int range_map_unlocked(void *virt, uintptr_t phys, size_t size,
1235 pentry_t entry_flags, pentry_t mask,
1236 uint32_t options)
1237 {
1238 k_spinlock_key_t key;
1239 int ret;
1240
1241 key = k_spin_lock(&x86_mmu_lock);
1242 ret = range_map(virt, phys, size, entry_flags, mask, options);
1243 k_spin_unlock(&x86_mmu_lock, key);
1244
1245 return ret;
1246 }
1247
1248 __pinned_func
1249 static pentry_t flags_to_entry(uint32_t flags)
1250 {
1251 pentry_t entry_flags = MMU_P;
1252
1253 /* Translate flags argument into HW-recognized entry flags.
1254 *
1255 * Support for PAT is not implemented yet. Many systems may have
1256 * BIOS-populated MTRR values such that these cache settings are
1257 * redundant.
1258 */
1259 switch (flags & K_MEM_CACHE_MASK) {
1260 case K_MEM_CACHE_NONE:
1261 entry_flags |= MMU_PCD;
1262 break;
1263 case K_MEM_CACHE_WT:
1264 entry_flags |= MMU_PWT;
1265 break;
1266 case K_MEM_CACHE_WB:
1267 break;
1268 default:
1269 __ASSERT(false, "bad memory mapping flags 0x%x", flags);
1270 }
1271
1272 if ((flags & K_MEM_PERM_RW) != 0U) {
1273 entry_flags |= ENTRY_RW;
1274 }
1275
1276 if ((flags & K_MEM_PERM_USER) != 0U) {
1277 entry_flags |= ENTRY_US;
1278 }
1279
1280 if ((flags & K_MEM_PERM_EXEC) == 0U) {
1281 entry_flags |= ENTRY_XD;
1282 }
1283
1284 if (IS_ENABLED(CONFIG_DEMAND_MAPPING) && (flags & K_MEM_MAP_UNPAGED) != 0U) {
1285 /* same state as in arch_mem_page_out() */
1286 entry_flags &= ~MMU_P;
1287 entry_flags |= MMU_A;
1288 }
1289
1290 return entry_flags;
1291 }
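
/* Usage sketch (hypothetical flags): a cached, writable, kernel-only
 * mapping requested with K_MEM_CACHE_WB | K_MEM_PERM_RW comes back as
 * MMU_P | ENTRY_RW | ENTRY_XD, i.e. present, writable, non-executable,
 * with the RW/XD choices also stashed in the backup (ORIG) bits.
 */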
1292
1293 /* map new region virt..virt+size to phys with provided arch-neutral flags */
1294 __pinned_func
1295 void arch_mem_map(void *virt, uintptr_t phys, size_t size, uint32_t flags)
1296 {
1297 int ret;
1298
1299 ret = range_map_unlocked(virt, phys, size, flags_to_entry(flags),
1300 MASK_ALL, 0);
1301 __ASSERT_NO_MSG(ret == 0);
1302 ARG_UNUSED(ret);
1303 }
1304
1305 /* unmap region addr..addr+size, reset entries and flush TLB */
1306 void arch_mem_unmap(void *addr, size_t size)
1307 {
1308 int ret;
1309
1310 ret = range_map_unlocked(addr, 0, size, 0, 0,
1311 OPTION_FLUSH | OPTION_CLEAR);
1312 __ASSERT_NO_MSG(ret == 0);
1313 ARG_UNUSED(ret);
1314 }
1315
1316 #ifdef K_MEM_IS_VM_KERNEL
1317 __boot_func
1318 static void identity_map_remove(uint32_t level)
1319 {
1320 size_t size, scope = get_entry_scope(level);
1321 pentry_t *table;
1322 uint32_t cur_level;
1323 uint8_t *pos;
1324 pentry_t entry;
1325 pentry_t *entry_ptr;
1326
1327 k_mem_region_align((uintptr_t *)&pos, &size,
1328 (uintptr_t)CONFIG_SRAM_BASE_ADDRESS,
1329 (size_t)CONFIG_SRAM_SIZE * 1024U, scope);
1330
1331 while (size != 0U) {
1332 /* Need to get to the correct table */
1333 table = z_x86_kernel_ptables;
1334 for (cur_level = 0; cur_level < level; cur_level++) {
1335 entry = get_entry(table, pos, cur_level);
1336 table = next_table(entry, level);
1337 }
1338
1339 entry_ptr = get_entry_ptr(table, pos, level);
1340
1341 /* set_pte */
1342 *entry_ptr = 0;
1343 pos += scope;
1344 size -= scope;
1345 }
1346 }
1347 #endif
1348
1349 /* Invoked to remove the identity mappings in the page tables,
1350 * they were only needed to transition the instruction pointer at early boot
1351 */
1352 __boot_func
1353 void z_x86_mmu_init(void)
1354 {
1355 #ifdef K_MEM_IS_VM_KERNEL
1356 /* We booted with physical address space being identity mapped.
1357 * As we are now executing in virtual address space,
1358 * the identity mappings are no longer needed, so remove them.
1359 *
1360 * Without PAE, we only need to remove the entries at the PD level.
1361 * With PAE, we also need to remove the entry at the PDPT level.
1362 */
1363 identity_map_remove(PDE_LEVEL);
1364
1365 #ifdef CONFIG_X86_PAE
1366 identity_map_remove(0);
1367 #endif
1368 #endif
1369 }
1370
1371 #ifdef CONFIG_X86_STACK_PROTECTION
1372 __pinned_func
1373 void z_x86_set_stack_guard(k_thread_stack_t *stack)
1374 {
1375 int ret;
1376
1377 /* Applied to all page tables as this affects supervisor mode.
1378 * XXX: This never gets reset when the thread exits, which can
1379 * cause problems if the memory is later used for something else.
1380 * See #29499
1381 *
1382 * Guard page is always the first page of the stack object for both
1383 * kernel and thread stacks.
1384 */
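/* With MASK_PERM, the call below only updates the RW/US/XD permission
 * bits: RW and US are cleared and XD is set, while the Present bit and
 * the physical mapping are preserved, so any write to the guard page
 * faults.
 */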
1385 ret = range_map_unlocked(stack, 0, CONFIG_MMU_PAGE_SIZE,
1386 MMU_P | ENTRY_XD, MASK_PERM, OPTION_FLUSH);
1387 __ASSERT_NO_MSG(ret == 0);
1388 ARG_UNUSED(ret);
1389 }
1390 #endif /* CONFIG_X86_STACK_PROTECTION */
1391
1392 #ifdef CONFIG_USERSPACE
1393 __pinned_func
1394 static bool page_validate(pentry_t *ptables, uint8_t *addr, bool write)
1395 {
1396 pentry_t *table = ptables;
1397
1398 for (int level = 0; level < NUM_LEVELS; level++) {
1399 pentry_t entry = get_entry(table, addr, level);
1400
1401 if (is_leaf(level, entry)) {
1402 #ifdef CONFIG_X86_KPTI
1403 if (is_flipped_pte(entry)) {
1404 /* We flipped this to prevent user access
1405 * since just clearing US isn't sufficient
1406 */
1407 return false;
1408 }
1409 #endif
1410 /* US and RW bits still carry meaning if non-present.
1411 * If the data page is paged out, access bits are
1412 * preserved. If un-mapped, the whole entry is 0.
1413 */
1414 if (((entry & MMU_US) == 0U) ||
1415 (write && ((entry & MMU_RW) == 0U))) {
1416 return false;
1417 }
1418 } else {
1419 if ((entry & MMU_P) == 0U) {
1420 /* Missing intermediate table, address is
1421 * un-mapped
1422 */
1423 return false;
1424 }
1425 table = next_table(entry, level);
1426 }
1427 }
1428
1429 return true;
1430 }
1431
1432 __pinned_func
1433 static inline void bcb_fence(void)
1434 {
1435 #ifdef CONFIG_X86_BOUNDS_CHECK_BYPASS_MITIGATION
1436 __asm__ volatile ("lfence" : : : "memory");
1437 #endif
1438 }
1439
1440 __pinned_func
1441 int arch_buffer_validate(const void *addr, size_t size, int write)
1442 {
1443 pentry_t *ptables = z_x86_thread_page_tables_get(arch_current_thread());
1444 uint8_t *virt;
1445 size_t aligned_size;
1446 int ret = 0;
1447
1448 /* addr/size arbitrary, fix this up into an aligned region */
1449 (void)k_mem_region_align((uintptr_t *)&virt, &aligned_size,
1450 (uintptr_t)addr, size, CONFIG_MMU_PAGE_SIZE);
1451
1452 for (size_t offset = 0; offset < aligned_size;
1453 offset += CONFIG_MMU_PAGE_SIZE) {
1454 if (!page_validate(ptables, virt + offset, write)) {
1455 ret = -1;
1456 break;
1457 }
1458 }
1459
1460 bcb_fence();
1461
1462 return ret;
1463 }
1464 #ifdef CONFIG_X86_COMMON_PAGE_TABLE
1465 /* Very low memory configuration. A single set of page tables is used for
1466 * all threads. This relies on some assumptions:
1467 *
1468 * - No KPTI. If that were supported, we would need both a kernel and user
1469 * set of page tables.
1470 * - No SMP. If that were supported, we would need per-core page tables.
1471 * - Memory domains don't affect supervisor mode.
1472 * - All threads have the same virtual-to-physical mappings.
1473 * - Memory domain APIs can't be called by user mode.
1474 *
1475 * Because there is no SMP, only one set of page tables, and user threads can't
1476 * modify their own memory domains, we don't have to do much when
1477 * arch_mem_domain_* APIs are called. We do use a caching scheme to avoid
1478 * updating page tables if the last user thread scheduled was in the same
1479 * domain.
1480 *
1481 * We don't set CONFIG_ARCH_MEM_DOMAIN_DATA, since we aren't setting
1482 * up any arch-specific memory domain data (per domain page tables.)
1483 *
1484 * This is all nice and simple and saves a lot of memory. The cost is that
1485 * context switching is not a trivial CR3 update. We have to reset all partitions
1486 * for the current domain configuration and then apply all the partitions for
1487 * the incoming thread's domain if they are not the same. We also need to
1488 * update permissions similarly on the thread stack region.
1489 */
1490
1491 __pinned_func
1492 static inline int reset_region(uintptr_t start, size_t size)
1493 {
1494 return range_map_unlocked((void *)start, 0, size, 0, 0,
1495 OPTION_FLUSH | OPTION_RESET);
1496 }
1497
1498 __pinned_func
1499 static inline int apply_region(uintptr_t start, size_t size, pentry_t attr)
1500 {
1501 return range_map_unlocked((void *)start, 0, size, attr, MASK_PERM,
1502 OPTION_FLUSH);
1503 }
1504
1505 /* Cache of the current memory domain applied to the common page tables and
1506 * the stack buffer region that had User access granted.
1507 */
1508 static __pinned_bss struct k_mem_domain *current_domain;
1509 static __pinned_bss uintptr_t current_stack_start;
1510 static __pinned_bss size_t current_stack_size;
1511
1512 __pinned_func
1513 void z_x86_swap_update_common_page_table(struct k_thread *incoming)
1514 {
1515 k_spinlock_key_t key;
1516
1517 if ((incoming->base.user_options & K_USER) == 0) {
1518 /* Incoming thread is not a user thread. Memory domains don't
1519 * affect supervisor threads and we don't need to enable User
1520 * bits for its stack buffer; do nothing.
1521 */
1522 return;
1523 }
1524
1525 /* Step 1: Make sure the thread stack is set up correctly for the
1526 * incoming thread
1527 */
1528 if (incoming->stack_info.start != current_stack_start ||
1529 incoming->stack_info.size != current_stack_size) {
1530 if (current_stack_size != 0U) {
1531 reset_region(current_stack_start, current_stack_size);
1532 }
1533
1534 /* The incoming thread's stack region needs User permissions */
1535 apply_region(incoming->stack_info.start,
1536 incoming->stack_info.size,
1537 K_MEM_PARTITION_P_RW_U_RW);
1538
1539 /* Update cache */
1540 current_stack_start = incoming->stack_info.start;
1541 current_stack_size = incoming->stack_info.size;
1542 }
1543
1544 /* Step 2: The page tables always have some memory domain applied to
1545 * them. If the incoming thread's memory domain is different,
1546 * update the page tables
1547 */
1548 key = k_spin_lock(&z_mem_domain_lock);
1549 if (incoming->mem_domain_info.mem_domain == current_domain) {
1550 /* The incoming thread's domain is already applied */
1551 goto out_unlock;
1552 }
1553
1554 /* Reset the current memory domain regions... */
1555 if (current_domain != NULL) {
1556 for (int i = 0; i < CONFIG_MAX_DOMAIN_PARTITIONS; i++) {
1557 struct k_mem_partition *ptn =
1558 &current_domain->partitions[i];
1559
1560 if (ptn->size == 0) {
1561 continue;
1562 }
1563 reset_region(ptn->start, ptn->size);
1564 }
1565 }
1566
1567 /* ...and apply all the incoming domain's regions */
1568 for (int i = 0; i < CONFIG_MAX_DOMAIN_PARTITIONS; i++) {
1569 struct k_mem_partition *ptn =
1570 &incoming->mem_domain_info.mem_domain->partitions[i];
1571
1572 if (ptn->size == 0) {
1573 continue;
1574 }
1575 apply_region(ptn->start, ptn->size, ptn->attr);
1576 }
1577 current_domain = incoming->mem_domain_info.mem_domain;
1578 out_unlock:
1579 k_spin_unlock(&z_mem_domain_lock, key);
1580 }
1581
1582 /* If a partition was added or removed in the cached domain, update the
1583 * page tables.
1584 */
1585 __pinned_func
1586 int arch_mem_domain_partition_remove(struct k_mem_domain *domain,
1587 uint32_t partition_id)
1588 {
1589 struct k_mem_partition *ptn;
1590
1591 if (domain != current_domain) {
1592 return 0;
1593 }
1594
1595 ptn = &domain->partitions[partition_id];
1596
1597 return reset_region(ptn->start, ptn->size);
1598 }
1599
1600 __pinned_func
1601 int arch_mem_domain_partition_add(struct k_mem_domain *domain,
1602 uint32_t partition_id)
1603 {
1604 struct k_mem_partition *ptn;
1605
1606 if (domain != current_domain) {
1607 return 0;
1608 }
1609
1610 ptn = &domain->partitions[partition_id];
1611
1612 return apply_region(ptn->start, ptn->size, ptn->attr);
1613 }
1614
1615 /* Rest of the APIs don't need to do anything */
1616 __pinned_func
1617 int arch_mem_domain_thread_add(struct k_thread *thread)
1618 {
1619 return 0;
1620 }
1621
1622 __pinned_func
1623 int arch_mem_domain_thread_remove(struct k_thread *thread)
1624 {
1625 return 0;
1626 }
1627 #else
1628 /* Memory domains each have a set of page tables assigned to them */
1629
1630 /*
1631 * Pool of free memory pages for copying page tables, as needed.
1632 */
1633 #define PTABLE_COPY_SIZE (INITIAL_PTABLE_PAGES * CONFIG_MMU_PAGE_SIZE)
1634
1635 static uint8_t __pinned_noinit
1636 page_pool[PTABLE_COPY_SIZE * CONFIG_X86_MAX_ADDITIONAL_MEM_DOMAINS]
1637 __aligned(CONFIG_MMU_PAGE_SIZE);
1638
1639 __pinned_data
1640 static uint8_t *page_pos = page_pool + sizeof(page_pool);
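
/* Sizing sketch (hypothetical numbers): with 4K pages, if
 * INITIAL_PTABLE_PAGES works out to 10 and
 * CONFIG_X86_MAX_ADDITIONAL_MEM_DOMAINS is 2, the pool above is 80KB and
 * pages_free() reports 20 pages until domains start allocating from it.
 */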
1641
1642 /* Return a zeroed and suitably aligned memory page for page table data
1643 * from the global page pool
1644 */
1645 __pinned_func
1646 static void *page_pool_get(void)
1647 {
1648 void *ret;
1649
1650 if (page_pos == page_pool) {
1651 ret = NULL;
1652 } else {
1653 page_pos -= CONFIG_MMU_PAGE_SIZE;
1654 ret = page_pos;
1655 }
1656
1657 if (ret != NULL) {
1658 memset(ret, 0, CONFIG_MMU_PAGE_SIZE);
1659 }
1660
1661 return ret;
1662 }
1663
1664 /* Debugging function to show how many pages are free in the pool */
1665 __pinned_func
1666 static inline unsigned int pages_free(void)
1667 {
1668 return (page_pos - page_pool) / CONFIG_MMU_PAGE_SIZE;
1669 }
1670
1671 /**
1672 * Duplicate an entire set of page tables
1673 *
1674 * Uses recursion, but depth at any given moment is limited by the number of
1675 * paging levels.
1676 *
1677 * x86_mmu_lock must be held.
1678 *
1679 * @param dst a zeroed out chunk of memory of sufficient size for the indicated
1680 * paging level.
1681 * @param src some paging structure from within the source page tables to copy
1682 * at the indicated paging level
1683 * @param level Current paging level
1684 * @retval 0 Success
1685 * @retval -ENOMEM Insufficient page pool memory
1686 */
1687 __pinned_func
1688 static int copy_page_table(pentry_t *dst, pentry_t *src, int level)
1689 {
1690 if (level == PTE_LEVEL) {
1691 /* Base case: leaf page table */
1692 for (int i = 0; i < get_num_entries(level); i++) {
1693 dst[i] = pte_finalize_value(reset_pte(src[i]), true,
1694 PTE_LEVEL);
1695 }
1696 } else {
1697 /* Recursive case: allocate sub-structures as needed and
1698 * make recursive calls on them
1699 */
1700 for (int i = 0; i < get_num_entries(level); i++) {
1701 pentry_t *child_dst;
1702 int ret;
1703
1704 if ((src[i] & MMU_P) == 0) {
1705 /* Non-present, skip */
1706 continue;
1707 }
1708
1709 if ((level == PDE_LEVEL) && ((src[i] & MMU_PS) != 0)) {
1710 /* large page: no lower level table */
1711 dst[i] = pte_finalize_value(src[i], true,
1712 PDE_LEVEL);
1713 continue;
1714 }
1715
1716 __ASSERT((src[i] & MMU_PS) == 0,
1717 "large page encountered");
1718
1719 child_dst = page_pool_get();
1720 if (child_dst == NULL) {
1721 return -ENOMEM;
1722 }
1723
1724 /* Page table links are by physical address. RAM
1725 * for page tables is identity-mapped, but double-
1726 * cast needed for PAE case where sizeof(void *) and
1727 * sizeof(pentry_t) are not the same.
1728 */
1729 dst[i] = ((pentry_t)k_mem_phys_addr(child_dst) |
1730 INT_FLAGS);
1731
1732 ret = copy_page_table(child_dst,
1733 next_table(src[i], level),
1734 level + 1);
1735 if (ret != 0) {
1736 return ret;
1737 }
1738 }
1739 }
1740
1741 return 0;
1742 }
1743
1744 __pinned_func
1745 static int region_map_update(pentry_t *ptables, void *start,
1746 size_t size, pentry_t flags, bool reset)
1747 {
1748 uint32_t options = OPTION_USER;
1749 int ret;
1750 k_spinlock_key_t key;
1751
1752 if (reset) {
1753 options |= OPTION_RESET;
1754 }
1755 if (ptables == z_x86_page_tables_get()) {
1756 options |= OPTION_FLUSH;
1757 }
1758
1759 key = k_spin_lock(&x86_mmu_lock);
1760 ret = range_map_ptables(ptables, start, 0, size, flags, MASK_PERM,
1761 options);
1762 k_spin_unlock(&x86_mmu_lock, key);
1763
1764 #ifdef CONFIG_SMP
1765 tlb_shootdown();
1766 #endif
1767
1768 return ret;
1769 }
1770
1771 __pinned_func
1772 static inline int reset_region(pentry_t *ptables, void *start, size_t size)
1773 {
1774 LOG_DBG("%s(%p, %p, %zu)", __func__, ptables, start, size);
1775 return region_map_update(ptables, start, size, 0, true);
1776 }
1777
1778 __pinned_func
1779 static inline int apply_region(pentry_t *ptables, void *start,
1780 size_t size, pentry_t attr)
1781 {
1782 LOG_DBG("%s(%p, %p, %zu, " PRI_ENTRY ")", __func__, ptables, start,
1783 size, attr);
1784 return region_map_update(ptables, start, size, attr, false);
1785 }
1786
1787 __pinned_func
1788 static void set_stack_perms(struct k_thread *thread, pentry_t *ptables)
1789 {
1790 LOG_DBG("update stack for thread %p's ptables at %p: 0x%" PRIxPTR " (size %zu)",
1791 thread, ptables, thread->stack_info.start,
1792 thread->stack_info.size);
1793 apply_region(ptables, (void *)thread->stack_info.start,
1794 thread->stack_info.size,
1795 MMU_P | MMU_XD | MMU_RW | MMU_US);
1796 }
1797
1798 /*
1799 * Arch interface implementations for memory domains and userspace
1800 */
1801
1802 __boot_func
1803 int arch_mem_domain_init(struct k_mem_domain *domain)
1804 {
1805 int ret;
1806 k_spinlock_key_t key = k_spin_lock(&x86_mmu_lock);
1807
1808 LOG_DBG("%s(%p)", __func__, domain);
1809 #if __ASSERT_ON
1810 sys_snode_t *node;
1811
1812 /* Assert that we have not already initialized this domain */
1813 SYS_SLIST_FOR_EACH_NODE(&x86_domain_list, node) {
1814 struct arch_mem_domain *list_domain =
1815 CONTAINER_OF(node, struct arch_mem_domain, node);
1816
1817 __ASSERT(list_domain != &domain->arch,
1818 "%s(%p) called multiple times", __func__, domain);
1819 }
1820 #endif /* __ASSERT_ON */
1821 #ifndef CONFIG_X86_KPTI
1822 /* If we're not using KPTI then we can use the build time page tables
1823 * (which are mutable) as the set of page tables for the default
1824 * memory domain, saving us some memory.
1825 *
1826 * We skip adding this domain to x86_domain_list since we already
1827 * update z_x86_kernel_ptables directly in range_map().
1828 */
1829 if (domain == &k_mem_domain_default) {
1830 domain->arch.ptables = z_x86_kernel_ptables;
1831 k_spin_unlock(&x86_mmu_lock, key);
1832 return 0;
1833 }
1834 #endif /* CONFIG_X86_KPTI */
1835 #ifdef CONFIG_X86_PAE
1836 /* PDPT is stored within the memory domain itself since it is
1837 * much smaller than a full page
1838 */
1839 (void)memset(domain->arch.pdpt, 0, sizeof(domain->arch.pdpt));
1840 domain->arch.ptables = domain->arch.pdpt;
1841 #else
1842 /* Allocate a page-sized top-level structure, either a PD or PML4 */
1843 domain->arch.ptables = page_pool_get();
1844 if (domain->arch.ptables == NULL) {
1845 k_spin_unlock(&x86_mmu_lock, key);
1846 return -ENOMEM;
1847 }
1848 #endif /* CONFIG_X86_PAE */
1849
1850 LOG_DBG("copy_page_table(%p, %p, 0)", domain->arch.ptables,
1851 z_x86_kernel_ptables);
1852
1853 /* Make a copy of the boot page tables created by gen_mmu.py */
1854 ret = copy_page_table(domain->arch.ptables, z_x86_kernel_ptables, 0);
1855 if (ret == 0) {
1856 sys_slist_append(&x86_domain_list, &domain->arch.node);
1857 }
1858 k_spin_unlock(&x86_mmu_lock, key);
1859
1860 return ret;
1861 }
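
/* A minimal usage sketch (not compiled) of the kernel memory domain API whose
 * calls land in arch_mem_domain_init() above and the partition/thread hooks
 * below. The buffer, partition, domain and thread names are hypothetical and
 * exist only for illustration.
 */
#if 0
static uint8_t __aligned(CONFIG_MMU_PAGE_SIZE)
	example_buf[CONFIG_MMU_PAGE_SIZE];

K_MEM_PARTITION_DEFINE(example_part, example_buf, sizeof(example_buf),
		       K_MEM_PARTITION_P_RW_U_RW);

static struct k_mem_domain example_domain;

static void example_domain_setup(struct k_thread *user_thread)
{
	struct k_mem_partition *parts[] = { &example_part };

	/* Creates the domain's private copy of the boot page tables
	 * (arch_mem_domain_init()) and applies the initial partition
	 * (arch_mem_domain_partition_add()).
	 */
	(void)k_mem_domain_init(&example_domain, ARRAY_SIZE(parts), parts);

	/* Switches the thread to the domain's page tables; on migration
	 * this also updates stack permissions (arch_mem_domain_thread_add()).
	 */
	(void)k_mem_domain_add_thread(&example_domain, user_thread);
}
#endif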

int arch_mem_domain_partition_remove(struct k_mem_domain *domain,
				     uint32_t partition_id)
{
	struct k_mem_partition *partition = &domain->partitions[partition_id];

	/* Reset the partition's region back to defaults */
	return reset_region(domain->arch.ptables, (void *)partition->start,
			    partition->size);
}

/* Called on thread exit or when moving it to a different memory domain */
int arch_mem_domain_thread_remove(struct k_thread *thread)
{
	struct k_mem_domain *domain = thread->mem_domain_info.mem_domain;

	if ((thread->base.user_options & K_USER) == 0) {
		return 0;
	}

	if ((thread->base.thread_state & _THREAD_DEAD) == 0) {
		/* Thread is migrating to another memory domain and not
		 * exiting for good; we weren't called from
		 * z_thread_abort(). Resetting the stack region will
		 * take place in the forthcoming thread_add() call.
		 */
		return 0;
	}

	/* Restore permissions on the thread's stack area since it is no
	 * longer a member of the domain.
	 */
	return reset_region(domain->arch.ptables,
			    (void *)thread->stack_info.start,
			    thread->stack_info.size);
}

__pinned_func
int arch_mem_domain_partition_add(struct k_mem_domain *domain,
				  uint32_t partition_id)
{
	struct k_mem_partition *partition = &domain->partitions[partition_id];

	/* Update the page tables with the partition info */
	return apply_region(domain->arch.ptables, (void *)partition->start,
			    partition->size, partition->attr | MMU_P);
}

/* Invoked from memory domain API calls, as well as during thread creation */
__pinned_func
int arch_mem_domain_thread_add(struct k_thread *thread)
{
	int ret = 0;

	/* New memory domain we are being added to */
	struct k_mem_domain *domain = thread->mem_domain_info.mem_domain;
	/* This is only set for threads that were migrating from some other
	 * memory domain; for newly created threads it is NULL.
	 *
	 * Note that the NULL check on old_ptables must be done before any
	 * address translation or else (NULL + offset) != NULL.
	 */
	pentry_t *old_ptables = UINT_TO_POINTER(thread->arch.ptables);
	bool is_user = (thread->base.user_options & K_USER) != 0;
	bool is_migration = (old_ptables != NULL) && is_user;

	/* Allow US access to the thread's stack in its new domain if
	 * we are migrating. If we are not migrating, this is done in
	 * z_x86_current_stack_perms().
	 */
	if (is_migration) {
		old_ptables = k_mem_virt_addr(thread->arch.ptables);
		set_stack_perms(thread, domain->arch.ptables);
	}

	thread->arch.ptables = k_mem_phys_addr(domain->arch.ptables);
	LOG_DBG("set thread %p page tables to 0x%" PRIxPTR, thread,
		thread->arch.ptables);

	/* Check if we're doing a migration from a different memory domain
	 * and have to remove permissions from its old domain.
	 *
	 * XXX: The checks we have to do here and in
	 * arch_mem_domain_thread_remove() are clumsy; it may be worth looking
	 * into adding a specific arch_mem_domain_thread_migrate() API.
	 * See #29601
	 */
	if (is_migration) {
		ret = reset_region(old_ptables,
				   (void *)thread->stack_info.start,
				   thread->stack_info.size);
	}

#if !defined(CONFIG_X86_KPTI) && !defined(CONFIG_X86_COMMON_PAGE_TABLE)
	/* Need to switch to using these new page tables, in case we drop
	 * to user mode before we are ever context switched out.
	 * IPI takes care of this if the thread is currently running on some
	 * other CPU.
	 */
	if (thread == arch_current_thread() && thread->arch.ptables != z_x86_cr3_get()) {
		z_x86_cr3_set(thread->arch.ptables);
	}
#endif /* !CONFIG_X86_KPTI && !CONFIG_X86_COMMON_PAGE_TABLE */

	return ret;
}
#endif /* !CONFIG_X86_COMMON_PAGE_TABLE */

__pinned_func
int arch_mem_domain_max_partitions_get(void)
{
	return CONFIG_MAX_DOMAIN_PARTITIONS;
}

/* Invoked from z_x86_userspace_enter */
__pinned_func
void z_x86_current_stack_perms(void)
{
	/* Clear any previous context in the stack buffer to prevent
	 * unintentional data leakage.
	 */
	(void)memset((void *)arch_current_thread()->stack_info.start, 0xAA,
		     arch_current_thread()->stack_info.size -
		     arch_current_thread()->stack_info.delta);

	/* Only now is it safe to grant access to the stack buffer since any
	 * previous context has been erased.
	 */
#ifdef CONFIG_X86_COMMON_PAGE_TABLE
	/* Re-run the swap page table update logic since we're entering user
	 * mode. This will grant stack and memory domain access if it wasn't
	 * set already (in which case this returns very quickly).
	 */
	z_x86_swap_update_common_page_table(arch_current_thread());
#else
	/* Memory domain access is already programmed into the page tables.
	 * Need to enable access to this new user thread's stack buffer in
	 * its domain-specific page tables.
	 */
	set_stack_perms(arch_current_thread(), z_x86_thread_page_tables_get(arch_current_thread()));
#endif
}
#endif /* CONFIG_USERSPACE */

#ifdef CONFIG_ARCH_HAS_RESERVED_PAGE_FRAMES
__boot_func
static void mark_addr_page_reserved(uintptr_t addr, size_t len)
{
	uintptr_t pos = ROUND_DOWN(addr, CONFIG_MMU_PAGE_SIZE);
	uintptr_t end = ROUND_UP(addr + len, CONFIG_MMU_PAGE_SIZE);

	for (; pos < end; pos += CONFIG_MMU_PAGE_SIZE) {
		if (!k_mem_is_page_frame(pos)) {
			continue;
		}

		k_mem_page_frame_set(k_mem_phys_to_page_frame(pos),
				     K_MEM_PAGE_FRAME_RESERVED);
	}
}

__boot_func
void arch_reserved_pages_update(void)
{
#ifdef CONFIG_X86_PC_COMPATIBLE
	/*
	 * Ideally we would do E820 or similar enumeration to specifically
	 * identify all page frames which are reserved by the hardware or
	 * firmware, or use x86_memmap[] with Multiboot if available.
	 *
	 * For now, unconditionally reserve everything in the first megabyte
	 * of physical memory on PC-compatible platforms.
	 */
	mark_addr_page_reserved(0, MB(1));
#endif /* CONFIG_X86_PC_COMPATIBLE */

#ifdef CONFIG_X86_MEMMAP
	for (int i = 0; i < CONFIG_X86_MEMMAP_ENTRIES; i++) {
		struct x86_memmap_entry *entry = &x86_memmap[i];

		switch (entry->type) {
		case X86_MEMMAP_ENTRY_UNUSED:
			__fallthrough;
		case X86_MEMMAP_ENTRY_RAM:
			continue;

		case X86_MEMMAP_ENTRY_ACPI:
			__fallthrough;
		case X86_MEMMAP_ENTRY_NVS:
			__fallthrough;
		case X86_MEMMAP_ENTRY_DEFECTIVE:
			__fallthrough;
		default:
			/* Any of the cases above reaches here: exit the
			 * switch and mark the page range reserved.
			 */
			break;
		}

		mark_addr_page_reserved(entry->base, entry->length);
	}
#endif /* CONFIG_X86_MEMMAP */
}
#endif /* CONFIG_ARCH_HAS_RESERVED_PAGE_FRAMES */

int arch_page_phys_get(void *virt, uintptr_t *phys)
{
	pentry_t pte = 0;
	int level, ret;

	__ASSERT(POINTER_TO_UINT(virt) % CONFIG_MMU_PAGE_SIZE == 0U,
		 "unaligned address %p to %s", virt, __func__);

	pentry_get(&level, &pte, z_x86_page_tables_get(), virt);

	if ((pte & MMU_P) != 0) {
		if (phys != NULL) {
			*phys = (uintptr_t)get_entry_phys(pte, PTE_LEVEL);
		}
		ret = 0;
	} else {
		/* Not mapped */
		ret = -EFAULT;
	}

	return ret;
}
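
/* A minimal sketch (not compiled) of querying the physical frame backing a
 * page-aligned virtual address with arch_page_phys_get(). The function and
 * parameter names are hypothetical; only page-aligned addresses may be
 * passed in, per the assertion above.
 */
#if 0
static void example_query_phys(void *some_obj)
{
	uintptr_t phys;
	void *page = (void *)ROUND_DOWN(POINTER_TO_UINT(some_obj),
					CONFIG_MMU_PAGE_SIZE);

	if (arch_page_phys_get(page, &phys) == 0) {
		LOG_DBG("virt %p is backed by phys 0x%lx", page,
			(unsigned long)phys);
	} else {
		LOG_DBG("virt %p is not mapped", page);
	}
}
#endif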

#ifdef CONFIG_DEMAND_PAGING
#define PTE_MASK (paging_levels[PTE_LEVEL].mask)

__pinned_func
void arch_mem_page_out(void *addr, uintptr_t location)
{
	int ret;
	pentry_t mask = PTE_MASK | MMU_P | MMU_A;

	/* Accessed bit set to guarantee the entry is not completely 0 in
	 * case of location value 0. A totally 0 PTE is un-mapped.
	 */
	ret = range_map(addr, location, CONFIG_MMU_PAGE_SIZE, MMU_A, mask,
			OPTION_FLUSH);
	__ASSERT_NO_MSG(ret == 0);
	ARG_UNUSED(ret);
}

__pinned_func
void arch_mem_page_in(void *addr, uintptr_t phys)
{
	int ret;
	pentry_t mask = PTE_MASK | MMU_P | MMU_D | MMU_A;

	ret = range_map(addr, phys, CONFIG_MMU_PAGE_SIZE, MMU_P, mask,
			OPTION_FLUSH);
	__ASSERT_NO_MSG(ret == 0);
	ARG_UNUSED(ret);
}
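
/* A minimal sketch (not compiled) of the property the two functions above
 * rely on: after arch_mem_page_out(), the data page's PTE is non-present but
 * still encodes the backing store location, which can be read back via
 * arch_page_location_get() further below. The parameter names are
 * hypothetical.
 */
#if 0
static void example_page_out_round_trip(void *data_page, uintptr_t location)
{
	uintptr_t stored;

	arch_mem_page_out(data_page, location);

	if (arch_page_location_get(data_page, &stored) ==
	    ARCH_PAGE_LOCATION_PAGED_OUT) {
		__ASSERT(stored == location, "backing location not preserved");
	}
}
#endif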

__pinned_func
void arch_mem_scratch(uintptr_t phys)
{
	page_map_set(z_x86_page_tables_get(), K_MEM_SCRATCH_PAGE,
		     phys | MMU_P | MMU_RW | MMU_XD, NULL, MASK_ALL,
		     OPTION_FLUSH);
}
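
/* A minimal sketch (not compiled) of how a caller might use the scratch
 * mapping to access a physical page frame that has no other virtual mapping,
 * e.g. to copy its contents out during eviction. The helper name and
 * destination buffer are hypothetical.
 */
#if 0
static void example_copy_frame_out(uintptr_t frame_phys, void *dest_buf)
{
	/* Map the frame at the scratch virtual address... */
	arch_mem_scratch(frame_phys);

	/* ...then access its contents through K_MEM_SCRATCH_PAGE */
	memcpy(dest_buf, K_MEM_SCRATCH_PAGE, CONFIG_MMU_PAGE_SIZE);
}
#endif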

__pinned_func
uintptr_t arch_page_info_get(void *addr, uintptr_t *phys, bool clear_accessed)
{
	pentry_t all_pte, mask;
	uint32_t options;

	/* What to change, if anything, in the page_map_set() calls */
	if (clear_accessed) {
		mask = MMU_A;
		options = OPTION_FLUSH;
	} else {
		/* In this configuration page_map_set() just queries the
		 * page table and makes no changes
		 */
		mask = 0;
		options = 0U;
	}

	page_map_set(z_x86_kernel_ptables, addr, 0, &all_pte, mask, options);

	/* Un-mapped PTEs are completely zeroed. No need to report anything
	 * else in this case.
	 */
	if (all_pte == 0) {
		return ARCH_DATA_PAGE_NOT_MAPPED;
	}

#if defined(CONFIG_USERSPACE) && !defined(CONFIG_X86_COMMON_PAGE_TABLE)
	/* Don't bother looking at other page tables if non-present as we
	 * are not required to report accurate accessed/dirty in this case
	 * and all mappings are otherwise the same.
	 */
	if ((all_pte & MMU_P) != 0) {
		sys_snode_t *node;

		/* IRQs are locked, safe to do this */
		SYS_SLIST_FOR_EACH_NODE(&x86_domain_list, node) {
			pentry_t cur_pte;
			struct arch_mem_domain *domain =
				CONTAINER_OF(node, struct arch_mem_domain,
					     node);

			page_map_set(domain->ptables, addr, 0, &cur_pte,
				     mask, options | OPTION_USER);

			/* Logical OR of the relevant PTE in all page tables.
			 * The address/location and present state should be
			 * identical among them.
			 */
			all_pte |= cur_pte;
		}
	}
#endif /* USERSPACE && !X86_COMMON_PAGE_TABLE */

	/* NOTE: We are truncating the PTE on PAE systems, whose pentry_t
	 * is larger than a uintptr_t.
	 *
	 * We currently aren't required to report back the XD state (bit 63),
	 * and Zephyr just doesn't support large physical memory on 32-bit
	 * systems; PAE was only implemented for XD support.
	 */
	if (phys != NULL) {
		*phys = (uintptr_t)get_entry_phys(all_pte, PTE_LEVEL);
	}

	/* We don't filter out any other bits in the PTE and the kernel
	 * ignores them. For the case of ARCH_DATA_PAGE_NOT_MAPPED,
	 * we use a bit which is never set in a real PTE (the PAT bit) in the
	 * current system.
	 *
	 * The other ARCH_DATA_PAGE_* macros are defined to their corresponding
	 * bits in the PTE.
	 */
	return (uintptr_t)all_pte;
}
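
/* A minimal sketch (not compiled) of how an eviction algorithm could use
 * arch_page_info_get() with clear_accessed=true to periodically sample and
 * reset the accessed bits of candidate data pages. The page array and its
 * bookkeeping are hypothetical.
 */
#if 0
static void example_accessed_bit_scan(void **pages, size_t num_pages)
{
	for (size_t i = 0; i < num_pages; i++) {
		uintptr_t info = arch_page_info_get(pages[i], NULL, true);

		if (info == ARCH_DATA_PAGE_NOT_MAPPED) {
			continue;
		}

		if ((info & ARCH_DATA_PAGE_ACCESSED) != 0) {
			/* Page was touched since the last scan; treat it
			 * as recently used and deprioritize for eviction.
			 */
		}
	}
}
#endif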

__pinned_func
enum arch_page_location arch_page_location_get(void *addr, uintptr_t *location)
{
	pentry_t pte;
	int level;

	/* TODO: since we only have to query the current set of page tables,
	 * could optimize this with recursive page table mapping
	 */
	pentry_get(&level, &pte, z_x86_page_tables_get(), addr);

	if (pte == 0) {
		/* Not mapped */
		return ARCH_PAGE_LOCATION_BAD;
	}

	__ASSERT(level == PTE_LEVEL, "bigpage found at %p", addr);
	*location = (uintptr_t)get_entry_phys(pte, PTE_LEVEL);

	if ((pte & MMU_P) != 0) {
		return ARCH_PAGE_LOCATION_PAGED_IN;
	} else {
		return ARCH_PAGE_LOCATION_PAGED_OUT;
	}
}

#ifdef CONFIG_X86_KPTI
__pinned_func
bool z_x86_kpti_is_access_ok(void *addr, pentry_t *ptables)
{
	pentry_t pte;
	int level;

	pentry_get(&level, &pte, ptables, addr);

	/* Might as well also check if it's un-mapped; normally we don't
	 * fetch the PTE from the page tables until we are inside
	 * k_mem_page_fault() and call arch_page_fault_status_get()
	 */
	if (level != PTE_LEVEL || pte == 0 || is_flipped_pte(pte)) {
		return false;
	}

	return true;
}
#endif /* CONFIG_X86_KPTI */
#endif /* CONFIG_DEMAND_PAGING */