1 /*
2 * Copyright (c) 2011-2014 Wind River Systems, Inc.
3 * Copyright (c) 2017-2020 Intel Corporation
4 *
5 * SPDX-License-Identifier: Apache-2.0
6 */
7
8 #include <zephyr/kernel.h>
9 #include <zephyr/arch/x86/mmustructs.h>
10 #include <zephyr/sys/mem_manage.h>
11 #include <zephyr/sys/__assert.h>
12 #include <zephyr/sys/check.h>
13 #include <zephyr/logging/log.h>
14 #include <errno.h>
15 #include <ctype.h>
16 #include <zephyr/spinlock.h>
17 #include <kernel_arch_func.h>
18 #include <x86_mmu.h>
19 #include <zephyr/init.h>
20 #include <kernel_internal.h>
21 #include <mmu.h>
22 #include <zephyr/drivers/interrupt_controller/loapic.h>
23 #include <mmu.h>
24 #include <zephyr/arch/x86/memmap.h>
25
26 LOG_MODULE_DECLARE(os, CONFIG_KERNEL_LOG_LEVEL);
27
28 /* We will use some ignored bits in the PTE to backup permission settings
29 * when the mapping was made. This is used to un-apply memory domain memory
30 * partitions to page tables when the partitions are removed.
31 */
32 #define MMU_RW_ORIG MMU_IGNORED0
33 #define MMU_US_ORIG MMU_IGNORED1
34 #define MMU_XD_ORIG MMU_IGNORED2
35
36 /* Bits in the PTE that form the set of permission bits, when resetting */
37 #define MASK_PERM (MMU_RW | MMU_US | MMU_XD)
38
39 /* When we want to set up a new mapping, discarding any previous state */
40 #define MASK_ALL (~((pentry_t)0U))
41
42 /* Bits to set at mapping time for particular permissions. We set the actual
43 * page table bit effecting the policy and also the backup bit.
44 */
45 #define ENTRY_RW (MMU_RW | MMU_RW_ORIG)
46 #define ENTRY_US (MMU_US | MMU_US_ORIG)
47 #define ENTRY_XD (MMU_XD | MMU_XD_ORIG)
48
49 /* Bit position which is always zero in a PTE. We'll use the PAT bit.
50 * This helps disambiguate PTEs that do not have the Present bit set (MMU_P):
51 * - If the entire entry is zero, it's an un-mapped virtual page
52 * - If PTE_ZERO is set, we flipped this page due to KPTI
53 * - Otherwise, this was a page-out
54 */
55 #define PTE_ZERO MMU_PAT
56
57 /* Protects x86_domain_list and serializes instantiation of intermediate
58 * paging structures.
59 */
60 __pinned_bss
61 static struct k_spinlock x86_mmu_lock;
62
63 #if defined(CONFIG_USERSPACE) && !defined(CONFIG_X86_COMMON_PAGE_TABLE)
64 /* List of all active and initialized memory domains. This is used to make
65 * sure all memory mappings are the same across all page tables when invoking
66 * range_map()
67 */
68 __pinned_bss
69 static sys_slist_t x86_domain_list;
70 #endif
71
72 /*
73 * Definitions for building an ontology of paging levels and capabilities
74 * at each level
75 */
76
77 /* Data structure describing the characteristics of a particular paging
78 * level
79 */
80 struct paging_level {
81 /* What bits are used to store physical address */
82 pentry_t mask;
83
84 /* Number of entries in this paging structure */
85 size_t entries;
86
87 /* How many bits to right-shift a virtual address to obtain the
88 * appropriate entry within this table.
89 *
90 * The memory scope of each entry in this table is 1 << shift.
91 */
92 unsigned int shift;
93 #ifdef CONFIG_EXCEPTION_DEBUG
94 /* Name of this level, for debug purposes */
95 const char *name;
96 #endif
97 };
98
99 /* Flags for all entries in intermediate paging levels.
100 * Fortunately, the same bits are set for all intermediate levels for all
101 * three paging modes.
102 *
103 * Obviously P is set.
104 *
105 * We want RW and US bit always set; actual access control will be
106 * done at the leaf level.
107 *
108 * XD (if supported) always 0. Disabling execution done at leaf level.
109 *
110 * PCD/PWT always 0. Caching properties again done at leaf level.
111 */
112 #define INT_FLAGS (MMU_P | MMU_RW | MMU_US)
113
114 /* Paging level ontology for the selected paging mode.
115 *
116 * See Figures 4-4, 4-7, 4-11 in the Intel SDM, vol 3A
117 */
118 __pinned_rodata
119 static const struct paging_level paging_levels[] = {
120 #ifdef CONFIG_X86_64
121 /* Page Map Level 4 */
122 {
123 .mask = 0x7FFFFFFFFFFFF000ULL,
124 .entries = 512U,
125 .shift = 39U,
126 #ifdef CONFIG_EXCEPTION_DEBUG
127 .name = "PML4"
128 #endif
129 },
130 #endif /* CONFIG_X86_64 */
131 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
132 /* Page Directory Pointer Table */
133 {
134 .mask = 0x7FFFFFFFFFFFF000ULL,
135 #ifdef CONFIG_X86_64
136 .entries = 512U,
137 #else
138 /* PAE version */
139 .entries = 4U,
140 #endif
141 .shift = 30U,
142 #ifdef CONFIG_EXCEPTION_DEBUG
143 .name = "PDPT"
144 #endif
145 },
146 #endif /* CONFIG_X86_64 || CONFIG_X86_PAE */
147 /* Page Directory */
148 {
149 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
150 .mask = 0x7FFFFFFFFFFFF000ULL,
151 .entries = 512U,
152 .shift = 21U,
153 #else
154 /* 32-bit */
155 .mask = 0xFFFFF000U,
156 .entries = 1024U,
157 .shift = 22U,
158 #endif /* CONFIG_X86_64 || CONFIG_X86_PAE */
159 #ifdef CONFIG_EXCEPTION_DEBUG
160 .name = "PD"
161 #endif
162 },
163 /* Page Table */
164 {
165 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
166 .mask = 0x07FFFFFFFFFFF000ULL,
167 .entries = 512U,
168 .shift = 12U,
169 #else
170 /* 32-bit */
171 .mask = 0xFFFFF000U,
172 .entries = 1024U,
173 .shift = 12U,
174 #endif /* CONFIG_X86_64 || CONFIG_X86_PAE */
175 #ifdef CONFIG_EXCEPTION_DEBUG
176 .name = "PT"
177 #endif
178 }
179 };
180
181 #define NUM_LEVELS ARRAY_SIZE(paging_levels)
182 #define PTE_LEVEL (NUM_LEVELS - 1)
183 #define PDE_LEVEL (NUM_LEVELS - 2)
184
185 /*
186 * Macros for reserving space for page tables
187 *
188 * We need to reserve a block of memory equal in size to the page tables
189 * generated by gen_mmu.py so that memory addresses do not shift between
190 * build phases. These macros ultimately specify INITIAL_PAGETABLE_SIZE.
191 */
192 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
193 #ifdef CONFIG_X86_64
194 #define NUM_PML4_ENTRIES 512U
195 #define NUM_PDPT_ENTRIES 512U
196 #else
197 #define NUM_PDPT_ENTRIES 4U
198 #endif /* CONFIG_X86_64 */
199 #define NUM_PD_ENTRIES 512U
200 #define NUM_PT_ENTRIES 512U
201 #else
202 #define NUM_PD_ENTRIES 1024U
203 #define NUM_PT_ENTRIES 1024U
204 #endif /* !CONFIG_X86_64 && !CONFIG_X86_PAE */
205
206 /* Memory range covered by an instance of various table types */
207 #define PT_AREA ((uintptr_t)(CONFIG_MMU_PAGE_SIZE * NUM_PT_ENTRIES))
208 #define PD_AREA (PT_AREA * NUM_PD_ENTRIES)
209 #ifdef CONFIG_X86_64
210 #define PDPT_AREA (PD_AREA * NUM_PDPT_ENTRIES)
211 #endif
212
213 #define VM_ADDR CONFIG_KERNEL_VM_BASE
214 #define VM_SIZE CONFIG_KERNEL_VM_SIZE
215
216 /* Define a range [PT_START, PT_END) which is the memory range
217 * covered by all the page tables needed for the address space
218 */
219 #define PT_START ((uintptr_t)ROUND_DOWN(VM_ADDR, PT_AREA))
220 #define PT_END ((uintptr_t)ROUND_UP(VM_ADDR + VM_SIZE, PT_AREA))
221
222 /* Number of page tables needed to cover address space. Depends on the specific
223 * bounds, but roughly 1 page table per 2MB of RAM
224 */
225 #define NUM_PT ((PT_END - PT_START) / PT_AREA)
226
227 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
228 /* Same semantics as above, but for the page directories needed to cover
229 * system RAM.
230 */
231 #define PD_START ((uintptr_t)ROUND_DOWN(VM_ADDR, PD_AREA))
232 #define PD_END ((uintptr_t)ROUND_UP(VM_ADDR + VM_SIZE, PD_AREA))
233 /* Number of page directories needed to cover the address space. Depends on the
234 * specific bounds, but roughly 1 page directory per 1GB of RAM
235 */
236 #define NUM_PD ((PD_END - PD_START) / PD_AREA)
237 #else
238 /* 32-bit page tables just have one toplevel page directory */
239 #define NUM_PD 1
240 #endif
241
242 #ifdef CONFIG_X86_64
243 /* Same semantics as above, but for the page directory pointer tables needed
244 * to cover the address space. On 32-bit there is just one 4-entry PDPT.
245 */
246 #define PDPT_START ((uintptr_t)ROUND_DOWN(VM_ADDR, PDPT_AREA))
247 #define PDPT_END ((uintptr_t)ROUND_UP(VM_ADDR + VM_SIZE, PDPT_AREA))
248 /* Number of PDPTs needed to cover the address space. 1 PDPT per 512GB of VM */
249 #define NUM_PDPT ((PDPT_END - PDPT_START) / PDPT_AREA)
250
251 /* All pages needed for page tables, using computed values plus one more for
252 * the top-level PML4
253 */
254 #define NUM_TABLE_PAGES (NUM_PT + NUM_PD + NUM_PDPT + 1)
255 #else /* !CONFIG_X86_64 */
256 /* Number of pages we need to reserve in the stack for per-thread page tables */
257 #define NUM_TABLE_PAGES (NUM_PT + NUM_PD)
258 #endif /* CONFIG_X86_64 */
259
260 #define INITIAL_PTABLE_PAGES \
261 (NUM_TABLE_PAGES + CONFIG_X86_EXTRA_PAGE_TABLE_PAGES)
262
263 #ifdef CONFIG_X86_PAE
264 /* Toplevel PDPT wasn't included as it is not a page in size */
265 #define INITIAL_PTABLE_SIZE \
266 ((INITIAL_PTABLE_PAGES * CONFIG_MMU_PAGE_SIZE) + 0x20)
267 #else
268 #define INITIAL_PTABLE_SIZE \
269 (INITIAL_PTABLE_PAGES * CONFIG_MMU_PAGE_SIZE)
270 #endif
271
272 /* "dummy" pagetables for the first-phase build. The real page tables
273 * are produced by gen-mmu.py based on data read in zephyr-prebuilt.elf,
274 * and this dummy array is discarded.
275 */
276 Z_GENERIC_SECTION(.dummy_pagetables)
277 static __used char dummy_pagetables[INITIAL_PTABLE_SIZE];
278
279 /*
280 * Utility functions
281 */
282
283 /* For a table at a particular level, get the entry index that corresponds to
284 * the provided virtual address
285 */
286 __pinned_func
get_index(void * virt,int level)287 static inline int get_index(void *virt, int level)
288 {
289 return (((uintptr_t)virt >> paging_levels[level].shift) %
290 paging_levels[level].entries);
291 }
292
293 __pinned_func
get_entry_ptr(pentry_t * ptables,void * virt,int level)294 static inline pentry_t *get_entry_ptr(pentry_t *ptables, void *virt, int level)
295 {
296 return &ptables[get_index(virt, level)];
297 }
298
299 __pinned_func
get_entry(pentry_t * ptables,void * virt,int level)300 static inline pentry_t get_entry(pentry_t *ptables, void *virt, int level)
301 {
302 return ptables[get_index(virt, level)];
303 }
304
305 /* Get the physical memory address associated with this table entry */
306 __pinned_func
get_entry_phys(pentry_t entry,int level)307 static inline uintptr_t get_entry_phys(pentry_t entry, int level)
308 {
309 return entry & paging_levels[level].mask;
310 }
311
312 /* Return the virtual address of a linked table stored in the provided entry */
313 __pinned_func
next_table(pentry_t entry,int level)314 static inline pentry_t *next_table(pentry_t entry, int level)
315 {
316 return z_mem_virt_addr(get_entry_phys(entry, level));
317 }
318
319 /* Number of table entries at this level */
320 __pinned_func
get_num_entries(int level)321 static inline size_t get_num_entries(int level)
322 {
323 return paging_levels[level].entries;
324 }
325
326 /* 4K for everything except PAE PDPTs */
327 __pinned_func
table_size(int level)328 static inline size_t table_size(int level)
329 {
330 return get_num_entries(level) * sizeof(pentry_t);
331 }
332
333 /* For a table at a particular level, size of the amount of virtual memory
334 * that an entry within the table covers
335 */
336 __pinned_func
get_entry_scope(int level)337 static inline size_t get_entry_scope(int level)
338 {
339 return (1UL << paging_levels[level].shift);
340 }
341
342 /* For a table at a particular level, size of the amount of virtual memory
343 * that this entire table covers
344 */
345 __pinned_func
get_table_scope(int level)346 static inline size_t get_table_scope(int level)
347 {
348 return get_entry_scope(level) * get_num_entries(level);
349 }
350
351 /* Must have checked Present bit first! Non-present entries may have OS data
352 * stored in any other bits
353 */
354 __pinned_func
is_leaf(int level,pentry_t entry)355 static inline bool is_leaf(int level, pentry_t entry)
356 {
357 if (level == PTE_LEVEL) {
358 /* Always true for PTE */
359 return true;
360 }
361
362 return ((entry & MMU_PS) != 0U);
363 }
364
365 /* This does NOT (by design) un-flip KPTI PTEs, it's just the raw PTE value */
366 __pinned_func
pentry_get(int * paging_level,pentry_t * val,pentry_t * ptables,void * virt)367 static inline void pentry_get(int *paging_level, pentry_t *val,
368 pentry_t *ptables, void *virt)
369 {
370 pentry_t *table = ptables;
371
372 for (int level = 0; level < NUM_LEVELS; level++) {
373 pentry_t entry = get_entry(table, virt, level);
374
375 if ((entry & MMU_P) == 0 || is_leaf(level, entry)) {
376 *val = entry;
377 if (paging_level != NULL) {
378 *paging_level = level;
379 }
380 break;
381 } else {
382 table = next_table(entry, level);
383 }
384 }
385 }
386
387 __pinned_func
tlb_flush_page(void * addr)388 static inline void tlb_flush_page(void *addr)
389 {
390 /* Invalidate TLB entries corresponding to the page containing the
391 * specified address
392 */
393 char *page = (char *)addr;
394
395 __asm__ ("invlpg %0" :: "m" (*page));
396 }
397
398 #ifdef CONFIG_X86_KPTI
399 __pinned_func
is_flipped_pte(pentry_t pte)400 static inline bool is_flipped_pte(pentry_t pte)
401 {
402 return (pte & MMU_P) == 0 && (pte & PTE_ZERO) != 0;
403 }
404 #endif
405
406 #if defined(CONFIG_SMP)
407 __pinned_func
z_x86_tlb_ipi(const void * arg)408 void z_x86_tlb_ipi(const void *arg)
409 {
410 uintptr_t ptables_phys;
411
412 ARG_UNUSED(arg);
413
414 #ifdef CONFIG_X86_KPTI
415 /* We're always on the kernel's set of page tables in this context
416 * if KPTI is turned on
417 */
418 ptables_phys = z_x86_cr3_get();
419 __ASSERT(ptables_phys == z_mem_phys_addr(&z_x86_kernel_ptables), "");
420 #else
421 /* We might have been moved to another memory domain, so always invoke
422 * z_x86_thread_page_tables_get() instead of using current CR3 value.
423 */
424 ptables_phys = z_mem_phys_addr(z_x86_thread_page_tables_get(_current));
425 #endif
426 /*
427 * In the future, we can consider making this smarter, such as
428 * propagating which page tables were modified (in case they are
429 * not active on this CPU) or an address range to call
430 * tlb_flush_page() on.
431 */
432 LOG_DBG("%s on CPU %d\n", __func__, arch_curr_cpu()->id);
433
434 z_x86_cr3_set(ptables_phys);
435 }
436
437 /* NOTE: This is not synchronous and the actual flush takes place some short
438 * time after this exits.
439 */
440 __pinned_func
tlb_shootdown(void)441 static inline void tlb_shootdown(void)
442 {
443 z_loapic_ipi(0, LOAPIC_ICR_IPI_OTHERS, CONFIG_TLB_IPI_VECTOR);
444 }
445 #endif /* CONFIG_SMP */
446
447 __pinned_func
assert_addr_aligned(uintptr_t addr)448 static inline void assert_addr_aligned(uintptr_t addr)
449 {
450 #if __ASSERT_ON
451 __ASSERT((addr & (CONFIG_MMU_PAGE_SIZE - 1)) == 0U,
452 "unaligned address 0x%" PRIxPTR, addr);
453 #endif
454 }
455
456 __pinned_func
is_addr_aligned(uintptr_t addr)457 static inline bool is_addr_aligned(uintptr_t addr)
458 {
459 if ((addr & (CONFIG_MMU_PAGE_SIZE - 1)) == 0U) {
460 return true;
461 } else {
462 return false;
463 }
464 }
465
466 __pinned_func
assert_virt_addr_aligned(void * addr)467 static inline void assert_virt_addr_aligned(void *addr)
468 {
469 assert_addr_aligned((uintptr_t)addr);
470 }
471
472 __pinned_func
is_virt_addr_aligned(void * addr)473 static inline bool is_virt_addr_aligned(void *addr)
474 {
475 return is_addr_aligned((uintptr_t)addr);
476 }
477
478 __pinned_func
assert_size_aligned(size_t size)479 static inline void assert_size_aligned(size_t size)
480 {
481 #if __ASSERT_ON
482 __ASSERT((size & (CONFIG_MMU_PAGE_SIZE - 1)) == 0U,
483 "unaligned size %zu", size);
484 #endif
485 }
486
487 __pinned_func
is_size_aligned(size_t size)488 static inline bool is_size_aligned(size_t size)
489 {
490 if ((size & (CONFIG_MMU_PAGE_SIZE - 1)) == 0U) {
491 return true;
492 } else {
493 return false;
494 }
495 }
496
497 __pinned_func
assert_region_page_aligned(void * addr,size_t size)498 static inline void assert_region_page_aligned(void *addr, size_t size)
499 {
500 assert_virt_addr_aligned(addr);
501 assert_size_aligned(size);
502 }
503
504 __pinned_func
is_region_page_aligned(void * addr,size_t size)505 static inline bool is_region_page_aligned(void *addr, size_t size)
506 {
507 if (!is_virt_addr_aligned(addr)) {
508 return false;
509 }
510
511 return is_size_aligned(size);
512 }
513
514 /*
515 * Debug functions. All conditionally compiled with CONFIG_EXCEPTION_DEBUG.
516 */
517 #ifdef CONFIG_EXCEPTION_DEBUG
518
519 /* Add colors to page table dumps to indicate mapping type */
520 #define COLOR_PAGE_TABLES 1
521
522 #if COLOR_PAGE_TABLES
523 #define ANSI_DEFAULT "\x1B" "[0m"
524 #define ANSI_RED "\x1B" "[1;31m"
525 #define ANSI_GREEN "\x1B" "[1;32m"
526 #define ANSI_YELLOW "\x1B" "[1;33m"
527 #define ANSI_BLUE "\x1B" "[1;34m"
528 #define ANSI_MAGENTA "\x1B" "[1;35m"
529 #define ANSI_CYAN "\x1B" "[1;36m"
530 #define ANSI_GREY "\x1B" "[1;90m"
531
532 #define COLOR(x) printk(_CONCAT(ANSI_, x))
533 #else
534 #define COLOR(x) do { } while (false)
535 #endif
536
537 __pinned_func
get_entry_code(pentry_t value)538 static char get_entry_code(pentry_t value)
539 {
540 char ret;
541
542 if (value == 0U) {
543 /* Unmapped entry */
544 ret = '.';
545 } else {
546 if ((value & MMU_RW) != 0U) {
547 /* Writable page */
548 if ((value & MMU_XD) != 0U) {
549 /* RW */
550 ret = 'w';
551 } else {
552 /* RWX */
553 ret = 'a';
554 }
555 } else {
556 if ((value & MMU_XD) != 0U) {
557 /* R */
558 ret = 'r';
559 } else {
560 /* RX */
561 ret = 'x';
562 }
563 }
564
565 if ((value & MMU_US) != 0U) {
566 /* Uppercase indicates user mode access */
567 ret = toupper((unsigned char)ret);
568 }
569 }
570
571 return ret;
572 }
573
574 __pinned_func
print_entries(pentry_t entries_array[],uint8_t * base,int level,size_t count)575 static void print_entries(pentry_t entries_array[], uint8_t *base, int level,
576 size_t count)
577 {
578 int column = 0;
579
580 for (int i = 0; i < count; i++) {
581 pentry_t entry = entries_array[i];
582
583 uintptr_t phys = get_entry_phys(entry, level);
584 uintptr_t virt =
585 (uintptr_t)base + (get_entry_scope(level) * i);
586
587 if ((entry & MMU_P) != 0U) {
588 if (is_leaf(level, entry)) {
589 if (phys == virt) {
590 /* Identity mappings */
591 COLOR(YELLOW);
592 } else if (phys + Z_MEM_VM_OFFSET == virt) {
593 /* Permanent RAM mappings */
594 COLOR(GREEN);
595 } else {
596 /* General mapped pages */
597 COLOR(CYAN);
598 }
599 } else {
600 /* Intermediate entry */
601 COLOR(MAGENTA);
602 }
603 } else {
604 if (is_leaf(level, entry)) {
605 if (entry == 0U) {
606 /* Unmapped */
607 COLOR(GREY);
608 #ifdef CONFIG_X86_KPTI
609 } else if (is_flipped_pte(entry)) {
610 /* KPTI, un-flip it */
611 COLOR(BLUE);
612 entry = ~entry;
613 phys = get_entry_phys(entry, level);
614 if (phys == virt) {
615 /* Identity mapped */
616 COLOR(CYAN);
617 } else {
618 /* Non-identity mapped */
619 COLOR(BLUE);
620 }
621 #endif
622 } else {
623 /* Paged out */
624 COLOR(RED);
625 }
626 } else {
627 /* Un-mapped intermediate entry */
628 COLOR(GREY);
629 }
630 }
631
632 printk("%c", get_entry_code(entry));
633
634 column++;
635 if (column == 64) {
636 column = 0;
637 printk("\n");
638 }
639 }
640 COLOR(DEFAULT);
641
642 if (column != 0) {
643 printk("\n");
644 }
645 }
646
647 __pinned_func
dump_ptables(pentry_t * table,uint8_t * base,int level)648 static void dump_ptables(pentry_t *table, uint8_t *base, int level)
649 {
650 const struct paging_level *info = &paging_levels[level];
651
652 #ifdef CONFIG_X86_64
653 /* Account for the virtual memory "hole" with sign-extension */
654 if (((uintptr_t)base & BITL(47)) != 0) {
655 base = (uint8_t *)((uintptr_t)base | (0xFFFFULL << 48));
656 }
657 #endif
658
659 printk("%s at %p (0x%" PRIxPTR "): ", info->name, table,
660 z_mem_phys_addr(table));
661 if (level == 0) {
662 printk("entire address space\n");
663 } else {
664 printk("for %p - %p\n", base,
665 base + get_table_scope(level) - 1);
666 }
667
668 print_entries(table, base, level, info->entries);
669
670 /* Check if we're a page table */
671 if (level == PTE_LEVEL) {
672 return;
673 }
674
675 /* Dump all linked child tables */
676 for (int j = 0; j < info->entries; j++) {
677 pentry_t entry = table[j];
678 pentry_t *next;
679
680 if ((entry & MMU_P) == 0U ||
681 (entry & MMU_PS) != 0U) {
682 /* Not present or big page, skip */
683 continue;
684 }
685
686 next = next_table(entry, level);
687 dump_ptables(next, base + (j * get_entry_scope(level)),
688 level + 1);
689 }
690 }
691
692 __pinned_func
z_x86_dump_page_tables(pentry_t * ptables)693 void z_x86_dump_page_tables(pentry_t *ptables)
694 {
695 dump_ptables(ptables, NULL, 0);
696 }
697
698 /* Enable to dump out the kernel's page table right before main() starts,
699 * sometimes useful for deep debugging. May overwhelm twister.
700 */
701 #define DUMP_PAGE_TABLES 0
702
703 #if DUMP_PAGE_TABLES
704 __pinned_func
dump_kernel_tables(void)705 static int dump_kernel_tables(void)
706 {
707 z_x86_dump_page_tables(z_x86_kernel_ptables);
708
709 return 0;
710 }
711
712 SYS_INIT(dump_kernel_tables, APPLICATION, CONFIG_KERNEL_INIT_PRIORITY_DEFAULT);
713 #endif
714
715 __pinned_func
str_append(char ** buf,size_t * size,const char * str)716 static void str_append(char **buf, size_t *size, const char *str)
717 {
718 int ret = snprintk(*buf, *size, "%s", str);
719
720 if (ret >= *size) {
721 /* Truncated */
722 *size = 0U;
723 } else {
724 *size -= ret;
725 *buf += ret;
726 }
727
728 }
729
730 __pinned_func
dump_entry(int level,void * virt,pentry_t entry)731 static void dump_entry(int level, void *virt, pentry_t entry)
732 {
733 const struct paging_level *info = &paging_levels[level];
734 char buf[24] = { 0 };
735 char *pos = buf;
736 size_t sz = sizeof(buf);
737 uint8_t *virtmap = (uint8_t *)ROUND_DOWN(virt, get_entry_scope(level));
738
739 #define DUMP_BIT(bit) do { \
740 if ((entry & MMU_##bit) != 0U) { \
741 str_append(&pos, &sz, #bit " "); \
742 } \
743 } while (false)
744
745 DUMP_BIT(RW);
746 DUMP_BIT(US);
747 DUMP_BIT(PWT);
748 DUMP_BIT(PCD);
749 DUMP_BIT(A);
750 DUMP_BIT(D);
751 DUMP_BIT(G);
752 DUMP_BIT(XD);
753
754 LOG_ERR("%sE: %p -> " PRI_ENTRY ": %s", info->name,
755 virtmap, entry & info->mask, buf);
756
757 #undef DUMP_BIT
758 }
759
760 __pinned_func
z_x86_pentry_get(int * paging_level,pentry_t * val,pentry_t * ptables,void * virt)761 void z_x86_pentry_get(int *paging_level, pentry_t *val, pentry_t *ptables,
762 void *virt)
763 {
764 pentry_get(paging_level, val, ptables, virt);
765 }
766
767 /*
768 * Debug function for dumping out MMU table information to the LOG for a
769 * specific virtual address, such as when we get an unexpected page fault.
770 */
771 __pinned_func
z_x86_dump_mmu_flags(pentry_t * ptables,void * virt)772 void z_x86_dump_mmu_flags(pentry_t *ptables, void *virt)
773 {
774 pentry_t entry = 0;
775 int level = 0;
776
777 pentry_get(&level, &entry, ptables, virt);
778
779 if ((entry & MMU_P) == 0) {
780 LOG_ERR("%sE: not present", paging_levels[level].name);
781 } else {
782 dump_entry(level, virt, entry);
783 }
784 }
785 #endif /* CONFIG_EXCEPTION_DEBUG */
786
787 /* Reset permissions on a PTE to original state when the mapping was made */
788 __pinned_func
reset_pte(pentry_t old_val)789 static inline pentry_t reset_pte(pentry_t old_val)
790 {
791 pentry_t new_val;
792
793 /* Clear any existing state in permission bits */
794 new_val = old_val & (~K_MEM_PARTITION_PERM_MASK);
795
796 /* Now set permissions based on the stashed original values */
797 if ((old_val & MMU_RW_ORIG) != 0) {
798 new_val |= MMU_RW;
799 }
800 if ((old_val & MMU_US_ORIG) != 0) {
801 new_val |= MMU_US;
802 }
803 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
804 if ((old_val & MMU_XD_ORIG) != 0) {
805 new_val |= MMU_XD;
806 }
807 #endif
808 return new_val;
809 }
810
811 /* Wrapper functions for some gross stuff we have to do for Kernel
812 * page table isolation. If these are User mode page tables, the user bit
813 * isn't set, and this is not the shared page, all the bits in the PTE
814 * are flipped. This serves three purposes:
815 * - The page isn't present, implementing page table isolation
816 * - Flipping the physical address bits cheaply mitigates L1TF
817 * - State is preserved; to get original PTE, just complement again
818 */
819 __pinned_func
pte_finalize_value(pentry_t val,bool user_table,int level)820 static inline pentry_t pte_finalize_value(pentry_t val, bool user_table,
821 int level)
822 {
823 #ifdef CONFIG_X86_KPTI
824 static const uintptr_t shared_phys_addr =
825 Z_MEM_PHYS_ADDR(POINTER_TO_UINT(&z_shared_kernel_page_start));
826
827 if (user_table && (val & MMU_US) == 0 && (val & MMU_P) != 0 &&
828 get_entry_phys(val, level) != shared_phys_addr) {
829 val = ~val;
830 }
831 #endif
832 return val;
833 }
834
835 /* Atomic functions for modifying PTEs. These don't map nicely to Zephyr's
836 * atomic API since the only types supported are 'int' and 'void *' and
837 * the size of pentry_t depends on other factors like PAE.
838 */
839 #ifndef CONFIG_X86_PAE
840 /* Non-PAE, pentry_t is same size as void ptr so use atomic_ptr_* APIs */
841 __pinned_func
atomic_pte_get(const pentry_t * target)842 static inline pentry_t atomic_pte_get(const pentry_t *target)
843 {
844 return (pentry_t)atomic_ptr_get((atomic_ptr_t *)target);
845 }
846
847 __pinned_func
atomic_pte_cas(pentry_t * target,pentry_t old_value,pentry_t new_value)848 static inline bool atomic_pte_cas(pentry_t *target, pentry_t old_value,
849 pentry_t new_value)
850 {
851 return atomic_ptr_cas((atomic_ptr_t *)target, (void *)old_value,
852 (void *)new_value);
853 }
854 #else
855 /* Atomic builtins for 64-bit values on 32-bit x86 require floating point.
856 * Don't do this, just lock local interrupts. Needless to say, this
857 * isn't workable if someone ever adds SMP to the 32-bit x86 port.
858 */
859 BUILD_ASSERT(!IS_ENABLED(CONFIG_SMP));
860
861 __pinned_func
atomic_pte_get(const pentry_t * target)862 static inline pentry_t atomic_pte_get(const pentry_t *target)
863 {
864 return *target;
865 }
866
867 __pinned_func
atomic_pte_cas(pentry_t * target,pentry_t old_value,pentry_t new_value)868 static inline bool atomic_pte_cas(pentry_t *target, pentry_t old_value,
869 pentry_t new_value)
870 {
871 bool ret = false;
872 int key = arch_irq_lock();
873
874 if (*target == old_value) {
875 *target = new_value;
876 ret = true;
877 }
878 arch_irq_unlock(key);
879
880 return ret;
881 }
882 #endif /* CONFIG_X86_PAE */
883
884 /* Indicates that the target page tables will be used by user mode threads.
885 * This only has implications for CONFIG_X86_KPTI where user thread facing
886 * page tables need nearly all pages that don't have the US bit to also
887 * not be Present.
888 */
889 #define OPTION_USER BIT(0)
890
891 /* Indicates that the operation requires TLBs to be flushed as we are altering
892 * existing mappings. Not needed for establishing new mappings
893 */
894 #define OPTION_FLUSH BIT(1)
895
896 /* Indicates that each PTE's permission bits should be restored to their
897 * original state when the memory was mapped. All other bits in the PTE are
898 * preserved.
899 */
900 #define OPTION_RESET BIT(2)
901
902 /* Indicates that the mapping will need to be cleared entirely. This is
903 * mainly used for unmapping the memory region.
904 */
905 #define OPTION_CLEAR BIT(3)
906
907 /**
908 * Atomically update bits in a page table entry
909 *
910 * This is atomic with respect to modifications by other CPUs or preempted
911 * contexts, which can be very important when making decisions based on
912 * the PTE's prior "dirty" state.
913 *
914 * @param pte Pointer to page table entry to update
915 * @param update_val Updated bits to set/clear in PTE. Ignored with
916 * OPTION_RESET or OPTION_CLEAR.
917 * @param update_mask Which bits to modify in the PTE. Ignored with
918 * OPTION_RESET or OPTION_CLEAR.
919 * @param options Control flags
920 * @retval Old PTE value
921 */
922 __pinned_func
pte_atomic_update(pentry_t * pte,pentry_t update_val,pentry_t update_mask,uint32_t options)923 static inline pentry_t pte_atomic_update(pentry_t *pte, pentry_t update_val,
924 pentry_t update_mask,
925 uint32_t options)
926 {
927 bool user_table = (options & OPTION_USER) != 0U;
928 bool reset = (options & OPTION_RESET) != 0U;
929 bool clear = (options & OPTION_CLEAR) != 0U;
930 pentry_t old_val, new_val;
931
932 do {
933 old_val = atomic_pte_get(pte);
934
935 new_val = old_val;
936 #ifdef CONFIG_X86_KPTI
937 if (is_flipped_pte(new_val)) {
938 /* Page was flipped for KPTI. Un-flip it */
939 new_val = ~new_val;
940 }
941 #endif /* CONFIG_X86_KPTI */
942
943 if (reset) {
944 new_val = reset_pte(new_val);
945 } else if (clear) {
946 new_val = 0;
947 } else {
948 new_val = ((new_val & ~update_mask) |
949 (update_val & update_mask));
950 }
951
952 new_val = pte_finalize_value(new_val, user_table, PTE_LEVEL);
953 } while (atomic_pte_cas(pte, old_val, new_val) == false);
954
955 #ifdef CONFIG_X86_KPTI
956 if (is_flipped_pte(old_val)) {
957 /* Page was flipped for KPTI. Un-flip it */
958 old_val = ~old_val;
959 }
960 #endif /* CONFIG_X86_KPTI */
961
962 return old_val;
963 }
964
965 /**
966 * Low level page table update function for a virtual page
967 *
968 * For the provided set of page tables, update the PTE associated with the
969 * virtual address to a new value, using the mask to control what bits
970 * need to be preserved.
971 *
972 * It is permitted to set up mappings without the Present bit set, in which
973 * case all other bits may be used for OS accounting.
974 *
975 * This function is atomic with respect to the page table entries being
976 * modified by another CPU, using atomic operations to update the requested
977 * bits and return the previous PTE value.
978 *
979 * Common mask values:
980 * MASK_ALL - Update all PTE bits. Existing state totally discarded.
981 * MASK_PERM - Only update permission bits. All other bits and physical
982 * mapping preserved.
983 *
984 * @param ptables Page tables to modify
985 * @param virt Virtual page table entry to update
986 * @param entry_val Value to update in the PTE (ignored if OPTION_RESET or
987 * OPTION_CLEAR)
988 * @param [out] old_val_ptr Filled in with previous PTE value. May be NULL.
989 * @param mask What bits to update in the PTE (ignored if OPTION_RESET or
990 * OPTION_CLEAR)
991 * @param options Control options, described above
992 *
993 * @retval 0 if successful
994 * @retval -EFAULT if large page encountered or missing page table level
995 */
996 __pinned_func
page_map_set(pentry_t * ptables,void * virt,pentry_t entry_val,pentry_t * old_val_ptr,pentry_t mask,uint32_t options)997 static int page_map_set(pentry_t *ptables, void *virt, pentry_t entry_val,
998 pentry_t *old_val_ptr, pentry_t mask, uint32_t options)
999 {
1000 pentry_t *table = ptables;
1001 bool flush = (options & OPTION_FLUSH) != 0U;
1002 int ret = 0;
1003
1004 for (int level = 0; level < NUM_LEVELS; level++) {
1005 int index;
1006 pentry_t *entryp;
1007
1008 index = get_index(virt, level);
1009 entryp = &table[index];
1010
1011 /* Check if we're a PTE */
1012 if (level == PTE_LEVEL) {
1013 pentry_t old_val = pte_atomic_update(entryp, entry_val,
1014 mask, options);
1015 if (old_val_ptr != NULL) {
1016 *old_val_ptr = old_val;
1017 }
1018 break;
1019 }
1020
1021 /* We bail out early here due to no support for
1022 * splitting existing bigpage mappings.
1023 * If the PS bit is not supported at some level (like
1024 * in a PML4 entry) it is always reserved and must be 0
1025 */
1026 CHECKIF(!((*entryp & MMU_PS) == 0U)) {
1027 /* Cannot continue since we cannot split
1028 * bigpage mappings.
1029 */
1030 LOG_ERR("large page encountered");
1031 ret = -EFAULT;
1032 goto out;
1033 }
1034
1035 table = next_table(*entryp, level);
1036
1037 CHECKIF(!(table != NULL)) {
1038 /* Cannot continue since table is NULL,
1039 * and it cannot be dereferenced in next loop
1040 * iteration.
1041 */
1042 LOG_ERR("missing page table level %d when trying to map %p",
1043 level + 1, virt);
1044 ret = -EFAULT;
1045 goto out;
1046 }
1047 }
1048
1049 out:
1050 if (flush) {
1051 tlb_flush_page(virt);
1052 }
1053
1054 return ret;
1055 }
1056
1057 /**
1058 * Map a physical region in a specific set of page tables.
1059 *
1060 * See documentation for page_map_set() for additional notes about masks and
1061 * supported options.
1062 *
1063 * It is vital to remember that all virtual-to-physical mappings must be
1064 * the same with respect to supervisor mode regardless of what thread is
1065 * scheduled (and therefore, if multiple sets of page tables exist, which one
1066 * is active).
1067 *
1068 * It is permitted to set up mappings without the Present bit set.
1069 *
1070 * @param ptables Page tables to modify
1071 * @param virt Base page-aligned virtual memory address to map the region.
1072 * @param phys Base page-aligned physical memory address for the region.
1073 * Ignored if OPTION_RESET or OPTION_CLEAR. Also affected by the mask
1074 * parameter. This address is not directly examined, it will simply be
1075 * programmed into the PTE.
1076 * @param size Size of the physical region to map
1077 * @param entry_flags Non-address bits to set in every PTE. Ignored if
1078 * OPTION_RESET. Also affected by the mask parameter.
1079 * @param mask What bits to update in each PTE. Un-set bits will never be
1080 * modified. Ignored if OPTION_RESET or OPTION_CLEAR.
1081 * @param options Control options, described above
1082 *
1083 * @retval 0 if successful
1084 * @retval -EINVAL if invalid parameters are supplied
1085 * @retval -EFAULT if errors encountered when updating page tables
1086 */
1087 __pinned_func
range_map_ptables(pentry_t * ptables,void * virt,uintptr_t phys,size_t size,pentry_t entry_flags,pentry_t mask,uint32_t options)1088 static int range_map_ptables(pentry_t *ptables, void *virt, uintptr_t phys,
1089 size_t size, pentry_t entry_flags, pentry_t mask,
1090 uint32_t options)
1091 {
1092 bool zero_entry = (options & (OPTION_RESET | OPTION_CLEAR)) != 0U;
1093 int ret = 0, ret2;
1094
1095 CHECKIF(!is_addr_aligned(phys) || !is_size_aligned(size)) {
1096 ret = -EINVAL;
1097 goto out;
1098 }
1099
1100 CHECKIF(!((entry_flags & paging_levels[0].mask) == 0U)) {
1101 LOG_ERR("entry_flags " PRI_ENTRY " overlaps address area",
1102 entry_flags);
1103 ret = -EINVAL;
1104 goto out;
1105 }
1106
1107 /* This implementation is stack-efficient but not particularly fast.
1108 * We do a full page table walk for every page we are updating.
1109 * Recursive approaches are possible, but use much more stack space.
1110 */
1111 for (size_t offset = 0; offset < size; offset += CONFIG_MMU_PAGE_SIZE) {
1112 uint8_t *dest_virt = (uint8_t *)virt + offset;
1113 pentry_t entry_val;
1114
1115 if (zero_entry) {
1116 entry_val = 0;
1117 } else {
1118 entry_val = (pentry_t)(phys + offset) | entry_flags;
1119 }
1120
1121 ret2 = page_map_set(ptables, dest_virt, entry_val, NULL, mask,
1122 options);
1123 ARG_UNUSED(ret2);
1124 CHECKIF(ret2 != 0) {
1125 ret = ret2;
1126 }
1127 }
1128
1129 out:
1130 return ret;
1131 }
1132
1133 /**
1134 * Establish or update a memory mapping for all page tables
1135 *
1136 * The physical region noted from phys to phys + size will be mapped to
1137 * an equal sized virtual region starting at virt, with the provided flags.
1138 * The mask value denotes what bits in PTEs will actually be modified.
1139 *
1140 * See range_map_ptables() for additional details.
1141 *
1142 * @param virt Page-aligned starting virtual address
1143 * @param phys Page-aligned starting physical address. Ignored if the mask
1144 * parameter does not enable address bits or OPTION_RESET used.
1145 * This region is not directly examined, it will simply be
1146 * programmed into the page tables.
1147 * @param size Size of the physical region to map
1148 * @param entry_flags Desired state of non-address PTE bits covered by mask,
1149 * ignored if OPTION_RESET
1150 * @param mask What bits in the PTE to actually modify; unset bits will
1151 * be preserved. Ignored if OPTION_RESET.
1152 * @param options Control options. Do not set OPTION_USER here. OPTION_FLUSH
1153 * will trigger a TLB shootdown after all tables are updated.
1154 *
1155 * @retval 0 if successful
1156 * @retval -EINVAL if invalid parameters are supplied
1157 * @retval -EFAULT if errors encountered when updating page tables
1158 */
1159 __pinned_func
range_map(void * virt,uintptr_t phys,size_t size,pentry_t entry_flags,pentry_t mask,uint32_t options)1160 static int range_map(void *virt, uintptr_t phys, size_t size,
1161 pentry_t entry_flags, pentry_t mask, uint32_t options)
1162 {
1163 int ret = 0, ret2;
1164
1165 LOG_DBG("%s: %p -> %p (%zu) flags " PRI_ENTRY " mask "
1166 PRI_ENTRY " opt 0x%x", __func__, (void *)phys, virt, size,
1167 entry_flags, mask, options);
1168
1169 #ifdef CONFIG_X86_64
1170 /* There's a gap in the "64-bit" address space, as 4-level paging
1171 * requires bits 48 to 63 to be copies of bit 47. Test this
1172 * by treating as a signed value and shifting.
1173 */
1174 __ASSERT(((((intptr_t)virt) << 16) >> 16) == (intptr_t)virt,
1175 "non-canonical virtual address mapping %p (size %zu)",
1176 virt, size);
1177 #endif /* CONFIG_X86_64 */
1178
1179 CHECKIF(!((options & OPTION_USER) == 0U)) {
1180 LOG_ERR("invalid option for mapping");
1181 ret = -EINVAL;
1182 goto out;
1183 }
1184
1185 /* All virtual-to-physical mappings are the same in all page tables.
1186 * What can differ is only access permissions, defined by the memory
1187 * domain associated with the page tables, and the threads that are
1188 * members of that domain.
1189 *
1190 * Any new mappings need to be applied to all page tables.
1191 */
1192 #if defined(CONFIG_USERSPACE) && !defined(CONFIG_X86_COMMON_PAGE_TABLE)
1193 sys_snode_t *node;
1194
1195 SYS_SLIST_FOR_EACH_NODE(&x86_domain_list, node) {
1196 struct arch_mem_domain *domain =
1197 CONTAINER_OF(node, struct arch_mem_domain, node);
1198
1199 ret2 = range_map_ptables(domain->ptables, virt, phys, size,
1200 entry_flags, mask,
1201 options | OPTION_USER);
1202 ARG_UNUSED(ret2);
1203 CHECKIF(ret2 != 0) {
1204 ret = ret2;
1205 }
1206 }
1207 #endif /* CONFIG_USERSPACE */
1208
1209 ret2 = range_map_ptables(z_x86_kernel_ptables, virt, phys, size,
1210 entry_flags, mask, options);
1211 ARG_UNUSED(ret2);
1212 CHECKIF(ret2 != 0) {
1213 ret = ret2;
1214 }
1215
1216 out:
1217 #ifdef CONFIG_SMP
1218 if ((options & OPTION_FLUSH) != 0U) {
1219 tlb_shootdown();
1220 }
1221 #endif /* CONFIG_SMP */
1222
1223 return ret;
1224 }
1225
1226 __pinned_func
range_map_unlocked(void * virt,uintptr_t phys,size_t size,pentry_t entry_flags,pentry_t mask,uint32_t options)1227 static inline int range_map_unlocked(void *virt, uintptr_t phys, size_t size,
1228 pentry_t entry_flags, pentry_t mask,
1229 uint32_t options)
1230 {
1231 k_spinlock_key_t key;
1232 int ret;
1233
1234 key = k_spin_lock(&x86_mmu_lock);
1235 ret = range_map(virt, phys, size, entry_flags, mask, options);
1236 k_spin_unlock(&x86_mmu_lock, key);
1237
1238 return ret;
1239 }
1240
1241 __pinned_func
flags_to_entry(uint32_t flags)1242 static pentry_t flags_to_entry(uint32_t flags)
1243 {
1244 pentry_t entry_flags = MMU_P;
1245
1246 /* Translate flags argument into HW-recognized entry flags.
1247 *
1248 * Support for PAT is not implemented yet. Many systems may have
1249 * BIOS-populated MTRR values such that these cache settings are
1250 * redundant.
1251 */
1252 switch (flags & K_MEM_CACHE_MASK) {
1253 case K_MEM_CACHE_NONE:
1254 entry_flags |= MMU_PCD;
1255 break;
1256 case K_MEM_CACHE_WT:
1257 entry_flags |= MMU_PWT;
1258 break;
1259 case K_MEM_CACHE_WB:
1260 break;
1261 default:
1262 __ASSERT(false, "bad memory mapping flags 0x%x", flags);
1263 }
1264
1265 if ((flags & K_MEM_PERM_RW) != 0U) {
1266 entry_flags |= ENTRY_RW;
1267 }
1268
1269 if ((flags & K_MEM_PERM_USER) != 0U) {
1270 entry_flags |= ENTRY_US;
1271 }
1272
1273 if ((flags & K_MEM_PERM_EXEC) == 0U) {
1274 entry_flags |= ENTRY_XD;
1275 }
1276
1277 return entry_flags;
1278 }
1279
1280 /* map new region virt..virt+size to phys with provided arch-neutral flags */
1281 __pinned_func
arch_mem_map(void * virt,uintptr_t phys,size_t size,uint32_t flags)1282 void arch_mem_map(void *virt, uintptr_t phys, size_t size, uint32_t flags)
1283 {
1284 int ret;
1285
1286 ret = range_map_unlocked(virt, phys, size, flags_to_entry(flags),
1287 MASK_ALL, 0);
1288 __ASSERT_NO_MSG(ret == 0);
1289 ARG_UNUSED(ret);
1290 }
1291
1292 /* unmap region addr..addr+size, reset entries and flush TLB */
arch_mem_unmap(void * addr,size_t size)1293 void arch_mem_unmap(void *addr, size_t size)
1294 {
1295 int ret;
1296
1297 ret = range_map_unlocked((void *)addr, 0, size, 0, 0,
1298 OPTION_FLUSH | OPTION_CLEAR);
1299 __ASSERT_NO_MSG(ret == 0);
1300 ARG_UNUSED(ret);
1301 }
1302
1303 #ifdef Z_VM_KERNEL
1304 __boot_func
identity_map_remove(uint32_t level)1305 static void identity_map_remove(uint32_t level)
1306 {
1307 size_t size, scope = get_entry_scope(level);
1308 pentry_t *table;
1309 uint32_t cur_level;
1310 uint8_t *pos;
1311 pentry_t entry;
1312 pentry_t *entry_ptr;
1313
1314 k_mem_region_align((uintptr_t *)&pos, &size,
1315 (uintptr_t)CONFIG_SRAM_BASE_ADDRESS,
1316 (size_t)CONFIG_SRAM_SIZE * 1024U, scope);
1317
1318 while (size != 0U) {
1319 /* Need to get to the correct table */
1320 table = z_x86_kernel_ptables;
1321 for (cur_level = 0; cur_level < level; cur_level++) {
1322 entry = get_entry(table, pos, cur_level);
1323 table = next_table(entry, level);
1324 }
1325
1326 entry_ptr = get_entry_ptr(table, pos, level);
1327
1328 /* set_pte */
1329 *entry_ptr = 0;
1330 pos += scope;
1331 size -= scope;
1332 }
1333 }
1334 #endif
1335
1336 /* Invoked to remove the identity mappings in the page tables,
1337 * they were only needed to transition the instruction pointer at early boot
1338 */
1339 __boot_func
z_x86_mmu_init(void)1340 void z_x86_mmu_init(void)
1341 {
1342 #ifdef Z_VM_KERNEL
1343 /* We booted with physical address space being identity mapped.
1344 * As we are now executing in virtual address space,
1345 * the identity map is no longer needed. So remove them.
1346 *
1347 * Without PAE, only need to remove the entries at the PD level.
1348 * With PAE, need to also remove the entry at PDP level.
1349 */
1350 identity_map_remove(PDE_LEVEL);
1351
1352 #ifdef CONFIG_X86_PAE
1353 identity_map_remove(0);
1354 #endif
1355 #endif
1356 }
1357
1358 #if CONFIG_X86_STACK_PROTECTION
1359 __pinned_func
z_x86_set_stack_guard(k_thread_stack_t * stack)1360 void z_x86_set_stack_guard(k_thread_stack_t *stack)
1361 {
1362 int ret;
1363
1364 /* Applied to all page tables as this affects supervisor mode.
1365 * XXX: This never gets reset when the thread exits, which can
1366 * cause problems if the memory is later used for something else.
1367 * See #29499
1368 *
1369 * Guard page is always the first page of the stack object for both
1370 * kernel and thread stacks.
1371 */
1372 ret = range_map_unlocked(stack, 0, CONFIG_MMU_PAGE_SIZE,
1373 MMU_P | ENTRY_XD, MASK_PERM, OPTION_FLUSH);
1374 __ASSERT_NO_MSG(ret == 0);
1375 ARG_UNUSED(ret);
1376 }
1377 #endif /* CONFIG_X86_STACK_PROTECTION */
1378
1379 #ifdef CONFIG_USERSPACE
1380 __pinned_func
page_validate(pentry_t * ptables,uint8_t * addr,bool write)1381 static bool page_validate(pentry_t *ptables, uint8_t *addr, bool write)
1382 {
1383 pentry_t *table = (pentry_t *)ptables;
1384
1385 for (int level = 0; level < NUM_LEVELS; level++) {
1386 pentry_t entry = get_entry(table, addr, level);
1387
1388 if (is_leaf(level, entry)) {
1389 #ifdef CONFIG_X86_KPTI
1390 if (is_flipped_pte(entry)) {
1391 /* We flipped this to prevent user access
1392 * since just clearing US isn't sufficient
1393 */
1394 return false;
1395 }
1396 #endif
1397 /* US and RW bits still carry meaning if non-present.
1398 * If the data page is paged out, access bits are
1399 * preserved. If un-mapped, the whole entry is 0.
1400 */
1401 if (((entry & MMU_US) == 0U) ||
1402 (write && ((entry & MMU_RW) == 0U))) {
1403 return false;
1404 }
1405 } else {
1406 if ((entry & MMU_P) == 0U) {
1407 /* Missing intermediate table, address is
1408 * un-mapped
1409 */
1410 return false;
1411 }
1412 table = next_table(entry, level);
1413 }
1414 }
1415
1416 return true;
1417 }
1418
1419 __pinned_func
bcb_fence(void)1420 static inline void bcb_fence(void)
1421 {
1422 #ifdef CONFIG_X86_BOUNDS_CHECK_BYPASS_MITIGATION
1423 __asm__ volatile ("lfence" : : : "memory");
1424 #endif
1425 }
1426
1427 __pinned_func
arch_buffer_validate(void * addr,size_t size,int write)1428 int arch_buffer_validate(void *addr, size_t size, int write)
1429 {
1430 pentry_t *ptables = z_x86_thread_page_tables_get(_current);
1431 uint8_t *virt;
1432 size_t aligned_size;
1433 int ret = 0;
1434
1435 /* addr/size arbitrary, fix this up into an aligned region */
1436 k_mem_region_align((uintptr_t *)&virt, &aligned_size,
1437 (uintptr_t)addr, size, CONFIG_MMU_PAGE_SIZE);
1438
1439 for (size_t offset = 0; offset < aligned_size;
1440 offset += CONFIG_MMU_PAGE_SIZE) {
1441 if (!page_validate(ptables, virt + offset, write)) {
1442 ret = -1;
1443 break;
1444 }
1445 }
1446
1447 bcb_fence();
1448
1449 return ret;
1450 }
1451 #ifdef CONFIG_X86_COMMON_PAGE_TABLE
1452 /* Very low memory configuration. A single set of page tables is used for
1453 * all threads. This relies on some assumptions:
1454 *
1455 * - No KPTI. If that were supported, we would need both a kernel and user
1456 * set of page tables.
1457 * - No SMP. If that were supported, we would need per-core page tables.
1458 * - Memory domains don't affect supervisor mode.
1459 * - All threads have the same virtual-to-physical mappings.
1460 * - Memory domain APIs can't be called by user mode.
1461 *
1462 * Because there is no SMP, only one set of page tables, and user threads can't
1463 * modify their own memory domains, we don't have to do much when
1464 * arch_mem_domain_* APIs are called. We do use a caching scheme to avoid
1465 * updating page tables if the last user thread scheduled was in the same
1466 * domain.
1467 *
1468 * We don't set CONFIG_ARCH_MEM_DOMAIN_DATA, since we aren't setting
1469 * up any arch-specific memory domain data (per domain page tables.)
1470 *
1471 * This is all nice and simple and saves a lot of memory. The cost is that
1472 * context switching is not trivial CR3 update. We have to reset all partitions
1473 * for the current domain configuration and then apply all the partitions for
1474 * the incoming thread's domain if they are not the same. We also need to
1475 * update permissions similarly on the thread stack region.
1476 */
1477
1478 __pinned_func
reset_region(uintptr_t start,size_t size)1479 static inline int reset_region(uintptr_t start, size_t size)
1480 {
1481 return range_map_unlocked((void *)start, 0, size, 0, 0,
1482 OPTION_FLUSH | OPTION_RESET);
1483 }
1484
1485 __pinned_func
apply_region(uintptr_t start,size_t size,pentry_t attr)1486 static inline int apply_region(uintptr_t start, size_t size, pentry_t attr)
1487 {
1488 return range_map_unlocked((void *)start, 0, size, attr, MASK_PERM,
1489 OPTION_FLUSH);
1490 }
1491
1492 /* Cache of the current memory domain applied to the common page tables and
1493 * the stack buffer region that had User access granted.
1494 */
1495 static __pinned_bss struct k_mem_domain *current_domain;
1496 static __pinned_bss uintptr_t current_stack_start;
1497 static __pinned_bss size_t current_stack_size;
1498
1499 __pinned_func
z_x86_swap_update_common_page_table(struct k_thread * incoming)1500 void z_x86_swap_update_common_page_table(struct k_thread *incoming)
1501 {
1502 k_spinlock_key_t key;
1503
1504 if ((incoming->base.user_options & K_USER) == 0) {
1505 /* Incoming thread is not a user thread. Memory domains don't
1506 * affect supervisor threads and we don't need to enable User
1507 * bits for its stack buffer; do nothing.
1508 */
1509 return;
1510 }
1511
1512 /* Step 1: Make sure the thread stack is set up correctly for the
1513 * for the incoming thread
1514 */
1515 if (incoming->stack_info.start != current_stack_start ||
1516 incoming->stack_info.size != current_stack_size) {
1517 if (current_stack_size != 0U) {
1518 reset_region(current_stack_start, current_stack_size);
1519 }
1520
1521 /* The incoming thread's stack region needs User permissions */
1522 apply_region(incoming->stack_info.start,
1523 incoming->stack_info.size,
1524 K_MEM_PARTITION_P_RW_U_RW);
1525
1526 /* Update cache */
1527 current_stack_start = incoming->stack_info.start;
1528 current_stack_size = incoming->stack_info.size;
1529 }
1530
1531 /* Step 2: The page tables always have some memory domain applied to
1532 * them. If the incoming thread's memory domain is different,
1533 * update the page tables
1534 */
1535 key = k_spin_lock(&z_mem_domain_lock);
1536 if (incoming->mem_domain_info.mem_domain == current_domain) {
1537 /* The incoming thread's domain is already applied */
1538 goto out_unlock;
1539 }
1540
1541 /* Reset the current memory domain regions... */
1542 if (current_domain != NULL) {
1543 for (int i = 0; i < CONFIG_MAX_DOMAIN_PARTITIONS; i++) {
1544 struct k_mem_partition *ptn =
1545 ¤t_domain->partitions[i];
1546
1547 if (ptn->size == 0) {
1548 continue;
1549 }
1550 reset_region(ptn->start, ptn->size);
1551 }
1552 }
1553
1554 /* ...and apply all the incoming domain's regions */
1555 for (int i = 0; i < CONFIG_MAX_DOMAIN_PARTITIONS; i++) {
1556 struct k_mem_partition *ptn =
1557 &incoming->mem_domain_info.mem_domain->partitions[i];
1558
1559 if (ptn->size == 0) {
1560 continue;
1561 }
1562 apply_region(ptn->start, ptn->size, ptn->attr);
1563 }
1564 current_domain = incoming->mem_domain_info.mem_domain;
1565 out_unlock:
1566 k_spin_unlock(&z_mem_domain_lock, key);
1567 }
1568
1569 /* If a partition was added or removed in the cached domain, update the
1570 * page tables.
1571 */
1572 __pinned_func
arch_mem_domain_partition_remove(struct k_mem_domain * domain,uint32_t partition_id)1573 int arch_mem_domain_partition_remove(struct k_mem_domain *domain,
1574 uint32_t partition_id)
1575 {
1576 struct k_mem_partition *ptn;
1577
1578 if (domain != current_domain) {
1579 return 0;
1580 }
1581
1582 ptn = &domain->partitions[partition_id];
1583
1584 return reset_region(ptn->start, ptn->size);
1585 }
1586
1587 __pinned_func
arch_mem_domain_partition_add(struct k_mem_domain * domain,uint32_t partition_id)1588 int arch_mem_domain_partition_add(struct k_mem_domain *domain,
1589 uint32_t partition_id)
1590 {
1591 struct k_mem_partition *ptn;
1592
1593 if (domain != current_domain) {
1594 return 0;
1595 }
1596
1597 ptn = &domain->partitions[partition_id];
1598
1599 return apply_region(ptn->start, ptn->size, ptn->attr);
1600 }
1601
1602 /* Rest of the APIs don't need to do anything */
1603 __pinned_func
arch_mem_domain_thread_add(struct k_thread * thread)1604 int arch_mem_domain_thread_add(struct k_thread *thread)
1605 {
1606 return 0;
1607 }
1608
1609 __pinned_func
arch_mem_domain_thread_remove(struct k_thread * thread)1610 int arch_mem_domain_thread_remove(struct k_thread *thread)
1611 {
1612 return 0;
1613 }
1614 #else
1615 /* Memory domains each have a set of page tables assigned to them */
1616
1617 /*
1618 * Pool of free memory pages for copying page tables, as needed.
1619 */
1620 #define PTABLE_COPY_SIZE (INITIAL_PTABLE_PAGES * CONFIG_MMU_PAGE_SIZE)
1621
1622 static uint8_t __pinned_noinit
1623 page_pool[PTABLE_COPY_SIZE * CONFIG_X86_MAX_ADDITIONAL_MEM_DOMAINS]
1624 __aligned(CONFIG_MMU_PAGE_SIZE);
1625
1626 __pinned_data
1627 static uint8_t *page_pos = page_pool + sizeof(page_pool);
1628
1629 /* Return a zeroed and suitably aligned memory page for page table data
1630 * from the global page pool
1631 */
1632 __pinned_func
page_pool_get(void)1633 static void *page_pool_get(void)
1634 {
1635 void *ret;
1636
1637 if (page_pos == page_pool) {
1638 ret = NULL;
1639 } else {
1640 page_pos -= CONFIG_MMU_PAGE_SIZE;
1641 ret = page_pos;
1642 }
1643
1644 if (ret != NULL) {
1645 memset(ret, 0, CONFIG_MMU_PAGE_SIZE);
1646 }
1647
1648 return ret;
1649 }
1650
1651 /* Debugging function to show how many pages are free in the pool */
1652 __pinned_func
pages_free(void)1653 static inline unsigned int pages_free(void)
1654 {
1655 return (page_pos - page_pool) / CONFIG_MMU_PAGE_SIZE;
1656 }
1657
1658 /**
1659 * Duplicate an entire set of page tables
1660 *
1661 * Uses recursion, but depth at any given moment is limited by the number of
1662 * paging levels.
1663 *
1664 * x86_mmu_lock must be held.
1665 *
1666 * @param dst a zeroed out chunk of memory of sufficient size for the indicated
1667 * paging level.
1668 * @param src some paging structure from within the source page tables to copy
1669 * at the indicated paging level
1670 * @param level Current paging level
1671 * @retval 0 Success
1672 * @retval -ENOMEM Insufficient page pool memory
1673 */
1674 __pinned_func
copy_page_table(pentry_t * dst,pentry_t * src,int level)1675 static int copy_page_table(pentry_t *dst, pentry_t *src, int level)
1676 {
1677 if (level == PTE_LEVEL) {
1678 /* Base case: leaf page table */
1679 for (int i = 0; i < get_num_entries(level); i++) {
1680 dst[i] = pte_finalize_value(reset_pte(src[i]), true,
1681 PTE_LEVEL);
1682 }
1683 } else {
1684 /* Recursive case: allocate sub-structures as needed and
1685 * make recursive calls on them
1686 */
1687 for (int i = 0; i < get_num_entries(level); i++) {
1688 pentry_t *child_dst;
1689 int ret;
1690
1691 if ((src[i] & MMU_P) == 0) {
1692 /* Non-present, skip */
1693 continue;
1694 }
1695
1696 if ((level == PDE_LEVEL) && ((src[i] & MMU_PS) != 0)) {
1697 /* large page: no lower level table */
1698 dst[i] = pte_finalize_value(src[i], true,
1699 PDE_LEVEL);
1700 continue;
1701 }
1702
1703 __ASSERT((src[i] & MMU_PS) == 0,
1704 "large page encountered");
1705
1706 child_dst = page_pool_get();
1707 if (child_dst == NULL) {
1708 return -ENOMEM;
1709 }
1710
1711 /* Page table links are by physical address. RAM
1712 * for page tables is identity-mapped, but double-
1713 * cast needed for PAE case where sizeof(void *) and
1714 * sizeof(pentry_t) are not the same.
1715 */
1716 dst[i] = ((pentry_t)z_mem_phys_addr(child_dst) |
1717 INT_FLAGS);
1718
1719 ret = copy_page_table(child_dst,
1720 next_table(src[i], level),
1721 level + 1);
1722 if (ret != 0) {
1723 return ret;
1724 }
1725 }
1726 }
1727
1728 return 0;
1729 }
1730
1731 __pinned_func
region_map_update(pentry_t * ptables,void * start,size_t size,pentry_t flags,bool reset)1732 static int region_map_update(pentry_t *ptables, void *start,
1733 size_t size, pentry_t flags, bool reset)
1734 {
1735 uint32_t options = OPTION_USER;
1736 int ret;
1737 k_spinlock_key_t key;
1738
1739 if (reset) {
1740 options |= OPTION_RESET;
1741 }
1742 if (ptables == z_x86_page_tables_get()) {
1743 options |= OPTION_FLUSH;
1744 }
1745
1746 key = k_spin_lock(&x86_mmu_lock);
1747 ret = range_map_ptables(ptables, start, 0, size, flags, MASK_PERM,
1748 options);
1749 k_spin_unlock(&x86_mmu_lock, key);
1750
1751 #ifdef CONFIG_SMP
1752 tlb_shootdown();
1753 #endif
1754
1755 return ret;
1756 }
1757
1758 __pinned_func
reset_region(pentry_t * ptables,void * start,size_t size)1759 static inline int reset_region(pentry_t *ptables, void *start, size_t size)
1760 {
1761 LOG_DBG("%s(%p, %p, %zu)", __func__, ptables, start, size);
1762 return region_map_update(ptables, start, size, 0, true);
1763 }
1764
1765 __pinned_func
apply_region(pentry_t * ptables,void * start,size_t size,pentry_t attr)1766 static inline int apply_region(pentry_t *ptables, void *start,
1767 size_t size, pentry_t attr)
1768 {
1769 LOG_DBG("%s(%p, %p, %zu, " PRI_ENTRY ")", __func__, ptables, start,
1770 size, attr);
1771 return region_map_update(ptables, start, size, attr, false);
1772 }
1773
1774 __pinned_func
set_stack_perms(struct k_thread * thread,pentry_t * ptables)1775 static void set_stack_perms(struct k_thread *thread, pentry_t *ptables)
1776 {
1777 LOG_DBG("update stack for thread %p's ptables at %p: %p (size %zu)",
1778 thread, ptables, (void *)thread->stack_info.start,
1779 thread->stack_info.size);
1780 apply_region(ptables, (void *)thread->stack_info.start,
1781 thread->stack_info.size,
1782 MMU_P | MMU_XD | MMU_RW | MMU_US);
1783 }
1784
1785 /*
1786 * Arch interface implementations for memory domains and userspace
1787 */
1788
1789 __boot_func
arch_mem_domain_init(struct k_mem_domain * domain)1790 int arch_mem_domain_init(struct k_mem_domain *domain)
1791 {
1792 int ret;
1793 k_spinlock_key_t key = k_spin_lock(&x86_mmu_lock);
1794
1795 LOG_DBG("%s(%p)", __func__, domain);
1796 #if __ASSERT_ON
1797 sys_snode_t *node;
1798
1799 /* Assert that we have not already initialized this domain */
1800 SYS_SLIST_FOR_EACH_NODE(&x86_domain_list, node) {
1801 struct arch_mem_domain *list_domain =
1802 CONTAINER_OF(node, struct arch_mem_domain, node);
1803
1804 __ASSERT(list_domain != &domain->arch,
1805 "%s(%p) called multiple times", __func__, domain);
1806 }
1807 #endif /* __ASSERT_ON */
1808 #ifndef CONFIG_X86_KPTI
1809 /* If we're not using KPTI then we can use the build time page tables
1810 * (which are mutable) as the set of page tables for the default
1811 * memory domain, saving us some memory.
1812 *
1813 * We skip adding this domain to x86_domain_list since we already
1814 * update z_x86_kernel_ptables directly in range_map().
1815 */
1816 if (domain == &k_mem_domain_default) {
1817 domain->arch.ptables = z_x86_kernel_ptables;
1818 k_spin_unlock(&x86_mmu_lock, key);
1819 return 0;
1820 }
1821 #endif /* CONFIG_X86_KPTI */
1822 #ifdef CONFIG_X86_PAE
1823 /* PDPT is stored within the memory domain itself since it is
1824 * much smaller than a full page
1825 */
1826 (void)memset(domain->arch.pdpt, 0, sizeof(domain->arch.pdpt));
1827 domain->arch.ptables = domain->arch.pdpt;
1828 #else
1829 /* Allocate a page-sized top-level structure, either a PD or PML4 */
1830 domain->arch.ptables = page_pool_get();
1831 if (domain->arch.ptables == NULL) {
1832 k_spin_unlock(&x86_mmu_lock, key);
1833 return -ENOMEM;
1834 }
1835 #endif /* CONFIG_X86_PAE */
1836
1837 LOG_DBG("copy_page_table(%p, %p, 0)", domain->arch.ptables,
1838 z_x86_kernel_ptables);
1839
1840 /* Make a copy of the boot page tables created by gen_mmu.py */
1841 ret = copy_page_table(domain->arch.ptables, z_x86_kernel_ptables, 0);
1842 if (ret == 0) {
1843 sys_slist_append(&x86_domain_list, &domain->arch.node);
1844 }
1845 k_spin_unlock(&x86_mmu_lock, key);
1846
1847 return ret;
1848 }
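
/* For context: this arch hook is not called directly by applications; it is
 * reached through the generic memory domain API. A minimal sketch of an
 * application-level caller, assuming the standard k_mem_domain_*() API and a
 * page-aligned partition (attribute macro, alignment and thread_id are
 * illustrative and target-dependent):
 *
 *	static uint8_t app_buf[4096] __aligned(4096);
 *	K_MEM_PARTITION_DEFINE(app_part, app_buf, sizeof(app_buf),
 *			       K_MEM_PARTITION_P_RW_U_RW);
 *
 *	static struct k_mem_partition *parts[] = { &app_part };
 *	struct k_mem_domain app_domain;
 *
 *	k_mem_domain_init(&app_domain, ARRAY_SIZE(parts), parts);
 *	k_mem_domain_add_thread(&app_domain, thread_id);
 *
 * k_mem_domain_init() ends up here, where the domain receives its own copy
 * of the boot page tables; partitions and threads are then applied to that
 * copy by the hooks below.
 */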
1849
1850 int arch_mem_domain_partition_remove(struct k_mem_domain *domain,
1851 uint32_t partition_id)
1852 {
1853 struct k_mem_partition *partition = &domain->partitions[partition_id];
1854
1855 /* Reset the partition's region back to defaults */
1856 return reset_region(domain->arch.ptables, (void *)partition->start,
1857 partition->size);
1858 }
1859
1860 /* Called on thread exit or when moving it to a different memory domain */
1861 int arch_mem_domain_thread_remove(struct k_thread *thread)
1862 {
1863 struct k_mem_domain *domain = thread->mem_domain_info.mem_domain;
1864
1865 if ((thread->base.user_options & K_USER) == 0) {
1866 return 0;
1867 }
1868
1869 if ((thread->base.thread_state & _THREAD_DEAD) == 0) {
1870 /* Thread is migrating to another memory domain and not
1871 * exiting for good; we weren't called from
1872 * z_thread_abort(). Resetting the stack region will
1873 * take place in the forthcoming thread_add() call.
1874 */
1875 return 0;
1876 }
1877
1878 /* Restore permissions on the thread's stack area since it is no
1879 * longer a member of the domain.
1880 */
1881 return reset_region(domain->arch.ptables,
1882 (void *)thread->stack_info.start,
1883 thread->stack_info.size);
1884 }
1885
1886 __pinned_func
1887 int arch_mem_domain_partition_add(struct k_mem_domain *domain,
1888 uint32_t partition_id)
1889 {
1890 struct k_mem_partition *partition = &domain->partitions[partition_id];
1891
1892 /* Update the page tables with the partition info */
1893 return apply_region(domain->arch.ptables, (void *)partition->start,
1894 partition->size, partition->attr | MMU_P);
1895 }
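
/* Note: on x86 a partition's 'attr' field is already a PTE permission mask
 * (the K_MEM_PARTITION_* attribute macros are built from bits such as
 * MMU_RW, MMU_US and MMU_XD), so the only thing added here is the Present
 * bit. For example, a read/write user partition ends up mapped at the leaf
 * level with roughly:
 *
 *	MMU_P | MMU_RW | MMU_US | MMU_XD
 *
 * (illustrative; the exact bits depend on the attribute macro used).
 */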
1896
1897 /* Invoked from memory domain API calls, as well as during thread creation */
1898 __pinned_func
1899 int arch_mem_domain_thread_add(struct k_thread *thread)
1900 {
1901 int ret = 0;
1902
1903 /* New memory domain we are being added to */
1904 struct k_mem_domain *domain = thread->mem_domain_info.mem_domain;
1905 	/* This is only set for threads that are migrating from some other
1906 	 * memory domain; for new threads this is NULL.
1907 	 *
1908 	 * Note that the NULL check on old_ptables must be done before any
1909 	 * address translation, or else (NULL + offset) != NULL.
1910 	 */
1911 pentry_t *old_ptables = UINT_TO_POINTER(thread->arch.ptables);
1912 bool is_user = (thread->base.user_options & K_USER) != 0;
1913 bool is_migration = (old_ptables != NULL) && is_user;
1914
1915 	/* Allow US access to the thread's stack in its new domain if
1916 	 * we are migrating. If we are not migrating, this is done in
1917 	 * z_x86_current_stack_perms().
1918 	 */
1919 if (is_migration) {
1920 old_ptables = z_mem_virt_addr(thread->arch.ptables);
1921 set_stack_perms(thread, domain->arch.ptables);
1922 }
1923
1924 thread->arch.ptables = z_mem_phys_addr(domain->arch.ptables);
1925 LOG_DBG("set thread %p page tables to %p", thread,
1926 (void *)thread->arch.ptables);
1927
1928 /* Check if we're doing a migration from a different memory domain
1929 * and have to remove permissions from its old domain.
1930 *
1931 * XXX: The checks we have to do here and in
1932 	 * arch_mem_domain_thread_remove() are clumsy; it may be worth looking
1933 * into adding a specific arch_mem_domain_thread_migrate() API.
1934 * See #29601
1935 */
1936 if (is_migration) {
1937 ret = reset_region(old_ptables,
1938 (void *)thread->stack_info.start,
1939 thread->stack_info.size);
1940 }
1941
1942 #if !defined(CONFIG_X86_KPTI) && !defined(CONFIG_X86_COMMON_PAGE_TABLE)
1943 /* Need to switch to using these new page tables, in case we drop
1944 * to user mode before we are ever context switched out.
1945 * IPI takes care of this if the thread is currently running on some
1946 * other CPU.
1947 */
1948 if (thread == _current && thread->arch.ptables != z_x86_cr3_get()) {
1949 z_x86_cr3_set(thread->arch.ptables);
1950 }
1951 #endif /* !CONFIG_X86_KPTI && !CONFIG_X86_COMMON_PAGE_TABLE */
1952
1953 return ret;
1954 }
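
/* Sketch of the migration path above (illustrative): a user thread that is
 * already a member of domain_a and is then moved via
 *
 *	k_mem_domain_add_thread(&domain_b, tid);
 *
 * arrives here with thread->arch.ptables still referring to domain_a's
 * tables, so is_migration is true: the stack is granted user access in
 * domain_b's tables, the thread is switched over to them, and the stack
 * mapping left behind in domain_a's tables is reset to default permissions.
 */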
1955 #endif /* !CONFIG_X86_COMMON_PAGE_TABLE */
1956
1957 __pinned_func
1958 int arch_mem_domain_max_partitions_get(void)
1959 {
1960 return CONFIG_MAX_DOMAIN_PARTITIONS;
1961 }
1962
1963 /* Invoked from z_x86_userspace_enter */
1964 __pinned_func
1965 void z_x86_current_stack_perms(void)
1966 {
1967 /* Clear any previous context in the stack buffer to prevent
1968 * unintentional data leakage.
1969 */
1970 (void)memset((void *)_current->stack_info.start, 0xAA,
1971 _current->stack_info.size - _current->stack_info.delta);
1972
1973 /* Only now is it safe to grant access to the stack buffer since any
1974 * previous context has been erased.
1975 */
1976 #ifdef CONFIG_X86_COMMON_PAGE_TABLE
1977 	/* Re-run the swap page table update logic since we're entering user
1978 	 * mode. This will grant stack and memory domain access if it wasn't
1979 	 * set already (in which case this returns very quickly).
1980 	 */
1981 z_x86_swap_update_common_page_table(_current);
1982 #else
1983 /* Memory domain access is already programmed into the page tables.
1984 * Need to enable access to this new user thread's stack buffer in
1985 * its domain-specific page tables.
1986 */
1987 set_stack_perms(_current, z_x86_thread_page_tables_get(_current));
1988 #endif
1989 }
1990 #endif /* CONFIG_USERSPACE */
1991
1992 #ifdef CONFIG_ARCH_HAS_RESERVED_PAGE_FRAMES
1993 __boot_func
1994 static void mark_addr_page_reserved(uintptr_t addr, size_t len)
1995 {
1996 uintptr_t pos = ROUND_DOWN(addr, CONFIG_MMU_PAGE_SIZE);
1997 uintptr_t end = ROUND_UP(addr + len, CONFIG_MMU_PAGE_SIZE);
1998
1999 for (; pos < end; pos += CONFIG_MMU_PAGE_SIZE) {
2000 if (!z_is_page_frame(pos)) {
2001 continue;
2002 }
2003
2004 struct z_page_frame *pf = z_phys_to_page_frame(pos);
2005
2006 pf->flags |= Z_PAGE_FRAME_RESERVED;
2007 }
2008 }
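
/* Worked example of the rounding above, assuming a 4096-byte
 * CONFIG_MMU_PAGE_SIZE: a request with addr = 0x10234 and len = 0x100
 * gives pos = 0x10000 and end = 0x11000, so exactly the one page frame
 * overlapping the requested range gets Z_PAGE_FRAME_RESERVED set
 * (provided it is a managed page frame at all).
 */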
2009
2010 __boot_func
2011 void arch_reserved_pages_update(void)
2012 {
2013 #ifdef CONFIG_X86_PC_COMPATIBLE
2014 	/*
2015 	 * Ideally we would do an E820 or similar enumeration to specifically
2016 	 * identify all page frames which are reserved by the hardware or
2017 	 * firmware, or use x86_memmap[] when Multiboot provides it.
2018 	 *
2019 	 * Regardless, reserve everything in the first megabyte of physical
2020 	 * memory on PC-compatible platforms.
2021 	 */
2022 mark_addr_page_reserved(0, MB(1));
2023 #endif /* CONFIG_X86_PC_COMPATIBLE */
2024
2025 #ifdef CONFIG_X86_MEMMAP
2026 for (int i = 0; i < CONFIG_X86_MEMMAP_ENTRIES; i++) {
2027 struct x86_memmap_entry *entry = &x86_memmap[i];
2028
2029 switch (entry->type) {
2030 case X86_MEMMAP_ENTRY_UNUSED:
2031 __fallthrough;
2032 case X86_MEMMAP_ENTRY_RAM:
2033 continue;
2034
2035 case X86_MEMMAP_ENTRY_ACPI:
2036 __fallthrough;
2037 case X86_MEMMAP_ENTRY_NVS:
2038 __fallthrough;
2039 case X86_MEMMAP_ENTRY_DEFECTIVE:
2040 __fallthrough;
2041 default:
2042 			/* For the cases above, and for any unknown entry
2043 			 * type, fall out of the switch and mark the pages
2044 			 * reserved */
2045 break;
2046 }
2047
2048 mark_addr_page_reserved(entry->base, entry->length);
2049 }
2050 #endif /* CONFIG_X86_MEMMAP */
2051 }
2052 #endif /* CONFIG_ARCH_HAS_RESERVED_PAGE_FRAMES */
2053
2054 int arch_page_phys_get(void *virt, uintptr_t *phys)
2055 {
2056 pentry_t pte = 0;
2057 int level, ret;
2058
2059 __ASSERT(POINTER_TO_UINT(virt) % CONFIG_MMU_PAGE_SIZE == 0U,
2060 "unaligned address %p to %s", virt, __func__);
2061
2062 pentry_get(&level, &pte, z_x86_page_tables_get(), virt);
2063
2064 if ((pte & MMU_P) != 0) {
2065 if (phys != NULL) {
2066 *phys = (uintptr_t)get_entry_phys(pte, PTE_LEVEL);
2067 }
2068 ret = 0;
2069 } else {
2070 /* Not mapped */
2071 ret = -EFAULT;
2072 }
2073
2074 return ret;
2075 }
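
/* Example use (illustrative): translating a page-aligned kernel virtual
 * address, e.g. when a physical address is needed for device programming:
 *
 *	uintptr_t phys;
 *
 *	if (arch_page_phys_get(virt_page, &phys) == 0) {
 *		... virt_page is currently mapped at physical address phys ...
 *	} else {
 *		... -EFAULT: virt_page is not mapped ...
 *	}
 *
 * where virt_page is assumed to be aligned to CONFIG_MMU_PAGE_SIZE, per the
 * assertion above.
 */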
2076
2077 #ifdef CONFIG_DEMAND_PAGING
2078 #define PTE_MASK (paging_levels[PTE_LEVEL].mask)
2079
2080 __pinned_func
2081 void arch_mem_page_out(void *addr, uintptr_t location)
2082 {
2083 int ret;
2084 pentry_t mask = PTE_MASK | MMU_P | MMU_A;
2085
2086 	/* The Accessed bit is set to guarantee the entry is not completely
2087 	 * zero when the location value is 0; an all-zero PTE means un-mapped.
2088 	 */
2089 ret = range_map(addr, location, CONFIG_MMU_PAGE_SIZE, MMU_A, mask,
2090 OPTION_FLUSH);
2091 __ASSERT_NO_MSG(ret == 0);
2092 ARG_UNUSED(ret);
2093 }
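
/* Illustrative result: evicting the page at addr to backing store location
 * 0x3000 leaves a non-present PTE of roughly (0x3000 | MMU_A). The accessed
 * bit keeps the entry non-zero even for location 0, so a paged-out page can
 * still be told apart from an un-mapped (all-zero) entry, while the address
 * bits record where the data now lives.
 */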
2094
2095 __pinned_func
2096 void arch_mem_page_in(void *addr, uintptr_t phys)
2097 {
2098 int ret;
2099 pentry_t mask = PTE_MASK | MMU_P | MMU_D | MMU_A;
2100
2101 ret = range_map(addr, phys, CONFIG_MMU_PAGE_SIZE, MMU_P, mask,
2102 OPTION_FLUSH);
2103 __ASSERT_NO_MSG(ret == 0);
2104 ARG_UNUSED(ret);
2105 }
2106
2107 __pinned_func
2108 void arch_mem_scratch(uintptr_t phys)
2109 {
2110 page_map_set(z_x86_page_tables_get(), Z_SCRATCH_PAGE,
2111 phys | MMU_P | MMU_RW | MMU_XD, NULL, MASK_ALL,
2112 OPTION_FLUSH);
2113 }
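
/* Example use (illustrative): the core paging code uses the scratch mapping
 * to copy data to or from a physical frame that has no virtual mapping of
 * its own, along the lines of:
 *
 *	arch_mem_scratch(frame_phys_addr);
 *	memcpy(Z_SCRATCH_PAGE, source_buf, CONFIG_MMU_PAGE_SIZE);
 */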
2114
2115 __pinned_func
2116 uintptr_t arch_page_info_get(void *addr, uintptr_t *phys, bool clear_accessed)
2117 {
2118 pentry_t all_pte, mask;
2119 uint32_t options;
2120
2121 /* What to change, if anything, in the page_map_set() calls */
2122 if (clear_accessed) {
2123 mask = MMU_A;
2124 options = OPTION_FLUSH;
2125 } else {
2126 /* In this configuration page_map_set() just queries the
2127 * page table and makes no changes
2128 */
2129 mask = 0;
2130 options = 0U;
2131 }
2132
2133 page_map_set(z_x86_kernel_ptables, addr, 0, &all_pte, mask, options);
2134
2135 /* Un-mapped PTEs are completely zeroed. No need to report anything
2136 * else in this case.
2137 */
2138 if (all_pte == 0) {
2139 return ARCH_DATA_PAGE_NOT_MAPPED;
2140 }
2141
2142 #if defined(CONFIG_USERSPACE) && !defined(CONFIG_X86_COMMON_PAGE_TABLE)
2143 /* Don't bother looking at other page tables if non-present as we
2144 * are not required to report accurate accessed/dirty in this case
2145 * and all mappings are otherwise the same.
2146 */
2147 if ((all_pte & MMU_P) != 0) {
2148 sys_snode_t *node;
2149
2150 /* IRQs are locked, safe to do this */
2151 SYS_SLIST_FOR_EACH_NODE(&x86_domain_list, node) {
2152 pentry_t cur_pte;
2153 struct arch_mem_domain *domain =
2154 CONTAINER_OF(node, struct arch_mem_domain,
2155 node);
2156
2157 page_map_set(domain->ptables, addr, 0, &cur_pte,
2158 mask, options | OPTION_USER);
2159
2160 /* Logical OR of relevant PTE in all page tables.
2161 * addr/location and present state should be identical
2162 * among them.
2163 */
2164 all_pte |= cur_pte;
2165 }
2166 }
2167 #endif /* USERSPACE && ~X86_COMMON_PAGE_TABLE */
2168
2169 	/* NOTE: We are truncating the PTE on PAE systems, whose pentry_t
2170 	 * is larger than a uintptr_t.
2171 	 *
2172 	 * We currently aren't required to report back XD state (bit 63), and
2173 	 * Zephyr just doesn't support large physical memory on 32-bit
2174 	 * systems; PAE was only implemented for XD support.
2175 	 */
2176 if (phys != NULL) {
2177 *phys = (uintptr_t)get_entry_phys(all_pte, PTE_LEVEL);
2178 }
2179
2180 /* We don't filter out any other bits in the PTE and the kernel
2181 * ignores them. For the case of ARCH_DATA_PAGE_NOT_MAPPED,
2182 * we use a bit which is never set in a real PTE (the PAT bit) in the
2183 * current system.
2184 *
2185 * The other ARCH_DATA_PAGE_* macros are defined to their corresponding
2186 * bits in the PTE.
2187 */
2188 return (uintptr_t)all_pte;
2189 }
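
/* Example use (a sketch of what an eviction algorithm might do, assuming
 * the generic ARCH_DATA_PAGE_* macros map onto these PTE bits):
 *
 *	uintptr_t info = arch_page_info_get(addr, NULL, true);
 *
 *	if (info != ARCH_DATA_PAGE_NOT_MAPPED &&
 *	    (info & ARCH_DATA_PAGE_ACCESSED) == 0U) {
 *		... mapped, but not accessed since the last sweep:
 *		    a reasonable eviction candidate ...
 *	}
 *
 * Passing clear_accessed = true also clears the accessed bits as a side
 * effect, which is what such a sweep typically wants.
 */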
2190
2191 __pinned_func
2192 enum arch_page_location arch_page_location_get(void *addr, uintptr_t *location)
2193 {
2194 pentry_t pte;
2195 int level;
2196
2197 /* TODO: since we only have to query the current set of page tables,
2198 * could optimize this with recursive page table mapping
2199 */
2200 pentry_get(&level, &pte, z_x86_page_tables_get(), addr);
2201
2202 if (pte == 0) {
2203 /* Not mapped */
2204 return ARCH_PAGE_LOCATION_BAD;
2205 }
2206
2207 __ASSERT(level == PTE_LEVEL, "bigpage found at %p", addr);
2208 *location = (uintptr_t)get_entry_phys(pte, PTE_LEVEL);
2209
2210 if ((pte & MMU_P) != 0) {
2211 return ARCH_PAGE_LOCATION_PAGED_IN;
2212 } else {
2213 return ARCH_PAGE_LOCATION_PAGED_OUT;
2214 }
2215 }
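
/* Example use (illustrative): distinguishing the three possible outcomes:
 *
 *	uintptr_t location;
 *
 *	switch (arch_page_location_get(addr, &location)) {
 *	case ARCH_PAGE_LOCATION_PAGED_IN:
 *		... location is the physical address of the mapped frame ...
 *		break;
 *	case ARCH_PAGE_LOCATION_PAGED_OUT:
 *		... location identifies where the data sits in the
 *		    backing store ...
 *		break;
 *	case ARCH_PAGE_LOCATION_BAD:
 *	default:
 *		... addr is not a paged data page at all ...
 *		break;
 *	}
 */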
2216
2217 #ifdef CONFIG_X86_KPTI
2218 __pinned_func
2219 bool z_x86_kpti_is_access_ok(void *addr, pentry_t *ptables)
2220 {
2221 pentry_t pte;
2222 int level;
2223
2224 pentry_get(&level, &pte, ptables, addr);
2225
2226 	/* Might as well also check if it's un-mapped; normally we don't
2227 	 * fetch the PTE from the page tables until we are inside
2228 	 * z_page_fault() and call arch_page_fault_status_get().
2229 	 */
2230 if (level != PTE_LEVEL || pte == 0 || is_flipped_pte(pte)) {
2231 return false;
2232 }
2233
2234 return true;
2235 }
2236 #endif /* CONFIG_X86_KPTI */
2237 #endif /* CONFIG_DEMAND_PAGING */
2238