1 /*
2  * Copyright (c) 2011-2014 Wind River Systems, Inc.
3  * Copyright (c) 2017-2020 Intel Corporation
4  *
5  * SPDX-License-Identifier: Apache-2.0
6  */
7 
8 #include <zephyr/kernel.h>
9 #include <zephyr/arch/x86/mmustructs.h>
10 #include <zephyr/sys/mem_manage.h>
11 #include <zephyr/sys/__assert.h>
12 #include <zephyr/sys/check.h>
13 #include <zephyr/logging/log.h>
14 #include <errno.h>
15 #include <ctype.h>
16 #include <zephyr/spinlock.h>
17 #include <kernel_arch_func.h>
18 #include <x86_mmu.h>
19 #include <zephyr/init.h>
20 #include <kernel_internal.h>
21 #include <mmu.h>
22 #include <zephyr/drivers/interrupt_controller/loapic.h>
24 #include <zephyr/arch/x86/memmap.h>
25 
26 LOG_MODULE_DECLARE(os, CONFIG_KERNEL_LOG_LEVEL);
27 
28 /* We will use some ignored bits in the PTE to back up the permission settings
29  * from when the mapping was made. This is used to un-apply memory domain
30  * partitions from page tables when the partitions are removed.
31  */
32 #define MMU_RW_ORIG	MMU_IGNORED0
33 #define MMU_US_ORIG	MMU_IGNORED1
34 #define MMU_XD_ORIG	MMU_IGNORED2
35 
36 /* Bits in the PTE that form the set of permission bits, when resetting */
37 #define MASK_PERM	(MMU_RW | MMU_US | MMU_XD)
38 
39 /* When we want to set up a new mapping, discarding any previous state */
40 #define MASK_ALL	(~((pentry_t)0U))
41 
42 /* Bits to set at mapping time for particular permissions. We set both the
43  * actual page table bit that enforces the policy and the backup bit.
44  */
45 #define ENTRY_RW	(MMU_RW | MMU_RW_ORIG)
46 #define ENTRY_US	(MMU_US | MMU_US_ORIG)
47 #define ENTRY_XD	(MMU_XD | MMU_XD_ORIG)
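
/* Hedged example (illustrative only, not used by the code below): a present,
 * writable, user-accessible, non-executable data page would carry
 *
 *   MMU_P | ENTRY_RW | ENTRY_US | ENTRY_XD
 *
 * i.e. the live policy bits (MMU_RW/MMU_US/MMU_XD) plus their backup copies in
 * the ignored bits, so the original permissions can later be restored with
 * OPTION_RESET.
 */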
48 
49 /* Bit position which is always zero in a PTE. We'll use the PAT bit.
50  * This helps disambiguate PTEs that do not have the Present bit set (MMU_P):
51  * - If the entire entry is zero, it's an un-mapped virtual page
52  * - If PTE_ZERO is set, we flipped this page due to KPTI
53  * - Otherwise, this was a page-out
54  */
55 #define PTE_ZERO	MMU_PAT
56 
57 /* Protects x86_domain_list and serializes instantiation of intermediate
58  * paging structures.
59  */
60 __pinned_bss
61 static struct k_spinlock x86_mmu_lock;
62 
63 #if defined(CONFIG_USERSPACE) && !defined(CONFIG_X86_COMMON_PAGE_TABLE)
64 /* List of all active and initialized memory domains. This is used to make
65  * sure all memory mappings are the same across all page tables when invoking
66  * range_map()
67  */
68 __pinned_bss
69 static sys_slist_t x86_domain_list;
70 #endif
71 
72 /*
73  * Definitions for building an ontology of paging levels and capabilities
74  * at each level
75  */
76 
77 /* Data structure describing the characteristics of a particular paging
78  * level
79  */
80 struct paging_level {
81 	/* What bits are used to store physical address */
82 	pentry_t mask;
83 
84 	/* Number of entries in this paging structure */
85 	size_t entries;
86 
87 	/* How many bits to right-shift a virtual address to obtain the
88 	 * appropriate entry within this table.
89 	 *
90 	 * The memory scope of each entry in this table is 1 << shift.
91 	 */
92 	unsigned int shift;
93 #ifdef CONFIG_EXCEPTION_DEBUG
94 	/* Name of this level, for debug purposes */
95 	const char *name;
96 #endif
97 };
98 
99 /* Flags for all entries in intermediate paging levels.
100  * Fortunately, the same bits are set for all intermediate levels for all
101  * three paging modes.
102  *
103  * Obviously P is set.
104  *
105  * We want the RW and US bits always set; actual access control will be
106  * done at the leaf level.
107  *
108  * XD (if supported) always 0. Disabling execution done at leaf level.
109  *
110  * PCD/PWT always 0. Caching properties again done at leaf level.
111  */
112 #define INT_FLAGS	(MMU_P | MMU_RW | MMU_US)
113 
114 /* Paging level ontology for the selected paging mode.
115  *
116  * See Figures 4-4, 4-7, 4-11 in the Intel SDM, vol 3A
117  */
118 __pinned_rodata
119 static const struct paging_level paging_levels[] = {
120 #ifdef CONFIG_X86_64
121 	/* Page Map Level 4 */
122 	{
123 		.mask = 0x7FFFFFFFFFFFF000ULL,
124 		.entries = 512U,
125 		.shift = 39U,
126 #ifdef CONFIG_EXCEPTION_DEBUG
127 		.name = "PML4"
128 #endif
129 	},
130 #endif /* CONFIG_X86_64 */
131 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
132 	/* Page Directory Pointer Table */
133 	{
134 		.mask = 0x7FFFFFFFFFFFF000ULL,
135 #ifdef CONFIG_X86_64
136 		.entries = 512U,
137 #else
138 		/* PAE version */
139 		.entries = 4U,
140 #endif
141 		.shift = 30U,
142 #ifdef CONFIG_EXCEPTION_DEBUG
143 		.name = "PDPT"
144 #endif
145 	},
146 #endif /* CONFIG_X86_64 || CONFIG_X86_PAE */
147 	/* Page Directory */
148 	{
149 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
150 		.mask = 0x7FFFFFFFFFFFF000ULL,
151 		.entries = 512U,
152 		.shift = 21U,
153 #else
154 		/* 32-bit */
155 		.mask = 0xFFFFF000U,
156 		.entries = 1024U,
157 		.shift = 22U,
158 #endif /* CONFIG_X86_64 || CONFIG_X86_PAE */
159 #ifdef CONFIG_EXCEPTION_DEBUG
160 		.name = "PD"
161 #endif
162 	},
163 	/* Page Table */
164 	{
165 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
166 		.mask = 0x07FFFFFFFFFFF000ULL,
167 		.entries = 512U,
168 		.shift = 12U,
169 #else
170 		/* 32-bit */
171 		.mask = 0xFFFFF000U,
172 		.entries = 1024U,
173 		.shift = 12U,
174 #endif /* CONFIG_X86_64 || CONFIG_X86_PAE */
175 #ifdef CONFIG_EXCEPTION_DEBUG
176 		.name = "PT"
177 #endif
178 	}
179 };
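
/* Worked example of the "shift"/"entries" fields above, assuming 4K pages:
 * at the PT level shift == 12, so each PTE covers 1 << 12 = 4KB and a
 * 512-entry table spans 2MB; at the PD level shift == 21, so each PDE covers
 * 2MB and a 512-entry PD spans 1GB (64-bit/PAE). The 32-bit non-PAE layout
 * instead uses 1024-entry tables with shift values of 22 and 12.
 */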
180 
181 #define NUM_LEVELS	ARRAY_SIZE(paging_levels)
182 #define PTE_LEVEL	(NUM_LEVELS - 1)
183 #define PDE_LEVEL	(NUM_LEVELS - 2)
184 
185 /*
186  * Macros for reserving space for page tables
187  *
188  * We need to reserve a block of memory equal in size to the page tables
189  * generated by gen_mmu.py so that memory addresses do not shift between
190  * build phases. These macros ultimately specify INITIAL_PTABLE_SIZE.
191  */
192 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
193 #ifdef CONFIG_X86_64
194 #define NUM_PML4_ENTRIES 512U
195 #define NUM_PDPT_ENTRIES 512U
196 #else
197 #define NUM_PDPT_ENTRIES 4U
198 #endif /* CONFIG_X86_64 */
199 #define NUM_PD_ENTRIES   512U
200 #define NUM_PT_ENTRIES   512U
201 #else
202 #define NUM_PD_ENTRIES   1024U
203 #define NUM_PT_ENTRIES   1024U
204 #endif /* !CONFIG_X86_64 && !CONFIG_X86_PAE */
205 
206 /* Memory range covered by an instance of various table types */
207 #define PT_AREA		((uintptr_t)(CONFIG_MMU_PAGE_SIZE * NUM_PT_ENTRIES))
208 #define PD_AREA 	(PT_AREA * NUM_PD_ENTRIES)
209 #ifdef CONFIG_X86_64
210 #define PDPT_AREA	(PD_AREA * NUM_PDPT_ENTRIES)
211 #endif
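
/* Illustrative arithmetic, assuming CONFIG_MMU_PAGE_SIZE == 4K and the
 * 64-bit entry counts above: PT_AREA = 4K * 512 = 2MB, PD_AREA = 2MB * 512
 * = 1GB, and PDPT_AREA = 1GB * 512 = 512GB.
 */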
212 
213 #define VM_ADDR		CONFIG_KERNEL_VM_BASE
214 #define VM_SIZE		CONFIG_KERNEL_VM_SIZE
215 
216 /* Define a range [PT_START, PT_END) which is the memory range
217  * covered by all the page tables needed for the address space
218  */
219 #define PT_START	((uintptr_t)ROUND_DOWN(VM_ADDR, PT_AREA))
220 #define PT_END		((uintptr_t)ROUND_UP(VM_ADDR + VM_SIZE, PT_AREA))
221 
222 /* Number of page tables needed to cover the address space. Depends on the
223  * specific bounds, but roughly 1 page table per 2MB of virtual address space
224  */
225 #define NUM_PT	((PT_END - PT_START) / PT_AREA)
226 
227 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
228 /* Same semantics as above, but for the page directories needed to cover
229  * system RAM.
230  */
231 #define PD_START	((uintptr_t)ROUND_DOWN(VM_ADDR, PD_AREA))
232 #define PD_END		((uintptr_t)ROUND_UP(VM_ADDR + VM_SIZE, PD_AREA))
233 /* Number of page directories needed to cover the address space. Depends on the
234  * specific bounds, but roughly 1 page directory per 1GB of virtual address space
235  */
236 #define NUM_PD	((PD_END - PD_START) / PD_AREA)
237 #else
238 /* 32-bit page tables just have one toplevel page directory */
239 #define NUM_PD	1
240 #endif
241 
242 #ifdef CONFIG_X86_64
243 /* Same semantics as above, but for the page directory pointer tables needed
244  * to cover the address space. On 32-bit there is just one 4-entry PDPT.
245  */
246 #define PDPT_START	((uintptr_t)ROUND_DOWN(VM_ADDR, PDPT_AREA))
247 #define PDPT_END	((uintptr_t)ROUND_UP(VM_ADDR + VM_SIZE, PDPT_AREA))
248 /* Number of PDPTs needed to cover the address space. 1 PDPT per 512GB of VM */
249 #define NUM_PDPT	((PDPT_END - PDPT_START) / PDPT_AREA)
250 
251 /* All pages needed for page tables, using computed values plus one more for
252  * the top-level PML4
253  */
254 #define NUM_TABLE_PAGES	(NUM_PT + NUM_PD + NUM_PDPT + 1)
255 #else /* !CONFIG_X86_64 */
256 /* All pages needed for page tables: leaf page tables plus page directories
 * (the PAE top-level PDPT is accounted for separately below)
 */
257 #define NUM_TABLE_PAGES	(NUM_PT + NUM_PD)
258 #endif /* CONFIG_X86_64 */
259 
260 #define INITIAL_PTABLE_PAGES \
261 	(NUM_TABLE_PAGES + CONFIG_X86_EXTRA_PAGE_TABLE_PAGES)
262 
263 #ifdef CONFIG_X86_PAE
264 /* The top-level PDPT is not counted above since it is smaller than a page;
 * reserve an extra 32 bytes (4 entries x 8 bytes) for it
 */
265 #define INITIAL_PTABLE_SIZE \
266 	((INITIAL_PTABLE_PAGES * CONFIG_MMU_PAGE_SIZE) + 0x20)
267 #else
268 #define INITIAL_PTABLE_SIZE \
269 	(INITIAL_PTABLE_PAGES * CONFIG_MMU_PAGE_SIZE)
270 #endif
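
/* Hypothetical sizing example (numbers are illustrative only): a 64-bit build
 * with a PT_AREA-aligned 2GB kernel VM region needs NUM_PT = 1024, NUM_PD = 2,
 * NUM_PDPT = 1 plus one PML4 page, so NUM_TABLE_PAGES = 1028 and, with no
 * extra pages configured, INITIAL_PTABLE_SIZE = 1028 * 4096 bytes.
 */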
271 
272 /* "dummy" pagetables for the first-phase build. The real page tables
273  * are produced by gen_mmu.py based on data read from zephyr_prebuilt.elf,
274  * and this dummy array is discarded.
275  */
276 Z_GENERIC_SECTION(.dummy_pagetables)
277 static __used char dummy_pagetables[INITIAL_PTABLE_SIZE];
278 
279 /*
280  * Utility functions
281  */
282 
283 /* For a table at a particular level, get the entry index that corresponds to
284  * the provided virtual address
285  */
286 __pinned_func
287 static inline int get_index(void *virt, int level)
288 {
289 	return (((uintptr_t)virt >> paging_levels[level].shift) %
290 		paging_levels[level].entries);
291 }
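
/* Worked example (64-bit, illustrative address): for virt == (void *)0x40201000,
 * get_index() returns (virt >> 39) % 512 == 0 at the PML4 level,
 * (virt >> 30) % 512 == 1 at the PDPT level, (virt >> 21) % 512 == 1 at the
 * PD level, and (virt >> 12) % 512 == 1 at the PT level.
 */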
292 
293 __pinned_func
294 static inline pentry_t *get_entry_ptr(pentry_t *ptables, void *virt, int level)
295 {
296 	return &ptables[get_index(virt, level)];
297 }
298 
299 __pinned_func
300 static inline pentry_t get_entry(pentry_t *ptables, void *virt, int level)
301 {
302 	return ptables[get_index(virt, level)];
303 }
304 
305 /* Get the physical memory address associated with this table entry */
306 __pinned_func
307 static inline uintptr_t get_entry_phys(pentry_t entry, int level)
308 {
309 	return entry & paging_levels[level].mask;
310 }
311 
312 /* Return the virtual address of a linked table stored in the provided entry */
313 __pinned_func
314 static inline pentry_t *next_table(pentry_t entry, int level)
315 {
316 	return z_mem_virt_addr(get_entry_phys(entry, level));
317 }
318 
319 /* Number of table entries at this level */
320 __pinned_func
321 static inline size_t get_num_entries(int level)
322 {
323 	return paging_levels[level].entries;
324 }
325 
326 /* 4K for everything except PAE PDPTs */
327 __pinned_func
328 static inline size_t table_size(int level)
329 {
330 	return get_num_entries(level) * sizeof(pentry_t);
331 }
332 
333 /* For a table at a particular level, size of the amount of virtual memory
334  * that an entry within the table covers
335  */
336 __pinned_func
337 static inline size_t get_entry_scope(int level)
338 {
339 	return (1UL << paging_levels[level].shift);
340 }
341 
342 /* For a table at a particular level, size of the amount of virtual memory
343  * that this entire table covers
344  */
345 __pinned_func
346 static inline size_t get_table_scope(int level)
347 {
348 	return get_entry_scope(level) * get_num_entries(level);
349 }
350 
351 /* Must have checked Present bit first! Non-present entries may have OS data
352  * stored in any other bits
353  */
354 __pinned_func
355 static inline bool is_leaf(int level, pentry_t entry)
356 {
357 	if (level == PTE_LEVEL) {
358 		/* Always true for PTE */
359 		return true;
360 	}
361 
362 	return ((entry & MMU_PS) != 0U);
363 }
364 
365 /* This does NOT (by design) un-flip KPTI PTEs, it's just the raw PTE value */
366 __pinned_func
367 static inline void pentry_get(int *paging_level, pentry_t *val,
368 			      pentry_t *ptables, void *virt)
369 {
370 	pentry_t *table = ptables;
371 
372 	for (int level = 0; level < NUM_LEVELS; level++) {
373 		pentry_t entry = get_entry(table, virt, level);
374 
375 		if ((entry & MMU_P) == 0 || is_leaf(level, entry)) {
376 			*val = entry;
377 			if (paging_level != NULL) {
378 				*paging_level = level;
379 			}
380 			break;
381 		} else {
382 			table = next_table(entry, level);
383 		}
384 	}
385 }
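
/* Minimal usage sketch (illustrative only): to check whether a virtual address
 * is currently mapped in a given set of page tables, one could do
 *
 *   pentry_t entry = 0;
 *   int level;
 *
 *   pentry_get(&level, &entry, ptables, virt);
 *   bool mapped = (entry & MMU_P) != 0;
 *
 * remembering that KPTI-flipped PTEs are returned raw by design.
 */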
386 
387 __pinned_func
388 static inline void tlb_flush_page(void *addr)
389 {
390 	/* Invalidate TLB entries corresponding to the page containing the
391 	 * specified address
392 	 */
393 	char *page = (char *)addr;
394 
395 	__asm__ ("invlpg %0" :: "m" (*page));
396 }
397 
398 #ifdef CONFIG_X86_KPTI
399 __pinned_func
400 static inline bool is_flipped_pte(pentry_t pte)
401 {
402 	return (pte & MMU_P) == 0 && (pte & PTE_ZERO) != 0;
403 }
404 #endif
405 
406 #if defined(CONFIG_SMP)
407 __pinned_func
408 void z_x86_tlb_ipi(const void *arg)
409 {
410 	uintptr_t ptables_phys;
411 
412 	ARG_UNUSED(arg);
413 
414 #ifdef CONFIG_X86_KPTI
415 	/* We're always on the kernel's set of page tables in this context
416 	 * if KPTI is turned on
417 	 */
418 	ptables_phys = z_x86_cr3_get();
419 	__ASSERT(ptables_phys == z_mem_phys_addr(&z_x86_kernel_ptables), "");
420 #else
421 	/* We might have been moved to another memory domain, so always invoke
422 	 * z_x86_thread_page_tables_get() instead of using current CR3 value.
423 	 */
424 	ptables_phys = z_mem_phys_addr(z_x86_thread_page_tables_get(_current));
425 #endif
426 	/*
427 	 * In the future, we can consider making this smarter, such as
428 	 * propagating which page tables were modified (in case they are
429 	 * not active on this CPU) or an address range to call
430 	 * tlb_flush_page() on.
431 	 */
432 	LOG_DBG("%s on CPU %d\n", __func__, arch_curr_cpu()->id);
433 
434 	z_x86_cr3_set(ptables_phys);
435 }
436 
437 /* NOTE: This is not synchronous and the actual flush takes place some short
438  * time after this exits.
439  */
440 __pinned_func
441 static inline void tlb_shootdown(void)
442 {
443 	z_loapic_ipi(0, LOAPIC_ICR_IPI_OTHERS, CONFIG_TLB_IPI_VECTOR);
444 }
445 #endif /* CONFIG_SMP */
446 
447 __pinned_func
448 static inline void assert_addr_aligned(uintptr_t addr)
449 {
450 #if __ASSERT_ON
451 	__ASSERT((addr & (CONFIG_MMU_PAGE_SIZE - 1)) == 0U,
452 		 "unaligned address 0x%" PRIxPTR, addr);
453 #endif
454 }
455 
456 __pinned_func
457 static inline bool is_addr_aligned(uintptr_t addr)
458 {
459 	if ((addr & (CONFIG_MMU_PAGE_SIZE - 1)) == 0U) {
460 		return true;
461 	} else {
462 		return false;
463 	}
464 }
465 
466 __pinned_func
467 static inline void assert_virt_addr_aligned(void *addr)
468 {
469 	assert_addr_aligned((uintptr_t)addr);
470 }
471 
472 __pinned_func
473 static inline bool is_virt_addr_aligned(void *addr)
474 {
475 	return is_addr_aligned((uintptr_t)addr);
476 }
477 
478 __pinned_func
479 static inline void assert_size_aligned(size_t size)
480 {
481 #if __ASSERT_ON
482 	__ASSERT((size & (CONFIG_MMU_PAGE_SIZE - 1)) == 0U,
483 		 "unaligned size %zu", size);
484 #endif
485 }
486 
487 __pinned_func
488 static inline bool is_size_aligned(size_t size)
489 {
490 	if ((size & (CONFIG_MMU_PAGE_SIZE - 1)) == 0U) {
491 		return true;
492 	} else {
493 		return false;
494 	}
495 }
496 
497 __pinned_func
498 static inline void assert_region_page_aligned(void *addr, size_t size)
499 {
500 	assert_virt_addr_aligned(addr);
501 	assert_size_aligned(size);
502 }
503 
504 __pinned_func
505 static inline bool is_region_page_aligned(void *addr, size_t size)
506 {
507 	if (!is_virt_addr_aligned(addr)) {
508 		return false;
509 	}
510 
511 	return is_size_aligned(size);
512 }
513 
514 /*
515  * Debug functions. All conditionally compiled with CONFIG_EXCEPTION_DEBUG.
516  */
517 #ifdef CONFIG_EXCEPTION_DEBUG
518 
519 /* Add colors to page table dumps to indicate mapping type */
520 #define COLOR_PAGE_TABLES	1
521 
522 #if COLOR_PAGE_TABLES
523 #define ANSI_DEFAULT "\x1B" "[0m"
524 #define ANSI_RED     "\x1B" "[1;31m"
525 #define ANSI_GREEN   "\x1B" "[1;32m"
526 #define ANSI_YELLOW  "\x1B" "[1;33m"
527 #define ANSI_BLUE    "\x1B" "[1;34m"
528 #define ANSI_MAGENTA "\x1B" "[1;35m"
529 #define ANSI_CYAN    "\x1B" "[1;36m"
530 #define ANSI_GREY    "\x1B" "[1;90m"
531 
532 #define COLOR(x)	printk(_CONCAT(ANSI_, x))
533 #else
534 #define COLOR(x)	do { } while (false)
535 #endif
536 
537 __pinned_func
538 static char get_entry_code(pentry_t value)
539 {
540 	char ret;
541 
542 	if (value == 0U) {
543 		/* Unmapped entry */
544 		ret = '.';
545 	} else {
546 		if ((value & MMU_RW) != 0U) {
547 			/* Writable page */
548 			if ((value & MMU_XD) != 0U) {
549 				/* RW */
550 				ret = 'w';
551 			} else {
552 				/* RWX */
553 				ret = 'a';
554 			}
555 		} else {
556 			if ((value & MMU_XD) != 0U) {
557 				/* R */
558 				ret = 'r';
559 			} else {
560 				/* RX */
561 				ret = 'x';
562 			}
563 		}
564 
565 		if ((value & MMU_US) != 0U) {
566 			/* Uppercase indicates user mode access */
567 			ret = toupper((unsigned char)ret);
568 		}
569 	}
570 
571 	return ret;
572 }
573 
574 __pinned_func
575 static void print_entries(pentry_t entries_array[], uint8_t *base, int level,
576 			  size_t count)
577 {
578 	int column = 0;
579 
580 	for (int i = 0; i < count; i++) {
581 		pentry_t entry = entries_array[i];
582 
583 		uintptr_t phys = get_entry_phys(entry, level);
584 		uintptr_t virt =
585 			(uintptr_t)base + (get_entry_scope(level) * i);
586 
587 		if ((entry & MMU_P) != 0U) {
588 			if (is_leaf(level, entry)) {
589 				if (phys == virt) {
590 					/* Identity mappings */
591 					COLOR(YELLOW);
592 				} else if (phys + Z_MEM_VM_OFFSET == virt) {
593 					/* Permanent RAM mappings */
594 					COLOR(GREEN);
595 				} else {
596 					/* General mapped pages */
597 					COLOR(CYAN);
598 				}
599 			} else {
600 				/* Intermediate entry */
601 				COLOR(MAGENTA);
602 			}
603 		} else {
604 			if (is_leaf(level, entry)) {
605 				if (entry == 0U) {
606 					/* Unmapped */
607 					COLOR(GREY);
608 #ifdef CONFIG_X86_KPTI
609 				} else if (is_flipped_pte(entry)) {
610 					/* KPTI, un-flip it */
611 					COLOR(BLUE);
612 					entry = ~entry;
613 					phys = get_entry_phys(entry, level);
614 					if (phys == virt) {
615 						/* Identity mapped */
616 						COLOR(CYAN);
617 					} else {
618 						/* Non-identity mapped */
619 						COLOR(BLUE);
620 					}
621 #endif
622 				} else {
623 					/* Paged out */
624 					COLOR(RED);
625 				}
626 			} else {
627 				/* Un-mapped intermediate entry */
628 				COLOR(GREY);
629 			}
630 		}
631 
632 		printk("%c", get_entry_code(entry));
633 
634 		column++;
635 		if (column == 64) {
636 			column = 0;
637 			printk("\n");
638 		}
639 	}
640 	COLOR(DEFAULT);
641 
642 	if (column != 0) {
643 		printk("\n");
644 	}
645 }
646 
647 __pinned_func
648 static void dump_ptables(pentry_t *table, uint8_t *base, int level)
649 {
650 	const struct paging_level *info = &paging_levels[level];
651 
652 #ifdef CONFIG_X86_64
653 	/* Account for the virtual memory "hole" with sign-extension */
654 	if (((uintptr_t)base & BITL(47)) != 0) {
655 		base = (uint8_t *)((uintptr_t)base | (0xFFFFULL << 48));
656 	}
657 #endif
658 
659 	printk("%s at %p (0x%" PRIxPTR "): ", info->name, table,
660 	       z_mem_phys_addr(table));
661 	if (level == 0) {
662 		printk("entire address space\n");
663 	} else {
664 		printk("for %p - %p\n", base,
665 		       base + get_table_scope(level) - 1);
666 	}
667 
668 	print_entries(table, base, level, info->entries);
669 
670 	/* Check if we're a page table */
671 	if (level == PTE_LEVEL) {
672 		return;
673 	}
674 
675 	/* Dump all linked child tables */
676 	for (int j = 0; j < info->entries; j++) {
677 		pentry_t entry = table[j];
678 		pentry_t *next;
679 
680 		if ((entry & MMU_P) == 0U ||
681 			(entry & MMU_PS) != 0U) {
682 			/* Not present or big page, skip */
683 			continue;
684 		}
685 
686 		next = next_table(entry, level);
687 		dump_ptables(next, base + (j * get_entry_scope(level)),
688 			     level + 1);
689 	}
690 }
691 
692 __pinned_func
693 void z_x86_dump_page_tables(pentry_t *ptables)
694 {
695 	dump_ptables(ptables, NULL, 0);
696 }
697 
698 /* Enable to dump out the kernel's page table right before main() starts,
699  * sometimes useful for deep debugging. May overwhelm twister.
700  */
701 #define DUMP_PAGE_TABLES 0
702 
703 #if DUMP_PAGE_TABLES
704 __pinned_func
705 static int dump_kernel_tables(void)
706 {
707 	z_x86_dump_page_tables(z_x86_kernel_ptables);
708 
709 	return 0;
710 }
711 
712 SYS_INIT(dump_kernel_tables, APPLICATION, CONFIG_KERNEL_INIT_PRIORITY_DEFAULT);
713 #endif
714 
715 __pinned_func
716 static void str_append(char **buf, size_t *size, const char *str)
717 {
718 	int ret = snprintk(*buf, *size, "%s", str);
719 
720 	if (ret >= *size) {
721 		/* Truncated */
722 		*size = 0U;
723 	} else {
724 		*size -= ret;
725 		*buf += ret;
726 	}
727 
728 }
729 
730 __pinned_func
731 static void dump_entry(int level, void *virt, pentry_t entry)
732 {
733 	const struct paging_level *info = &paging_levels[level];
734 	char buf[24] = { 0 };
735 	char *pos = buf;
736 	size_t sz = sizeof(buf);
737 	uint8_t *virtmap = (uint8_t *)ROUND_DOWN(virt, get_entry_scope(level));
738 
739 	#define DUMP_BIT(bit) do { \
740 			if ((entry & MMU_##bit) != 0U) { \
741 				str_append(&pos, &sz, #bit " "); \
742 			} \
743 		} while (false)
744 
745 	DUMP_BIT(RW);
746 	DUMP_BIT(US);
747 	DUMP_BIT(PWT);
748 	DUMP_BIT(PCD);
749 	DUMP_BIT(A);
750 	DUMP_BIT(D);
751 	DUMP_BIT(G);
752 	DUMP_BIT(XD);
753 
754 	LOG_ERR("%sE: %p -> " PRI_ENTRY ": %s", info->name,
755 		virtmap, entry & info->mask, buf);
756 
757 	#undef DUMP_BIT
758 }
759 
760 __pinned_func
761 void z_x86_pentry_get(int *paging_level, pentry_t *val, pentry_t *ptables,
762 		      void *virt)
763 {
764 	pentry_get(paging_level, val, ptables, virt);
765 }
766 
767 /*
768  * Debug function for dumping out MMU table information to the LOG for a
769  * specific virtual address, such as when we get an unexpected page fault.
770  */
771 __pinned_func
772 void z_x86_dump_mmu_flags(pentry_t *ptables, void *virt)
773 {
774 	pentry_t entry = 0;
775 	int level = 0;
776 
777 	pentry_get(&level, &entry, ptables, virt);
778 
779 	if ((entry & MMU_P) == 0) {
780 		LOG_ERR("%sE: not present", paging_levels[level].name);
781 	} else {
782 		dump_entry(level, virt, entry);
783 	}
784 }
785 #endif /* CONFIG_EXCEPTION_DEBUG */
786 
787 /* Reset permissions on a PTE to original state when the mapping was made */
788 __pinned_func
789 static inline pentry_t reset_pte(pentry_t old_val)
790 {
791 	pentry_t new_val;
792 
793 	/* Clear any existing state in permission bits */
794 	new_val = old_val & (~K_MEM_PARTITION_PERM_MASK);
795 
796 	/* Now set permissions based on the stashed original values */
797 	if ((old_val & MMU_RW_ORIG) != 0) {
798 		new_val |= MMU_RW;
799 	}
800 	if ((old_val & MMU_US_ORIG) != 0) {
801 		new_val |= MMU_US;
802 	}
803 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
804 	if ((old_val & MMU_XD_ORIG) != 0) {
805 		new_val |= MMU_XD;
806 	}
807 #endif
808 	return new_val;
809 }
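
/* Example of the reset path (illustrative): a page originally mapped writable
 * carries MMU_RW | MMU_RW_ORIG. If a memory partition later clears MMU_RW to
 * make it read-only, reset_pte() still sees MMU_RW_ORIG in the ignored bits
 * and restores MMU_RW, returning the PTE to its state at mapping time.
 */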
810 
811 /* Wrapper functions for some gross stuff we have to do for Kernel
812  * page table isolation. If these are User mode page tables, the user bit
813  * isn't set, and this is not the shared page, all the bits in the PTE
814  * are flipped. This serves three purposes:
815  *  - The page isn't present, implementing page table isolation
816  *  - Flipping the physical address bits cheaply mitigates L1TF
817  *  - State is preserved; to get original PTE, just complement again
818  */
819 __pinned_func
820 static inline pentry_t pte_finalize_value(pentry_t val, bool user_table,
821 					  int level)
822 {
823 #ifdef CONFIG_X86_KPTI
824 	static const uintptr_t shared_phys_addr =
825 		Z_MEM_PHYS_ADDR(POINTER_TO_UINT(&z_shared_kernel_page_start));
826 
827 	if (user_table && (val & MMU_US) == 0 && (val & MMU_P) != 0 &&
828 	    get_entry_phys(val, level) != shared_phys_addr) {
829 		val = ~val;
830 	}
831 #endif
832 	return val;
833 }
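
/* Illustrative KPTI case: a present, kernel-only PTE (MMU_US clear) installed
 * into a user thread's page tables is stored as its bitwise complement. The
 * CPU then sees a non-present entry, the physical address bits are scrambled
 * (cheap L1TF mitigation), and is_flipped_pte() detects such entries so the
 * original value can be recovered by complementing again.
 */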
834 
835 /* Atomic functions for modifying PTEs. These don't map nicely to Zephyr's
836  * atomic API since the only types supported are 'int' and 'void *' and
837  * the size of pentry_t depends on other factors like PAE.
838  */
839 #ifndef CONFIG_X86_PAE
840 /* Non-PAE, pentry_t is same size as void ptr so use atomic_ptr_* APIs */
841 __pinned_func
842 static inline pentry_t atomic_pte_get(const pentry_t *target)
843 {
844 	return (pentry_t)atomic_ptr_get((atomic_ptr_t *)target);
845 }
846 
847 __pinned_func
848 static inline bool atomic_pte_cas(pentry_t *target, pentry_t old_value,
849 				  pentry_t new_value)
850 {
851 	return atomic_ptr_cas((atomic_ptr_t *)target, (void *)old_value,
852 			      (void *)new_value);
853 }
854 #else
855 /* Atomic builtins for 64-bit values on 32-bit x86 require floating point.
856  * Don't do this, just lock local interrupts. Needless to say, this
857  * isn't workable if someone ever adds SMP to the 32-bit x86 port.
858  */
859 BUILD_ASSERT(!IS_ENABLED(CONFIG_SMP));
860 
861 __pinned_func
862 static inline pentry_t atomic_pte_get(const pentry_t *target)
863 {
864 	return *target;
865 }
866 
867 __pinned_func
868 static inline bool atomic_pte_cas(pentry_t *target, pentry_t old_value,
869 				  pentry_t new_value)
870 {
871 	bool ret = false;
872 	int key = arch_irq_lock();
873 
874 	if (*target == old_value) {
875 		*target = new_value;
876 		ret = true;
877 	}
878 	arch_irq_unlock(key);
879 
880 	return ret;
881 }
882 #endif /* CONFIG_X86_PAE */
883 
884 /* Indicates that the target page tables will be used by user mode threads.
885  * This only has implications for CONFIG_X86_KPTI where user thread facing
886  * page tables need nearly all pages that don't have the US bit to also
887  * not be Present.
888  */
889 #define OPTION_USER		BIT(0)
890 
891 /* Indicates that the operation requires TLBs to be flushed as we are altering
892  * existing mappings. Not needed for establishing new mappings
893  */
894 #define OPTION_FLUSH		BIT(1)
895 
896 /* Indicates that each PTE's permission bits should be restored to their
897  * original state when the memory was mapped. All other bits in the PTE are
898  * preserved.
899  */
900 #define OPTION_RESET		BIT(2)
901 
902 /* Indicates that the mapping will need to be cleared entirely. This is
903  * mainly used for unmapping the memory region.
904  */
905 #define OPTION_CLEAR		BIT(3)
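
/* These flags are OR-ed together as needed; for instance, unmapping a region
 * below uses OPTION_FLUSH | OPTION_CLEAR (wipe the PTEs and invalidate stale
 * TLB entries), while un-applying a memory partition uses
 * OPTION_FLUSH | OPTION_RESET.
 */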
906 
907 /**
908  * Atomically update bits in a page table entry
909  *
910  * This is atomic with respect to modifications by other CPUs or preempted
911  * contexts, which can be very important when making decisions based on
912  * the PTE's prior "dirty" state.
913  *
914  * @param pte Pointer to page table entry to update
915  * @param update_val Updated bits to set/clear in PTE. Ignored with
916  *        OPTION_RESET or OPTION_CLEAR.
917  * @param update_mask Which bits to modify in the PTE. Ignored with
918  *        OPTION_RESET or OPTION_CLEAR.
919  * @param options Control flags
920  * @retval Old PTE value
921  */
922 __pinned_func
923 static inline pentry_t pte_atomic_update(pentry_t *pte, pentry_t update_val,
924 					 pentry_t update_mask,
925 					 uint32_t options)
926 {
927 	bool user_table = (options & OPTION_USER) != 0U;
928 	bool reset = (options & OPTION_RESET) != 0U;
929 	bool clear = (options & OPTION_CLEAR) != 0U;
930 	pentry_t old_val, new_val;
931 
932 	do {
933 		old_val = atomic_pte_get(pte);
934 
935 		new_val = old_val;
936 #ifdef CONFIG_X86_KPTI
937 		if (is_flipped_pte(new_val)) {
938 			/* Page was flipped for KPTI. Un-flip it */
939 			new_val = ~new_val;
940 		}
941 #endif /* CONFIG_X86_KPTI */
942 
943 		if (reset) {
944 			new_val = reset_pte(new_val);
945 		} else if (clear) {
946 			new_val = 0;
947 		} else {
948 			new_val = ((new_val & ~update_mask) |
949 				   (update_val & update_mask));
950 		}
951 
952 		new_val = pte_finalize_value(new_val, user_table, PTE_LEVEL);
953 	} while (atomic_pte_cas(pte, old_val, new_val) == false);
954 
955 #ifdef CONFIG_X86_KPTI
956 	if (is_flipped_pte(old_val)) {
957 		/* Page was flipped for KPTI. Un-flip it */
958 		old_val = ~old_val;
959 	}
960 #endif /* CONFIG_X86_KPTI */
961 
962 	return old_val;
963 }
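
/* Note: the CAS loop above matters because the CPU may set the Accessed or
 * Dirty bits in this PTE concurrently with the update (as may another CPU
 * under SMP); a plain read-modify-write could silently drop that state, which
 * demand paging decisions rely on.
 */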
964 
965 /**
966  * Low level page table update function for a virtual page
967  *
968  * For the provided set of page tables, update the PTE associated with the
969  * virtual address to a new value, using the mask to control what bits
970  * need to be preserved.
971  *
972  * It is permitted to set up mappings without the Present bit set, in which
973  * case all other bits may be used for OS accounting.
974  *
975  * This function is atomic with respect to the page table entries being
976  * modified by another CPU, using atomic operations to update the requested
977  * bits and return the previous PTE value.
978  *
979  * Common mask values:
980  *  MASK_ALL  - Update all PTE bits. Existing state totally discarded.
981  *  MASK_PERM - Only update permission bits. All other bits and physical
982  *              mapping preserved.
983  *
984  * @param ptables Page tables to modify
985  * @param virt Virtual page table entry to update
986  * @param entry_val Value to update in the PTE (ignored if OPTION_RESET or
987  *        OPTION_CLEAR)
988  * @param [out] old_val_ptr Filled in with previous PTE value. May be NULL.
989  * @param mask What bits to update in the PTE (ignored if OPTION_RESET or
990  *        OPTION_CLEAR)
991  * @param options Control options, described above
992  *
993  * @retval 0 if successful
994  * @retval -EFAULT if large page encountered or missing page table level
995  */
996 __pinned_func
997 static int page_map_set(pentry_t *ptables, void *virt, pentry_t entry_val,
998 			pentry_t *old_val_ptr, pentry_t mask, uint32_t options)
999 {
1000 	pentry_t *table = ptables;
1001 	bool flush = (options & OPTION_FLUSH) != 0U;
1002 	int ret = 0;
1003 
1004 	for (int level = 0; level < NUM_LEVELS; level++) {
1005 		int index;
1006 		pentry_t *entryp;
1007 
1008 		index = get_index(virt, level);
1009 		entryp = &table[index];
1010 
1011 		/* Check if we're a PTE */
1012 		if (level == PTE_LEVEL) {
1013 			pentry_t old_val = pte_atomic_update(entryp, entry_val,
1014 							     mask, options);
1015 			if (old_val_ptr != NULL) {
1016 				*old_val_ptr = old_val;
1017 			}
1018 			break;
1019 		}
1020 
1021 		/* We bail out early here due to no support for
1022 		 * splitting existing bigpage mappings.
1023 		 * If the PS bit is not supported at some level (like
1024 		 * in a PML4 entry) it is always reserved and must be 0
1025 		 */
1026 		CHECKIF(!((*entryp & MMU_PS) == 0U)) {
1027 			/* Cannot continue since we cannot split
1028 			 * bigpage mappings.
1029 			 */
1030 			LOG_ERR("large page encountered");
1031 			ret = -EFAULT;
1032 			goto out;
1033 		}
1034 
1035 		table = next_table(*entryp, level);
1036 
1037 		CHECKIF(!(table != NULL)) {
1038 			/* Cannot continue since table is NULL,
1039 			 * and it cannot be dereferenced in next loop
1040 			 * iteration.
1041 			 */
1042 			LOG_ERR("missing page table level %d when trying to map %p",
1043 				level + 1, virt);
1044 			ret = -EFAULT;
1045 			goto out;
1046 		}
1047 	}
1048 
1049 out:
1050 	if (flush) {
1051 		tlb_flush_page(virt);
1052 	}
1053 
1054 	return ret;
1055 }
1056 
1057 /**
1058  * Map a physical region in a specific set of page tables.
1059  *
1060  * See documentation for page_map_set() for additional notes about masks and
1061  * supported options.
1062  *
1063  * It is vital to remember that all virtual-to-physical mappings must be
1064  * the same with respect to supervisor mode regardless of what thread is
1065  * scheduled (and therefore, if multiple sets of page tables exist, which one
1066  * is active).
1067  *
1068  * It is permitted to set up mappings without the Present bit set.
1069  *
1070  * @param ptables Page tables to modify
1071  * @param virt Base page-aligned virtual memory address to map the region.
1072  * @param phys Base page-aligned physical memory address for the region.
1073  *        Ignored if OPTION_RESET or OPTION_CLEAR. Also affected by the mask
1074  *        parameter. This address is not directly examined, it will simply be
1075  *        programmed into the PTE.
1076  * @param size Size of the physical region to map
1077  * @param entry_flags Non-address bits to set in every PTE. Ignored if
1078  *        OPTION_RESET. Also affected by the mask parameter.
1079  * @param mask What bits to update in each PTE. Un-set bits will never be
1080  *        modified. Ignored if OPTION_RESET or OPTION_CLEAR.
1081  * @param options Control options, described above
1082  *
1083  * @retval 0 if successful
1084  * @retval -EINVAL if invalid parameters are supplied
1085  * @retval -EFAULT if errors encountered when updating page tables
1086  */
1087 __pinned_func
1088 static int range_map_ptables(pentry_t *ptables, void *virt, uintptr_t phys,
1089 			     size_t size, pentry_t entry_flags, pentry_t mask,
1090 			     uint32_t options)
1091 {
1092 	bool zero_entry = (options & (OPTION_RESET | OPTION_CLEAR)) != 0U;
1093 	int ret = 0, ret2;
1094 
1095 	CHECKIF(!is_addr_aligned(phys) || !is_size_aligned(size)) {
1096 		ret = -EINVAL;
1097 		goto out;
1098 	}
1099 
1100 	CHECKIF(!((entry_flags & paging_levels[0].mask) == 0U)) {
1101 		LOG_ERR("entry_flags " PRI_ENTRY " overlaps address area",
1102 			entry_flags);
1103 		ret = -EINVAL;
1104 		goto out;
1105 	}
1106 
1107 	/* This implementation is stack-efficient but not particularly fast.
1108 	 * We do a full page table walk for every page we are updating.
1109 	 * Recursive approaches are possible, but use much more stack space.
1110 	 */
1111 	for (size_t offset = 0; offset < size; offset += CONFIG_MMU_PAGE_SIZE) {
1112 		uint8_t *dest_virt = (uint8_t *)virt + offset;
1113 		pentry_t entry_val;
1114 
1115 		if (zero_entry) {
1116 			entry_val = 0;
1117 		} else {
1118 			entry_val = (pentry_t)(phys + offset) | entry_flags;
1119 		}
1120 
1121 		ret2 = page_map_set(ptables, dest_virt, entry_val, NULL, mask,
1122 				   options);
1123 		ARG_UNUSED(ret2);
1124 		CHECKIF(ret2 != 0) {
1125 			ret = ret2;
1126 		}
1127 	}
1128 
1129 out:
1130 	return ret;
1131 }
1132 
1133 /**
1134  * Establish or update a memory mapping for all page tables
1135  *
1136  * The physical region noted from phys to phys + size will be mapped to
1137  * an equal sized virtual region starting at virt, with the provided flags.
1138  * The mask value denotes what bits in PTEs will actually be modified.
1139  *
1140  * See range_map_ptables() for additional details.
1141  *
1142  * @param virt Page-aligned starting virtual address
1143  * @param phys Page-aligned starting physical address. Ignored if the mask
1144  *             parameter does not enable address bits or OPTION_RESET used.
1145  *             This region is not directly examined, it will simply be
1146  *             programmed into the page tables.
1147  * @param size Size of the physical region to map
1148  * @param entry_flags Desired state of non-address PTE bits covered by mask,
1149  *                    ignored if OPTION_RESET
1150  * @param mask What bits in the PTE to actually modify; unset bits will
1151  *             be preserved. Ignored if OPTION_RESET.
1152  * @param options Control options. Do not set OPTION_USER here. OPTION_FLUSH
1153  *                will trigger a TLB shootdown after all tables are updated.
1154  *
1155  * @retval 0 if successful
1156  * @retval -EINVAL if invalid parameters are supplied
1157  * @retval -EFAULT if errors encountered when updating page tables
1158  */
1159 __pinned_func
1160 static int range_map(void *virt, uintptr_t phys, size_t size,
1161 		     pentry_t entry_flags, pentry_t mask, uint32_t options)
1162 {
1163 	int ret = 0, ret2;
1164 
1165 	LOG_DBG("%s: %p -> %p (%zu) flags " PRI_ENTRY " mask "
1166 		PRI_ENTRY " opt 0x%x", __func__, (void *)phys, virt, size,
1167 		entry_flags, mask, options);
1168 
1169 #ifdef CONFIG_X86_64
1170 	/* There's a gap in the "64-bit" address space, as 4-level paging
1171 	 * requires bits 48 to 63 to be copies of bit 47. Test this
1172 	 * by treating as a signed value and shifting.
1173 	 */
1174 	__ASSERT(((((intptr_t)virt) << 16) >> 16) == (intptr_t)virt,
1175 		 "non-canonical virtual address mapping %p (size %zu)",
1176 		 virt, size);
1177 #endif /* CONFIG_X86_64 */
1178 
1179 	CHECKIF(!((options & OPTION_USER) == 0U)) {
1180 		LOG_ERR("invalid option for mapping");
1181 		ret = -EINVAL;
1182 		goto out;
1183 	}
1184 
1185 	/* All virtual-to-physical mappings are the same in all page tables.
1186 	 * What can differ is only access permissions, defined by the memory
1187 	 * domain associated with the page tables, and the threads that are
1188 	 * members of that domain.
1189 	 *
1190 	 * Any new mappings need to be applied to all page tables.
1191 	 */
1192 #if defined(CONFIG_USERSPACE) && !defined(CONFIG_X86_COMMON_PAGE_TABLE)
1193 	sys_snode_t *node;
1194 
1195 	SYS_SLIST_FOR_EACH_NODE(&x86_domain_list, node) {
1196 		struct arch_mem_domain *domain =
1197 			CONTAINER_OF(node, struct arch_mem_domain, node);
1198 
1199 		ret2 = range_map_ptables(domain->ptables, virt, phys, size,
1200 					 entry_flags, mask,
1201 					 options | OPTION_USER);
1202 		ARG_UNUSED(ret2);
1203 		CHECKIF(ret2 != 0) {
1204 			ret = ret2;
1205 		}
1206 	}
1207 #endif /* CONFIG_USERSPACE */
1208 
1209 	ret2 = range_map_ptables(z_x86_kernel_ptables, virt, phys, size,
1210 				 entry_flags, mask, options);
1211 	ARG_UNUSED(ret2);
1212 	CHECKIF(ret2 != 0) {
1213 		ret = ret2;
1214 	}
1215 
1216 out:
1217 #ifdef CONFIG_SMP
1218 	if ((options & OPTION_FLUSH) != 0U) {
1219 		tlb_shootdown();
1220 	}
1221 #endif /* CONFIG_SMP */
1222 
1223 	return ret;
1224 }
1225 
1226 __pinned_func
1227 static inline int range_map_unlocked(void *virt, uintptr_t phys, size_t size,
1228 				     pentry_t entry_flags, pentry_t mask,
1229 				     uint32_t options)
1230 {
1231 	k_spinlock_key_t key;
1232 	int ret;
1233 
1234 	key = k_spin_lock(&x86_mmu_lock);
1235 	ret = range_map(virt, phys, size, entry_flags, mask, options);
1236 	k_spin_unlock(&x86_mmu_lock, key);
1237 
1238 	return ret;
1239 }
1240 
1241 __pinned_func
1242 static pentry_t flags_to_entry(uint32_t flags)
1243 {
1244 	pentry_t entry_flags = MMU_P;
1245 
1246 	/* Translate flags argument into HW-recognized entry flags.
1247 	 *
1248 	 * Support for PAT is not implemented yet. Many systems may have
1249 	 * BIOS-populated MTRR values such that these cache settings are
1250 	 * redundant.
1251 	 */
1252 	switch (flags & K_MEM_CACHE_MASK) {
1253 	case K_MEM_CACHE_NONE:
1254 		entry_flags |= MMU_PCD;
1255 		break;
1256 	case K_MEM_CACHE_WT:
1257 		entry_flags |= MMU_PWT;
1258 		break;
1259 	case K_MEM_CACHE_WB:
1260 		break;
1261 	default:
1262 		__ASSERT(false, "bad memory mapping flags 0x%x", flags);
1263 	}
1264 
1265 	if ((flags & K_MEM_PERM_RW) != 0U) {
1266 		entry_flags |= ENTRY_RW;
1267 	}
1268 
1269 	if ((flags & K_MEM_PERM_USER) != 0U) {
1270 		entry_flags |= ENTRY_US;
1271 	}
1272 
1273 	if ((flags & K_MEM_PERM_EXEC) == 0U) {
1274 		entry_flags |= ENTRY_XD;
1275 	}
1276 
1277 	return entry_flags;
1278 }
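
/* Worked example (illustrative): flags == (K_MEM_PERM_RW | K_MEM_CACHE_WB)
 * yields MMU_P | ENTRY_RW | ENTRY_XD -- present, writable, supervisor-only
 * and execute-disabled, with default write-back caching.
 */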
1279 
1280 /* map new region virt..virt+size to phys with provided arch-neutral flags */
1281 __pinned_func
1282 void arch_mem_map(void *virt, uintptr_t phys, size_t size, uint32_t flags)
1283 {
1284 	int ret;
1285 
1286 	ret = range_map_unlocked(virt, phys, size, flags_to_entry(flags),
1287 				 MASK_ALL, 0);
1288 	__ASSERT_NO_MSG(ret == 0);
1289 	ARG_UNUSED(ret);
1290 }
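
/* Illustrative caller (hypothetical addresses): mapping one page of device
 * MMIO uncached and writable might look like
 *
 *   arch_mem_map(virt_addr, 0xFEC00000UL, CONFIG_MMU_PAGE_SIZE,
 *                K_MEM_PERM_RW | K_MEM_CACHE_NONE);
 *
 * where virt_addr and the physical address are both page-aligned. The mapping
 * is applied to every set of page tables so the supervisor-mode view stays
 * consistent.
 */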
1291 
1292 /* unmap region addr..addr+size, reset entries and flush TLB */
1293 void arch_mem_unmap(void *addr, size_t size)
1294 {
1295 	int ret;
1296 
1297 	ret = range_map_unlocked((void *)addr, 0, size, 0, 0,
1298 				 OPTION_FLUSH | OPTION_CLEAR);
1299 	__ASSERT_NO_MSG(ret == 0);
1300 	ARG_UNUSED(ret);
1301 }
1302 
1303 #ifdef Z_VM_KERNEL
1304 __boot_func
1305 static void identity_map_remove(uint32_t level)
1306 {
1307 	size_t size, scope = get_entry_scope(level);
1308 	pentry_t *table;
1309 	uint32_t cur_level;
1310 	uint8_t *pos;
1311 	pentry_t entry;
1312 	pentry_t *entry_ptr;
1313 
1314 	k_mem_region_align((uintptr_t *)&pos, &size,
1315 			   (uintptr_t)CONFIG_SRAM_BASE_ADDRESS,
1316 			   (size_t)CONFIG_SRAM_SIZE * 1024U, scope);
1317 
1318 	while (size != 0U) {
1319 		/* Need to get to the correct table */
1320 		table = z_x86_kernel_ptables;
1321 		for (cur_level = 0; cur_level < level; cur_level++) {
1322 			entry = get_entry(table, pos, cur_level);
1323 			table = next_table(entry, level);
1324 		}
1325 
1326 		entry_ptr = get_entry_ptr(table, pos, level);
1327 
1328 		/* set_pte */
1329 		*entry_ptr = 0;
1330 		pos += scope;
1331 		size -= scope;
1332 	}
1333 }
1334 #endif
1335 
1336 /* Invoked to remove the identity mappings in the page tables;
1337  * they were only needed to transition the instruction pointer during early boot
1338  */
1339 __boot_func
1340 void z_x86_mmu_init(void)
1341 {
1342 #ifdef Z_VM_KERNEL
1343 	/* We booted with physical address space being identity mapped.
1344 	 * As we are now executing in virtual address space,
1345 	 * the identity map is no longer needed. So remove them.
1346 	 *
1347 	 * Without PAE, we only need to remove the entries at the PD level.
1348 	 * With PAE, we also need to remove the entry at the PDPT level.
1349 	 */
1350 	identity_map_remove(PDE_LEVEL);
1351 
1352 #ifdef CONFIG_X86_PAE
1353 	identity_map_remove(0);
1354 #endif
1355 #endif
1356 }
1357 
1358 #if CONFIG_X86_STACK_PROTECTION
1359 __pinned_func
1360 void z_x86_set_stack_guard(k_thread_stack_t *stack)
1361 {
1362 	int ret;
1363 
1364 	/* Applied to all page tables as this affects supervisor mode.
1365 	 * XXX: This never gets reset when the thread exits, which can
1366 	 * cause problems if the memory is later used for something else.
1367 	 * See #29499
1368 	 *
1369 	 * Guard page is always the first page of the stack object for both
1370 	 * kernel and thread stacks.
1371 	 */
1372 	ret = range_map_unlocked(stack, 0, CONFIG_MMU_PAGE_SIZE,
1373 				 MMU_P | ENTRY_XD, MASK_PERM, OPTION_FLUSH);
1374 	__ASSERT_NO_MSG(ret == 0);
1375 	ARG_UNUSED(ret);
1376 }
1377 #endif /* CONFIG_X86_STACK_PROTECTION */
1378 
1379 #ifdef CONFIG_USERSPACE
1380 __pinned_func
1381 static bool page_validate(pentry_t *ptables, uint8_t *addr, bool write)
1382 {
1383 	pentry_t *table = (pentry_t *)ptables;
1384 
1385 	for (int level = 0; level < NUM_LEVELS; level++) {
1386 		pentry_t entry = get_entry(table, addr, level);
1387 
1388 		if (is_leaf(level, entry)) {
1389 #ifdef CONFIG_X86_KPTI
1390 			if (is_flipped_pte(entry)) {
1391 				/* We flipped this to prevent user access
1392 				 * since just clearing US isn't sufficient
1393 				 */
1394 				return false;
1395 			}
1396 #endif
1397 			/* US and RW bits still carry meaning if non-present.
1398 			 * If the data page is paged out, access bits are
1399 			 * preserved. If un-mapped, the whole entry is 0.
1400 			 */
1401 			if (((entry & MMU_US) == 0U) ||
1402 			    (write && ((entry & MMU_RW) == 0U))) {
1403 				return false;
1404 			}
1405 		} else {
1406 			if ((entry & MMU_P) == 0U) {
1407 				/* Missing intermediate table, address is
1408 				 * un-mapped
1409 				 */
1410 				return false;
1411 			}
1412 			table = next_table(entry, level);
1413 		}
1414 	}
1415 
1416 	return true;
1417 }
1418 
1419 __pinned_func
1420 static inline void bcb_fence(void)
1421 {
1422 #ifdef CONFIG_X86_BOUNDS_CHECK_BYPASS_MITIGATION
1423 	__asm__ volatile ("lfence" : : : "memory");
1424 #endif
1425 }
1426 
1427 __pinned_func
1428 int arch_buffer_validate(void *addr, size_t size, int write)
1429 {
1430 	pentry_t *ptables = z_x86_thread_page_tables_get(_current);
1431 	uint8_t *virt;
1432 	size_t aligned_size;
1433 	int ret = 0;
1434 
1435 	/* addr/size arbitrary, fix this up into an aligned region */
1436 	k_mem_region_align((uintptr_t *)&virt, &aligned_size,
1437 			   (uintptr_t)addr, size, CONFIG_MMU_PAGE_SIZE);
1438 
1439 	for (size_t offset = 0; offset < aligned_size;
1440 	     offset += CONFIG_MMU_PAGE_SIZE) {
1441 		if (!page_validate(ptables, virt + offset, write)) {
1442 			ret = -1;
1443 			break;
1444 		}
1445 	}
1446 
1447 	bcb_fence();
1448 
1449 	return ret;
1450 }
1451 #ifdef CONFIG_X86_COMMON_PAGE_TABLE
1452 /* Very low memory configuration. A single set of page tables is used for
1453  * all threads. This relies on some assumptions:
1454  *
1455  * - No KPTI. If that were supported, we would need both a kernel and user
1456  *   set of page tables.
1457  * - No SMP. If that were supported, we would need per-core page tables.
1458  * - Memory domains don't affect supervisor mode.
1459  * - All threads have the same virtual-to-physical mappings.
1460  * - Memory domain APIs can't be called by user mode.
1461  *
1462  * Because there is no SMP, only one set of page tables, and user threads can't
1463  * modify their own memory domains, we don't have to do much when
1464  * arch_mem_domain_* APIs are called. We do use a caching scheme to avoid
1465  * updating page tables if the last user thread scheduled was in the same
1466  * domain.
1467  *
1468  * We don't set CONFIG_ARCH_MEM_DOMAIN_DATA, since we aren't setting
1469  * up any arch-specific memory domain data (per domain page tables.)
1470  *
1471  * This is all nice and simple and saves a lot of memory. The cost is that
1472  * context switching is not a trivial CR3 update. We have to reset all partitions
1473  * for the current domain configuration and then apply all the partitions for
1474  * the incoming thread's domain if they are not the same. We also need to
1475  * update permissions similarly on the thread stack region.
1476  */
1477 
1478 __pinned_func
1479 static inline int reset_region(uintptr_t start, size_t size)
1480 {
1481 	return range_map_unlocked((void *)start, 0, size, 0, 0,
1482 				  OPTION_FLUSH | OPTION_RESET);
1483 }
1484 
1485 __pinned_func
1486 static inline int apply_region(uintptr_t start, size_t size, pentry_t attr)
1487 {
1488 	return range_map_unlocked((void *)start, 0, size, attr, MASK_PERM,
1489 				  OPTION_FLUSH);
1490 }
1491 
1492 /* Cache of the current memory domain applied to the common page tables and
1493  * the stack buffer region that had User access granted.
1494  */
1495 static __pinned_bss struct k_mem_domain *current_domain;
1496 static __pinned_bss uintptr_t current_stack_start;
1497 static __pinned_bss size_t current_stack_size;
1498 
1499 __pinned_func
1500 void z_x86_swap_update_common_page_table(struct k_thread *incoming)
1501 {
1502 	k_spinlock_key_t key;
1503 
1504 	if ((incoming->base.user_options & K_USER) == 0) {
1505 		/* Incoming thread is not a user thread. Memory domains don't
1506 		 * affect supervisor threads and we don't need to enable User
1507 		 * bits for its stack buffer; do nothing.
1508 		 */
1509 		return;
1510 	}
1511 
1512 	/* Step 1: Make sure the thread stack is set up correctly for the
1513 	 * incoming thread
1514 	 */
1515 	if (incoming->stack_info.start != current_stack_start ||
1516 	    incoming->stack_info.size != current_stack_size) {
1517 		if (current_stack_size != 0U) {
1518 			reset_region(current_stack_start, current_stack_size);
1519 		}
1520 
1521 		/* The incoming thread's stack region needs User permissions */
1522 		apply_region(incoming->stack_info.start,
1523 			     incoming->stack_info.size,
1524 			     K_MEM_PARTITION_P_RW_U_RW);
1525 
1526 		/* Update cache */
1527 		current_stack_start = incoming->stack_info.start;
1528 		current_stack_size = incoming->stack_info.size;
1529 	}
1530 
1531 	/* Step 2: The page tables always have some memory domain applied to
1532 	 * them. If the incoming thread's memory domain is different,
1533 	 * update the page tables
1534 	 */
1535 	key = k_spin_lock(&z_mem_domain_lock);
1536 	if (incoming->mem_domain_info.mem_domain == current_domain) {
1537 		/* The incoming thread's domain is already applied */
1538 		goto out_unlock;
1539 	}
1540 
1541 	/* Reset the current memory domain regions... */
1542 	if (current_domain != NULL) {
1543 		for (int i = 0; i < CONFIG_MAX_DOMAIN_PARTITIONS; i++) {
1544 			struct k_mem_partition *ptn =
1545 				&current_domain->partitions[i];
1546 
1547 			if (ptn->size == 0) {
1548 				continue;
1549 			}
1550 			reset_region(ptn->start, ptn->size);
1551 		}
1552 	}
1553 
1554 	/* ...and apply all the incoming domain's regions */
1555 	for (int i = 0; i < CONFIG_MAX_DOMAIN_PARTITIONS; i++) {
1556 		struct k_mem_partition *ptn =
1557 			&incoming->mem_domain_info.mem_domain->partitions[i];
1558 
1559 		if (ptn->size == 0) {
1560 			continue;
1561 		}
1562 		apply_region(ptn->start, ptn->size, ptn->attr);
1563 	}
1564 	current_domain = incoming->mem_domain_info.mem_domain;
1565 out_unlock:
1566 	k_spin_unlock(&z_mem_domain_lock, key);
1567 }
1568 
1569 /* If a partition was added or removed in the cached domain, update the
1570  * page tables.
1571  */
1572 __pinned_func
1573 int arch_mem_domain_partition_remove(struct k_mem_domain *domain,
1574 				      uint32_t partition_id)
1575 {
1576 	struct k_mem_partition *ptn;
1577 
1578 	if (domain != current_domain) {
1579 		return 0;
1580 	}
1581 
1582 	ptn = &domain->partitions[partition_id];
1583 
1584 	return reset_region(ptn->start, ptn->size);
1585 }
1586 
1587 __pinned_func
1588 int arch_mem_domain_partition_add(struct k_mem_domain *domain,
1589 				   uint32_t partition_id)
1590 {
1591 	struct k_mem_partition *ptn;
1592 
1593 	if (domain != current_domain) {
1594 		return 0;
1595 	}
1596 
1597 	ptn = &domain->partitions[partition_id];
1598 
1599 	return apply_region(ptn->start, ptn->size, ptn->attr);
1600 }
1601 
1602 /* Rest of the APIs don't need to do anything */
1603 __pinned_func
1604 int arch_mem_domain_thread_add(struct k_thread *thread)
1605 {
1606 	return 0;
1607 }
1608 
1609 __pinned_func
1610 int arch_mem_domain_thread_remove(struct k_thread *thread)
1611 {
1612 	return 0;
1613 }
1614 #else
1615 /* Memory domains each have a set of page tables assigned to them */
1616 
1617 /*
1618  * Pool of free memory pages for copying page tables, as needed.
1619  */
1620 #define PTABLE_COPY_SIZE	(INITIAL_PTABLE_PAGES * CONFIG_MMU_PAGE_SIZE)
1621 
1622 static uint8_t __pinned_noinit
1623 	page_pool[PTABLE_COPY_SIZE * CONFIG_X86_MAX_ADDITIONAL_MEM_DOMAINS]
1624 	__aligned(CONFIG_MMU_PAGE_SIZE);
1625 
1626 __pinned_data
1627 static uint8_t *page_pos = page_pool + sizeof(page_pool);
1628 
1629 /* Return a zeroed and suitably aligned memory page for page table data
1630  * from the global page pool
1631  */
1632 __pinned_func
1633 static void *page_pool_get(void)
1634 {
1635 	void *ret;
1636 
1637 	if (page_pos == page_pool) {
1638 		ret = NULL;
1639 	} else {
1640 		page_pos -= CONFIG_MMU_PAGE_SIZE;
1641 		ret = page_pos;
1642 	}
1643 
1644 	if (ret != NULL) {
1645 		memset(ret, 0, CONFIG_MMU_PAGE_SIZE);
1646 	}
1647 
1648 	return ret;
1649 }
1650 
1651 /* Debugging function to show how many pages are free in the pool */
1652 __pinned_func
1653 static inline unsigned int pages_free(void)
1654 {
1655 	return (page_pos - page_pool) / CONFIG_MMU_PAGE_SIZE;
1656 }
1657 
1658 /**
1659  * Duplicate an entire set of page tables
1660  *
1661  * Uses recursion, but depth at any given moment is limited by the number of
1662  * paging levels.
1663  *
1664  * x86_mmu_lock must be held.
1665  *
1666  * @param dst a zeroed out chunk of memory of sufficient size for the indicated
1667  *            paging level.
1668  * @param src some paging structure from within the source page tables to copy
1669  *            at the indicated paging level
1670  * @param level Current paging level
1671  * @retval 0 Success
1672  * @retval -ENOMEM Insufficient page pool memory
1673  */
1674 __pinned_func
1675 static int copy_page_table(pentry_t *dst, pentry_t *src, int level)
1676 {
1677 	if (level == PTE_LEVEL) {
1678 		/* Base case: leaf page table */
1679 		for (int i = 0; i < get_num_entries(level); i++) {
1680 			dst[i] = pte_finalize_value(reset_pte(src[i]), true,
1681 						    PTE_LEVEL);
1682 		}
1683 	} else {
1684 		/* Recursive case: allocate sub-structures as needed and
1685 		 * make recursive calls on them
1686 		 */
1687 		for (int i = 0; i < get_num_entries(level); i++) {
1688 			pentry_t *child_dst;
1689 			int ret;
1690 
1691 			if ((src[i] & MMU_P) == 0) {
1692 				/* Non-present, skip */
1693 				continue;
1694 			}
1695 
1696 			if ((level == PDE_LEVEL) && ((src[i] & MMU_PS) != 0)) {
1697 				/* large page: no lower level table */
1698 				dst[i] = pte_finalize_value(src[i], true,
1699 							    PDE_LEVEL);
1700 				continue;
1701 			}
1702 
1703 			__ASSERT((src[i] & MMU_PS) == 0,
1704 				 "large page encountered");
1705 
1706 			child_dst = page_pool_get();
1707 			if (child_dst == NULL) {
1708 				return -ENOMEM;
1709 			}
1710 
1711 			/* Page table links are by physical address. RAM
1712 			 * for page tables is identity-mapped, but double-
1713 			 * cast needed for PAE case where sizeof(void *) and
1714 			 * sizeof(pentry_t) are not the same.
1715 			 */
1716 			dst[i] = ((pentry_t)z_mem_phys_addr(child_dst) |
1717 				  INT_FLAGS);
1718 
1719 			ret = copy_page_table(child_dst,
1720 					      next_table(src[i], level),
1721 					      level + 1);
1722 			if (ret != 0) {
1723 				return ret;
1724 			}
1725 		}
1726 	}
1727 
1728 	return 0;
1729 }
1730 
1731 __pinned_func
1732 static int region_map_update(pentry_t *ptables, void *start,
1733 			      size_t size, pentry_t flags, bool reset)
1734 {
1735 	uint32_t options = OPTION_USER;
1736 	int ret;
1737 	k_spinlock_key_t key;
1738 
1739 	if (reset) {
1740 		options |= OPTION_RESET;
1741 	}
1742 	if (ptables == z_x86_page_tables_get()) {
1743 		options |= OPTION_FLUSH;
1744 	}
1745 
1746 	key = k_spin_lock(&x86_mmu_lock);
1747 	ret = range_map_ptables(ptables, start, 0, size, flags, MASK_PERM,
1748 				options);
1749 	k_spin_unlock(&x86_mmu_lock, key);
1750 
1751 #ifdef CONFIG_SMP
1752 	tlb_shootdown();
1753 #endif
1754 
1755 	return ret;
1756 }
1757 
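/* Reset a region's PTEs to the original permissions saved in the backup
 * (MMU_*_ORIG) bits, un-applying any memory domain partition attributes.
 */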
__pinned_func
static inline int reset_region(pentry_t *ptables, void *start, size_t size)
{
	LOG_DBG("%s(%p, %p, %zu)", __func__, ptables, start, size);
	return region_map_update(ptables, start, size, 0, true);
}

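/* Apply the provided attributes to a region in the given page tables,
 * e.g. when a memory domain partition is added.
 */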
__pinned_func
static inline int apply_region(pentry_t *ptables, void *start,
				size_t size, pentry_t attr)
{
	LOG_DBG("%s(%p, %p, %zu, " PRI_ENTRY ")", __func__, ptables, start,
		size, attr);
	return region_map_update(ptables, start, size, attr, false);
}

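/* Grant user mode read/write (but never execute) access to a thread's
 * stack buffer in the given page tables.
 */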
__pinned_func
static void set_stack_perms(struct k_thread *thread, pentry_t *ptables)
{
	LOG_DBG("update stack for thread %p's ptables at %p: %p (size %zu)",
		thread, ptables, (void *)thread->stack_info.start,
		thread->stack_info.size);
	apply_region(ptables, (void *)thread->stack_info.start,
		     thread->stack_info.size,
		     MMU_P | MMU_XD | MMU_RW | MMU_US);
}

/*
 * Arch interface implementations for memory domains and userspace
 */

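/* Establish the per-domain page tables, normally by duplicating the kernel's
 * boot page tables. Without KPTI, the default memory domain re-uses
 * z_x86_kernel_ptables instead of making a copy.
 */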
__boot_func
int arch_mem_domain_init(struct k_mem_domain *domain)
{
	int ret;
	k_spinlock_key_t key = k_spin_lock(&x86_mmu_lock);

	LOG_DBG("%s(%p)", __func__, domain);
#if __ASSERT_ON
	sys_snode_t *node;

	/* Assert that we have not already initialized this domain */
	SYS_SLIST_FOR_EACH_NODE(&x86_domain_list, node) {
		struct arch_mem_domain *list_domain =
			CONTAINER_OF(node, struct arch_mem_domain, node);

		__ASSERT(list_domain != &domain->arch,
			 "%s(%p) called multiple times", __func__, domain);
	}
#endif /* __ASSERT_ON */
#ifndef CONFIG_X86_KPTI
	/* If we're not using KPTI then we can use the build time page tables
	 * (which are mutable) as the set of page tables for the default
	 * memory domain, saving us some memory.
	 *
	 * We skip adding this domain to x86_domain_list since we already
	 * update z_x86_kernel_ptables directly in range_map().
	 */
	if (domain == &k_mem_domain_default) {
		domain->arch.ptables = z_x86_kernel_ptables;
		k_spin_unlock(&x86_mmu_lock, key);
		return 0;
	}
#endif /* CONFIG_X86_KPTI */
#ifdef CONFIG_X86_PAE
	/* PDPT is stored within the memory domain itself since it is
	 * much smaller than a full page
	 */
	(void)memset(domain->arch.pdpt, 0, sizeof(domain->arch.pdpt));
	domain->arch.ptables = domain->arch.pdpt;
#else
	/* Allocate a page-sized top-level structure, either a PD or PML4 */
	domain->arch.ptables = page_pool_get();
	if (domain->arch.ptables == NULL) {
		k_spin_unlock(&x86_mmu_lock, key);
		return -ENOMEM;
	}
#endif /* CONFIG_X86_PAE */

	LOG_DBG("copy_page_table(%p, %p, 0)", domain->arch.ptables,
		z_x86_kernel_ptables);

	/* Make a copy of the boot page tables created by gen_mmu.py */
	ret = copy_page_table(domain->arch.ptables, z_x86_kernel_ptables, 0);
	if (ret == 0) {
		sys_slist_append(&x86_domain_list, &domain->arch.node);
	}
	k_spin_unlock(&x86_mmu_lock, key);

	return ret;
}

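/* Revoke a partition's access by restoring default permissions on its
 * address range in the domain's page tables.
 */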
int arch_mem_domain_partition_remove(struct k_mem_domain *domain,
				     uint32_t partition_id)
{
	struct k_mem_partition *partition = &domain->partitions[partition_id];

	/* Reset the partition's region back to defaults */
	return reset_region(domain->arch.ptables, (void *)partition->start,
			    partition->size);
}

/* Called on thread exit or when moving it to a different memory domain */
int arch_mem_domain_thread_remove(struct k_thread *thread)
{
	struct k_mem_domain *domain = thread->mem_domain_info.mem_domain;

	if ((thread->base.user_options & K_USER) == 0) {
		return 0;
	}

	if ((thread->base.thread_state & _THREAD_DEAD) == 0) {
		/* Thread is migrating to another memory domain and not
		 * exiting for good; we weren't called from
		 * z_thread_abort().  Resetting the stack region will
		 * take place in the forthcoming thread_add() call.
		 */
		return 0;
	}

	/* Restore permissions on the thread's stack area since it is no
	 * longer a member of the domain.
	 */
	return reset_region(domain->arch.ptables,
			    (void *)thread->stack_info.start,
			    thread->stack_info.size);
}

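/* Grant access to a partition's address range in the domain's page tables,
 * using the partition's attributes.
 */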
__pinned_func
int arch_mem_domain_partition_add(struct k_mem_domain *domain,
				   uint32_t partition_id)
{
	struct k_mem_partition *partition = &domain->partitions[partition_id];

	/* Update the page tables with the partition info */
	return apply_region(domain->arch.ptables, (void *)partition->start,
			    partition->size, partition->attr | MMU_P);
}

/* Invoked from memory domain API calls, as well as during thread creation */
__pinned_func
int arch_mem_domain_thread_add(struct k_thread *thread)
{
	int ret = 0;

	/* New memory domain we are being added to */
	struct k_mem_domain *domain = thread->mem_domain_info.mem_domain;
	/* This is only set for threads that were migrating from some other
	 * memory domain; for new threads this is NULL.
	 *
	 * Note that the NULL check on old_ptables must be done before any
	 * address translation or else (NULL + offset) != NULL.
	 */
	pentry_t *old_ptables = UINT_TO_POINTER(thread->arch.ptables);
	bool is_user = (thread->base.user_options & K_USER) != 0;
	bool is_migration = (old_ptables != NULL) && is_user;

	/* Allow US access to the thread's stack in its new domain if
	 * we are migrating. If we are not migrating, this is done in
	 * z_x86_current_stack_perms().
	 */
	if (is_migration) {
		old_ptables = z_mem_virt_addr(thread->arch.ptables);
		set_stack_perms(thread, domain->arch.ptables);
	}

	thread->arch.ptables = z_mem_phys_addr(domain->arch.ptables);
	LOG_DBG("set thread %p page tables to %p", thread,
		(void *)thread->arch.ptables);

	/* Check if we're doing a migration from a different memory domain
	 * and have to remove permissions from its old domain.
	 *
	 * XXX: The checks we have to do here and in
	 * arch_mem_domain_thread_remove() are clumsy; it may be worth looking
	 * into adding a specific arch_mem_domain_thread_migrate() API.
	 * See #29601
	 */
	if (is_migration) {
		ret = reset_region(old_ptables,
				   (void *)thread->stack_info.start,
				   thread->stack_info.size);
	}

#if !defined(CONFIG_X86_KPTI) && !defined(CONFIG_X86_COMMON_PAGE_TABLE)
	/* Need to switch to using these new page tables, in case we drop
	 * to user mode before we are ever context switched out.
	 * IPI takes care of this if the thread is currently running on some
	 * other CPU.
	 */
	if (thread == _current && thread->arch.ptables != z_x86_cr3_get()) {
		z_x86_cr3_set(thread->arch.ptables);
	}
#endif /* !CONFIG_X86_KPTI && !CONFIG_X86_COMMON_PAGE_TABLE */

	return ret;
}
#endif /* !CONFIG_X86_COMMON_PAGE_TABLE */

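/* Maximum number of partitions per memory domain, fixed at build time */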
__pinned_func
int arch_mem_domain_max_partitions_get(void)
{
	return CONFIG_MAX_DOMAIN_PARTITIONS;
}

/* Invoked from z_x86_userspace_enter */
__pinned_func
void z_x86_current_stack_perms(void)
{
	/* Clear any previous context in the stack buffer to prevent
	 * unintentional data leakage.
	 */
	(void)memset((void *)_current->stack_info.start, 0xAA,
		     _current->stack_info.size - _current->stack_info.delta);

	/* Only now is it safe to grant access to the stack buffer since any
	 * previous context has been erased.
	 */
#ifdef CONFIG_X86_COMMON_PAGE_TABLE
	/* Re-run the swap page table update logic since we're entering user
	 * mode. This will grant stack and memory domain access if it wasn't
	 * set already (in which case this returns very quickly).
	 */
	z_x86_swap_update_common_page_table(_current);
#else
	/* Memory domain access is already programmed into the page tables.
	 * Need to enable access to this new user thread's stack buffer in
	 * its domain-specific page tables.
	 */
	set_stack_perms(_current, z_x86_thread_page_tables_get(_current));
#endif
}
#endif /* CONFIG_USERSPACE */

#ifdef CONFIG_ARCH_HAS_RESERVED_PAGE_FRAMES
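/* Mark all page frames overlapping the physical range [addr, addr + len)
 * as reserved so the kernel never uses them.
 */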
__boot_func
static void mark_addr_page_reserved(uintptr_t addr, size_t len)
{
	uintptr_t pos = ROUND_DOWN(addr, CONFIG_MMU_PAGE_SIZE);
	uintptr_t end = ROUND_UP(addr + len, CONFIG_MMU_PAGE_SIZE);

	for (; pos < end; pos += CONFIG_MMU_PAGE_SIZE) {
		if (!z_is_page_frame(pos)) {
			continue;
		}

		struct z_page_frame *pf = z_phys_to_page_frame(pos);

		pf->flags |= Z_PAGE_FRAME_RESERVED;
	}
}

__boot_func
void arch_reserved_pages_update(void)
{
#ifdef CONFIG_X86_PC_COMPATIBLE
	/*
	 * Ideally we would do some E820 or similar enumeration to
	 * specifically identify all page frames which are reserved by the
	 * hardware or firmware, or use x86_memmap[] with Multiboot if
	 * available.
	 *
	 * In any case, reserve everything in the first megabyte of physical
	 * memory on PC-compatible platforms.
	 */
	mark_addr_page_reserved(0, MB(1));
#endif /* CONFIG_X86_PC_COMPATIBLE */

#ifdef CONFIG_X86_MEMMAP
	for (int i = 0; i < CONFIG_X86_MEMMAP_ENTRIES; i++) {
		struct x86_memmap_entry *entry = &x86_memmap[i];

		switch (entry->type) {
		case X86_MEMMAP_ENTRY_UNUSED:
			__fallthrough;
		case X86_MEMMAP_ENTRY_RAM:
			continue;

		case X86_MEMMAP_ENTRY_ACPI:
			__fallthrough;
		case X86_MEMMAP_ENTRY_NVS:
			__fallthrough;
		case X86_MEMMAP_ENTRY_DEFECTIVE:
			__fallthrough;
		default:
			/* For these entry types (and anything unrecognized),
			 * fall out of the switch and mark the pages reserved.
			 */
			break;
		}

		mark_addr_page_reserved(entry->base, entry->length);
	}
#endif /* CONFIG_X86_MEMMAP */
}
#endif /* CONFIG_ARCH_HAS_RESERVED_PAGE_FRAMES */

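/* Look up the physical address backing a page-aligned virtual address in
 * the current page tables. Returns -EFAULT if the page is not present.
 */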
int arch_page_phys_get(void *virt, uintptr_t *phys)
{
	pentry_t pte = 0;
	int level, ret;

	__ASSERT(POINTER_TO_UINT(virt) % CONFIG_MMU_PAGE_SIZE == 0U,
		 "unaligned address %p to %s", virt, __func__);

	pentry_get(&level, &pte, z_x86_page_tables_get(), virt);

	if ((pte & MMU_P) != 0) {
		if (phys != NULL) {
			*phys = (uintptr_t)get_entry_phys(pte, PTE_LEVEL);
		}
		ret = 0;
	} else {
		/* Not mapped */
		ret = -EFAULT;
	}

	return ret;
}

#ifdef CONFIG_DEMAND_PAGING
#define PTE_MASK (paging_levels[PTE_LEVEL].mask)

__pinned_func
void arch_mem_page_out(void *addr, uintptr_t location)
{
	int ret;
	pentry_t mask = PTE_MASK | MMU_P | MMU_A;

	/* Accessed bit set to guarantee the entry is not completely 0 in
	 * case of location value 0. A totally 0 PTE is un-mapped.
	 */
	ret = range_map(addr, location, CONFIG_MMU_PAGE_SIZE, MMU_A, mask,
			OPTION_FLUSH);
	__ASSERT_NO_MSG(ret == 0);
	ARG_UNUSED(ret);
}

__pinned_func
void arch_mem_page_in(void *addr, uintptr_t phys)
{
	int ret;
	pentry_t mask = PTE_MASK | MMU_P | MMU_D | MMU_A;

	ret = range_map(addr, phys, CONFIG_MMU_PAGE_SIZE, MMU_P, mask,
			OPTION_FLUSH);
	__ASSERT_NO_MSG(ret == 0);
	ARG_UNUSED(ret);
}

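/* Map the scratch page to the indicated physical frame so that the core
 * demand paging code can copy page contents through it.
 */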
__pinned_func
void arch_mem_scratch(uintptr_t phys)
{
	page_map_set(z_x86_page_tables_get(), Z_SCRATCH_PAGE,
		     phys | MMU_P | MMU_RW | MMU_XD, NULL, MASK_ALL,
		     OPTION_FLUSH);
}

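/* Report accessed/dirty/present state for a virtual page by OR-ing the
 * relevant PTE from the kernel page tables and every memory domain's page
 * tables, optionally clearing the accessed bits along the way.
 */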
__pinned_func
uintptr_t arch_page_info_get(void *addr, uintptr_t *phys, bool clear_accessed)
{
	pentry_t all_pte, mask;
	uint32_t options;

	/* What to change, if anything, in the page_map_set() calls */
	if (clear_accessed) {
		mask = MMU_A;
		options = OPTION_FLUSH;
	} else {
		/* In this configuration page_map_set() just queries the
		 * page table and makes no changes
		 */
		mask = 0;
		options = 0U;
	}

	page_map_set(z_x86_kernel_ptables, addr, 0, &all_pte, mask, options);

	/* Un-mapped PTEs are completely zeroed. No need to report anything
	 * else in this case.
	 */
	if (all_pte == 0) {
		return ARCH_DATA_PAGE_NOT_MAPPED;
	}

#if defined(CONFIG_USERSPACE) && !defined(CONFIG_X86_COMMON_PAGE_TABLE)
	/* Don't bother looking at other page tables if non-present, as we
	 * are not required to report accurate accessed/dirty in this case
	 * and all mappings are otherwise the same.
	 */
	if ((all_pte & MMU_P) != 0) {
		sys_snode_t *node;

		/* IRQs are locked, safe to do this */
		SYS_SLIST_FOR_EACH_NODE(&x86_domain_list, node) {
			pentry_t cur_pte;
			struct arch_mem_domain *domain =
				CONTAINER_OF(node, struct arch_mem_domain,
					     node);

			page_map_set(domain->ptables, addr, 0, &cur_pte,
				     mask, options | OPTION_USER);

			/* Logical OR of the relevant PTE in all page tables.
			 * The address/location and present state should be
			 * identical among them.
			 */
			all_pte |= cur_pte;
		}
	}
#endif /* USERSPACE && !X86_COMMON_PAGE_TABLE */

	/* NOTE: We are truncating the PTE on PAE systems, whose pentry_t
	 * is larger than a uintptr_t.
	 *
	 * We currently aren't required to report back the XD state (bit 63),
	 * and Zephyr just doesn't support large physical memory on 32-bit
	 * systems; PAE was only implemented for XD support.
	 */
	if (phys != NULL) {
		*phys = (uintptr_t)get_entry_phys(all_pte, PTE_LEVEL);
	}

	/* We don't filter out any other bits in the PTE and the kernel
	 * ignores them. For the case of ARCH_DATA_PAGE_NOT_MAPPED,
	 * we use a bit which is never set in a real PTE (the PAT bit) in the
	 * current system.
	 *
	 * The other ARCH_DATA_PAGE_* macros are defined to their corresponding
	 * bits in the PTE.
	 */
	return (uintptr_t)all_pte;
}

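/* Report whether a virtual page is paged in or paged out, storing its
 * physical frame or backing store location respectively. Returns
 * ARCH_PAGE_LOCATION_BAD for un-mapped pages.
 */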
__pinned_func
enum arch_page_location arch_page_location_get(void *addr, uintptr_t *location)
{
	pentry_t pte;
	int level;

	/* TODO: since we only have to query the current set of page tables,
	 * could optimize this with recursive page table mapping
	 */
	pentry_get(&level, &pte, z_x86_page_tables_get(), addr);

	if (pte == 0) {
		/* Not mapped */
		return ARCH_PAGE_LOCATION_BAD;
	}

	__ASSERT(level == PTE_LEVEL, "bigpage found at %p", addr);
	*location = (uintptr_t)get_entry_phys(pte, PTE_LEVEL);

	if ((pte & MMU_P) != 0) {
		return ARCH_PAGE_LOCATION_PAGED_IN;
	} else {
		return ARCH_PAGE_LOCATION_PAGED_OUT;
	}
}

#ifdef CONFIG_X86_KPTI
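/* With KPTI enabled, report whether an access to addr can be satisfied with
 * the given page tables, i.e. it maps to a present leaf PTE that was not
 * flipped for KPTI.
 */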
__pinned_func
bool z_x86_kpti_is_access_ok(void *addr, pentry_t *ptables)
{
	pentry_t pte;
	int level;

	pentry_get(&level, &pte, ptables, addr);

	/* Might as well also check if it's un-mapped; normally we don't
	 * fetch the PTE from the page tables until we are inside
	 * z_page_fault() and call arch_page_fault_status_get().
	 */
	if (level != PTE_LEVEL || pte == 0 || is_flipped_pte(pte)) {
		return false;
	}

	return true;
}
#endif /* CONFIG_X86_KPTI */
#endif /* CONFIG_DEMAND_PAGING */
