1 /*
2  * Copyright (c) 2011-2014 Wind River Systems, Inc.
3  * Copyright (c) 2017-2020 Intel Corporation
4  *
5  * SPDX-License-Identifier: Apache-2.0
6  */
7 
8 #include <kernel.h>
9 #include <arch/x86/mmustructs.h>
10 #include <sys/mem_manage.h>
11 #include <sys/__assert.h>
12 #include <logging/log.h>
13 #include <errno.h>
14 #include <ctype.h>
15 #include <spinlock.h>
16 #include <kernel_arch_func.h>
17 #include <x86_mmu.h>
18 #include <init.h>
19 #include <kernel_internal.h>
20 #include <mmu.h>
21 #include <drivers/interrupt_controller/loapic.h>
23 #include <arch/x86/memmap.h>
24 
25 LOG_MODULE_DECLARE(os, CONFIG_KERNEL_LOG_LEVEL);
26 
27 /* We will use some ignored bits in the PTE to backup permission settings
28  * when the mapping was made. This is used to un-apply memory domain memory
29  * partitions to page tables when the partitions are removed.
30  */
31 #define MMU_RW_ORIG	MMU_IGNORED0
32 #define MMU_US_ORIG	MMU_IGNORED1
33 #define MMU_XD_ORIG	MMU_IGNORED2
34 
35 /* Bits in the PTE that form the set of permission bits, when resetting */
36 #define MASK_PERM	(MMU_RW | MMU_US | MMU_XD)
37 
38 /* When we want to set up a new mapping, discarding any previous state */
39 #define MASK_ALL	(~((pentry_t)0U))
40 
41 /* Bits to set at mapping time for particular permissions. We set the actual
42  * page table bit affecting the policy and also the backup bit.
43  */
44 #define ENTRY_RW	(MMU_RW | MMU_RW_ORIG)
45 #define ENTRY_US	(MMU_US | MMU_US_ORIG)
46 #define ENTRY_XD	(MMU_XD | MMU_XD_ORIG)
47 
48 /* Bit position which is always zero in a PTE. We'll use the PAT bit.
49  * This helps disambiguate PTEs that do not have the Present bit set (MMU_P):
50  * - If the entire entry is zero, it's an un-mapped virtual page
51  * - If PTE_ZERO is set, we flipped this page due to KPTI
52  * - Otherwise, this was a page-out
53  */
54 #define PTE_ZERO	MMU_PAT
55 
56 /* Protects x86_domain_list and serializes instantiation of intermediate
57  * paging structures.
58  */
59 __pinned_bss
60 static struct k_spinlock x86_mmu_lock;
61 
62 #if defined(CONFIG_USERSPACE) && !defined(CONFIG_X86_COMMON_PAGE_TABLE)
63 /* List of all active and initialized memory domains. This is used to make
64  * sure all memory mappings are the same across all page tables when invoking
65  * range_map()
66  */
67 __pinned_bss
68 static sys_slist_t x86_domain_list;
69 #endif
70 
71 /*
72  * Definitions for building an ontology of paging levels and capabilities
73  * at each level
74  */
75 
76 /* Data structure describing the characteristics of a particular paging
77  * level
78  */
79 struct paging_level {
80 	/* What bits are used to store physical address */
81 	pentry_t mask;
82 
83 	/* Number of entries in this paging structure */
84 	size_t entries;
85 
86 	/* How many bits to right-shift a virtual address to obtain the
87 	 * appropriate entry within this table.
88 	 *
89 	 * The memory scope of each entry in this table is 1 << shift.
90 	 */
91 	unsigned int shift;
92 #ifdef CONFIG_EXCEPTION_DEBUG
93 	/* Name of this level, for debug purposes */
94 	const char *name;
95 #endif
96 };
97 
98 /* Flags for all entries in intermediate paging levels.
99  * Fortunately, the same bits are set for all intermediate levels for all
100  * three paging modes.
101  *
102  * Obviously P is set.
103  *
104  * We want the RW and US bits always set; actual access control will be
105  * done at the leaf level.
106  *
107  * XD (if supported) always 0. Disabling execution done at leaf level.
108  *
109  * PCD/PWT always 0. Caching properties again done at leaf level.
110  */
111 #define INT_FLAGS	(MMU_P | MMU_RW | MMU_US)
112 
113 /* Paging level ontology for the selected paging mode.
114  *
115  * See Figures 4-4, 4-7, 4-11 in the Intel SDM, vol 3A
116  */
117 __pinned_rodata
118 static const struct paging_level paging_levels[] = {
119 #ifdef CONFIG_X86_64
120 	/* Page Map Level 4 */
121 	{
122 		.mask = 0x7FFFFFFFFFFFF000ULL,
123 		.entries = 512U,
124 		.shift = 39U,
125 #ifdef CONFIG_EXCEPTION_DEBUG
126 		.name = "PML4"
127 #endif
128 	},
129 #endif /* CONFIG_X86_64 */
130 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
131 	/* Page Directory Pointer Table */
132 	{
133 		.mask = 0x7FFFFFFFFFFFF000ULL,
134 #ifdef CONFIG_X86_64
135 		.entries = 512U,
136 #else
137 		/* PAE version */
138 		.entries = 4U,
139 #endif
140 		.shift = 30U,
141 #ifdef CONFIG_EXCEPTION_DEBUG
142 		.name = "PDPT"
143 #endif
144 	},
145 #endif /* CONFIG_X86_64 || CONFIG_X86_PAE */
146 	/* Page Directory */
147 	{
148 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
149 		.mask = 0x7FFFFFFFFFFFF000ULL,
150 		.entries = 512U,
151 		.shift = 21U,
152 #else
153 		/* 32-bit */
154 		.mask = 0xFFFFF000U,
155 		.entries = 1024U,
156 		.shift = 22U,
157 #endif /* CONFIG_X86_64 || CONFIG_X86_PAE */
158 #ifdef CONFIG_EXCEPTION_DEBUG
159 		.name = "PD"
160 #endif
161 	},
162 	/* Page Table */
163 	{
164 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
165 		.mask = 0x07FFFFFFFFFFF000ULL,
166 		.entries = 512U,
167 		.shift = 12U,
168 #else
169 		/* 32-bit */
170 		.mask = 0xFFFFF000U,
171 		.entries = 1024U,
172 		.shift = 12U,
173 #endif /* CONFIG_X86_64 || CONFIG_X86_PAE */
174 #ifdef CONFIG_EXCEPTION_DEBUG
175 		.name = "PT"
176 #endif
177 	}
178 };
179 
180 #define NUM_LEVELS	ARRAY_SIZE(paging_levels)
181 #define PTE_LEVEL	(NUM_LEVELS - 1)
182 #define PDE_LEVEL	(NUM_LEVELS - 2)
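/* For example: with 4-level (IA-32e) paging NUM_LEVELS is 4, so PTE_LEVEL is 3
 * and PDE_LEVEL is 2; with PAE they are 2 and 1; with plain 32-bit paging they
 * are 1 and 0.
 */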
183 
184 /*
185  * Macros for reserving space for page tables
186  *
187  * We need to reserve a block of memory equal in size to the page tables
188  * generated by gen_mmu.py so that memory addresses do not shift between
189  * build phases. These macros ultimately specify INITIAL_PTABLE_SIZE.
190  */
191 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
192 #ifdef CONFIG_X86_64
193 #define NUM_PML4_ENTRIES 512U
194 #define NUM_PDPT_ENTRIES 512U
195 #else
196 #define NUM_PDPT_ENTRIES 4U
197 #endif /* CONFIG_X86_64 */
198 #define NUM_PD_ENTRIES   512U
199 #define NUM_PT_ENTRIES   512U
200 #else
201 #define NUM_PD_ENTRIES   1024U
202 #define NUM_PT_ENTRIES   1024U
203 #endif /* !CONFIG_X86_64 && !CONFIG_X86_PAE */
204 
205 /* Memory range covered by an instance of various table types */
206 #define PT_AREA		((uintptr_t)(CONFIG_MMU_PAGE_SIZE * NUM_PT_ENTRIES))
207 #define PD_AREA 	(PT_AREA * NUM_PD_ENTRIES)
208 #ifdef CONFIG_X86_64
209 #define PDPT_AREA	(PD_AREA * NUM_PDPT_ENTRIES)
210 #endif
211 
212 #define VM_ADDR		CONFIG_KERNEL_VM_BASE
213 #define VM_SIZE		CONFIG_KERNEL_VM_SIZE
214 
215 /* Define a range [PT_START, PT_END) which is the memory range
216  * covered by all the page tables needed for the address space
217  */
218 #define PT_START	((uintptr_t)ROUND_DOWN(VM_ADDR, PT_AREA))
219 #define PT_END		((uintptr_t)ROUND_UP(VM_ADDR + VM_SIZE, PT_AREA))
220 
221 /* Number of page tables needed to cover the address space. Depends on the specific
222  * bounds, but roughly 1 page table per 2MB of address space
223  */
224 #define NUM_PT	((PT_END - PT_START) / PT_AREA)
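/* For example, a PT_AREA-aligned 1GB kernel VM region with 4K pages and
 * 512-entry page tables (PT_AREA = 2MB) works out to NUM_PT = 512.
 */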
225 
226 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
227 /* Same semantics as above, but for the page directories needed to cover
228  * the address space.
229  */
230 #define PD_START	((uintptr_t)ROUND_DOWN(VM_ADDR, PD_AREA))
231 #define PD_END		((uintptr_t)ROUND_UP(VM_ADDR + VM_SIZE, PD_AREA))
232 /* Number of page directories needed to cover the address space. Depends on the
233  * specific bounds, but roughly 1 page directory per 1GB of address space
234  */
235 #define NUM_PD	((PD_END - PD_START) / PD_AREA)
236 #else
237 /* 32-bit page tables just have one toplevel page directory */
238 #define NUM_PD	1
239 #endif
240 
241 #ifdef CONFIG_X86_64
242 /* Same semantics as above, but for the page directory pointer tables needed
243  * to cover the address space. On 32-bit there is just one 4-entry PDPT.
244  */
245 #define PDPT_START	((uintptr_t)ROUND_DOWN(VM_ADDR, PDPT_AREA))
246 #define PDPT_END	((uintptr_t)ROUND_UP(VM_ADDR + VM_SIZE, PDPT_AREA))
247 /* Number of PDPTs needed to cover the address space. 1 PDPT per 512GB of VM */
248 #define NUM_PDPT	((PDPT_END - PDPT_START) / PDPT_AREA)
249 
250 /* All pages needed for page tables, using computed values plus one more for
251  * the top-level PML4
252  */
253 #define NUM_TABLE_PAGES	(NUM_PT + NUM_PD + NUM_PDPT + 1)
254 #else /* !CONFIG_X86_64 */
255 /* All pages needed for page tables, using computed values (the PAE toplevel PDPT is handled separately below) */
256 #define NUM_TABLE_PAGES	(NUM_PT + NUM_PD)
257 #endif /* CONFIG_X86_64 */
258 
259 #define INITIAL_PTABLE_PAGES \
260 	(NUM_TABLE_PAGES + CONFIG_X86_EXTRA_PAGE_TABLE_PAGES)
261 
262 #ifdef CONFIG_X86_PAE
263 /* The toplevel PDPT is not included above as it is not a full page in size */
264 #define INITIAL_PTABLE_SIZE \
265 	((INITIAL_PTABLE_PAGES * CONFIG_MMU_PAGE_SIZE) + 0x20)
266 #else
267 #define INITIAL_PTABLE_SIZE \
268 	(INITIAL_PTABLE_PAGES * CONFIG_MMU_PAGE_SIZE)
269 #endif
270 
271 /* "dummy" pagetables for the first-phase build. The real page tables
272  * are produced by gen_mmu.py based on data read in zephyr-prebuilt.elf,
273  * and this dummy array is discarded.
274  */
275 Z_GENERIC_SECTION(.dummy_pagetables)
276 static __used char dummy_pagetables[INITIAL_PTABLE_SIZE];
277 
278 /*
279  * Utility functions
280  */
281 
282 /* For a table at a particular level, get the entry index that corresponds to
283  * the provided virtual address
284  */
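/* For example, with 64-bit or PAE paging a PTE-level lookup (shift 12,
 * 512 entries) of virtual address 0x201000 yields index
 * (0x201000 >> 12) % 512 = 0x201 % 512 = 1.
 */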
285 __pinned_func
286 static inline int get_index(void *virt, int level)
287 {
288 	return (((uintptr_t)virt >> paging_levels[level].shift) %
289 		paging_levels[level].entries);
290 }
291 
292 __pinned_func
293 static inline pentry_t *get_entry_ptr(pentry_t *ptables, void *virt, int level)
294 {
295 	return &ptables[get_index(virt, level)];
296 }
297 
298 __pinned_func
299 static inline pentry_t get_entry(pentry_t *ptables, void *virt, int level)
300 {
301 	return ptables[get_index(virt, level)];
302 }
303 
304 /* Get the physical memory address associated with this table entry */
305 __pinned_func
306 static inline uintptr_t get_entry_phys(pentry_t entry, int level)
307 {
308 	return entry & paging_levels[level].mask;
309 }
310 
311 /* Return the virtual address of a linked table stored in the provided entry */
312 __pinned_func
313 static inline pentry_t *next_table(pentry_t entry, int level)
314 {
315 	return z_mem_virt_addr(get_entry_phys(entry, level));
316 }
317 
318 /* Number of table entries at this level */
319 __pinned_func
320 static inline size_t get_num_entries(int level)
321 {
322 	return paging_levels[level].entries;
323 }
324 
325 /* 4K for everything except PAE PDPTs */
326 __pinned_func
327 static inline size_t table_size(int level)
328 {
329 	return get_num_entries(level) * sizeof(pentry_t);
330 }
331 
332 /* For a table at a particular level, the amount of virtual memory
333  * that an entry within the table covers
334  */
335 __pinned_func
336 static inline size_t get_entry_scope(int level)
337 {
338 	return (1UL << paging_levels[level].shift);
339 }
340 
341 /* For a table at a particular level, the amount of virtual memory
342  * that this entire table covers
343  */
344 __pinned_func
345 static inline size_t get_table_scope(int level)
346 {
347 	return get_entry_scope(level) * get_num_entries(level);
348 }
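/* For example, with 64-bit or PAE paging a PD entry covers 1 << 21 = 2MB of
 * virtual memory, so an entire 512-entry PD covers 1GB.
 */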
349 
350 /* Must have checked Present bit first! Non-present entries may have OS data
351  * stored in any other bits
352  */
353 __pinned_func
354 static inline bool is_leaf(int level, pentry_t entry)
355 {
356 	if (level == PTE_LEVEL) {
357 		/* Always true for PTE */
358 		return true;
359 	}
360 
361 	return ((entry & MMU_PS) != 0U);
362 }
363 
364 /* This does NOT (by design) un-flip KPTI PTEs, it's just the raw PTE value */
365 __pinned_func
366 static inline void pentry_get(int *paging_level, pentry_t *val,
367 			      pentry_t *ptables, void *virt)
368 {
369 	pentry_t *table = ptables;
370 
371 	for (int level = 0; level < NUM_LEVELS; level++) {
372 		pentry_t entry = get_entry(table, virt, level);
373 
374 		if ((entry & MMU_P) == 0 || is_leaf(level, entry)) {
375 			*val = entry;
376 			if (paging_level != NULL) {
377 				*paging_level = level;
378 			}
379 			break;
380 		} else {
381 			table = next_table(entry, level);
382 		}
383 	}
384 }
385 
386 __pinned_func
387 static inline void tlb_flush_page(void *addr)
388 {
389 	/* Invalidate TLB entries corresponding to the page containing the
390 	 * specified address
391 	 */
392 	char *page = (char *)addr;
393 
394 	__asm__ ("invlpg %0" :: "m" (*page));
395 }
396 
397 #ifdef CONFIG_X86_KPTI
398 __pinned_func
399 static inline bool is_flipped_pte(pentry_t pte)
400 {
401 	return (pte & MMU_P) == 0 && (pte & PTE_ZERO) != 0;
402 }
403 #endif
404 
405 #if defined(CONFIG_SMP)
406 __pinned_func
407 void z_x86_tlb_ipi(const void *arg)
408 {
409 	uintptr_t ptables_phys;
410 
411 	ARG_UNUSED(arg);
412 
413 #ifdef CONFIG_X86_KPTI
414 	/* We're always on the kernel's set of page tables in this context
415 	 * if KPTI is turned on
416 	 */
417 	ptables_phys = z_x86_cr3_get();
418 	__ASSERT(ptables_phys == z_mem_phys_addr(&z_x86_kernel_ptables), "");
419 #else
420 	/* We might have been moved to another memory domain, so always invoke
421 	 * z_x86_thread_page_tables_get() instead of using current CR3 value.
422 	 */
423 	ptables_phys = z_mem_phys_addr(z_x86_thread_page_tables_get(_current));
424 #endif
425 	/*
426 	 * In the future, we can consider making this smarter, such as
427 	 * propagating which page tables were modified (in case they are
428 	 * not active on this CPU) or an address range to call
429 	 * tlb_flush_page() on.
430 	 */
431 	LOG_DBG("%s on CPU %d\n", __func__, arch_curr_cpu()->id);
432 
433 	z_x86_cr3_set(ptables_phys);
434 }
435 
436 /* NOTE: This is not synchronous and the actual flush takes place some short
437  * time after this exits.
438  */
439 __pinned_func
440 static inline void tlb_shootdown(void)
441 {
442 	z_loapic_ipi(0, LOAPIC_ICR_IPI_OTHERS, CONFIG_TLB_IPI_VECTOR);
443 }
444 #endif /* CONFIG_SMP */
445 
446 __pinned_func
447 static inline void assert_addr_aligned(uintptr_t addr)
448 {
449 #if __ASSERT_ON
450 	__ASSERT((addr & (CONFIG_MMU_PAGE_SIZE - 1)) == 0U,
451 		 "unaligned address 0x%" PRIxPTR, addr);
452 #endif
453 }
454 
455 __pinned_func
456 static inline void assert_virt_addr_aligned(void *addr)
457 {
458 	assert_addr_aligned((uintptr_t)addr);
459 }
460 
461 __pinned_func
462 static inline void assert_region_page_aligned(void *addr, size_t size)
463 {
464 	assert_virt_addr_aligned(addr);
465 #if __ASSERT_ON
466 	__ASSERT((size & (CONFIG_MMU_PAGE_SIZE - 1)) == 0U,
467 		 "unaligned size %zu", size);
468 #endif
469 }
470 
471 /*
472  * Debug functions. All conditionally compiled with CONFIG_EXCEPTION_DEBUG.
473  */
474 #ifdef CONFIG_EXCEPTION_DEBUG
475 
476 /* Add colors to page table dumps to indicate mapping type */
477 #define COLOR_PAGE_TABLES	1
478 
479 #if COLOR_PAGE_TABLES
480 #define ANSI_DEFAULT "\x1B[0m"
481 #define ANSI_RED     "\x1B[1;31m"
482 #define ANSI_GREEN   "\x1B[1;32m"
483 #define ANSI_YELLOW  "\x1B[1;33m"
484 #define ANSI_BLUE    "\x1B[1;34m"
485 #define ANSI_MAGENTA "\x1B[1;35m"
486 #define ANSI_CYAN    "\x1B[1;36m"
487 #define ANSI_GREY    "\x1B[1;90m"
488 
489 #define COLOR(x)	printk(_CONCAT(ANSI_, x))
490 #else
491 #define COLOR(x)	do { } while (0)
492 #endif
493 
494 __pinned_func
495 static char get_entry_code(pentry_t value)
496 {
497 	char ret;
498 
499 	if (value == 0U) {
500 		/* Unmapped entry */
501 		ret = '.';
502 	} else {
503 		if ((value & MMU_RW) != 0U) {
504 			/* Writable page */
505 			if ((value & MMU_XD) != 0U) {
506 				/* RW */
507 				ret = 'w';
508 			} else {
509 				/* RWX */
510 				ret = 'a';
511 			}
512 		} else {
513 			if ((value & MMU_XD) != 0U) {
514 				/* R */
515 				ret = 'r';
516 			} else {
517 				/* RX */
518 				ret = 'x';
519 			}
520 		}
521 
522 		if ((value & MMU_US) != 0U) {
523 			/* Uppercase indicates user mode access */
524 			ret = toupper(ret);
525 		}
526 	}
527 
528 	return ret;
529 }
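/* Legend for the codes above: '.' unmapped, 'r' read-only, 'w' read/write,
 * 'x' read/execute, 'a' read/write/execute; uppercase variants additionally
 * allow user mode access.
 */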
530 
531 __pinned_func
532 static void print_entries(pentry_t entries_array[], uint8_t *base, int level,
533 			  size_t count)
534 {
535 	int column = 0;
536 
537 	for (int i = 0; i < count; i++) {
538 		pentry_t entry = entries_array[i];
539 
540 		uintptr_t phys = get_entry_phys(entry, level);
541 		uintptr_t virt =
542 			(uintptr_t)base + (get_entry_scope(level) * i);
543 
544 		if ((entry & MMU_P) != 0U) {
545 			if (is_leaf(level, entry)) {
546 				if (phys == virt) {
547 					/* Identity mappings */
548 					COLOR(YELLOW);
549 				} else if (phys + Z_MEM_VM_OFFSET == virt) {
550 					/* Permanent RAM mappings */
551 					COLOR(GREEN);
552 				} else {
553 					/* General mapped pages */
554 					COLOR(CYAN);
555 				}
556 			} else {
557 				/* Intermediate entry */
558 				COLOR(MAGENTA);
559 			}
560 		} else {
561 			if (is_leaf(level, entry)) {
562 				if (entry == 0U) {
563 					/* Unmapped */
564 					COLOR(GREY);
565 #ifdef CONFIG_X86_KPTI
566 				} else if (is_flipped_pte(entry)) {
567 					/* KPTI, un-flip it */
568 					COLOR(BLUE);
569 					entry = ~entry;
570 					phys = get_entry_phys(entry, level);
571 					if (phys == virt) {
572 						/* Identity mapped */
573 						COLOR(CYAN);
574 					} else {
575 						/* Non-identity mapped */
576 						COLOR(BLUE);
577 					}
578 #endif
579 				} else {
580 					/* Paged out */
581 					COLOR(RED);
582 				}
583 			} else {
584 				/* Un-mapped intermediate entry */
585 				COLOR(GREY);
586 			}
587 		}
588 
589 		printk("%c", get_entry_code(entry));
590 
591 		column++;
592 		if (column == 64) {
593 			column = 0;
594 			printk("\n");
595 		}
596 	}
597 	COLOR(DEFAULT);
598 
599 	if (column != 0) {
600 		printk("\n");
601 	}
602 }
603 
604 __pinned_func
605 static void dump_ptables(pentry_t *table, uint8_t *base, int level)
606 {
607 	const struct paging_level *info = &paging_levels[level];
608 
609 #ifdef CONFIG_X86_64
610 	/* Account for the virtual memory "hole" with sign-extension */
611 	if (((uintptr_t)base & BITL(47)) != 0) {
612 		base = (uint8_t *)((uintptr_t)base | (0xFFFFULL << 48));
613 	}
614 #endif
615 
616 	printk("%s at %p (0x%" PRIxPTR "): ", info->name, table,
617 	       z_mem_phys_addr(table));
618 	if (level == 0) {
619 		printk("entire address space\n");
620 	} else {
621 		printk("for %p - %p\n", base,
622 		       base + get_table_scope(level) - 1);
623 	}
624 
625 	print_entries(table, base, level, info->entries);
626 
627 	/* Check if we're a page table */
628 	if (level == PTE_LEVEL) {
629 		return;
630 	}
631 
632 	/* Dump all linked child tables */
633 	for (int j = 0; j < info->entries; j++) {
634 		pentry_t entry = table[j];
635 		pentry_t *next;
636 
637 		if ((entry & MMU_P) == 0U ||
638 			(entry & MMU_PS) != 0U) {
639 			/* Not present or big page, skip */
640 			continue;
641 		}
642 
643 		next = next_table(entry, level);
644 		dump_ptables(next, base + (j * get_entry_scope(level)),
645 			     level + 1);
646 	}
647 }
648 
649 __pinned_func
650 void z_x86_dump_page_tables(pentry_t *ptables)
651 {
652 	dump_ptables(ptables, NULL, 0);
653 }
654 
655 /* Enable to dump out the kernel's page table right before main() starts,
656  * sometimes useful for deep debugging. May overwhelm twister.
657  */
658 #define DUMP_PAGE_TABLES 0
659 
660 #if DUMP_PAGE_TABLES
661 __pinned_func
662 static int dump_kernel_tables(const struct device *unused)
663 {
664 	z_x86_dump_page_tables(z_x86_kernel_ptables);
665 
666 	return 0;
667 }
668 
669 SYS_INIT(dump_kernel_tables, APPLICATION, CONFIG_KERNEL_INIT_PRIORITY_DEFAULT);
670 #endif
671 
672 __pinned_func
673 static void str_append(char **buf, size_t *size, const char *str)
674 {
675 	int ret = snprintk(*buf, *size, "%s", str);
676 
677 	if (ret >= *size) {
678 		/* Truncated */
679 		*size = 0U;
680 	} else {
681 		*size -= ret;
682 		*buf += ret;
683 	}
684 
685 }
686 
687 __pinned_func
688 static void dump_entry(int level, void *virt, pentry_t entry)
689 {
690 	const struct paging_level *info = &paging_levels[level];
691 	char buf[24] = { 0 };
692 	char *pos = buf;
693 	size_t sz = sizeof(buf);
694 	uint8_t *virtmap = (uint8_t *)ROUND_DOWN(virt, get_entry_scope(level));
695 
696 	#define DUMP_BIT(bit) do { \
697 			if ((entry & MMU_##bit) != 0U) { \
698 				str_append(&pos, &sz, #bit " "); \
699 			} \
700 		} while (0)
701 
702 	DUMP_BIT(RW);
703 	DUMP_BIT(US);
704 	DUMP_BIT(PWT);
705 	DUMP_BIT(PCD);
706 	DUMP_BIT(A);
707 	DUMP_BIT(D);
708 	DUMP_BIT(G);
709 	DUMP_BIT(XD);
710 
711 	LOG_ERR("%sE: %p -> " PRI_ENTRY ": %s", info->name,
712 		virtmap, entry & info->mask, log_strdup(buf));
713 
714 	#undef DUMP_BIT
715 }
716 
717 __pinned_func
718 void z_x86_pentry_get(int *paging_level, pentry_t *val, pentry_t *ptables,
719 		      void *virt)
720 {
721 	pentry_get(paging_level, val, ptables, virt);
722 }
723 
724 /*
725  * Debug function for dumping out MMU table information to the LOG for a
726  * specific virtual address, such as when we get an unexpected page fault.
727  */
728 __pinned_func
729 void z_x86_dump_mmu_flags(pentry_t *ptables, void *virt)
730 {
731 	pentry_t entry = 0;
732 	int level = 0;
733 
734 	pentry_get(&level, &entry, ptables, virt);
735 
736 	if ((entry & MMU_P) == 0) {
737 		LOG_ERR("%sE: not present", paging_levels[level].name);
738 	} else {
739 		dump_entry(level, virt, entry);
740 	}
741 }
742 #endif /* CONFIG_EXCEPTION_DEBUG */
743 
744 /* Reset permissions on a PTE to original state when the mapping was made */
745 __pinned_func
746 static inline pentry_t reset_pte(pentry_t old_val)
747 {
748 	pentry_t new_val;
749 
750 	/* Clear any existing state in permission bits */
751 	new_val = old_val & (~K_MEM_PARTITION_PERM_MASK);
752 
753 	/* Now set permissions based on the stashed original values */
754 	if ((old_val & MMU_RW_ORIG) != 0) {
755 		new_val |= MMU_RW;
756 	}
757 	if ((old_val & MMU_US_ORIG) != 0) {
758 		new_val |= MMU_US;
759 	}
760 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
761 	if ((old_val & MMU_XD_ORIG) != 0) {
762 		new_val |= MMU_XD;
763 	}
764 #endif
765 	return new_val;
766 }
767 
768 /* Wrapper functions for some gross stuff we have to do for Kernel
769  * page table isolation. If these are User mode page tables, the user bit
770  * isn't set, and this is not the shared page, all the bits in the PTE
771  * are flipped. This serves three purposes:
772  *  - The page isn't present, implementing page table isolation
773  *  - Flipping the physical address bits cheaply mitigates L1TF
774  *  - State is preserved; to get original PTE, just complement again
775  */
776 __pinned_func
777 static inline pentry_t pte_finalize_value(pentry_t val, bool user_table,
778 					  int level)
779 {
780 #ifdef CONFIG_X86_KPTI
781 	static const uintptr_t shared_phys_addr =
782 		Z_MEM_PHYS_ADDR(POINTER_TO_UINT(&z_shared_kernel_page_start));
783 
784 	if (user_table && (val & MMU_US) == 0 && (val & MMU_P) != 0 &&
785 	    get_entry_phys(val, level) != shared_phys_addr) {
786 		val = ~val;
787 	}
788 #endif
789 	return val;
790 }
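/* Illustration of the KPTI flip above: a supervisor-only, present PTE placed
 * into a user page table is stored as its complement, so the Present bit
 * reads as 0 and the otherwise always-zero PAT bit reads as 1, which is
 * exactly the pattern is_flipped_pte() tests for.
 */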
791 
792 /* Atomic functions for modifying PTEs. These don't map nicely to Zephyr's
793  * atomic API since the only types supported are 'int' and 'void *' and
794  * the size of pentry_t depends on other factors like PAE.
795  */
796 #ifndef CONFIG_X86_PAE
797 /* Non-PAE, pentry_t is same size as void ptr so use atomic_ptr_* APIs */
798 __pinned_func
799 static inline pentry_t atomic_pte_get(const pentry_t *target)
800 {
801 	return (pentry_t)atomic_ptr_get((atomic_ptr_t *)target);
802 }
803 
804 __pinned_func
805 static inline bool atomic_pte_cas(pentry_t *target, pentry_t old_value,
806 				  pentry_t new_value)
807 {
808 	return atomic_ptr_cas((atomic_ptr_t *)target, (void *)old_value,
809 			      (void *)new_value);
810 }
811 #else
812 /* Atomic builtins for 64-bit values on 32-bit x86 require floating point.
813  * Don't do this, just lock local interrupts. Needless to say, this
814  * isn't workable if someone ever adds SMP to the 32-bit x86 port.
815  */
816 BUILD_ASSERT(!IS_ENABLED(CONFIG_SMP));
817 
818 __pinned_func
819 static inline pentry_t atomic_pte_get(const pentry_t *target)
820 {
821 	return *target;
822 }
823 
824 __pinned_func
825 static inline bool atomic_pte_cas(pentry_t *target, pentry_t old_value,
826 				  pentry_t new_value)
827 {
828 	bool ret = false;
829 	int key = arch_irq_lock();
830 
831 	if (*target == old_value) {
832 		*target = new_value;
833 		ret = true;
834 	}
835 	arch_irq_unlock(key);
836 
837 	return ret;
838 }
839 #endif /* CONFIG_X86_PAE */
840 
841 /* Indicates that the target page tables will be used by user mode threads.
842  * This only has implications for CONFIG_X86_KPTI where user thread facing
843  * page tables need nearly all pages that don't have the US bit to also
844  * not be Present.
845  */
846 #define OPTION_USER		BIT(0)
847 
848 /* Indicates that the operation requires TLBs to be flushed as we are altering
849  * existing mappings. Not needed for establishing new mappings
850  */
851 #define OPTION_FLUSH		BIT(1)
852 
853 /* Indicates that each PTE's permission bits should be restored to their
854  * original state when the memory was mapped. All other bits in the PTE are
855  * preserved.
856  */
857 #define OPTION_RESET		BIT(2)
858 
859 /* Indicates that the mapping will need to be cleared entirely. This is
860  * mainly used for unmapping the memory region.
861  */
862 #define OPTION_CLEAR		BIT(3)
863 
864 /**
865  * Atomically update bits in a page table entry
866  *
867  * This is atomic with respect to modifications by other CPUs or preempted
868  * contexts, which can be very important when making decisions based on
869  * the PTE's prior "dirty" state.
870  *
871  * @param pte Pointer to page table entry to update
872  * @param update_val Updated bits to set/clear in PTE. Ignored with
873  *        OPTION_RESET or OPTION_CLEAR.
874  * @param update_mask Which bits to modify in the PTE. Ignored with
875  *        OPTION_RESET or OPTION_CLEAR.
876  * @param options Control flags
877  * @retval Old PTE value
878  */
879 __pinned_func
880 static inline pentry_t pte_atomic_update(pentry_t *pte, pentry_t update_val,
881 					 pentry_t update_mask,
882 					 uint32_t options)
883 {
884 	bool user_table = (options & OPTION_USER) != 0U;
885 	bool reset = (options & OPTION_RESET) != 0U;
886 	bool clear = (options & OPTION_CLEAR) != 0U;
887 	pentry_t old_val, new_val;
888 
889 	do {
890 		old_val = atomic_pte_get(pte);
891 
892 		new_val = old_val;
893 #ifdef CONFIG_X86_KPTI
894 		if (is_flipped_pte(new_val)) {
895 			/* Page was flipped for KPTI. Un-flip it */
896 			new_val = ~new_val;
897 		}
898 #endif /* CONFIG_X86_KPTI */
899 
900 		if (reset) {
901 			new_val = reset_pte(new_val);
902 		} else if (clear) {
903 			new_val = 0;
904 		} else {
905 			new_val = ((new_val & ~update_mask) |
906 				   (update_val & update_mask));
907 		}
908 
909 		new_val = pte_finalize_value(new_val, user_table, PTE_LEVEL);
910 	} while (atomic_pte_cas(pte, old_val, new_val) == false);
911 
912 #ifdef CONFIG_X86_KPTI
913 	if (is_flipped_pte(old_val)) {
914 		/* Page was flipped for KPTI. Un-flip it */
915 		old_val = ~old_val;
916 	}
917 #endif /* CONFIG_X86_KPTI */
918 
919 	return old_val;
920 }
921 
922 /**
923  * Low level page table update function for a virtual page
924  *
925  * For the provided set of page tables, update the PTE associated with the
926  * virtual address to a new value, using the mask to control which bits
927  * are updated; bits not covered by the mask are preserved.
928  *
929  * It is permitted to set up mappings without the Present bit set, in which
930  * case all other bits may be used for OS accounting.
931  *
932  * This function is atomic with respect to the page table entries being
933  * modified by another CPU, using atomic operations to update the requested
934  * bits and return the previous PTE value.
935  *
936  * Common mask values:
937  *  MASK_ALL  - Update all PTE bits. Existing state totally discarded.
938  *  MASK_PERM - Only update permission bits. All other bits and physical
939  *              mapping preserved.
940  *
941  * @param ptables Page tables to modify
942  * @param virt Virtual page table entry to update
943  * @param entry_val Value to update in the PTE (ignored if OPTION_RESET or
944  *        OPTION_CLEAR)
945  * @param [out] old_val_ptr Filled in with previous PTE value. May be NULL.
946  * @param mask What bits to update in the PTE (ignored if OPTION_RESET or
947  *        OPTION_CLEAR)
948  * @param options Control options, described above
949  */
950 __pinned_func
951 static void page_map_set(pentry_t *ptables, void *virt, pentry_t entry_val,
952 			 pentry_t *old_val_ptr, pentry_t mask, uint32_t options)
953 {
954 	pentry_t *table = ptables;
955 	bool flush = (options & OPTION_FLUSH) != 0U;
956 
957 	for (int level = 0; level < NUM_LEVELS; level++) {
958 		int index;
959 		pentry_t *entryp;
960 
961 		index = get_index(virt, level);
962 		entryp = &table[index];
963 
964 		/* Check if we're a PTE */
965 		if (level == PTE_LEVEL) {
966 			pentry_t old_val = pte_atomic_update(entryp, entry_val,
967 							     mask, options);
968 			if (old_val_ptr != NULL) {
969 				*old_val_ptr = old_val;
970 			}
971 			break;
972 		}
973 
974 		/* We fail an assertion here due to no support for
975 		 * splitting existing bigpage mappings.
976 		 * If the PS bit is not supported at some level (like
977 		 * in a PML4 entry) it is always reserved and must be 0
978 		 */
979 		__ASSERT((*entryp & MMU_PS) == 0U, "large page encountered");
980 		table = next_table(*entryp, level);
981 		__ASSERT(table != NULL,
982 			 "missing page table level %d when trying to map %p",
983 			 level + 1, virt);
984 	}
985 	if (flush) {
986 		tlb_flush_page(virt);
987 	}
988 }
989 
990 /**
991  * Map a physical region in a specific set of page tables.
992  *
993  * See documentation for page_map_set() for additional notes about masks and
994  * supported options.
995  *
996  * It is vital to remember that all virtual-to-physical mappings must be
997  * the same with respect to supervisor mode regardless of what thread is
998  * scheduled (and therefore, if multiple sets of page tables exist, which one
999  * is active).
1000  *
1001  * It is permitted to set up mappings without the Present bit set.
1002  *
1003  * @param ptables Page tables to modify
1004  * @param virt Base page-aligned virtual memory address to map the region.
1005  * @param phys Base page-aligned physical memory address for the region.
1006  *        Ignored if OPTION_RESET or OPTION_CLEAR. Also affected by the mask
1007  *        parameter. This address is not directly examined; it will simply be
1008  *        programmed into the PTE.
1009  * @param size Size of the physical region to map
1010  * @param entry_flags Non-address bits to set in every PTE. Ignored if
1011  *        OPTION_RESET. Also affected by the mask parameter.
1012  * @param mask What bits to update in each PTE. Un-set bits will never be
1013  *        modified. Ignored if OPTION_RESET or OPTION_CLEAR.
1014  * @param options Control options, described above
1015  */
1016 __pinned_func
1017 static void range_map_ptables(pentry_t *ptables, void *virt, uintptr_t phys,
1018 			      size_t size, pentry_t entry_flags, pentry_t mask,
1019 			      uint32_t options)
1020 {
1021 	bool zero_entry = (options & (OPTION_RESET | OPTION_CLEAR)) != 0U;
1022 
1023 	assert_addr_aligned(phys);
1024 	__ASSERT((size & (CONFIG_MMU_PAGE_SIZE - 1)) == 0U,
1025 		 "unaligned size %zu", size);
1026 	__ASSERT((entry_flags & paging_levels[0].mask) == 0U,
1027 		 "entry_flags " PRI_ENTRY " overlaps address area",
1028 		 entry_flags);
1029 
1030 	/* This implementation is stack-efficient but not particularly fast.
1031 	 * We do a full page table walk for every page we are updating.
1032 	 * Recursive approaches are possible, but use much more stack space.
1033 	 */
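	/* For example, mapping a 1MB region with 4K pages performs 256
	 * separate table walks, one per page_map_set() call below.
	 */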
1034 	for (size_t offset = 0; offset < size; offset += CONFIG_MMU_PAGE_SIZE) {
1035 		uint8_t *dest_virt = (uint8_t *)virt + offset;
1036 		pentry_t entry_val;
1037 
1038 		if (zero_entry) {
1039 			entry_val = 0;
1040 		} else {
1041 			entry_val = (pentry_t)(phys + offset) | entry_flags;
1042 		}
1043 
1044 		page_map_set(ptables, dest_virt, entry_val, NULL, mask,
1045 			     options);
1046 	}
1047 }
1048 
1049 /**
1050  * Establish or update a memory mapping for all page tables
1051  *
1052  * The physical region noted from phys to phys + size will be mapped to
1053  * an equal sized virtual region starting at virt, with the provided flags.
1054  * The mask value denotes what bits in PTEs will actually be modified.
1055  *
1056  * See range_map_ptables() for additional details.
1057  *
1058  * @param virt Page-aligned starting virtual address
1059  * @param phys Page-aligned starting physical address. Ignored if the mask
1060  *             parameter does not enable address bits or OPTION_RESET is used.
1061  *             This region is not directly examined; it will simply be
1062  *             programmed into the page tables.
1063  * @param size Size of the physical region to map
1064  * @param entry_flags Desired state of non-address PTE bits covered by mask,
1065  *                    ignored if OPTION_RESET
1066  * @param mask What bits in the PTE to actually modify; unset bits will
1067  *             be preserved. Ignored if OPTION_RESET.
1068  * @param options Control options. Do not set OPTION_USER here. OPTION_FLUSH
1069  *                will trigger a TLB shootdown after all tables are updated.
1070  */
1071 __pinned_func
1072 static void range_map(void *virt, uintptr_t phys, size_t size,
1073 		      pentry_t entry_flags, pentry_t mask, uint32_t options)
1074 {
1075 	LOG_DBG("%s: %p -> %p (%zu) flags " PRI_ENTRY " mask "
1076 		PRI_ENTRY " opt 0x%x", __func__, (void *)phys, virt, size,
1077 		entry_flags, mask, options);
1078 
1079 #ifdef CONFIG_X86_64
1080 	/* There's a gap in the "64-bit" address space, as 4-level paging
1081 	 * requires bits 48 to 63 to be copies of bit 47. Test this
1082 	 * by treating as a signed value and shifting.
1083 	 */
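	/* For example, 0x00007FFFFFFFFFFF and 0xFFFF800000000000 are
	 * canonical addresses, while 0x0000800000000000 is not.
	 */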
1084 	__ASSERT(((((intptr_t)virt) << 16) >> 16) == (intptr_t)virt,
1085 		 "non-canonical virtual address mapping %p (size %zu)",
1086 		 virt, size);
1087 #endif /* CONFIG_X86_64 */
1088 
1089 	__ASSERT((options & OPTION_USER) == 0U, "invalid option for function");
1090 
1091 	/* All virtual-to-physical mappings are the same in all page tables.
1092 	 * What can differ is only access permissions, defined by the memory
1093 	 * domain associated with the page tables, and the threads that are
1094 	 * members of that domain.
1095 	 *
1096 	 * Any new mappings need to be applied to all page tables.
1097 	 */
1098 #if defined(CONFIG_USERSPACE) && !defined(CONFIG_X86_COMMON_PAGE_TABLE)
1099 	sys_snode_t *node;
1100 
1101 	SYS_SLIST_FOR_EACH_NODE(&x86_domain_list, node) {
1102 		struct arch_mem_domain *domain =
1103 			CONTAINER_OF(node, struct arch_mem_domain, node);
1104 
1105 		range_map_ptables(domain->ptables, virt, phys, size,
1106 				  entry_flags, mask, options | OPTION_USER);
1107 	}
1108 #endif /* CONFIG_USERSPACE */
1109 	range_map_ptables(z_x86_kernel_ptables, virt, phys, size, entry_flags,
1110 			  mask, options);
1111 
1112 #ifdef CONFIG_SMP
1113 	if ((options & OPTION_FLUSH) != 0U) {
1114 		tlb_shootdown();
1115 	}
1116 #endif /* CONFIG_SMP */
1117 }
1118 
1119 __pinned_func
1120 static inline void range_map_unlocked(void *virt, uintptr_t phys, size_t size,
1121 				      pentry_t entry_flags, pentry_t mask,
1122 				      uint32_t options)
1123 {
1124 	k_spinlock_key_t key;
1125 
1126 	key = k_spin_lock(&x86_mmu_lock);
1127 	range_map(virt, phys, size, entry_flags, mask, options);
1128 	k_spin_unlock(&x86_mmu_lock, key);
1129 }
1130 
1131 __pinned_func
1132 static pentry_t flags_to_entry(uint32_t flags)
1133 {
1134 	pentry_t entry_flags = MMU_P;
1135 
1136 	/* Translate flags argument into HW-recognized entry flags.
1137 	 *
1138 	 * Support for PAT is not implemented yet. Many systems may have
1139 	 * BIOS-populated MTRR values such that these cache settings are
1140 	 * redundant.
1141 	 */
1142 	switch (flags & K_MEM_CACHE_MASK) {
1143 	case K_MEM_CACHE_NONE:
1144 		entry_flags |= MMU_PCD;
1145 		break;
1146 	case K_MEM_CACHE_WT:
1147 		entry_flags |= MMU_PWT;
1148 		break;
1149 	case K_MEM_CACHE_WB:
1150 		break;
1151 	default:
1152 		__ASSERT(false, "bad memory mapping flags 0x%x", flags);
1153 	}
1154 
1155 	if ((flags & K_MEM_PERM_RW) != 0U) {
1156 		entry_flags |= ENTRY_RW;
1157 	}
1158 
1159 	if ((flags & K_MEM_PERM_USER) != 0U) {
1160 		entry_flags |= ENTRY_US;
1161 	}
1162 
1163 	if ((flags & K_MEM_PERM_EXEC) == 0U) {
1164 		entry_flags |= ENTRY_XD;
1165 	}
1166 
1167 	return entry_flags;
1168 }
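/* For example, flags_to_entry(K_MEM_PERM_RW | K_MEM_CACHE_WB) yields
 * MMU_P | ENTRY_RW | ENTRY_XD: present, writable (plus its backup bit),
 * cached, supervisor-only, and execute-disabled since K_MEM_PERM_EXEC
 * is absent.
 */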
1169 
1170 /* map new region virt..virt+size to phys with provided arch-neutral flags */
1171 __pinned_func
1172 void arch_mem_map(void *virt, uintptr_t phys, size_t size, uint32_t flags)
1173 {
1174 	range_map_unlocked(virt, phys, size, flags_to_entry(flags),
1175 			   MASK_ALL, 0);
1176 }
1177 
1178 /* unmap region addr..addr+size, reset entries and flush TLB */
1179 void arch_mem_unmap(void *addr, size_t size)
1180 {
1181 	range_map_unlocked((void *)addr, 0, size, 0, 0,
1182 			   OPTION_FLUSH | OPTION_CLEAR);
1183 }
1184 
1185 #ifdef Z_VM_KERNEL
1186 __boot_func
1187 static void identity_map_remove(uint32_t level)
1188 {
1189 	size_t size, scope = get_entry_scope(level);
1190 	pentry_t *table;
1191 	uint32_t cur_level;
1192 	uint8_t *pos;
1193 	pentry_t entry;
1194 	pentry_t *entry_ptr;
1195 
1196 	k_mem_region_align((uintptr_t *)&pos, &size,
1197 			   (uintptr_t)CONFIG_SRAM_BASE_ADDRESS,
1198 			   (size_t)CONFIG_SRAM_SIZE * 1024U, scope);
1199 
1200 	while (size != 0U) {
1201 		/* Need to get to the correct table */
1202 		table = z_x86_kernel_ptables;
1203 		for (cur_level = 0; cur_level < level; cur_level++) {
1204 			entry = get_entry(table, pos, cur_level);
1205 			table = next_table(entry, cur_level);
1206 		}
1207 
1208 		entry_ptr = get_entry_ptr(table, pos, level);
1209 
1210 		/* set_pte */
1211 		*entry_ptr = 0;
1212 		pos += scope;
1213 		size -= scope;
1214 	}
1215 }
1216 #endif
1217 
1218 /* Invoked to remove the identity mappings in the page tables;
1219  * they were only needed to transition the instruction pointer at early boot
1220  */
1221 __boot_func
1222 void z_x86_mmu_init(void)
1223 {
1224 #ifdef Z_VM_KERNEL
1225 	/* We booted with physical address space being identity mapped.
1226 	 * As we are now executing in virtual address space,
1227 	 * the identity mappings are no longer needed, so remove them.
1228 	 *
1229 	 * Without PAE, we only need to remove the entries at the PD level.
1230 	 * With PAE, we also need to remove the entry at the PDPT level.
1231 	 */
1232 	identity_map_remove(PDE_LEVEL);
1233 
1234 #ifdef CONFIG_X86_PAE
1235 	identity_map_remove(0);
1236 #endif
1237 #endif
1238 }
1239 
1240 #if CONFIG_X86_STACK_PROTECTION
1241 __pinned_func
1242 void z_x86_set_stack_guard(k_thread_stack_t *stack)
1243 {
1244 	/* Applied to all page tables as this affects supervisor mode.
1245 	 * XXX: This never gets reset when the thread exits, which can
1246 	 * cause problems if the memory is later used for something else.
1247 	 * See #29499
1248 	 *
1249 	 * Guard page is always the first page of the stack object for both
1250 	 * kernel and thread stacks.
1251 	 */
1252 	range_map_unlocked(stack, 0, CONFIG_MMU_PAGE_SIZE,
1253 			   MMU_P | ENTRY_XD, MASK_PERM, OPTION_FLUSH);
1254 }
1255 #endif /* CONFIG_X86_STACK_PROTECTION */
1256 
1257 #ifdef CONFIG_USERSPACE
1258 __pinned_func
1259 static bool page_validate(pentry_t *ptables, uint8_t *addr, bool write)
1260 {
1261 	pentry_t *table = (pentry_t *)ptables;
1262 
1263 	for (int level = 0; level < NUM_LEVELS; level++) {
1264 		pentry_t entry = get_entry(table, addr, level);
1265 
1266 		if (is_leaf(level, entry)) {
1267 #ifdef CONFIG_X86_KPTI
1268 			if (is_flipped_pte(entry)) {
1269 				/* We flipped this to prevent user access
1270 				 * since just clearing US isn't sufficient
1271 				 */
1272 				return false;
1273 			}
1274 #endif
1275 			/* US and RW bits still carry meaning if non-present.
1276 			 * If the data page is paged out, access bits are
1277 			 * preserved. If un-mapped, the whole entry is 0.
1278 			 */
1279 			if (((entry & MMU_US) == 0U) ||
1280 			    (write && ((entry & MMU_RW) == 0U))) {
1281 				return false;
1282 			}
1283 		} else {
1284 			if ((entry & MMU_P) == 0U) {
1285 				/* Missing intermediate table, address is
1286 				 * un-mapped
1287 				 */
1288 				return false;
1289 			}
1290 			table = next_table(entry, level);
1291 		}
1292 	}
1293 
1294 	return true;
1295 }
1296 
1297 __pinned_func
1298 static inline void bcb_fence(void)
1299 {
1300 #ifdef CONFIG_X86_BOUNDS_CHECK_BYPASS_MITIGATION
1301 	__asm__ volatile ("lfence" : : : "memory");
1302 #endif
1303 }
1304 
1305 __pinned_func
1306 int arch_buffer_validate(void *addr, size_t size, int write)
1307 {
1308 	pentry_t *ptables = z_x86_thread_page_tables_get(_current);
1309 	uint8_t *virt;
1310 	size_t aligned_size;
1311 	int ret = 0;
1312 
1313 	/* addr/size arbitrary, fix this up into an aligned region */
1314 	k_mem_region_align((uintptr_t *)&virt, &aligned_size,
1315 			   (uintptr_t)addr, size, CONFIG_MMU_PAGE_SIZE);
1316 
1317 	for (size_t offset = 0; offset < aligned_size;
1318 	     offset += CONFIG_MMU_PAGE_SIZE) {
1319 		if (!page_validate(ptables, virt + offset, write)) {
1320 			ret = -1;
1321 			break;
1322 		}
1323 	}
1324 
1325 	bcb_fence();
1326 
1327 	return ret;
1328 }
1329 #ifdef CONFIG_X86_COMMON_PAGE_TABLE
1330 /* Very low memory configuration. A single set of page tables is used for
1331  * all threads. This relies on some assumptions:
1332  *
1333  * - No KPTI. If that were supported, we would need both a kernel and user
1334  *   set of page tables.
1335  * - No SMP. If that were supported, we would need per-core page tables.
1336  * - Memory domains don't affect supervisor mode.
1337  * - All threads have the same virtual-to-physical mappings.
1338  * - Memory domain APIs can't be called by user mode.
1339  *
1340  * Because there is no SMP, only one set of page tables, and user threads can't
1341  * modify their own memory domains, we don't have to do much when
1342  * arch_mem_domain_* APIs are called. We do use a caching scheme to avoid
1343  * updating page tables if the last user thread scheduled was in the same
1344  * domain.
1345  *
1346  * We don't set CONFIG_ARCH_MEM_DOMAIN_DATA, since we aren't setting
1347  * up any arch-specific memory domain data (per-domain page tables).
1348  *
1349  * This is all nice and simple and saves a lot of memory. The cost is that
1350  * context switching is not a trivial CR3 update. We have to reset all partitions
1351  * for the current domain configuration and then apply all the partitions for
1352  * the incoming thread's domain if they are not the same. We also need to
1353  * update permissions similarly on the thread stack region.
1354  */
1355 
1356 __pinned_func
1357 static inline void reset_region(uintptr_t start, size_t size)
1358 {
1359 	range_map_unlocked((void *)start, 0, size, 0, 0,
1360 			   OPTION_FLUSH | OPTION_RESET);
1361 }
1362 
1363 __pinned_func
1364 static inline void apply_region(uintptr_t start, size_t size, pentry_t attr)
1365 {
1366 	range_map_unlocked((void *)start, 0, size, attr, MASK_PERM,
1367 			   OPTION_FLUSH);
1368 }
1369 
1370 /* Cache of the current memory domain applied to the common page tables and
1371  * the stack buffer region that had User access granted.
1372  */
1373 static __pinned_bss struct k_mem_domain *current_domain;
1374 static __pinned_bss uintptr_t current_stack_start;
1375 static __pinned_bss size_t current_stack_size;
1376 
1377 __pinned_func
1378 void z_x86_swap_update_common_page_table(struct k_thread *incoming)
1379 {
1380 	k_spinlock_key_t key;
1381 
1382 	if ((incoming->base.user_options & K_USER) == 0) {
1383 		/* Incoming thread is not a user thread. Memory domains don't
1384 		 * affect supervisor threads and we don't need to enable User
1385 		 * bits for its stack buffer; do nothing.
1386 		 */
1387 		return;
1388 	}
1389 
1390 	/* Step 1: Make sure the thread stack is set up correctly for the
1391 	 * incoming thread
1392 	 */
1393 	if (incoming->stack_info.start != current_stack_start ||
1394 	    incoming->stack_info.size != current_stack_size) {
1395 		if (current_stack_size != 0U) {
1396 			reset_region(current_stack_start, current_stack_size);
1397 		}
1398 
1399 		/* The incoming thread's stack region needs User permissions */
1400 		apply_region(incoming->stack_info.start,
1401 			     incoming->stack_info.size,
1402 			     K_MEM_PARTITION_P_RW_U_RW);
1403 
1404 		/* Update cache */
1405 		current_stack_start = incoming->stack_info.start;
1406 		current_stack_size = incoming->stack_info.size;
1407 	}
1408 
1409 	/* Step 2: The page tables always have some memory domain applied to
1410 	 * them. If the incoming thread's memory domain is different,
1411 	 * update the page tables
1412 	 */
1413 	key = k_spin_lock(&z_mem_domain_lock);
1414 	if (incoming->mem_domain_info.mem_domain == current_domain) {
1415 		/* The incoming thread's domain is already applied */
1416 		goto out_unlock;
1417 	}
1418 
1419 	/* Reset the current memory domain regions... */
1420 	if (current_domain != NULL) {
1421 		for (int i = 0; i < CONFIG_MAX_DOMAIN_PARTITIONS; i++) {
1422 			struct k_mem_partition *ptn =
1423 				&current_domain->partitions[i];
1424 
1425 			if (ptn->size == 0) {
1426 				continue;
1427 			}
1428 			reset_region(ptn->start, ptn->size);
1429 		}
1430 	}
1431 
1432 	/* ...and apply all the incoming domain's regions */
1433 	for (int i = 0; i < CONFIG_MAX_DOMAIN_PARTITIONS; i++) {
1434 		struct k_mem_partition *ptn =
1435 			&incoming->mem_domain_info.mem_domain->partitions[i];
1436 
1437 		if (ptn->size == 0) {
1438 			continue;
1439 		}
1440 		apply_region(ptn->start, ptn->size, ptn->attr);
1441 	}
1442 	current_domain = incoming->mem_domain_info.mem_domain;
1443 out_unlock:
1444 	k_spin_unlock(&z_mem_domain_lock, key);
1445 }
1446 
1447 /* If a partition was added or removed in the cached domain, update the
1448  * page tables.
1449  */
1450 __pinned_func
1451 void arch_mem_domain_partition_remove(struct k_mem_domain *domain,
1452 				      uint32_t partition_id)
1453 {
1454 	struct k_mem_partition *ptn;
1455 
1456 	if (domain != current_domain) {
1457 		return;
1458 	}
1459 
1460 	ptn = &domain->partitions[partition_id];
1461 	reset_region(ptn->start, ptn->size);
1462 }
1463 
1464 __pinned_func
1465 void arch_mem_domain_partition_add(struct k_mem_domain *domain,
1466 				   uint32_t partition_id)
1467 {
1468 	struct k_mem_partition *ptn;
1469 
1470 	if (domain != current_domain) {
1471 		return;
1472 	}
1473 
1474 	ptn = &domain->partitions[partition_id];
1475 	apply_region(ptn->start, ptn->size, ptn->attr);
1476 }
1477 
1478 /* Rest of the APIs don't need to do anything */
1479 __pinned_func
1480 void arch_mem_domain_thread_add(struct k_thread *thread)
1481 {
1482 
1483 }
1484 
1485 __pinned_func
1486 void arch_mem_domain_thread_remove(struct k_thread *thread)
1487 {
1488 
1489 }
1490 #else
1491 /* Memory domains each have a set of page tables assigned to them */
1492 
1493 /*
1494  * Pool of free memory pages for copying page tables, as needed.
1495  */
1496 #define PTABLE_COPY_SIZE	(INITIAL_PTABLE_PAGES * CONFIG_MMU_PAGE_SIZE)
1497 
1498 static uint8_t __pinned_noinit
1499 	page_pool[PTABLE_COPY_SIZE * CONFIG_X86_MAX_ADDITIONAL_MEM_DOMAINS]
1500 	__aligned(CONFIG_MMU_PAGE_SIZE);
1501 
1502 __pinned_data
1503 static uint8_t *page_pos = page_pool + sizeof(page_pool);
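/* page_pos starts at the end of the pool; page_pool_get() below hands out
 * pages from the top down until it reaches the start of page_pool.
 */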
1504 
1505 /* Return a zeroed and suitably aligned memory page for page table data
1506  * from the global page pool
1507  */
1508 __pinned_func
1509 static void *page_pool_get(void)
1510 {
1511 	void *ret;
1512 
1513 	if (page_pos == page_pool) {
1514 		ret = NULL;
1515 	} else {
1516 		page_pos -= CONFIG_MMU_PAGE_SIZE;
1517 		ret = page_pos;
1518 	}
1519 
1520 	if (ret != NULL) {
1521 		memset(ret, 0, CONFIG_MMU_PAGE_SIZE);
1522 	}
1523 
1524 	return ret;
1525 }
1526 
1527 /* Debugging function to show how many pages are free in the pool */
1528 __pinned_func
1529 static inline unsigned int pages_free(void)
1530 {
1531 	return (page_pos - page_pool) / CONFIG_MMU_PAGE_SIZE;
1532 }
1533 
1534 /**
1535  * Duplicate an entire set of page tables
1536  *
1537  * Uses recursion, but depth at any given moment is limited by the number of
1538  * paging levels.
1539  *
1540  * x86_mmu_lock must be held.
1541  *
1542  * @param dst a zeroed out chunk of memory of sufficient size for the indicated
1543  *            paging level.
1544  * @param src some paging structure from within the source page tables to copy
1545  *            at the indicated paging level
1546  * @param level Current paging level
1547  * @retval 0 Success
1548  * @retval -ENOMEM Insufficient page pool memory
1549  */
1550 __pinned_func
1551 static int copy_page_table(pentry_t *dst, pentry_t *src, int level)
1552 {
1553 	if (level == PTE_LEVEL) {
1554 		/* Base case: leaf page table */
1555 		for (int i = 0; i < get_num_entries(level); i++) {
1556 			dst[i] = pte_finalize_value(reset_pte(src[i]), true,
1557 						    PTE_LEVEL);
1558 		}
1559 	} else {
1560 		/* Recursive case: allocate sub-structures as needed and
1561 		 * make recursive calls on them
1562 		 */
1563 		for (int i = 0; i < get_num_entries(level); i++) {
1564 			pentry_t *child_dst;
1565 			int ret;
1566 
1567 			if ((src[i] & MMU_P) == 0) {
1568 				/* Non-present, skip */
1569 				continue;
1570 			}
1571 
1572 			if ((level == PDE_LEVEL) && ((src[i] & MMU_PS) != 0)) {
1573 				/* large page: no lower level table */
1574 				dst[i] = pte_finalize_value(src[i], true,
1575 							    PDE_LEVEL);
1576 				continue;
1577 			}
1578 
1579 			__ASSERT((src[i] & MMU_PS) == 0,
1580 				 "large page encountered");
1581 
1582 			child_dst = page_pool_get();
1583 			if (child_dst == NULL) {
1584 				return -ENOMEM;
1585 			}
1586 
1587 			/* Page table links are by physical address. RAM
1588 			 * for page tables is identity-mapped, but double-
1589 			 * cast needed for PAE case where sizeof(void *) and
1590 			 * sizeof(pentry_t) are not the same.
1591 			 */
1592 			dst[i] = ((pentry_t)z_mem_phys_addr(child_dst) |
1593 				  INT_FLAGS);
1594 
1595 			ret = copy_page_table(child_dst,
1596 					      next_table(src[i], level),
1597 					      level + 1);
1598 			if (ret != 0) {
1599 				return ret;
1600 			}
1601 		}
1602 	}
1603 
1604 	return 0;
1605 }
1606 
1607 __pinned_func
1608 static void region_map_update(pentry_t *ptables, void *start,
1609 			      size_t size, pentry_t flags, bool reset)
1610 {
1611 	uint32_t options = OPTION_USER;
1612 	k_spinlock_key_t key;
1613 
1614 	if (reset) {
1615 		options |= OPTION_RESET;
1616 	}
1617 	if (ptables == z_x86_page_tables_get()) {
1618 		options |= OPTION_FLUSH;
1619 	}
1620 
1621 	key = k_spin_lock(&x86_mmu_lock);
1622 	(void)range_map_ptables(ptables, start, 0, size, flags, MASK_PERM,
1623 				options);
1624 	k_spin_unlock(&x86_mmu_lock, key);
1625 
1626 #ifdef CONFIG_SMP
1627 	tlb_shootdown();
1628 #endif
1629 }
1630 
1631 __pinned_func
1632 static inline void reset_region(pentry_t *ptables, void *start, size_t size)
1633 {
1634 	LOG_DBG("%s(%p, %p, %zu)", __func__, ptables, start, size);
1635 	region_map_update(ptables, start, size, 0, true);
1636 }
1637 
1638 __pinned_func
1639 static inline void apply_region(pentry_t *ptables, void *start,
1640 				size_t size, pentry_t attr)
1641 {
1642 	LOG_DBG("%s(%p, %p, %zu, " PRI_ENTRY ")", __func__, ptables, start,
1643 		size, attr);
1644 	region_map_update(ptables, start, size, attr, false);
1645 }
1646 
1647 __pinned_func
1648 static void set_stack_perms(struct k_thread *thread, pentry_t *ptables)
1649 {
1650 	LOG_DBG("update stack for thread %p's ptables at %p: %p (size %zu)",
1651 		thread, ptables, (void *)thread->stack_info.start,
1652 		thread->stack_info.size);
1653 	apply_region(ptables, (void *)thread->stack_info.start,
1654 		     thread->stack_info.size,
1655 		     MMU_P | MMU_XD | MMU_RW | MMU_US);
1656 }
1657 
1658 /*
1659  * Arch interface implementations for memory domains and userspace
1660  */
1661 
1662 __boot_func
1663 int arch_mem_domain_init(struct k_mem_domain *domain)
1664 {
1665 	int ret;
1666 	k_spinlock_key_t key  = k_spin_lock(&x86_mmu_lock);
1667 
1668 	LOG_DBG("%s(%p)", __func__, domain);
1669 #if __ASSERT_ON
1670 	sys_snode_t *node;
1671 
1672 	/* Assert that we have not already initialized this domain */
1673 	SYS_SLIST_FOR_EACH_NODE(&x86_domain_list, node) {
1674 		struct arch_mem_domain *list_domain =
1675 			CONTAINER_OF(node, struct arch_mem_domain, node);
1676 
1677 		__ASSERT(list_domain != &domain->arch,
1678 			 "%s(%p) called multiple times", __func__, domain);
1679 	}
1680 #endif /* __ASSERT_ON */
1681 #ifndef CONFIG_X86_KPTI
1682 	/* If we're not using KPTI then we can use the build time page tables
1683 	 * (which are mutable) as the set of page tables for the default
1684 	 * memory domain, saving us some memory.
1685 	 *
1686 	 * We skip adding this domain to x86_domain_list since we already
1687 	 * update z_x86_kernel_ptables directly in range_map().
1688 	 */
1689 	if (domain == &k_mem_domain_default) {
1690 		domain->arch.ptables = z_x86_kernel_ptables;
1691 		k_spin_unlock(&x86_mmu_lock, key);
1692 		return 0;
1693 	}
1694 #endif /* CONFIG_X86_KPTI */
1695 #ifdef CONFIG_X86_PAE
1696 	/* PDPT is stored within the memory domain itself since it is
1697 	 * much smaller than a full page
1698 	 */
1699 	(void)memset(domain->arch.pdpt, 0, sizeof(domain->arch.pdpt));
1700 	domain->arch.ptables = domain->arch.pdpt;
1701 #else
1702 	/* Allocate a page-sized top-level structure, either a PD or PML4 */
1703 	domain->arch.ptables = page_pool_get();
1704 	if (domain->arch.ptables == NULL) {
1705 		k_spin_unlock(&x86_mmu_lock, key);
1706 		return -ENOMEM;
1707 	}
1708 #endif /* CONFIG_X86_PAE */
1709 
1710 	LOG_DBG("copy_page_table(%p, %p, 0)", domain->arch.ptables,
1711 		z_x86_kernel_ptables);
1712 
1713 	/* Make a copy of the boot page tables created by gen_mmu.py */
1714 	ret = copy_page_table(domain->arch.ptables, z_x86_kernel_ptables, 0);
1715 	if (ret == 0) {
1716 		sys_slist_append(&x86_domain_list, &domain->arch.node);
1717 	}
1718 	k_spin_unlock(&x86_mmu_lock, key);
1719 
1720 	return ret;
1721 }
1722 
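/* Arch hook invoked when a partition is removed from a memory domain; the
 * partition's address range reverts to its original permissions in this
 * domain's page tables.
 */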
void arch_mem_domain_partition_remove(struct k_mem_domain *domain,
				      uint32_t partition_id)
{
	struct k_mem_partition *partition = &domain->partitions[partition_id];

	/* Reset the partition's region back to defaults */
	reset_region(domain->arch.ptables, (void *)partition->start,
		     partition->size);
}

/* Called on thread exit or when moving it to a different memory domain */
void arch_mem_domain_thread_remove(struct k_thread *thread)
{
	struct k_mem_domain *domain = thread->mem_domain_info.mem_domain;

	if ((thread->base.user_options & K_USER) == 0) {
		return;
	}

	if ((thread->base.thread_state & _THREAD_DEAD) == 0) {
		/* Thread is migrating to another memory domain and not
		 * exiting for good; we weren't called from
		 * z_thread_abort().  Resetting the stack region will
		 * take place in the forthcoming thread_add() call.
		 */
		return;
	}

	/* Restore permissions on the thread's stack area since it is no
	 * longer a member of the domain.
	 */
	reset_region(domain->arch.ptables, (void *)thread->stack_info.start,
		     thread->stack_info.size);
}

__pinned_func
void arch_mem_domain_partition_add(struct k_mem_domain *domain,
				   uint32_t partition_id)
{
	struct k_mem_partition *partition = &domain->partitions[partition_id];

	/* Update the page tables with the partition info */
	apply_region(domain->arch.ptables, (void *)partition->start,
		     partition->size, partition->attr | MMU_P);
}

/* Invoked from memory domain API calls, as well as during thread creation */
__pinned_func
void arch_mem_domain_thread_add(struct k_thread *thread)
{
	/* New memory domain we are being added to */
	struct k_mem_domain *domain = thread->mem_domain_info.mem_domain;
	/* This is only set for threads that were migrating from some other
	 * memory domain; for new threads this is NULL.
	 *
	 * Note that the NULL check on old_ptables must be done before any
	 * address translation or else (NULL + offset) != NULL.
	 */
	pentry_t *old_ptables = UINT_TO_POINTER(thread->arch.ptables);
	bool is_user = (thread->base.user_options & K_USER) != 0;
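	/* Only treat this as a migration if the thread is a user thread and
	 * already had a set of page tables assigned by a previous domain.
	 */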
	bool is_migration = (old_ptables != NULL) && is_user;

	/* Allow US access to the thread's stack in its new domain if
	 * we are migrating. If we are not migrating this is done in
	 * z_x86_current_stack_perms()
	 */
	if (is_migration) {
		old_ptables = z_mem_virt_addr(thread->arch.ptables);
		set_stack_perms(thread, domain->arch.ptables);
	}

	thread->arch.ptables = z_mem_phys_addr(domain->arch.ptables);
	LOG_DBG("set thread %p page tables to %p", thread,
		(void *)thread->arch.ptables);

	/* Check if we're doing a migration from a different memory domain
	 * and have to remove permissions from its old domain.
	 *
	 * XXX: The checks we have to do here and in
	 * arch_mem_domain_thread_remove() are clumsy; it may be worth looking
	 * into adding a specific arch_mem_domain_thread_migrate() API.
	 * See #29601
	 */
	if (is_migration) {
		reset_region(old_ptables, (void *)thread->stack_info.start,
			     thread->stack_info.size);
	}

#if !defined(CONFIG_X86_KPTI) && !defined(CONFIG_X86_COMMON_PAGE_TABLE)
	/* Need to switch to using these new page tables, in case we drop
	 * to user mode before we are ever context switched out.
	 * IPI takes care of this if the thread is currently running on some
	 * other CPU.
	 */
	if (thread == _current && thread->arch.ptables != z_x86_cr3_get()) {
		z_x86_cr3_set(thread->arch.ptables);
	}
#endif /* !CONFIG_X86_KPTI && !CONFIG_X86_COMMON_PAGE_TABLE */
}
#endif /* !CONFIG_X86_COMMON_PAGE_TABLE */

__pinned_func
int arch_mem_domain_max_partitions_get(void)
{
	return CONFIG_MAX_DOMAIN_PARTITIONS;
}

/* Invoked from z_x86_userspace_enter */
__pinned_func
void z_x86_current_stack_perms(void)
{
	/* Clear any previous context in the stack buffer to prevent
	 * unintentional data leakage.
	 */
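	/* The 0xAA fill value matches the pattern used when stacks are first
	 * initialized (CONFIG_INIT_STACKS), rather than zero, so the scrub
	 * stays consistent with stack usage analysis.
	 */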
	(void)memset((void *)_current->stack_info.start, 0xAA,
		     _current->stack_info.size - _current->stack_info.delta);

	/* Only now is it safe to grant access to the stack buffer since any
	 * previous context has been erased.
	 */
#ifdef CONFIG_X86_COMMON_PAGE_TABLE
	/* Re-run the swap page table update logic since we're entering user
	 * mode. This will grant stack and memory domain access if it wasn't
	 * set already (in which case this returns very quickly).
	 */
	z_x86_swap_update_common_page_table(_current);
#else
	/* Memory domain access is already programmed into the page tables.
	 * Need to enable access to this new user thread's stack buffer in
	 * its domain-specific page tables.
	 */
	set_stack_perms(_current, z_x86_thread_page_tables_get(_current));
#endif
}
#endif /* CONFIG_USERSPACE */

#ifdef CONFIG_ARCH_HAS_RESERVED_PAGE_FRAMES
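/* Flag every page frame intersecting [addr, addr + len) as reserved so that
 * the core memory management code never hands those frames out.
 */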
__boot_func
static void mark_addr_page_reserved(uintptr_t addr, size_t len)
{
	uintptr_t pos = ROUND_DOWN(addr, CONFIG_MMU_PAGE_SIZE);
	uintptr_t end = ROUND_UP(addr + len, CONFIG_MMU_PAGE_SIZE);

	for (; pos < end; pos += CONFIG_MMU_PAGE_SIZE) {
		if (!z_is_page_frame(pos)) {
			continue;
		}

		struct z_page_frame *pf = z_phys_to_page_frame(pos);

		pf->flags |= Z_PAGE_FRAME_RESERVED;
	}
}

__boot_func
void arch_reserved_pages_update(void)
{
#ifdef CONFIG_X86_PC_COMPATIBLE
	/*
	 * Ideally we would do E820 or similar enumeration to specifically
	 * identify all page frames which are reserved by the hardware or
	 * firmware, or use x86_memmap[] with Multiboot if available.
	 *
	 * For now, always reserve everything in the first megabyte of
	 * physical memory on PC-compatible platforms.
	 */
	mark_addr_page_reserved(0, MB(1));
#endif /* CONFIG_X86_PC_COMPATIBLE */

#ifdef CONFIG_X86_MEMMAP
	for (int i = 0; i < CONFIG_X86_MEMMAP_ENTRIES; i++) {
		struct x86_memmap_entry *entry = &x86_memmap[i];

		switch (entry->type) {
		case X86_MEMMAP_ENTRY_UNUSED:
			__fallthrough;
		case X86_MEMMAP_ENTRY_RAM:
			continue;

		case X86_MEMMAP_ENTRY_ACPI:
			__fallthrough;
		case X86_MEMMAP_ENTRY_NVS:
			__fallthrough;
		case X86_MEMMAP_ENTRY_DEFECTIVE:
			__fallthrough;
		default:
			/* Any type other than unused or RAM lands here:
			 * exit the switch and mark the entry's pages reserved.
			 */
			break;
		}

		mark_addr_page_reserved(entry->base, entry->length);
	}
#endif /* CONFIG_X86_MEMMAP */
}
#endif /* CONFIG_ARCH_HAS_RESERVED_PAGE_FRAMES */

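/* Translate a page-aligned virtual address to its physical address using the
 * current page tables. Returns 0 and stores the result in *phys if the page
 * is present, or -EFAULT if it is not mapped.
 */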
int arch_page_phys_get(void *virt, uintptr_t *phys)
{
	pentry_t pte = 0;
	int level, ret;

	__ASSERT(POINTER_TO_UINT(virt) % CONFIG_MMU_PAGE_SIZE == 0U,
		 "unaligned address %p to %s", virt, __func__);

	pentry_get(&level, &pte, z_x86_page_tables_get(), virt);

	if ((pte & MMU_P) != 0) {
		if (phys != NULL) {
			*phys = (uintptr_t)get_entry_phys(pte, PTE_LEVEL);
		}
		ret = 0;
	} else {
		/* Not mapped */
		ret = -EFAULT;
	}

	return ret;
}

#ifdef CONFIG_DEMAND_PAGING
#define PTE_MASK (paging_levels[PTE_LEVEL].mask)

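/* Evict a data page: rewrite its PTE so that the page is non-present and the
 * address bits instead encode the backing store location, allowing the page
 * fault handler to bring it back in later.
 */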
__pinned_func
void arch_mem_page_out(void *addr, uintptr_t location)
{
	pentry_t mask = PTE_MASK | MMU_P | MMU_A;

	/* Accessed bit set to guarantee the entry is not completely 0 in
	 * case of location value 0. A totally 0 PTE is un-mapped.
	 */
	range_map(addr, location, CONFIG_MMU_PAGE_SIZE, MMU_A, mask,
		  OPTION_FLUSH);
}

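/* Map a data page back in at the given physical frame. Only the Present bit
 * is set; Accessed and Dirty start out clear so that subsequent usage can be
 * tracked.
 */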
__pinned_func
void arch_mem_page_in(void *addr, uintptr_t phys)
{
	pentry_t mask = PTE_MASK | MMU_P | MMU_D | MMU_A;

	range_map(addr, phys, CONFIG_MMU_PAGE_SIZE, MMU_P, mask,
		  OPTION_FLUSH);
}

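/* Point the scratch page at an arbitrary physical frame, mapped read/write
 * and non-executable, so page contents can be copied to or from the backing
 * store without a permanent mapping.
 */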
__pinned_func
void arch_mem_scratch(uintptr_t phys)
{
	page_map_set(z_x86_page_tables_get(), Z_SCRATCH_PAGE,
		     phys | MMU_P | MMU_RW | MMU_XD, NULL, MASK_ALL,
		     OPTION_FLUSH);
}

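/* Return the raw PTE for addr, ORed across all active sets of page tables
 * when multiple sets exist, optionally clearing the Accessed bit as a side
 * effect. Returns ARCH_DATA_PAGE_NOT_MAPPED for un-mapped pages.
 */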
__pinned_func
uintptr_t arch_page_info_get(void *addr, uintptr_t *phys, bool clear_accessed)
{
	pentry_t all_pte, mask;
	uint32_t options;

	/* What to change, if anything, in the page_map_set() calls */
	if (clear_accessed) {
		mask = MMU_A;
		options = OPTION_FLUSH;
	} else {
		/* In this configuration page_map_set() just queries the
		 * page table and makes no changes
		 */
		mask = 0;
		options = 0U;
	}

	page_map_set(z_x86_kernel_ptables, addr, 0, &all_pte, mask, options);

	/* Un-mapped PTEs are completely zeroed. No need to report anything
	 * else in this case.
	 */
	if (all_pte == 0) {
		return ARCH_DATA_PAGE_NOT_MAPPED;
	}

#if defined(CONFIG_USERSPACE) && !defined(CONFIG_X86_COMMON_PAGE_TABLE)
	/* Don't bother looking at other page tables if non-present as we
	 * are not required to report accurate accessed/dirty in this case
	 * and all mappings are otherwise the same.
	 */
	if ((all_pte & MMU_P) != 0) {
		sys_snode_t *node;

		/* IRQs are locked, safe to do this */
		SYS_SLIST_FOR_EACH_NODE(&x86_domain_list, node) {
			pentry_t cur_pte;
			struct arch_mem_domain *domain =
				CONTAINER_OF(node, struct arch_mem_domain,
					     node);

			page_map_set(domain->ptables, addr, 0, &cur_pte,
				     mask, options | OPTION_USER);

			/* Logical OR of the relevant PTE in all page tables.
			 * The address/location and present state should be
			 * identical among them.
			 */
			all_pte |= cur_pte;
		}
	}
#endif /* CONFIG_USERSPACE && !CONFIG_X86_COMMON_PAGE_TABLE */

	/* NOTE: We are truncating the PTE on PAE systems, whose pentry_t
	 * is larger than a uintptr_t.
	 *
	 * We currently aren't required to report back XD state (bit 63), and
	 * Zephyr just doesn't support large physical memory on 32-bit
	 * systems; PAE was only implemented for XD support.
	 */
	if (phys != NULL) {
		*phys = (uintptr_t)get_entry_phys(all_pte, PTE_LEVEL);
	}

	/* We don't filter out any other bits in the PTE and the kernel
	 * ignores them. For the case of ARCH_DATA_PAGE_NOT_MAPPED,
	 * we use a bit which is never set in a real PTE (the PAT bit) in the
	 * current system.
	 *
	 * The other ARCH_DATA_PAGE_* macros are defined to their corresponding
	 * bits in the PTE.
	 */
	return (uintptr_t)all_pte;
}

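/* Report whether the data page at addr is paged in or paged out, storing
 * either its physical address or its backing store location in *location.
 * Returns ARCH_PAGE_LOCATION_BAD for un-mapped pages.
 */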
__pinned_func
enum arch_page_location arch_page_location_get(void *addr, uintptr_t *location)
{
	pentry_t pte;
	int level;

	/* TODO: since we only have to query the current set of page tables,
	 * could optimize this with recursive page table mapping
	 */
	pentry_get(&level, &pte, z_x86_page_tables_get(), addr);

	if (pte == 0) {
		/* Not mapped */
		return ARCH_PAGE_LOCATION_BAD;
	}

	__ASSERT(level == PTE_LEVEL, "bigpage found at %p", addr);
	*location = (uintptr_t)get_entry_phys(pte, PTE_LEVEL);

	if ((pte & MMU_P) != 0) {
		return ARCH_PAGE_LOCATION_PAGED_IN;
	} else {
		return ARCH_PAGE_LOCATION_PAGED_OUT;
	}
}

#ifdef CONFIG_X86_KPTI
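/* Report whether the page containing addr is genuinely present in the given
 * page tables: not un-mapped, not paged out, and not a PTE that was flipped
 * non-present for KPTI.
 */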
__pinned_func
bool z_x86_kpti_is_access_ok(void *addr, pentry_t *ptables)
{
	pentry_t pte;
	int level;

	pentry_get(&level, &pte, ptables, addr);

	/* Might as well also check if it's un-mapped; normally we don't
	 * fetch the PTE from the page tables until we are inside
	 * z_page_fault() and call arch_page_fault_status_get()
	 */
	if (level != PTE_LEVEL || pte == 0 || is_flipped_pte(pte)) {
		return false;
	}

	return true;
}
#endif /* CONFIG_X86_KPTI */
#endif /* CONFIG_DEMAND_PAGING */