1 /*
2  * Copyright (c) 2020 Intel Corporation
3  *
4  * SPDX-License-Identifier: Apache-2.0
5  *
6  * Routines for managing virtual address spaces
7  */
8 
9 #include <stdint.h>
10 #include <kernel_arch_interface.h>
11 #include <zephyr/spinlock.h>
12 #include <mmu.h>
13 #include <zephyr/init.h>
14 #include <kernel_internal.h>
15 #include <zephyr/syscall_handler.h>
16 #include <zephyr/toolchain.h>
17 #include <zephyr/linker/linker-defs.h>
18 #include <zephyr/sys/bitarray.h>
19 #include <zephyr/timing/timing.h>
20 #include <zephyr/logging/log.h>
21 LOG_MODULE_DECLARE(os, CONFIG_KERNEL_LOG_LEVEL);
22 
23 /*
24  * General terminology:
25  * - A page frame is a page-sized physical memory region in RAM. It is a
26  *   container where a data page may be placed. It is always referred to by
27  *   physical address. We have a convention of using uintptr_t for physical
28  *   addresses. We instantiate a struct z_page_frame to store metadata for
29  *   every page frame.
30  *
31  * - A data page is a page-sized region of data. It may exist in a page frame,
32  *   or be paged out to some backing store. Its location can always be looked
33  *   up in the CPU's page tables (or equivalent) by virtual address.
34  *   The data type will always be void * or in some cases uint8_t * when we
35  *   want to do pointer arithmetic.
36  */
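
/* Illustrative sketch (not part of the original file): how the terms above
 * relate in code, using the z_page_frame helpers declared in mmu.h. Guarded
 * out since it is an example only.
 */
#if 0
static void example_terminology(uintptr_t phys)
{
	/* A page frame is identified by physical address... */
	struct z_page_frame *pf = z_phys_to_page_frame(phys);

	/* ...and, if a data page currently occupies it, pf->addr is the
	 * virtual address of that data page.
	 */
	if (z_page_frame_is_mapped(pf)) {
		void *data_page = pf->addr;

		__ASSERT(z_page_frame_to_phys(pf) == phys, "round-trip mismatch");
		ARG_UNUSED(data_page);
	}
}
#endif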
37 
38 /* Spinlock to protect any globals in this file and serialize page table
39  * updates in arch code
40  */
41 struct k_spinlock z_mm_lock;
42 
43 /*
44  * General page frame management
45  */
46 
47 /* Database of all RAM page frames */
48 struct z_page_frame z_page_frames[Z_NUM_PAGE_FRAMES];
49 
50 #if __ASSERT_ON
51 /* Indicator that z_page_frames has been initialized; many of these APIs do
52  * not work before POST_KERNEL.
53  */
54 static bool page_frames_initialized;
55 #endif
56 
57 /* Add colors to page table dumps to indicate mapping type */
58 #define COLOR_PAGE_FRAMES	1
59 
60 #if COLOR_PAGE_FRAMES
61 #define ANSI_DEFAULT "\x1B" "[0m"
62 #define ANSI_RED     "\x1B" "[1;31m"
63 #define ANSI_GREEN   "\x1B" "[1;32m"
64 #define ANSI_YELLOW  "\x1B" "[1;33m"
65 #define ANSI_BLUE    "\x1B" "[1;34m"
66 #define ANSI_MAGENTA "\x1B" "[1;35m"
67 #define ANSI_CYAN    "\x1B" "[1;36m"
68 #define ANSI_GREY    "\x1B" "[1;90m"
69 
70 #define COLOR(x)	printk(_CONCAT(ANSI_, x))
71 #else
72 #define COLOR(x)	do { } while (false)
73 #endif
74 
75 /* LCOV_EXCL_START */
76 static void page_frame_dump(struct z_page_frame *pf)
77 {
78 	if (z_page_frame_is_reserved(pf)) {
79 		COLOR(CYAN);
80 		printk("R");
81 	} else if (z_page_frame_is_busy(pf)) {
82 		COLOR(MAGENTA);
83 		printk("B");
84 	} else if (z_page_frame_is_pinned(pf)) {
85 		COLOR(YELLOW);
86 		printk("P");
87 	} else if (z_page_frame_is_available(pf)) {
88 		COLOR(GREY);
89 		printk(".");
90 	} else if (z_page_frame_is_mapped(pf)) {
91 		COLOR(DEFAULT);
92 		printk("M");
93 	} else {
94 		COLOR(RED);
95 		printk("?");
96 	}
97 }
98 
99 void z_page_frames_dump(void)
100 {
101 	int column = 0;
102 
103 	__ASSERT(page_frames_initialized, "%s called too early", __func__);
104 	printk("Physical memory from 0x%lx to 0x%lx\n",
105 	       Z_PHYS_RAM_START, Z_PHYS_RAM_END);
106 
107 	for (int i = 0; i < Z_NUM_PAGE_FRAMES; i++) {
108 		struct z_page_frame *pf = &z_page_frames[i];
109 
110 		page_frame_dump(pf);
111 
112 		column++;
113 		if (column == 64) {
114 			column = 0;
115 			printk("\n");
116 		}
117 	}
118 
119 	COLOR(DEFAULT);
120 	if (column != 0) {
121 		printk("\n");
122 	}
123 }
124 /* LCOV_EXCL_STOP */
125 
126 #define VIRT_FOREACH(_base, _size, _pos) \
127 	for (_pos = _base; \
128 	     _pos < ((uint8_t *)_base + _size); _pos += CONFIG_MMU_PAGE_SIZE)
129 
130 #define PHYS_FOREACH(_base, _size, _pos) \
131 	for (_pos = _base; \
132 	     _pos < ((uintptr_t)_base + _size); _pos += CONFIG_MMU_PAGE_SIZE)
133 
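/* Illustrative sketch (not part of the original file): VIRT_FOREACH walks a
 * virtual region one page at a time, which is how k_mem_map()/k_mem_unmap()
 * below visit each page. The region argument is hypothetical.
 */
#if 0
static void example_virt_foreach(uint8_t *region, size_t size)
{
	uint8_t *pos;

	VIRT_FOREACH(region, size, pos) {
		/* pos advances by CONFIG_MMU_PAGE_SIZE each iteration */
		printk("page at %p\n", pos);
	}
}
#endif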
134 
135 /*
136  * Virtual address space management
137  *
138  * Call all of these functions with z_mm_lock held.
139  *
140  * Overall virtual memory map: When the kernel starts, it resides in
141  * virtual memory in the region Z_KERNEL_VIRT_START to
142  * Z_KERNEL_VIRT_END. Unused virtual memory past this, up to the limit
143  * noted by CONFIG_KERNEL_VM_SIZE may be used for runtime memory mappings.
144  *
145  * If CONFIG_ARCH_MAPS_ALL_RAM is set, we do not just map the kernel image,
146  * but have a mapping for all RAM in place. This is for special architectural
147  * purposes and does not otherwise affect page frame accounting or flags;
148  * the only guarantee is that such RAM mapping outside of the Zephyr image
149  * won't be disturbed by subsequent memory mapping calls.
150  *
151  * +--------------+ <- Z_VIRT_RAM_START
152  * | Undefined VM | <- May contain ancillary regions like x86_64's locore
153  * +--------------+ <- Z_KERNEL_VIRT_START (often == Z_VIRT_RAM_START)
154  * | Mapping for  |
155  * | main kernel  |
156  * | image        |
157  * |		  |
158  * |		  |
159  * +--------------+ <- Z_FREE_VM_START
160  * |              |
161  * | Unused,      |
162  * | Available VM |
163  * |              |
164  * |..............| <- mapping_pos (grows downward as more mappings are made)
165  * | Mapping      |
166  * +--------------+
167  * | Mapping      |
168  * +--------------+
169  * | ...          |
170  * +--------------+
171  * | Mapping      |
172  * +--------------+ <- mappings start here
173  * | Reserved     | <- special purpose virtual page(s) of size Z_VM_RESERVED
174  * +--------------+ <- Z_VIRT_RAM_END
175  */
176 
177 /* Bitmap of virtual addresses where one bit corresponds to one page.
178  * This is being used for virt_region_alloc() to figure out which
179  * region of virtual addresses can be used for memory mapping.
180  *
181  * Note that bit #0 is the highest address so that allocation is
182  * done in reverse from highest address.
183  */
184 SYS_BITARRAY_DEFINE_STATIC(virt_region_bitmap,
185 			   CONFIG_KERNEL_VM_SIZE / CONFIG_MMU_PAGE_SIZE);
186 
187 static bool virt_region_inited;
188 
189 #define Z_VIRT_REGION_START_ADDR	Z_FREE_VM_START
190 #define Z_VIRT_REGION_END_ADDR		(Z_VIRT_RAM_END - Z_VM_RESERVED)
191 
192 static inline uintptr_t virt_from_bitmap_offset(size_t offset, size_t size)
193 {
194 	return POINTER_TO_UINT(Z_VIRT_RAM_END)
195 	       - (offset * CONFIG_MMU_PAGE_SIZE) - size;
196 }
197 
198 static inline size_t virt_to_bitmap_offset(void *vaddr, size_t size)
199 {
200 	return (POINTER_TO_UINT(Z_VIRT_RAM_END)
201 		- POINTER_TO_UINT(vaddr) - size) / CONFIG_MMU_PAGE_SIZE;
202 }
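
/* Worked example (illustrative only): because bit #0 is the *highest* page,
 * a one-page region at bitmap offset 0 sits one page below Z_VIRT_RAM_END,
 * and the two conversions above are inverses of each other.
 */
#if 0
static void example_bitmap_offset_round_trip(void)
{
	size_t offset = 0;
	size_t size = CONFIG_MMU_PAGE_SIZE;
	uintptr_t va = virt_from_bitmap_offset(offset, size);

	__ASSERT(va == POINTER_TO_UINT(Z_VIRT_RAM_END) - CONFIG_MMU_PAGE_SIZE,
		 "offset 0 should be the highest page");
	__ASSERT(virt_to_bitmap_offset(UINT_TO_POINTER(va), size) == offset,
		 "conversions should round-trip");
}
#endif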
203 
204 static void virt_region_init(void)
205 {
206 	size_t offset, num_bits;
207 
208 	/* There are regions where we should never map via
209 	 * k_mem_map() and z_phys_map(). Mark them as
210 	 * already allocated so they will never be used.
211 	 */
212 
213 	if (Z_VM_RESERVED > 0) {
214 		/* Mark reserved region at end of virtual address space */
215 		num_bits = Z_VM_RESERVED / CONFIG_MMU_PAGE_SIZE;
216 		(void)sys_bitarray_set_region(&virt_region_bitmap,
217 					      num_bits, 0);
218 	}
219 
220 	/* Mark all bits up to Z_FREE_VM_START as allocated */
221 	num_bits = POINTER_TO_UINT(Z_FREE_VM_START)
222 		   - POINTER_TO_UINT(Z_VIRT_RAM_START);
223 	offset = virt_to_bitmap_offset(Z_VIRT_RAM_START, num_bits);
224 	num_bits /= CONFIG_MMU_PAGE_SIZE;
225 	(void)sys_bitarray_set_region(&virt_region_bitmap,
226 				      num_bits, offset);
227 
228 	virt_region_inited = true;
229 }
230 
231 static void virt_region_free(void *vaddr, size_t size)
232 {
233 	size_t offset, num_bits;
234 	uint8_t *vaddr_u8 = (uint8_t *)vaddr;
235 
236 	if (unlikely(!virt_region_inited)) {
237 		virt_region_init();
238 	}
239 
240 #ifndef CONFIG_KERNEL_DIRECT_MAP
241 	__ASSERT((vaddr_u8 >= Z_VIRT_REGION_START_ADDR)
242 		 && ((vaddr_u8 + size - 1) < Z_VIRT_REGION_END_ADDR),
243 		 "invalid virtual address region %p (%zu)", vaddr_u8, size);
244 #endif
245 	if (!((vaddr_u8 >= Z_VIRT_REGION_START_ADDR)
246 	      && ((vaddr_u8 + size - 1) < Z_VIRT_REGION_END_ADDR))) {
247 		return;
248 	}
249 
250 	offset = virt_to_bitmap_offset(vaddr, size);
251 	num_bits = size / CONFIG_MMU_PAGE_SIZE;
252 	(void)sys_bitarray_free(&virt_region_bitmap, num_bits, offset);
253 }
254 
255 static void *virt_region_alloc(size_t size, size_t align)
256 {
257 	uintptr_t dest_addr;
258 	size_t alloc_size;
259 	size_t offset;
260 	size_t num_bits;
261 	int ret;
262 
263 	if (unlikely(!virt_region_inited)) {
264 		virt_region_init();
265 	}
266 
267 	/* Possibly request more pages to ensure we can get an aligned virtual address */
268 	num_bits = (size + align - CONFIG_MMU_PAGE_SIZE) / CONFIG_MMU_PAGE_SIZE;
269 	alloc_size = num_bits * CONFIG_MMU_PAGE_SIZE;
270 	ret = sys_bitarray_alloc(&virt_region_bitmap, num_bits, &offset);
271 	if (ret != 0) {
272 		LOG_ERR("insufficient virtual address space (requested %zu)",
273 			size);
274 		return NULL;
275 	}
276 
277 	/* Remember that bit #0 in bitmap corresponds to the highest
278 	 * virtual address. So here we need to go downwards (backwards?)
279 	 * to get the starting address of the allocated region.
280 	 */
281 	dest_addr = virt_from_bitmap_offset(offset, alloc_size);
282 
283 	if (alloc_size > size) {
284 		uintptr_t aligned_dest_addr = ROUND_UP(dest_addr, align);
285 
286 		/* Here is the memory organization when trying to get an aligned
287 		 * virtual address:
288 		 *
289 		 * +--------------+ <- Z_VIRT_RAM_START
290 		 * | Undefined VM |
291 		 * +--------------+ <- Z_KERNEL_VIRT_START (often == Z_VIRT_RAM_START)
292 		 * | Mapping for  |
293 		 * | main kernel  |
294 		 * | image        |
295 		 * |		  |
296 		 * |		  |
297 		 * +--------------+ <- Z_FREE_VM_START
298 		 * | ...          |
299 		 * +==============+ <- dest_addr
300 		 * | Unused       |
301 		 * |..............| <- aligned_dest_addr
302 		 * |              |
303 		 * | Aligned      |
304 		 * | Mapping      |
305 		 * |              |
306 		 * |..............| <- aligned_dest_addr + size
307 		 * | Unused       |
308 		 * +==============+ <- offset from Z_VIRT_RAM_END == dest_addr + alloc_size
309 		 * | ...          |
310 		 * +--------------+
311 		 * | Mapping      |
312 		 * +--------------+
313 		 * | Reserved     |
314 		 * +--------------+ <- Z_VIRT_RAM_END
315 		 */
316 
317 		/* Free the two unused regions */
318 		virt_region_free(UINT_TO_POINTER(dest_addr),
319 				 aligned_dest_addr - dest_addr);
320 		if (((dest_addr + alloc_size) - (aligned_dest_addr + size)) > 0) {
321 			virt_region_free(UINT_TO_POINTER(aligned_dest_addr + size),
322 					 (dest_addr + alloc_size) - (aligned_dest_addr + size));
323 		}
324 
325 		dest_addr = aligned_dest_addr;
326 	}
327 
328 	/* Need to make sure this does not step into kernel memory */
329 	if (dest_addr < POINTER_TO_UINT(Z_VIRT_REGION_START_ADDR)) {
330 		(void)sys_bitarray_free(&virt_region_bitmap, num_bits, offset);
331 		return NULL;
332 	}
333 
334 	return UINT_TO_POINTER(dest_addr);
335 }
336 
337 /*
338  * Free page frames management
339  *
340  * Call all of these functions with z_mm_lock held.
341  */
342 
343 /* Linked list of unused and available page frames.
344  *
345  * TODO: This is very simple and treats all free page frames as being equal.
346  * However, there are use-cases to consolidate free pages such that entire
347  * SRAM banks can be switched off to save power, and so obtaining free pages
348  * may require a more complex ontology which prefers page frames in RAM banks
349  * which are still active.
350  *
351  * This implies in the future there may be multiple slists managing physical
352  * pages. Each page frame will still just have one snode link.
353  */
354 static sys_slist_t free_page_frame_list;
355 
356 /* Number of unused and available free page frames */
357 size_t z_free_page_count;
358 
359 #define PF_ASSERT(pf, expr, fmt, ...) \
360 	__ASSERT(expr, "page frame 0x%lx: " fmt, z_page_frame_to_phys(pf), \
361 		 ##__VA_ARGS__)
362 
363 /* Get an unused page frame (any one will do), or NULL if there are none */
364 static struct z_page_frame *free_page_frame_list_get(void)
365 {
366 	sys_snode_t *node;
367 	struct z_page_frame *pf = NULL;
368 
369 	node = sys_slist_get(&free_page_frame_list);
370 	if (node != NULL) {
371 		z_free_page_count--;
372 		pf = CONTAINER_OF(node, struct z_page_frame, node);
373 		PF_ASSERT(pf, z_page_frame_is_available(pf),
374 			 "unavailable but somehow on free list");
375 	}
376 
377 	return pf;
378 }
379 
380 /* Release a page frame back into the list of free pages */
381 static void free_page_frame_list_put(struct z_page_frame *pf)
382 {
383 	PF_ASSERT(pf, z_page_frame_is_available(pf),
384 		 "unavailable page put on free list");
385 	/* The structure is packed, which ensures that this is true */
386 	void *node = pf;
387 
388 	sys_slist_append(&free_page_frame_list, node);
389 	z_free_page_count++;
390 }
391 
392 static void free_page_frame_list_init(void)
393 {
394 	sys_slist_init(&free_page_frame_list);
395 }
396 
397 static void page_frame_free_locked(struct z_page_frame *pf)
398 {
399 	pf->flags = 0;
400 	free_page_frame_list_put(pf);
401 }
402 
403 /*
404  * Memory Mapping
405  */
406 
407 /* Called after the frame is mapped in the arch layer, to update our
408  * local ontology (and do some assertions while we're at it)
409  */
410 static void frame_mapped_set(struct z_page_frame *pf, void *addr)
411 {
412 	PF_ASSERT(pf, !z_page_frame_is_reserved(pf),
413 		  "attempted to map a reserved page frame");
414 
415 	/* We do allow multiple mappings for pinned page frames
416 	 * since we will never need to reverse map them.
417 	 * This is uncommon; use-cases are for things like the
418 	 * Zephyr equivalent of VDSOs.
419 	 */
420 	PF_ASSERT(pf, !z_page_frame_is_mapped(pf) || z_page_frame_is_pinned(pf),
421 		 "non-pinned and already mapped to %p", pf->addr);
422 
423 	pf->flags |= Z_PAGE_FRAME_MAPPED;
424 	pf->addr = addr;
425 }
426 
427 /* LCOV_EXCL_START */
428 /* Go through page frames to find the physical address mapped
429  * by a virtual address.
430  *
431  * @param[in]  virt Virtual Address
432  * @param[out] phys Physical address mapped to the input virtual address
433  *                  if such mapping exists.
434  *
435  * @retval 0 if mapping is found and valid
436  * @retval -EFAULT if virtual address is not mapped
437  */
438 static int virt_to_page_frame(void *virt, uintptr_t *phys)
439 {
440 	uintptr_t paddr;
441 	struct z_page_frame *pf;
442 	int ret = -EFAULT;
443 
444 	Z_PAGE_FRAME_FOREACH(paddr, pf) {
445 		if (z_page_frame_is_mapped(pf)) {
446 			if (virt == pf->addr) {
447 				ret = 0;
448 				*phys = z_page_frame_to_phys(pf);
449 				break;
450 			}
451 		}
452 	}
453 
454 	return ret;
455 }
456 /* LCOV_EXCL_STOP */
457 
458 __weak FUNC_ALIAS(virt_to_page_frame, arch_page_phys_get, int);
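
/* Illustrative sketch (not part of the original file): arch_page_phys_get()
 * is how callers probe a virtual address, exactly as k_mem_unmap() does below
 * to verify its guard pages. A zero return means the address is mapped and
 * optionally yields the physical address; -EFAULT means it is unmapped.
 */
#if 0
static bool example_is_mapped(void *virt, uintptr_t *phys_out)
{
	return arch_page_phys_get(virt, phys_out) == 0;
}
#endif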
459 
460 #ifdef CONFIG_DEMAND_PAGING
461 static int page_frame_prepare_locked(struct z_page_frame *pf, bool *dirty_ptr,
462 				     bool page_in, uintptr_t *location_ptr);
463 
464 static inline void do_backing_store_page_in(uintptr_t location);
465 static inline void do_backing_store_page_out(uintptr_t location);
466 #endif /* CONFIG_DEMAND_PAGING */
467 
468 /* Allocate a free page frame, and map it to a specified virtual address
469  *
470  * TODO: Add optional support for copy-on-write mappings to a zero page instead
471  * of allocating, in which case page frames will be allocated lazily as
472  * the mappings to the zero page get touched. This will avoid expensive
473  * page-ins as memory is mapped and physical RAM or backing store storage will
474  * not be used if the mapped memory is unused. The cost is an empty physical
475  * page of zeroes.
476  */
477 static int map_anon_page(void *addr, uint32_t flags)
478 {
479 	struct z_page_frame *pf;
480 	uintptr_t phys;
481 	bool lock = (flags & K_MEM_MAP_LOCK) != 0U;
482 	bool uninit = (flags & K_MEM_MAP_UNINIT) != 0U;
483 
484 	pf = free_page_frame_list_get();
485 	if (pf == NULL) {
486 #ifdef CONFIG_DEMAND_PAGING
487 		uintptr_t location;
488 		bool dirty;
489 		int ret;
490 
491 		pf = k_mem_paging_eviction_select(&dirty);
492 		__ASSERT(pf != NULL, "failed to get a page frame");
493 		LOG_DBG("evicting %p at 0x%lx", pf->addr,
494 			z_page_frame_to_phys(pf));
495 		ret = page_frame_prepare_locked(pf, &dirty, false, &location);
496 		if (ret != 0) {
497 			return -ENOMEM;
498 		}
499 		if (dirty) {
500 			do_backing_store_page_out(location);
501 		}
502 		pf->flags = 0;
503 #else
504 		return -ENOMEM;
505 #endif /* CONFIG_DEMAND_PAGING */
506 	}
507 
508 	phys = z_page_frame_to_phys(pf);
509 	arch_mem_map(addr, phys, CONFIG_MMU_PAGE_SIZE, flags | K_MEM_CACHE_WB);
510 
511 	if (lock) {
512 		pf->flags |= Z_PAGE_FRAME_PINNED;
513 	}
514 	frame_mapped_set(pf, addr);
515 
516 	LOG_DBG("memory mapping anon page %p -> 0x%lx", addr, phys);
517 
518 	if (!uninit) {
519 		/* If we later implement mappings to a copy-on-write
520 		 * zero page, won't need this step
521 		 */
522 		memset(addr, 0, CONFIG_MMU_PAGE_SIZE);
523 	}
524 
525 	return 0;
526 }
527 
528 void *k_mem_map(size_t size, uint32_t flags)
529 {
530 	uint8_t *dst;
531 	size_t total_size;
532 	int ret;
533 	k_spinlock_key_t key;
534 	uint8_t *pos;
535 
536 	__ASSERT(!(((flags & K_MEM_PERM_USER) != 0U) &&
537 		   ((flags & K_MEM_MAP_UNINIT) != 0U)),
538 		 "user access to anonymous uninitialized pages is forbidden");
539 	__ASSERT(size % CONFIG_MMU_PAGE_SIZE == 0U,
540 		 "unaligned size %zu passed to %s", size, __func__);
541 	__ASSERT(size != 0, "zero sized memory mapping");
542 	__ASSERT(page_frames_initialized, "%s called too early", __func__);
543 	__ASSERT((flags & K_MEM_CACHE_MASK) == 0U,
544 		 "%s does not support explicit cache settings", __func__);
545 
546 	key = k_spin_lock(&z_mm_lock);
547 
548 	/* Need extra for the guard pages (before and after) which we
549 	 * won't map.
550 	 */
551 	total_size = size + CONFIG_MMU_PAGE_SIZE * 2;
552 
553 	dst = virt_region_alloc(total_size, CONFIG_MMU_PAGE_SIZE);
554 	if (dst == NULL) {
555 		/* Address space has no free region */
556 		goto out;
557 	}
558 
559 	/* Unmap both guard pages to make sure accessing them
560 	 * will generate fault.
561 	 */
562 	arch_mem_unmap(dst, CONFIG_MMU_PAGE_SIZE);
563 	arch_mem_unmap(dst + CONFIG_MMU_PAGE_SIZE + size,
564 		       CONFIG_MMU_PAGE_SIZE);
565 
566 	/* Skip over the "before" guard page in returned address. */
567 	dst += CONFIG_MMU_PAGE_SIZE;
568 
569 	VIRT_FOREACH(dst, size, pos) {
570 		ret = map_anon_page(pos, flags);
571 
572 		if (ret != 0) {
573 			/* TODO: call k_mem_unmap(dst, pos - dst)  when
574 			 * implemented in #28990 and release any guard virtual
575 			 * page as well.
576 			 */
577 			dst = NULL;
578 			goto out;
579 		}
580 	}
581 out:
582 	k_spin_unlock(&z_mm_lock, key);
583 	return dst;
584 }
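
/* Illustrative usage sketch (not part of the original file): map two pages of
 * zeroed, kernel-RW anonymous memory. The returned pointer already skips the
 * preceding guard page; touching memory just before or just after the mapping
 * faults. Size must be a multiple of CONFIG_MMU_PAGE_SIZE, as asserted above.
 */
#if 0
static void example_anon_mapping(void)
{
	uint8_t *buf = k_mem_map(2 * CONFIG_MMU_PAGE_SIZE, K_MEM_PERM_RW);

	if (buf == NULL) {
		return;	/* out of page frames or virtual address space */
	}

	buf[0] = 0xaa;	/* pages are zero-filled unless K_MEM_MAP_UNINIT */

	k_mem_unmap(buf, 2 * CONFIG_MMU_PAGE_SIZE);
}
#endif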
585 
586 void k_mem_unmap(void *addr, size_t size)
587 {
588 	uintptr_t phys;
589 	uint8_t *pos;
590 	struct z_page_frame *pf;
591 	k_spinlock_key_t key;
592 	size_t total_size;
593 	int ret;
594 
595 	/* Need space for the "before" guard page */
596 	__ASSERT_NO_MSG(POINTER_TO_UINT(addr) >= CONFIG_MMU_PAGE_SIZE);
597 
598 	/* Make sure address range is still valid after accounting
599 	 * for two guard pages.
600 	 */
601 	pos = (uint8_t *)addr - CONFIG_MMU_PAGE_SIZE;
602 	z_mem_assert_virtual_region(pos, size + (CONFIG_MMU_PAGE_SIZE * 2));
603 
604 	key = k_spin_lock(&z_mm_lock);
605 
606 	/* Check if both guard pages are unmapped.
607 	 * Bail if not, as this is probably a region not mapped
608 	 * using k_mem_map().
609 	 */
610 	pos = addr;
611 	ret = arch_page_phys_get(pos - CONFIG_MMU_PAGE_SIZE, NULL);
612 	if (ret == 0) {
613 		__ASSERT(ret != 0,
614 			 "%s: cannot find preceding guard page for (%p, %zu)",
615 			 __func__, addr, size);
616 		goto out;
617 	}
618 
619 	ret = arch_page_phys_get(pos + size, NULL);
620 	if (ret == 0) {
621 		__ASSERT(ret != 0,
622 			 "%s: cannot find succeeding guard page for (%p, %zu)",
623 			 __func__, addr, size);
624 		goto out;
625 	}
626 
627 	VIRT_FOREACH(addr, size, pos) {
628 		ret = arch_page_phys_get(pos, &phys);
629 
630 		__ASSERT(ret == 0,
631 			 "%s: cannot unmap an unmapped address %p",
632 			 __func__, pos);
633 		if (ret != 0) {
634 			/* Found an address not mapped. Do not continue. */
635 			goto out;
636 		}
637 
638 		__ASSERT(z_is_page_frame(phys),
639 			 "%s: 0x%lx is not a page frame", __func__, phys);
640 		if (!z_is_page_frame(phys)) {
641 			/* Physical address has no corresponding page frame
642 			 * description in the page frame array.
643 			 * This should not happen. Do not continue.
644 			 */
645 			goto out;
646 		}
647 
648 		/* Grab the corresponding page frame from physical address */
649 		pf = z_phys_to_page_frame(phys);
650 
651 		__ASSERT(z_page_frame_is_mapped(pf),
652 			 "%s: 0x%lx is not a mapped page frame", __func__, phys);
653 		if (!z_page_frame_is_mapped(pf)) {
654 			/* Page frame is not marked mapped.
655 			 * This should not happen. Do not continue.
656 			 */
657 			goto out;
658 		}
659 
660 		arch_mem_unmap(pos, CONFIG_MMU_PAGE_SIZE);
661 
662 		/* Put the page frame back into free list */
663 		page_frame_free_locked(pf);
664 	}
665 
666 	/* There are guard pages just before and after the mapped
667 	 * region. So we also need to free them from the bitmap.
668 	 */
669 	pos = (uint8_t *)addr - CONFIG_MMU_PAGE_SIZE;
670 	total_size = size + CONFIG_MMU_PAGE_SIZE * 2;
671 	virt_region_free(pos, total_size);
672 
673 out:
674 	k_spin_unlock(&z_mm_lock, key);
675 }
676 
677 size_t k_mem_free_get(void)
678 {
679 	size_t ret;
680 	k_spinlock_key_t key;
681 
682 	__ASSERT(page_frames_initialized, "%s called too early", __func__);
683 
684 	key = k_spin_lock(&z_mm_lock);
685 #ifdef CONFIG_DEMAND_PAGING
686 	if (z_free_page_count > CONFIG_DEMAND_PAGING_PAGE_FRAMES_RESERVE) {
687 		ret = z_free_page_count - CONFIG_DEMAND_PAGING_PAGE_FRAMES_RESERVE;
688 	} else {
689 		ret = 0;
690 	}
691 #else
692 	ret = z_free_page_count;
693 #endif
694 	k_spin_unlock(&z_mm_lock, key);
695 
696 	return ret * (size_t)CONFIG_MMU_PAGE_SIZE;
697 }
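
/* Illustrative sketch (not part of the original file): k_mem_free_get()
 * reports bytes of mappable memory (already excluding the demand paging
 * reserve when enabled), so round down to a page multiple before mapping.
 */
#if 0
static void *example_map_some_free_ram(void)
{
	size_t sz = ROUND_DOWN(k_mem_free_get() / 2, CONFIG_MMU_PAGE_SIZE);

	return (sz > 0) ? k_mem_map(sz, K_MEM_PERM_RW) : NULL;
}
#endif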
698 
699 /* Get the default virtual region alignment, here the default MMU page size
700  *
701  * @param[in] phys Physical address of region to be mapped, aligned to MMU_PAGE_SIZE
702  * @param[in] size Size of region to be mapped, aligned to MMU_PAGE_SIZE
703  *
704  * @retval alignment to apply on the virtual address of this region
705  */
706 static size_t virt_region_align(uintptr_t phys, size_t size)
707 {
708 	ARG_UNUSED(phys);
709 	ARG_UNUSED(size);
710 
711 	return CONFIG_MMU_PAGE_SIZE;
712 }
713 
714 __weak FUNC_ALIAS(virt_region_align, arch_virt_region_align, size_t);
715 
716 /* This may be called from arch early boot code before z_cstart() is invoked.
717  * Data will be copied and BSS zeroed, but this function must not rely on
718  * any initialization functions having been called beforehand to work correctly.
719  */
720 void z_phys_map(uint8_t **virt_ptr, uintptr_t phys, size_t size, uint32_t flags)
721 {
722 	uintptr_t aligned_phys, addr_offset;
723 	size_t aligned_size, align_boundary;
724 	k_spinlock_key_t key;
725 	uint8_t *dest_addr;
726 	size_t num_bits;
727 	size_t offset;
728 
729 #ifndef CONFIG_KERNEL_DIRECT_MAP
730 	__ASSERT(!(flags & K_MEM_DIRECT_MAP), "The direct-map is not enabled");
731 #endif
732 	addr_offset = k_mem_region_align(&aligned_phys, &aligned_size,
733 					 phys, size,
734 					 CONFIG_MMU_PAGE_SIZE);
735 	__ASSERT(aligned_size != 0U, "0-length mapping at 0x%lx", aligned_phys);
736 	__ASSERT(aligned_phys < (aligned_phys + (aligned_size - 1)),
737 		 "wraparound for physical address 0x%lx (size %zu)",
738 		 aligned_phys, aligned_size);
739 
740 	align_boundary = arch_virt_region_align(aligned_phys, aligned_size);
741 
742 	key = k_spin_lock(&z_mm_lock);
743 	if (flags & K_MEM_DIRECT_MAP) {
744 		dest_addr = (uint8_t *)aligned_phys;
745 		/* Reserve from the virtual memory space */
746 		if (!(dest_addr + aligned_size < Z_VIRT_RAM_START ||
747 		    dest_addr > Z_VIRT_RAM_END)) {
748 			num_bits = aligned_size / CONFIG_MMU_PAGE_SIZE;
749 			offset = virt_to_bitmap_offset(dest_addr, aligned_size);
750 			if (sys_bitarray_test_and_set_region(
751 			    &virt_region_bitmap, num_bits, offset, true))
752 				goto fail;
753 		}
754 	} else {
755 		/* Obtain an appropriately sized chunk of virtual memory */
756 		dest_addr = virt_region_alloc(aligned_size, align_boundary);
757 		if (!dest_addr) {
758 			goto fail;
759 		}
760 	}
761 
762 	/* If this fails there's something amiss with virt_region_alloc() */
763 	__ASSERT((uintptr_t)dest_addr <
764 		 ((uintptr_t)dest_addr + (size - 1)),
765 		 "wraparound for virtual address %p (size %zu)",
766 		 dest_addr, size);
767 
768 	LOG_DBG("arch_mem_map(%p, 0x%lx, %zu, %x) offset %lu", dest_addr,
769 		aligned_phys, aligned_size, flags, addr_offset);
770 
771 	arch_mem_map(dest_addr, aligned_phys, aligned_size, flags);
772 	k_spin_unlock(&z_mm_lock, key);
773 
774 	*virt_ptr = dest_addr + addr_offset;
775 	return;
776 fail:
777 	/* May re-visit this in the future, but for now running out of
778 	 * virtual address space or failing the arch_mem_map() call is
779 	 * an unrecoverable situation.
780 	 *
781 	 * Other problems not related to resource exhaustion we leave as
782 	 * assertions since they are clearly programming mistakes.
783 	 */
784 	LOG_ERR("memory mapping 0x%lx (size %zu, flags 0x%x) failed",
785 		phys, size, flags);
786 	k_panic();
787 }
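
/* Illustrative usage sketch (not part of the original file): map a
 * hypothetical 4 KiB MMIO region uncached for register access. The physical
 * address is made up for the example.
 */
#if 0
static void example_mmio_map(void)
{
	uint8_t *regs;

	z_phys_map(&regs, 0xABCD0000UL, 0x1000,
		   K_MEM_PERM_RW | K_MEM_CACHE_NONE);

	*(volatile uint32_t *)regs = 1U;	/* poke a (hypothetical) register */

	z_phys_unmap(regs, 0x1000);
}
#endif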
788 
789 void z_phys_unmap(uint8_t *virt, size_t size)
790 {
791 	uintptr_t aligned_virt, addr_offset;
792 	size_t aligned_size;
793 	k_spinlock_key_t key;
794 
795 	addr_offset = k_mem_region_align(&aligned_virt, &aligned_size,
796 					 POINTER_TO_UINT(virt), size,
797 					 CONFIG_MMU_PAGE_SIZE);
798 	__ASSERT(aligned_size != 0U, "0-length mapping at 0x%lx", aligned_virt);
799 	__ASSERT(aligned_virt < (aligned_virt + (aligned_size - 1)),
800 		 "wraparound for virtual address 0x%lx (size %zu)",
801 		 aligned_virt, aligned_size);
802 
803 	key = k_spin_lock(&z_mm_lock);
804 
805 	LOG_DBG("arch_mem_unmap(0x%lx, %zu) offset %lu",
806 		aligned_virt, aligned_size, addr_offset);
807 
808 	arch_mem_unmap(UINT_TO_POINTER(aligned_virt), aligned_size);
809 	virt_region_free(UINT_TO_POINTER(aligned_virt), aligned_size);
810 	k_spin_unlock(&z_mm_lock, key);
811 }
812 
813 /*
814  * Miscellaneous
815  */
816 
817 size_t k_mem_region_align(uintptr_t *aligned_addr, size_t *aligned_size,
818 			  uintptr_t addr, size_t size, size_t align)
819 {
820 	size_t addr_offset;
821 
822 	/* The actual mapped region must be page-aligned. Round down the
823 	 * physical address and pad the region size appropriately
824 	 */
825 	*aligned_addr = ROUND_DOWN(addr, align);
826 	addr_offset = addr - *aligned_addr;
827 	*aligned_size = ROUND_UP(size + addr_offset, align);
828 
829 	return addr_offset;
830 }
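
/* Worked example (illustrative only), assuming a 4 KiB alignment: address
 * 0x1234 with size 0x100 rounds down to aligned_addr 0x1000, pads up to
 * aligned_size 0x1000, and returns 0x234, the offset of the original address
 * within the aligned region.
 */
#if 0
static void example_region_align(void)
{
	uintptr_t aligned_addr;
	size_t aligned_size;
	size_t off = k_mem_region_align(&aligned_addr, &aligned_size,
					0x1234, 0x100, 0x1000);

	__ASSERT((aligned_addr == 0x1000) && (aligned_size == 0x1000) &&
		 (off == 0x234), "unexpected alignment result");
}
#endif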
831 
832 #if defined(CONFIG_LINKER_USE_BOOT_SECTION) || defined(CONFIG_LINKER_USE_PINNED_SECTION)
833 static void mark_linker_section_pinned(void *start_addr, void *end_addr,
834 				       bool pin)
835 {
836 	struct z_page_frame *pf;
837 	uint8_t *addr;
838 
839 	uintptr_t pinned_start = ROUND_DOWN(POINTER_TO_UINT(start_addr),
840 					    CONFIG_MMU_PAGE_SIZE);
841 	uintptr_t pinned_end = ROUND_UP(POINTER_TO_UINT(end_addr),
842 					CONFIG_MMU_PAGE_SIZE);
843 	size_t pinned_size = pinned_end - pinned_start;
844 
845 	VIRT_FOREACH(UINT_TO_POINTER(pinned_start), pinned_size, addr)
846 	{
847 		pf = z_phys_to_page_frame(Z_BOOT_VIRT_TO_PHYS(addr));
848 		frame_mapped_set(pf, addr);
849 
850 		if (pin) {
851 			pf->flags |= Z_PAGE_FRAME_PINNED;
852 		} else {
853 			pf->flags &= ~Z_PAGE_FRAME_PINNED;
854 		}
855 	}
856 }
857 #endif /* CONFIG_LINKER_USE_BOOT_SECTION || CONFIG_LINKER_USE_PINNED_SECTION */
858 
859 void z_mem_manage_init(void)
860 {
861 	uintptr_t phys;
862 	uint8_t *addr;
863 	struct z_page_frame *pf;
864 	k_spinlock_key_t key = k_spin_lock(&z_mm_lock);
865 
866 	free_page_frame_list_init();
867 
868 	ARG_UNUSED(addr);
869 
870 #ifdef CONFIG_ARCH_HAS_RESERVED_PAGE_FRAMES
871 	/* If some page frames are unavailable for use as memory, arch
872 	 * code will mark Z_PAGE_FRAME_RESERVED in their flags
873 	 */
874 	arch_reserved_pages_update();
875 #endif /* CONFIG_ARCH_HAS_RESERVED_PAGE_FRAMES */
876 
877 #ifdef CONFIG_LINKER_GENERIC_SECTIONS_PRESENT_AT_BOOT
878 	/* All pages composing the Zephyr image are mapped at boot in a
879 	 * predictable way. This can change at runtime.
880 	 */
881 	VIRT_FOREACH(Z_KERNEL_VIRT_START, Z_KERNEL_VIRT_SIZE, addr)
882 	{
883 		pf = z_phys_to_page_frame(Z_BOOT_VIRT_TO_PHYS(addr));
884 		frame_mapped_set(pf, addr);
885 
886 		/* TODO: for now we pin the whole Zephyr image. Demand paging
887 		 * is currently tested with anonymously-mapped pages, which are
888 		 * not pinned.
889 		 *
890 		 * We will need to setup linker regions for a subset of kernel
891 		 * code/data pages which are pinned in memory and
892 		 * may not be evicted. This will contain critical CPU data
893 		 * structures, and any code used to perform page fault
894 		 * handling, page-ins, etc.
895 		 */
896 		pf->flags |= Z_PAGE_FRAME_PINNED;
897 	}
898 #endif /* CONFIG_LINKER_GENERIC_SECTIONS_PRESENT_AT_BOOT */
899 
900 #ifdef CONFIG_LINKER_USE_BOOT_SECTION
901 	/* Pin the boot section to prevent it from being swapped out during the
902 	 * boot process. It will be un-pinned once the boot process completes.
903 	 */
904 	mark_linker_section_pinned(lnkr_boot_start, lnkr_boot_end, true);
905 #endif
906 
907 #ifdef CONFIG_LINKER_USE_PINNED_SECTION
908 	/* Pin the page frames corresponding to the pinned symbols */
909 	mark_linker_section_pinned(lnkr_pinned_start, lnkr_pinned_end, true);
910 #endif
911 
912 	/* Any remaining pages that aren't mapped, reserved, or pinned get
913 	 * added to the free pages list
914 	 */
915 	Z_PAGE_FRAME_FOREACH(phys, pf) {
916 		if (z_page_frame_is_available(pf)) {
917 			free_page_frame_list_put(pf);
918 		}
919 	}
920 	LOG_DBG("free page frames: %zu", z_free_page_count);
921 
922 #ifdef CONFIG_DEMAND_PAGING
923 #ifdef CONFIG_DEMAND_PAGING_TIMING_HISTOGRAM
924 	z_paging_histogram_init();
925 #endif
926 	k_mem_paging_backing_store_init();
927 	k_mem_paging_eviction_init();
928 #endif
929 #if __ASSERT_ON
930 	page_frames_initialized = true;
931 #endif
932 	k_spin_unlock(&z_mm_lock, key);
933 
934 #ifndef CONFIG_LINKER_GENERIC_SECTIONS_PRESENT_AT_BOOT
935 	/* If the BSS section is not present in memory at boot,
936 	 * it will not have been cleared. This needs to be
937 	 * done now, since the paging mechanism has been initialized
938 	 * and the BSS pages can be brought into physical
939 	 * memory to be cleared.
940 	 */
941 	z_bss_zero();
942 #endif
943 }
944 
945 void z_mem_manage_boot_finish(void)
946 {
947 #ifdef CONFIG_LINKER_USE_BOOT_SECTION
948 	/* At the end of the boot process, unpin the boot sections,
949 	 * as they no longer need to be resident in memory at all times.
950 	 */
951 	mark_linker_section_pinned(lnkr_boot_start, lnkr_boot_end, false);
952 #endif
953 }
954 
955 #ifdef CONFIG_DEMAND_PAGING
956 
957 #ifdef CONFIG_DEMAND_PAGING_STATS
958 struct k_mem_paging_stats_t paging_stats;
959 extern struct k_mem_paging_histogram_t z_paging_histogram_eviction;
960 extern struct k_mem_paging_histogram_t z_paging_histogram_backing_store_page_in;
961 extern struct k_mem_paging_histogram_t z_paging_histogram_backing_store_page_out;
962 #endif
963 
964 static inline void do_backing_store_page_in(uintptr_t location)
965 {
966 #ifdef CONFIG_DEMAND_PAGING_TIMING_HISTOGRAM
967 	uint32_t time_diff;
968 
969 #ifdef CONFIG_DEMAND_PAGING_STATS_USING_TIMING_FUNCTIONS
970 	timing_t time_start, time_end;
971 
972 	time_start = timing_counter_get();
973 #else
974 	uint32_t time_start;
975 
976 	time_start = k_cycle_get_32();
977 #endif /* CONFIG_DEMAND_PAGING_STATS_USING_TIMING_FUNCTIONS */
978 #endif /* CONFIG_DEMAND_PAGING_TIMING_HISTOGRAM */
979 
980 	k_mem_paging_backing_store_page_in(location);
981 
982 #ifdef CONFIG_DEMAND_PAGING_TIMING_HISTOGRAM
983 #ifdef CONFIG_DEMAND_PAGING_STATS_USING_TIMING_FUNCTIONS
984 	time_end = timing_counter_get();
985 	time_diff = (uint32_t)timing_cycles_get(&time_start, &time_end);
986 #else
987 	time_diff = k_cycle_get_32() - time_start;
988 #endif /* CONFIG_DEMAND_PAGING_STATS_USING_TIMING_FUNCTIONS */
989 
990 	z_paging_histogram_inc(&z_paging_histogram_backing_store_page_in,
991 			       time_diff);
992 #endif /* CONFIG_DEMAND_PAGING_TIMING_HISTOGRAM */
993 }
994 
995 static inline void do_backing_store_page_out(uintptr_t location)
996 {
997 #ifdef CONFIG_DEMAND_PAGING_TIMING_HISTOGRAM
998 	uint32_t time_diff;
999 
1000 #ifdef CONFIG_DEMAND_PAGING_STATS_USING_TIMING_FUNCTIONS
1001 	timing_t time_start, time_end;
1002 
1003 	time_start = timing_counter_get();
1004 #else
1005 	uint32_t time_start;
1006 
1007 	time_start = k_cycle_get_32();
1008 #endif /* CONFIG_DEMAND_PAGING_STATS_USING_TIMING_FUNCTIONS */
1009 #endif /* CONFIG_DEMAND_PAGING_TIMING_HISTOGRAM */
1010 
1011 	k_mem_paging_backing_store_page_out(location);
1012 
1013 #ifdef CONFIG_DEMAND_PAGING_TIMING_HISTOGRAM
1014 #ifdef CONFIG_DEMAND_PAGING_STATS_USING_TIMING_FUNCTIONS
1015 	time_end = timing_counter_get();
1016 	time_diff = (uint32_t)timing_cycles_get(&time_start, &time_end);
1017 #else
1018 	time_diff = k_cycle_get_32() - time_start;
1019 #endif /* CONFIG_DEMAND_PAGING_STATS_USING_TIMING_FUNCTIONS */
1020 
1021 	z_paging_histogram_inc(&z_paging_histogram_backing_store_page_out,
1022 			       time_diff);
1023 #endif /* CONFIG_DEMAND_PAGING_TIMING_HISTOGRAM */
1024 }
1025 
1026 /* Current implementation relies on interrupt locking to prevent any page table
1027  * access, which falls over if other CPUs are active. Addressing this is not
1028  * as simple as using spinlocks as regular memory reads/writes constitute
1029  * "access" in this sense.
1030  *
1031  * Current needs for demand paging are on uniprocessor systems.
1032  */
1033 BUILD_ASSERT(!IS_ENABLED(CONFIG_SMP));
1034 
1035 static void virt_region_foreach(void *addr, size_t size,
1036 				void (*func)(void *))
1037 {
1038 	z_mem_assert_virtual_region(addr, size);
1039 
1040 	for (size_t offset = 0; offset < size; offset += CONFIG_MMU_PAGE_SIZE) {
1041 		func((uint8_t *)addr + offset);
1042 	}
1043 }
1044 
1045 /*
1046  * Perform some preparatory steps before paging out. The provided page frame
1047  * must be evicted to the backing store immediately after this is called
1048  * with a call to k_mem_paging_backing_store_page_out() if it contains
1049  * a data page.
1050  *
1051  * - Map page frame to scratch area if requested. This is always the case if
1052  *   we're doing a page fault, but on manual evictions it is only done if the
1053  *   page is dirty.
1054  * - If mapped:
1055  *    - obtain backing store location and populate location parameter
1056  *    - Update page tables with location
1057  * - Mark page frame as busy
1058  *
1059  * Returns -ENOMEM if the backing store is full
1060  */
1061 static int page_frame_prepare_locked(struct z_page_frame *pf, bool *dirty_ptr,
1062 				     bool page_fault, uintptr_t *location_ptr)
1063 {
1064 	uintptr_t phys;
1065 	int ret;
1066 	bool dirty = *dirty_ptr;
1067 
1068 	phys = z_page_frame_to_phys(pf);
1069 	__ASSERT(!z_page_frame_is_pinned(pf), "page frame 0x%lx is pinned",
1070 		 phys);
1071 
1072 	/* If the backing store doesn't have a copy of the page, even if it
1073 	 * wasn't modified, treat as dirty. This can happen for a few
1074 	 * reasons:
1075 	 * 1) Page has never been swapped out before, and the backing store
1076 	 *    wasn't pre-populated with this data page.
1077 	 * 2) Page was swapped out before, but the page contents were not
1078 	 *    preserved after swapping back in.
1079 	 * 3) Page contents were preserved when swapped back in, but were later
1080 	 *    evicted from the backing store to make room for other evicted
1081 	 *    pages.
1082 	 */
1083 	if (z_page_frame_is_mapped(pf)) {
1084 		dirty = dirty || !z_page_frame_is_backed(pf);
1085 	}
1086 
1087 	if (dirty || page_fault) {
1088 		arch_mem_scratch(phys);
1089 	}
1090 
1091 	if (z_page_frame_is_mapped(pf)) {
1092 		ret = k_mem_paging_backing_store_location_get(pf, location_ptr,
1093 							      page_fault);
1094 		if (ret != 0) {
1095 			LOG_ERR("out of backing store memory");
1096 			return -ENOMEM;
1097 		}
1098 		arch_mem_page_out(pf->addr, *location_ptr);
1099 	} else {
1100 		/* Shouldn't happen unless this function is mis-used */
1101 		__ASSERT(!dirty, "un-mapped page determined to be dirty");
1102 	}
1103 #ifdef CONFIG_DEMAND_PAGING_ALLOW_IRQ
1104 	/* Mark as busy so that z_page_frame_is_evictable() returns false */
1105 	__ASSERT(!z_page_frame_is_busy(pf), "page frame 0x%lx is already busy",
1106 		 phys);
1107 	pf->flags |= Z_PAGE_FRAME_BUSY;
1108 #endif
1109 	/* Update the dirty parameter, since we may have set it to true if the
1110 	 * page wasn't backed, even if it is otherwise clean
1111 	 */
1112 	*dirty_ptr = dirty;
1113 
1114 	return 0;
1115 }
1116 
1117 static int do_mem_evict(void *addr)
1118 {
1119 	bool dirty;
1120 	struct z_page_frame *pf;
1121 	uintptr_t location;
1122 	int key, ret;
1123 	uintptr_t flags, phys;
1124 
1125 #if CONFIG_DEMAND_PAGING_ALLOW_IRQ
1126 	__ASSERT(!k_is_in_isr(),
1127 		 "%s is unavailable in ISRs with CONFIG_DEMAND_PAGING_ALLOW_IRQ",
1128 		 __func__);
1129 	k_sched_lock();
1130 #endif /* CONFIG_DEMAND_PAGING_ALLOW_IRQ */
1131 	key = irq_lock();
1132 	flags = arch_page_info_get(addr, &phys, false);
1133 	__ASSERT((flags & ARCH_DATA_PAGE_NOT_MAPPED) == 0,
1134 		 "address %p isn't mapped", addr);
1135 	if ((flags & ARCH_DATA_PAGE_LOADED) == 0) {
1136 		/* Un-mapped or already evicted. Nothing to do */
1137 		ret = 0;
1138 		goto out;
1139 	}
1140 
1141 	dirty = (flags & ARCH_DATA_PAGE_DIRTY) != 0;
1142 	pf = z_phys_to_page_frame(phys);
1143 	__ASSERT(pf->addr == addr, "page frame address mismatch");
1144 	ret = page_frame_prepare_locked(pf, &dirty, false, &location);
1145 	if (ret != 0) {
1146 		goto out;
1147 	}
1148 
1149 	__ASSERT(ret == 0, "failed to prepare page frame");
1150 #ifdef CONFIG_DEMAND_PAGING_ALLOW_IRQ
1151 	irq_unlock(key);
1152 #endif /* CONFIG_DEMAND_PAGING_ALLOW_IRQ */
1153 	if (dirty) {
1154 		do_backing_store_page_out(location);
1155 	}
1156 #ifdef CONFIG_DEMAND_PAGING_ALLOW_IRQ
1157 	key = irq_lock();
1158 #endif /* CONFIG_DEMAND_PAGING_ALLOW_IRQ */
1159 	page_frame_free_locked(pf);
1160 out:
1161 	irq_unlock(key);
1162 #ifdef CONFIG_DEMAND_PAGING_ALLOW_IRQ
1163 	k_sched_unlock();
1164 #endif /* CONFIG_DEMAND_PAGING_ALLOW_IRQ */
1165 	return ret;
1166 }
1167 
1168 int k_mem_page_out(void *addr, size_t size)
1169 {
1170 	__ASSERT(page_frames_initialized, "%s called on %p too early", __func__,
1171 		 addr);
1172 	z_mem_assert_virtual_region(addr, size);
1173 
1174 	for (size_t offset = 0; offset < size; offset += CONFIG_MMU_PAGE_SIZE) {
1175 		void *pos = (uint8_t *)addr + offset;
1176 		int ret;
1177 
1178 		ret = do_mem_evict(pos);
1179 		if (ret != 0) {
1180 			return ret;
1181 		}
1182 	}
1183 
1184 	return 0;
1185 }
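
/* Illustrative usage sketch (not part of the original file): proactively
 * evict a rarely-used, page-aligned buffer to the backing store. Touching it
 * later triggers a page fault, which pages it back in transparently. The
 * buffer itself is hypothetical.
 */
#if 0
static void example_evict_cold_buffer(uint8_t *cold_buf, size_t size)
{
	if (k_mem_page_out(cold_buf, size) != 0) {
		/* Backing store full; the data simply stays resident */
	}

	/* Later: cold_buf[0] = 1;  -- faults and is paged back in */
}
#endif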
1186 
1187 int z_page_frame_evict(uintptr_t phys)
1188 {
1189 	int key, ret;
1190 	struct z_page_frame *pf;
1191 	bool dirty;
1192 	uintptr_t flags;
1193 	uintptr_t location;
1194 
1195 	__ASSERT(page_frames_initialized, "%s called on 0x%lx too early",
1196 		 __func__, phys);
1197 
1198 	/* Implementation is similar to do_page_fault() except there is no
1199 	 * data page to page-in, see comments in that function.
1200 	 */
1201 
1202 #ifdef CONFIG_DEMAND_PAGING_ALLOW_IRQ
1203 	__ASSERT(!k_is_in_isr(),
1204 		 "%s is unavailable in ISRs with CONFIG_DEMAND_PAGING_ALLOW_IRQ",
1205 		 __func__);
1206 	k_sched_lock();
1207 #endif /* CONFIG_DEMAND_PAGING_ALLOW_IRQ */
1208 	key = irq_lock();
1209 	pf = z_phys_to_page_frame(phys);
1210 	if (!z_page_frame_is_mapped(pf)) {
1211 		/* Nothing to do, free page */
1212 		ret = 0;
1213 		goto out;
1214 	}
1215 	flags = arch_page_info_get(pf->addr, NULL, false);
1216 	/* Shouldn't ever happen */
1217 	__ASSERT((flags & ARCH_DATA_PAGE_LOADED) != 0, "data page not loaded");
1218 	dirty = (flags & ARCH_DATA_PAGE_DIRTY) != 0;
1219 	ret = page_frame_prepare_locked(pf, &dirty, false, &location);
1220 	if (ret != 0) {
1221 		goto out;
1222 	}
1223 
1224 #ifdef CONFIG_DEMAND_PAGING_ALLOW_IRQ
1225 	irq_unlock(key);
1226 #endif /* CONFIG_DEMAND_PAGING_ALLOW_IRQ */
1227 	if (dirty) {
1228 		do_backing_store_page_out(location);
1229 	}
1230 #ifdef CONFIG_DEMAND_PAGING_ALLOW_IRQ
1231 	key = irq_lock();
1232 #endif /* CONFIG_DEMAND_PAGING_ALLOW_IRQ */
1233 	page_frame_free_locked(pf);
1234 out:
1235 	irq_unlock(key);
1236 #ifdef CONFIG_DEMAND_PAGING_ALLOW_IRQ
1237 	k_sched_unlock();
1238 #endif /* CONFIG_DEMAND_PAGING_ALLOW_IRQ */
1239 	return ret;
1240 }
1241 
1242 static inline void paging_stats_faults_inc(struct k_thread *faulting_thread,
1243 					   int key)
1244 {
1245 #ifdef CONFIG_DEMAND_PAGING_STATS
1246 	bool is_irq_unlocked = arch_irq_unlocked(key);
1247 
1248 	paging_stats.pagefaults.cnt++;
1249 
1250 	if (is_irq_unlocked) {
1251 		paging_stats.pagefaults.irq_unlocked++;
1252 	} else {
1253 		paging_stats.pagefaults.irq_locked++;
1254 	}
1255 
1256 #ifdef CONFIG_DEMAND_PAGING_THREAD_STATS
1257 	faulting_thread->paging_stats.pagefaults.cnt++;
1258 
1259 	if (is_irq_unlocked) {
1260 		faulting_thread->paging_stats.pagefaults.irq_unlocked++;
1261 	} else {
1262 		faulting_thread->paging_stats.pagefaults.irq_locked++;
1263 	}
1264 #else
1265 	ARG_UNUSED(faulting_thread);
1266 #endif
1267 
1268 #ifndef CONFIG_DEMAND_PAGING_ALLOW_IRQ
1269 	if (k_is_in_isr()) {
1270 		paging_stats.pagefaults.in_isr++;
1271 
1272 #ifdef CONFIG_DEMAND_PAGING_THREAD_STATS
1273 		faulting_thread->paging_stats.pagefaults.in_isr++;
1274 #endif
1275 	}
1276 #endif /* CONFIG_DEMAND_PAGING_ALLOW_IRQ */
1277 #endif /* CONFIG_DEMAND_PAGING_STATS */
1278 }
1279 
1280 static inline void paging_stats_eviction_inc(struct k_thread *faulting_thread,
1281 					     bool dirty)
1282 {
1283 #ifdef CONFIG_DEMAND_PAGING_STATS
1284 	if (dirty) {
1285 		paging_stats.eviction.dirty++;
1286 	} else {
1287 		paging_stats.eviction.clean++;
1288 	}
1289 #ifdef CONFIG_DEMAND_PAGING_THREAD_STATS
1290 	if (dirty) {
1291 		faulting_thread->paging_stats.eviction.dirty++;
1292 	} else {
1293 		faulting_thread->paging_stats.eviction.clean++;
1294 	}
1295 #else
1296 	ARG_UNUSED(faulting_thread);
1297 #endif /* CONFIG_DEMAND_PAGING_THREAD_STATS */
1298 #endif /* CONFIG_DEMAND_PAGING_STATS */
1299 }
1300 
1301 static inline struct z_page_frame *do_eviction_select(bool *dirty)
1302 {
1303 	struct z_page_frame *pf;
1304 
1305 #ifdef CONFIG_DEMAND_PAGING_TIMING_HISTOGRAM
1306 	uint32_t time_diff;
1307 
1308 #ifdef CONFIG_DEMAND_PAGING_STATS_USING_TIMING_FUNCTIONS
1309 	timing_t time_start, time_end;
1310 
1311 	time_start = timing_counter_get();
1312 #else
1313 	uint32_t time_start;
1314 
1315 	time_start = k_cycle_get_32();
1316 #endif /* CONFIG_DEMAND_PAGING_STATS_USING_TIMING_FUNCTIONS */
1317 #endif /* CONFIG_DEMAND_PAGING_TIMING_HISTOGRAM */
1318 
1319 	pf = k_mem_paging_eviction_select(dirty);
1320 
1321 #ifdef CONFIG_DEMAND_PAGING_TIMING_HISTOGRAM
1322 #ifdef CONFIG_DEMAND_PAGING_STATS_USING_TIMING_FUNCTIONS
1323 	time_end = timing_counter_get();
1324 	time_diff = (uint32_t)timing_cycles_get(&time_start, &time_end);
1325 #else
1326 	time_diff = k_cycle_get_32() - time_start;
1327 #endif /* CONFIG_DEMAND_PAGING_STATS_USING_TIMING_FUNCTIONS */
1328 
1329 	z_paging_histogram_inc(&z_paging_histogram_eviction, time_diff);
1330 #endif /* CONFIG_DEMAND_PAGING_TIMING_HISTOGRAM */
1331 
1332 	return pf;
1333 }
1334 
1335 static bool do_page_fault(void *addr, bool pin)
1336 {
1337 	struct z_page_frame *pf;
1338 	int key, ret;
1339 	uintptr_t page_in_location, page_out_location;
1340 	enum arch_page_location status;
1341 	bool result;
1342 	bool dirty = false;
1343 	struct k_thread *faulting_thread = _current_cpu->current;
1344 
1345 	__ASSERT(page_frames_initialized, "page fault at %p happened too early",
1346 		 addr);
1347 
1348 	LOG_DBG("page fault at %p", addr);
1349 
1350 	/*
1351 	 * TODO: Add performance accounting:
1352 	 * - k_mem_paging_eviction_select() metrics
1353 	 *   * periodic timer execution time histogram (if implemented)
1354 	 */
1355 
1356 #ifdef CONFIG_DEMAND_PAGING_ALLOW_IRQ
1357 	/* We lock the scheduler so that other threads are never scheduled
1358 	 * during the page-in/out operation.
1359 	 *
1360 	 * We do however re-enable interrupts during the page-in/page-out
1361 	 * operation iff interrupts were enabled when the exception was taken;
1362 	 * in this configuration page faults in an ISR are a bug; all their
1363 	 * code/data must be pinned.
1364 	 *
1365 	 * If interrupts were disabled when the exception was taken, the
1366 	 * arch code is responsible for keeping them that way when entering
1367 	 * this function.
1368 	 *
1369 	 * If this is not enabled, then interrupts are always locked for the
1370 	 * entire operation. This is far worse for system interrupt latency
1371 	 * but requires less pinned pages and ISRs may also take page faults.
1372 	 *
1373 	 * Support for allowing k_mem_paging_backing_store_page_out() and
1374 	 * k_mem_paging_backing_store_page_in() to also sleep and allow
1375 	 * other threads to run (such as in the case where the transfer is
1376 	 * async DMA) is not implemented. Even if limited to thread context,
1377 	 * arbitrary memory access triggering exceptions that put a thread to
1378 	 * sleep on a contended page fault operation will break scheduling
1379 	 * assumptions of cooperative threads or threads that implement
1380 	 * critical sections with spinlocks or disabling IRQs.
1381 	 */
1382 	k_sched_lock();
1383 	__ASSERT(!k_is_in_isr(), "ISR page faults are forbidden");
1384 #endif /* CONFIG_DEMAND_PAGING_ALLOW_IRQ */
1385 
1386 	key = irq_lock();
1387 	status = arch_page_location_get(addr, &page_in_location);
1388 	if (status == ARCH_PAGE_LOCATION_BAD) {
1389 		/* Return false to treat as a fatal error */
1390 		result = false;
1391 		goto out;
1392 	}
1393 	result = true;
1394 
1395 	if (status == ARCH_PAGE_LOCATION_PAGED_IN) {
1396 		if (pin) {
1397 			/* It's a physical memory address */
1398 			uintptr_t phys = page_in_location;
1399 
1400 			pf = z_phys_to_page_frame(phys);
1401 			pf->flags |= Z_PAGE_FRAME_PINNED;
1402 		}
1403 
1404 		/* This if-block is to pin the page if it is
1405 		 * already present in physical memory. There is
1406 		 * no need to go through the following code to
1407 		 * pull in the data pages. So skip to the end.
1408 		 */
1409 		goto out;
1410 	}
1411 	__ASSERT(status == ARCH_PAGE_LOCATION_PAGED_OUT,
1412 		 "unexpected status value %d", status);
1413 
1414 	paging_stats_faults_inc(faulting_thread, key);
1415 
1416 	pf = free_page_frame_list_get();
1417 	if (pf == NULL) {
1418 		/* Need to evict a page frame */
1419 		pf = do_eviction_select(&dirty);
1420 		__ASSERT(pf != NULL, "failed to get a page frame");
1421 		LOG_DBG("evicting %p at 0x%lx", pf->addr,
1422 			z_page_frame_to_phys(pf));
1423 
1424 		paging_stats_eviction_inc(faulting_thread, dirty);
1425 	}
1426 	ret = page_frame_prepare_locked(pf, &dirty, true, &page_out_location);
1427 	__ASSERT(ret == 0, "failed to prepare page frame");
1428 
1429 #ifdef CONFIG_DEMAND_PAGING_ALLOW_IRQ
1430 	irq_unlock(key);
1431 	/* Interrupts are now unlocked if they were not locked when we entered
1432 	 * this function, and we may service ISRs. The scheduler is still
1433 	 * locked.
1434 	 */
1435 #endif /* CONFIG_DEMAND_PAGING_ALLOW_IRQ */
1436 	if (dirty) {
1437 		do_backing_store_page_out(page_out_location);
1438 	}
1439 	do_backing_store_page_in(page_in_location);
1440 
1441 #ifdef CONFIG_DEMAND_PAGING_ALLOW_IRQ
1442 	key = irq_lock();
1443 	pf->flags &= ~Z_PAGE_FRAME_BUSY;
1444 #endif /* CONFIG_DEMAND_PAGING_ALLOW_IRQ */
1445 	if (pin) {
1446 		pf->flags |= Z_PAGE_FRAME_PINNED;
1447 	}
1448 	pf->flags |= Z_PAGE_FRAME_MAPPED;
1449 	pf->addr = UINT_TO_POINTER(POINTER_TO_UINT(addr)
1450 				   & ~(CONFIG_MMU_PAGE_SIZE - 1));
1451 
1452 	arch_mem_page_in(addr, z_page_frame_to_phys(pf));
1453 	k_mem_paging_backing_store_page_finalize(pf, page_in_location);
1454 out:
1455 	irq_unlock(key);
1456 #ifdef CONFIG_DEMAND_PAGING_ALLOW_IRQ
1457 	k_sched_unlock();
1458 #endif /* CONFIG_DEMAND_PAGING_ALLOW_IRQ */
1459 
1460 	return result;
1461 }
1462 
1463 static void do_page_in(void *addr)
1464 {
1465 	bool ret;
1466 
1467 	ret = do_page_fault(addr, false);
1468 	__ASSERT(ret, "unmapped memory address %p", addr);
1469 	(void)ret;
1470 }
1471 
1472 void k_mem_page_in(void *addr, size_t size)
1473 {
1474 	__ASSERT(!IS_ENABLED(CONFIG_DEMAND_PAGING_ALLOW_IRQ) || !k_is_in_isr(),
1475 		 "%s may not be called in ISRs if CONFIG_DEMAND_PAGING_ALLOW_IRQ is enabled",
1476 		 __func__);
1477 	virt_region_foreach(addr, size, do_page_in);
1478 }
1479 
1480 static void do_mem_pin(void *addr)
1481 {
1482 	bool ret;
1483 
1484 	ret = do_page_fault(addr, true);
1485 	__ASSERT(ret, "unmapped memory address %p", addr);
1486 	(void)ret;
1487 }
1488 
1489 void k_mem_pin(void *addr, size_t size)
1490 {
1491 	__ASSERT(!IS_ENABLED(CONFIG_DEMAND_PAGING_ALLOW_IRQ) || !k_is_in_isr(),
1492 		 "%s may not be called in ISRs if CONFIG_DEMAND_PAGING_ALLOW_IRQ is enabled",
1493 		 __func__);
1494 	virt_region_foreach(addr, size, do_mem_pin);
1495 }
1496 
1497 bool z_page_fault(void *addr)
1498 {
1499 	return do_page_fault(addr, false);
1500 }
1501 
1502 static void do_mem_unpin(void *addr)
1503 {
1504 	struct z_page_frame *pf;
1505 	unsigned int key;
1506 	uintptr_t flags, phys;
1507 
1508 	key = irq_lock();
1509 	flags = arch_page_info_get(addr, &phys, false);
1510 	__ASSERT((flags & ARCH_DATA_PAGE_NOT_MAPPED) == 0,
1511 		 "invalid data page at %p", addr);
1512 	if ((flags & ARCH_DATA_PAGE_LOADED) != 0) {
1513 		pf = z_phys_to_page_frame(phys);
1514 		pf->flags &= ~Z_PAGE_FRAME_PINNED;
1515 	}
1516 	irq_unlock(key);
1517 }
1518 
1519 void k_mem_unpin(void *addr, size_t size)
1520 {
1521 	__ASSERT(page_frames_initialized, "%s called on %p too early", __func__,
1522 		 addr);
1523 	virt_region_foreach(addr, size, do_mem_unpin);
1524 }
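
/* Illustrative usage sketch (not part of the original file): pin a
 * page-aligned buffer while hardware owns it so no page fault can occur
 * mid-transfer, then allow eviction again. The DMA steps are hypothetical.
 */
#if 0
static void example_dma_transfer(uint8_t *buf, size_t size)
{
	k_mem_pin(buf, size);		/* faults in and pins every page */

	/* ... start the (hypothetical) transfer and wait for completion ... */

	k_mem_unpin(buf, size);		/* pages may be evicted again */
}
#endif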
1525 
1526 #endif /* CONFIG_DEMAND_PAGING */
1527