1 /*
2  * Copyright (c) 2020 Intel Corporation
3  *
4  * SPDX-License-Identifier: Apache-2.0
5  *
6  * Routines for managing virtual address spaces
7  */
8 
9 #include <stdint.h>
10 #include <kernel_arch_interface.h>
11 #include <zephyr/spinlock.h>
12 #include <mmu.h>
13 #include <zephyr/init.h>
14 #include <kernel_internal.h>
15 #include <zephyr/internal/syscall_handler.h>
16 #include <zephyr/toolchain.h>
17 #include <zephyr/linker/linker-defs.h>
18 #include <zephyr/sys/bitarray.h>
19 #include <zephyr/sys/check.h>
20 #include <zephyr/sys/math_extras.h>
21 #include <zephyr/timing/timing.h>
22 #include <zephyr/arch/common/init.h>
23 #include <zephyr/logging/log.h>
24 LOG_MODULE_DECLARE(os, CONFIG_KERNEL_LOG_LEVEL);
25 
26 #ifdef CONFIG_DEMAND_PAGING
27 #include <zephyr/kernel/mm/demand_paging.h>
28 #endif /* CONFIG_DEMAND_PAGING */
29 
30 /*
31  * General terminology:
32  * - A page frame is a page-sized physical memory region in RAM. It is a
33  *   container where a data page may be placed. It is always referred to by
34  *   physical address. We have a convention of using uintptr_t for physical
35  *   addresses. We instantiate a struct k_mem_page_frame to store metadata for
36  *   every page frame.
37  *
38  * - A data page is a page-sized region of data. It may exist in a page frame,
39  *   or be paged out to some backing store. Its location can always be looked
40  *   up in the CPU's page tables (or equivalent) by virtual address.
41  *   The data type will always be void * or in some cases uint8_t * when we
42  *   want to do pointer arithmetic.
43  */
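/* Illustrative sketch (not compiled; addresses are made up): the helpers used
 * throughout this file to move between the two views described above.
 *
 *	uintptr_t phys = 0x80042000;			// some page frame in RAM
 *	struct k_mem_page_frame *pf = k_mem_phys_to_page_frame(phys);
 *
 *	if (k_mem_page_frame_is_mapped(pf)) {
 *		void *virt = k_mem_page_frame_to_virt(pf);	// its data page
 *
 *		__ASSERT(k_mem_page_frame_to_phys(pf) == phys, "round trip");
 *		ARG_UNUSED(virt);
 *	}
 */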
44 
45 /* Spinlock to protect any globals in this file and serialize page table
46  * updates in arch code
47  */
48 struct k_spinlock z_mm_lock;
49 
50 /*
51  * General page frame management
52  */
53 
54 /* Database of all RAM page frames */
55 struct k_mem_page_frame k_mem_page_frames[K_MEM_NUM_PAGE_FRAMES];
56 
57 #if __ASSERT_ON
58 /* Indicator that k_mem_page_frames has been initialized; many of these APIs do
59  * not work before POST_KERNEL
60  */
61 static bool page_frames_initialized;
62 #endif
63 
64 /* Add colors to page table dumps to indicate mapping type */
65 #define COLOR_PAGE_FRAMES	1
66 
67 #if COLOR_PAGE_FRAMES
68 #define ANSI_DEFAULT "\x1B" "[0m"
69 #define ANSI_RED     "\x1B" "[1;31m"
70 #define ANSI_GREEN   "\x1B" "[1;32m"
71 #define ANSI_YELLOW  "\x1B" "[1;33m"
72 #define ANSI_BLUE    "\x1B" "[1;34m"
73 #define ANSI_MAGENTA "\x1B" "[1;35m"
74 #define ANSI_CYAN    "\x1B" "[1;36m"
75 #define ANSI_GREY    "\x1B" "[1;90m"
76 
77 #define COLOR(x)	printk(_CONCAT(ANSI_, x))
78 #else
79 #define COLOR(x)	do { } while (false)
80 #endif /* COLOR_PAGE_FRAMES */
81 
82 /* LCOV_EXCL_START */
83 static void page_frame_dump(struct k_mem_page_frame *pf)
84 {
85 	if (k_mem_page_frame_is_free(pf)) {
86 		COLOR(GREY);
87 		printk("-");
88 	} else if (k_mem_page_frame_is_reserved(pf)) {
89 		COLOR(CYAN);
90 		printk("R");
91 	} else if (k_mem_page_frame_is_busy(pf)) {
92 		COLOR(MAGENTA);
93 		printk("B");
94 	} else if (k_mem_page_frame_is_pinned(pf)) {
95 		COLOR(YELLOW);
96 		printk("P");
97 	} else if (k_mem_page_frame_is_available(pf)) {
98 		COLOR(GREY);
99 		printk(".");
100 	} else if (k_mem_page_frame_is_mapped(pf)) {
101 		COLOR(DEFAULT);
102 		printk("M");
103 	} else {
104 		COLOR(RED);
105 		printk("?");
106 	}
107 }
108 
109 void k_mem_page_frames_dump(void)
110 {
111 	int column = 0;
112 
113 	__ASSERT(page_frames_initialized, "%s called too early", __func__);
114 	printk("Physical memory from 0x%lx to 0x%lx\n",
115 	       K_MEM_PHYS_RAM_START, K_MEM_PHYS_RAM_END);
116 
117 	for (int i = 0; i < K_MEM_NUM_PAGE_FRAMES; i++) {
118 		struct k_mem_page_frame *pf = &k_mem_page_frames[i];
119 
120 		page_frame_dump(pf);
121 
122 		column++;
123 		if (column == 64) {
124 			column = 0;
125 			printk("\n");
126 		}
127 	}
128 
129 	COLOR(DEFAULT);
130 	if (column != 0) {
131 		printk("\n");
132 	}
133 }
134 /* LCOV_EXCL_STOP */
135 
136 #define VIRT_FOREACH(_base, _size, _pos) \
137 	for ((_pos) = (_base); \
138 	     (_pos) < ((uint8_t *)(_base) + (_size)); (_pos) += CONFIG_MMU_PAGE_SIZE)
139 
140 #define PHYS_FOREACH(_base, _size, _pos) \
141 	for ((_pos) = (_base); \
142 	     (_pos) < ((uintptr_t)(_base) + (_size)); (_pos) += CONFIG_MMU_PAGE_SIZE)
143 
144 
145 /*
146  * Virtual address space management
147  *
148  * Call all of these functions with z_mm_lock held.
149  *
150  * Overall virtual memory map: When the kernel starts, it resides in
151  * virtual memory in the region K_MEM_KERNEL_VIRT_START to
152  * K_MEM_KERNEL_VIRT_END. Unused virtual memory past this, up to the limit
153  * noted by CONFIG_KERNEL_VM_SIZE may be used for runtime memory mappings.
154  *
155  * If CONFIG_ARCH_MAPS_ALL_RAM is set, we do not just map the kernel image,
156  * but have a mapping for all RAM in place. This is for special architectural
157  * purposes and does not otherwise affect page frame accounting or flags;
158  * the only guarantee is that such RAM mapping outside of the Zephyr image
159  * won't be disturbed by subsequent memory mapping calls.
160  *
161  * +--------------+ <- K_MEM_VIRT_RAM_START
162  * | Undefined VM | <- May contain ancillary regions like x86_64's locore
163  * +--------------+ <- K_MEM_KERNEL_VIRT_START (often == K_MEM_VIRT_RAM_START)
164  * | Mapping for  |
165  * | main kernel  |
166  * | image        |
167  * |		  |
168  * |		  |
169  * +--------------+ <- K_MEM_VM_FREE_START
170  * |              |
171  * | Unused,      |
172  * | Available VM |
173  * |              |
174  * |..............| <- mapping_pos (grows downward as more mappings are made)
175  * | Mapping      |
176  * +--------------+
177  * | Mapping      |
178  * +--------------+
179  * | ...          |
180  * +--------------+
181  * | Mapping      |
182  * +--------------+ <- mappings start here
183  * | Reserved     | <- special purpose virtual page(s) of size K_MEM_VM_RESERVED
184  * +--------------+ <- K_MEM_VIRT_RAM_END
185  */
186 
187 /* Bitmap of virtual addresses where one bit corresponds to one page.
188  * This is being used for virt_region_alloc() to figure out which
189  * region of virtual addresses can be used for memory mapping.
190  *
191  * Note that bit #0 corresponds to the highest address, so allocation
192  * is done in reverse, starting from the highest address.
193  */
194 SYS_BITARRAY_DEFINE_STATIC(virt_region_bitmap,
195 			   CONFIG_KERNEL_VM_SIZE / CONFIG_MMU_PAGE_SIZE);
196 
197 static bool virt_region_inited;
198 
199 #define Z_VIRT_REGION_START_ADDR	K_MEM_VM_FREE_START
200 #define Z_VIRT_REGION_END_ADDR		(K_MEM_VIRT_RAM_END - K_MEM_VM_RESERVED)
201 
202 static inline uintptr_t virt_from_bitmap_offset(size_t offset, size_t size)
203 {
204 	return POINTER_TO_UINT(K_MEM_VIRT_RAM_END)
205 	       - (offset * CONFIG_MMU_PAGE_SIZE) - size;
206 }
207 
208 static inline size_t virt_to_bitmap_offset(void *vaddr, size_t size)
209 {
210 	return (POINTER_TO_UINT(K_MEM_VIRT_RAM_END)
211 		- POINTER_TO_UINT(vaddr) - size) / CONFIG_MMU_PAGE_SIZE;
212 }
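/* Worked example of the two helpers above (hypothetical numbers, assuming
 * 4 KiB pages and K_MEM_VIRT_RAM_END == 0xC0000000): the topmost page
 * corresponds to bit #0, and a two-page region one page further down starts
 * at bit #1.
 *
 *	virt_to_bitmap_offset((void *)0xBFFFF000, 0x1000) == 0
 *	virt_from_bitmap_offset(0, 0x1000)                == 0xBFFFF000
 *
 *	virt_to_bitmap_offset((void *)0xBFFFD000, 0x2000) == 1
 *	virt_from_bitmap_offset(1, 0x2000)                == 0xBFFFD000
 */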
213 
214 static void virt_region_init(void)
215 {
216 	size_t offset, num_bits;
217 
218 	/* There are regions where we should never map via
219 	 * k_mem_map() and k_mem_map_phys_bare(). Mark them as
220 	 * already allocated so they will never be used.
221 	 */
222 
223 	if (K_MEM_VM_RESERVED > 0) {
224 		/* Mark reserved region at end of virtual address space */
225 		num_bits = K_MEM_VM_RESERVED / CONFIG_MMU_PAGE_SIZE;
226 		(void)sys_bitarray_set_region(&virt_region_bitmap,
227 					      num_bits, 0);
228 	}
229 
230 	/* Mark all bits up to K_MEM_VM_FREE_START as allocated */
231 	num_bits = POINTER_TO_UINT(K_MEM_VM_FREE_START)
232 		   - POINTER_TO_UINT(K_MEM_VIRT_RAM_START);
233 	offset = virt_to_bitmap_offset(K_MEM_VIRT_RAM_START, num_bits);
234 	num_bits /= CONFIG_MMU_PAGE_SIZE;
235 	(void)sys_bitarray_set_region(&virt_region_bitmap,
236 				      num_bits, offset);
237 
238 	virt_region_inited = true;
239 }
240 
241 static void virt_region_free(void *vaddr, size_t size)
242 {
243 	size_t offset, num_bits;
244 	uint8_t *vaddr_u8 = (uint8_t *)vaddr;
245 
246 	if (unlikely(!virt_region_inited)) {
247 		virt_region_init();
248 	}
249 
250 #ifndef CONFIG_KERNEL_DIRECT_MAP
251 	/* Without the need to support K_MEM_DIRECT_MAP, the region must
252 	 * be representable in the bitmap, so this case is
253 	 * simple.
254 	 */
255 
256 	__ASSERT((vaddr_u8 >= Z_VIRT_REGION_START_ADDR)
257 		 && ((vaddr_u8 + size - 1) < Z_VIRT_REGION_END_ADDR),
258 		 "invalid virtual address region %p (%zu)", vaddr_u8, size);
259 	if (!((vaddr_u8 >= Z_VIRT_REGION_START_ADDR)
260 	      && ((vaddr_u8 + size - 1) < Z_VIRT_REGION_END_ADDR))) {
261 		return;
262 	}
263 
264 	offset = virt_to_bitmap_offset(vaddr, size);
265 	num_bits = size / CONFIG_MMU_PAGE_SIZE;
266 	(void)sys_bitarray_free(&virt_region_bitmap, num_bits, offset);
267 #else /* !CONFIG_KERNEL_DIRECT_MAP */
268 	/* With K_MEM_DIRECT_MAP, the region can be outside of the virtual
269 	 * memory space, wholly within it, or overlap partially.
270 	 * So additional processing is needed to make sure we only
271 	 * mark the pages within the bitmap.
272 	 */
273 	if (((vaddr_u8 >= Z_VIRT_REGION_START_ADDR) &&
274 	     (vaddr_u8 < Z_VIRT_REGION_END_ADDR)) ||
275 	    (((vaddr_u8 + size - 1) >= Z_VIRT_REGION_START_ADDR) &&
276 	     ((vaddr_u8 + size - 1) < Z_VIRT_REGION_END_ADDR))) {
277 		uint8_t *adjusted_start = MAX(vaddr_u8, Z_VIRT_REGION_START_ADDR);
278 		uint8_t *adjusted_end = MIN(vaddr_u8 + size,
279 					    Z_VIRT_REGION_END_ADDR);
280 		size_t adjusted_sz = adjusted_end - adjusted_start;
281 
282 		offset = virt_to_bitmap_offset(adjusted_start, adjusted_sz);
283 		num_bits = adjusted_sz / CONFIG_MMU_PAGE_SIZE;
284 		(void)sys_bitarray_free(&virt_region_bitmap, num_bits, offset);
285 	}
286 #endif /* !CONFIG_KERNEL_DIRECT_MAP */
287 }
288 
289 static void *virt_region_alloc(size_t size, size_t align)
290 {
291 	uintptr_t dest_addr;
292 	size_t alloc_size;
293 	size_t offset;
294 	size_t num_bits;
295 	int ret;
296 
297 	if (unlikely(!virt_region_inited)) {
298 		virt_region_init();
299 	}
300 
301 	/* Possibly request more pages to ensure we can get an aligned virtual address */
302 	num_bits = (size + align - CONFIG_MMU_PAGE_SIZE) / CONFIG_MMU_PAGE_SIZE;
303 	alloc_size = num_bits * CONFIG_MMU_PAGE_SIZE;
304 	ret = sys_bitarray_alloc(&virt_region_bitmap, num_bits, &offset);
305 	if (ret != 0) {
306 		LOG_ERR("insufficient virtual address space (requested %zu)",
307 			size);
308 		return NULL;
309 	}
310 
311 	/* Remember that bit #0 in bitmap corresponds to the highest
312 	 * virtual address. So here we need to go downwards (backwards?)
313 	 * to get the starting address of the allocated region.
314 	 */
315 	dest_addr = virt_from_bitmap_offset(offset, alloc_size);
316 
317 	if (alloc_size > size) {
318 		uintptr_t aligned_dest_addr = ROUND_UP(dest_addr, align);
319 
320 		/* Here is the memory organization when trying to get an aligned
321 		 * virtual address:
322 		 *
323 		 * +--------------+ <- K_MEM_VIRT_RAM_START
324 		 * | Undefined VM |
325 		 * +--------------+ <- K_MEM_KERNEL_VIRT_START (often == K_MEM_VIRT_RAM_START)
326 		 * | Mapping for  |
327 		 * | main kernel  |
328 		 * | image        |
329 		 * |		  |
330 		 * |		  |
331 		 * +--------------+ <- K_MEM_VM_FREE_START
332 		 * | ...          |
333 		 * +==============+ <- dest_addr
334 		 * | Unused       |
335 		 * |..............| <- aligned_dest_addr
336 		 * |              |
337 		 * | Aligned      |
338 		 * | Mapping      |
339 		 * |              |
340 		 * |..............| <- aligned_dest_addr + size
341 		 * | Unused       |
342 		 * +==============+ <- offset from K_MEM_VIRT_RAM_END == dest_addr + alloc_size
343 		 * | ...          |
344 		 * +--------------+
345 		 * | Mapping      |
346 		 * +--------------+
347 		 * | Reserved     |
348 		 * +--------------+ <- K_MEM_VIRT_RAM_END
349 		 */
350 
351 		/* Free the two unused regions */
352 		virt_region_free(UINT_TO_POINTER(dest_addr),
353 				 aligned_dest_addr - dest_addr);
354 		if (((dest_addr + alloc_size) - (aligned_dest_addr + size)) > 0) {
355 			virt_region_free(UINT_TO_POINTER(aligned_dest_addr + size),
356 					 (dest_addr + alloc_size) - (aligned_dest_addr + size));
357 		}
358 
359 		dest_addr = aligned_dest_addr;
360 	}
361 
362 	/* Need to make sure this does not step into kernel memory */
363 	if (dest_addr < POINTER_TO_UINT(Z_VIRT_REGION_START_ADDR)) {
364 		(void)sys_bitarray_free(&virt_region_bitmap, num_bits, offset);
365 		return NULL;
366 	}
367 
368 	return UINT_TO_POINTER(dest_addr);
369 }
370 
371 /*
372  * Free page frames management
373  *
374  * Call all of these functions with z_mm_lock held.
375  */
376 
377 /* Linked list of unused and available page frames.
378  *
379  * TODO: This is very simple and treats all free page frames as being equal.
380  * However, there are use-cases to consolidate free pages such that entire
381  * SRAM banks can be switched off to save power, and so obtaining free pages
382  * may require a more complex ontology which prefers page frames in RAM banks
383  * which are still active.
384  *
385  * This implies in the future there may be multiple slists managing physical
386  * pages. Each page frame will still just have one snode link.
387  */
388 static sys_sflist_t free_page_frame_list;
389 
390 /* Number of unused and available free page frames.
391  * This information may go stale immediately.
392  */
393 static size_t z_free_page_count;
394 
395 #define PF_ASSERT(pf, expr, fmt, ...) \
396 	__ASSERT(expr, "page frame 0x%lx: " fmt, k_mem_page_frame_to_phys(pf), \
397 		 ##__VA_ARGS__)
398 
399 /* Get an unused page frame (don't care which one), or NULL if there are none */
400 static struct k_mem_page_frame *free_page_frame_list_get(void)
401 {
402 	sys_sfnode_t *node;
403 	struct k_mem_page_frame *pf = NULL;
404 
405 	node = sys_sflist_get(&free_page_frame_list);
406 	if (node != NULL) {
407 		z_free_page_count--;
408 		pf = CONTAINER_OF(node, struct k_mem_page_frame, node);
409 		PF_ASSERT(pf, k_mem_page_frame_is_free(pf),
410 			 "on free list but not free");
411 		pf->va_and_flags = 0;
412 	}
413 
414 	return pf;
415 }
416 
417 /* Release a page frame back into the list of free pages */
418 static void free_page_frame_list_put(struct k_mem_page_frame *pf)
419 {
420 	PF_ASSERT(pf, k_mem_page_frame_is_available(pf),
421 		 "unavailable page put on free list");
422 
423 	sys_sfnode_init(&pf->node, K_MEM_PAGE_FRAME_FREE);
424 	sys_sflist_append(&free_page_frame_list, &pf->node);
425 	z_free_page_count++;
426 }
427 
428 static void free_page_frame_list_init(void)
429 {
430 	sys_sflist_init(&free_page_frame_list);
431 }
432 
433 static void page_frame_free_locked(struct k_mem_page_frame *pf)
434 {
435 	pf->va_and_flags = 0;
436 	free_page_frame_list_put(pf);
437 }
438 
439 /*
440  * Memory Mapping
441  */
442 
443 /* Called after the frame is mapped in the arch layer, to update our
444  * local ontology (and do some assertions while we're at it)
445  */
446 static void frame_mapped_set(struct k_mem_page_frame *pf, void *addr)
447 {
448 	PF_ASSERT(pf, !k_mem_page_frame_is_free(pf),
449 		  "attempted to map a page frame on the free list");
450 	PF_ASSERT(pf, !k_mem_page_frame_is_reserved(pf),
451 		  "attempted to map a reserved page frame");
452 
453 	/* We do allow multiple mappings for pinned page frames
454 	 * since we will never need to reverse map them.
455 	 * This is uncommon; use-cases are for things like the
456 	 * Zephyr equivalent of VDSOs.
457 	 */
458 	PF_ASSERT(pf, !k_mem_page_frame_is_mapped(pf) || k_mem_page_frame_is_pinned(pf),
459 		 "non-pinned and already mapped to %p",
460 		 k_mem_page_frame_to_virt(pf));
461 
462 	uintptr_t flags_mask = CONFIG_MMU_PAGE_SIZE - 1;
463 	uintptr_t va = (uintptr_t)addr & ~flags_mask;
464 
465 	pf->va_and_flags &= flags_mask;
466 	pf->va_and_flags |= va | K_MEM_PAGE_FRAME_MAPPED;
467 }
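/* Worked example (assuming 4 KiB pages, so flags_mask == 0xfff): calling
 * frame_mapped_set(pf, (void *)0x80042abc) keeps whatever flag bits already
 * live in the low 12 bits of va_and_flags, stores the page-aligned address
 * 0x80042000 in the upper bits and sets K_MEM_PAGE_FRAME_MAPPED, so that
 * k_mem_page_frame_to_virt(pf) subsequently returns (void *)0x80042000.
 */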
468 
469 /* LCOV_EXCL_START */
470 /* Go through page frames to find the physical address mapped
471  * by a virtual address.
472  *
473  * @param[in]  virt Virtual Address
474  * @param[out] phys Physical address mapped to the input virtual address
475  *                  if such mapping exists.
476  *
477  * @retval 0 if mapping is found and valid
478  * @retval -EFAULT if virtual address is not mapped
479  */
480 static int virt_to_page_frame(void *virt, uintptr_t *phys)
481 {
482 	uintptr_t paddr;
483 	struct k_mem_page_frame *pf;
484 	int ret = -EFAULT;
485 
486 	K_MEM_PAGE_FRAME_FOREACH(paddr, pf) {
487 		if (k_mem_page_frame_is_mapped(pf)) {
488 			if (virt == k_mem_page_frame_to_virt(pf)) {
489 				ret = 0;
490 				if (phys != NULL) {
491 					*phys = k_mem_page_frame_to_phys(pf);
492 				}
493 				break;
494 			}
495 		}
496 	}
497 
498 	return ret;
499 }
500 /* LCOV_EXCL_STOP */
501 
502 __weak FUNC_ALIAS(virt_to_page_frame, arch_page_phys_get, int);
503 
504 #ifdef CONFIG_DEMAND_PAGING
505 static int page_frame_prepare_locked(struct k_mem_page_frame *pf, bool *dirty_ptr,
506 				     bool page_in, uintptr_t *location_ptr);
507 
508 static inline void do_backing_store_page_in(uintptr_t location);
509 static inline void do_backing_store_page_out(uintptr_t location);
510 #endif /* CONFIG_DEMAND_PAGING */
511 
512 /* Allocate a free page frame, and map it to a specified virtual address
513  *
514  * TODO: Add optional support for copy-on-write mappings to a zero page instead
515  * of allocating, in which case page frames will be allocated lazily as
516  * the mappings to the zero page get touched. This will avoid expensive
517  * page-ins as memory is mapped and physical RAM or backing store storage will
518  * not be used if the mapped memory is unused. The cost is an empty physical
519  * page of zeroes.
520  */
521 static int map_anon_page(void *addr, uint32_t flags)
522 {
523 	struct k_mem_page_frame *pf;
524 	uintptr_t phys;
525 	bool lock = (flags & K_MEM_MAP_LOCK) != 0U;
526 
527 	pf = free_page_frame_list_get();
528 	if (pf == NULL) {
529 #ifdef CONFIG_DEMAND_PAGING
530 		uintptr_t location;
531 		bool dirty;
532 		int ret;
533 
534 		pf = k_mem_paging_eviction_select(&dirty);
535 		__ASSERT(pf != NULL, "failed to get a page frame");
536 		LOG_DBG("evicting %p at 0x%lx",
537 			k_mem_page_frame_to_virt(pf),
538 			k_mem_page_frame_to_phys(pf));
539 		ret = page_frame_prepare_locked(pf, &dirty, false, &location);
540 		if (ret != 0) {
541 			return -ENOMEM;
542 		}
543 		if (dirty) {
544 			do_backing_store_page_out(location);
545 		}
546 		pf->va_and_flags = 0;
547 #else
548 		return -ENOMEM;
549 #endif /* CONFIG_DEMAND_PAGING */
550 	}
551 
552 	phys = k_mem_page_frame_to_phys(pf);
553 	arch_mem_map(addr, phys, CONFIG_MMU_PAGE_SIZE, flags);
554 
555 	if (lock) {
556 		k_mem_page_frame_set(pf, K_MEM_PAGE_FRAME_PINNED);
557 	}
558 	frame_mapped_set(pf, addr);
559 #ifdef CONFIG_DEMAND_PAGING
560 	if (IS_ENABLED(CONFIG_EVICTION_TRACKING) && (!lock)) {
561 		k_mem_paging_eviction_add(pf);
562 	}
563 #endif
564 
565 	LOG_DBG("memory mapping anon page %p -> 0x%lx", addr, phys);
566 
567 	return 0;
568 }
569 
570 void *k_mem_map_phys_guard(uintptr_t phys, size_t size, uint32_t flags, bool is_anon)
571 {
572 	uint8_t *dst;
573 	size_t total_size;
574 	int ret;
575 	k_spinlock_key_t key;
576 	uint8_t *pos;
577 	bool uninit = (flags & K_MEM_MAP_UNINIT) != 0U;
578 
579 	__ASSERT(!is_anon || (is_anon && page_frames_initialized),
580 		 "%s called too early", __func__);
581 	__ASSERT((flags & K_MEM_CACHE_MASK) == 0U,
582 		 "%s does not support explicit cache settings", __func__);
583 
584 	if (((flags & K_MEM_PERM_USER) != 0U) &&
585 	    ((flags & K_MEM_MAP_UNINIT) != 0U)) {
586 		LOG_ERR("user access to anonymous uninitialized pages is forbidden");
587 		return NULL;
588 	}
589 	if ((size % CONFIG_MMU_PAGE_SIZE) != 0U) {
590 		LOG_ERR("unaligned size %zu passed to %s", size, __func__);
591 		return NULL;
592 	}
593 	if (size == 0) {
594 		LOG_ERR("zero sized memory mapping");
595 		return NULL;
596 	}
597 
598 	/* Need extra for the guard pages (before and after) which we
599 	 * won't map.
600 	 */
601 	if (size_add_overflow(size, CONFIG_MMU_PAGE_SIZE * 2, &total_size)) {
602 		LOG_ERR("too large size %zu passed to %s", size, __func__);
603 		return NULL;
604 	}
605 
606 	key = k_spin_lock(&z_mm_lock);
607 
608 	dst = virt_region_alloc(total_size, CONFIG_MMU_PAGE_SIZE);
609 	if (dst == NULL) {
610 		/* Address space has no free region */
611 		goto out;
612 	}
613 
614 	/* Unmap both guard pages to make sure accessing them
615 	 * will generate a fault.
616 	 */
617 	arch_mem_unmap(dst, CONFIG_MMU_PAGE_SIZE);
618 	arch_mem_unmap(dst + CONFIG_MMU_PAGE_SIZE + size,
619 		       CONFIG_MMU_PAGE_SIZE);
620 
621 	/* Skip over the "before" guard page in returned address. */
622 	dst += CONFIG_MMU_PAGE_SIZE;
623 
624 	if (is_anon) {
625 		/* Mapping from anonymous memory */
626 		flags |= K_MEM_CACHE_WB;
627 #ifdef CONFIG_DEMAND_MAPPING
628 		if ((flags & K_MEM_MAP_LOCK) == 0) {
629 			flags |= K_MEM_MAP_UNPAGED;
630 			VIRT_FOREACH(dst, size, pos) {
631 				arch_mem_map(pos,
632 					     uninit ? ARCH_UNPAGED_ANON_UNINIT
633 						    : ARCH_UNPAGED_ANON_ZERO,
634 					     CONFIG_MMU_PAGE_SIZE, flags);
635 			}
636 			LOG_DBG("memory mapping anon pages %p to %p unpaged", dst, pos-1);
637 			/* skip the memset() below */
638 			uninit = true;
639 		} else
640 #endif
641 		{
642 			VIRT_FOREACH(dst, size, pos) {
643 				ret = map_anon_page(pos, flags);
644 
645 				if (ret != 0) {
646 					/* TODO:
647 					 * call k_mem_unmap(dst, pos - dst)
648 					 * when implemented in #28990 and
649 					 * release any guard virtual page as well.
650 					 */
651 					dst = NULL;
652 					goto out;
653 				}
654 			}
655 		}
656 	} else {
657 		/* Mapping known physical memory.
658 		 *
659 		 * arch_mem_map() is a void function and does not return
660 		 * anything. Arch code usually uses ASSERT() to catch
661 		 * mapping errors. Assume this works correctly for now.
662 		 */
663 		arch_mem_map(dst, phys, size, flags);
664 	}
665 
666 out:
667 	k_spin_unlock(&z_mm_lock, key);
668 
669 	if (dst != NULL && !uninit) {
670 		/* If we later implement mappings to a copy-on-write
671 		 * zero page, won't need this step
672 		 */
673 		memset(dst, 0, size);
674 	}
675 
676 	return dst;
677 }
678 
679 void k_mem_unmap_phys_guard(void *addr, size_t size, bool is_anon)
680 {
681 	uintptr_t phys;
682 	uint8_t *pos;
683 	struct k_mem_page_frame *pf;
684 	k_spinlock_key_t key;
685 	size_t total_size;
686 	int ret;
687 
688 	/* Need space for the "before" guard page */
689 	__ASSERT_NO_MSG(POINTER_TO_UINT(addr) >= CONFIG_MMU_PAGE_SIZE);
690 
691 	/* Make sure address range is still valid after accounting
692 	 * for two guard pages.
693 	 */
694 	pos = (uint8_t *)addr - CONFIG_MMU_PAGE_SIZE;
695 	k_mem_assert_virtual_region(pos, size + (CONFIG_MMU_PAGE_SIZE * 2));
696 
697 	key = k_spin_lock(&z_mm_lock);
698 
699 	/* Check if both guard pages are unmapped.
700 	 * Bail if not, as this is probably a region not mapped
701 	 * using k_mem_map().
702 	 */
703 	pos = addr;
704 	ret = arch_page_phys_get(pos - CONFIG_MMU_PAGE_SIZE, NULL);
705 	if (ret == 0) {
706 		__ASSERT(ret == 0,
707 			 "%s: cannot find preceding guard page for (%p, %zu)",
708 			 __func__, addr, size);
709 		goto out;
710 	}
711 
712 	ret = arch_page_phys_get(pos + size, NULL);
713 	if (ret == 0) {
714 		__ASSERT(ret == 0,
715 			 "%s: cannot find succeeding guard page for (%p, %zu)",
716 			 __func__, addr, size);
717 		goto out;
718 	}
719 
720 	if (is_anon) {
721 		/* Unmapping anonymous memory */
722 		VIRT_FOREACH(addr, size, pos) {
723 #ifdef CONFIG_DEMAND_PAGING
724 			enum arch_page_location status;
725 			uintptr_t location;
726 
727 			status = arch_page_location_get(pos, &location);
728 			switch (status) {
729 			case ARCH_PAGE_LOCATION_PAGED_OUT:
730 				/*
731 				 * No pf is associated with this mapping.
732 				 * Simply get rid of the MMU entry and free
733 				 * corresponding backing store.
734 				 */
735 				arch_mem_unmap(pos, CONFIG_MMU_PAGE_SIZE);
736 				k_mem_paging_backing_store_location_free(location);
737 				continue;
738 			case ARCH_PAGE_LOCATION_PAGED_IN:
739 				/*
740 			 * The page is in memory, but it may not be
741 			 * accessible (in order to manage tracking
742 			 * of the ARCH_DATA_PAGE_ACCESSED flag),
743 			 * meaning arch_page_phys_get() could fail.
744 				 * Still, we know the actual phys address.
745 				 */
746 				phys = location;
747 				ret = 0;
748 				break;
749 			default:
750 				ret = arch_page_phys_get(pos, &phys);
751 				break;
752 			}
753 #else
754 			ret = arch_page_phys_get(pos, &phys);
755 #endif
756 			__ASSERT(ret == 0,
757 				 "%s: cannot unmap an unmapped address %p",
758 				 __func__, pos);
759 			if (ret != 0) {
760 				/* Found an address not mapped. Do not continue. */
761 				goto out;
762 			}
763 
764 			__ASSERT(k_mem_is_page_frame(phys),
765 				 "%s: 0x%lx is not a page frame", __func__, phys);
766 			if (!k_mem_is_page_frame(phys)) {
767 				/* Physical address has no corresponding page frame
768 				 * description in the page frame array.
769 				 * This should not happen. Do not continue.
770 				 */
771 				goto out;
772 			}
773 
774 			/* Grab the corresponding page frame from physical address */
775 			pf = k_mem_phys_to_page_frame(phys);
776 
777 			__ASSERT(k_mem_page_frame_is_mapped(pf),
778 				 "%s: 0x%lx is not a mapped page frame", __func__, phys);
779 			if (!k_mem_page_frame_is_mapped(pf)) {
780 				/* Page frame is not marked mapped.
781 				 * This should not happen. Do not continue.
782 				 */
783 				goto out;
784 			}
785 
786 			arch_mem_unmap(pos, CONFIG_MMU_PAGE_SIZE);
787 #ifdef CONFIG_DEMAND_PAGING
788 			if (IS_ENABLED(CONFIG_EVICTION_TRACKING) &&
789 			    (!k_mem_page_frame_is_pinned(pf))) {
790 				k_mem_paging_eviction_remove(pf);
791 			}
792 #endif
793 
794 			/* Put the page frame back into free list */
795 			page_frame_free_locked(pf);
796 		}
797 	} else {
798 		/*
799 		 * Unmapping previously mapped memory with a specific physical address.
800 		 *
801 		 * Note that we don't have to unmap the guard pages, as they should
802 		 * have been unmapped already. We just need to unmap the in-between
803 		 * region [addr, (addr + size)).
804 		 */
805 		arch_mem_unmap(addr, size);
806 	}
807 
808 	/* There are guard pages just before and after the mapped
809 	 * region. So we also need to free them from the bitmap.
810 	 */
811 	pos = (uint8_t *)addr - CONFIG_MMU_PAGE_SIZE;
812 	total_size = size + (CONFIG_MMU_PAGE_SIZE * 2);
813 	virt_region_free(pos, total_size);
814 
815 out:
816 	k_spin_unlock(&z_mm_lock, key);
817 }
818 
819 int k_mem_update_flags(void *addr, size_t size, uint32_t flags)
820 {
821 	uintptr_t phys;
822 	k_spinlock_key_t key;
823 	int ret;
824 
825 	k_mem_assert_virtual_region(addr, size);
826 
827 	key = k_spin_lock(&z_mm_lock);
828 
829 	/*
830 	 * We can achieve desired result without explicit architecture support
831 	 * by unmapping and remapping the same physical memory using new flags.
832 	 */
833 
834 	ret = arch_page_phys_get(addr, &phys);
835 	if (ret < 0) {
836 		goto out;
837 	}
838 
839 	/* TODO: detect and handle paged-out memory as well */
840 
841 	arch_mem_unmap(addr, size);
842 	arch_mem_map(addr, phys, size, flags);
843 
844 out:
845 	k_spin_unlock(&z_mm_lock, key);
846 	return ret;
847 }
848 
849 size_t k_mem_free_get(void)
850 {
851 	size_t ret;
852 	k_spinlock_key_t key;
853 
854 	__ASSERT(page_frames_initialized, "%s called too early", __func__);
855 
856 	key = k_spin_lock(&z_mm_lock);
857 #ifdef CONFIG_DEMAND_PAGING
858 	if (z_free_page_count > CONFIG_DEMAND_PAGING_PAGE_FRAMES_RESERVE) {
859 		ret = z_free_page_count - CONFIG_DEMAND_PAGING_PAGE_FRAMES_RESERVE;
860 	} else {
861 		ret = 0;
862 	}
863 #else
864 	ret = z_free_page_count;
865 #endif /* CONFIG_DEMAND_PAGING */
866 	k_spin_unlock(&z_mm_lock, key);
867 
868 	return ret * (size_t)CONFIG_MMU_PAGE_SIZE;
869 }
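/* Usage sketch (not compiled; sizes are examples): the anonymous-mapping path
 * above is normally exercised through the public k_mem_map()/k_mem_unmap()
 * wrappers, which call k_mem_map_phys_guard()/k_mem_unmap_phys_guard() with
 * is_anon == true. Requested sizes must be page-aligned.
 *
 *	size_t free_before = k_mem_free_get();
 *	void *buf = k_mem_map(4 * CONFIG_MMU_PAGE_SIZE, K_MEM_PERM_RW);
 *
 *	if (buf != NULL) {
 *		memset(buf, 0xaa, 4 * CONFIG_MMU_PAGE_SIZE);
 *		k_mem_unmap(buf, 4 * CONFIG_MMU_PAGE_SIZE);
 *	}
 *	// the free page count should now be back to roughly free_before
 */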
870 
871 /* Get the default virtual region alignment, here the default MMU page size
872  *
873  * @param[in] phys Physical address of region to be mapped, aligned to MMU_PAGE_SIZE
874  * @param[in] size Size of region to be mapped, aligned to MMU_PAGE_SIZE
875  *
876  * @retval alignment to apply on the virtual address of this region
877  */
878 static size_t virt_region_align(uintptr_t phys, size_t size)
879 {
880 	ARG_UNUSED(phys);
881 	ARG_UNUSED(size);
882 
883 	return CONFIG_MMU_PAGE_SIZE;
884 }
885 
886 __weak FUNC_ALIAS(virt_region_align, arch_virt_region_align, size_t);
887 
888 /* This may be called from arch early boot code before z_cstart() is invoked.
889  * Data will have been copied and BSS zeroed, but this function must not rely
890  * on any initialization functions having been called beforehand in order to work correctly.
891  */
892 void k_mem_map_phys_bare(uint8_t **virt_ptr, uintptr_t phys, size_t size, uint32_t flags)
893 {
894 	uintptr_t aligned_phys, addr_offset;
895 	size_t aligned_size, align_boundary;
896 	k_spinlock_key_t key;
897 	uint8_t *dest_addr;
898 	size_t num_bits;
899 	size_t offset;
900 
901 #ifndef CONFIG_KERNEL_DIRECT_MAP
902 	__ASSERT(!(flags & K_MEM_DIRECT_MAP), "The direct-map is not enabled");
903 #endif /* CONFIG_KERNEL_DIRECT_MAP */
904 	addr_offset = k_mem_region_align(&aligned_phys, &aligned_size,
905 					 phys, size,
906 					 CONFIG_MMU_PAGE_SIZE);
907 	__ASSERT(aligned_size != 0U, "0-length mapping at 0x%lx", aligned_phys);
908 	__ASSERT(aligned_phys < (aligned_phys + (aligned_size - 1)),
909 		 "wraparound for physical address 0x%lx (size %zu)",
910 		 aligned_phys, aligned_size);
911 
912 	align_boundary = arch_virt_region_align(aligned_phys, aligned_size);
913 
914 	key = k_spin_lock(&z_mm_lock);
915 
916 	if (IS_ENABLED(CONFIG_KERNEL_DIRECT_MAP) &&
917 	    (flags & K_MEM_DIRECT_MAP)) {
918 		dest_addr = (uint8_t *)aligned_phys;
919 
920 		/* Mark the region of virtual memory bitmap as used
921 		 * if the region overlaps the virtual memory space.
922 		 *
923 		 * Basically if either end of region is within
924 		 * virtual memory space, we need to mark the bits.
925 		 */
926 
927 		if (IN_RANGE(aligned_phys,
928 			      (uintptr_t)K_MEM_VIRT_RAM_START,
929 			      (uintptr_t)(K_MEM_VIRT_RAM_END - 1)) ||
930 		    IN_RANGE(aligned_phys + aligned_size - 1,
931 			      (uintptr_t)K_MEM_VIRT_RAM_START,
932 			      (uintptr_t)(K_MEM_VIRT_RAM_END - 1))) {
933 			uint8_t *adjusted_start = MAX(dest_addr, K_MEM_VIRT_RAM_START);
934 			uint8_t *adjusted_end = MIN(dest_addr + aligned_size,
935 						    K_MEM_VIRT_RAM_END);
936 			size_t adjusted_sz = adjusted_end - adjusted_start;
937 
938 			num_bits = adjusted_sz / CONFIG_MMU_PAGE_SIZE;
939 			offset = virt_to_bitmap_offset(adjusted_start, adjusted_sz);
940 			if (sys_bitarray_test_and_set_region(
941 			    &virt_region_bitmap, num_bits, offset, true)) {
942 				goto fail;
943 			}
944 		}
945 	} else {
946 		/* Obtain an appropriately sized chunk of virtual memory */
947 		dest_addr = virt_region_alloc(aligned_size, align_boundary);
948 		if (!dest_addr) {
949 			goto fail;
950 		}
951 	}
952 
953 	/* If this fails there's something amiss with virt_region_alloc() */
954 	__ASSERT((uintptr_t)dest_addr <
955 		 ((uintptr_t)dest_addr + (size - 1)),
956 		 "wraparound for virtual address %p (size %zu)",
957 		 dest_addr, size);
958 
959 	LOG_DBG("arch_mem_map(%p, 0x%lx, %zu, %x) offset %lu", (void *)dest_addr,
960 		aligned_phys, aligned_size, flags, addr_offset);
961 
962 	arch_mem_map(dest_addr, aligned_phys, aligned_size, flags);
963 	k_spin_unlock(&z_mm_lock, key);
964 
965 	*virt_ptr = dest_addr + addr_offset;
966 	return;
967 fail:
968 	/* May re-visit this in the future, but for now running out of
969 	 * virtual address space or failing the arch_mem_map() call is
970 	 * an unrecoverable situation.
971 	 *
972 	 * Other problems not related to resource exhaustion we leave as
973 	 * assertions since they are clearly programming mistakes.
974 	 */
975 	LOG_ERR("memory mapping 0x%lx (size %zu, flags 0x%x) failed",
976 		phys, size, flags);
977 	k_panic();
978 }
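/* Usage sketch (hypothetical device and addresses): mapping an MMIO register
 * block with k_mem_map_phys_bare(). A real caller takes the physical address
 * and size from the hardware description; uncached access is requested for
 * registers. Note that the physical address does not need to be page aligned:
 * the returned pointer carries the sub-page offset.
 *
 *	uint8_t *regs;
 *
 *	k_mem_map_phys_bare(&regs, 0xfe201000UL, 0x100,
 *			    K_MEM_PERM_RW | K_MEM_CACHE_NONE);
 *	sys_write32(0x1, (mem_addr_t)(regs + 0x30));	// hypothetical register
 */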
979 
980 void k_mem_unmap_phys_bare(uint8_t *virt, size_t size)
981 {
982 	uintptr_t aligned_virt, addr_offset;
983 	size_t aligned_size;
984 	k_spinlock_key_t key;
985 
986 	addr_offset = k_mem_region_align(&aligned_virt, &aligned_size,
987 					 POINTER_TO_UINT(virt), size,
988 					 CONFIG_MMU_PAGE_SIZE);
989 	__ASSERT(aligned_size != 0U, "0-length mapping at 0x%lx", aligned_virt);
990 	__ASSERT(aligned_virt < (aligned_virt + (aligned_size - 1)),
991 		 "wraparound for virtual address 0x%lx (size %zu)",
992 		 aligned_virt, aligned_size);
993 
994 	key = k_spin_lock(&z_mm_lock);
995 
996 	LOG_DBG("arch_mem_unmap(0x%lx, %zu) offset %lu",
997 		aligned_virt, aligned_size, addr_offset);
998 
999 	arch_mem_unmap(UINT_TO_POINTER(aligned_virt), aligned_size);
1000 	virt_region_free(UINT_TO_POINTER(aligned_virt), aligned_size);
1001 	k_spin_unlock(&z_mm_lock, key);
1002 }
1003 
1004 /*
1005  * Miscellaneous
1006  */
1007 
1008 size_t k_mem_region_align(uintptr_t *aligned_addr, size_t *aligned_size,
1009 			  uintptr_t addr, size_t size, size_t align)
1010 {
1011 	size_t addr_offset;
1012 
1013 	/* The actual mapped region must be page-aligned. Round down the
1014 	 * physical address and pad the region size appropriately
1015 	 */
1016 	*aligned_addr = ROUND_DOWN(addr, align);
1017 	addr_offset = addr - *aligned_addr;
1018 	*aligned_size = ROUND_UP(size + addr_offset, align);
1019 
1020 	return addr_offset;
1021 }
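/* Worked example for k_mem_region_align() (assuming a 4 KiB alignment):
 * a request to map 0x2100 bytes at physical address 0x80001234 produces
 *
 *	*aligned_addr == 0x80001000	(rounded down to a page boundary)
 *	addr_offset   == 0x234		(returned to the caller)
 *	*aligned_size == 0x3000		(ROUND_UP(0x2100 + 0x234, 0x1000))
 *
 * so three pages get mapped, and the caller adds addr_offset back to the
 * chosen virtual base to recover the address of the first requested byte.
 */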
1022 
1023 #if defined(CONFIG_LINKER_USE_BOOT_SECTION) || defined(CONFIG_LINKER_USE_PINNED_SECTION)
1024 static void mark_linker_section_pinned(void *start_addr, void *end_addr,
1025 				       bool pin)
1026 {
1027 	struct k_mem_page_frame *pf;
1028 	uint8_t *addr;
1029 
1030 	uintptr_t pinned_start = ROUND_DOWN(POINTER_TO_UINT(start_addr),
1031 					    CONFIG_MMU_PAGE_SIZE);
1032 	uintptr_t pinned_end = ROUND_UP(POINTER_TO_UINT(end_addr),
1033 					CONFIG_MMU_PAGE_SIZE);
1034 	size_t pinned_size = pinned_end - pinned_start;
1035 
1036 	VIRT_FOREACH(UINT_TO_POINTER(pinned_start), pinned_size, addr)
1037 	{
1038 		pf = k_mem_phys_to_page_frame(K_MEM_BOOT_VIRT_TO_PHYS(addr));
1039 		frame_mapped_set(pf, addr);
1040 
1041 		if (pin) {
1042 			k_mem_page_frame_set(pf, K_MEM_PAGE_FRAME_PINNED);
1043 		} else {
1044 			k_mem_page_frame_clear(pf, K_MEM_PAGE_FRAME_PINNED);
1045 #ifdef CONFIG_DEMAND_PAGING
1046 			if (IS_ENABLED(CONFIG_EVICTION_TRACKING) &&
1047 			    k_mem_page_frame_is_evictable(pf)) {
1048 				k_mem_paging_eviction_add(pf);
1049 			}
1050 #endif
1051 		}
1052 	}
1053 }
1054 #endif /* CONFIG_LINKER_USE_BOOT_SECTION) || CONFIG_LINKER_USE_PINNED_SECTION */
1055 
1056 #ifdef CONFIG_LINKER_USE_ONDEMAND_SECTION
1057 static void z_paging_ondemand_section_map(void)
1058 {
1059 	uint8_t *addr;
1060 	size_t size;
1061 	uintptr_t location;
1062 	uint32_t flags;
1063 
1064 	size = (uintptr_t)lnkr_ondemand_text_size;
1065 	flags = K_MEM_MAP_UNPAGED | K_MEM_PERM_EXEC | K_MEM_CACHE_WB;
1066 	VIRT_FOREACH(lnkr_ondemand_text_start, size, addr) {
1067 		k_mem_paging_backing_store_location_query(addr, &location);
1068 		arch_mem_map(addr, location, CONFIG_MMU_PAGE_SIZE, flags);
1069 		sys_bitarray_set_region(&virt_region_bitmap, 1,
1070 					virt_to_bitmap_offset(addr, CONFIG_MMU_PAGE_SIZE));
1071 	}
1072 
1073 	size = (uintptr_t)lnkr_ondemand_rodata_size;
1074 	flags = K_MEM_MAP_UNPAGED | K_MEM_CACHE_WB;
1075 	VIRT_FOREACH(lnkr_ondemand_rodata_start, size, addr) {
1076 		k_mem_paging_backing_store_location_query(addr, &location);
1077 		arch_mem_map(addr, location, CONFIG_MMU_PAGE_SIZE, flags);
1078 		sys_bitarray_set_region(&virt_region_bitmap, 1,
1079 					virt_to_bitmap_offset(addr, CONFIG_MMU_PAGE_SIZE));
1080 	}
1081 }
1082 #endif /* CONFIG_LINKER_USE_ONDEMAND_SECTION */
1083 
1084 void z_mem_manage_init(void)
1085 {
1086 	uintptr_t phys;
1087 	uint8_t *addr;
1088 	struct k_mem_page_frame *pf;
1089 	k_spinlock_key_t key = k_spin_lock(&z_mm_lock);
1090 
1091 	free_page_frame_list_init();
1092 
1093 	ARG_UNUSED(addr);
1094 
1095 #ifdef CONFIG_ARCH_HAS_RESERVED_PAGE_FRAMES
1096 	/* If some page frames are unavailable for use as memory, arch
1097 	 * code will mark K_MEM_PAGE_FRAME_RESERVED in their flags
1098 	 */
1099 	arch_reserved_pages_update();
1100 #endif /* CONFIG_ARCH_HAS_RESERVED_PAGE_FRAMES */
1101 
1102 #ifdef CONFIG_LINKER_GENERIC_SECTIONS_PRESENT_AT_BOOT
1103 	/* All pages composing the Zephyr image are mapped at boot in a
1104 	 * predictable way. This can change at runtime.
1105 	 */
1106 	VIRT_FOREACH(K_MEM_KERNEL_VIRT_START, K_MEM_KERNEL_VIRT_SIZE, addr)
1107 	{
1108 		pf = k_mem_phys_to_page_frame(K_MEM_BOOT_VIRT_TO_PHYS(addr));
1109 		frame_mapped_set(pf, addr);
1110 
1111 		/* TODO: for now we pin the whole Zephyr image. Demand paging
1112 		 * is currently tested only with anonymously-mapped pages, which are not
1113 		 * pinned.
1114 		 *
1115 		 * We will need to setup linker regions for a subset of kernel
1116 		 * code/data pages which are pinned in memory and
1117 		 * may not be evicted. This will contain critical CPU data
1118 		 * structures, and any code used to perform page fault
1119 		 * handling, page-ins, etc.
1120 		 */
1121 		k_mem_page_frame_set(pf, K_MEM_PAGE_FRAME_PINNED);
1122 	}
1123 #endif /* CONFIG_LINKER_GENERIC_SECTIONS_PRESENT_AT_BOOT */
1124 
1125 #ifdef CONFIG_LINKER_USE_BOOT_SECTION
1126 	/* Pin the boot section to prevent it from being swapped out during
1127 	 * boot process. Will be un-pinned once boot process completes.
1128 	 */
1129 	mark_linker_section_pinned(lnkr_boot_start, lnkr_boot_end, true);
1130 #endif /* CONFIG_LINKER_USE_BOOT_SECTION */
1131 
1132 #ifdef CONFIG_LINKER_USE_PINNED_SECTION
1133 	/* Pin the page frames corresponding to the pinned symbols */
1134 	mark_linker_section_pinned(lnkr_pinned_start, lnkr_pinned_end, true);
1135 #endif /* CONFIG_LINKER_USE_PINNED_SECTION */
1136 
1137 	/* Any remaining pages that aren't mapped, reserved, or pinned get
1138 	 * added to the free pages list
1139 	 */
1140 	K_MEM_PAGE_FRAME_FOREACH(phys, pf) {
1141 		if (k_mem_page_frame_is_available(pf)) {
1142 			free_page_frame_list_put(pf);
1143 		}
1144 	}
1145 	LOG_DBG("free page frames: %zu", z_free_page_count);
1146 
1147 #ifdef CONFIG_DEMAND_PAGING
1148 #ifdef CONFIG_DEMAND_PAGING_TIMING_HISTOGRAM
1149 	z_paging_histogram_init();
1150 #endif /* CONFIG_DEMAND_PAGING_TIMING_HISTOGRAM */
1151 	k_mem_paging_backing_store_init();
1152 	k_mem_paging_eviction_init();
1153 
1154 	if (IS_ENABLED(CONFIG_EVICTION_TRACKING)) {
1155 		/* start tracking evictable pages installed above, if any */
1156 		K_MEM_PAGE_FRAME_FOREACH(phys, pf) {
1157 			if (k_mem_page_frame_is_evictable(pf)) {
1158 				k_mem_paging_eviction_add(pf);
1159 			}
1160 		}
1161 	}
1162 #endif /* CONFIG_DEMAND_PAGING */
1163 
1164 #ifdef CONFIG_LINKER_USE_ONDEMAND_SECTION
1165 	z_paging_ondemand_section_map();
1166 #endif
1167 
1168 #if __ASSERT_ON
1169 	page_frames_initialized = true;
1170 #endif
1171 	k_spin_unlock(&z_mm_lock, key);
1172 
1173 #ifndef CONFIG_LINKER_GENERIC_SECTIONS_PRESENT_AT_BOOT
1174 	/* If BSS section is not present in memory at boot,
1175 	 * it would not have been cleared. This needs to be
1176 	 * done now since the paging mechanism has been initialized
1177 	 * and the BSS pages can be brought into physical
1178 	 * memory to be cleared.
1179 	 */
1180 	arch_bss_zero();
1181 #endif /* CONFIG_LINKER_GENERIC_SECTIONS_PRESENT_AT_BOOT */
1182 }
1183 
1184 void z_mem_manage_boot_finish(void)
1185 {
1186 #ifdef CONFIG_LINKER_USE_BOOT_SECTION
1187 	/* At the end of boot process, unpin the boot sections
1188 	 * as they don't need to be in memory all the time anymore.
1189 	 */
1190 	mark_linker_section_pinned(lnkr_boot_start, lnkr_boot_end, false);
1191 #endif /* CONFIG_LINKER_USE_BOOT_SECTION */
1192 }
1193 
1194 #ifdef CONFIG_DEMAND_PAGING
1195 
1196 #ifdef CONFIG_DEMAND_PAGING_STATS
1197 struct k_mem_paging_stats_t paging_stats;
1198 extern struct k_mem_paging_histogram_t z_paging_histogram_eviction;
1199 extern struct k_mem_paging_histogram_t z_paging_histogram_backing_store_page_in;
1200 extern struct k_mem_paging_histogram_t z_paging_histogram_backing_store_page_out;
1201 #endif /* CONFIG_DEMAND_PAGING_STATS */
1202 
1203 static inline void do_backing_store_page_in(uintptr_t location)
1204 {
1205 #ifdef CONFIG_DEMAND_MAPPING
1206 	/* Check for special cases */
1207 	switch (location) {
1208 	case ARCH_UNPAGED_ANON_ZERO:
1209 		memset(K_MEM_SCRATCH_PAGE, 0, CONFIG_MMU_PAGE_SIZE);
1210 		__fallthrough;
1211 	case ARCH_UNPAGED_ANON_UNINIT:
1212 		/* nothing else to do */
1213 		return;
1214 	default:
1215 		break;
1216 	}
1217 #endif /* CONFIG_DEMAND_MAPPING */
1218 
1219 #ifdef CONFIG_DEMAND_PAGING_TIMING_HISTOGRAM
1220 	uint32_t time_diff;
1221 
1222 #ifdef CONFIG_DEMAND_PAGING_STATS_USING_TIMING_FUNCTIONS
1223 	timing_t time_start, time_end;
1224 
1225 	time_start = timing_counter_get();
1226 #else
1227 	uint32_t time_start;
1228 
1229 	time_start = k_cycle_get_32();
1230 #endif /* CONFIG_DEMAND_PAGING_STATS_USING_TIMING_FUNCTIONS */
1231 #endif /* CONFIG_DEMAND_PAGING_TIMING_HISTOGRAM */
1232 
1233 	k_mem_paging_backing_store_page_in(location);
1234 
1235 #ifdef CONFIG_DEMAND_PAGING_TIMING_HISTOGRAM
1236 #ifdef CONFIG_DEMAND_PAGING_STATS_USING_TIMING_FUNCTIONS
1237 	time_end = timing_counter_get();
1238 	time_diff = (uint32_t)timing_cycles_get(&time_start, &time_end);
1239 #else
1240 	time_diff = k_cycle_get_32() - time_start;
1241 #endif /* CONFIG_DEMAND_PAGING_STATS_USING_TIMING_FUNCTIONS */
1242 
1243 	z_paging_histogram_inc(&z_paging_histogram_backing_store_page_in,
1244 			       time_diff);
1245 #endif /* CONFIG_DEMAND_PAGING_TIMING_HISTOGRAM */
1246 }
1247 
1248 static inline void do_backing_store_page_out(uintptr_t location)
1249 {
1250 #ifdef CONFIG_DEMAND_PAGING_TIMING_HISTOGRAM
1251 	uint32_t time_diff;
1252 
1253 #ifdef CONFIG_DEMAND_PAGING_STATS_USING_TIMING_FUNCTIONS
1254 	timing_t time_start, time_end;
1255 
1256 	time_start = timing_counter_get();
1257 #else
1258 	uint32_t time_start;
1259 
1260 	time_start = k_cycle_get_32();
1261 #endif /* CONFIG_DEMAND_PAGING_STATS_USING_TIMING_FUNCTIONS */
1262 #endif /* CONFIG_DEMAND_PAGING_TIMING_HISTOGRAM */
1263 
1264 	k_mem_paging_backing_store_page_out(location);
1265 
1266 #ifdef CONFIG_DEMAND_PAGING_TIMING_HISTOGRAM
1267 #ifdef CONFIG_DEMAND_PAGING_STATS_USING_TIMING_FUNCTIONS
1268 	time_end = timing_counter_get();
1269 	time_diff = (uint32_t)timing_cycles_get(&time_start, &time_end);
1270 #else
1271 	time_diff = k_cycle_get_32() - time_start;
1272 #endif /* CONFIG_DEMAND_PAGING_STATS_USING_TIMING_FUNCTIONS */
1273 
1274 	z_paging_histogram_inc(&z_paging_histogram_backing_store_page_out,
1275 			       time_diff);
1276 #endif /* CONFIG_DEMAND_PAGING_TIMING_HISTOGRAM */
1277 }
1278 
1279 #if defined(CONFIG_SMP) && defined(CONFIG_DEMAND_PAGING_ALLOW_IRQ)
1280 /*
1281  * SMP support is very simple. Some resources such as the scratch page could
1282  * be made per CPU, backing store driver execution be confined to the faulting
1283  * CPU, statistics be made to cope with access concurrency, etc. But in the
1284  * end we're dealing with memory transfer to/from some external storage which
1285  * is inherently slow and whose access is most likely serialized anyway.
1286  * So let's simply enforce global demand paging serialization across all CPUs
1287  * with a mutex as there is no real gain from added parallelism here.
1288  */
1289 static K_MUTEX_DEFINE(z_mm_paging_lock);
1290 #endif
1291 
1292 static void virt_region_foreach(void *addr, size_t size,
1293 				void (*func)(void *))
1294 {
1295 	k_mem_assert_virtual_region(addr, size);
1296 
1297 	for (size_t offset = 0; offset < size; offset += CONFIG_MMU_PAGE_SIZE) {
1298 		func((uint8_t *)addr + offset);
1299 	}
1300 }
1301 
1302 /*
1303  * Perform some preparatory steps before paging out. The provided page frame
1304  * must be evicted to the backing store immediately after this is called
1305  * with a call to k_mem_paging_backing_store_page_out() if it contains
1306  * a data page.
1307  *
1308  * - Map page frame to scratch area if requested. This is always true if we're
1309  *   doing a page fault, but is only set on manual evictions if the page is
1310  *   dirty.
1311  * - If mapped:
1312  *    - obtain backing store location and populate location parameter
1313  *    - Update page tables with location
1314  * - Mark page frame as busy
1315  *
1316  * Returns -ENOMEM if the backing store is full
1317  */
1318 static int page_frame_prepare_locked(struct k_mem_page_frame *pf, bool *dirty_ptr,
1319 				     bool page_fault, uintptr_t *location_ptr)
1320 {
1321 	uintptr_t phys;
1322 	int ret;
1323 	bool dirty = *dirty_ptr;
1324 
1325 	phys = k_mem_page_frame_to_phys(pf);
1326 	__ASSERT(!k_mem_page_frame_is_pinned(pf), "page frame 0x%lx is pinned",
1327 		 phys);
1328 
1329 	/* If the backing store doesn't have a copy of the page, even if it
1330 	 * wasn't modified, treat as dirty. This can happen for a few
1331 	 * reasons:
1332 	 * 1) Page has never been swapped out before, and the backing store
1333 	 *    wasn't pre-populated with this data page.
1334 	 * 2) Page was swapped out before, but the page contents were not
1335 	 *    preserved after swapping back in.
1336 	 * 3) Page contents were preserved when swapped back in, but were later
1337 	 *    evicted from the backing store to make room for other evicted
1338 	 *    pages.
1339 	 */
1340 	if (k_mem_page_frame_is_mapped(pf)) {
1341 		dirty = dirty || !k_mem_page_frame_is_backed(pf);
1342 	}
1343 
1344 	if (dirty || page_fault) {
1345 		arch_mem_scratch(phys);
1346 	}
1347 
1348 	if (k_mem_page_frame_is_mapped(pf)) {
1349 		ret = k_mem_paging_backing_store_location_get(pf, location_ptr,
1350 							      page_fault);
1351 		if (ret != 0) {
1352 			LOG_ERR("out of backing store memory");
1353 			return -ENOMEM;
1354 		}
1355 		arch_mem_page_out(k_mem_page_frame_to_virt(pf), *location_ptr);
1356 
1357 		if (IS_ENABLED(CONFIG_EVICTION_TRACKING)) {
1358 			k_mem_paging_eviction_remove(pf);
1359 		}
1360 	} else {
1361 		/* Shouldn't happen unless this function is mis-used */
1362 		__ASSERT(!dirty, "un-mapped page determined to be dirty");
1363 	}
1364 #ifdef CONFIG_DEMAND_PAGING_ALLOW_IRQ
1365 	/* Mark as busy so that k_mem_page_frame_is_evictable() returns false */
1366 	__ASSERT(!k_mem_page_frame_is_busy(pf), "page frame 0x%lx is already busy",
1367 		 phys);
1368 	k_mem_page_frame_set(pf, K_MEM_PAGE_FRAME_BUSY);
1369 #endif /* CONFIG_DEMAND_PAGING_ALLOW_IRQ */
1370 	/* Update dirty parameter, since we set it to true if it wasn't backed
1371 	 * even if otherwise clean
1372 	 */
1373 	*dirty_ptr = dirty;
1374 
1375 	return 0;
1376 }
1377 
1378 static int do_mem_evict(void *addr)
1379 {
1380 	bool dirty;
1381 	struct k_mem_page_frame *pf;
1382 	uintptr_t location;
1383 	k_spinlock_key_t key;
1384 	uintptr_t flags, phys;
1385 	int ret;
1386 
1387 #ifdef CONFIG_DEMAND_PAGING_ALLOW_IRQ
1388 	__ASSERT(!k_is_in_isr(),
1389 		 "%s is unavailable in ISRs with CONFIG_DEMAND_PAGING_ALLOW_IRQ",
1390 		 __func__);
1391 #ifdef CONFIG_SMP
1392 	k_mutex_lock(&z_mm_paging_lock, K_FOREVER);
1393 #else
1394 	k_sched_lock();
1395 #endif
1396 #endif /* CONFIG_DEMAND_PAGING_ALLOW_IRQ */
1397 	key = k_spin_lock(&z_mm_lock);
1398 	flags = arch_page_info_get(addr, &phys, false);
1399 	__ASSERT((flags & ARCH_DATA_PAGE_NOT_MAPPED) == 0,
1400 		 "address %p isn't mapped", addr);
1401 	if ((flags & ARCH_DATA_PAGE_LOADED) == 0) {
1402 		/* Un-mapped or already evicted. Nothing to do */
1403 		ret = 0;
1404 		goto out;
1405 	}
1406 
1407 	dirty = (flags & ARCH_DATA_PAGE_DIRTY) != 0;
1408 	pf = k_mem_phys_to_page_frame(phys);
1409 	__ASSERT(k_mem_page_frame_to_virt(pf) == addr, "page frame address mismatch");
1410 	ret = page_frame_prepare_locked(pf, &dirty, false, &location);
1411 	if (ret != 0) {
1412 		goto out;
1413 	}
1414 
1415 	__ASSERT(ret == 0, "failed to prepare page frame");
1416 #ifdef CONFIG_DEMAND_PAGING_ALLOW_IRQ
1417 	k_spin_unlock(&z_mm_lock, key);
1418 #endif /* CONFIG_DEMAND_PAGING_ALLOW_IRQ */
1419 	if (dirty) {
1420 		do_backing_store_page_out(location);
1421 	}
1422 #ifdef CONFIG_DEMAND_PAGING_ALLOW_IRQ
1423 	key = k_spin_lock(&z_mm_lock);
1424 #endif /* CONFIG_DEMAND_PAGING_ALLOW_IRQ */
1425 	page_frame_free_locked(pf);
1426 out:
1427 	k_spin_unlock(&z_mm_lock, key);
1428 #ifdef CONFIG_DEMAND_PAGING_ALLOW_IRQ
1429 #ifdef CONFIG_SMP
1430 	k_mutex_unlock(&z_mm_paging_lock);
1431 #else
1432 	k_sched_unlock();
1433 #endif
1434 #endif /* CONFIG_DEMAND_PAGING_ALLOW_IRQ */
1435 	return ret;
1436 }
1437 
1438 int k_mem_page_out(void *addr, size_t size)
1439 {
1440 	__ASSERT(page_frames_initialized, "%s called on %p too early", __func__,
1441 		 addr);
1442 	k_mem_assert_virtual_region(addr, size);
1443 
1444 	for (size_t offset = 0; offset < size; offset += CONFIG_MMU_PAGE_SIZE) {
1445 		void *pos = (uint8_t *)addr + offset;
1446 		int ret;
1447 
1448 		ret = do_mem_evict(pos);
1449 		if (ret != 0) {
1450 			return ret;
1451 		}
1452 	}
1453 
1454 	return 0;
1455 }
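/* Usage sketch (buffer and size are illustrative): proactively evicting a
 * large buffer that won't be touched for a while, then bringing it back with
 * k_mem_page_in() (declared alongside this API in
 * <zephyr/kernel/mm/demand_paging.h>) ahead of a latency-sensitive phase.
 *
 *	if (k_mem_page_out(buf, 16 * CONFIG_MMU_PAGE_SIZE) != 0) {
 *		// backing store full; buf stays resident
 *	}
 *	...
 *	k_mem_page_in(buf, 16 * CONFIG_MMU_PAGE_SIZE);
 */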
1456 
1457 int k_mem_page_frame_evict(uintptr_t phys)
1458 {
1459 	k_spinlock_key_t key;
1460 	struct k_mem_page_frame *pf;
1461 	bool dirty;
1462 	uintptr_t flags;
1463 	uintptr_t location;
1464 	int ret;
1465 
1466 	__ASSERT(page_frames_initialized, "%s called on 0x%lx too early",
1467 		 __func__, phys);
1468 
1469 	/* Implementation is similar to do_page_fault() except there is no
1470 	 * data page to page-in, see comments in that function.
1471 	 */
1472 
1473 #ifdef CONFIG_DEMAND_PAGING_ALLOW_IRQ
1474 	__ASSERT(!k_is_in_isr(),
1475 		 "%s is unavailable in ISRs with CONFIG_DEMAND_PAGING_ALLOW_IRQ",
1476 		 __func__);
1477 #ifdef CONFIG_SMP
1478 	k_mutex_lock(&z_mm_paging_lock, K_FOREVER);
1479 #else
1480 	k_sched_lock();
1481 #endif
1482 #endif /* CONFIG_DEMAND_PAGING_ALLOW_IRQ */
1483 	key = k_spin_lock(&z_mm_lock);
1484 	pf = k_mem_phys_to_page_frame(phys);
1485 	if (!k_mem_page_frame_is_mapped(pf)) {
1486 		/* Nothing to do, free page */
1487 		ret = 0;
1488 		goto out;
1489 	}
1490 	flags = arch_page_info_get(k_mem_page_frame_to_virt(pf), NULL, false);
1491 	/* Shouldn't ever happen */
1492 	__ASSERT((flags & ARCH_DATA_PAGE_LOADED) != 0, "data page not loaded");
1493 	dirty = (flags & ARCH_DATA_PAGE_DIRTY) != 0;
1494 	ret = page_frame_prepare_locked(pf, &dirty, false, &location);
1495 	if (ret != 0) {
1496 		goto out;
1497 	}
1498 
1499 #ifdef CONFIG_DEMAND_PAGING_ALLOW_IRQ
1500 	k_spin_unlock(&z_mm_lock, key);
1501 #endif /* CONFIG_DEMAND_PAGING_ALLOW_IRQ */
1502 	if (dirty) {
1503 		do_backing_store_page_out(location);
1504 	}
1505 #ifdef CONFIG_DEMAND_PAGING_ALLOW_IRQ
1506 	key = k_spin_lock(&z_mm_lock);
1507 #endif /* CONFIG_DEMAND_PAGING_ALLOW_IRQ */
1508 	page_frame_free_locked(pf);
1509 out:
1510 	k_spin_unlock(&z_mm_lock, key);
1511 #ifdef CONFIG_DEMAND_PAGING_ALLOW_IRQ
1512 #ifdef CONFIG_SMP
1513 	k_mutex_unlock(&z_mm_paging_lock);
1514 #else
1515 	k_sched_unlock();
1516 #endif
1517 #endif /* CONFIG_DEMAND_PAGING_ALLOW_IRQ */
1518 	return ret;
1519 }
1520 
1521 static inline void paging_stats_faults_inc(struct k_thread *faulting_thread,
1522 					   int key)
1523 {
1524 #ifdef CONFIG_DEMAND_PAGING_STATS
1525 	bool is_irq_unlocked = arch_irq_unlocked(key);
1526 
1527 	paging_stats.pagefaults.cnt++;
1528 
1529 	if (is_irq_unlocked) {
1530 		paging_stats.pagefaults.irq_unlocked++;
1531 	} else {
1532 		paging_stats.pagefaults.irq_locked++;
1533 	}
1534 
1535 #ifdef CONFIG_DEMAND_PAGING_THREAD_STATS
1536 	faulting_thread->paging_stats.pagefaults.cnt++;
1537 
1538 	if (is_irq_unlocked) {
1539 		faulting_thread->paging_stats.pagefaults.irq_unlocked++;
1540 	} else {
1541 		faulting_thread->paging_stats.pagefaults.irq_locked++;
1542 	}
1543 #else
1544 	ARG_UNUSED(faulting_thread);
1545 #endif /* CONFIG_DEMAND_PAGING_THREAD_STATS */
1546 
1547 #ifndef CONFIG_DEMAND_PAGING_ALLOW_IRQ
1548 	if (k_is_in_isr()) {
1549 		paging_stats.pagefaults.in_isr++;
1550 
1551 #ifdef CONFIG_DEMAND_PAGING_THREAD_STATS
1552 		faulting_thread->paging_stats.pagefaults.in_isr++;
1553 #endif /* CONFIG_DEMAND_PAGING_THREAD_STATS */
1554 	}
1555 #endif /* CONFIG_DEMAND_PAGING_ALLOW_IRQ */
1556 #endif /* CONFIG_DEMAND_PAGING_STATS */
1557 }
1558 
1559 static inline void paging_stats_eviction_inc(struct k_thread *faulting_thread,
1560 					     bool dirty)
1561 {
1562 #ifdef CONFIG_DEMAND_PAGING_STATS
1563 	if (dirty) {
1564 		paging_stats.eviction.dirty++;
1565 	} else {
1566 		paging_stats.eviction.clean++;
1567 	}
1568 #ifdef CONFIG_DEMAND_PAGING_THREAD_STATS
1569 	if (dirty) {
1570 		faulting_thread->paging_stats.eviction.dirty++;
1571 	} else {
1572 		faulting_thread->paging_stats.eviction.clean++;
1573 	}
1574 #else
1575 	ARG_UNUSED(faulting_thread);
1576 #endif /* CONFIG_DEMAND_PAGING_THREAD_STATS */
1577 #endif /* CONFIG_DEMAND_PAGING_STATS */
1578 }
1579 
1580 static inline struct k_mem_page_frame *do_eviction_select(bool *dirty)
1581 {
1582 	struct k_mem_page_frame *pf;
1583 
1584 #ifdef CONFIG_DEMAND_PAGING_TIMING_HISTOGRAM
1585 	uint32_t time_diff;
1586 
1587 #ifdef CONFIG_DEMAND_PAGING_STATS_USING_TIMING_FUNCTIONS
1588 	timing_t time_start, time_end;
1589 
1590 	time_start = timing_counter_get();
1591 #else
1592 	uint32_t time_start;
1593 
1594 	time_start = k_cycle_get_32();
1595 #endif /* CONFIG_DEMAND_PAGING_STATS_USING_TIMING_FUNCTIONS */
1596 #endif /* CONFIG_DEMAND_PAGING_TIMING_HISTOGRAM */
1597 
1598 	pf = k_mem_paging_eviction_select(dirty);
1599 
1600 #ifdef CONFIG_DEMAND_PAGING_TIMING_HISTOGRAM
1601 #ifdef CONFIG_DEMAND_PAGING_STATS_USING_TIMING_FUNCTIONS
1602 	time_end = timing_counter_get();
1603 	time_diff = (uint32_t)timing_cycles_get(&time_start, &time_end);
1604 #else
1605 	time_diff = k_cycle_get_32() - time_start;
1606 #endif /* CONFIG_DEMAND_PAGING_STATS_USING_TIMING_FUNCTIONS */
1607 
1608 	z_paging_histogram_inc(&z_paging_histogram_eviction, time_diff);
1609 #endif /* CONFIG_DEMAND_PAGING_TIMING_HISTOGRAM */
1610 
1611 	return pf;
1612 }
1613 
1614 static bool do_page_fault(void *addr, bool pin)
1615 {
1616 	struct k_mem_page_frame *pf;
1617 	k_spinlock_key_t key;
1618 	uintptr_t page_in_location, page_out_location;
1619 	enum arch_page_location status;
1620 	bool result;
1621 	bool dirty = false;
1622 	struct k_thread *faulting_thread;
1623 	int ret;
1624 
1625 	__ASSERT(page_frames_initialized, "page fault at %p happened too early",
1626 		 addr);
1627 
1628 	LOG_DBG("page fault at %p", addr);
1629 
1630 	/*
1631 	 * TODO: Add performance accounting:
1632 	 * - k_mem_paging_eviction_select() metrics
1633 	 *   * periodic timer execution time histogram (if implemented)
1634 	 */
1635 
#ifdef CONFIG_DEMAND_PAGING_ALLOW_IRQ
	/*
	 * We re-enable interrupts during the page-in/page-out operation
	 * if and only if interrupts were enabled when the exception was
	 * taken; in this configuration page faults in an ISR are a bug,
	 * as all ISR code/data must be pinned.
	 *
	 * If interrupts were disabled when the exception was taken, the
	 * arch code is responsible for keeping them that way when entering
	 * this function.
	 *
	 * If this is not enabled, then interrupts are always locked for the
	 * entire operation. This is far worse for system interrupt latency,
	 * but requires fewer pinned pages, and ISRs may also take page
	 * faults.
	 *
	 * On UP we lock the scheduler so that other threads are never
	 * scheduled during the page-in/out operation. Support for
	 * allowing k_mem_paging_backing_store_page_out() and
	 * k_mem_paging_backing_store_page_in() to also sleep and allow
	 * other threads to run (such as in the case where the transfer is
	 * async DMA) is not supported on UP. Even if limited to thread
	 * context, arbitrary memory access triggering exceptions that put
	 * a thread to sleep on a contended page fault operation will break
	 * scheduling assumptions of cooperative threads or threads that
	 * implement critical sections with spinlocks or disabling IRQs.
	 *
	 * On SMP, though, exclusivity cannot be assumed solely from being
	 * a cooperative thread. Another thread with any priority may be
	 * running on another CPU, so exclusion must already be enforced by
	 * other means. Therefore trying to prevent scheduling on SMP is
	 * pointless, and k_sched_lock() is equivalent to a no-op on SMP
	 * anyway. As a result, sleeping/rescheduling in the SMP case is
	 * fine.
	 */
	__ASSERT(!k_is_in_isr(), "ISR page faults are forbidden");
#ifdef CONFIG_SMP
	k_mutex_lock(&z_mm_paging_lock, K_FOREVER);
#else
	k_sched_lock();
#endif
#endif /* CONFIG_DEMAND_PAGING_ALLOW_IRQ */

	key = k_spin_lock(&z_mm_lock);
	faulting_thread = _current;

	status = arch_page_location_get(addr, &page_in_location);
	if (status == ARCH_PAGE_LOCATION_BAD) {
		/* Return false to treat as a fatal error */
		result = false;
		goto out;
	}
	result = true;

	if (status == ARCH_PAGE_LOCATION_PAGED_IN) {
		if (pin) {
			/* It's a physical memory address */
			uintptr_t phys = page_in_location;

			pf = k_mem_phys_to_page_frame(phys);
			if (!k_mem_page_frame_is_pinned(pf)) {
				if (IS_ENABLED(CONFIG_EVICTION_TRACKING)) {
					k_mem_paging_eviction_remove(pf);
				}
				k_mem_page_frame_set(pf, K_MEM_PAGE_FRAME_PINNED);
			}
		}

		/* The data page is already present in physical memory, and
		 * was pinned above if requested. There is no need to pull it
		 * in from the backing store, so skip to the end.
		 */
		goto out;
	}
	__ASSERT(status == ARCH_PAGE_LOCATION_PAGED_OUT,
		 "unexpected status value %d", status);

	paging_stats_faults_inc(faulting_thread, key.key);

	pf = free_page_frame_list_get();
	if (pf == NULL) {
		/* Need to evict a page frame */
		pf = do_eviction_select(&dirty);
		__ASSERT(pf != NULL, "failed to get a page frame");
		LOG_DBG("evicting %p at 0x%lx",
			k_mem_page_frame_to_virt(pf),
			k_mem_page_frame_to_phys(pf));

		paging_stats_eviction_inc(faulting_thread, dirty);
	}
	ret = page_frame_prepare_locked(pf, &dirty, true, &page_out_location);
	__ASSERT(ret == 0, "failed to prepare page frame");

#ifdef CONFIG_DEMAND_PAGING_ALLOW_IRQ
	k_spin_unlock(&z_mm_lock, key);
	/* Interrupts are now unlocked if they were not locked when we entered
	 * this function, and we may service ISRs. The scheduler is still
	 * locked.
	 */
#endif /* CONFIG_DEMAND_PAGING_ALLOW_IRQ */
	if (dirty) {
		do_backing_store_page_out(page_out_location);
	}
	do_backing_store_page_in(page_in_location);

#ifdef CONFIG_DEMAND_PAGING_ALLOW_IRQ
	key = k_spin_lock(&z_mm_lock);
	k_mem_page_frame_clear(pf, K_MEM_PAGE_FRAME_BUSY);
#endif /* CONFIG_DEMAND_PAGING_ALLOW_IRQ */
	k_mem_page_frame_clear(pf, K_MEM_PAGE_FRAME_MAPPED);
	frame_mapped_set(pf, addr);
	if (pin) {
		k_mem_page_frame_set(pf, K_MEM_PAGE_FRAME_PINNED);
	}

	arch_mem_page_in(addr, k_mem_page_frame_to_phys(pf));
	k_mem_paging_backing_store_page_finalize(pf, page_in_location);
	if (IS_ENABLED(CONFIG_EVICTION_TRACKING) && (!pin)) {
		k_mem_paging_eviction_add(pf);
	}
out:
	k_spin_unlock(&z_mm_lock, key);
#ifdef CONFIG_DEMAND_PAGING_ALLOW_IRQ
#ifdef CONFIG_SMP
	k_mutex_unlock(&z_mm_paging_lock);
#else
	k_sched_unlock();
#endif
#endif /* CONFIG_DEMAND_PAGING_ALLOW_IRQ */

	return result;
}

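/* Page in the single data page at addr, asserting that the address is
 * a valid paged address.
 */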
static void do_page_in(void *addr)
{
	bool ret;

	ret = do_page_fault(addr, false);
	__ASSERT(ret, "unmapped memory address %p", addr);
	(void)ret;
}

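/* Page in every data page in the region [addr, addr + size).
 *
 * Illustrative usage only (the buffer name below is hypothetical):
 *
 *	k_mem_page_in(rx_buf, sizeof(rx_buf));
 *
 * The paged-in pages are not pinned, so they remain candidates for
 * later eviction.
 */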
void k_mem_page_in(void *addr, size_t size)
{
	__ASSERT(!IS_ENABLED(CONFIG_DEMAND_PAGING_ALLOW_IRQ) || !k_is_in_isr(),
		 "%s may not be called in ISRs if CONFIG_DEMAND_PAGING_ALLOW_IRQ is enabled",
		 __func__);
	virt_region_foreach(addr, size, do_page_in);
}

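/* As do_page_in(), but additionally pin the resulting page frame. */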
static void do_mem_pin(void *addr)
{
	bool ret;

	ret = do_page_fault(addr, true);
	__ASSERT(ret, "unmapped memory address %p", addr);
	(void)ret;
}

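/* Page in and pin every data page in the region [addr, addr + size) so
 * that accesses within it cannot fault, e.g. before the region is used
 * by code that must not take page faults.
 *
 * Illustrative usage only (the buffer name below is hypothetical):
 *
 *	k_mem_pin(dma_buf, sizeof(dma_buf));
 *	... access dma_buf without risk of page faults ...
 *	k_mem_unpin(dma_buf, sizeof(dma_buf));
 */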
void k_mem_pin(void *addr, size_t size)
{
	__ASSERT(!IS_ENABLED(CONFIG_DEMAND_PAGING_ALLOW_IRQ) || !k_is_in_isr(),
		 "%s may not be called in ISRs if CONFIG_DEMAND_PAGING_ALLOW_IRQ is enabled",
		 __func__);
	virt_region_foreach(addr, size, do_mem_pin);
}

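/* Demand paging entry point for a faulting data page access, intended
 * to be invoked from the architecture page fault handling code.
 * Returns true if the fault was resolved by paging in the data page,
 * false if addr is not a paged address and the fault should be treated
 * as fatal.
 */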
bool k_mem_page_fault(void *addr)
{
	return do_page_fault(addr, false);
}

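/* Clear the pin on the page frame backing addr, if the data page is
 * currently loaded and pinned, making the frame evictable again.
 */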
static void do_mem_unpin(void *addr)
{
	struct k_mem_page_frame *pf;
	k_spinlock_key_t key;
	uintptr_t flags, phys;

	key = k_spin_lock(&z_mm_lock);
	flags = arch_page_info_get(addr, &phys, false);
	__ASSERT((flags & ARCH_DATA_PAGE_NOT_MAPPED) == 0,
		 "invalid data page at %p", addr);
	if ((flags & ARCH_DATA_PAGE_LOADED) != 0) {
		pf = k_mem_phys_to_page_frame(phys);
		if (k_mem_page_frame_is_pinned(pf)) {
			k_mem_page_frame_clear(pf, K_MEM_PAGE_FRAME_PINNED);

			if (IS_ENABLED(CONFIG_EVICTION_TRACKING)) {
				k_mem_paging_eviction_add(pf);
			}
		}
	}
	k_spin_unlock(&z_mm_lock, key);
}

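/* Undo a previous k_mem_pin() on the region [addr, addr + size): the
 * backing page frames stay resident but become eligible for eviction
 * again.
 */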
void k_mem_unpin(void *addr, size_t size)
{
	__ASSERT(page_frames_initialized, "%s called on %p too early", __func__,
		 addr);
	virt_region_foreach(addr, size, do_mem_unpin);
}

#endif /* CONFIG_DEMAND_PAGING */