1 /*
2  * Copyright 2019 Broadcom
3  * The term "Broadcom" refers to Broadcom Inc. and/or its subsidiaries.
4  *
5  * Copyright (c) 2021 BayLibre, SAS
6  *
7  * SPDX-License-Identifier: Apache-2.0
8  */
9 
10 #include <zephyr/cache.h>
11 #include <zephyr/device.h>
12 #include <zephyr/init.h>
13 #include <zephyr/kernel.h>
14 #include <zephyr/kernel/mm/demand_paging.h>
15 #include <kernel_arch_func.h>
16 #include <kernel_arch_interface.h>
17 #include <kernel_internal.h>
18 #include <zephyr/logging/log.h>
19 #include <zephyr/arch/arm64/cpu.h>
20 #include <zephyr/arch/arm64/lib_helpers.h>
21 #include <zephyr/arch/arm64/mm.h>
22 #include <zephyr/linker/linker-defs.h>
23 #include <zephyr/spinlock.h>
24 #include <zephyr/sys/util.h>
25 #include <mmu.h>
26 
27 #include "mmu.h"
28 #include "paging.h"
29 
30 LOG_MODULE_DECLARE(os, CONFIG_KERNEL_LOG_LEVEL);
31 
32 static uint64_t xlat_tables[CONFIG_MAX_XLAT_TABLES * Ln_XLAT_NUM_ENTRIES]
33 		__aligned(Ln_XLAT_NUM_ENTRIES * sizeof(uint64_t));
34 static int xlat_use_count[CONFIG_MAX_XLAT_TABLES];
35 static struct k_spinlock xlat_lock;
36 
37 /* Usage count value range */
38 #define XLAT_PTE_COUNT_MASK	GENMASK(15, 0)
39 #define XLAT_REF_COUNT_UNIT	BIT(16)
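/*
 * Each xlat_use_count[] entry packs two values: the number of descriptors
 * referencing the corresponding table (in multiples of XLAT_REF_COUNT_UNIT)
 * and the number of populated PTEs in that table (low 16 bits). For example,
 * a table linked from two upper-level tables and holding 3 used entries has
 * a use count of 2 * XLAT_REF_COUNT_UNIT + 3.
 */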
40 
41 /* Returns a reference to a free table */
static uint64_t *new_table(void)
43 {
44 	uint64_t *table;
45 	unsigned int i;
46 
47 	/* Look for a free table. */
48 	for (i = 0U; i < CONFIG_MAX_XLAT_TABLES; i++) {
49 		if (xlat_use_count[i] == 0) {
50 			table = &xlat_tables[i * Ln_XLAT_NUM_ENTRIES];
51 			xlat_use_count[i] = XLAT_REF_COUNT_UNIT;
52 			MMU_DEBUG("allocating table [%d]%p\n", i, table);
53 			return table;
54 		}
55 	}
56 
	LOG_ERR("CONFIG_MAX_XLAT_TABLES is too small");
58 	return NULL;
59 }
60 
static inline unsigned int table_index(uint64_t *pte)
62 {
63 	unsigned int i = (pte - xlat_tables) / Ln_XLAT_NUM_ENTRIES;
64 
65 	__ASSERT(i < CONFIG_MAX_XLAT_TABLES, "table %p out of range", pte);
66 	return i;
67 }
68 
/* Adjusts usage count and returns the new count. */
static int table_usage(uint64_t *table, int adjustment)
71 {
72 	unsigned int i = table_index(table);
73 	int prev_count = xlat_use_count[i];
74 	int new_count = prev_count + adjustment;
75 
76 	/* be reasonable not to always create a debug flood */
77 	if ((IS_ENABLED(DUMP_PTE) && adjustment != 0) || new_count == 0) {
78 		MMU_DEBUG("table [%d]%p: usage %#x -> %#x\n", i, table, prev_count, new_count);
79 	}
80 
81 	__ASSERT(new_count >= 0,
82 		 "table use count underflow");
83 	__ASSERT(new_count == 0 || new_count >= XLAT_REF_COUNT_UNIT,
84 		 "table in use with no reference to it");
85 	__ASSERT((new_count & XLAT_PTE_COUNT_MASK) <= Ln_XLAT_NUM_ENTRIES,
86 		 "table PTE count overflow");
87 
88 	xlat_use_count[i] = new_count;
89 	return new_count;
90 }
91 
static inline void inc_table_ref(uint64_t *table)
93 {
94 	table_usage(table, XLAT_REF_COUNT_UNIT);
95 }
96 
static inline void dec_table_ref(uint64_t *table)
98 {
99 	int ref_unit = XLAT_REF_COUNT_UNIT;
100 
101 	table_usage(table, -ref_unit);
102 }
103 
static inline bool is_table_unused(uint64_t *table)
105 {
106 	return (table_usage(table, 0) & XLAT_PTE_COUNT_MASK) == 0;
107 }
108 
static inline bool is_table_single_referenced(uint64_t *table)
110 {
111 	return table_usage(table, 0) < (2 * XLAT_REF_COUNT_UNIT);
112 }
113 
114 #ifdef CONFIG_TEST
115 /* Hooks to let test code peek at table states */
116 
int arm64_mmu_nb_free_tables(void)
118 {
119 	int count = 0;
120 
121 	for (int i = 0; i < CONFIG_MAX_XLAT_TABLES; i++) {
122 		if (xlat_use_count[i] == 0) {
123 			count++;
124 		}
125 	}
126 
127 	return count;
128 }
129 
int arm64_mmu_tables_total_usage(void)
131 {
132 	int count = 0;
133 
134 	for (int i = 0; i < CONFIG_MAX_XLAT_TABLES; i++) {
135 		count += xlat_use_count[i];
136 	}
137 
138 	return count;
139 }
140 
141 #endif /* CONFIG_TEST */
142 
static inline bool is_free_desc(uint64_t desc)
144 {
145 	return desc == 0;
146 }
147 
static inline bool is_inval_desc(uint64_t desc)
149 {
150 	/* invalid descriptors aren't necessarily free */
151 	return (desc & PTE_DESC_TYPE_MASK) == PTE_INVALID_DESC;
152 }
153 
static inline bool is_table_desc(uint64_t desc, unsigned int level)
155 {
156 	return level != XLAT_LAST_LEVEL &&
157 	       (desc & PTE_DESC_TYPE_MASK) == PTE_TABLE_DESC;
158 }
159 
static inline bool is_block_desc(uint64_t desc)
161 {
162 	return (desc & PTE_DESC_TYPE_MASK) == PTE_BLOCK_DESC;
163 }
164 
static inline uint64_t *pte_desc_table(uint64_t desc)
166 {
167 	uint64_t address = desc & PTE_PHYSADDR_MASK;
168 
169 	/* tables use a 1:1 physical:virtual mapping */
170 	return (uint64_t *)address;
171 }
172 
static inline bool is_desc_block_aligned(uint64_t desc, unsigned int level_size)
174 {
175 	bool aligned = (desc & PTE_PHYSADDR_MASK & (level_size - 1)) == 0;
176 
177 	if (!aligned) {
178 		MMU_DEBUG("misaligned desc 0x%016llx for block size 0x%x\n",
179 			  desc, level_size);
180 	}
181 
182 	return aligned;
183 }
184 
static inline bool is_desc_superset(uint64_t desc1, uint64_t desc2,
				    unsigned int level)
187 {
188 	uint64_t mask = DESC_ATTRS_MASK | GENMASK64(47, LEVEL_TO_VA_SIZE_SHIFT(level));
189 
190 	return (desc1 & mask) == (desc2 & mask);
191 }
192 
193 #if DUMP_PTE
static void debug_show_pte(uint64_t *pte, unsigned int level)
195 {
196 	MMU_DEBUG("%.*s", level * 2U, ". . . ");
197 	MMU_DEBUG("[%d]%p: ", table_index(pte), pte);
198 
199 	if (is_free_desc(*pte)) {
200 		MMU_DEBUG("---\n");
201 		return;
202 	}
203 
204 	MMU_DEBUG("0x%016llx ", *pte);
205 
206 	if (is_table_desc(*pte, level)) {
207 		uint64_t *table = pte_desc_table(*pte);
208 
209 		MMU_DEBUG("[Table] [%d]%p\n", table_index(table), table);
210 		return;
211 	}
212 
213 	if (is_block_desc(*pte)) {
214 		MMU_DEBUG("[Block] ");
215 	} else if (!is_inval_desc(*pte)) {
216 		MMU_DEBUG("[Page] ");
217 	} else {
218 		MMU_DEBUG("[paged-out] ");
219 	}
220 
221 	uint8_t mem_type = (*pte >> 2) & MT_TYPE_MASK;
222 
223 	MMU_DEBUG((mem_type == MT_NORMAL) ? "MEM" :
224 		  ((mem_type == MT_NORMAL_NC) ? "NC" : "DEV"));
225 	MMU_DEBUG((*pte & PTE_BLOCK_DESC_AP_RO) ? "-RO" : "-RW");
226 	MMU_DEBUG((*pte & PTE_BLOCK_DESC_NS) ? "-NS" : "-S");
227 	MMU_DEBUG((*pte & PTE_BLOCK_DESC_AP_ELx) ? "-ELx" : "-ELh");
228 	MMU_DEBUG((*pte & PTE_BLOCK_DESC_PXN) ? "-PXN" : "-PX");
229 	MMU_DEBUG((*pte & PTE_BLOCK_DESC_UXN) ? "-UXN" : "-UX");
230 	MMU_DEBUG((*pte & PTE_SW_WRITABLE) ? "-WRITABLE" : "");
231 	MMU_DEBUG("\n");
232 }
233 #else
static inline void debug_show_pte(uint64_t *pte, unsigned int level) { }
235 #endif
236 
static void set_pte_table_desc(uint64_t *pte, uint64_t *table, unsigned int level)
238 {
239 	/* Point pte to new table */
240 	*pte = PTE_TABLE_DESC | (uint64_t)table;
241 	debug_show_pte(pte, level);
242 }
243 
static void set_pte_block_desc(uint64_t *pte, uint64_t desc, unsigned int level)
245 {
246 	if (level != XLAT_LAST_LEVEL) {
247 		desc |= PTE_BLOCK_DESC;
248 	} else if (!IS_ENABLED(CONFIG_DEMAND_PAGING) || (desc & PTE_BLOCK_DESC_AF) != 0) {
249 		desc |= PTE_PAGE_DESC;
250 	} else {
251 		/*
252 		 * Demand paging configured and AF unset: leave the descriptor
253 		 * type to "invalid" as in arch_mem_page_out().
254 		 */
255 	}
256 	*pte = desc;
257 	debug_show_pte(pte, level);
258 }
259 
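/*
 * Replace the descriptor at *pte with a link to a freshly allocated
 * next-level table. If *pte currently holds a block mapping, the new table
 * is pre-populated so that it reproduces the same mapping with finer
 * granularity; otherwise the parent table's PTE count is bumped for the
 * entry that is no longer free.
 */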
static uint64_t *expand_to_table(uint64_t *pte, unsigned int level)
261 {
262 	uint64_t *table;
263 
264 	__ASSERT(level < XLAT_LAST_LEVEL, "can't expand last level");
265 
266 	table = new_table();
267 	if (!table) {
268 		return NULL;
269 	}
270 
271 	if (!is_free_desc(*pte)) {
272 		/*
273 		 * If entry at current level was already populated
274 		 * then we need to reflect that in the new table.
275 		 */
276 		uint64_t desc = *pte;
277 		unsigned int i, stride_shift;
278 
279 		MMU_DEBUG("expanding PTE 0x%016llx into table [%d]%p\n",
280 			  desc, table_index(table), table);
281 		__ASSERT(is_block_desc(desc), "");
282 
283 		if (level + 1 == XLAT_LAST_LEVEL) {
284 			desc |= PTE_PAGE_DESC;
285 		}
286 
287 		stride_shift = LEVEL_TO_VA_SIZE_SHIFT(level + 1);
288 		for (i = 0U; i < Ln_XLAT_NUM_ENTRIES; i++) {
289 			table[i] = desc | (i << stride_shift);
290 		}
291 		table_usage(table, Ln_XLAT_NUM_ENTRIES);
292 	} else {
293 		/*
294 		 * Adjust usage count for parent table's entry
295 		 * that will no longer be free.
296 		 */
297 		table_usage(pte, 1);
298 	}
299 
300 	/* Link the new table in place of the pte it replaces */
301 	set_pte_table_desc(pte, table, level);
302 
303 	return table;
304 }
305 
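/*
 * Populate the translation tables for [virt, virt + size) with the given
 * descriptor (attributes plus physical address). The walk restarts from the
 * base level for each chunk, descending through table descriptors and
 * expanding block mappings into subtables whenever the requested range or
 * alignment cannot be expressed at the current level. Returns -EBUSY if an
 * entry is already in use and may_overwrite is false, or -ENOMEM when no
 * free translation table is left.
 */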
static int set_mapping(uint64_t *top_table, uintptr_t virt, size_t size,
		       uint64_t desc, bool may_overwrite)
308 {
309 	uint64_t *table = top_table;
310 	uint64_t *pte;
311 	uint64_t level_size;
312 	unsigned int level = BASE_XLAT_LEVEL;
313 
314 	while (size) {
315 		__ASSERT(level <= XLAT_LAST_LEVEL,
316 			 "max translation table level exceeded\n");
317 
318 		/* Locate PTE for given virtual address and page table level */
319 		pte = &table[XLAT_TABLE_VA_IDX(virt, level)];
320 
321 		if (is_table_desc(*pte, level)) {
322 			/* Move to the next translation table level */
323 			level++;
324 			table = pte_desc_table(*pte);
325 			continue;
326 		}
327 
328 		if (!may_overwrite && !is_free_desc(*pte)) {
329 			/* the entry is already allocated */
330 			LOG_ERR("entry already in use: "
331 				"level %d pte %p *pte 0x%016llx",
332 				level, pte, *pte);
333 			return -EBUSY;
334 		}
335 
336 		level_size = 1ULL << LEVEL_TO_VA_SIZE_SHIFT(level);
337 
338 		if (is_desc_superset(*pte, desc, level)) {
339 			/* This block already covers our range */
340 			level_size -= (virt & (level_size - 1));
341 			if (level_size > size) {
342 				level_size = size;
343 			}
344 			goto move_on;
345 		}
346 
347 		if ((size < level_size) || (virt & (level_size - 1)) ||
348 		    !is_desc_block_aligned(desc, level_size)) {
349 			/* Range doesn't fit, create subtable */
350 			table = expand_to_table(pte, level);
351 			if (!table) {
352 				return -ENOMEM;
353 			}
354 			level++;
355 			continue;
356 		}
357 
358 		/* Adjust usage count for corresponding table */
359 		if (is_free_desc(*pte)) {
360 			table_usage(pte, 1);
361 		}
362 		/* Create block/page descriptor */
363 		set_pte_block_desc(pte, desc, level);
364 
365 move_on:
366 		virt += level_size;
367 		desc += level_size;
368 		size -= level_size;
369 
370 		/* Range is mapped, start again for next range */
371 		table = top_table;
372 		level = BASE_XLAT_LEVEL;
373 	}
374 
375 	return 0;
376 }
377 
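/*
 * Remove mappings for [virt, virt + size) at the given table level,
 * recursing into subtables and splitting block mappings that only partially
 * overlap the range. Subtables left without any used entry are dereferenced
 * and freed.
 */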
static void del_mapping(uint64_t *table, uintptr_t virt, size_t size,
			unsigned int level)
380 {
381 	size_t step, level_size = 1ULL << LEVEL_TO_VA_SIZE_SHIFT(level);
382 	uint64_t *pte, *subtable;
383 
384 	for ( ; size; virt += step, size -= step) {
385 		step = level_size - (virt & (level_size - 1));
386 		if (step > size) {
387 			step = size;
388 		}
389 		pte = &table[XLAT_TABLE_VA_IDX(virt, level)];
390 
391 		if (is_free_desc(*pte)) {
392 			continue;
393 		}
394 
395 		if (step != level_size && is_block_desc(*pte)) {
396 			/* need to split this block mapping */
397 			expand_to_table(pte, level);
398 		}
399 
400 		if (is_table_desc(*pte, level)) {
401 			subtable = pte_desc_table(*pte);
402 			del_mapping(subtable, virt, step, level + 1);
403 			if (!is_table_unused(subtable)) {
404 				continue;
405 			}
406 			dec_table_ref(subtable);
407 		}
408 
409 		/* free this entry */
410 		*pte = 0;
411 		table_usage(pte, -1);
412 	}
413 }
414 
415 #ifdef CONFIG_USERSPACE
416 
static uint64_t *dup_table(uint64_t *src_table, unsigned int level)
418 {
419 	uint64_t *dst_table = new_table();
420 	int i, usage_count = 0;
421 
422 	if (!dst_table) {
423 		return NULL;
424 	}
425 
426 	MMU_DEBUG("dup (level %d) [%d]%p to [%d]%p\n", level,
427 		  table_index(src_table), src_table,
428 		  table_index(dst_table), dst_table);
429 
430 	for (i = 0; i < Ln_XLAT_NUM_ENTRIES; i++) {
431 		/*
432 		 * After the table duplication, each table can be independently
433 		 *  updated. Thus, entries may become non-global.
434 		 * To keep the invariants very simple, we thus force the non-global
435 		 *  bit on duplication. Moreover, there is no process to revert this
436 		 *  (e.g. in `globalize_table`). Could be improved in future work.
437 		 */
438 		if (!is_free_desc(src_table[i]) && !is_table_desc(src_table[i], level)) {
439 			src_table[i] |= PTE_BLOCK_DESC_NG;
440 		}
441 
442 		dst_table[i] = src_table[i];
443 		if (is_table_desc(dst_table[i], level)) {
444 			inc_table_ref(pte_desc_table(dst_table[i]));
445 		}
446 		if (!is_free_desc(dst_table[i])) {
447 			usage_count++;
448 		}
449 	}
450 	table_usage(dst_table, usage_count);
451 
452 	return dst_table;
453 }
454 
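/*
 * Walk dst_table and src_table in parallel over [virt, virt + size) and
 * duplicate any subtable still shared between the two, so that later
 * modifications of the range through dst_table cannot affect src_table.
 * Called with xlat_lock held.
 */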
static int privatize_table(uint64_t *dst_table, uint64_t *src_table,
			   uintptr_t virt, size_t size, unsigned int level)
457 {
458 	size_t step, level_size = 1ULL << LEVEL_TO_VA_SIZE_SHIFT(level);
459 	unsigned int i;
460 	int ret;
461 
462 	for ( ; size; virt += step, size -= step) {
463 		step = level_size - (virt & (level_size - 1));
464 		if (step > size) {
465 			step = size;
466 		}
467 		i = XLAT_TABLE_VA_IDX(virt, level);
468 
469 		if (!is_table_desc(dst_table[i], level) ||
470 		    !is_table_desc(src_table[i], level)) {
471 			/* this entry is already private */
472 			continue;
473 		}
474 
475 		uint64_t *dst_subtable = pte_desc_table(dst_table[i]);
476 		uint64_t *src_subtable = pte_desc_table(src_table[i]);
477 
478 		if (dst_subtable == src_subtable) {
479 			/* need to make a private copy of this table */
480 			dst_subtable = dup_table(src_subtable, level + 1);
481 			if (!dst_subtable) {
482 				return -ENOMEM;
483 			}
484 			set_pte_table_desc(&dst_table[i], dst_subtable, level);
485 			dec_table_ref(src_subtable);
486 		}
487 
488 		ret = privatize_table(dst_subtable, src_subtable,
489 				      virt, step, level + 1);
490 		if (ret) {
491 			return ret;
492 		}
493 	}
494 
495 	return 0;
496 }
497 
498 /*
 * Make the given virtual address range private in dst_pt with regard to
 * src_pt. By "private" we mean that the corresponding page tables in dst_pt
 * will be duplicated so as not to share the same table(s) with src_pt.
502  * If corresponding page tables in dst_pt are already distinct from src_pt
503  * then nothing is done. This allows for subsequent mapping changes in that
504  * range to affect only dst_pt.
505  */
static int privatize_page_range(struct arm_mmu_ptables *dst_pt,
				struct arm_mmu_ptables *src_pt,
				uintptr_t virt_start, size_t size,
				const char *name)
510 {
511 	k_spinlock_key_t key;
512 	int ret;
513 
514 	MMU_DEBUG("privatize [%s]: virt %lx size %lx\n",
515 		  name, virt_start, size);
516 
517 	key = k_spin_lock(&xlat_lock);
518 
519 	ret = privatize_table(dst_pt->base_xlat_table, src_pt->base_xlat_table,
520 			      virt_start, size, BASE_XLAT_LEVEL);
521 
522 	k_spin_unlock(&xlat_lock, key);
523 	return ret;
524 }
525 
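/*
 * Free all entries of a private table branch: drop the reference this table
 * holds on each linked subtable (recursing into subtables that no other
 * table still references) and clear every used PTE of this table.
 */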
static void discard_table(uint64_t *table, unsigned int level)
527 {
528 	unsigned int i;
529 	int free_count = 0;
530 
531 	for (i = 0U; i < Ln_XLAT_NUM_ENTRIES; i++) {
532 		if (is_table_desc(table[i], level)) {
533 			uint64_t *subtable = pte_desc_table(table[i]);
534 
535 			if (is_table_single_referenced(subtable)) {
536 				discard_table(subtable, level + 1);
537 			}
538 			dec_table_ref(subtable);
539 		}
540 		if (!is_free_desc(table[i])) {
541 			table[i] = 0U;
542 			free_count++;
543 		}
544 	}
545 	table_usage(table, -free_count);
546 }
547 
static int globalize_table(uint64_t *dst_table, uint64_t *src_table,
			   uintptr_t virt, size_t size, unsigned int level)
550 {
551 	size_t step, level_size = 1ULL << LEVEL_TO_VA_SIZE_SHIFT(level);
552 	unsigned int i;
553 	int ret;
554 
555 	for ( ; size; virt += step, size -= step) {
556 		step = level_size - (virt & (level_size - 1));
557 		if (step > size) {
558 			step = size;
559 		}
560 		i = XLAT_TABLE_VA_IDX(virt, level);
561 
562 		if (dst_table[i] == src_table[i]) {
563 			/* already identical to global table */
564 			continue;
565 		}
566 
567 		if (is_free_desc(src_table[i]) &&
568 		    is_table_desc(dst_table[i], level)) {
569 			uint64_t *subtable = pte_desc_table(dst_table[i]);
570 
571 			del_mapping(subtable, virt, step, level + 1);
572 			if (is_table_unused(subtable)) {
573 				/* unreference the empty table */
574 				dst_table[i] = 0;
575 				table_usage(dst_table, -1);
576 				dec_table_ref(subtable);
577 			}
578 			continue;
579 		}
580 
581 		if (step != level_size) {
582 			/* boundary falls in the middle of this pte */
583 			__ASSERT(is_table_desc(src_table[i], level),
584 				 "can't have partial block pte here");
585 			if (!is_table_desc(dst_table[i], level)) {
586 				/* we need more fine grained boundaries */
587 				if (!expand_to_table(&dst_table[i], level)) {
588 					return -ENOMEM;
589 				}
590 			}
591 			ret = globalize_table(pte_desc_table(dst_table[i]),
592 					      pte_desc_table(src_table[i]),
593 					      virt, step, level + 1);
594 			if (ret) {
595 				return ret;
596 			}
597 			continue;
598 		}
599 
600 		/* we discard current pte and replace with global one */
601 
602 		uint64_t *old_table = is_table_desc(dst_table[i], level) ?
603 					pte_desc_table(dst_table[i]) : NULL;
604 
605 		if (is_free_desc(dst_table[i])) {
606 			table_usage(dst_table, 1);
607 		}
608 		if (is_free_desc(src_table[i])) {
609 			table_usage(dst_table, -1);
610 		}
611 		if (is_table_desc(src_table[i], level)) {
612 			inc_table_ref(pte_desc_table(src_table[i]));
613 		}
614 		dst_table[i] = src_table[i];
615 		debug_show_pte(&dst_table[i], level);
616 
617 		if (old_table) {
618 			/* we can discard the whole branch */
619 			discard_table(old_table, level + 1);
620 			dec_table_ref(old_table);
621 		}
622 	}
623 
624 	return 0;
625 }
626 
627 /*
628  * Globalize the given virtual address range in dst_pt from src_pt. We make
629  * it global by sharing as much page table content from src_pt as possible,
630  * including page tables themselves, and corresponding private tables in
631  * dst_pt are then discarded. If page tables in the given range are already
632  * shared then nothing is done. If page table sharing is not possible then
633  * page table entries in dst_pt are synchronized with those from src_pt.
634  */
static int globalize_page_range(struct arm_mmu_ptables *dst_pt,
				struct arm_mmu_ptables *src_pt,
				uintptr_t virt_start, size_t size,
				const char *name)
639 {
640 	k_spinlock_key_t key;
641 	int ret;
642 
643 	MMU_DEBUG("globalize [%s]: virt %lx size %lx\n",
644 		  name, virt_start, size);
645 
646 	key = k_spin_lock(&xlat_lock);
647 
648 	ret = globalize_table(dst_pt->base_xlat_table, src_pt->base_xlat_table,
649 			      virt_start, size, BASE_XLAT_LEVEL);
650 
651 	k_spin_unlock(&xlat_lock, key);
652 	return ret;
653 }
654 
655 #endif /* CONFIG_USERSPACE */
656 
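/*
 * Convert MT_* attribute flags into the corresponding block/page descriptor
 * bits (access permissions, execute-never, shareability, memory type, AF,
 * nG). The physical address is OR'ed in later by __add_map().
 */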
static uint64_t get_region_desc(uint32_t attrs)
658 {
659 	unsigned int mem_type;
660 	uint64_t desc = 0U;
661 
662 	/* NS bit for security memory access from secure state */
663 	desc |= (attrs & MT_NS) ? PTE_BLOCK_DESC_NS : 0;
664 
665 	/*
666 	 * AP bits for EL0 / ELh Data access permission
667 	 *
668 	 *   AP[2:1]   ELh  EL0
669 	 * +--------------------+
670 	 *     00      RW   NA
671 	 *     01      RW   RW
672 	 *     10      RO   NA
673 	 *     11      RO   RO
674 	 */
675 
676 	/* AP bits for Data access permission */
677 	desc |= (attrs & MT_RW) ? PTE_BLOCK_DESC_AP_RW : PTE_BLOCK_DESC_AP_RO;
678 	desc |= (IS_ENABLED(CONFIG_DEMAND_PAGING) && (attrs & MT_RW)) ?
679 		PTE_SW_WRITABLE : 0;
680 
681 	/* Mirror permissions to EL0 */
682 	desc |= (attrs & MT_RW_AP_ELx) ?
683 		 PTE_BLOCK_DESC_AP_ELx : PTE_BLOCK_DESC_AP_EL_HIGHER;
684 
685 	/* the access flag */
686 	desc |= PTE_BLOCK_DESC_AF;
687 	if (IS_ENABLED(CONFIG_DEMAND_PAGING) && (attrs & MT_PAGED_OUT) != 0) {
688 		/* set it up for demand paging like arch_mem_page_out() */
689 		desc &= ~PTE_BLOCK_DESC_AF;
690 		desc |= PTE_BLOCK_DESC_AP_RO;
691 	}
692 
693 	/* memory attribute index field */
694 	mem_type = MT_TYPE(attrs);
695 	desc |= PTE_BLOCK_DESC_MEMTYPE(mem_type);
696 
697 	switch (mem_type) {
698 	case MT_DEVICE_nGnRnE:
699 	case MT_DEVICE_nGnRE:
700 	case MT_DEVICE_GRE:
		/* Accesses to Device memory and non-cacheable memory are
		 * coherent for all observers in the system and are treated as
		 * Outer Shareable, so, for these two types of memory, it is
		 * not strictly needed to set the shareability field.
		 */
706 		desc |= PTE_BLOCK_DESC_OUTER_SHARE;
707 		/* Map device memory as execute-never */
708 		desc |= PTE_BLOCK_DESC_PXN;
709 		desc |= PTE_BLOCK_DESC_UXN;
710 		break;
711 	case MT_NORMAL_NC:
712 	case MT_NORMAL:
		/* Mark Normal RW memory as execute-never */
714 		if ((attrs & MT_RW) || (attrs & MT_P_EXECUTE_NEVER)) {
715 			desc |= PTE_BLOCK_DESC_PXN;
716 		}
717 
718 		if (((attrs & MT_RW) && (attrs & MT_RW_AP_ELx)) ||
719 		     (attrs & MT_U_EXECUTE_NEVER)) {
720 			desc |= PTE_BLOCK_DESC_UXN;
721 		}
722 
723 		if (mem_type == MT_NORMAL) {
724 			desc |= PTE_BLOCK_DESC_INNER_SHARE;
725 		} else {
726 			desc |= PTE_BLOCK_DESC_OUTER_SHARE;
727 		}
728 	}
729 
730 	/* non-Global bit */
731 	if (attrs & MT_NG) {
732 		desc |= PTE_BLOCK_DESC_NG;
733 	}
734 
735 	return desc;
736 }
737 
static int __add_map(struct arm_mmu_ptables *ptables, const char *name,
		     uintptr_t phys, uintptr_t virt, size_t size, uint32_t attrs)
740 {
741 	uint64_t desc = get_region_desc(attrs);
742 	bool may_overwrite = !(attrs & MT_NO_OVERWRITE);
743 
744 	MMU_DEBUG("mmap [%s]: virt %lx phys %lx size %lx attr %llx %s overwrite\n",
745 		  name, virt, phys, size, desc,
746 		  may_overwrite ? "may" : "no");
747 	__ASSERT(((virt | phys | size) & (CONFIG_MMU_PAGE_SIZE - 1)) == 0,
748 		 "address/size are not page aligned\n");
749 	desc |= phys;
750 	return set_mapping(ptables->base_xlat_table, virt, size, desc, may_overwrite);
751 }
752 
static int add_map(struct arm_mmu_ptables *ptables, const char *name,
		   uintptr_t phys, uintptr_t virt, size_t size, uint32_t attrs)
755 {
756 	k_spinlock_key_t key;
757 	int ret;
758 
759 	key = k_spin_lock(&xlat_lock);
760 	ret = __add_map(ptables, name, phys, virt, size, attrs);
761 	k_spin_unlock(&xlat_lock, key);
762 	return ret;
763 }
764 
static void remove_map(struct arm_mmu_ptables *ptables, const char *name,
		       uintptr_t virt, size_t size)
767 {
768 	k_spinlock_key_t key;
769 
770 	MMU_DEBUG("unmmap [%s]: virt %lx size %lx\n", name, virt, size);
771 	__ASSERT(((virt | size) & (CONFIG_MMU_PAGE_SIZE - 1)) == 0,
772 		 "address/size are not page aligned\n");
773 
774 	key = k_spin_lock(&xlat_lock);
775 	del_mapping(ptables->base_xlat_table, virt, size, BASE_XLAT_LEVEL);
776 	k_spin_unlock(&xlat_lock, key);
777 }
778 
static void invalidate_tlb_all(void)
780 {
781 	__asm__ volatile (
782 	"dsb ishst; tlbi vmalle1; dsb ish; isb"
783 	: : : "memory");
784 }
785 
static inline void invalidate_tlb_page(uintptr_t virt)
787 {
788 	/* to be refined */
789 	invalidate_tlb_all();
790 }
791 
792 /* zephyr execution regions with appropriate attributes */
793 
794 struct arm_mmu_flat_range {
795 	char *name;
796 	void *start;
797 	void *end;
798 	uint32_t attrs;
799 };
800 
801 static const struct arm_mmu_flat_range mmu_zephyr_ranges[] = {
802 
803 	/* Mark the zephyr execution regions (data, bss, noinit, etc.)
804 	 * cacheable, read-write
805 	 * Note: read-write region is marked execute-never internally
806 	 */
807 	{ .name  = "zephyr_data",
808 	  .start = _image_ram_start,
809 	  .end   = _image_ram_end,
810 	  .attrs = MT_NORMAL | MT_P_RW_U_NA | MT_DEFAULT_SECURE_STATE },
811 
	/* Mark text segment cacheable, read-only and executable */
813 	{ .name  = "zephyr_code",
814 	  .start = __text_region_start,
815 	  .end   = __text_region_end,
816 	  .attrs = MT_NORMAL | MT_P_RX_U_RX | MT_DEFAULT_SECURE_STATE },
817 
818 	/* Mark rodata segment cacheable, read only and execute-never */
819 	{ .name  = "zephyr_rodata",
820 	  .start = __rodata_region_start,
821 	  .end   = __rodata_region_end,
822 	  .attrs = MT_NORMAL | MT_P_RO_U_RO | MT_DEFAULT_SECURE_STATE },
823 
824 #ifdef CONFIG_NOCACHE_MEMORY
	/* Mark nocache segment non-cacheable, read-write and execute-never */
826 	{ .name  = "nocache_data",
827 	  .start = _nocache_ram_start,
828 	  .end   = _nocache_ram_end,
829 	  .attrs = MT_NORMAL_NC | MT_P_RW_U_RW | MT_DEFAULT_SECURE_STATE },
830 #endif
831 };
832 
static inline void add_arm_mmu_flat_range(struct arm_mmu_ptables *ptables,
					  const struct arm_mmu_flat_range *range,
					  uint32_t extra_flags)
836 {
837 	uintptr_t address = (uintptr_t)range->start;
838 	size_t size = (uintptr_t)range->end - address;
839 
840 	if (size) {
841 		/* MMU not yet active: must use unlocked version */
842 		__add_map(ptables, range->name, address, address,
843 			  size, range->attrs | extra_flags);
844 	}
845 }
846 
static inline void add_arm_mmu_region(struct arm_mmu_ptables *ptables,
				      const struct arm_mmu_region *region,
				      uint32_t extra_flags)
850 {
851 	if (region->size || region->attrs) {
852 		/* MMU not yet active: must use unlocked version */
853 		__add_map(ptables, region->name, region->base_pa, region->base_va,
854 			  region->size, region->attrs | extra_flags);
855 	}
856 }
857 
static inline void inv_dcache_after_map_helper(void *virt, size_t size, uint32_t attrs)
859 {
860 	/*
861 	 * DC IVAC instruction requires write access permission to the VA,
862 	 * otherwise it can generate a permission fault
863 	 */
864 	if ((attrs & MT_RW) != MT_RW) {
865 		return;
866 	}
867 
868 	if (MT_TYPE(attrs) == MT_NORMAL || MT_TYPE(attrs) == MT_NORMAL_WT) {
869 		sys_cache_data_invd_range(virt, size);
870 	}
871 }
872 
static void setup_page_tables(struct arm_mmu_ptables *ptables)
874 {
875 	unsigned int index;
876 	const struct arm_mmu_flat_range *range;
877 	const struct arm_mmu_region *region;
878 	uintptr_t max_va = 0, max_pa = 0;
879 
880 	MMU_DEBUG("xlat tables:\n");
881 	for (index = 0U; index < CONFIG_MAX_XLAT_TABLES; index++) {
882 		MMU_DEBUG("%d: %p\n", index, xlat_tables + index * Ln_XLAT_NUM_ENTRIES);
883 	}
884 
885 	for (index = 0U; index < mmu_config.num_regions; index++) {
886 		region = &mmu_config.mmu_regions[index];
887 		max_va = MAX(max_va, region->base_va + region->size);
888 		max_pa = MAX(max_pa, region->base_pa + region->size);
889 	}
890 
891 	__ASSERT(max_va <= (1ULL << CONFIG_ARM64_VA_BITS),
892 		 "Maximum VA not supported\n");
893 	__ASSERT(max_pa <= (1ULL << CONFIG_ARM64_PA_BITS),
894 		 "Maximum PA not supported\n");
895 
896 	/* setup translation table for zephyr execution regions */
897 	for (index = 0U; index < ARRAY_SIZE(mmu_zephyr_ranges); index++) {
898 		range = &mmu_zephyr_ranges[index];
899 		add_arm_mmu_flat_range(ptables, range, 0);
900 	}
901 
902 	/*
903 	 * Create translation tables for user provided platform regions.
904 	 * Those must not conflict with our default mapping.
905 	 */
906 	for (index = 0U; index < mmu_config.num_regions; index++) {
907 		region = &mmu_config.mmu_regions[index];
908 		add_arm_mmu_region(ptables, region, MT_NO_OVERWRITE);
909 	}
910 
911 	invalidate_tlb_all();
912 
913 	for (index = 0U; index < ARRAY_SIZE(mmu_zephyr_ranges); index++) {
914 		size_t size;
915 
916 		range = &mmu_zephyr_ranges[index];
917 		size = POINTER_TO_UINT(range->end) - POINTER_TO_UINT(range->start);
918 		inv_dcache_after_map_helper(range->start, size, range->attrs);
919 	}
920 
921 	for (index = 0U; index < mmu_config.num_regions; index++) {
922 		region = &mmu_config.mmu_regions[index];
923 		inv_dcache_after_map_helper(UINT_TO_POINTER(region->base_va), region->size,
924 					    region->attrs);
925 	}
926 }
927 
928 /* Translation table control register settings */
static uint64_t get_tcr(int el)
930 {
931 	uint64_t tcr;
932 	uint64_t va_bits = CONFIG_ARM64_VA_BITS;
933 	uint64_t tcr_ps_bits;
934 
935 	tcr_ps_bits = TCR_PS_BITS;
936 
937 	if (el == 1) {
938 		tcr = (tcr_ps_bits << TCR_EL1_IPS_SHIFT);
939 		/*
940 		 * TCR_EL1.EPD1: Disable translation table walk for addresses
941 		 * that are translated using TTBR1_EL1.
942 		 */
943 		tcr |= TCR_EPD1_DISABLE;
944 	} else {
945 		tcr = (tcr_ps_bits << TCR_EL3_PS_SHIFT);
946 	}
947 
948 	tcr |= TCR_T0SZ(va_bits);
949 
950 	/*
951 	 * Translation table walk is cacheable, inner/outer WBWA and
952 	 * inner shareable.  Due to Cortex-A57 erratum #822227 we must
953 	 * set TG1[1] = 4KB.
954 	 */
955 	tcr |= TCR_TG1_4K | TCR_TG0_4K | TCR_SHARED_INNER |
956 	       TCR_ORGN_WBWA | TCR_IRGN_WBWA;
957 
958 	return tcr;
959 }
960 
static void enable_mmu_el1(struct arm_mmu_ptables *ptables, unsigned int flags)
962 {
963 	ARG_UNUSED(flags);
964 	uint64_t val;
965 
	/* Set MAIR, TCR and TTBR registers */
967 	write_mair_el1(MEMORY_ATTRIBUTES);
968 	write_tcr_el1(get_tcr(1));
969 	write_ttbr0_el1((uint64_t)ptables->base_xlat_table);
970 
971 	/* Ensure these changes are seen before MMU is enabled */
972 	barrier_isync_fence_full();
973 
974 	/* Enable the MMU and data cache */
975 	val = read_sctlr_el1();
976 	write_sctlr_el1(val | SCTLR_M_BIT | SCTLR_C_BIT);
977 
978 	/* Ensure the MMU enable takes effect immediately */
979 	barrier_isync_fence_full();
980 
981 	MMU_DEBUG("MMU enabled with dcache\n");
982 }
983 
984 /* ARM MMU Driver Initial Setup */
985 
986 static struct arm_mmu_ptables kernel_ptables;
987 #ifdef CONFIG_USERSPACE
988 static sys_slist_t domain_list;
989 #endif
990 
991 /*
992  * @brief MMU default configuration
993  *
994  * This function provides the default configuration mechanism for the Memory
995  * Management Unit (MMU).
996  */
void z_arm64_mm_init(bool is_primary_core)
998 {
999 	unsigned int flags = 0U;
1000 
1001 	__ASSERT(CONFIG_MMU_PAGE_SIZE == KB(4),
1002 		 "Only 4K page size is supported\n");
1003 
1004 	__ASSERT(GET_EL(read_currentel()) == MODE_EL1,
1005 		 "Exception level not EL1, MMU not enabled!\n");
1006 
	/* Ensure that the MMU is not already enabled */
1008 	__ASSERT((read_sctlr_el1() & SCTLR_M_BIT) == 0, "MMU is already enabled\n");
1009 
1010 	/*
	 * Only the booting core sets up the page tables.
1012 	 */
1013 	if (is_primary_core) {
1014 		kernel_ptables.base_xlat_table = new_table();
1015 		setup_page_tables(&kernel_ptables);
1016 	}
1017 
1018 	/* currently only EL1 is supported */
1019 	enable_mmu_el1(&kernel_ptables, flags);
1020 }
1021 
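/*
 * Propagate a kernel page table update to every user memory domain by
 * re-globalizing the affected virtual range in each domain's private
 * tables. Does nothing when CONFIG_USERSPACE is disabled.
 */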
static void sync_domains(uintptr_t virt, size_t size, const char *name)
1023 {
1024 #ifdef CONFIG_USERSPACE
1025 	sys_snode_t *node;
1026 	struct arch_mem_domain *domain;
1027 	struct arm_mmu_ptables *domain_ptables;
1028 	k_spinlock_key_t key;
1029 	int ret;
1030 
1031 	key = k_spin_lock(&z_mem_domain_lock);
1032 	SYS_SLIST_FOR_EACH_NODE(&domain_list, node) {
1033 		domain = CONTAINER_OF(node, struct arch_mem_domain, node);
1034 		domain_ptables = &domain->ptables;
1035 		ret = globalize_page_range(domain_ptables, &kernel_ptables,
1036 					   virt, size, name);
1037 		if (ret) {
1038 			LOG_ERR("globalize_page_range() returned %d", ret);
1039 		}
1040 	}
1041 	k_spin_unlock(&z_mem_domain_lock, key);
1042 #endif
1043 }
1044 
static int __arch_mem_map(void *virt, uintptr_t phys, size_t size, uint32_t flags)
1046 {
1047 	struct arm_mmu_ptables *ptables;
1048 	uint32_t entry_flags = MT_DEFAULT_SECURE_STATE | MT_P_RX_U_NA | MT_NO_OVERWRITE;
1049 
1050 	/* Always map in the kernel page tables */
1051 	ptables = &kernel_ptables;
1052 
1053 	/* Translate flags argument into HW-recognized entry flags. */
1054 	switch (flags & K_MEM_CACHE_MASK) {
1055 	/*
1056 	 * K_MEM_CACHE_NONE, K_MEM_ARM_DEVICE_nGnRnE => MT_DEVICE_nGnRnE
1057 	 *			(Device memory nGnRnE)
1058 	 * K_MEM_ARM_DEVICE_nGnRE => MT_DEVICE_nGnRE
1059 	 *			(Device memory nGnRE)
1060 	 * K_MEM_ARM_DEVICE_GRE => MT_DEVICE_GRE
1061 	 *			(Device memory GRE)
1062 	 * K_MEM_ARM_NORMAL_NC   => MT_NORMAL_NC
1063 	 *			(Normal memory Non-cacheable)
1064 	 * K_MEM_CACHE_WB   => MT_NORMAL
1065 	 *			(Normal memory Outer WB + Inner WB)
1066 	 * K_MEM_CACHE_WT   => MT_NORMAL_WT
1067 	 *			(Normal memory Outer WT + Inner WT)
1068 	 */
1069 	case K_MEM_CACHE_NONE:
	/* K_MEM_CACHE_NONE is equal to K_MEM_ARM_DEVICE_nGnRnE */
1071 	/* case K_MEM_ARM_DEVICE_nGnRnE: */
1072 		entry_flags |= MT_DEVICE_nGnRnE;
1073 		break;
1074 	case K_MEM_ARM_DEVICE_nGnRE:
1075 		entry_flags |= MT_DEVICE_nGnRE;
1076 		break;
1077 	case K_MEM_ARM_DEVICE_GRE:
1078 		entry_flags |= MT_DEVICE_GRE;
1079 		break;
1080 	case K_MEM_ARM_NORMAL_NC:
1081 		entry_flags |= MT_NORMAL_NC;
1082 		break;
1083 	case K_MEM_CACHE_WT:
1084 		entry_flags |= MT_NORMAL_WT;
1085 		break;
1086 	case K_MEM_CACHE_WB:
1087 		entry_flags |= MT_NORMAL;
1088 		break;
1089 	default:
1090 		return -ENOTSUP;
1091 	}
1092 
1093 	if ((flags & K_MEM_PERM_RW) != 0U) {
1094 		entry_flags |= MT_RW;
1095 	}
1096 
1097 	if ((flags & K_MEM_PERM_EXEC) == 0U) {
1098 		entry_flags |= MT_P_EXECUTE_NEVER;
1099 	}
1100 
1101 	if ((flags & K_MEM_PERM_USER) != 0U) {
1102 		entry_flags |= MT_RW_AP_ELx;
1103 	}
1104 
1105 	if (IS_ENABLED(CONFIG_DEMAND_PAGING) && (flags & K_MEM_MAP_UNPAGED) != 0) {
1106 		entry_flags |= MT_PAGED_OUT;
1107 	}
1108 
1109 	return add_map(ptables, "generic", phys, (uintptr_t)virt, size, entry_flags);
1110 }
1111 
void arch_mem_map(void *virt, uintptr_t phys, size_t size, uint32_t flags)
1113 {
1114 	int ret = __arch_mem_map(virt, phys, size, flags);
1115 
1116 	if (ret) {
1117 		LOG_ERR("__arch_mem_map() returned %d", ret);
1118 		k_panic();
1119 	} else {
1120 		uint32_t mem_flags = flags & K_MEM_CACHE_MASK;
1121 
1122 		sync_domains((uintptr_t)virt, size, "mem_map");
1123 		invalidate_tlb_all();
1124 
1125 		switch (mem_flags) {
1126 		case K_MEM_CACHE_WB:
1127 		case K_MEM_CACHE_WT:
1128 			mem_flags = (mem_flags == K_MEM_CACHE_WB) ? MT_NORMAL : MT_NORMAL_WT;
1129 			mem_flags |= (flags & K_MEM_PERM_RW) ? MT_RW : 0;
1130 			inv_dcache_after_map_helper(virt, size, mem_flags);
1131 		default:
1132 			break;
1133 		}
1134 	}
1135 }
1136 
void arch_mem_unmap(void *addr, size_t size)
1138 {
1139 	remove_map(&kernel_ptables, "generic", (uintptr_t)addr, size);
1140 	sync_domains((uintptr_t)addr, size, "mem_unmap");
1141 	invalidate_tlb_all();
1142 }
1143 
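/*
 * Resolve the physical address backing a virtual address by performing an
 * AT S1E1R address translation and reading the result from PAR_EL1, with
 * interrupts locked so PAR_EL1 is not clobbered in between. Returns -EFAULT
 * if the translation failed.
 */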
int arch_page_phys_get(void *virt, uintptr_t *phys)
1145 {
1146 	uint64_t par;
1147 	int key;
1148 
1149 	key = arch_irq_lock();
1150 	__asm__ volatile ("at S1E1R, %0" : : "r" (virt));
1151 	barrier_isync_fence_full();
1152 	par = read_par_el1();
1153 	arch_irq_unlock(key);
1154 
1155 	if (par & BIT(0)) {
1156 		return -EFAULT;
1157 	}
1158 
1159 	if (phys) {
1160 		*phys = par & GENMASK64(47, 12);
1161 	}
1162 	return 0;
1163 }
1164 
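/*
 * Pick the largest translation block size (up to the base level) that both
 * the region size and the physical address alignment allow, so the virtual
 * mapping can later use block descriptors rather than individual pages.
 */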
size_t arch_virt_region_align(uintptr_t phys, size_t size)
1166 {
1167 	size_t alignment = CONFIG_MMU_PAGE_SIZE;
1168 	size_t level_size;
1169 	int level;
1170 
1171 	for (level = XLAT_LAST_LEVEL; level >= BASE_XLAT_LEVEL; level--) {
1172 		level_size = 1 << LEVEL_TO_VA_SIZE_SHIFT(level);
1173 
1174 		if (size < level_size) {
1175 			break;
1176 		}
1177 
1178 		if ((phys & (level_size - 1))) {
1179 			break;
1180 		}
1181 
1182 		alignment = level_size;
1183 	}
1184 
1185 	return alignment;
1186 }
1187 
1188 #ifdef CONFIG_USERSPACE
1189 
1190 static uint16_t next_asid = 1;
1191 
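/* The ASID is stored in the upper bits of TTBR0, above TTBR_ASID_SHIFT. */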
static uint16_t get_asid(uint64_t ttbr0)
1193 {
1194 	return ttbr0 >> TTBR_ASID_SHIFT;
1195 }
1196 
1197 static void z_arm64_swap_ptables(struct k_thread *incoming);
1198 
int arch_mem_domain_max_partitions_get(void)
1200 {
1201 	return CONFIG_MAX_DOMAIN_PARTITIONS;
1202 }
1203 
int arch_mem_domain_init(struct k_mem_domain *domain)
1205 {
1206 	struct arm_mmu_ptables *domain_ptables = &domain->arch.ptables;
1207 	k_spinlock_key_t key;
1208 	uint16_t asid;
1209 
1210 	MMU_DEBUG("%s\n", __func__);
1211 
1212 	key = k_spin_lock(&xlat_lock);
1213 
1214 	/*
	 * Pick a new ASID. We use a round-robin allocation scheme.
	 * Note: `next_asid` is a uint16_t and `VM_ASID_BITS` could
	 *  be up to 16, hence `next_asid` might overflow to 0 below.
1218 	 */
1219 	asid = next_asid++;
1220 	if ((next_asid >= (1UL << VM_ASID_BITS)) || (next_asid == 0)) {
1221 		next_asid = 1;
1222 	}
1223 
1224 	domain_ptables->base_xlat_table =
1225 		dup_table(kernel_ptables.base_xlat_table, BASE_XLAT_LEVEL);
1226 	k_spin_unlock(&xlat_lock, key);
1227 	if (!domain_ptables->base_xlat_table) {
1228 		return -ENOMEM;
1229 	}
1230 
1231 	domain_ptables->ttbr0 =	(((uint64_t)asid) << TTBR_ASID_SHIFT) |
1232 		((uint64_t)(uintptr_t)domain_ptables->base_xlat_table);
1233 
1234 	sys_slist_append(&domain_list, &domain->arch.node);
1235 	return 0;
1236 }
1237 
static int private_map(struct arm_mmu_ptables *ptables, const char *name,
		       uintptr_t phys, uintptr_t virt, size_t size, uint32_t attrs)
1240 {
1241 	int ret;
1242 
1243 	ret = privatize_page_range(ptables, &kernel_ptables, virt, size, name);
1244 	__ASSERT(ret == 0, "privatize_page_range() returned %d", ret);
1245 	ret = add_map(ptables, name, phys, virt, size, attrs | MT_NG);
1246 	__ASSERT(ret == 0, "add_map() returned %d", ret);
1247 	invalidate_tlb_all();
1248 
1249 	inv_dcache_after_map_helper(UINT_TO_POINTER(virt), size, attrs);
1250 	return ret;
1251 }
1252 
static int reset_map(struct arm_mmu_ptables *ptables, const char *name,
		     uintptr_t addr, size_t size)
1255 {
1256 	int ret;
1257 
1258 	ret = globalize_page_range(ptables, &kernel_ptables, addr, size, name);
1259 	__ASSERT(ret == 0, "globalize_page_range() returned %d", ret);
1260 	invalidate_tlb_all();
1261 
1262 	return ret;
1263 }
1264 
int arch_mem_domain_partition_add(struct k_mem_domain *domain,
				  uint32_t partition_id)
1267 {
1268 	struct arm_mmu_ptables *domain_ptables = &domain->arch.ptables;
1269 	struct k_mem_partition *ptn = &domain->partitions[partition_id];
1270 
1271 	return private_map(domain_ptables, "partition", ptn->start, ptn->start,
1272 			   ptn->size, ptn->attr.attrs | MT_NORMAL);
1273 }
1274 
int arch_mem_domain_partition_remove(struct k_mem_domain *domain,
				     uint32_t partition_id)
1277 {
1278 	struct arm_mmu_ptables *domain_ptables = &domain->arch.ptables;
1279 	struct k_mem_partition *ptn = &domain->partitions[partition_id];
1280 
1281 	return reset_map(domain_ptables, "partition removal",
1282 			 ptn->start, ptn->size);
1283 }
1284 
static int map_thread_stack(struct k_thread *thread,
			    struct arm_mmu_ptables *ptables)
1287 {
1288 	return private_map(ptables, "thread_stack", thread->stack_info.start,
1289 			    thread->stack_info.start, thread->stack_info.size,
1290 			    MT_P_RW_U_RW | MT_NORMAL);
1291 }
1292 
int arch_mem_domain_thread_add(struct k_thread *thread)
1294 {
1295 	struct arm_mmu_ptables *old_ptables, *domain_ptables;
1296 	struct k_mem_domain *domain;
1297 	bool is_user, is_migration;
1298 	int ret = 0;
1299 
1300 	domain = thread->mem_domain_info.mem_domain;
1301 	domain_ptables = &domain->arch.ptables;
1302 	old_ptables = thread->arch.ptables;
1303 
1304 	is_user = (thread->base.user_options & K_USER) != 0;
1305 	is_migration = (old_ptables != NULL) && is_user;
1306 
1307 	if (is_migration) {
1308 		ret = map_thread_stack(thread, domain_ptables);
1309 	}
1310 
1311 	thread->arch.ptables = domain_ptables;
1312 	if (thread == arch_current_thread()) {
1313 		z_arm64_swap_ptables(thread);
1314 	} else {
1315 #ifdef CONFIG_SMP
1316 		/* the thread could be running on another CPU right now */
1317 		z_arm64_mem_cfg_ipi();
1318 #endif
1319 	}
1320 
1321 	if (is_migration) {
1322 		ret = reset_map(old_ptables, __func__, thread->stack_info.start,
1323 				thread->stack_info.size);
1324 	}
1325 
1326 	return ret;
1327 }
1328 
int arch_mem_domain_thread_remove(struct k_thread *thread)
1330 {
1331 	struct arm_mmu_ptables *domain_ptables;
1332 	struct k_mem_domain *domain;
1333 
1334 	domain = thread->mem_domain_info.mem_domain;
1335 	domain_ptables = &domain->arch.ptables;
1336 
1337 	if ((thread->base.user_options & K_USER) == 0) {
1338 		return 0;
1339 	}
1340 
1341 	if ((thread->base.thread_state & _THREAD_DEAD) == 0) {
1342 		return 0;
1343 	}
1344 
1345 	return reset_map(domain_ptables, __func__, thread->stack_info.start,
1346 			 thread->stack_info.size);
1347 }
1348 
static void z_arm64_swap_ptables(struct k_thread *incoming)
1350 {
1351 	struct arm_mmu_ptables *ptables = incoming->arch.ptables;
1352 	uint64_t curr_ttbr0 = read_ttbr0_el1();
1353 	uint64_t new_ttbr0 = ptables->ttbr0;
1354 
1355 	if (curr_ttbr0 == new_ttbr0) {
1356 		return; /* Already the right tables */
1357 	}
1358 
1359 	MMU_DEBUG("TTBR0 switch from %#llx to %#llx\n", curr_ttbr0, new_ttbr0);
1360 	z_arm64_set_ttbr0(new_ttbr0);
1361 
1362 	if (get_asid(curr_ttbr0) == get_asid(new_ttbr0)) {
1363 		invalidate_tlb_all();
1364 	}
1365 }
1366 
void z_arm64_thread_mem_domains_init(struct k_thread *incoming)
1368 {
1369 	struct arm_mmu_ptables *ptables;
1370 
1371 	if ((incoming->base.user_options & K_USER) == 0) {
1372 		return;
1373 	}
1374 
1375 	ptables = incoming->arch.ptables;
1376 
1377 	/* Map the thread stack */
1378 	map_thread_stack(incoming, ptables);
1379 
1380 	z_arm64_swap_ptables(incoming);
1381 }
1382 
void z_arm64_swap_mem_domains(struct k_thread *incoming)
1384 {
1385 	z_arm64_swap_ptables(incoming);
1386 }
1387 
1388 #endif /* CONFIG_USERSPACE */
1389 
1390 #ifdef CONFIG_DEMAND_PAGING
1391 
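/*
 * Return a pointer to the last-level PTE mapping the given virtual address
 * in the supplied page tables, or NULL if the walk reaches a block or free
 * descriptor before the last level.
 */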
static uint64_t *get_pte_location(struct arm_mmu_ptables *ptables,
				  uintptr_t virt)
1394 {
1395 	uint64_t *pte;
1396 	uint64_t *table = ptables->base_xlat_table;
1397 	unsigned int level = BASE_XLAT_LEVEL;
1398 
1399 	for (;;) {
1400 		pte = &table[XLAT_TABLE_VA_IDX(virt, level)];
1401 		if (level == XLAT_LAST_LEVEL) {
1402 			return pte;
1403 		}
1404 
1405 		if (is_table_desc(*pte, level)) {
1406 			level++;
1407 			table = pte_desc_table(*pte);
1408 			continue;
1409 		}
1410 
1411 		/* anything else is unexpected */
1412 		return NULL;
1413 	}
1414 }
1415 
void arch_mem_page_out(void *addr, uintptr_t location)
1417 {
1418 	uintptr_t virt = (uintptr_t)addr;
1419 	uint64_t *pte = get_pte_location(&kernel_ptables, virt);
1420 	uint64_t desc;
1421 
1422 	__ASSERT(pte != NULL, "");
1423 	desc = *pte;
1424 
1425 	/* mark the entry invalid to the hardware */
1426 	desc &= ~PTE_DESC_TYPE_MASK;
1427 	desc |= PTE_INVALID_DESC;
1428 
1429 	/* store the location token in place of the physical address */
1430 	__ASSERT((location & ~PTE_PHYSADDR_MASK) == 0, "");
1431 	desc &= ~PTE_PHYSADDR_MASK;
1432 	desc |= location;
1433 
1434 	/*
1435 	 * The location token may be 0. Make sure the whole descriptor
1436 	 * doesn't end up being zero as this would be seen as a free entry.
1437 	 */
1438 	desc |= PTE_BLOCK_DESC_AP_RO;
1439 
1440 	*pte = desc;
1441 	MMU_DEBUG("page_out: virt=%#lx location=%#lx\n", virt, location);
1442 	debug_show_pte(pte, XLAT_LAST_LEVEL);
1443 
1444 	sync_domains(virt, CONFIG_MMU_PAGE_SIZE, "page_out");
1445 	invalidate_tlb_page(virt);
1446 }
1447 
void arch_mem_page_in(void *addr, uintptr_t phys)
1449 {
1450 	uintptr_t virt = (uintptr_t)addr;
1451 	uint64_t *pte = get_pte_location(&kernel_ptables, virt);
1452 	uint64_t desc;
1453 
1454 	__ASSERT((phys & ~PTE_PHYSADDR_MASK) == 0, "");
1455 
1456 	__ASSERT(pte != NULL, "");
1457 	desc = *pte;
1458 	__ASSERT(!is_free_desc(desc), "");
1459 
1460 	/* mark the entry valid again to the hardware */
1461 	desc &= ~PTE_DESC_TYPE_MASK;
1462 	desc |= PTE_PAGE_DESC;
1463 
1464 	/* store the physical address */
1465 	desc &= ~PTE_PHYSADDR_MASK;
1466 	desc |= phys;
1467 
1468 	/* mark as clean */
1469 	desc |= PTE_BLOCK_DESC_AP_RO;
1470 
	/* and make it initially inaccessible to track unaccessed pages */
1472 	desc &= ~PTE_BLOCK_DESC_AF;
1473 
1474 	*pte = desc;
1475 	MMU_DEBUG("page_in: virt=%#lx phys=%#lx\n", virt, phys);
1476 	debug_show_pte(pte, XLAT_LAST_LEVEL);
1477 
1478 	sync_domains(virt, CONFIG_MMU_PAGE_SIZE, "page_in");
1479 	invalidate_tlb_page(virt);
1480 }
1481 
enum arch_page_location arch_page_location_get(void *addr, uintptr_t *location)
1483 {
1484 	uintptr_t virt = (uintptr_t)addr;
1485 	uint64_t *pte = get_pte_location(&kernel_ptables, virt);
1486 	uint64_t desc;
1487 	enum arch_page_location status;
1488 
1489 	if (!pte) {
1490 		return ARCH_PAGE_LOCATION_BAD;
1491 	}
1492 	desc = *pte;
1493 	if (is_free_desc(desc)) {
1494 		return ARCH_PAGE_LOCATION_BAD;
1495 	}
1496 
1497 	switch (desc & PTE_DESC_TYPE_MASK) {
1498 	case PTE_PAGE_DESC:
1499 		status = ARCH_PAGE_LOCATION_PAGED_IN;
1500 		break;
1501 	case PTE_INVALID_DESC:
1502 		status = ARCH_PAGE_LOCATION_PAGED_OUT;
1503 		break;
1504 	default:
1505 		return ARCH_PAGE_LOCATION_BAD;
1506 	}
1507 
1508 	*location = desc & PTE_PHYSADDR_MASK;
1509 	return status;
1510 }
1511 
uintptr_t arch_page_info_get(void *addr, uintptr_t *phys, bool clear_accessed)
1513 {
1514 	uintptr_t virt = (uintptr_t)addr;
1515 	uint64_t *pte = get_pte_location(&kernel_ptables, virt);
1516 	uint64_t desc;
1517 	uintptr_t status = 0;
1518 
1519 	if (!pte) {
1520 		return ARCH_DATA_PAGE_NOT_MAPPED;
1521 	}
1522 	desc = *pte;
1523 	if (is_free_desc(desc)) {
1524 		return ARCH_DATA_PAGE_NOT_MAPPED;
1525 	}
1526 
1527 	switch (desc & PTE_DESC_TYPE_MASK) {
1528 	case PTE_PAGE_DESC:
1529 		status |= ARCH_DATA_PAGE_LOADED;
1530 		break;
1531 	case PTE_INVALID_DESC:
1532 		/* page not loaded */
1533 		break;
1534 	default:
1535 		return ARCH_DATA_PAGE_NOT_MAPPED;
1536 	}
1537 
1538 	if (phys) {
1539 		*phys = desc & PTE_PHYSADDR_MASK;
1540 	}
1541 
1542 	if ((status & ARCH_DATA_PAGE_LOADED) == 0) {
1543 		return status;
1544 	}
1545 
1546 	if ((desc & PTE_BLOCK_DESC_AF) != 0) {
1547 		status |= ARCH_DATA_PAGE_ACCESSED;
1548 	}
1549 
1550 	if ((desc & PTE_BLOCK_DESC_AP_RO) == 0) {
1551 		status |= ARCH_DATA_PAGE_DIRTY;
1552 	}
1553 
1554 	if (clear_accessed) {
1555 		desc &= ~PTE_BLOCK_DESC_AF;
1556 		*pte = desc;
1557 		MMU_DEBUG("page_info: virt=%#lx (clearing AF)\n", virt);
1558 		debug_show_pte(pte, XLAT_LAST_LEVEL);
1559 		sync_domains(virt, CONFIG_MMU_PAGE_SIZE, "unaccessible");
1560 		invalidate_tlb_page(virt);
1561 	}
1562 
1563 	return status;
1564 }
1565 
1566 #define MT_SCRATCH (MT_NORMAL | MT_P_RW_U_NA | MT_DEFAULT_SECURE_STATE)
1567 
void arch_mem_scratch(uintptr_t phys)
1569 {
1570 	uintptr_t virt = (uintptr_t)K_MEM_SCRATCH_PAGE;
1571 	size_t size = CONFIG_MMU_PAGE_SIZE;
1572 	int ret = add_map(&kernel_ptables, "scratch", phys, virt, size, MT_SCRATCH);
1573 
1574 	if (ret) {
1575 		LOG_ERR("add_map() returned %d", ret);
1576 	} else {
1577 		sync_domains(virt, size, "scratch");
1578 		invalidate_tlb_page(virt);
1579 	}
1580 }
1581 
static bool do_mem_page_fault(struct arch_esf *esf, uintptr_t virt)
1583 {
1584 	/*
1585 	 * The k_mem_page_fault() code expects to be called with IRQs enabled
1586 	 * if the fault happened in a context where IRQs were enabled.
1587 	 */
1588 	if (arch_irq_unlocked(esf->spsr)) {
1589 		enable_irq();
1590 	}
1591 
1592 	bool ok = k_mem_page_fault((void *)virt);
1593 
1594 	disable_irq();
1595 	return ok;
1596 }
1597 
1598 /* Called from the fault handler. Returns true if the fault is resolved. */
bool z_arm64_do_demand_paging(struct arch_esf *esf, uint64_t esr, uint64_t far)
1600 {
1601 	uintptr_t virt = far;
1602 	uint64_t *pte, desc;
1603 	uintptr_t phys;
1604 
1605 	/* filter relevant exceptions */
1606 	switch (GET_ESR_EC(esr)) {
1607 	case 0x21: /* insn abort from current EL */
1608 	case 0x25: /* data abort from current EL */
1609 		break;
1610 	default:
1611 		return false;
1612 	}
1613 
1614 	/* make sure the fault happened in the expected range */
1615 	if (!IN_RANGE(virt,
1616 		      (uintptr_t)K_MEM_VIRT_RAM_START,
1617 		      ((uintptr_t)K_MEM_VIRT_RAM_END - 1))) {
1618 		return false;
1619 	}
1620 
1621 	virt = ROUND_DOWN(virt, CONFIG_MMU_PAGE_SIZE);
1622 
1623 	pte = get_pte_location(&kernel_ptables, virt);
1624 	if (!pte) {
1625 		/* page mapping doesn't exist, let the core code do its thing */
1626 		return do_mem_page_fault(esf, virt);
1627 	}
1628 	desc = *pte;
1629 	if ((desc & PTE_DESC_TYPE_MASK) != PTE_PAGE_DESC) {
1630 		/* page is not loaded/mapped */
1631 		return do_mem_page_fault(esf, virt);
1632 	}
1633 
1634 	/*
1635 	 * From this point, we expect only 2 cases:
1636 	 *
1637 	 * 1) the Access Flag was not set so we set it marking the page
1638 	 *    as accessed;
1639 	 *
1640 	 * 2) the page was read-only and a write occurred so we clear the
1641 	 *    RO flag marking the page dirty.
1642 	 *
1643 	 * We bail out on anything else.
1644 	 *
1645 	 * Fault status codes for Data aborts (DFSC):
1646 	 *  0b0010LL	Access flag fault
1647 	 *  0b0011LL	Permission fault
1648 	 */
1649 	uint32_t dfsc = GET_ESR_ISS(esr) & GENMASK(5, 0);
1650 	bool write = (GET_ESR_ISS(esr) & BIT(6)) != 0; /* WnR */
1651 
1652 	if (dfsc == (0b001000 | XLAT_LAST_LEVEL) &&
1653 	    (desc & PTE_BLOCK_DESC_AF) == 0) {
1654 		/* page is being accessed: set the access flag */
1655 		desc |= PTE_BLOCK_DESC_AF;
1656 		if (write) {
1657 			if ((desc & PTE_SW_WRITABLE) == 0) {
1658 				/* we don't actually have write permission */
1659 				return false;
1660 			}
1661 			/*
1662 			 * Let's avoid another fault immediately after
1663 			 * returning by making the page read-write right away
1664 			 * effectively marking it "dirty" as well.
1665 			 */
1666 			desc &= ~PTE_BLOCK_DESC_AP_RO;
1667 		}
1668 		*pte = desc;
1669 		sync_domains(virt, CONFIG_MMU_PAGE_SIZE, "accessed");
1670 		/* no TLB inval needed after setting AF */
1671 
1672 		/* tell the eviction algorithm about it */
1673 		phys = desc & PTE_PHYSADDR_MASK;
1674 		k_mem_paging_eviction_accessed(phys);
1675 		return true;
1676 	}
1677 
1678 	if (dfsc == (0b001100 | XLAT_LAST_LEVEL) && write &&
1679 	    (desc & PTE_BLOCK_DESC_AP_RO) != 0 &&
1680 	    (desc & PTE_SW_WRITABLE) != 0) {
1681 		/* make it "dirty" i.e. read-write */
1682 		desc &= ~PTE_BLOCK_DESC_AP_RO;
1683 		*pte = desc;
1684 		sync_domains(virt, CONFIG_MMU_PAGE_SIZE, "dirtied");
1685 		invalidate_tlb_page(virt);
1686 
1687 		/* this also counts as an access refresh */
1688 		phys = desc & PTE_PHYSADDR_MASK;
1689 		k_mem_paging_eviction_accessed(phys);
1690 		return true;
1691 	}
1692 
1693 	return false;
1694 }
1695 
1696 #endif /* CONFIG_DEMAND_PAGING */
1697