/*
 * Copyright 2019 Broadcom
 * The term "Broadcom" refers to Broadcom Inc. and/or its subsidiaries.
 *
 * Copyright (c) 2021 BayLibre, SAS
 *
 * SPDX-License-Identifier: Apache-2.0
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "mmu.h"
#include "paging.h"

LOG_MODULE_DECLARE(os, CONFIG_KERNEL_LOG_LEVEL);

static uint64_t xlat_tables[CONFIG_MAX_XLAT_TABLES * Ln_XLAT_NUM_ENTRIES]
		__aligned(Ln_XLAT_NUM_ENTRIES * sizeof(uint64_t));
static int xlat_use_count[CONFIG_MAX_XLAT_TABLES];
static struct k_spinlock xlat_lock;

/* Usage count value range */
#define XLAT_PTE_COUNT_MASK	GENMASK(15, 0)
#define XLAT_REF_COUNT_UNIT	BIT(16)

/* Returns a reference to a free table */
static uint64_t *new_table(void)
{
	uint64_t *table;
	unsigned int i;

	/* Look for a free table. */
	for (i = 0U; i < CONFIG_MAX_XLAT_TABLES; i++) {
		if (xlat_use_count[i] == 0) {
			table = &xlat_tables[i * Ln_XLAT_NUM_ENTRIES];
			xlat_use_count[i] = XLAT_REF_COUNT_UNIT;
			MMU_DEBUG("allocating table [%d]%p\n", i, table);
			return table;
		}
	}

	LOG_ERR("CONFIG_MAX_XLAT_TABLES, too small");
	return NULL;
}

static inline unsigned int table_index(uint64_t *pte)
{
	unsigned int i = (pte - xlat_tables) / Ln_XLAT_NUM_ENTRIES;

	__ASSERT(i < CONFIG_MAX_XLAT_TABLES, "table %p out of range", pte);
	return i;
}

/* Adjusts usage count and returns current count. */
static int table_usage(uint64_t *table, int adjustment)
{
	unsigned int i = table_index(table);
	int prev_count = xlat_use_count[i];
	int new_count = prev_count + adjustment;

	/* be reasonable not to always create a debug flood */
	if ((IS_ENABLED(DUMP_PTE) && adjustment != 0) || new_count == 0) {
		MMU_DEBUG("table [%d]%p: usage %#x -> %#x\n", i, table, prev_count, new_count);
	}

	__ASSERT(new_count >= 0,
		 "table use count underflow");
	__ASSERT(new_count == 0 || new_count >= XLAT_REF_COUNT_UNIT,
		 "table in use with no reference to it");
	__ASSERT((new_count & XLAT_PTE_COUNT_MASK) <= Ln_XLAT_NUM_ENTRIES,
		 "table PTE count overflow");

	xlat_use_count[i] = new_count;
	return new_count;
}

static inline void inc_table_ref(uint64_t *table)
{
	table_usage(table, XLAT_REF_COUNT_UNIT);
}

static inline void dec_table_ref(uint64_t *table)
{
	int ref_unit = XLAT_REF_COUNT_UNIT;

	table_usage(table, -ref_unit);
}

static inline bool is_table_unused(uint64_t *table)
{
	return (table_usage(table, 0) & XLAT_PTE_COUNT_MASK) == 0;
}

static inline bool is_table_single_referenced(uint64_t *table)
{
	return table_usage(table, 0) < (2 * XLAT_REF_COUNT_UNIT);
}
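
/*
 * Illustrative helpers, not part of the upstream driver: the per-table use
 * count above packs two quantities into one int. Bits 15:0
 * (XLAT_PTE_COUNT_MASK) count the populated PTEs in the table, and every
 * referencing parent entry adds XLAT_REF_COUNT_UNIT on top. These sketches
 * only restate that split; the names and the guard are hypothetical and the
 * guard is normally left undefined.
 */
#ifdef CONFIG_ARM64_MMU_EXAMPLE_SNIPPETS
static inline int example_pte_count(uint64_t *table)
{
	/* low 16 bits: number of non-free entries in this table */
	return table_usage(table, 0) & XLAT_PTE_COUNT_MASK;
}

static inline int example_ref_count(uint64_t *table)
{
	/* upper bits: number of parent entries pointing at this table */
	return table_usage(table, 0) / XLAT_REF_COUNT_UNIT;
}
#endif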
#ifdef CONFIG_TEST
/* Hooks to let test code peek at table states */

int arm64_mmu_nb_free_tables(void)
{
	int count = 0;

	for (int i = 0; i < CONFIG_MAX_XLAT_TABLES; i++) {
		if (xlat_use_count[i] == 0) {
			count++;
		}
	}

	return count;
}

int arm64_mmu_tables_total_usage(void)
{
	int count = 0;

	for (int i = 0; i < CONFIG_MAX_XLAT_TABLES; i++) {
		count += xlat_use_count[i];
	}

	return count;
}

#endif /* CONFIG_TEST */

static inline bool is_free_desc(uint64_t desc)
{
	return desc == 0;
}

static inline bool is_inval_desc(uint64_t desc)
{
	/* invalid descriptors aren't necessarily free */
	return (desc & PTE_DESC_TYPE_MASK) == PTE_INVALID_DESC;
}

static inline bool is_table_desc(uint64_t desc, unsigned int level)
{
	return level != XLAT_LAST_LEVEL &&
	       (desc & PTE_DESC_TYPE_MASK) == PTE_TABLE_DESC;
}

static inline bool is_block_desc(uint64_t desc)
{
	return (desc & PTE_DESC_TYPE_MASK) == PTE_BLOCK_DESC;
}

static inline uint64_t *pte_desc_table(uint64_t desc)
{
	uint64_t address = desc & PTE_PHYSADDR_MASK;

	/* tables use a 1:1 physical:virtual mapping */
	return (uint64_t *)address;
}

static inline bool is_desc_block_aligned(uint64_t desc, unsigned int level_size)
{
	bool aligned = (desc & PTE_PHYSADDR_MASK & (level_size - 1)) == 0;

	if (!aligned) {
		MMU_DEBUG("misaligned desc 0x%016llx for block size 0x%x\n",
			  desc, level_size);
	}

	return aligned;
}

static inline bool is_desc_superset(uint64_t desc1, uint64_t desc2,
				    unsigned int level)
{
	uint64_t mask = DESC_ATTRS_MASK | GENMASK64(47, LEVEL_TO_VA_SIZE_SHIFT(level));

	return (desc1 & mask) == (desc2 & mask);
}

#if DUMP_PTE
static void debug_show_pte(uint64_t *pte, unsigned int level)
{
	MMU_DEBUG("%.*s", level * 2U, ". . . ");
	MMU_DEBUG("[%d]%p: ", table_index(pte), pte);

	if (is_free_desc(*pte)) {
		MMU_DEBUG("---\n");
		return;
	}

	MMU_DEBUG("0x%016llx ", *pte);

	if (is_table_desc(*pte, level)) {
		uint64_t *table = pte_desc_table(*pte);

		MMU_DEBUG("[Table] [%d]%p\n", table_index(table), table);
		return;
	}

	if (is_block_desc(*pte)) {
		MMU_DEBUG("[Block] ");
	} else if (!is_inval_desc(*pte)) {
		MMU_DEBUG("[Page] ");
	} else {
		MMU_DEBUG("[paged-out] ");
	}

	uint8_t mem_type = (*pte >> 2) & MT_TYPE_MASK;

	MMU_DEBUG((mem_type == MT_NORMAL) ? "MEM" :
		  ((mem_type == MT_NORMAL_NC) ? "NC" : "DEV"));
	MMU_DEBUG((*pte & PTE_BLOCK_DESC_AP_RO) ? "-RO" : "-RW");
	MMU_DEBUG((*pte & PTE_BLOCK_DESC_NS) ? "-NS" : "-S");
	MMU_DEBUG((*pte & PTE_BLOCK_DESC_AP_ELx) ? "-ELx" : "-ELh");
	MMU_DEBUG((*pte & PTE_BLOCK_DESC_PXN) ? "-PXN" : "-PX");
	MMU_DEBUG((*pte & PTE_BLOCK_DESC_UXN) ? "-UXN" : "-UX");
	MMU_DEBUG((*pte & PTE_SW_WRITABLE) ? "-WRITABLE" : "");
	MMU_DEBUG("\n");
}
#else
static inline void debug_show_pte(uint64_t *pte, unsigned int level)
{
}
#endif

static void set_pte_table_desc(uint64_t *pte, uint64_t *table, unsigned int level)
{
	/* Point pte to new table */
	*pte = PTE_TABLE_DESC | (uint64_t)table;
	debug_show_pte(pte, level);
}

static void set_pte_block_desc(uint64_t *pte, uint64_t desc, unsigned int level)
{
	if (level != XLAT_LAST_LEVEL) {
		desc |= PTE_BLOCK_DESC;
	} else if (!IS_ENABLED(CONFIG_DEMAND_PAGING) || (desc & PTE_BLOCK_DESC_AF) != 0) {
		desc |= PTE_PAGE_DESC;
	} else {
		/*
		 * Demand paging configured and AF unset: leave the descriptor
		 * type to "invalid" as in arch_mem_page_out().
		 */
	}
	*pte = desc;
	debug_show_pte(pte, level);
}
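
/*
 * Illustration, not from the original source: with the 4 KB translation
 * granule used by this driver, each level resolves 9 VA bits on top of the
 * 12-bit page offset, so the shift for level L is 12 + 9 * (3 - L). The
 * hypothetical helper below mirrors what XLAT_TABLE_VA_IDX() computes under
 * that assumption; the guard is normally left undefined.
 */
#ifdef CONFIG_ARM64_MMU_EXAMPLE_SNIPPETS
static inline unsigned int example_va_index(uintptr_t virt, unsigned int level)
{
	unsigned int shift = 12U + 9U * (3U - level);

	/* each table holds 512 entries, hence the 9-bit mask */
	return (virt >> shift) & 0x1FFU;
}
#endif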
static uint64_t *expand_to_table(uint64_t *pte, unsigned int level)
{
	uint64_t *table;

	__ASSERT(level < XLAT_LAST_LEVEL, "can't expand last level");

	table = new_table();
	if (!table) {
		return NULL;
	}

	if (!is_free_desc(*pte)) {
		/*
		 * If entry at current level was already populated
		 * then we need to reflect that in the new table.
		 */
		uint64_t desc = *pte;
		unsigned int i, stride_shift;

		MMU_DEBUG("expanding PTE 0x%016llx into table [%d]%p\n",
			  desc, table_index(table), table);
		__ASSERT(is_block_desc(desc), "");

		if (level + 1 == XLAT_LAST_LEVEL) {
			desc |= PTE_PAGE_DESC;
		}

		stride_shift = LEVEL_TO_VA_SIZE_SHIFT(level + 1);
		for (i = 0U; i < Ln_XLAT_NUM_ENTRIES; i++) {
			table[i] = desc | (i << stride_shift);
		}
		table_usage(table, Ln_XLAT_NUM_ENTRIES);
	} else {
		/*
		 * Adjust usage count for parent table's entry
		 * that will no longer be free.
		 */
		table_usage(pte, 1);
	}

	/* Link the new table in place of the pte it replaces */
	set_pte_table_desc(pte, table, level);

	return table;
}

static int set_mapping(uint64_t *top_table, uintptr_t virt, size_t size,
		       uint64_t desc, bool may_overwrite)
{
	uint64_t *table = top_table;
	uint64_t *pte;
	uint64_t level_size;
	unsigned int level = BASE_XLAT_LEVEL;

	while (size) {
		__ASSERT(level <= XLAT_LAST_LEVEL,
			 "max translation table level exceeded\n");

		/* Locate PTE for given virtual address and page table level */
		pte = &table[XLAT_TABLE_VA_IDX(virt, level)];

		if (is_table_desc(*pte, level)) {
			/* Move to the next translation table level */
			level++;
			table = pte_desc_table(*pte);
			continue;
		}

		if (!may_overwrite && !is_free_desc(*pte)) {
			/* the entry is already allocated */
			LOG_ERR("entry already in use: "
				"level %d pte %p *pte 0x%016llx",
				level, pte, *pte);
			return -EBUSY;
		}

		level_size = 1ULL << LEVEL_TO_VA_SIZE_SHIFT(level);

		if (is_desc_superset(*pte, desc, level)) {
			/* This block already covers our range */
			level_size -= (virt & (level_size - 1));
			if (level_size > size) {
				level_size = size;
			}
			goto move_on;
		}

		if ((size < level_size) || (virt & (level_size - 1)) ||
		    !is_desc_block_aligned(desc, level_size)) {
			/* Range doesn't fit, create subtable */
			table = expand_to_table(pte, level);
			if (!table) {
				return -ENOMEM;
			}
			level++;
			continue;
		}

		/* Adjust usage count for corresponding table */
		if (is_free_desc(*pte)) {
			table_usage(pte, 1);
		}
		/* Create block/page descriptor */
		set_pte_block_desc(pte, desc, level);

move_on:
		virt += level_size;
		desc += level_size;
		size -= level_size;

		/* Range is mapped, start again for next range */
		table = top_table;
		level = BASE_XLAT_LEVEL;
	}

	return 0;
}

static void del_mapping(uint64_t *table, uintptr_t virt, size_t size,
			unsigned int level)
{
	size_t step, level_size = 1ULL << LEVEL_TO_VA_SIZE_SHIFT(level);
	uint64_t *pte, *subtable;

	for ( ; size; virt += step, size -= step) {
		step = level_size - (virt & (level_size - 1));
		if (step > size) {
			step = size;
		}
		pte = &table[XLAT_TABLE_VA_IDX(virt, level)];

		if (is_free_desc(*pte)) {
			continue;
		}

		if (step != level_size && is_block_desc(*pte)) {
			/* need to split this block mapping */
			expand_to_table(pte, level);
		}

		if (is_table_desc(*pte, level)) {
			subtable = pte_desc_table(*pte);
			del_mapping(subtable, virt, step, level + 1);
			if (!is_table_unused(subtable)) {
				continue;
			}
			dec_table_ref(subtable);
		}

		/* free this entry */
		*pte = 0;
		table_usage(pte, -1);
	}
}
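
/*
 * Illustration, not upstream code: set_mapping() above only installs a
 * block/page descriptor at a given level when the remaining range fully
 * covers that level and both the virtual address and the output address in
 * the descriptor are aligned to the level size; otherwise it expands to a
 * subtable. This hypothetical helper restates that test; the guard is
 * normally left undefined.
 */
#ifdef CONFIG_ARM64_MMU_EXAMPLE_SNIPPETS
static bool example_fits_block(uintptr_t virt, size_t size,
			       uint64_t desc, unsigned int level)
{
	uint64_t level_size = 1ULL << LEVEL_TO_VA_SIZE_SHIFT(level);

	return (size >= level_size) &&
	       ((virt & (level_size - 1)) == 0) &&
	       is_desc_block_aligned(desc, level_size);
}
#endif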
#ifdef CONFIG_USERSPACE

static uint64_t *dup_table(uint64_t *src_table, unsigned int level)
{
	uint64_t *dst_table = new_table();
	int i, usage_count = 0;

	if (!dst_table) {
		return NULL;
	}

	MMU_DEBUG("dup (level %d) [%d]%p to [%d]%p\n", level,
		  table_index(src_table), src_table,
		  table_index(dst_table), dst_table);

	for (i = 0; i < Ln_XLAT_NUM_ENTRIES; i++) {
		/*
		 * After the table duplication, each table can be independently
		 * updated. Thus, entries may become non-global.
		 * To keep the invariants very simple, we thus force the non-global
		 * bit on duplication. Moreover, there is no process to revert this
		 * (e.g. in `globalize_table`). Could be improved in future work.
		 */
		if (!is_free_desc(src_table[i]) && !is_table_desc(src_table[i], level)) {
			src_table[i] |= PTE_BLOCK_DESC_NG;
		}

		dst_table[i] = src_table[i];
		if (is_table_desc(dst_table[i], level)) {
			inc_table_ref(pte_desc_table(dst_table[i]));
		}
		if (!is_free_desc(dst_table[i])) {
			usage_count++;
		}
	}
	table_usage(dst_table, usage_count);

	return dst_table;
}

static int privatize_table(uint64_t *dst_table, uint64_t *src_table,
			   uintptr_t virt, size_t size, unsigned int level)
{
	size_t step, level_size = 1ULL << LEVEL_TO_VA_SIZE_SHIFT(level);
	unsigned int i;
	int ret;

	for ( ; size; virt += step, size -= step) {
		step = level_size - (virt & (level_size - 1));
		if (step > size) {
			step = size;
		}
		i = XLAT_TABLE_VA_IDX(virt, level);

		if (!is_table_desc(dst_table[i], level) ||
		    !is_table_desc(src_table[i], level)) {
			/* this entry is already private */
			continue;
		}

		uint64_t *dst_subtable = pte_desc_table(dst_table[i]);
		uint64_t *src_subtable = pte_desc_table(src_table[i]);

		if (dst_subtable == src_subtable) {
			/* need to make a private copy of this table */
			dst_subtable = dup_table(src_subtable, level + 1);
			if (!dst_subtable) {
				return -ENOMEM;
			}
			set_pte_table_desc(&dst_table[i], dst_subtable, level);
			dec_table_ref(src_subtable);
		}

		ret = privatize_table(dst_subtable, src_subtable,
				      virt, step, level + 1);
		if (ret) {
			return ret;
		}
	}

	return 0;
}

/*
 * Make the given virtual address range private in dst_pt with regards to
 * src_pt. By "private" this means that corresponding page tables in dst_pt
 * will be duplicated so not to share the same table(s) with src_pt.
 * If corresponding page tables in dst_pt are already distinct from src_pt
 * then nothing is done. This allows for subsequent mapping changes in that
 * range to affect only dst_pt.
 */
static int privatize_page_range(struct arm_mmu_ptables *dst_pt,
				struct arm_mmu_ptables *src_pt,
				uintptr_t virt_start, size_t size,
				const char *name)
{
	k_spinlock_key_t key;
	int ret;

	MMU_DEBUG("privatize [%s]: virt %lx size %lx\n",
		  name, virt_start, size);

	key = k_spin_lock(&xlat_lock);

	ret = privatize_table(dst_pt->base_xlat_table, src_pt->base_xlat_table,
			      virt_start, size, BASE_XLAT_LEVEL);

	k_spin_unlock(&xlat_lock, key);

	return ret;
}

static void discard_table(uint64_t *table, unsigned int level)
{
	unsigned int i;
	int free_count = 0;

	for (i = 0U; i < Ln_XLAT_NUM_ENTRIES; i++) {
		if (is_table_desc(table[i], level)) {
			uint64_t *subtable = pte_desc_table(table[i]);

			if (is_table_single_referenced(subtable)) {
				discard_table(subtable, level + 1);
			}
			dec_table_ref(subtable);
		}
		if (!is_free_desc(table[i])) {
			table[i] = 0U;
			free_count++;
		}
	}
	table_usage(table, -free_count);
}
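
/*
 * Illustration, not upstream code: dup_table()/discard_table() above
 * maintain the invariant that a table's use count equals its number of
 * populated PTEs plus XLAT_REF_COUNT_UNIT for every parent entry linking to
 * it. A hypothetical consistency check could look like this; the guard is
 * normally left undefined.
 */
#ifdef CONFIG_ARM64_MMU_EXAMPLE_SNIPPETS
static void example_check_usage(uint64_t *table, int expected_ptes, int expected_refs)
{
	int count = table_usage(table, 0);

	__ASSERT((count & XLAT_PTE_COUNT_MASK) == expected_ptes, "PTE count mismatch");
	__ASSERT((count / XLAT_REF_COUNT_UNIT) == expected_refs, "reference count mismatch");
}
#endif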
static int globalize_table(uint64_t *dst_table, uint64_t *src_table,
			   uintptr_t virt, size_t size, unsigned int level)
{
	size_t step, level_size = 1ULL << LEVEL_TO_VA_SIZE_SHIFT(level);
	unsigned int i;
	int ret;

	for ( ; size; virt += step, size -= step) {
		step = level_size - (virt & (level_size - 1));
		if (step > size) {
			step = size;
		}
		i = XLAT_TABLE_VA_IDX(virt, level);

		if (dst_table[i] == src_table[i]) {
			/* already identical to global table */
			continue;
		}

		if (is_free_desc(src_table[i]) &&
		    is_table_desc(dst_table[i], level)) {
			uint64_t *subtable = pte_desc_table(dst_table[i]);

			del_mapping(subtable, virt, step, level + 1);
			if (is_table_unused(subtable)) {
				/* unreference the empty table */
				dst_table[i] = 0;
				table_usage(dst_table, -1);
				dec_table_ref(subtable);
			}
			continue;
		}

		if (step != level_size) {
			/* boundary falls in the middle of this pte */
			__ASSERT(is_table_desc(src_table[i], level),
				 "can't have partial block pte here");
			if (!is_table_desc(dst_table[i], level)) {
				/* we need more fine grained boundaries */
				if (!expand_to_table(&dst_table[i], level)) {
					return -ENOMEM;
				}
			}
			ret = globalize_table(pte_desc_table(dst_table[i]),
					      pte_desc_table(src_table[i]),
					      virt, step, level + 1);
			if (ret) {
				return ret;
			}
			continue;
		}

		/* we discard current pte and replace with global one */
		uint64_t *old_table = is_table_desc(dst_table[i], level) ?
					pte_desc_table(dst_table[i]) : NULL;

		if (is_free_desc(dst_table[i])) {
			table_usage(dst_table, 1);
		}
		if (is_free_desc(src_table[i])) {
			table_usage(dst_table, -1);
		}
		if (is_table_desc(src_table[i], level)) {
			inc_table_ref(pte_desc_table(src_table[i]));
		}
		dst_table[i] = src_table[i];
		debug_show_pte(&dst_table[i], level);

		if (old_table) {
			/* we can discard the whole branch */
			discard_table(old_table, level + 1);
			dec_table_ref(old_table);
		}
	}

	return 0;
}

/*
 * Globalize the given virtual address range in dst_pt from src_pt. We make
 * it global by sharing as much page table content from src_pt as possible,
 * including page tables themselves, and corresponding private tables in
 * dst_pt are then discarded. If page tables in the given range are already
 * shared then nothing is done. If page table sharing is not possible then
 * page table entries in dst_pt are synchronized with those from src_pt.
 */
static int globalize_page_range(struct arm_mmu_ptables *dst_pt,
				struct arm_mmu_ptables *src_pt,
				uintptr_t virt_start, size_t size,
				const char *name)
{
	k_spinlock_key_t key;
	int ret;

	MMU_DEBUG("globalize [%s]: virt %lx size %lx\n",
		  name, virt_start, size);

	key = k_spin_lock(&xlat_lock);

	ret = globalize_table(dst_pt->base_xlat_table, src_pt->base_xlat_table,
			      virt_start, size, BASE_XLAT_LEVEL);

	k_spin_unlock(&xlat_lock, key);

	return ret;
}

#endif /* CONFIG_USERSPACE */
static uint64_t get_region_desc(uint32_t attrs)
{
	unsigned int mem_type;
	uint64_t desc = 0U;

	/* NS bit for security memory access from secure state */
	desc |= (attrs & MT_NS) ? PTE_BLOCK_DESC_NS : 0;

	/*
	 * AP bits for EL0 / ELh Data access permission
	 *
	 *   AP[2:1]   ELh  EL0
	 * +--------------------+
	 *     00       RW   NA
	 *     01       RW   RW
	 *     10       RO   NA
	 *     11       RO   RO
	 */

	/* AP bits for Data access permission */
	desc |= (attrs & MT_RW) ? PTE_BLOCK_DESC_AP_RW : PTE_BLOCK_DESC_AP_RO;
	desc |= (IS_ENABLED(CONFIG_DEMAND_PAGING) && (attrs & MT_RW)) ?
		PTE_SW_WRITABLE : 0;

	/* Mirror permissions to EL0 */
	desc |= (attrs & MT_RW_AP_ELx) ?
		PTE_BLOCK_DESC_AP_ELx : PTE_BLOCK_DESC_AP_EL_HIGHER;

	/* the access flag */
	desc |= PTE_BLOCK_DESC_AF;

	if (IS_ENABLED(CONFIG_DEMAND_PAGING) && (attrs & MT_PAGED_OUT) != 0) {
		/* set it up for demand paging like arch_mem_page_out() */
		desc &= ~PTE_BLOCK_DESC_AF;
		desc |= PTE_BLOCK_DESC_AP_RO;
	}

	/* memory attribute index field */
	mem_type = MT_TYPE(attrs);
	desc |= PTE_BLOCK_DESC_MEMTYPE(mem_type);

	switch (mem_type) {
	case MT_DEVICE_nGnRnE:
	case MT_DEVICE_nGnRE:
	case MT_DEVICE_GRE:
		/*
		 * Accesses to Device memory and non-cacheable memory are coherent
		 * for all observers in the system and are treated as
		 * Outer shareable, so, for these 2 types of memory,
		 * it is not strictly needed to set the shareability field.
		 */
		desc |= PTE_BLOCK_DESC_OUTER_SHARE;
		/* Map device memory as execute-never */
		desc |= PTE_BLOCK_DESC_PXN;
		desc |= PTE_BLOCK_DESC_UXN;
		break;
	case MT_NORMAL_NC:
	case MT_NORMAL:
		/* Make Normal RW memory as execute never */
		if ((attrs & MT_RW) || (attrs & MT_P_EXECUTE_NEVER)) {
			desc |= PTE_BLOCK_DESC_PXN;
		}

		if (((attrs & MT_RW) && (attrs & MT_RW_AP_ELx)) ||
		    (attrs & MT_U_EXECUTE_NEVER)) {
			desc |= PTE_BLOCK_DESC_UXN;
		}

		if (mem_type == MT_NORMAL) {
			desc |= PTE_BLOCK_DESC_INNER_SHARE;
		} else {
			desc |= PTE_BLOCK_DESC_OUTER_SHARE;
		}
	}

	/* non-Global bit */
	if (attrs & MT_NG) {
		desc |= PTE_BLOCK_DESC_NG;
	}

	return desc;
}

static int __add_map(struct arm_mmu_ptables *ptables, const char *name,
		     uintptr_t phys, uintptr_t virt, size_t size, uint32_t attrs)
{
	uint64_t desc = get_region_desc(attrs);
	bool may_overwrite = !(attrs & MT_NO_OVERWRITE);

	MMU_DEBUG("mmap [%s]: virt %lx phys %lx size %lx attr %llx %s overwrite\n",
		  name, virt, phys, size, desc,
		  may_overwrite ? "may" : "no");
	__ASSERT(((virt | phys | size) & (CONFIG_MMU_PAGE_SIZE - 1)) == 0,
		 "address/size are not page aligned\n");
	desc |= phys;
	return set_mapping(ptables->base_xlat_table, virt, size, desc, may_overwrite);
}

static int add_map(struct arm_mmu_ptables *ptables, const char *name,
		   uintptr_t phys, uintptr_t virt, size_t size, uint32_t attrs)
{
	k_spinlock_key_t key;
	int ret;

	key = k_spin_lock(&xlat_lock);
	ret = __add_map(ptables, name, phys, virt, size, attrs);
	k_spin_unlock(&xlat_lock, key);
	return ret;
}

static void remove_map(struct arm_mmu_ptables *ptables, const char *name,
		       uintptr_t virt, size_t size)
{
	k_spinlock_key_t key;

	MMU_DEBUG("unmmap [%s]: virt %lx size %lx\n", name, virt, size);
	__ASSERT(((virt | size) & (CONFIG_MMU_PAGE_SIZE - 1)) == 0,
		 "address/size are not page aligned\n");

	key = k_spin_lock(&xlat_lock);
	del_mapping(ptables->base_xlat_table, virt, size, BASE_XLAT_LEVEL);
	k_spin_unlock(&xlat_lock, key);
}

static void invalidate_tlb_all(void)
{
	__asm__ volatile (
	"dsb ishst; tlbi vmalle1; dsb ish; isb"
	: : : "memory");
}

static inline void invalidate_tlb_page(uintptr_t virt)
{
	/* to be refined */
	invalidate_tlb_all();
}
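
/*
 * Illustration, not part of the driver: building the descriptor for a
 * hypothetical read-write device region with get_region_desc(). With these
 * attributes the code above picks the nGnRnE memory-attribute index, AP =
 * RW for the kernel only, and sets both PXN and UXN since device memory is
 * always mapped execute-never. The guard is normally left undefined.
 */
#ifdef CONFIG_ARM64_MMU_EXAMPLE_SNIPPETS
static uint64_t example_device_desc(void)
{
	return get_region_desc(MT_DEVICE_nGnRnE | MT_RW | MT_DEFAULT_SECURE_STATE);
}
#endif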
/* zephyr execution regions with appropriate attributes */

struct arm_mmu_flat_range {
	char *name;
	void *start;
	void *end;
	uint32_t attrs;
};

static const struct arm_mmu_flat_range mmu_zephyr_ranges[] = {

	/* Mark the zephyr execution regions (data, bss, noinit, etc.)
	 * cacheable, read-write
	 * Note: read-write region is marked execute-never internally
	 */
	{ .name  = "zephyr_data",
	  .start = _image_ram_start,
	  .end   = _image_ram_end,
	  .attrs = MT_NORMAL | MT_P_RW_U_NA | MT_DEFAULT_SECURE_STATE },

	/* Mark text segment cacheable, read only and executable */
	{ .name  = "zephyr_code",
	  .start = __text_region_start,
	  .end   = __text_region_end,
	  .attrs = MT_NORMAL | MT_P_RX_U_RX | MT_DEFAULT_SECURE_STATE },

	/* Mark rodata segment cacheable, read only and execute-never */
	{ .name  = "zephyr_rodata",
	  .start = __rodata_region_start,
	  .end   = __rodata_region_end,
	  .attrs = MT_NORMAL | MT_P_RO_U_RO | MT_DEFAULT_SECURE_STATE },

#ifdef CONFIG_NOCACHE_MEMORY
	/* Mark nocache segment non-cacheable, read-write and execute-never */
	{ .name  = "nocache_data",
	  .start = _nocache_ram_start,
	  .end   = _nocache_ram_end,
	  .attrs = MT_NORMAL_NC | MT_P_RW_U_RW | MT_DEFAULT_SECURE_STATE },
#endif
};

static inline void add_arm_mmu_flat_range(struct arm_mmu_ptables *ptables,
					  const struct arm_mmu_flat_range *range,
					  uint32_t extra_flags)
{
	uintptr_t address = (uintptr_t)range->start;
	size_t size = (uintptr_t)range->end - address;

	if (size) {
		/* MMU not yet active: must use unlocked version */
		__add_map(ptables, range->name, address, address,
			  size, range->attrs | extra_flags);
	}
}

static inline void add_arm_mmu_region(struct arm_mmu_ptables *ptables,
				      const struct arm_mmu_region *region,
				      uint32_t extra_flags)
{
	if (region->size || region->attrs) {
		/* MMU not yet active: must use unlocked version */
		__add_map(ptables, region->name, region->base_pa, region->base_va,
			  region->size, region->attrs | extra_flags);
	}
}

static inline void inv_dcache_after_map_helper(void *virt, size_t size, uint32_t attrs)
{
	/*
	 * DC IVAC instruction requires write access permission to the VA,
	 * otherwise it can generate a permission fault
	 */
	if ((attrs & MT_RW) != MT_RW) {
		return;
	}

	if (MT_TYPE(attrs) == MT_NORMAL || MT_TYPE(attrs) == MT_NORMAL_WT) {
		sys_cache_data_invd_range(virt, size);
	}
}
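
/*
 * Illustration, not from this file: the mmu_config consumed by
 * setup_page_tables() below normally comes from SoC/board code, typically
 * along these lines. The device name, address and size are made up, and the
 * exact macros may differ between Zephyr versions; the guard is normally
 * left undefined.
 */
#ifdef CONFIG_ARM64_MMU_EXAMPLE_SNIPPETS
static const struct arm_mmu_region example_mmu_regions[] = {
	/* flat (VA == PA) device mapping for a hypothetical UART */
	MMU_REGION_FLAT_ENTRY("UART0", 0x09000000, 0x1000,
			      MT_DEVICE_nGnRnE | MT_P_RW_U_NA | MT_DEFAULT_SECURE_STATE),
};

static const struct arm_mmu_config example_mmu_config = {
	.num_regions = ARRAY_SIZE(example_mmu_regions),
	.mmu_regions = example_mmu_regions,
};
#endif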
static void setup_page_tables(struct arm_mmu_ptables *ptables)
{
	unsigned int index;
	const struct arm_mmu_flat_range *range;
	const struct arm_mmu_region *region;
	uintptr_t max_va = 0, max_pa = 0;

	MMU_DEBUG("xlat tables:\n");
	for (index = 0U; index < CONFIG_MAX_XLAT_TABLES; index++) {
		MMU_DEBUG("%d: %p\n", index, xlat_tables + index * Ln_XLAT_NUM_ENTRIES);
	}

	for (index = 0U; index < mmu_config.num_regions; index++) {
		region = &mmu_config.mmu_regions[index];
		max_va = MAX(max_va, region->base_va + region->size);
		max_pa = MAX(max_pa, region->base_pa + region->size);
	}

	__ASSERT(max_va <= (1ULL << CONFIG_ARM64_VA_BITS),
		 "Maximum VA not supported\n");
	__ASSERT(max_pa <= (1ULL << CONFIG_ARM64_PA_BITS),
		 "Maximum PA not supported\n");

	/* setup translation table for zephyr execution regions */
	for (index = 0U; index < ARRAY_SIZE(mmu_zephyr_ranges); index++) {
		range = &mmu_zephyr_ranges[index];
		add_arm_mmu_flat_range(ptables, range, 0);
	}

	/*
	 * Create translation tables for user provided platform regions.
	 * Those must not conflict with our default mapping.
	 */
	for (index = 0U; index < mmu_config.num_regions; index++) {
		region = &mmu_config.mmu_regions[index];
		add_arm_mmu_region(ptables, region, MT_NO_OVERWRITE);
	}

	invalidate_tlb_all();

	for (index = 0U; index < ARRAY_SIZE(mmu_zephyr_ranges); index++) {
		size_t size;

		range = &mmu_zephyr_ranges[index];
		size = POINTER_TO_UINT(range->end) - POINTER_TO_UINT(range->start);
		inv_dcache_after_map_helper(range->start, size, range->attrs);
	}

	for (index = 0U; index < mmu_config.num_regions; index++) {
		region = &mmu_config.mmu_regions[index];
		inv_dcache_after_map_helper(UINT_TO_POINTER(region->base_va), region->size,
					    region->attrs);
	}
}

/* Translation table control register settings */
static uint64_t get_tcr(int el)
{
	uint64_t tcr;
	uint64_t va_bits = CONFIG_ARM64_VA_BITS;
	uint64_t tcr_ps_bits;

	tcr_ps_bits = TCR_PS_BITS;

	if (el == 1) {
		tcr = (tcr_ps_bits << TCR_EL1_IPS_SHIFT);
		/*
		 * TCR_EL1.EPD1: Disable translation table walk for addresses
		 * that are translated using TTBR1_EL1.
		 */
		tcr |= TCR_EPD1_DISABLE;
	} else {
		tcr = (tcr_ps_bits << TCR_EL3_PS_SHIFT);
	}

	tcr |= TCR_T0SZ(va_bits);
	/*
	 * Translation table walk is cacheable, inner/outer WBWA and
	 * inner shareable. Due to Cortex-A57 erratum #822227 we must
	 * also set TG1 to the 4KB granule.
	 */
	tcr |= TCR_TG1_4K | TCR_TG0_4K | TCR_SHARED_INNER |
	       TCR_ORGN_WBWA | TCR_IRGN_WBWA;

	return tcr;
}

static void enable_mmu_el1(struct arm_mmu_ptables *ptables, unsigned int flags)
{
	ARG_UNUSED(flags);
	uint64_t val;

	/* Set MAIR, TCR and TTBR registers */
	write_mair_el1(MEMORY_ATTRIBUTES);
	write_tcr_el1(get_tcr(1));
	write_ttbr0_el1((uint64_t)ptables->base_xlat_table);

	/* Ensure these changes are seen before MMU is enabled */
	barrier_isync_fence_full();

	/* Enable the MMU and data cache */
	val = read_sctlr_el1();
	write_sctlr_el1(val | SCTLR_M_BIT | SCTLR_C_BIT);

	/* Ensure the MMU enable takes effect immediately */
	barrier_isync_fence_full();

	MMU_DEBUG("MMU enabled with dcache\n");
}
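
/*
 * Illustration, not upstream code: TCR_T0SZ() used by get_tcr() above
 * encodes the TTBR0 region size as 64 minus the number of VA bits, e.g. a
 * 48-bit address space gives T0SZ = 16 and a 39-bit one gives T0SZ = 25.
 * Hypothetical helper; the guard is normally left undefined.
 */
#ifdef CONFIG_ARM64_MMU_EXAMPLE_SNIPPETS
static inline unsigned int example_t0sz(unsigned int va_bits)
{
	/* same arithmetic as the T0SZ field encoding */
	return 64U - va_bits;
}
#endif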
/* ARM MMU Driver Initial Setup */

static struct arm_mmu_ptables kernel_ptables;
#ifdef CONFIG_USERSPACE
static sys_slist_t domain_list;
#endif

/*
 * @brief MMU default configuration
 *
 * This function provides the default configuration mechanism for the Memory
 * Management Unit (MMU).
 */
void z_arm64_mm_init(bool is_primary_core)
{
	unsigned int flags = 0U;

	__ASSERT(CONFIG_MMU_PAGE_SIZE == KB(4),
		 "Only 4K page size is supported\n");

	__ASSERT(GET_EL(read_currentel()) == MODE_EL1,
		 "Exception level not EL1, MMU not enabled!\n");

	/* Ensure that MMU is already not enabled */
	__ASSERT((read_sctlr_el1() & SCTLR_M_BIT) == 0, "MMU is already enabled\n");

	/*
	 * Only the booting core sets up the page tables.
	 */
	if (is_primary_core) {
		kernel_ptables.base_xlat_table = new_table();
		setup_page_tables(&kernel_ptables);
	}

	/* currently only EL1 is supported */
	enable_mmu_el1(&kernel_ptables, flags);
}

static void sync_domains(uintptr_t virt, size_t size, const char *name)
{
#ifdef CONFIG_USERSPACE
	sys_snode_t *node;
	struct arch_mem_domain *domain;
	struct arm_mmu_ptables *domain_ptables;
	k_spinlock_key_t key;
	int ret;

	key = k_spin_lock(&z_mem_domain_lock);

	SYS_SLIST_FOR_EACH_NODE(&domain_list, node) {
		domain = CONTAINER_OF(node, struct arch_mem_domain, node);
		domain_ptables = &domain->ptables;
		ret = globalize_page_range(domain_ptables, &kernel_ptables,
					   virt, size, name);
		if (ret) {
			LOG_ERR("globalize_page_range() returned %d", ret);
		}
	}
	k_spin_unlock(&z_mem_domain_lock, key);
#endif
}

static int __arch_mem_map(void *virt, uintptr_t phys, size_t size, uint32_t flags)
{
	struct arm_mmu_ptables *ptables;
	uint32_t entry_flags = MT_DEFAULT_SECURE_STATE | MT_P_RX_U_NA | MT_NO_OVERWRITE;

	/* Always map in the kernel page tables */
	ptables = &kernel_ptables;

	/* Translate flags argument into HW-recognized entry flags. */
	switch (flags & K_MEM_CACHE_MASK) {
	/*
	 * K_MEM_CACHE_NONE, K_MEM_ARM_DEVICE_nGnRnE => MT_DEVICE_nGnRnE
	 *			(Device memory nGnRnE)
	 * K_MEM_ARM_DEVICE_nGnRE => MT_DEVICE_nGnRE
	 *			(Device memory nGnRE)
	 * K_MEM_ARM_DEVICE_GRE => MT_DEVICE_GRE
	 *			(Device memory GRE)
	 * K_MEM_ARM_NORMAL_NC => MT_NORMAL_NC
	 *			(Normal memory Non-cacheable)
	 * K_MEM_CACHE_WB => MT_NORMAL
	 *			(Normal memory Outer WB + Inner WB)
	 * K_MEM_CACHE_WT => MT_NORMAL_WT
	 *			(Normal memory Outer WT + Inner WT)
	 */
	case K_MEM_CACHE_NONE:
	/* K_MEM_CACHE_NONE equal to K_MEM_ARM_DEVICE_nGnRnE */
	/* case K_MEM_ARM_DEVICE_nGnRnE: */
		entry_flags |= MT_DEVICE_nGnRnE;
		break;
	case K_MEM_ARM_DEVICE_nGnRE:
		entry_flags |= MT_DEVICE_nGnRE;
		break;
	case K_MEM_ARM_DEVICE_GRE:
		entry_flags |= MT_DEVICE_GRE;
		break;
	case K_MEM_ARM_NORMAL_NC:
		entry_flags |= MT_NORMAL_NC;
		break;
	case K_MEM_CACHE_WT:
		entry_flags |= MT_NORMAL_WT;
		break;
	case K_MEM_CACHE_WB:
		entry_flags |= MT_NORMAL;
		break;
	default:
		return -ENOTSUP;
	}

	if ((flags & K_MEM_PERM_RW) != 0U) {
		entry_flags |= MT_RW;
	}

	if ((flags & K_MEM_PERM_EXEC) == 0U) {
		entry_flags |= MT_P_EXECUTE_NEVER;
	}

	if ((flags & K_MEM_PERM_USER) != 0U) {
		entry_flags |= MT_RW_AP_ELx;
	}

	if (IS_ENABLED(CONFIG_DEMAND_PAGING) && (flags & K_MEM_MAP_UNPAGED) != 0) {
		entry_flags |= MT_PAGED_OUT;
	}

	return add_map(ptables, "generic", phys, (uintptr_t)virt, size, entry_flags);
}
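
/*
 * Illustration, not part of the file: given the flag translation in
 * __arch_mem_map() above, a mapping like the one below ends up as
 * Device-nGnRnE, read-write, execute-never, and not accessible from EL0.
 * The addresses are made up; the guard is normally left undefined.
 */
#ifdef CONFIG_ARM64_MMU_EXAMPLE_SNIPPETS
static int example_map_mmio_page(void)
{
	/* one page of hypothetical MMIO, strongly ordered, kernel RW only */
	return __arch_mem_map((void *)0x90000000, 0xa0000000,
			      CONFIG_MMU_PAGE_SIZE,
			      K_MEM_CACHE_NONE | K_MEM_PERM_RW);
}
#endif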
void arch_mem_map(void *virt, uintptr_t phys, size_t size, uint32_t flags)
{
	int ret = __arch_mem_map(virt, phys, size, flags);

	if (ret) {
		LOG_ERR("__arch_mem_map() returned %d", ret);
		k_panic();
	} else {
		uint32_t mem_flags = flags & K_MEM_CACHE_MASK;

		sync_domains((uintptr_t)virt, size, "mem_map");
		invalidate_tlb_all();

		switch (mem_flags) {
		case K_MEM_CACHE_WB:
		case K_MEM_CACHE_WT:
			mem_flags = (mem_flags == K_MEM_CACHE_WB) ? MT_NORMAL : MT_NORMAL_WT;
			mem_flags |= (flags & K_MEM_PERM_RW) ? MT_RW : 0;
			inv_dcache_after_map_helper(virt, size, mem_flags);
		default:
			break;
		}
	}
}

void arch_mem_unmap(void *addr, size_t size)
{
	remove_map(&kernel_ptables, "generic", (uintptr_t)addr, size);
	sync_domains((uintptr_t)addr, size, "mem_unmap");
	invalidate_tlb_all();
}

int arch_page_phys_get(void *virt, uintptr_t *phys)
{
	uint64_t par;
	int key;

	key = arch_irq_lock();
	__asm__ volatile ("at S1E1R, %0" : : "r" (virt));
	barrier_isync_fence_full();
	par = read_par_el1();
	arch_irq_unlock(key);

	if (par & BIT(0)) {
		return -EFAULT;
	}

	if (phys) {
		*phys = par & GENMASK64(47, 12);
	}
	return 0;
}

size_t arch_virt_region_align(uintptr_t phys, size_t size)
{
	size_t alignment = CONFIG_MMU_PAGE_SIZE;
	size_t level_size;
	int level;

	for (level = XLAT_LAST_LEVEL; level >= BASE_XLAT_LEVEL; level--) {
		level_size = 1 << LEVEL_TO_VA_SIZE_SHIFT(level);

		if (size < level_size) {
			break;
		}

		if ((phys & (level_size - 1))) {
			break;
		}

		alignment = level_size;
	}

	return alignment;
}

#ifdef CONFIG_USERSPACE

static uint16_t next_asid = 1;

static uint16_t get_asid(uint64_t ttbr0)
{
	return ttbr0 >> TTBR_ASID_SHIFT;
}

static void z_arm64_swap_ptables(struct k_thread *incoming);

int arch_mem_domain_max_partitions_get(void)
{
	return CONFIG_MAX_DOMAIN_PARTITIONS;
}

int arch_mem_domain_init(struct k_mem_domain *domain)
{
	struct arm_mmu_ptables *domain_ptables = &domain->arch.ptables;
	k_spinlock_key_t key;
	uint16_t asid;

	MMU_DEBUG("%s\n", __func__);

	key = k_spin_lock(&xlat_lock);

	/*
	 * Pick a new ASID. We use round-robin
	 * Note: `next_asid` is an uint16_t and `VM_ASID_BITS` could
	 * be up to 16, hence `next_asid` might overflow to 0 below.
	 */
	asid = next_asid++;
	if ((next_asid >= (1UL << VM_ASID_BITS)) || (next_asid == 0)) {
		next_asid = 1;
	}

	domain_ptables->base_xlat_table =
		dup_table(kernel_ptables.base_xlat_table, BASE_XLAT_LEVEL);
	k_spin_unlock(&xlat_lock, key);
	if (!domain_ptables->base_xlat_table) {
		return -ENOMEM;
	}

	domain_ptables->ttbr0 = (((uint64_t)asid) << TTBR_ASID_SHIFT) |
		((uint64_t)(uintptr_t)domain_ptables->base_xlat_table);

	sys_slist_append(&domain_list, &domain->arch.node);
	return 0;
}

static int private_map(struct arm_mmu_ptables *ptables, const char *name,
		       uintptr_t phys, uintptr_t virt, size_t size, uint32_t attrs)
{
	int ret;

	ret = privatize_page_range(ptables, &kernel_ptables, virt, size, name);
	__ASSERT(ret == 0, "privatize_page_range() returned %d", ret);
	ret = add_map(ptables, name, phys, virt, size, attrs | MT_NG);
	__ASSERT(ret == 0, "add_map() returned %d", ret);
	invalidate_tlb_all();

	inv_dcache_after_map_helper(UINT_TO_POINTER(virt), size, attrs);
	return ret;
}

static int reset_map(struct arm_mmu_ptables *ptables, const char *name,
		     uintptr_t addr, size_t size)
{
	int ret;

	ret = globalize_page_range(ptables, &kernel_ptables, addr, size, name);
	__ASSERT(ret == 0, "globalize_page_range() returned %d", ret);
	invalidate_tlb_all();

	return ret;
}

int arch_mem_domain_partition_add(struct k_mem_domain *domain, uint32_t partition_id)
{
	struct arm_mmu_ptables *domain_ptables = &domain->arch.ptables;
	struct k_mem_partition *ptn = &domain->partitions[partition_id];

	return private_map(domain_ptables, "partition", ptn->start, ptn->start,
			   ptn->size, ptn->attr.attrs | MT_NORMAL);
}

int arch_mem_domain_partition_remove(struct k_mem_domain *domain, uint32_t partition_id)
{
	struct arm_mmu_ptables *domain_ptables = &domain->arch.ptables;
	struct k_mem_partition *ptn = &domain->partitions[partition_id];

	return reset_map(domain_ptables, "partition removal",
			 ptn->start, ptn->size);
}
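
/*
 * Illustration, not upstream code: arch_mem_domain_init() above composes
 * TTBR0 from the domain's base translation table address with the 16-bit
 * ASID placed in bits 63:48 (TTBR_ASID_SHIFT). This hypothetical helper
 * restates that composition, the inverse of get_asid(); the guard is
 * normally left undefined.
 */
#ifdef CONFIG_ARM64_MMU_EXAMPLE_SNIPPETS
static uint64_t example_make_ttbr0(uint16_t asid, uint64_t *base_xlat_table)
{
	return (((uint64_t)asid) << TTBR_ASID_SHIFT) |
	       ((uint64_t)(uintptr_t)base_xlat_table);
}
#endif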
static int map_thread_stack(struct k_thread *thread,
			    struct arm_mmu_ptables *ptables)
{
	return private_map(ptables, "thread_stack", thread->stack_info.start,
			   thread->stack_info.start, thread->stack_info.size,
			   MT_P_RW_U_RW | MT_NORMAL);
}

int arch_mem_domain_thread_add(struct k_thread *thread)
{
	struct arm_mmu_ptables *old_ptables, *domain_ptables;
	struct k_mem_domain *domain;
	bool is_user, is_migration;
	int ret = 0;

	domain = thread->mem_domain_info.mem_domain;
	domain_ptables = &domain->arch.ptables;
	old_ptables = thread->arch.ptables;

	is_user = (thread->base.user_options & K_USER) != 0;
	is_migration = (old_ptables != NULL) && is_user;

	if (is_migration) {
		ret = map_thread_stack(thread, domain_ptables);
	}

	thread->arch.ptables = domain_ptables;
	if (thread == arch_current_thread()) {
		z_arm64_swap_ptables(thread);
	} else {
#ifdef CONFIG_SMP
		/* the thread could be running on another CPU right now */
		z_arm64_mem_cfg_ipi();
#endif
	}

	if (is_migration) {
		ret = reset_map(old_ptables, __func__, thread->stack_info.start,
				thread->stack_info.size);
	}

	return ret;
}

int arch_mem_domain_thread_remove(struct k_thread *thread)
{
	struct arm_mmu_ptables *domain_ptables;
	struct k_mem_domain *domain;

	domain = thread->mem_domain_info.mem_domain;
	domain_ptables = &domain->arch.ptables;

	if ((thread->base.user_options & K_USER) == 0) {
		return 0;
	}

	if ((thread->base.thread_state & _THREAD_DEAD) == 0) {
		return 0;
	}

	return reset_map(domain_ptables, __func__, thread->stack_info.start,
			 thread->stack_info.size);
}

static void z_arm64_swap_ptables(struct k_thread *incoming)
{
	struct arm_mmu_ptables *ptables = incoming->arch.ptables;
	uint64_t curr_ttbr0 = read_ttbr0_el1();
	uint64_t new_ttbr0 = ptables->ttbr0;

	if (curr_ttbr0 == new_ttbr0) {
		return; /* Already the right tables */
	}

	MMU_DEBUG("TTBR0 switch from %#llx to %#llx\n", curr_ttbr0, new_ttbr0);
	z_arm64_set_ttbr0(new_ttbr0);

	if (get_asid(curr_ttbr0) == get_asid(new_ttbr0)) {
		invalidate_tlb_all();
	}
}

void z_arm64_thread_mem_domains_init(struct k_thread *incoming)
{
	struct arm_mmu_ptables *ptables;

	if ((incoming->base.user_options & K_USER) == 0) {
		return;
	}

	ptables = incoming->arch.ptables;

	/* Map the thread stack */
	map_thread_stack(incoming, ptables);

	z_arm64_swap_ptables(incoming);
}

void z_arm64_swap_mem_domains(struct k_thread *incoming)
{
	z_arm64_swap_ptables(incoming);
}

#endif /* CONFIG_USERSPACE */

#ifdef CONFIG_DEMAND_PAGING

static uint64_t *get_pte_location(struct arm_mmu_ptables *ptables,
				  uintptr_t virt)
{
	uint64_t *pte;
	uint64_t *table = ptables->base_xlat_table;
	unsigned int level = BASE_XLAT_LEVEL;

	for (;;) {
		pte = &table[XLAT_TABLE_VA_IDX(virt, level)];
		if (level == XLAT_LAST_LEVEL) {
			return pte;
		}

		if (is_table_desc(*pte, level)) {
			level++;
			table = pte_desc_table(*pte);
			continue;
		}

		/* anything else is unexpected */
		return NULL;
	}
}
void arch_mem_page_out(void *addr, uintptr_t location)
{
	uintptr_t virt = (uintptr_t)addr;
	uint64_t *pte = get_pte_location(&kernel_ptables, virt);
	uint64_t desc;

	__ASSERT(pte != NULL, "");
	desc = *pte;

	/* mark the entry invalid to the hardware */
	desc &= ~PTE_DESC_TYPE_MASK;
	desc |= PTE_INVALID_DESC;

	/* store the location token in place of the physical address */
	__ASSERT((location & ~PTE_PHYSADDR_MASK) == 0, "");
	desc &= ~PTE_PHYSADDR_MASK;
	desc |= location;

	/*
	 * The location token may be 0. Make sure the whole descriptor
	 * doesn't end up being zero as this would be seen as a free entry.
	 */
	desc |= PTE_BLOCK_DESC_AP_RO;

	*pte = desc;
	MMU_DEBUG("page_out: virt=%#lx location=%#lx\n", virt, location);
	debug_show_pte(pte, XLAT_LAST_LEVEL);

	sync_domains(virt, CONFIG_MMU_PAGE_SIZE, "page_out");
	invalidate_tlb_page(virt);
}

void arch_mem_page_in(void *addr, uintptr_t phys)
{
	uintptr_t virt = (uintptr_t)addr;
	uint64_t *pte = get_pte_location(&kernel_ptables, virt);
	uint64_t desc;

	__ASSERT((phys & ~PTE_PHYSADDR_MASK) == 0, "");
	__ASSERT(pte != NULL, "");
	desc = *pte;
	__ASSERT(!is_free_desc(desc), "");

	/* mark the entry valid again to the hardware */
	desc &= ~PTE_DESC_TYPE_MASK;
	desc |= PTE_PAGE_DESC;

	/* store the physical address */
	desc &= ~PTE_PHYSADDR_MASK;
	desc |= phys;

	/* mark as clean */
	desc |= PTE_BLOCK_DESC_AP_RO;

	/* and make it initially unaccessible to track unaccessed pages */
	desc &= ~PTE_BLOCK_DESC_AF;

	*pte = desc;
	MMU_DEBUG("page_in: virt=%#lx phys=%#lx\n", virt, phys);
	debug_show_pte(pte, XLAT_LAST_LEVEL);

	sync_domains(virt, CONFIG_MMU_PAGE_SIZE, "page_in");
	invalidate_tlb_page(virt);
}

enum arch_page_location arch_page_location_get(void *addr, uintptr_t *location)
{
	uintptr_t virt = (uintptr_t)addr;
	uint64_t *pte = get_pte_location(&kernel_ptables, virt);
	uint64_t desc;
	enum arch_page_location status;

	if (!pte) {
		return ARCH_PAGE_LOCATION_BAD;
	}
	desc = *pte;
	if (is_free_desc(desc)) {
		return ARCH_PAGE_LOCATION_BAD;
	}

	switch (desc & PTE_DESC_TYPE_MASK) {
	case PTE_PAGE_DESC:
		status = ARCH_PAGE_LOCATION_PAGED_IN;
		break;
	case PTE_INVALID_DESC:
		status = ARCH_PAGE_LOCATION_PAGED_OUT;
		break;
	default:
		return ARCH_PAGE_LOCATION_BAD;
	}

	*location = desc & PTE_PHYSADDR_MASK;
	return status;
}

uintptr_t arch_page_info_get(void *addr, uintptr_t *phys, bool clear_accessed)
{
	uintptr_t virt = (uintptr_t)addr;
	uint64_t *pte = get_pte_location(&kernel_ptables, virt);
	uint64_t desc;
	uintptr_t status = 0;

	if (!pte) {
		return ARCH_DATA_PAGE_NOT_MAPPED;
	}
	desc = *pte;
	if (is_free_desc(desc)) {
		return ARCH_DATA_PAGE_NOT_MAPPED;
	}

	switch (desc & PTE_DESC_TYPE_MASK) {
	case PTE_PAGE_DESC:
		status |= ARCH_DATA_PAGE_LOADED;
		break;
	case PTE_INVALID_DESC:
		/* page not loaded */
		break;
	default:
		return ARCH_DATA_PAGE_NOT_MAPPED;
	}

	if (phys) {
		*phys = desc & PTE_PHYSADDR_MASK;
	}

	if ((status & ARCH_DATA_PAGE_LOADED) == 0) {
		return status;
	}

	if ((desc & PTE_BLOCK_DESC_AF) != 0) {
		status |= ARCH_DATA_PAGE_ACCESSED;
	}

	if ((desc & PTE_BLOCK_DESC_AP_RO) == 0) {
		status |= ARCH_DATA_PAGE_DIRTY;
	}

	if (clear_accessed) {
		desc &= ~PTE_BLOCK_DESC_AF;
		*pte = desc;
		MMU_DEBUG("page_info: virt=%#lx (clearing AF)\n", virt);
		debug_show_pte(pte, XLAT_LAST_LEVEL);
		sync_domains(virt, CONFIG_MMU_PAGE_SIZE, "unaccessible");
		invalidate_tlb_page(virt);
	}

	return status;
}

#define MT_SCRATCH (MT_NORMAL | MT_P_RW_U_NA | MT_DEFAULT_SECURE_STATE)

void arch_mem_scratch(uintptr_t phys)
{
	uintptr_t virt = (uintptr_t)K_MEM_SCRATCH_PAGE;
	size_t size = CONFIG_MMU_PAGE_SIZE;
	int ret = add_map(&kernel_ptables, "scratch", phys, virt, size, MT_SCRATCH);

	if (ret) {
		LOG_ERR("add_map() returned %d", ret);
	} else {
		sync_domains(virt, size, "scratch");
		invalidate_tlb_page(virt);
	}
}

static bool do_mem_page_fault(struct arch_esf *esf, uintptr_t virt)
{
	/*
	 * The k_mem_page_fault() code expects to be called with IRQs enabled
	 * if the fault happened in a context where IRQs were enabled.
	 */
	if (arch_irq_unlocked(esf->spsr)) {
		enable_irq();
	}

	bool ok = k_mem_page_fault((void *)virt);

	disable_irq();
	return ok;
}
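
/*
 * Illustration, not from the original file: the demand paging filter below
 * keys off the ISS field of ESR_EL1. For data aborts, bits 5:0 hold the
 * fault status code (0b0010LL = access flag fault, 0b0011LL = permission
 * fault, LL being the translation level) and bit 6 (WnR) is set for writes.
 * Hypothetical helpers; the guard is normally left undefined.
 */
#ifdef CONFIG_ARM64_MMU_EXAMPLE_SNIPPETS
static inline uint32_t example_esr_dfsc(uint64_t esr)
{
	/* DFSC lives in ISS bits 5:0 */
	return GET_ESR_ISS(esr) & GENMASK(5, 0);
}

static inline bool example_esr_is_write(uint64_t esr)
{
	/* WnR: write-not-read, valid for data aborts */
	return (GET_ESR_ISS(esr) & BIT(6)) != 0;
}
#endif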
/* Called from the fault handler. Returns true if the fault is resolved. */
bool z_arm64_do_demand_paging(struct arch_esf *esf, uint64_t esr, uint64_t far)
{
	uintptr_t virt = far;
	uint64_t *pte, desc;
	uintptr_t phys;

	/* filter relevant exceptions */
	switch (GET_ESR_EC(esr)) {
	case 0x21: /* insn abort from current EL */
	case 0x25: /* data abort from current EL */
		break;
	default:
		return false;
	}

	/* make sure the fault happened in the expected range */
	if (!IN_RANGE(virt,
		      (uintptr_t)K_MEM_VIRT_RAM_START,
		      ((uintptr_t)K_MEM_VIRT_RAM_END - 1))) {
		return false;
	}

	virt = ROUND_DOWN(virt, CONFIG_MMU_PAGE_SIZE);
	pte = get_pte_location(&kernel_ptables, virt);
	if (!pte) {
		/* page mapping doesn't exist, let the core code do its thing */
		return do_mem_page_fault(esf, virt);
	}
	desc = *pte;
	if ((desc & PTE_DESC_TYPE_MASK) != PTE_PAGE_DESC) {
		/* page is not loaded/mapped */
		return do_mem_page_fault(esf, virt);
	}

	/*
	 * From this point, we expect only 2 cases:
	 *
	 * 1) the Access Flag was not set so we set it marking the page
	 *    as accessed;
	 *
	 * 2) the page was read-only and a write occurred so we clear the
	 *    RO flag marking the page dirty.
	 *
	 * We bail out on anything else.
	 *
	 * Fault status codes for Data aborts (DFSC):
	 *  0b0010LL	Access flag fault
	 *  0b0011LL	Permission fault
	 */
	uint32_t dfsc = GET_ESR_ISS(esr) & GENMASK(5, 0);
	bool write = (GET_ESR_ISS(esr) & BIT(6)) != 0; /* WnR */

	if (dfsc == (0b001000 | XLAT_LAST_LEVEL) &&
	    (desc & PTE_BLOCK_DESC_AF) == 0) {
		/* page is being accessed: set the access flag */
		desc |= PTE_BLOCK_DESC_AF;
		if (write) {
			if ((desc & PTE_SW_WRITABLE) == 0) {
				/* we don't actually have write permission */
				return false;
			}
			/*
			 * Let's avoid another fault immediately after
			 * returning by making the page read-write right away
			 * effectively marking it "dirty" as well.
			 */
			desc &= ~PTE_BLOCK_DESC_AP_RO;
		}
		*pte = desc;
		sync_domains(virt, CONFIG_MMU_PAGE_SIZE, "accessed");
		/* no TLB inval needed after setting AF */

		/* tell the eviction algorithm about it */
		phys = desc & PTE_PHYSADDR_MASK;
		k_mem_paging_eviction_accessed(phys);
		return true;
	}

	if (dfsc == (0b001100 | XLAT_LAST_LEVEL) && write &&
	    (desc & PTE_BLOCK_DESC_AP_RO) != 0 &&
	    (desc & PTE_SW_WRITABLE) != 0) {
		/* make it "dirty" i.e. read-write */
		desc &= ~PTE_BLOCK_DESC_AP_RO;
		*pte = desc;
		sync_domains(virt, CONFIG_MMU_PAGE_SIZE, "dirtied");
		invalidate_tlb_page(virt);

		/* this also counts as an access refresh */
		phys = desc & PTE_PHYSADDR_MASK;
		k_mem_paging_eviction_accessed(phys);
		return true;
	}

	return false;
}

#endif /* CONFIG_DEMAND_PAGING */