/*
 * Copyright (c) 2009 Corey Tabaka
 * Copyright (c) 2015-2018 Intel Corporation
 * Copyright (c) 2016 Travis Geiselbrecht
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files
 * (the "Software"), to deal in the Software without restriction,
 * including without limitation the rights to use, copy, modify, merge,
 * publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so,
 * subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
#include <debug.h>
#include <trace.h>
#include <sys/types.h>
#include <compiler.h>
#include <arch.h>
#include <arch/x86.h>
#include <arch/x86/mmu.h>
#include <stdlib.h>
#include <string.h>
#include <arch/mmu.h>
#include <assert.h>
#include <err.h>
#include <arch/arch_ops.h>
#include <kernel/vm.h>
#include <inttypes.h>

#define LOCAL_TRACE 0

/* Address widths of virtual/physical addresses */
uint8_t g_vaddr_width = 0;
uint8_t g_paddr_width = 0;

paddr_t x86_kernel_page_table = 0;

/*
 * Page table 1:
 *
 * This page table is used for bootstrap code
 * VA - start, size : PA - start, size
 * MEMBASE+KERNEL_LOAD_OFFSET, 1 PAGE : MEMBASE+KERNEL_LOAD_OFFSET, 1 PAGE
 * PHYS(_gdt), 1 PAGE : PHYS(_gdt), 1 PAGE
 * KERNEL_BASE+KERNEL_LOAD_OFFSET, 1 PAGE : MEMBASE+KERNEL_LOAD_OFFSET, 1 PAGE
 *
 * 4-level paging is used to cover bootstrap code:
 * entry in pml4 (Page Map Level 4) covers 512GB,
 * entry in pdpt (Page-directory-pointer table) covers 1GB,
 * entry in pd (Page directory) covers 2MB,
 * entry in pt (Page table) covers 4KB.
 *
 * pml4_trampoline->pdpt_trampoline->pd_trampoline->pt_trampoline
 * covers VA (start ~ end):
 * MEMBASE+KERNEL_LOAD_OFFSET ~ MEMBASE+KERNEL_LOAD_OFFSET + 1 PAGE
 * and
 * pml4_trampoline->pdpt_trampoline->pd_trampoline->pt_trampoline_gdt
 * covers VA (start ~ end):
 * PHYS(_gdtr_phys) ~ PHYS(_gdtr_phys) + 1 PAGE
 *
 */
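
/*
 * For reference, a sketch of how a virtual address is decomposed under
 * 4-level paging (assuming the usual 48-bit layout with 9-bit indices
 * and 4KB pages):
 *
 *   bits 47..39 : pml4 index  (512GB per entry)
 *   bits 38..30 : pdpt index  (1GB per entry)
 *   bits 29..21 : pd index    (2MB per entry)
 *   bits 20..12 : pt index    (4KB per entry)
 *   bits 11..0  : byte offset within the 4KB page
 */
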
map_addr_t pml4_trampoline[NO_OF_PT_ENTRIES] __ALIGNED(PAGE_SIZE);
map_addr_t pdpt_trampoline[NO_OF_PT_ENTRIES] __ALIGNED(PAGE_SIZE);
map_addr_t pd_trampoline[NO_OF_PT_ENTRIES] __ALIGNED(PAGE_SIZE);
map_addr_t pt_trampoline[NO_OF_PT_ENTRIES] __ALIGNED(PAGE_SIZE);
map_addr_t pt_trampoline_gdt[NO_OF_PT_ENTRIES] __ALIGNED(PAGE_SIZE);

/*
 * Page table 2:
 * This page table is used at run time in 64-bit mode.
 * (memsize equals the upper memory passed in by the bootloader minus the
 * physical start address of the lk binary; if memsize is larger than 1GB,
 * more page directories for this page table are allocated from boot mem)
 * VA start, size : PA start, size
 * KERNEL_BASE, memsize : MEMBASE, memsize
 */
map_addr_t pml4[NO_OF_PT_ENTRIES] __ALIGNED(PAGE_SIZE);
map_addr_t pdpt[NO_OF_PT_ENTRIES] __ALIGNED(PAGE_SIZE);
map_addr_t pd[NO_OF_PT_ENTRIES] __ALIGNED(PAGE_SIZE);
map_addr_t pt[NO_OF_PT_ENTRIES][NO_OF_PT_ENTRIES] __ALIGNED(PAGE_SIZE);

/**
 * @brief Check if the virtual address is aligned and canonical
 *
 */
static bool x86_mmu_check_vaddr(vaddr_t vaddr)
{
    uint64_t addr = (uint64_t)vaddr;
    uint64_t max_vaddr_lohalf, min_vaddr_hihalf;

    /* Check to see if the address is PAGE aligned */
    if (!IS_ALIGNED(addr, PAGE_SIZE))
        return false;

    /* get max address in lower-half canonical addr space */
    /* e.g. if width is 48, then 0x00007FFF_FFFFFFFF */
    max_vaddr_lohalf = ((uint64_t)1ull << (g_vaddr_width - 1)) - 1;

    /* get min address in higher-half canonical addr space */
    /* e.g. if width is 48, then 0xFFFF8000_00000000 */
    min_vaddr_hihalf = ~max_vaddr_lohalf;

    /* Check to see if the address is a canonical address */
    if ((addr > max_vaddr_lohalf) && (addr < min_vaddr_hihalf))
        return false;

    return true;
}


/**
 * @brief Check if the physical address is valid and aligned
 *
 */
static bool x86_mmu_check_paddr(paddr_t paddr)
{
    uint64_t addr = (uint64_t)paddr;
    uint64_t max_paddr;

    /* Check to see if the address is PAGE aligned */
    if (!IS_ALIGNED(addr, PAGE_SIZE))
        return false;

    max_paddr = ((uint64_t)1ull << g_paddr_width) - 1;

    return addr <= max_paddr;
}

static inline uint64_t get_pml4_entry_from_pml4_table(vaddr_t vaddr, addr_t pml4_addr)
{
    uint32_t pml4_index;
    uint64_t *pml4_table = (uint64_t *)pml4_addr;

    pml4_index = (((uint64_t)vaddr >> PML4_SHIFT) & ((1ul << ADDR_OFFSET) - 1));
    return X86_PHYS_TO_VIRT(pml4_table[pml4_index]);
}

static inline uint64_t get_pdp_entry_from_pdp_table(vaddr_t vaddr, uint64_t pml4e)
{
    uint32_t pdp_index;
    uint64_t *pdpe;

    pdp_index = (((uint64_t)vaddr >> PDP_SHIFT) & ((1ul << ADDR_OFFSET) - 1));
    pdpe = (uint64_t *)(pml4e & X86_PG_FRAME);
    return X86_PHYS_TO_VIRT(pdpe[pdp_index]);
}

static inline uint64_t get_pd_entry_from_pd_table(vaddr_t vaddr, uint64_t pdpe)
{
    uint32_t pd_index;
    uint64_t *pde;

    pd_index = (((uint64_t)vaddr >> PD_SHIFT) & ((1ul << ADDR_OFFSET) - 1));
    pde = (uint64_t *)(pdpe & X86_PG_FRAME);
    return X86_PHYS_TO_VIRT(pde[pd_index]);
}

static inline uint64_t get_pt_entry_from_pt_table(vaddr_t vaddr, uint64_t pde)
{
    uint32_t pt_index;
    uint64_t *pte;

    pt_index = (((uint64_t)vaddr >> PT_SHIFT) & ((1ul << ADDR_OFFSET) - 1));
    pte = (uint64_t *)(pde & X86_PG_FRAME);
    return pte[pt_index];
}
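
/*
 * Worked example for the index helpers above (a sketch assuming ADDR_OFFSET
 * is 9 and a hypothetical kernel base of 0xFFFFFFFF80000000; neither value
 * is guaranteed by this file):
 *
 *   vaddr = 0xFFFFFFFF80000000
 *   pml4 index = (vaddr >> 39) & 0x1FF = 511
 *   pdp index  = (vaddr >> 30) & 0x1FF = 510
 *   pd index   = (vaddr >> 21) & 0x1FF = 0
 *   pt index   = (vaddr >> 12) & 0x1FF = 0
 */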

static inline uint64_t get_pfn_from_pte(uint64_t pte)
{
    uint64_t pfn;

    /* Clear low 12 bits */
    pfn = (pte & X86_PG_FRAME);

    /* Clear high 12 bits */
    pfn &= X86_PG_PHY_ADDR_MASK;

    return pfn;
}

static inline uint64_t get_pfn_from_pde(uint64_t pde)
{
    uint64_t pfn;

    pfn = (pde & X86_2MB_PAGE_FRAME);

    LTRACEF_LEVEL(2, "pde 0x%" PRIx64 ", pfn 0x%" PRIx64 "\n", pde, pfn);

    return pfn;
}

/**
 * @brief Return the x86 arch flags for the given generic mmu flags
 */
arch_flags_t get_x86_arch_flags(arch_flags_t flags)
{
    arch_flags_t arch_flags = 0;
    uint cache_flag = flags & ARCH_MMU_FLAG_CACHE_MASK;

    if (!(flags & ARCH_MMU_FLAG_PERM_RO))
        arch_flags |= X86_MMU_PG_RW;

    if (flags & ARCH_MMU_FLAG_PERM_USER)
        arch_flags |= X86_MMU_PG_U;

    if (cache_flag == ARCH_MMU_FLAG_UNCACHED ||
        cache_flag == ARCH_MMU_FLAG_UNCACHED_DEVICE)
        arch_flags |= X86_MMU_CACHE_DISABLE;

    if (flags & ARCH_MMU_FLAG_PERM_NO_EXECUTE)
        arch_flags |= X86_MMU_PG_NX;

    return arch_flags;
}
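
/*
 * Example translation (follows directly from the checks above): the generic
 * flags ARCH_MMU_FLAG_CACHED | ARCH_MMU_FLAG_PERM_USER |
 * ARCH_MMU_FLAG_PERM_NO_EXECUTE on a writable page become
 * X86_MMU_PG_RW | X86_MMU_PG_U | X86_MMU_PG_NX.
 */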

bool x86_mmu_check_flags(uint flags)
{
    uint cache_flag = flags & ARCH_MMU_FLAG_CACHE_MASK;
    if (cache_flag != ARCH_MMU_FLAG_CACHED &&
        cache_flag != ARCH_MMU_FLAG_UNCACHED &&
        cache_flag != ARCH_MMU_FLAG_UNCACHED_DEVICE) {
        LTRACEF("unsupported cache type: 0x%x, flags 0x%x\n",
                cache_flag, flags);
        return false;
    }
    uint unsupported_flags = flags & ~ARCH_MMU_FLAG_CACHE_MASK;
    unsupported_flags &= ~ARCH_MMU_FLAG_PERM_RO;
    unsupported_flags &= ~ARCH_MMU_FLAG_PERM_USER;
    unsupported_flags &= ~ARCH_MMU_FLAG_PERM_NO_EXECUTE;
    if (unsupported_flags) {
        LTRACEF("unsupported flags: 0x%x, flags 0x%x\n",
                unsupported_flags, flags);
        return false;
    }
    return true;
}

/**
 * @brief Return the generic mmu flags for the given x86 arch flags
 */
uint get_arch_mmu_flags(arch_flags_t flags)
{
    arch_flags_t mmu_flags = 0;

    if (!(flags & X86_MMU_PG_RW))
        mmu_flags |= ARCH_MMU_FLAG_PERM_RO;

    if (flags & X86_MMU_PG_U)
        mmu_flags |= ARCH_MMU_FLAG_PERM_USER;

    if (flags & X86_MMU_CACHE_DISABLE)
        mmu_flags |= ARCH_MMU_FLAG_UNCACHED;

    if (flags & X86_MMU_PG_NX)
        mmu_flags |= ARCH_MMU_FLAG_PERM_NO_EXECUTE;

    return (uint)mmu_flags;
}

/**
 * @brief Walk the page table structures
 *
 * The paging scheme here is 4-level paging with 4KB pages (2MB large pages
 * are recognized during the walk).
 *
 */
status_t x86_mmu_get_mapping(map_addr_t pml4, vaddr_t vaddr, uint32_t *ret_level,
                             arch_flags_t *mmu_flags, map_addr_t *last_valid_entry)
{
    uint64_t pml4e, pdpe, pde, pte;

    DEBUG_ASSERT(pml4);
    if ((!ret_level) || (!last_valid_entry) || (!mmu_flags)) {
        return ERR_INVALID_ARGS;
    }

    *ret_level = PML4_L;
    *last_valid_entry = pml4;
    *mmu_flags = 0;

    LTRACEF_LEVEL(2, "pml4 0x%" PRIx64 "\n", pml4);

    pml4e = get_pml4_entry_from_pml4_table(vaddr, pml4);
    if ((pml4e & X86_MMU_PG_P) == 0) {
        return ERR_NOT_FOUND;
    }
    LTRACEF_LEVEL(2, "pml4e 0x%" PRIx64 "\n", pml4e);

    pdpe = get_pdp_entry_from_pdp_table(vaddr, pml4e);
    if ((pdpe & X86_MMU_PG_P) == 0) {
        *ret_level = PDP_L;
        *last_valid_entry = pml4e;
        return ERR_NOT_FOUND;
    }
    LTRACEF_LEVEL(2, "pdpe 0x%" PRIx64 "\n", pdpe);

    pde = get_pd_entry_from_pd_table(vaddr, pdpe);
    if ((pde & X86_MMU_PG_P) == 0) {
        *ret_level = PD_L;
        *last_valid_entry = pdpe;
        return ERR_NOT_FOUND;
    }
    LTRACEF_LEVEL(2, "pde 0x%" PRIx64 "\n", pde);

    /* 2MB pages */
    if (pde & X86_MMU_PG_PS) {
        /* Getting the page frame & adding the 2MB page offset from the vaddr */
        *last_valid_entry = get_pfn_from_pde(X86_VIRT_TO_PHYS(pde)) + ((uint64_t)vaddr & PAGE_OFFSET_MASK_2MB);
        *mmu_flags = get_arch_mmu_flags(pde & X86_FLAGS_MASK);
        goto last;
    }

    /* 4KB pages */
    pte = get_pt_entry_from_pt_table(vaddr, pde);
    if ((pte & X86_MMU_PG_P) == 0) {
        *ret_level = PT_L;
        *last_valid_entry = pde;
        return ERR_NOT_FOUND;
    }

    /* Getting the page frame & adding the 4KB page offset from the vaddr */
    *last_valid_entry = get_pfn_from_pte(pte) + ((uint64_t)vaddr & PAGE_OFFSET_MASK_4KB);
    *mmu_flags = get_arch_mmu_flags(pte & X86_FLAGS_MASK);

last:
    *ret_level = PF_L;
    return NO_ERROR;
}
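
/*
 * Usage sketch for x86_mmu_get_mapping (illustrative only, not called
 * anywhere; assumes the caller already has a virtual pointer to a pml4).
 */
#if 0
static void example_query_mapping(map_addr_t pml4_virt, vaddr_t va)
{
    uint32_t level;
    arch_flags_t flags;
    map_addr_t entry;

    status_t ret = x86_mmu_get_mapping(pml4_virt, va, &level, &flags, &entry);
    if (ret == NO_ERROR) {
        /* entry holds the paddr of the page plus the offset within it */
        dprintf(SPEW, "va 0x%lx -> pa 0x%" PRIxMAP_ADDR "\n", va, entry);
    } else {
        /* level records how deep the walk got before a non-present entry */
        dprintf(SPEW, "no mapping, walk stopped at level %u\n", level);
    }
}
#endif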

/**
 * Walk the page table structures to see if the mapping between a virtual
 * address and a physical address exists. Also, check the flags.
 *
 */
status_t x86_mmu_check_mapping(addr_t pml4, paddr_t paddr,
                               vaddr_t vaddr, arch_flags_t in_flags,
                               uint32_t *ret_level, arch_flags_t *ret_flags,
                               map_addr_t *last_valid_entry)
{
    status_t status;
    arch_flags_t existing_flags = 0;

    DEBUG_ASSERT(pml4);
    if ((!ret_level) || (!last_valid_entry) || (!ret_flags) ||
            (!x86_mmu_check_vaddr(vaddr)) ||
            (!x86_mmu_check_paddr(paddr))) {
        return ERR_INVALID_ARGS;
    }

    status = x86_mmu_get_mapping(pml4, vaddr, ret_level, &existing_flags, last_valid_entry);
    if (status || ((*last_valid_entry) != (uint64_t)paddr)) {
        /* We did not get far enough to check the access flags for the mapping */
        *ret_flags = in_flags;
        return ERR_NOT_FOUND;
    }

    /* Check the access flags for the mapped address. If the result is
     * non-zero, the access flags differ, and the returned flags hold the
     * access bits that differ.
     */
    *ret_flags = (in_flags ^ get_x86_arch_flags(existing_flags)) & X86_DIRTY_ACCESS_MASK;

    if (!(*ret_flags))
        return NO_ERROR;

    return ERR_NOT_FOUND;
}

static void update_pt_entry(vaddr_t vaddr, paddr_t paddr, uint64_t pde, arch_flags_t flags)
{
    uint32_t pt_index;

    uint64_t *pt_table = (uint64_t *)(pde & X86_PG_FRAME);
    pt_index = (((uint64_t)vaddr >> PT_SHIFT) & ((1ul << ADDR_OFFSET) - 1));
    pt_table[pt_index] = (uint64_t)paddr;
    pt_table[pt_index] |= flags | X86_MMU_PG_P;
    if (!(flags & X86_MMU_PG_U))
        pt_table[pt_index] |= X86_MMU_PG_G; /* set global flag for kernel pages */

    if (flags & X86_MMU_PG_NX)
        pt_table[pt_index] |= X86_MMU_PG_NX;
    else
        pt_table[pt_index] &= ~X86_MMU_PG_NX;
}

static void update_pd_entry(vaddr_t vaddr, uint64_t pdpe, map_addr_t m, arch_flags_t flags)
{
    uint32_t pd_index;

    uint64_t *pd_table = (uint64_t *)(pdpe & X86_PG_FRAME);
    pd_index = (((uint64_t)vaddr >> PD_SHIFT) & ((1ul << ADDR_OFFSET) - 1));
    pd_table[pd_index] = m;
    pd_table[pd_index] |= X86_MMU_PG_P | X86_MMU_PG_RW;
    DEBUG_ASSERT(!(pd_table[pd_index] & X86_MMU_PG_PS));
    pd_table[pd_index] |= X86_MMU_PG_U; /* set U flag on all inner entries */
    if (!(flags & X86_MMU_PG_U))
        pd_table[pd_index] |= X86_MMU_PG_G; /* set global flag for kernel pages */
}

static void update_pdp_entry(vaddr_t vaddr, uint64_t pml4e, map_addr_t m, arch_flags_t flags)
{
    uint32_t pdp_index;

    uint64_t *pdp_table = (uint64_t *)(pml4e & X86_PG_FRAME);
    pdp_index = (((uint64_t)vaddr >> PDP_SHIFT) & ((1ul << ADDR_OFFSET) - 1));
    pdp_table[pdp_index] = m;
    pdp_table[pdp_index] |= X86_MMU_PG_P | X86_MMU_PG_RW;
    DEBUG_ASSERT(!(pdp_table[pdp_index] & X86_MMU_PG_PS));
    pdp_table[pdp_index] |= X86_MMU_PG_U; /* set U flag on all inner entries */
    if (!(flags & X86_MMU_PG_U))
        pdp_table[pdp_index] |= X86_MMU_PG_G; /* set global flag for kernel pages */
}

static void update_pml4_entry(vaddr_t vaddr, addr_t pml4_addr, map_addr_t m, arch_flags_t flags)
{
    uint32_t pml4_index;
    uint64_t *pml4_table = (uint64_t *)(pml4_addr);

    pml4_index = (((uint64_t)vaddr >> PML4_SHIFT) & ((1ul << ADDR_OFFSET) - 1));
    pml4_table[pml4_index] = m;
    pml4_table[pml4_index] |= X86_MMU_PG_P | X86_MMU_PG_RW;
    DEBUG_ASSERT(!(pml4_table[pml4_index] & X86_MMU_PG_PS));
    pml4_table[pml4_index] |= X86_MMU_PG_U; /* set U flag on all inner entries */
    if (!(flags & X86_MMU_PG_U))
        pml4_table[pml4_index] |= X86_MMU_PG_G; /* set global flag for kernel pages */
}

/**
 * @brief Allocate a new page table
 */
static map_addr_t *_map_alloc_page(void)
{
    map_addr_t *page_ptr = pmm_alloc_kpage();
    DEBUG_ASSERT(page_ptr);

    if (page_ptr)
        memset(page_ptr, 0, PAGE_SIZE);

    return page_ptr;
}

/**
 * @brief Add a new mapping for the given virtual address & physical address
 *
 * This is an API that handles the mapping between a virtual address and a
 * physical address, either by checking if the mapping already exists and is
 * valid OR by adding a new mapping with the required flags.
 *
 * The paging scheme here is 4-level paging with 4KB pages.
 *
 */
status_t x86_mmu_add_mapping(map_addr_t pml4, map_addr_t paddr,
                             vaddr_t vaddr, arch_flags_t mmu_flags)
{
    uint32_t pd_new = 0, pdp_new = 0;
    uint64_t pml4e, pdpe, pde;
    map_addr_t *m = NULL;
    status_t ret = NO_ERROR;

    LTRACEF("pml4 0x%" PRIxMAP_ADDR " paddr 0x%" PRIxMAP_ADDR " vaddr 0x%lx flags 0x%" PRIxARCH_FLAGS "\n", pml4, paddr, vaddr, mmu_flags);

    DEBUG_ASSERT(pml4);
    if ((!x86_mmu_check_vaddr(vaddr)) || (!x86_mmu_check_paddr(paddr)))
        return ERR_INVALID_ARGS;

    pml4e = get_pml4_entry_from_pml4_table(vaddr, pml4);

    if ((pml4e & X86_MMU_PG_P) == 0) {
        /* Create a new pdp table */
        m = _map_alloc_page();
        if (m == NULL) {
            ret = ERR_NO_MEMORY;
            goto clean;
        }

        update_pml4_entry(vaddr, pml4, X86_VIRT_TO_PHYS(m), get_x86_arch_flags(mmu_flags));
        pml4e = (uint64_t)m;
        X86_SET_FLAG(pdp_new);
    }

    if (!pdp_new)
        pdpe = get_pdp_entry_from_pdp_table(vaddr, pml4e);

    if (pdp_new || (pdpe & X86_MMU_PG_P) == 0) {
        /* Create a new pd table */
        m = _map_alloc_page();
        if (m == NULL) {
            ret = ERR_NO_MEMORY;
            if (pdp_new)
                goto clean_pdp;
            goto clean;
        }

        update_pdp_entry(vaddr, pml4e, X86_VIRT_TO_PHYS(m), get_x86_arch_flags(mmu_flags));
        pdpe = (uint64_t)m;
        X86_SET_FLAG(pd_new);
    }

    if (!pd_new)
        pde = get_pd_entry_from_pd_table(vaddr, pdpe);

    if (pd_new || (pde & X86_MMU_PG_P) == 0) {
        /* Create a new pt */
        m = _map_alloc_page();
        if (m == NULL) {
            ret = ERR_NO_MEMORY;
            if (pd_new)
                goto clean_pd;
            goto clean;
        }

        update_pd_entry(vaddr, pdpe, X86_VIRT_TO_PHYS(m), get_x86_arch_flags(mmu_flags));
        pde = (uint64_t)m;
    }

    /* Update the page table entry with the paddr and access flags required for the mapping */
    update_pt_entry(vaddr, paddr, pde, get_x86_arch_flags(mmu_flags));
    ret = NO_ERROR;
    goto clean;

clean_pd:
    if (pd_new)
        /* pdpe still holds the virtual address of the pd table allocated above */
        pmm_free_page(paddr_to_vm_page(X86_VIRT_TO_PHYS(pdpe)));

clean_pdp:
    if (pdp_new)
        /* pml4e still holds the virtual address of the pdp table allocated above */
        pmm_free_page(paddr_to_vm_page(X86_VIRT_TO_PHYS(pml4e)));

clean:
    return ret;
}
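
/*
 * A minimal sketch of mapping a single 4KB page with x86_mmu_add_mapping
 * (illustrative only, not called anywhere): both addresses must be page
 * aligned or the call returns ERR_INVALID_ARGS, and the flags are the
 * generic ARCH_MMU_FLAG_* values.
 */
#if 0
static status_t example_map_one_page(map_addr_t pml4_virt, paddr_t pa, vaddr_t va)
{
    return x86_mmu_add_mapping(pml4_virt, (map_addr_t)pa, va,
                               ARCH_MMU_FLAG_CACHED | ARCH_MMU_FLAG_PERM_NO_EXECUTE);
}
#endif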

/**
 * @brief x86-64 MMU unmap an entry in the page tables recursively and clear out tables
 *
 */
static void x86_mmu_unmap_entry(vaddr_t vaddr, int level, vaddr_t table_entry)
{
    uint32_t offset = 0, next_level_offset = 0;
    vaddr_t *table, *next_table_addr, value;

    LTRACEF("vaddr 0x%lx level %d table_entry 0x%lx\n", vaddr, level, table_entry);

    next_table_addr = NULL;
    table = (vaddr_t *)(table_entry & X86_PG_FRAME);
    LTRACEF_LEVEL(2, "table %p\n", table);

    switch (level) {
        case PML4_L:
            offset = (((uint64_t)vaddr >> PML4_SHIFT) & ((1ul << ADDR_OFFSET) - 1));
            LTRACEF_LEVEL(2, "offset %u\n", offset);
            next_table_addr = (vaddr_t *)X86_PHYS_TO_VIRT(table[offset]);
            LTRACEF_LEVEL(2, "next_table_addr %p\n", next_table_addr);
            if ((X86_PHYS_TO_VIRT(table[offset]) & X86_MMU_PG_P) == 0)
                return;
            break;
        case PDP_L:
            offset = (((uint64_t)vaddr >> PDP_SHIFT) & ((1ul << ADDR_OFFSET) - 1));
            LTRACEF_LEVEL(2, "offset %u\n", offset);
            next_table_addr = (vaddr_t *)X86_PHYS_TO_VIRT(table[offset]);
            LTRACEF_LEVEL(2, "next_table_addr %p\n", next_table_addr);
            if ((X86_PHYS_TO_VIRT(table[offset]) & X86_MMU_PG_P) == 0)
                return;
            break;
        case PD_L:
            offset = (((uint64_t)vaddr >> PD_SHIFT) & ((1ul << ADDR_OFFSET) - 1));
            LTRACEF_LEVEL(2, "offset %u\n", offset);
            next_table_addr = (vaddr_t *)X86_PHYS_TO_VIRT(table[offset]);
            LTRACEF_LEVEL(2, "next_table_addr %p\n", next_table_addr);
            if ((X86_PHYS_TO_VIRT(table[offset]) & X86_MMU_PG_P) == 0)
                return;
            break;
        case PT_L:
            offset = (((uint64_t)vaddr >> PT_SHIFT) & ((1ul << ADDR_OFFSET) - 1));
            LTRACEF_LEVEL(2, "offset %u\n", offset);
            next_table_addr = (vaddr_t *)X86_PHYS_TO_VIRT(table[offset]);
            LTRACEF_LEVEL(2, "next_table_addr %p\n", next_table_addr);
            if ((X86_PHYS_TO_VIRT(table[offset]) & X86_MMU_PG_P) == 0)
                return;
            break;
        case PF_L:
            /* Reached the page frame, let's go back */
        default:
            return;
    }

    LTRACEF_LEVEL(2, "recursing\n");

    level -= 1;
    x86_mmu_unmap_entry(vaddr, level, (vaddr_t)next_table_addr);
    level += 1;

    LTRACEF_LEVEL(2, "next_table_addr %p\n", next_table_addr);

    next_table_addr = (vaddr_t *)((vaddr_t)(next_table_addr) & X86_PG_FRAME);
    if (level > PT_L) {
        /* Check all entries of the next level table for the present bit */
        for (next_level_offset = 0; next_level_offset < (PAGE_SIZE / 8); next_level_offset++) {
            if ((next_table_addr[next_level_offset] & X86_MMU_PG_P) != 0)
                return; /* There is an entry in the next level table */
        }
        pmm_free_page(paddr_to_vm_page(X86_VIRT_TO_PHYS(next_table_addr)));
    }
    /* All present bits for all entries in the next level table for this address are 0 */
    if ((X86_PHYS_TO_VIRT(table[offset]) & X86_MMU_PG_P) != 0) {
        arch_disable_ints();
        value = table[offset];
        value = value & X86_PTE_NOT_PRESENT;
        table[offset] = value;
        arch_enable_ints();
    }
}

status_t x86_mmu_unmap(map_addr_t pml4, vaddr_t vaddr, size_t count)
{
    vaddr_t next_aligned_v_addr;

    DEBUG_ASSERT(pml4);
    if (!(x86_mmu_check_vaddr(vaddr)))
        return ERR_INVALID_ARGS;

    if (count == 0)
        return NO_ERROR;

    next_aligned_v_addr = vaddr;
    while (count > 0) {
        x86_mmu_unmap_entry(next_aligned_v_addr, X86_PAGING_LEVELS, pml4);
        /*
         * Flush the mapping from the TLB when unmapping pages;
         * the stale entry must be invalidated so later accesses
         * do not use the old translation.
         */
        __asm__ __volatile__ ("invlpg (%0)" : : "r" (next_aligned_v_addr) : "memory");
        next_aligned_v_addr += PAGE_SIZE;
        count--;
    }
    return NO_ERROR;
}
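
/*
 * Usage sketch for x86_mmu_unmap (illustrative only, not called anywhere):
 * unmap four pages starting at va; page tables that become empty are freed
 * and each page's TLB entry is invalidated with invlpg.
 */
#if 0
static status_t example_unmap_pages(vaddr_t va)
{
    return x86_mmu_unmap(X86_PHYS_TO_VIRT(x86_kernel_page_table), va, 4);
}
#endif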

int arch_mmu_unmap(arch_aspace_t *aspace, vaddr_t vaddr, size_t count)
{
    addr_t current_cr3_val;
    vmm_aspace_t *kernel_aspace = vmm_get_kernel_aspace();

    LTRACEF("aspace %p, vaddr 0x%lx, count %zu\n", aspace, vaddr, count);

    ASSERT(aspace);

    /*
     * The kernel page table is mapped into user level address spaces for
     * syscall and interrupt handling.
     *
     * Check here to make sure a supervisor page is never unmapped from a
     * user level aspace accidentally.
     */
    if (&kernel_aspace->arch_aspace != aspace) {
        if (is_kernel_address(vaddr)) {
            return ERR_INVALID_ARGS;
        }
    }

    if (!(x86_mmu_check_vaddr(vaddr)))
        return ERR_INVALID_ARGS;

    if (count == 0)
        return NO_ERROR;

    current_cr3_val = aspace->page_table;
    ASSERT(current_cr3_val);

    return (x86_mmu_unmap(X86_PHYS_TO_VIRT(current_cr3_val), vaddr, count));
}

/**
 * @brief Map a section/range with specific permissions
 *
 */
status_t x86_mmu_map_range(map_addr_t pml4, struct map_range *range, arch_flags_t flags)
{
    vaddr_t next_aligned_v_addr;
    paddr_t next_aligned_p_addr;
    status_t map_status;
    uint32_t no_of_pages, index;

    DEBUG_ASSERT(pml4);
    /* Check range before the trace below dereferences it */
    if (!range)
        return ERR_INVALID_ARGS;

    LTRACEF("pml4 0x%" PRIxMAP_ADDR ", range v 0x%" PRIxVADDR " p 0x%" PRIxMAP_RANGE_PADDR " size %u flags 0x%" PRIxARCH_FLAGS "\n",
            pml4, range->start_vaddr, range->start_paddr, range->size, flags);

    /* Calculate the number of 4KB pages */
    if (IS_ALIGNED(range->size, PAGE_SIZE))
        no_of_pages = (range->size) >> PAGE_DIV_SHIFT;
    else
        no_of_pages = ((range->size) >> PAGE_DIV_SHIFT) + 1;

    next_aligned_v_addr = range->start_vaddr;
    next_aligned_p_addr = range->start_paddr;

    for (index = 0; index < no_of_pages; index++) {
        map_status = x86_mmu_add_mapping(pml4, next_aligned_p_addr, next_aligned_v_addr, flags);
        if (map_status) {
            dprintf(SPEW, "Add mapping failed with err=%d\n", map_status);
            /* Unmap the partial mapping - if any */
            x86_mmu_unmap(pml4, range->start_vaddr, index);
            return map_status;
        }
        next_aligned_v_addr += PAGE_SIZE;
        next_aligned_p_addr += PAGE_SIZE;
    }
    return NO_ERROR;
}
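
/*
 * Sketch of mapping a range with x86_mmu_map_range (illustrative only; the
 * addresses below are hypothetical). The size is rounded up to whole 4KB
 * pages, and on failure the partial mapping is rolled back.
 */
#if 0
static status_t example_map_range(map_addr_t pml4_virt)
{
    struct map_range range;

    range.start_vaddr = 0xFFFFFFFF90000000UL; /* hypothetical VA */
    range.start_paddr = 0x40000000UL;         /* hypothetical PA */
    range.size = 4 * PAGE_SIZE;

    return x86_mmu_map_range(pml4_virt, &range, ARCH_MMU_FLAG_CACHED);
}
#endif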

status_t arch_mmu_query(arch_aspace_t *aspace, vaddr_t vaddr, paddr_t *paddr, uint *flags)
{
    addr_t current_cr3_val;
    uint32_t ret_level;
    map_addr_t last_valid_entry;
    arch_flags_t ret_flags;
    status_t stat;

    LTRACEF("aspace %p, vaddr 0x%lx, paddr %p, flags %p\n", aspace, vaddr, paddr, flags);

    ASSERT(aspace);

    current_cr3_val = aspace->page_table;
    ASSERT(current_cr3_val);

    stat = x86_mmu_get_mapping(X86_PHYS_TO_VIRT(current_cr3_val), vaddr, &ret_level, &ret_flags, &last_valid_entry);
    if (stat)
        return stat;

    if (paddr) {
        *paddr = (paddr_t)(last_valid_entry);
    }

    LTRACEF("paddr 0x%" PRIxMAP_ADDR "\n", last_valid_entry);

    /* Convert x86 arch specific flags to arch mmu flags */
    if (flags)
        *flags = ret_flags;

    return NO_ERROR;
}

int arch_mmu_map(arch_aspace_t *aspace, vaddr_t vaddr, paddr_t paddr, size_t count, uint flags)
{
    addr_t current_cr3_val;
    struct map_range range;

    DEBUG_ASSERT(aspace);

    LTRACEF("aspace %p, vaddr 0x%lx paddr 0x%lx count %zu flags 0x%x\n", aspace, vaddr, paddr, count, flags);

    if ((!x86_mmu_check_paddr(paddr)))
        return ERR_INVALID_ARGS;

    if (!x86_mmu_check_vaddr(vaddr))
        return ERR_INVALID_ARGS;

    if (!x86_mmu_check_flags(flags)) {
        return ERR_NOT_SUPPORTED;
    }

    if (count == 0)
        return NO_ERROR;

    current_cr3_val = aspace->page_table;
    ASSERT(current_cr3_val);

    range.start_vaddr = vaddr;
    range.start_paddr = paddr;
    range.size = count * PAGE_SIZE;

    return (x86_mmu_map_range(X86_PHYS_TO_VIRT(current_cr3_val), &range, flags));
}
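
/*
 * Sketch of the arch-level entry point (illustrative only, not called
 * anywhere): count is in pages, and flags are the generic ARCH_MMU_FLAG_*
 * values validated by x86_mmu_check_flags.
 */
#if 0
static int example_arch_map(arch_aspace_t *aspace, vaddr_t va, paddr_t pa)
{
    return arch_mmu_map(aspace, va, pa, 1,
                        ARCH_MMU_FLAG_CACHED | ARCH_MMU_FLAG_PERM_USER);
}
#endif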

void x86_mmu_early_init(void)
{
    volatile uint64_t cr0, cr4;

    /* Set the WP bit in CR0 */
    cr0 = x86_get_cr0();
    cr0 |= X86_CR0_WP;
    x86_set_cr0(cr0);

    /* Set the SMEP & SMAP bits in CR4 */
    cr4 = x86_get_cr4();
    if (check_smep_avail())
        cr4 |= X86_CR4_SMEP;
    if (check_smap_avail())
        cr4 |= X86_CR4_SMAP;
    x86_set_cr4(cr4);

    /* Get the address widths from the CPUID instruction */
    /* Bits 07-00: physical address width info */
    /* Bits 15-08: linear address width info */
    uint32_t addr_width = x86_get_address_width();
    g_paddr_width = (uint8_t)(addr_width & 0xFF);
    g_vaddr_width = (uint8_t)((addr_width >> 8) & 0xFF);

    LTRACEF("paddr_width %u vaddr_width %u\n", g_paddr_width, g_vaddr_width);

    x86_kernel_page_table = x86_get_cr3();

    /* TLB flush */
    x86_set_cr3(x86_get_cr3());
}
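
/*
 * For reference, a sketch of where the width values typically come from
 * (the real helper x86_get_address_width lives elsewhere and may differ):
 * CPUID leaf 0x80000008 reports the physical address width in EAX[7:0]
 * and the linear (virtual) address width in EAX[15:8].
 */
#if 0
static uint32_t example_address_width(void)
{
    uint32_t eax, ebx, ecx, edx;

    __asm__ __volatile__("cpuid"
                         : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
                         : "a"(0x80000008));
    return eax & 0xFFFF; /* [7:0] paddr width, [15:8] vaddr width */
}
#endif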

void x86_mmu_init(void)
{
}

static paddr_t x86_create_page_table(void)
{
    addr_t *new_table = NULL;

    new_table = (addr_t *)_map_alloc_page();
    ASSERT(new_table);

    /*
     * Copy the kernel level mapping into the user level mapping to support
     * syscall and interrupt handling in user level. pml4 entry 511 covers
     * the top 512GB of the virtual address space, where the kernel lives,
     * so copying this single entry aliases the whole kernel mapping.
     *
     * TODO:
     * Update to kernel page-table isolation (KPTI) to mitigate the Meltdown
     * security vulnerability.
     */
    new_table[511] = pml4[511];

    return (paddr_t)X86_VIRT_TO_PHYS(new_table);
}

/*
 * Address space setup. Kernel aspaces reuse the boot-time kernel page
 * table; each user aspace gets its own page table with the kernel mapping
 * copied in (see x86_create_page_table above).
 */
status_t arch_mmu_init_aspace(arch_aspace_t *aspace, vaddr_t base, size_t size, uint flags)
{
    ASSERT(aspace);

    ASSERT(size > PAGE_SIZE);
    ASSERT(base + size - 1 > base);

    aspace->size = size;
    aspace->base = base;

    if ((flags & ARCH_ASPACE_FLAG_KERNEL)) {
        aspace->page_table = x86_kernel_page_table;
    } else {
        aspace->page_table = x86_create_page_table();
    }

    return NO_ERROR;
}
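
/*
 * Usage sketch (illustrative only, not called anywhere; the base and size
 * are hypothetical): initialize a user aspace, which gets a fresh page
 * table with the kernel mapping copied in.
 */
#if 0
static status_t example_init_user_aspace(arch_aspace_t *aspace)
{
    return arch_mmu_init_aspace(aspace, 0x1000, 0x10000000, 0);
}
#endif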

status_t arch_mmu_destroy_aspace(arch_aspace_t *aspace)
{
    ASSERT(aspace);

    pmm_free_page(paddr_to_vm_page(aspace->page_table));

    aspace->size = 0;
    aspace->base = 0;
    aspace->page_table = 0;

    return NO_ERROR;
}

void arch_mmu_context_switch(arch_aspace_t *aspace)
{
    if (NULL == aspace) {
        x86_set_cr3(x86_kernel_page_table);
    } else {
        vmm_aspace_t *kernel_aspace = vmm_get_kernel_aspace();
        ASSERT(&kernel_aspace->arch_aspace != aspace);

        x86_set_cr3(aspace->page_table);
    }
}
