/*
 * Copyright (c) 2009 Corey Tabaka
 * Copyright (c) 2015-2018 Intel Corporation
 * Copyright (c) 2016 Travis Geiselbrecht
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files
 * (the "Software"), to deal in the Software without restriction,
 * including without limitation the rights to use, copy, modify, merge,
 * publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so,
 * subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
#include <debug.h>
#include <trace.h>
#include <sys/types.h>
#include <compiler.h>
#include <arch.h>
#include <arch/x86.h>
#include <arch/x86/mmu.h>
#include <stdlib.h>
#include <string.h>
#include <arch/mmu.h>
#include <assert.h>
#include <err.h>
#include <arch/arch_ops.h>
#include <kernel/vm.h>
#include <inttypes.h>

#define LOCAL_TRACE 0

/* Virtual and physical address widths, in bits */
uint8_t g_vaddr_width = 0;
uint8_t g_paddr_width = 0;

paddr_t x86_kernel_page_table = 0;

/*
 * Page table 1:
 *
 * This page table is used for bootstrap code
 * VA - start, size                       : PA - start, size
 * MEMBASE+KERNEL_LOAD_OFFSET, 1 PAGE     : MEMBASE+KERNEL_LOAD_OFFSET, 1 PAGE
 * PHYS(_gdt),  1 PAGE                    : PHYS(_gdt), 1 PAGE
 * KERNEL_BASE+KERNEL_LOAD_OFFSET, 1 PAGE : MEMBASE+KERNEL_LOAD_OFFSET, 1 PAGE
 *
 * 4-level paging is used to cover bootstrap code:
 * entry in pml4(Page Map Level 4) covers 512GB,
 * entry in pdpt(Page-directory-pointer table) covers 1GB,
 * entry in pd(Page directory) covers 2MB,
 * entry in pt(Page table) covers 4KB.
 *
 * pml4_trampoline->pdpt_trampoline->pd_trampoline->pt_trampoline
 * covers VA (from ~ end):
 *   MEMBASE+KERNEL_LOAD_OFFSET ~ MEMBASE+KERNEL_LOAD_OFFSET + 1 PAGE
 * and
 * pml4_trampoline->pdpt_trampoline->pd_trampoline->pt_trampoline_gdt
 * covers VA (from ~ end):
 *   PHYS(_gdtr_phys) ~ PHYS(_gdtr_phys) + 1 PAGE
 *
 */
map_addr_t pml4_trampoline[NO_OF_PT_ENTRIES] __ALIGNED(PAGE_SIZE);
map_addr_t pdpt_trampoline[NO_OF_PT_ENTRIES] __ALIGNED(PAGE_SIZE);
map_addr_t pd_trampoline[NO_OF_PT_ENTRIES] __ALIGNED(PAGE_SIZE);
map_addr_t pt_trampoline[NO_OF_PT_ENTRIES] __ALIGNED(PAGE_SIZE);
map_addr_t pt_trampoline_gdt[NO_OF_PT_ENTRIES] __ALIGNED(PAGE_SIZE);

/*
 * Page table 2:
 * This page table is used at run time in 64-bit mode.
 * (memsize is the upper memory reported by the bootloader minus the
 *  physical start address of the lk binary; if memsize is larger than 1GB,
 *  more page directories for this page table are allocated from boot mem)
 * VA  start, size      : PA  start, size
 * KERNEL_BASE, memsize : MEMBASE, memsize
 */
map_addr_t pml4[NO_OF_PT_ENTRIES] __ALIGNED(PAGE_SIZE);
map_addr_t pdpt[NO_OF_PT_ENTRIES] __ALIGNED(PAGE_SIZE);
map_addr_t pd[NO_OF_PT_ENTRIES] __ALIGNED(PAGE_SIZE);
map_addr_t pt[NO_OF_PT_ENTRIES][NO_OF_PT_ENTRIES] __ALIGNED(PAGE_SIZE);
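/*
 * Note: with NO_OF_PT_ENTRIES assumed to be 512 (the x86-64 value), the
 * static pt[][] above holds 512 page tables of 512 entries each, i.e.
 * 512 * 512 * 4KB = 1GB of 4KB mappings, which is why anything beyond
 * 1GB needs additional tables allocated from boot mem as noted above.
 */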

/**
 * @brief  check if the virtual address is aligned and canonical
 *
 */
static bool x86_mmu_check_vaddr(vaddr_t vaddr)
{
    uint64_t addr = (uint64_t)vaddr;
    uint64_t max_vaddr_lohalf,
             min_vaddr_hihalf;

    /* Check to see if the address is PAGE aligned */
    if (!IS_ALIGNED(addr, PAGE_SIZE))
        return false;

    /* get max address in lower-half canonical addr space */
    /* e.g. if width is 48, then 0x00007FFF_FFFFFFFF */
    max_vaddr_lohalf = ((uint64_t)1ull << (g_vaddr_width - 1)) - 1;

    /* get min address in higher-half canonical addr space */
    /* e.g. if width is 48, then 0xFFFF8000_00000000 */
    min_vaddr_hihalf = ~max_vaddr_lohalf;

    /* Check to see if the address is a canonical address */
    if ((addr > max_vaddr_lohalf) && (addr < min_vaddr_hihalf))
        return false;

    return true;
}


/**
 * @brief  check if the physical address is valid and aligned
 *
 */
static bool x86_mmu_check_paddr(paddr_t paddr)
{
    uint64_t addr = (uint64_t)paddr;
    uint64_t max_paddr;

    /* Check to see if the address is PAGE aligned */
    if (!IS_ALIGNED(addr, PAGE_SIZE))
        return false;

    max_paddr = ((uint64_t)1ull << g_paddr_width) - 1;

    return addr <= max_paddr;
}


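/*
 * With 4-level paging, a canonical virtual address decomposes into four
 * 9-bit table indices plus a 12-bit page offset (assuming ADDR_OFFSET is 9,
 * i.e. 512 entries per table, and the usual 48-bit address width):
 *
 *   bits 47-39: PML4 index
 *   bits 38-30: PDPT index
 *   bits 29-21: PD index
 *   bits 20-12: PT index
 *   bits 11-0 : page offset
 *
 * The helpers below each extract one such index using the corresponding
 * *_SHIFT constant and read that entry from the given table.
 */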
static inline uint64_t get_pml4_entry_from_pml4_table(vaddr_t vaddr, addr_t pml4_addr)
{
    uint32_t pml4_index;
    uint64_t *pml4_table = (uint64_t *)pml4_addr;

    pml4_index = (((uint64_t)vaddr >> PML4_SHIFT) & ((1ul << ADDR_OFFSET) - 1));
    return X86_PHYS_TO_VIRT(pml4_table[pml4_index]);
}

static inline uint64_t get_pdp_entry_from_pdp_table(vaddr_t vaddr, uint64_t pml4e)
{
    uint32_t pdp_index;
    uint64_t *pdpe;

    pdp_index = (((uint64_t)vaddr >> PDP_SHIFT) & ((1ul << ADDR_OFFSET) - 1));
    pdpe = (uint64_t *)(pml4e & X86_PG_FRAME);
    return X86_PHYS_TO_VIRT(pdpe[pdp_index]);
}

static inline uint64_t get_pd_entry_from_pd_table(vaddr_t vaddr, uint64_t pdpe)
{
    uint32_t pd_index;
    uint64_t *pde;

    pd_index = (((uint64_t)vaddr >> PD_SHIFT) & ((1ul << ADDR_OFFSET) - 1));
    pde = (uint64_t *)(pdpe & X86_PG_FRAME);
    return X86_PHYS_TO_VIRT(pde[pd_index]);
}

static inline uint64_t get_pt_entry_from_pt_table(vaddr_t vaddr, uint64_t pde)
{
    uint32_t pt_index;
    uint64_t *pte;

    pt_index = (((uint64_t)vaddr >> PT_SHIFT) & ((1ul << ADDR_OFFSET) - 1));
    pte = (uint64_t *)(pde & X86_PG_FRAME);
    return pte[pt_index];
}

static inline uint64_t get_pfn_from_pte(uint64_t pte)
{
    uint64_t pfn;

    /* Clear low 12 bits */
    pfn = (pte & X86_PG_FRAME);

    /* Clear high 12 bits */
    pfn &= X86_PG_PHY_ADDR_MASK;

    return pfn;
}

static inline uint64_t get_pfn_from_pde(uint64_t pde)
{
    uint64_t pfn;

    pfn = (pde & X86_2MB_PAGE_FRAME);

    LTRACEF_LEVEL(2, "pde 0x%" PRIx64 ", pfn 0x%" PRIx64 "\n", pde, pfn);

    return pfn;
}

/**
 * @brief Return the x86 arch flags corresponding to the generic mmu flags
 */
arch_flags_t get_x86_arch_flags(arch_flags_t flags)
{
    arch_flags_t arch_flags = 0;
    uint cache_flag = flags & ARCH_MMU_FLAG_CACHE_MASK;

    if (!(flags & ARCH_MMU_FLAG_PERM_RO))
        arch_flags |= X86_MMU_PG_RW;

    if (flags & ARCH_MMU_FLAG_PERM_USER)
        arch_flags |= X86_MMU_PG_U;

    if (cache_flag == ARCH_MMU_FLAG_UNCACHED ||
        cache_flag == ARCH_MMU_FLAG_UNCACHED_DEVICE)
        arch_flags |= X86_MMU_CACHE_DISABLE;

    if (flags & ARCH_MMU_FLAG_PERM_NO_EXECUTE)
        arch_flags |= X86_MMU_PG_NX;

    return arch_flags;
}

bool x86_mmu_check_flags(uint flags)
{
    uint cache_flag = flags & ARCH_MMU_FLAG_CACHE_MASK;
    if (cache_flag != ARCH_MMU_FLAG_CACHED &&
        cache_flag != ARCH_MMU_FLAG_UNCACHED &&
        cache_flag != ARCH_MMU_FLAG_UNCACHED_DEVICE) {
        LTRACEF("unsupported cache type: 0x%x, flags 0x%x\n",
                cache_flag, flags);
        return false;
    }
    uint unsupported_flags = flags & ~ARCH_MMU_FLAG_CACHE_MASK;
    unsupported_flags &= ~ARCH_MMU_FLAG_PERM_RO;
    unsupported_flags &= ~ARCH_MMU_FLAG_PERM_USER;
    unsupported_flags &= ~ARCH_MMU_FLAG_PERM_NO_EXECUTE;
    if (unsupported_flags) {
        LTRACEF("unsupported flags: 0x%x, flags 0x%x\n",
                unsupported_flags, flags);
        return false;
    }
    return true;
}

/**
 * @brief Return the generic mmu flags corresponding to the x86 arch flags
 */
uint get_arch_mmu_flags(arch_flags_t flags)
{
    arch_flags_t mmu_flags = 0;

    if (!(flags & X86_MMU_PG_RW))
        mmu_flags |= ARCH_MMU_FLAG_PERM_RO;

    if (flags & X86_MMU_PG_U)
        mmu_flags |= ARCH_MMU_FLAG_PERM_USER;

    if (flags & X86_MMU_CACHE_DISABLE)
        mmu_flags |= ARCH_MMU_FLAG_UNCACHED;

    if (flags & X86_MMU_PG_NX)
        mmu_flags |= ARCH_MMU_FLAG_PERM_NO_EXECUTE;

    return (uint)mmu_flags;
}

/**
 * @brief  Walk the page table structures
 *
 * The paging scheme here is x86-64 4-level paging (which requires PAE)
 * with 4KB pages.
 *
 */
status_t x86_mmu_get_mapping(map_addr_t pml4, vaddr_t vaddr, uint32_t *ret_level,
                                    arch_flags_t *mmu_flags, map_addr_t *last_valid_entry)
{
    uint64_t pml4e, pdpe, pde, pte;

    DEBUG_ASSERT(pml4);
    if ((!ret_level) || (!last_valid_entry) || (!mmu_flags)) {
        return ERR_INVALID_ARGS;
    }

    *ret_level = PML4_L;
    *last_valid_entry = pml4;
    *mmu_flags = 0;

    LTRACEF_LEVEL(2, "pml4 0x%" PRIx64 "\n", pml4);

    pml4e = get_pml4_entry_from_pml4_table(vaddr, pml4);
    if ((pml4e & X86_MMU_PG_P) == 0) {
        return ERR_NOT_FOUND;
    }
    LTRACEF_LEVEL(2, "pml4e 0x%" PRIx64 "\n", pml4e);

    pdpe = get_pdp_entry_from_pdp_table(vaddr, pml4e);
    if ((pdpe & X86_MMU_PG_P) == 0) {
        *ret_level = PDP_L;
        *last_valid_entry = pml4e;
        return ERR_NOT_FOUND;
    }
    LTRACEF_LEVEL(2, "pdpe 0x%" PRIx64 "\n", pdpe);

    pde = get_pd_entry_from_pd_table(vaddr, pdpe);
    if ((pde & X86_MMU_PG_P) == 0) {
        *ret_level = PD_L;
        *last_valid_entry = pdpe;
        return ERR_NOT_FOUND;
    }
    LTRACEF_LEVEL(2, "pde 0x%" PRIx64 "\n", pde);

    /* 2 MB pages */
    if (pde & X86_MMU_PG_PS) {
        /* Get the page frame and add the 2MB page offset from the vaddr */
        *last_valid_entry = get_pfn_from_pde(X86_VIRT_TO_PHYS(pde)) + ((uint64_t)vaddr & PAGE_OFFSET_MASK_2MB);
        *mmu_flags = get_arch_mmu_flags(pde & X86_FLAGS_MASK);
        goto last;
    }

    /* 4 KB pages */
    pte = get_pt_entry_from_pt_table(vaddr, pde);
    if ((pte & X86_MMU_PG_P) == 0) {
        *ret_level = PT_L;
        *last_valid_entry = pde;
        return ERR_NOT_FOUND;
    }

    /* Get the page frame and add the 4KB page offset from the vaddr */
    *last_valid_entry = get_pfn_from_pte(pte) + ((uint64_t)vaddr & PAGE_OFFSET_MASK_4KB);
    *mmu_flags = get_arch_mmu_flags(pte & X86_FLAGS_MASK);

last:
    *ret_level = PF_L;
    return NO_ERROR;
}

/**
 * Walk the page table structures to see if the mapping between a virtual address
 * and a physical address exists. Also, check the flags.
 *
 */
status_t x86_mmu_check_mapping(addr_t pml4, paddr_t paddr,
                               vaddr_t vaddr, arch_flags_t in_flags,
                               uint32_t *ret_level, arch_flags_t *ret_flags,
                               map_addr_t *last_valid_entry)
{
    status_t status;
    arch_flags_t existing_flags = 0;

    DEBUG_ASSERT(pml4);
    if ((!ret_level) || (!last_valid_entry) || (!ret_flags) ||
            (!x86_mmu_check_vaddr(vaddr)) ||
            (!x86_mmu_check_paddr(paddr))) {
        return ERR_INVALID_ARGS;
    }

    status = x86_mmu_get_mapping(pml4, vaddr, ret_level, &existing_flags, last_valid_entry);
    if (status || ((*last_valid_entry) != (uint64_t)paddr)) {
        /* We did not get far enough to check the access flags for the mapping */
        *ret_flags = in_flags;
        return ERR_NOT_FOUND;
    }

    /* Check the access flags for the mapped address. If the result is not zero,
     * the access flags differ and the returned flags hold the access bits
     * that are different.
     */
    *ret_flags = (in_flags ^ get_x86_arch_flags(existing_flags)) & X86_DIRTY_ACCESS_MASK;

    if (!(*ret_flags))
        return NO_ERROR;

    return ERR_NOT_FOUND;
}

static void update_pt_entry(vaddr_t vaddr, paddr_t paddr, uint64_t pde, arch_flags_t flags)
{
    uint32_t pt_index;

    uint64_t *pt_table = (uint64_t *)(pde & X86_PG_FRAME);
    pt_index = (((uint64_t)vaddr >> PT_SHIFT) & ((1ul << ADDR_OFFSET) - 1));
    pt_table[pt_index] = (uint64_t)paddr;
    pt_table[pt_index] |= flags | X86_MMU_PG_P;
    if (!(flags & X86_MMU_PG_U))
        pt_table[pt_index] |= X86_MMU_PG_G; /* setting global flag for kernel pages */

    if (flags & X86_MMU_PG_NX)
        pt_table[pt_index] |= X86_MMU_PG_NX;
    else
        pt_table[pt_index] &= ~X86_MMU_PG_NX;
}

static void update_pd_entry(vaddr_t vaddr, uint64_t pdpe, map_addr_t m, arch_flags_t flags)
{
    uint32_t pd_index;

    uint64_t *pd_table = (uint64_t *)(pdpe & X86_PG_FRAME);
    pd_index = (((uint64_t)vaddr >> PD_SHIFT) & ((1ul << ADDR_OFFSET) - 1));
    pd_table[pd_index] = m;
    pd_table[pd_index] |= X86_MMU_PG_P | X86_MMU_PG_RW;
    DEBUG_ASSERT(!(pd_table[pd_index] & X86_MMU_PG_PS));
    pd_table[pd_index] |= X86_MMU_PG_U; /* set U flag on all inner entries */
    if (!(flags & X86_MMU_PG_U))
        pd_table[pd_index] |= X86_MMU_PG_G; /* setting global flag for kernel pages */
}

static void update_pdp_entry(vaddr_t vaddr, uint64_t pml4e, map_addr_t m, arch_flags_t flags)
{
    uint32_t pdp_index;

    uint64_t *pdp_table = (uint64_t *)(pml4e & X86_PG_FRAME);
    pdp_index = (((uint64_t)vaddr >> PDP_SHIFT) & ((1ul << ADDR_OFFSET) - 1));
    pdp_table[pdp_index] = m;
    pdp_table[pdp_index] |= X86_MMU_PG_P | X86_MMU_PG_RW;
    DEBUG_ASSERT(!(pdp_table[pdp_index] & X86_MMU_PG_PS));
    pdp_table[pdp_index] |= X86_MMU_PG_U; /* set U flag on all inner entries */
    if (!(flags & X86_MMU_PG_U))
        pdp_table[pdp_index] |= X86_MMU_PG_G; /* setting global flag for kernel pages */
}

static void update_pml4_entry(vaddr_t vaddr, addr_t pml4_addr, map_addr_t m, arch_flags_t flags)
{
    uint32_t pml4_index;
    uint64_t *pml4_table = (uint64_t *)(pml4_addr);

    pml4_index = (((uint64_t)vaddr >> PML4_SHIFT) & ((1ul << ADDR_OFFSET) - 1));
    pml4_table[pml4_index] = m;
    pml4_table[pml4_index] |= X86_MMU_PG_P | X86_MMU_PG_RW;
    DEBUG_ASSERT(!(pml4_table[pml4_index] & X86_MMU_PG_PS));
    pml4_table[pml4_index] |= X86_MMU_PG_U; /* set U flag on all inner entries */
    if (!(flags & X86_MMU_PG_U))
        pml4_table[pml4_index] |= X86_MMU_PG_G; /* setting global flag for kernel pages */
}

/**
 * @brief Allocate and zero a page for use as a page table
 */
static map_addr_t *_map_alloc_page(void)
{
    map_addr_t *page_ptr = pmm_alloc_kpage();
    DEBUG_ASSERT(page_ptr);

    if (page_ptr)
        memset(page_ptr, 0, PAGE_SIZE);

    return page_ptr;
}

/**
 * @brief  Add a new mapping for the given virtual address & physical address
 *
 * This API handles the mapping between a virtual address & a physical address,
 * either by checking if the mapping already exists and is valid OR by adding a
 * new mapping with the required flags.
 *
 * The paging scheme here is x86-64 4-level paging (which requires PAE)
 * with 4KB pages.
 *
 */
status_t x86_mmu_add_mapping(map_addr_t pml4, map_addr_t paddr,
                             vaddr_t vaddr, arch_flags_t mmu_flags)
{
    uint32_t pd_new = 0, pdp_new = 0;
    uint64_t pml4e, pdpe, pde;
    map_addr_t *m = NULL;
    status_t ret = NO_ERROR;

    LTRACEF("pml4 0x%" PRIxMAP_ADDR " paddr 0x%" PRIxMAP_ADDR " vaddr 0x%lx flags 0x%" PRIxARCH_FLAGS "\n", pml4, paddr, vaddr, mmu_flags);

    DEBUG_ASSERT(pml4);
    if ((!x86_mmu_check_vaddr(vaddr)) || (!x86_mmu_check_paddr(paddr)))
        return ERR_INVALID_ARGS;

    pml4e = get_pml4_entry_from_pml4_table(vaddr, pml4);

    if ((pml4e & X86_MMU_PG_P) == 0) {
        /* Create a new pdp table */
        m = _map_alloc_page();
        if (m == NULL) {
            ret = ERR_NO_MEMORY;
            goto clean;
        }

        update_pml4_entry(vaddr, pml4, X86_VIRT_TO_PHYS(m), get_x86_arch_flags(mmu_flags));
        pml4e = (uint64_t)m;
        X86_SET_FLAG(pdp_new);
    }

    if (!pdp_new)
        pdpe = get_pdp_entry_from_pdp_table(vaddr, pml4e);

    if (pdp_new || (pdpe & X86_MMU_PG_P) == 0) {
        /* Create a new pd table */
        m = _map_alloc_page();
        if (m == NULL) {
            ret = ERR_NO_MEMORY;
            if (pdp_new)
                goto clean_pdp;
            goto clean;
        }

        update_pdp_entry(vaddr, pml4e, X86_VIRT_TO_PHYS(m), get_x86_arch_flags(mmu_flags));
        pdpe = (uint64_t)m;
        X86_SET_FLAG(pd_new);
    }

    if (!pd_new)
        pde = get_pd_entry_from_pd_table(vaddr, pdpe);

    if (pd_new || (pde & X86_MMU_PG_P) == 0) {
        /* Create a new pt */
        m = _map_alloc_page();
        if (m == NULL) {
            ret = ERR_NO_MEMORY;
            if (pd_new)
                goto clean_pd;
            goto clean;
        }

        update_pd_entry(vaddr, pdpe, X86_VIRT_TO_PHYS(m), get_x86_arch_flags(mmu_flags));
        pde = (uint64_t)m;
    }

    /* Update the page table entry with the paddr and access flags required for the mapping */
    update_pt_entry(vaddr, paddr, pde, get_x86_arch_flags(mmu_flags));
    ret = NO_ERROR;
    goto clean;

clean_pd:
    if (pd_new)
        pmm_free_page(paddr_to_vm_page(X86_VIRT_TO_PHYS(pdpe)));

clean_pdp:
    if (pdp_new)
        pmm_free_page(paddr_to_vm_page(X86_VIRT_TO_PHYS(pml4e)));

clean:
    return ret;
}

/**
 * @brief  x86-64 MMU unmap an entry in the page tables recursively and clear out tables
 *
 */
static void x86_mmu_unmap_entry(vaddr_t vaddr, int level, vaddr_t table_entry)
{
    uint32_t offset = 0, next_level_offset = 0;
    vaddr_t *table, *next_table_addr, value;

    LTRACEF("vaddr 0x%lx level %d table_entry 0x%lx\n", vaddr, level, table_entry);

    next_table_addr = NULL;
    table = (vaddr_t *)(table_entry & X86_PG_FRAME);
    LTRACEF_LEVEL(2, "table %p\n", table);

    switch (level) {
        case PML4_L:
            offset = (((uint64_t)vaddr >> PML4_SHIFT) & ((1ul << ADDR_OFFSET) - 1));
            LTRACEF_LEVEL(2, "offset %u\n", offset);
            next_table_addr = (vaddr_t *)X86_PHYS_TO_VIRT(table[offset]);
            LTRACEF_LEVEL(2, "next_table_addr %p\n", next_table_addr);
            if ((X86_PHYS_TO_VIRT(table[offset]) & X86_MMU_PG_P) == 0)
                return;
            break;
        case PDP_L:
            offset = (((uint64_t)vaddr >> PDP_SHIFT) & ((1ul << ADDR_OFFSET) - 1));
            LTRACEF_LEVEL(2, "offset %u\n", offset);
            next_table_addr = (vaddr_t *)X86_PHYS_TO_VIRT(table[offset]);
            LTRACEF_LEVEL(2, "next_table_addr %p\n", next_table_addr);
            if ((X86_PHYS_TO_VIRT(table[offset]) & X86_MMU_PG_P) == 0)
                return;
            break;
        case PD_L:
            offset = (((uint64_t)vaddr >> PD_SHIFT) & ((1ul << ADDR_OFFSET) - 1));
            LTRACEF_LEVEL(2, "offset %u\n", offset);
            next_table_addr = (vaddr_t *)X86_PHYS_TO_VIRT(table[offset]);
            LTRACEF_LEVEL(2, "next_table_addr %p\n", next_table_addr);
            if ((X86_PHYS_TO_VIRT(table[offset]) & X86_MMU_PG_P) == 0)
                return;
            break;
        case PT_L:
            offset = (((uint64_t)vaddr >> PT_SHIFT) & ((1ul << ADDR_OFFSET) - 1));
            LTRACEF_LEVEL(2, "offset %u\n", offset);
            next_table_addr = (vaddr_t *)X86_PHYS_TO_VIRT(table[offset]);
            LTRACEF_LEVEL(2, "next_table_addr %p\n", next_table_addr);
            if ((X86_PHYS_TO_VIRT(table[offset]) & X86_MMU_PG_P) == 0)
                return;
            break;
        case PF_L:
            /* Reached page frame, Let's go back */
        default:
            return;
    }

    LTRACEF_LEVEL(2, "recursing\n");

    level -= 1;
    x86_mmu_unmap_entry(vaddr, level, (vaddr_t)next_table_addr);
    level += 1;

    LTRACEF_LEVEL(2, "next_table_addr %p\n", next_table_addr);

    next_table_addr = (vaddr_t *)((vaddr_t)(next_table_addr) & X86_PG_FRAME);
    if (level > PT_L) {
        /* Check all entries of next level table for present bit */
        for (next_level_offset = 0; next_level_offset < (PAGE_SIZE/8); next_level_offset++) {
            if ((next_table_addr[next_level_offset] & X86_MMU_PG_P) != 0)
                return; /* There is an entry in the next level table */
        }
        pmm_free_page(paddr_to_vm_page(X86_VIRT_TO_PHYS(next_table_addr)));
    }
    /* All present bits for all entries in next level table for this address are 0 */
    if ((X86_PHYS_TO_VIRT(table[offset]) & X86_MMU_PG_P) != 0) {
        arch_disable_ints();
        value = table[offset];
        value = value & X86_PTE_NOT_PRESENT;
        table[offset] = value;
        arch_enable_ints();
    }
}

status_t x86_mmu_unmap(map_addr_t pml4, vaddr_t vaddr, size_t count)
{
    vaddr_t next_aligned_v_addr;

    DEBUG_ASSERT(pml4);
    if (!(x86_mmu_check_vaddr(vaddr)))
        return ERR_INVALID_ARGS;

    if (count == 0)
        return NO_ERROR;

    next_aligned_v_addr = vaddr;
    while (count > 0) {
        x86_mmu_unmap_entry(next_aligned_v_addr, X86_PAGING_LEVELS, pml4);
        /*
         * Flush the mapping from the TLB when unmapping the page; the stale
         * translation must be invalidated to avoid incorrect accesses.
         */
        __asm__ __volatile__ ("invlpg (%0)": : "r" (next_aligned_v_addr) : "memory");
        next_aligned_v_addr += PAGE_SIZE;
        count--;
    }
    return NO_ERROR;
}

int arch_mmu_unmap(arch_aspace_t *aspace, vaddr_t vaddr, size_t count)
{
    addr_t current_cr3_val;
    vmm_aspace_t *kernel_aspace = vmm_get_kernel_aspace();

    LTRACEF("aspace %p, vaddr 0x%lx, count %zu\n", aspace, vaddr, count);

    ASSERT(aspace);

    /*
     * The kernel page tables are mapped into user level address spaces for
     * syscall and interrupt handling.
     *
     * Check here to make sure supervisor pages are never unmapped from a
     * user level aspace by accident.
     */
    if (&kernel_aspace->arch_aspace != aspace) {
        if (is_kernel_address(vaddr)) {
            return ERR_INVALID_ARGS;
        }
    }

    if (!(x86_mmu_check_vaddr(vaddr)))
        return ERR_INVALID_ARGS;

    if (count == 0)
        return NO_ERROR;

    current_cr3_val = aspace->page_table;
    ASSERT(current_cr3_val);

    return (x86_mmu_unmap(X86_PHYS_TO_VIRT(current_cr3_val), vaddr, count));
}

/**
 * @brief  Mapping a section/range with specific permissions
 *
 */
status_t x86_mmu_map_range(map_addr_t pml4, struct map_range *range, arch_flags_t flags)
{
    vaddr_t next_aligned_v_addr;
    paddr_t next_aligned_p_addr;
    status_t map_status;
    uint32_t no_of_pages, index;

    LTRACEF("pml4 0x%" PRIxMAP_ADDR ", range v 0x%" PRIxVADDR " p 0x%" PRIxMAP_RANGE_PADDR " size %u flags 0x%" PRIxARCH_FLAGS "\n",
        pml4, range->start_vaddr, range->start_paddr, range->size, flags);

    DEBUG_ASSERT(pml4);
    if (!range)
        return ERR_INVALID_ARGS;

    /* Calculating the number of 4k pages */
    if (IS_ALIGNED(range->size, PAGE_SIZE))
        no_of_pages = (range->size) >> PAGE_DIV_SHIFT;
    else
        no_of_pages = ((range->size) >> PAGE_DIV_SHIFT) + 1;

    next_aligned_v_addr = range->start_vaddr;
    next_aligned_p_addr = range->start_paddr;

    for (index = 0; index < no_of_pages; index++) {
        map_status = x86_mmu_add_mapping(pml4, next_aligned_p_addr, next_aligned_v_addr, flags);
        if (map_status) {
            dprintf(SPEW, "Add mapping failed with err=%d\n", map_status);
            /* Unmap the partial mapping - if any */
            x86_mmu_unmap(pml4, range->start_vaddr, index);
            return map_status;
        }
        next_aligned_v_addr += PAGE_SIZE;
        next_aligned_p_addr += PAGE_SIZE;
    }
    return NO_ERROR;
}

status_t arch_mmu_query(arch_aspace_t *aspace, vaddr_t vaddr, paddr_t *paddr, uint *flags)
{
    addr_t current_cr3_val;
    uint32_t ret_level;
    map_addr_t last_valid_entry;
    arch_flags_t ret_flags;
    status_t stat;

    LTRACEF("aspace %p, vaddr 0x%lx, paddr %p, flags %p\n", aspace, vaddr, paddr, flags);

    ASSERT(aspace);

    current_cr3_val = aspace->page_table;
    ASSERT(current_cr3_val);

    stat = x86_mmu_get_mapping(X86_PHYS_TO_VIRT(current_cr3_val), vaddr, &ret_level, &ret_flags, &last_valid_entry);
    if (stat)
        return stat;

    if (paddr) {
        *paddr = (paddr_t)(last_valid_entry);
    }

    LTRACEF("paddr 0x%" PRIxMAP_ADDR "\n", last_valid_entry);

    /* converting x86 arch specific flags to arch mmu flags */
    if (flags)
        *flags = ret_flags;

    return NO_ERROR;
}

int arch_mmu_map(arch_aspace_t *aspace, vaddr_t vaddr, paddr_t paddr, size_t count, uint flags)
{
    addr_t current_cr3_val;
    struct map_range range;

    DEBUG_ASSERT(aspace);

    LTRACEF("aspace %p, vaddr 0x%lx paddr 0x%lx count %zu flags 0x%x\n", aspace, vaddr, paddr, count, flags);

    if ((!x86_mmu_check_paddr(paddr)))
        return ERR_INVALID_ARGS;

    if (!x86_mmu_check_vaddr(vaddr))
        return ERR_INVALID_ARGS;

    if (!x86_mmu_check_flags(flags)) {
        return ERR_NOT_SUPPORTED;
    }

    if (count == 0)
        return NO_ERROR;

    current_cr3_val = aspace->page_table;
    ASSERT(current_cr3_val);

    range.start_vaddr = vaddr;
    range.start_paddr = paddr;
    range.size = count * PAGE_SIZE;

    return (x86_mmu_map_range(X86_PHYS_TO_VIRT(current_cr3_val), &range, flags));
}

void x86_mmu_early_init(void)
{
    volatile uint64_t cr0, cr4;

    /* Set the WP bit in CR0 */
    cr0 = x86_get_cr0();
    cr0 |= X86_CR0_WP;
    x86_set_cr0(cr0);

    /* Set the SMEP & SMAP bits in CR4 if available */
    cr4 = x86_get_cr4();
    if (check_smep_avail())
        cr4 |= X86_CR4_SMEP;
    if (check_smap_avail())
        cr4 |= X86_CR4_SMAP;
    x86_set_cr4(cr4);

    /* Get the address widths from the CPUID instruction */
    /* Bits 07-00: Physical Address width info */
    /* Bits 15-08: Linear Address width info */
    uint32_t addr_width = x86_get_address_width();
    g_paddr_width = (uint8_t)(addr_width & 0xFF);
    g_vaddr_width = (uint8_t)((addr_width >> 8) & 0xFF);

    LTRACEF("paddr_width %u vaddr_width %u\n", g_paddr_width, g_vaddr_width);

    x86_kernel_page_table = x86_get_cr3();

    /* tlb flush */
    x86_set_cr3(x86_get_cr3());
}

void x86_mmu_init(void)
{
}

static paddr_t x86_create_page_table(void)
{
    addr_t *new_table = NULL;

    new_table = (addr_t *)_map_alloc_page();
    ASSERT(new_table);

    /*
     * Copy the kernel level mapping into the user level mapping to support
     * syscall and interrupt handling at user level.
     *
     * TODO:
     * Switch to kernel page-table isolation (KPTI) to mitigate the Meltdown
     * security vulnerability.
     */
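    /*
     * PML4 entry 511 covers the top 512GB of the virtual address space,
     * which is assumed to contain the kernel's higher-half mappings, so
     * copying it shares the kernel mappings with the new page table.
     */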
    new_table[511] = pml4[511];

    return (paddr_t)X86_VIRT_TO_PHYS(new_table);
}

/*
 * Initialize an architecture address space. Kernel aspaces reuse the boot-time
 * kernel page table, while user aspaces get their own top-level table with the
 * kernel mappings copied in (see x86_create_page_table()).
 */
status_t arch_mmu_init_aspace(arch_aspace_t *aspace, vaddr_t base, size_t size, uint flags)
{
    ASSERT(aspace);

    ASSERT(size > PAGE_SIZE);
    ASSERT(base + size - 1 > base);

    aspace->size = size;
    aspace->base = base;

    if ((flags & ARCH_ASPACE_FLAG_KERNEL)) {
        aspace->page_table = x86_kernel_page_table;
    } else {
        aspace->page_table = x86_create_page_table();
    }

    return NO_ERROR;
}

status_t arch_mmu_destroy_aspace(arch_aspace_t *aspace)
{
    ASSERT(aspace);

    pmm_free_page(paddr_to_vm_page(aspace->page_table));

    aspace->size = 0;
    aspace->base = 0;
    aspace->page_table = 0;

    return NO_ERROR;
}

void arch_mmu_context_switch(arch_aspace_t *aspace)
{
    if (NULL == aspace) {
        x86_set_cr3(x86_kernel_page_table);
    } else {
        vmm_aspace_t *kernel_aspace = vmm_get_kernel_aspace();
        ASSERT(&kernel_aspace->arch_aspace != aspace);

        x86_set_cr3(aspace->page_table);
    }
}