/*
 * Copyright (c) 2009 Corey Tabaka
 * Copyright (c) 2015-2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files
 * (the "Software"), to deal in the Software without restriction,
 * including without limitation the rights to use, copy, modify, merge,
 * publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so,
 * subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include <debug.h>
#include <arch.h>
#include <arch/ops.h>
#include <arch/x86.h>
#include <arch/x86/mmu.h>
#include <arch/x86/mp.h>
#include <arch/x86/descriptor.h>
#include <arch/fpu.h>
#include <arch/mmu.h>
#include <assert.h>
#include <platform.h>
#include <sys/types.h>
#include <string.h>
#include "arch/arch_thread.h"

/* early stack */
uint8_t _kstack[PAGE_SIZE] __ALIGNED(8);
/* per-cpu privilege level 0 stacks, installed as TSS RSP0 */
uint8_t _tss_start[SMP_MAX_CPUS][PAGE_SIZE] __ALIGNED(8);
/* per-cpu double fault stacks, installed as TSS IST1 */
uint8_t _double_fault_stack[SMP_MAX_CPUS][PAGE_SIZE] __ALIGNED(8);

/* save a pointer to the multiboot information coming in from whoever called us */
/* make sure it lives in .data to avoid it being wiped out by bss clearing */
__SECTION(".data") void *_multiboot_info;

/* main tss */
tss_t system_tss[SMP_MAX_CPUS];
x86_per_cpu_states_t per_cpu_states[SMP_MAX_CPUS];

volatile int cpu_woken_up = 0;

static void init_per_cpu_state(uint cpu)
{
    x86_per_cpu_states_t *states;

    /*
     * At this point, the BSP has already set up the current thread in its
     * global state, so only initialize the global state of the AP(s).
     */
    if (0 != cpu) {
        states = &per_cpu_states[cpu];

        states->cur_thread    = NULL;
        states->syscall_stack = 0;

        /* point this CPU's GS base at its per-cpu state */
        write_msr(X86_MSR_GS_BASE, (uint64_t)states);
    }
}
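
/*
 * Note: with IA32_GS_BASE pointing at a CPU's x86_per_cpu_states_t,
 * GS-relative accessors such as x86_read_gs_with_offset(SYSCALL_STACK_OFF)
 * (used below) address fields of that structure; SYSCALL_STACK_OFF is
 * assumed to be the byte offset of the syscall stack slot within
 * x86_per_cpu_states_t.
 */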

void x86_check_and_fix_gs(void)
{
    uint cpu = arch_curr_cpu_num();
    x86_per_cpu_states_t *expected_gs_base = &per_cpu_states[cpu];
    x86_per_cpu_states_t *current_gs_base = (void *)read_msr(X86_MSR_GS_BASE);

    if (current_gs_base != expected_gs_base) {
        printf("GS base is wrong %p != %p, try swapgs\n", current_gs_base, expected_gs_base);
        __asm__ __volatile__ (
            "swapgs"
        );
        current_gs_base = (void *)read_msr(X86_MSR_GS_BASE);
        if (current_gs_base != expected_gs_base) {
            printf("GS base is still wrong after swapgs %p != %p\n",
                   current_gs_base, expected_gs_base);
            write_msr(X86_MSR_GS_BASE, (uint64_t)expected_gs_base);
            current_gs_base = (void *)read_msr(X86_MSR_GS_BASE);
        }
    }
}
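
/*
 * Background for the swapgs fallback above: SWAPGS exchanges IA32_GS_BASE
 * with IA32_KERNEL_GS_BASE, so a single missed or doubled swapgs on an
 * entry/exit path leaves the user and kernel GS bases swapped, and one
 * more swapgs restores them. Any other corruption is repaired by writing
 * IA32_GS_BASE directly, as done above.
 */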

static void set_tss_segment_percpu(void)
{
    uint64_t addr;

    tss_t *tss_base = get_tss_base();
    uint cpu_id = arch_curr_cpu_num();
    ASSERT(tss_base);

    /* top of this CPU's page in _tss_start; x86 stacks grow down */
    addr = (uint64_t)&_tss_start[cpu_id + 1];

    /*
     * Only privilege level 0 matters, since privilege levels 1 and 2 are
     * unused. This stack is used on an inter-privilege change from level 3
     * to level 0, for instance when an interrupt is raised while running
     * at level 3.
     */
    tss_base->rsp0 = addr;

    /* Syscalls use the same stack as RSP0 in the TSS */
    x86_write_gs_with_offset(SYSCALL_STACK_OFF, addr);

    /*
     * Exception and interrupt handlers share their stack with the kernel
     * context. If the kernel stack is corrupted or misused, an exception
     * handler would keep using the corrupted stack, which is hard to trace,
     * especially in the page fault handler.
     *
     * To ensure the page fault handler cannot enter an infinite loop,
     * Interrupt Stack Table 1 (IST1) is dedicated to the double fault
     * handler. With this dedicated double fault stack, a page fault taken
     * while the stack pointer is invalid triggers a double fault, which
     * can then exit cleanly.
     */
    addr = (uint64_t)&_double_fault_stack[cpu_id + 1];
    tss_base->ist1 = addr;
}
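
/*
 * Illustrative sketch only (not compiled): for tss_base->ist1 to be used,
 * the IDT entry for the double fault vector (#DF, vector 8) must carry an
 * IST index of 1. The actual IDT setup lives elsewhere in this port; the
 * layout below just shows where that index sits in a 64-bit gate
 * descriptor.
 */
#if 0
struct idt_gate64 {
    uint16_t offset_low;   /* handler address bits 15:0 */
    uint16_t selector;     /* kernel code segment selector */
    uint8_t  ist;          /* bits 2:0 select TSS IST1..IST7, 0 = legacy stack switch */
    uint8_t  type_attr;    /* present bit, DPL, gate type (0xE = 64-bit interrupt gate) */
    uint16_t offset_mid;   /* handler address bits 31:16 */
    uint32_t offset_high;  /* handler address bits 63:32 */
    uint32_t reserved;
} __attribute__((__packed__));
#endif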

__WEAK void x86_syscall(void)
{
    panic("unhandled syscall\n");
}

static void setup_syscall_percpu(void)
{
    /*
     * The SYSENTER instruction executes a fast syscall from privilege
     * level 3 into a level 0 system procedure or routine. According to the
     * SYSENTER instruction description in Intel ISDM Vol. 2, if all
     * condition checks pass, then:
     *      RSP          <-  SYSENTER_ESP_MSR
     *      RIP          <-  SYSENTER_EIP_MSR
     *      CS.Selector  <-  SYSENTER_CS_MSR[15:0] & 0xFFFCH
     *      SS.Selector  <-  CS.Selector + 8
     * SYSEXIT (w/64-bit operand):
     *      CS.Selector  <-  (SYSENTER_CS_MSR[15:0] + 32) | 3
     *      SS.Selector  <-  CS.Selector + 8
     */
    static_assert(CODE_64_SELECTOR + 8 == STACK_64_SELECTOR);
    static_assert(CODE_64_SELECTOR + 32 == USER_CODE_64_SELECTOR);
    static_assert(CODE_64_SELECTOR + 32 + 8 == USER_DATA_64_SELECTOR);

    write_msr(SYSENTER_CS_MSR, CODE_64_SELECTOR);
    write_msr(SYSENTER_ESP_MSR, x86_read_gs_with_offset(SYSCALL_STACK_OFF));
    write_msr(SYSENTER_EIP_MSR, (uint64_t)(x86_syscall));

    /*
     * SYSCALL:
     *      RIP          <-  LSTAR_MSR
     *      CS.Selector  <-  STAR_MSR[47:32] & 0xFFFCH
     *      SS.Selector  <-  STAR_MSR[47:32] + 8
     * SYSRET (w/64-bit operand):
     *      CS.Selector  <-  (STAR_MSR[63:48] + 16) | 3
     *      SS.Selector  <-  (STAR_MSR[63:48] + 8) | 3 - on Intel
     *      SS.Selector  <-  (STAR_MSR[63:48] + 8)     - on AMD
     *
     * AMD says the hidden parts of SS are set to fixed values for SYSCALL but
     * perplexingly left unchanged for SYSRET. Intel sets the SS hidden parts
     * to (different) fixed values for both SYSCALL and SYSRET.
     *
     * AMD also states that the hidden parts of SS are ignored in 64-bit mode,
     * but IRET throws a #GP exception if SS.RPL != CS.RPL. We therefore need
     * to set STAR_MSR[49:48] to 3 (USER_RPL) to be compatible with AMD CPUs.
     */
    static_assert(CODE_64_SELECTOR + 8 == STACK_64_SELECTOR);
    static_assert(USER_CODE_COMPAT_SELECTOR + 16 == USER_CODE_64_SELECTOR);
    /*
     * Note that USER_DATA_COMPAT_SELECTOR is not the same value as
     * USER_DATA_64_SELECTOR (since these instructions use hardcoded offsets),
     * but the content of the descriptor is the same. The process will start
     * with one SS value, but then get a different value after the syscall.
     */
    static_assert(USER_CODE_COMPAT_SELECTOR + 8 == USER_DATA_COMPAT_SELECTOR);

    write_msr(STAR_MSR, (uint64_t)CODE_64_SELECTOR << 32 |
                        (uint64_t)(USER_CODE_COMPAT_SELECTOR | USER_RPL) << 48);
    write_msr(LSTAR_MSR, (uint64_t)(x86_syscall));
    write_msr(SFMASK_MSR, IF_MASK | DF_MASK);
}
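
/*
 * Illustrative sketch only (not compiled): recomputes, with plain shifts
 * and masks, the selectors the CPU derives from the STAR value programmed
 * above, following the SYSCALL/SYSRET rules quoted in the comment. The
 * helper name is hypothetical and nothing calls it.
 */
#if 0
static void star_selector_example(void)
{
    uint64_t star = (uint64_t)CODE_64_SELECTOR << 32 |
                    (uint64_t)(USER_CODE_COMPAT_SELECTOR | USER_RPL) << 48;

    uint16_t syscall_cs = (star >> 32) & 0xFFFC;              /* kernel CS */
    uint16_t syscall_ss = ((star >> 32) & 0xFFFF) + 8;        /* kernel SS */
    uint16_t sysret_cs  = (((star >> 48) & 0xFFFF) + 16) | 3; /* user 64-bit CS */
    uint16_t sysret_ss  = (((star >> 48) & 0xFFFF) + 8) | 3;  /* user SS (Intel) */

    ASSERT(syscall_cs == CODE_64_SELECTOR);
    ASSERT(syscall_ss == STACK_64_SELECTOR);
    ASSERT(sysret_cs  == (USER_CODE_64_SELECTOR | USER_RPL));
    ASSERT(sysret_ss  == (USER_DATA_COMPAT_SELECTOR | USER_RPL));
}
#endif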

void arch_early_init(void)
{
    seg_sel_t sel = 0;
    uint cpu_id = 1;

    /* atomic_add returns the previous count, so the BSP gets cpu_id 0 */
    cpu_id = atomic_add(&cpu_woken_up, cpu_id);

    init_per_cpu_state(cpu_id);

    if (check_fsgsbase_avail()) {
        x86_set_cr4(x86_get_cr4() | X86_CR4_FSGSBASE);
    }

    /* each 64-bit TSS descriptor occupies 16 bytes in the GDT */
    sel = (seg_sel_t)(cpu_id << 4);
    sel += TSS_SELECTOR;

    /* enable caches here for now */
    clear_in_cr0(X86_CR0_NW | X86_CR0_CD);

    set_global_desc(sel,
            &system_tss[cpu_id],
            sizeof(tss_t),
            1,
            0,
            0,
            SEG_TYPE_TSS,
            0,
            0);
    x86_ltr(sel);

    x86_mmu_early_init();
    platform_init_mmu_mappings();
}

void arch_init(void)
{
    x86_mmu_init();

    set_tss_segment_percpu();
    setup_syscall_percpu();

#ifdef X86_WITH_FPU
    fpu_init();
#endif
}

void arch_chain_load(void *ep, ulong arg0, ulong arg1, ulong arg2, ulong arg3)
{
    PANIC_UNIMPLEMENTED;
}

void arch_enter_uspace(vaddr_t ep,
                       vaddr_t stack,
                       vaddr_t shadow_stack_base,
                       uint32_t flags,
                       ulong arg0)
{
    /*
     * 16-byte align the user stack, then bias by 8 so RSP % 16 == 8 on
     * entry, i.e. the alignment a function sees right after a call.
     */
    register uint64_t sp_usr = round_down(stack + 8, 16) - 8;
    register uint64_t entry = ep;
    register uint64_t code_seg = USER_CODE_64_SELECTOR | USER_RPL;
    register uint64_t data_seg = USER_DATA_64_SELECTOR | USER_RPL;
    register uint64_t usr_flags = USER_EFLAGS;

    //DEBUG_ASSERT(shadow_stack_base == 0);

    /*
     * Clear all general purpose registers except RDI, since RDI carries the
     * parameter to user space.
     *
     * The IRETQ instruction is used here to perform the inter-privilege
     * level return. The input parameter 'flags' is ignored when entering
     * level 3.
     *
     * The LK kernel runs in IA-32e mode; when the iretq instruction is
     * executed, the processor performs:
     *
     * 1. IA-32e-mode operation steps, popping RIP/CS/tempRFLAGS:
     *      RIP          <- POP()       --  entry
     *      CS.Selector  <- POP()       --  code_seg
     *      tempRFLAGS   <- POP()       --  usr_flags
     * 2. Since CS.RPL(3) > CPL(0), it continues at return-to-outer-privilege-level:
     *      RSP          <- POP()       --  sp_usr
     *      SS           <- POP()       --  data_seg
     *      RFLAGS       <- tempRFLAGS
     *      CPL          <- CS.RPL
     *
     * After IRETQ has executed, the processor runs at RIP in 64-bit mode at
     * privilege level 3.
     *
     * For more details, refer to "IRET/IRETD -- Interrupt Return" in Intel
     * ISDM Vol. 2 (Instruction Set Reference).
     *
     * Interrupts are disabled before swapgs to avoid entering an interrupt
     * vector with a user-space GS base and a kernel CS selector (which
     * exceptions.S:interrupt_common checks).
     */
    __asm__ __volatile__ (
            "pushq %0   \n"
            "pushq %1   \n"
            "pushq %2   \n"
            "pushq %3   \n"
            "pushq %4   \n"
            "pushq %5   \n"
            "cli \n"
            "swapgs \n"
            "xorq %%r15, %%r15 \n"
            "xorq %%r14, %%r14 \n"
            "xorq %%r13, %%r13 \n"
            "xorq %%r12, %%r12 \n"
            "xorq %%r11, %%r11 \n"
            "xorq %%r10, %%r10 \n"
            "xorq %%r9, %%r9 \n"
            "xorq %%r8, %%r8 \n"
            "xorq %%rbp, %%rbp \n"
            "xorq %%rdx, %%rdx \n"
            "xorq %%rcx, %%rcx \n"
            "xorq %%rbx, %%rbx \n"
            "xorq %%rax, %%rax \n"
            "xorq %%rsi, %%rsi \n"
            "popq %%rdi \n"
            "iretq"
            :
            :"r" (data_seg), "r" (sp_usr), "r" (usr_flags),
             "r" (code_seg), "r"(entry), "r" (arg0));

    __UNREACHABLE;
}
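
/*
 * Illustrative sketch only (not compiled): the frame iretq consumes above,
 * from lowest to highest address at the point of the iretq (i.e. reverse
 * of the pushq order, after arg0 has been popped into RDI). The struct is
 * hypothetical and nothing in this file uses it.
 */
#if 0
struct iretq_frame {
    uint64_t rip;    /* entry     (pushed 5th) */
    uint64_t cs;     /* code_seg  (pushed 4th) */
    uint64_t rflags; /* usr_flags (pushed 3rd) */
    uint64_t rsp;    /* sp_usr    (pushed 2nd) */
    uint64_t ss;     /* data_seg  (pushed 1st) */
};
#endif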

void arch_set_user_tls(vaddr_t tls_ptr)
{
    thread_t *cur_thread = get_current_thread();

    cur_thread->arch.fs_base = tls_ptr;
    write_msr(X86_MSR_FS_BASE, tls_ptr);
}