/*
 * Copyright (c) 2009 Corey Tabaka
 * Copyright (c) 2015-2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files
 * (the "Software"), to deal in the Software without restriction,
 * including without limitation the rights to use, copy, modify, merge,
 * publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so,
 * subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include <debug.h>
#include <arch.h>
#include <arch/ops.h>
#include <arch/x86.h>
#include <arch/x86/mmu.h>
#include <arch/x86/mp.h>
#include <arch/x86/descriptor.h>
#include <arch/fpu.h>
#include <arch/mmu.h>
#include <assert.h>
#include <platform.h>
#include <sys/types.h>
#include <string.h>
#include "arch/arch_thread.h"

/* early stack */
uint8_t _kstack[PAGE_SIZE] __ALIGNED(8);
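/* per-CPU stacks: one page per CPU for privilege-0 entry (TSS RSP0 and the
 * syscall stack) and one page per CPU for the double fault handler (IST1) */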
uint8_t _tss_start[SMP_MAX_CPUS][PAGE_SIZE] __ALIGNED(8);
uint8_t _double_fault_stack[SMP_MAX_CPUS][PAGE_SIZE] __ALIGNED(8);

/* save a pointer to the multiboot information coming in from whoever called us */
/* make sure it lives in .data to avoid it being wiped out by bss clearing */
__SECTION(".data") void *_multiboot_info;

/* main tss */
tss_t system_tss[SMP_MAX_CPUS];
x86_per_cpu_states_t per_cpu_states[SMP_MAX_CPUS];

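/* number of CPUs that have reached arch_early_init(); the fetch-and-add
 * result is used as the calling CPU's id, so the BSP (first caller) gets 0 */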
volatile int cpu_woken_up = 0;

static void init_per_cpu_state(uint cpu)
{
    x86_per_cpu_states_t *states;

    /*
     * At this point, the BSP has already set up the current thread in its
     * global state, so only initialize the global states of the AP(s).
     */
    if (0 != cpu) {
        states = &per_cpu_states[cpu];

        states->cur_thread = NULL;
        states->syscall_stack = 0;

        write_msr(X86_MSR_GS_BASE, (uint64_t)states);
    }
}

void x86_check_and_fix_gs(void)
{
    uint cpu = arch_curr_cpu_num();
    x86_per_cpu_states_t *expected_gs_base = &per_cpu_states[cpu];
    x86_per_cpu_states_t *current_gs_base = (void *)read_msr(X86_MSR_GS_BASE);

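    /*
     * If the kernel GS base was left in the inactive slot (KERNEL_GS_BASE,
     * e.g. because a swapgs was missed on a kernel entry path), a single
     * swapgs exchanges it back into GS_BASE.
     */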
    if (current_gs_base != expected_gs_base) {
        printf("GS base is wrong %p != %p, try swapgs\n", current_gs_base, expected_gs_base);
        __asm__ __volatile__ (
            "swapgs"
        );
        current_gs_base = (void *)read_msr(X86_MSR_GS_BASE);
        if (current_gs_base != expected_gs_base) {
            printf("GS base is still wrong after swapgs %p != %p\n",
                   current_gs_base, expected_gs_base);
            write_msr(X86_MSR_GS_BASE, (uint64_t)expected_gs_base);
            current_gs_base = (void *)read_msr(X86_MSR_GS_BASE);
        }
    }
}

static void set_tss_segment_percpu(void)
{
    uint64_t addr;

    tss_t *tss_base = get_tss_base();
    uint cpu_id = arch_curr_cpu_num();
    ASSERT(tss_base);

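    /*
     * &_tss_start[cpu_id + 1] is one past the end of this CPU's page, i.e.
     * the top of its privilege-0 stack, since the stack grows downwards.
     */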
    addr = (uint64_t)&_tss_start[cpu_id + 1];

    /*
     * Only privilege level 0 matters, since privilege levels 1 and 2 are
     * unused. This stack is used on an inter-privilege transition from
     * level 3 to level 0, for instance when an interrupt is raised while
     * running at level 3.
     */
    tss_base->rsp0 = addr;

    /* Syscalls use the same stack as RSP0 in the TSS */
    x86_write_gs_with_offset(SYSCALL_STACK_OFF, addr);

    /*
     * Exception and interrupt handlers share the stack of the kernel context.
     * If the kernel stack is corrupted or misused, the exception handler
     * keeps using the corrupted kernel stack, which is hard to trace,
     * especially in the Page Fault handler.
     *
     * To ensure the Page Fault handler does not end up in an infinite loop,
     * Interrupt Stack Table 1 (IST1) is dedicated to the Double Fault
     * handler. With this dedicated double fault stack, a Page Fault taken
     * while the stack pointer is invalid triggers a double fault, which can
     * then exit cleanly.
     */
    addr = (uint64_t)&_double_fault_stack[cpu_id + 1];
    tss_base->ist1 = addr;
}

__WEAK void x86_syscall(void)
{
    panic("unhandled syscall\n");
}

static void setup_syscall_percpu(void)
{
    /*
     * The SYSENTER instruction executes a fast syscall from level 3 to a
     * level 0 system procedure or routine. According to the SYSENTER
     * instruction description in the Intel SDM Vol. 2, if all condition
     * checks pass, then:
     *   RSP <- SYSENTER_ESP_MSR
     *   RIP <- SYSENTER_EIP_MSR
     *   CS.Selector <- SYSENTER_CS_MSR[15:0] & 0xFFFC
     *   SS.Selector <- CS.Selector + 8
     * SYSEXIT (w/64-bit operand):
     *   CS.Selector <- (SYSENTER_CS_MSR[15:0] + 32) | 3
     *   SS.Selector <- CS.Selector + 8
     */
    static_assert(CODE_64_SELECTOR + 8 == STACK_64_SELECTOR);
    static_assert(CODE_64_SELECTOR + 32 == USER_CODE_64_SELECTOR);
    static_assert(CODE_64_SELECTOR + 32 + 8 == USER_DATA_64_SELECTOR);

    write_msr(SYSENTER_CS_MSR, CODE_64_SELECTOR);
    write_msr(SYSENTER_ESP_MSR, x86_read_gs_with_offset(SYSCALL_STACK_OFF));
    write_msr(SYSENTER_EIP_MSR, (uint64_t)(x86_syscall));

    /*
     * SYSCALL:
     *   RIP <- LSTAR_MSR
     *   CS.Selector <- STAR_MSR[47:32] & 0xFFFC
     *   SS.Selector <- STAR_MSR[47:32] + 8
     * SYSRET (w/64-bit operand):
     *   CS.Selector <- (STAR_MSR[63:48] + 16) | 3
     *   SS.Selector <- (STAR_MSR[63:48] + 8) | 3 - On Intel
     *   SS.Selector <- (STAR_MSR[63:48] + 8)     - On AMD
     *
     * AMD says the hidden parts of SS are set to fixed values for SYSCALL but
     * perplexingly left unchanged for SYSRET. Intel sets the SS hidden parts
     * to (different) fixed values for both SYSCALL and SYSRET.
     *
     * AMD also states that the hidden parts of SS are ignored in 64 bit mode,
     * but IRET throws a GP exception if SS.RPL != CS.RPL. We therefore need
     * to set STAR_MSR[49:48] to 3 (USER_RPL) to be compatible with AMD CPUs.
     */
    static_assert(CODE_64_SELECTOR + 8 == STACK_64_SELECTOR);
    static_assert(USER_CODE_COMPAT_SELECTOR + 16 == USER_CODE_64_SELECTOR);
    /*
     * Note that USER_DATA_COMPAT_SELECTOR is not the same value as
     * USER_DATA_64_SELECTOR (since these instructions use hardcoded offsets),
     * but the content of the descriptor is the same. The process will start
     * with one SS value, but then get a different value after the syscall.
     */
    static_assert(USER_CODE_COMPAT_SELECTOR + 8 == USER_DATA_COMPAT_SELECTOR);

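    /*
     * STAR_MSR[47:32] holds the kernel CS (and CS + 8 = SS) loaded by
     * SYSCALL; STAR_MSR[63:48] holds the base selector for the user CS/SS
     * pair loaded by SYSRET, with RPL set to 3 for AMD compatibility as
     * described above.
     */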
    write_msr(STAR_MSR, (uint64_t)CODE_64_SELECTOR << 32 |
              (uint64_t)(USER_CODE_COMPAT_SELECTOR | USER_RPL) << 48);
    write_msr(LSTAR_MSR, (uint64_t)(x86_syscall));
    write_msr(SFMASK_MSR, IF_MASK | DF_MASK);
}

void arch_early_init(void)
{
    seg_sel_t sel = 0;
    uint cpu_id = 1;

    cpu_id = atomic_add(&cpu_woken_up, cpu_id);

    init_per_cpu_state(cpu_id);

    if (check_fsgsbase_avail()) {
        x86_set_cr4(x86_get_cr4() | X86_CR4_FSGSBASE);
    }

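    /*
     * Each 64-bit TSS descriptor occupies 16 bytes in the GDT, so this CPU's
     * TSS selector is TSS_SELECTOR plus cpu_id * 16.
     */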
    sel = (seg_sel_t)(cpu_id << 4);
    sel += TSS_SELECTOR;

    /* enable caches here for now */
    clear_in_cr0(X86_CR0_NW | X86_CR0_CD);

    set_global_desc(sel,
                    &system_tss[cpu_id],
                    sizeof(tss_t),
                    1,
                    0,
                    0,
                    SEG_TYPE_TSS,
                    0,
                    0);
    x86_ltr(sel);

    x86_mmu_early_init();
    platform_init_mmu_mappings();
}

void arch_init(void)
{
    x86_mmu_init();

    set_tss_segment_percpu();
    setup_syscall_percpu();

#ifdef X86_WITH_FPU
    fpu_init();
#endif
}

void arch_chain_load(void *ep, ulong arg0, ulong arg1, ulong arg2, ulong arg3)
{
    PANIC_UNIMPLEMENTED;
}

void arch_enter_uspace(vaddr_t ep,
                       vaddr_t stack,
                       vaddr_t shadow_stack_base,
                       uint32_t flags,
                       ulong arg0)
{
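    /*
     * Leave the user stack pointer congruent to 8 mod 16, the alignment a
     * C function expects on entry (as if a return address had been pushed
     * onto a 16-byte aligned stack per the SysV AMD64 ABI).
     */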
    register uint64_t sp_usr = round_down(stack + 8, 16) - 8;
    register uint64_t entry = ep;
    register uint64_t code_seg = USER_CODE_64_SELECTOR | USER_RPL;
    register uint64_t data_seg = USER_DATA_64_SELECTOR | USER_RPL;
    register uint64_t usr_flags = USER_EFLAGS;

    //DEBUG_ASSERT(shadow_stack_base == 0);

    /*
     * Clear all general purpose registers except RDI, since RDI carries the
     * parameter to user space.
     *
     * The IRETQ instruction is used here to perform the inter-privilege
     * level return. The input parameter 'flags' is ignored when entering
     * level 3.
     *
     * The LK kernel runs in IA-32e mode; when the IRETQ instruction is
     * invoked, the processor performs:
     *
     * 1. IA-32e-MODE operation steps, pops RIP/CS/tempRFLAGS:
     *      RIP <- POP()         -- entry
     *      CS.Selector <- POP() -- code_seg
     *      tempRFLAGS <- POP()  -- usr_flags
     * 2. Since CS.RPL(3) > CPL(0), it goes to return-to-outer-privilege-level:
     *      RSP <- POP()         -- sp_usr
     *      SS <- POP()          -- data_seg
     *      RFLAGS <- tempRFLAGS
     *      CPL <- CS.RPL
     *
     * After IRETQ executes, the processor runs at RIP in 64-bit mode at
     * level 3.
     *
     * For more details please refer to "IRET/IRETD -- Interrupt Return" in
     * Intel SDM Vol. 2 (Instruction Set Reference).
     *
     * Interrupts are disabled before swapgs to avoid entering the interrupt
     * vector with a user-space GS base and a kernel CS selector (which
     * exceptions.S:interrupt_common checks).
     */
    __asm__ __volatile__ (
            "pushq %0   \n"
            "pushq %1   \n"
            "pushq %2   \n"
            "pushq %3   \n"
            "pushq %4   \n"
            "pushq %5   \n"
            "cli        \n"
            "swapgs     \n"
            "xorq %%r15, %%r15 \n"
            "xorq %%r14, %%r14 \n"
            "xorq %%r13, %%r13 \n"
            "xorq %%r12, %%r12 \n"
            "xorq %%r11, %%r11 \n"
            "xorq %%r10, %%r10 \n"
            "xorq %%r9, %%r9   \n"
            "xorq %%r8, %%r8   \n"
            "xorq %%rbp, %%rbp \n"
            "xorq %%rdx, %%rdx \n"
            "xorq %%rcx, %%rcx \n"
            "xorq %%rbx, %%rbx \n"
            "xorq %%rax, %%rax \n"
            "xorq %%rsi, %%rsi \n"
            "popq %%rdi \n"
            "iretq"
            :
            : "r" (data_seg), "r" (sp_usr), "r" (usr_flags),
              "r" (code_seg), "r" (entry), "r" (arg0));

    __UNREACHABLE;
}

void arch_set_user_tls(vaddr_t tls_ptr)
{
    thread_t *cur_thread = get_current_thread();

    cur_thread->arch.fs_base = tls_ptr;
    write_msr(X86_MSR_FS_BASE, tls_ptr);
}