// Copyright 2020 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

use std::arch::x86_64::CpuidResult;
#[cfg(any(unix, feature = "haxm", feature = "whpx"))]
use std::arch::x86_64::__cpuid;
use std::arch::x86_64::_rdtsc;
use std::collections::BTreeMap;
use std::collections::HashSet;

use anyhow::Context;
use base::custom_serde::deserialize_seq_to_arr;
use base::custom_serde::serialize_arr;
use base::error;
use base::warn;
use base::Result;
use bit_field::*;
use downcast_rs::impl_downcast;
use libc::c_void;
use serde::Deserialize;
use serde::Serialize;
use vm_memory::GuestAddress;

use crate::Hypervisor;
use crate::IrqRoute;
use crate::IrqSource;
use crate::IrqSourceChip;
use crate::Vcpu;
use crate::Vm;

const MSR_F15H_PERF_CTL0: u32 = 0xc0010200;
const MSR_F15H_PERF_CTL1: u32 = 0xc0010202;
const MSR_F15H_PERF_CTL2: u32 = 0xc0010204;
const MSR_F15H_PERF_CTL3: u32 = 0xc0010206;
const MSR_F15H_PERF_CTL4: u32 = 0xc0010208;
const MSR_F15H_PERF_CTL5: u32 = 0xc001020a;
const MSR_F15H_PERF_CTR0: u32 = 0xc0010201;
const MSR_F15H_PERF_CTR1: u32 = 0xc0010203;
const MSR_F15H_PERF_CTR2: u32 = 0xc0010205;
const MSR_F15H_PERF_CTR3: u32 = 0xc0010207;
const MSR_F15H_PERF_CTR4: u32 = 0xc0010209;
const MSR_F15H_PERF_CTR5: u32 = 0xc001020b;
const MSR_IA32_PERF_CAPABILITIES: u32 = 0x00000345;

/// A trait for managing cpuids for an x86_64 hypervisor and for checking its capabilities.
pub trait HypervisorX86_64: Hypervisor {
    /// Get the system supported CPUID values.
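    ///
    /// A hedged usage sketch (`hypervisor` stands in for any implementor of this trait):
    /// ```ignore
    /// let cpuid = hypervisor.get_supported_cpuid()?;
    /// for entry in &cpuid.cpu_id_entries {
    ///     println!(
    ///         "leaf {:#x} subleaf {}: eax={:#x}",
    ///         entry.function, entry.index, entry.cpuid.eax
    ///     );
    /// }
    /// ```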
    fn get_supported_cpuid(&self) -> Result<CpuId>;

    /// Gets the list of supported MSRs.
    fn get_msr_index_list(&self) -> Result<Vec<u32>>;
}

/// A wrapper for using a VM on x86_64 and getting/setting its state.
pub trait VmX86_64: Vm {
    /// Gets the `HypervisorX86_64` that created this VM.
    fn get_hypervisor(&self) -> &dyn HypervisorX86_64;

    /// Create a Vcpu with the specified Vcpu ID.
    fn create_vcpu(&self, id: usize) -> Result<Box<dyn VcpuX86_64>>;

    /// Sets the address of the three-page region in the VM's address space.
    fn set_tss_addr(&self, addr: GuestAddress) -> Result<()>;

    /// Sets the address of a one-page region in the VM's address space.
    fn set_identity_map_addr(&self, addr: GuestAddress) -> Result<()>;
}

/// A wrapper around creating and using a VCPU on x86_64.
pub trait VcpuX86_64: Vcpu {
    /// Sets or clears the flag that requests the VCPU to exit when it becomes possible to inject
    /// interrupts into the guest.
    fn set_interrupt_window_requested(&self, requested: bool);

    /// Checks if we can inject an interrupt into the VCPU.
    fn ready_for_interrupt(&self) -> bool;

    /// Injects interrupt vector `irq` into the VCPU.
    ///
    /// This function should only be called when [`Self::ready_for_interrupt`] returns true.
    /// Otherwise the interrupt injection may fail or the next VCPU run may fail. However, if
    /// [`Self::interrupt`] returns [`Ok`], the implementation must guarantee that the interrupt
    /// isn't injected in an uninterruptible window (e.g. right after the `mov ss` instruction).
    ///
    /// The caller should avoid calling this function more than once per VMEXIT, because the
    /// hypervisor may behave differently: some hypervisors (e.g. WHPX, KVM) will only try to
    /// inject the last `irq` requested, while some other hypervisors (e.g. HAXM) may try to
    /// inject all `irq`s requested.
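    ///
    /// A minimal sketch of the expected call sequence, using only methods from this trait:
    /// ```ignore
    /// if vcpu.ready_for_interrupt() {
    ///     vcpu.interrupt(vector)?;
    /// } else {
    ///     // Request an exit once injection becomes possible, then retry.
    ///     vcpu.set_interrupt_window_requested(true);
    /// }
    /// ```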
    fn interrupt(&self, irq: u8) -> Result<()>;

    /// Injects a non-maskable interrupt into the VCPU.
    fn inject_nmi(&self) -> Result<()>;

    /// Gets the VCPU general purpose registers.
    fn get_regs(&self) -> Result<Regs>;

    /// Sets the VCPU general purpose registers.
    fn set_regs(&self, regs: &Regs) -> Result<()>;

    /// Gets the VCPU special registers.
    fn get_sregs(&self) -> Result<Sregs>;

    /// Sets the VCPU special registers.
    fn set_sregs(&self, sregs: &Sregs) -> Result<()>;

    /// Gets the VCPU FPU registers.
    fn get_fpu(&self) -> Result<Fpu>;

    /// Sets the VCPU FPU registers.
    fn set_fpu(&self, fpu: &Fpu) -> Result<()>;

    /// Gets the VCPU debug registers.
    fn get_debugregs(&self) -> Result<DebugRegs>;

    /// Sets the VCPU debug registers.
    fn set_debugregs(&self, debugregs: &DebugRegs) -> Result<()>;

    /// Gets the VCPU extended control registers.
    fn get_xcrs(&self) -> Result<BTreeMap<u32, u64>>;

    /// Sets a VCPU extended control register.
    fn set_xcr(&self, xcr: u32, value: u64) -> Result<()>;

    /// Gets the VCPU x87 FPU, MMX, XMM, YMM and MXCSR registers.
    fn get_xsave(&self) -> Result<Xsave>;

    /// Sets the VCPU x87 FPU, MMX, XMM, YMM and MXCSR registers.
    fn set_xsave(&self, xsave: &Xsave) -> Result<()>;

    /// Gets interrupt state (hypervisor specific) for this VCPU that must be
    /// saved/restored for snapshotting.
    fn get_interrupt_state(&self) -> Result<serde_json::Value>;

    /// Sets interrupt state (hypervisor specific) for this VCPU. Only used for
    /// snapshotting.
    fn set_interrupt_state(&self, data: serde_json::Value) -> Result<()>;

    /// Gets a single model-specific register's value.
    fn get_msr(&self, msr_index: u32) -> Result<u64>;

    /// Gets the model-specific registers. Returns all the MSRs for the VCPU.
    fn get_all_msrs(&self) -> Result<BTreeMap<u32, u64>>;

    /// Sets a single model-specific register's value.
    fn set_msr(&self, msr_index: u32, value: u64) -> Result<()>;

    /// Sets up the data returned by the CPUID instruction.
    fn set_cpuid(&self, cpuid: &CpuId) -> Result<()>;

    /// Sets up debug registers and configures the vcpu for handling guest debug events.
    fn set_guest_debug(&self, addrs: &[GuestAddress], enable_singlestep: bool) -> Result<()>;

    /// This function should be called after `Vcpu::run` returns `VcpuExit::Cpuid`, and `entry`
    /// should represent the result of emulating the CPUID instruction. The `handle_cpuid` function
    /// will then set the appropriate registers on the vcpu.
    fn handle_cpuid(&mut self, entry: &CpuIdEntry) -> Result<()>;

    /// Gets the guest->host TSC offset.
    ///
    /// The default implementation uses [`VcpuX86_64::get_msr()`] to read the guest TSC.
    fn get_tsc_offset(&self) -> Result<u64> {
        // SAFETY:
        // Safe because _rdtsc takes no arguments.
        let host_before_tsc = unsafe { _rdtsc() };

        // Get the guest TSC value from our hypervisor.
        let guest_tsc = self.get_msr(crate::MSR_IA32_TSC)?;

        // SAFETY:
        // Safe because _rdtsc takes no arguments.
        let host_after_tsc = unsafe { _rdtsc() };

        // Average the before and after host TSC to get the best estimate.
        let host_tsc = ((host_before_tsc as u128 + host_after_tsc as u128) / 2) as u64;

        Ok(guest_tsc.wrapping_sub(host_tsc))
    }

    /// Sets the guest->host TSC offset.
    ///
    /// The default implementation uses [`VcpuX86_64::set_tsc_value()`] to set the TSC value.
    ///
    /// It sets TSC_OFFSET (VMCS / CB field) by setting the TSC MSR to the current
    /// host TSC value plus the desired offset. We rely on the fact that hypervisors
    /// determine the value of TSC_OFFSET by computing TSC_OFFSET = `new_tsc_value - _rdtsc()` =
    /// `_rdtsc() + offset - _rdtsc()` ~= `offset`. Note that the ~= is important: this is an
    /// approximate operation, because the two _rdtsc() calls
    /// are separated by at least a few ticks.
    ///
    /// Note: TSC_OFFSET, host TSC, guest TSC, and TSC MSR are all different
    /// concepts.
    /// * When a guest executes rdtsc, the value (guest TSC) returned is host_tsc * TSC_MULTIPLIER +
    ///   TSC_OFFSET + TSC_ADJUST.
    /// * The TSC MSR is a special MSR that when written to by the host, will cause TSC_OFFSET to be
    ///   set accordingly by the hypervisor.
    /// * When the guest *writes* to the TSC MSR, it actually changes the TSC_ADJUST MSR *for the
    ///   guest*. Generally this only happens if the guest is trying to re-zero or synchronize
    ///   TSCs.
    fn set_tsc_offset(&self, offset: u64) -> Result<()> {
        // SAFETY: _rdtsc takes no arguments.
        let host_tsc = unsafe { _rdtsc() };
        self.set_tsc_value(host_tsc.wrapping_add(offset))
    }

    /// Sets the guest TSC exactly to the provided value.
    ///
    /// The default implementation sets the guest's TSC by writing the value to the MSR directly.
    ///
    /// See [`VcpuX86_64::set_tsc_offset()`] for an explanation of how this value is actually read
    /// by the guest after being set.
    fn set_tsc_value(&self, value: u64) -> Result<()> {
        self.set_msr(crate::MSR_IA32_TSC, value)
    }

    /// Some hypervisors require special handling to restore timekeeping when
    /// a snapshot is restored. They are provided with a host TSC reference
    /// moment, guaranteed to be the same across all Vcpus, and the Vcpu's TSC
    /// offset at the moment it was snapshotted.
    fn restore_timekeeping(&self, host_tsc_reference_moment: u64, tsc_offset: u64) -> Result<()>;

    /// Snapshots the vCPU state.
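    ///
    /// A hedged round-trip sketch (`host_tsc_reference_moment` is the caller-supplied value
    /// described in [`Self::restore_timekeeping`]):
    /// ```ignore
    /// let snap = vcpu.snapshot()?;
    /// // ... serialize `snap`, then later, on the restore path ...
    /// vcpu.restore(&snap, host_tsc_reference_moment)?;
    /// ```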
    fn snapshot(&self) -> anyhow::Result<VcpuSnapshot> {
        Ok(VcpuSnapshot {
            vcpu_id: self.id(),
            regs: self.get_regs()?,
            sregs: self.get_sregs()?,
            debug_regs: self.get_debugregs()?,
            xcrs: self.get_xcrs()?,
            msrs: self.get_all_msrs()?,
            xsave: self.get_xsave()?,
            hypervisor_data: self.get_interrupt_state()?,
            tsc_offset: self.get_tsc_offset()?,
        })
    }

    /// Restores the vCPU state from a snapshot taken with [`Self::snapshot`].
    fn restore(
        &mut self,
        snapshot: &VcpuSnapshot,
        host_tsc_reference_moment: u64,
    ) -> anyhow::Result<()> {
        // List of MSRs that may fail to restore due to lack of support in the host kernel.
        // Some hosts may be running older kernels which do not support all MSRs, but
        // get_all_msrs will still fetch the MSRs supported by the CPU. Trying to set those MSRs
        // will result in failures, so those failures only produce a warning instead.
        let msr_allowlist = HashSet::from([
            MSR_F15H_PERF_CTL0,
            MSR_F15H_PERF_CTL1,
            MSR_F15H_PERF_CTL2,
            MSR_F15H_PERF_CTL3,
            MSR_F15H_PERF_CTL4,
            MSR_F15H_PERF_CTL5,
            MSR_F15H_PERF_CTR0,
            MSR_F15H_PERF_CTR1,
            MSR_F15H_PERF_CTR2,
            MSR_F15H_PERF_CTR3,
            MSR_F15H_PERF_CTR4,
            MSR_F15H_PERF_CTR5,
            MSR_IA32_PERF_CAPABILITIES,
        ]);
        assert_eq!(snapshot.vcpu_id, self.id());
        self.set_regs(&snapshot.regs)?;
        self.set_sregs(&snapshot.sregs)?;
        self.set_debugregs(&snapshot.debug_regs)?;
        for (xcr_index, value) in &snapshot.xcrs {
            self.set_xcr(*xcr_index, *value)?;
        }

        for (msr_index, value) in snapshot.msrs.iter() {
            if self.get_msr(*msr_index) == Ok(*value) {
                continue; // No need to set the MSR since the values are the same.
            }
            if let Err(e) = self.set_msr(*msr_index, *value) {
                if msr_allowlist.contains(msr_index) {
                    warn!(
                        "Failed to set MSR. MSR might not be supported in this kernel. Err: {}",
                        e
                    );
                } else {
                    return Err(e).context(
                        "Failed to set MSR. MSR might not be supported by the CPU or by the kernel,
                        and was not allow-listed.",
                    );
                }
            };
        }
        self.set_xsave(&snapshot.xsave)?;
        self.set_interrupt_state(snapshot.hypervisor_data.clone())?;
        self.restore_timekeeping(host_tsc_reference_moment, snapshot.tsc_offset)?;
        Ok(())
    }
}

/// x86 specific vCPU snapshot.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct VcpuSnapshot {
    pub vcpu_id: usize,
    regs: Regs,
    sregs: Sregs,
    debug_regs: DebugRegs,
    xcrs: BTreeMap<u32, u64>,
    msrs: BTreeMap<u32, u64>,
    xsave: Xsave,
    hypervisor_data: serde_json::Value,
    tsc_offset: u64,
}

impl_downcast!(VcpuX86_64);

// TSC MSR
pub const MSR_IA32_TSC: u32 = 0x00000010;

/// Gets the host CPU's max physical address bits.
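///
/// A hedged sketch of how the result can be interpreted (the variable names are hypothetical):
/// ```ignore
/// let bits = host_phys_addr_bits();
/// let first_unaddressable_gpa = 1u64 << bits; // e.g. bits == 36 -> 64 GiB
/// ```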
#[cfg(any(unix, feature = "haxm", feature = "whpx"))]
pub(crate) fn host_phys_addr_bits() -> u8 {
    // SAFETY: trivially safe
    let highest_ext_function = unsafe { __cpuid(0x80000000) };
    if highest_ext_function.eax >= 0x80000008 {
        // SAFETY: trivially safe
        let addr_size = unsafe { __cpuid(0x80000008) };
        // Low 8 bits of the 0x80000008 leaf: host physical address size in bits.
        addr_size.eax as u8
    } else {
        36
    }
}

/// Initial state for x86_64 VCPUs.
#[derive(Clone, Default)]
pub struct VcpuInitX86_64 {
    /// General-purpose registers.
    pub regs: Regs,

    /// Special registers.
    pub sregs: Sregs,

    /// Floating-point registers.
    pub fpu: Fpu,

    /// Machine-specific registers.
    pub msrs: BTreeMap<u32, u64>,
}

/// Holds the CPU feature configurations that are needed to set up a vCPU.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct CpuConfigX86_64 {
    /// Whether to force using a calibrated TSC leaf (0x15).
    pub force_calibrated_tsc_leaf: bool,

    /// Whether to enable the host CPU topology.
    pub host_cpu_topology: bool,

    /// Whether to expose the HWP feature to the guest.
    pub enable_hwp: bool,

    /// Whether to disable SMT (Simultaneous Multithreading).
    pub no_smt: bool,

    /// Whether to enable the ITMT scheduler.
    pub itmt: bool,

    /// Whether to set a hybrid CPU type.
    pub hybrid_type: Option<CpuHybridType>,
}

impl CpuConfigX86_64 {
    pub fn new(
        force_calibrated_tsc_leaf: bool,
        host_cpu_topology: bool,
        enable_hwp: bool,
        no_smt: bool,
        itmt: bool,
        hybrid_type: Option<CpuHybridType>,
    ) -> Self {
        CpuConfigX86_64 {
            force_calibrated_tsc_leaf,
            host_cpu_topology,
            enable_hwp,
            no_smt,
            itmt,
            hybrid_type,
        }
    }
}

/// A CpuId Entry contains supported feature information for the given processor.
/// This can be modified by the hypervisor to pass additional information to the guest kernel
/// about the hypervisor or vm. Information is returned in the eax, ebx, ecx and edx registers
/// by the cpu for a given function and index/subfunction (passed into the cpu via the eax and ecx
/// registers respectively).
#[repr(C)]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct CpuIdEntry {
    pub function: u32,
    pub index: u32,
    // flags is needed for KVM. We store it on CpuIdEntry to preserve the flags across
    // get_supported_cpuids() -> kvm_cpuid2 -> CpuId -> kvm_cpuid2 -> set_cpuid().
    pub flags: u32,
    pub cpuid: CpuidResult,
}

/// A container for the list of cpu id entries for the hypervisor and underlying cpu.
pub struct CpuId {
    pub cpu_id_entries: Vec<CpuIdEntry>,
}

impl CpuId {
    /// Constructs a new CpuId, with space allocated for `initial_capacity` CpuIdEntries.
    pub fn new(initial_capacity: usize) -> Self {
        CpuId {
            cpu_id_entries: Vec::with_capacity(initial_capacity),
        }
    }
}

#[bitfield]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum DestinationMode {
    Physical = 0,
    Logical = 1,
}

#[bitfield]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TriggerMode {
    Edge = 0,
    Level = 1,
}

#[bitfield]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DeliveryMode {
    Fixed = 0b000,
    Lowest = 0b001,
    SMI = 0b010,        // System management interrupt
    RemoteRead = 0b011, // This is no longer supported by Intel.
    NMI = 0b100,        // Non-maskable interrupt
    Init = 0b101,
    Startup = 0b110,
    External = 0b111,
}

// These MSI structures are for Intel's implementation of MSI. The PCI spec defines most of MSI,
// but the Intel spec defines the format of messages for raising interrupts. The PCI spec defines
// three u32s -- the address, address_high, and data -- but Intel only makes use of the address and
// data. The Intel portion of the specification is in Volume 3 section 10.11.
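/// A hedged construction sketch (assuming the getter/setter accessors generated by
/// `#[bitfield]`; the vector and destination values are illustrative only):
/// ```ignore
/// let mut addr = MsiAddressMessage::new();
/// addr.set_always_0xfee(0xfee); // fixed prefix required by the Intel MSI format
/// addr.set_destination_id(0); // target APIC ID 0
/// addr.set_destination_mode(DestinationMode::Physical);
///
/// let mut data = MsiDataMessage::new();
/// data.set_vector(0x30);
/// data.set_delivery_mode(DeliveryMode::Fixed);
/// ```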
#[bitfield]
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct MsiAddressMessage {
    pub reserved: BitField2,
    #[bits = 1]
    pub destination_mode: DestinationMode,
    pub redirection_hint: BitField1,
    pub reserved_2: BitField8,
    pub destination_id: BitField8,
    // According to Intel's implementation of MSI, these bits must always be 0xfee.
    pub always_0xfee: BitField12,
}

#[bitfield]
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct MsiDataMessage {
    pub vector: BitField8,
    #[bits = 3]
    pub delivery_mode: DeliveryMode,
    pub reserved: BitField3,
    #[bits = 1]
    pub level: Level,
    #[bits = 1]
    pub trigger: TriggerMode,
    pub reserved2: BitField16,
}

#[bitfield]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DeliveryStatus {
    Idle = 0,
    Pending = 1,
}

/// The level of a level-triggered interrupt: asserted or deasserted.
#[bitfield]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Level {
    Deassert = 0,
    Assert = 1,
}

/// Represents an IOAPIC redirection table entry.
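///
/// A minimal construction sketch (assuming the accessors generated by `#[bitfield]`; the
/// vector value is illustrative):
/// ```ignore
/// let mut entry = IoapicRedirectionTableEntry::new();
/// entry.set_vector(0x30);
/// entry.set_delivery_mode(DeliveryMode::Fixed);
/// entry.set_trigger_mode(TriggerMode::Edge);
/// entry.set_interrupt_mask(false); // leave the pin unmasked
/// ```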
#[bitfield]
#[derive(Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct IoapicRedirectionTableEntry {
    vector: BitField8,
    #[bits = 3]
    delivery_mode: DeliveryMode,
    #[bits = 1]
    dest_mode: DestinationMode,
    #[bits = 1]
    delivery_status: DeliveryStatus,
    polarity: BitField1,
    remote_irr: bool,
    #[bits = 1]
    trigger_mode: TriggerMode,
    interrupt_mask: bool, // true iff interrupts are masked.
    reserved: BitField39,
    dest_id: BitField8,
}

/// Number of pins on the standard KVM/IOAPIC.
pub const NUM_IOAPIC_PINS: usize = 24;

/// Represents the state of the IOAPIC.
#[repr(C)]
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct IoapicState {
    /// base_address is the memory base address for this IOAPIC. It cannot be changed.
    pub base_address: u64,
    /// ioregsel register. Used for selecting which entry of the redirect table to read/write.
    pub ioregsel: u8,
    /// ioapicid register. Bits 24 - 27 contain the APIC ID for this device.
    pub ioapicid: u32,
    /// current_interrupt_level_bitmap represents a bitmap of the state of all of the irq lines.
    pub current_interrupt_level_bitmap: u32,
    /// redirect_table contains the irq settings for each irq line.
    #[serde(
        serialize_with = "serialize_arr",
        deserialize_with = "deserialize_seq_to_arr"
    )]
    pub redirect_table: [IoapicRedirectionTableEntry; NUM_IOAPIC_PINS],
}

impl Default for IoapicState {
    fn default() -> IoapicState {
        // SAFETY: trivially safe
        unsafe { std::mem::zeroed() }
    }
}

#[repr(C)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PicSelect {
    Primary = 0,
    Secondary = 1,
}

#[repr(C)]
#[derive(enumn::N, Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
pub enum PicInitState {
    #[default]
    Icw1 = 0,
    Icw2 = 1,
    Icw3 = 2,
    Icw4 = 3,
}

/// Convenience implementation for converting from a u8
impl From<u8> for PicInitState {
    fn from(item: u8) -> Self {
        PicInitState::n(item).unwrap_or_else(|| {
            error!("Invalid PicInitState {}, setting to 0", item);
            PicInitState::Icw1
        })
    }
}

/// Represents the state of the PIC.
#[repr(C)]
#[derive(Clone, Copy, Default, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct PicState {
    /// Edge detection.
    pub last_irr: u8,
    /// Interrupt Request Register.
    pub irr: u8,
    /// Interrupt Mask Register.
    pub imr: u8,
    /// Interrupt Service Register.
    pub isr: u8,
    /// Highest priority, for priority rotation.
    pub priority_add: u8,
    pub irq_base: u8,
    pub read_reg_select: bool,
    pub poll: bool,
    pub special_mask: bool,
    pub init_state: PicInitState,
    pub auto_eoi: bool,
    pub rotate_on_auto_eoi: bool,
    pub special_fully_nested_mode: bool,
    /// The PIC takes either 3 or 4 bytes of initialization command word during
    /// initialization. use_4_byte_icw is true if 4 bytes of ICW are needed.
    pub use_4_byte_icw: bool,
    /// "Edge/Level Control Registers", for edge trigger selection.
    /// When a particular bit is set, the corresponding IRQ is in level-triggered mode. Otherwise
    /// it is in edge-triggered mode.
    pub elcr: u8,
    pub elcr_mask: u8,
}

/// The LapicState represents the state of an x86 CPU's Local APIC.
/// The Local APIC consists of 64 128-bit registers, but only the first 32 bits of each register
/// can be used, so this structure only stores the first 32 bits of each register.
#[repr(C)]
#[derive(Clone, Copy, Serialize, Deserialize)]
pub struct LapicState {
    #[serde(
        serialize_with = "serialize_arr",
        deserialize_with = "deserialize_seq_to_arr"
    )]
    pub regs: [LapicRegister; 64],
}

pub type LapicRegister = u32;

// Rust arrays longer than 32 need custom implementations of Debug.
impl std::fmt::Debug for LapicState {
    fn fmt(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
        self.regs[..].fmt(formatter)
    }
}

// Rust arrays longer than 32 need custom implementations of PartialEq.
impl PartialEq for LapicState {
    fn eq(&self, other: &LapicState) -> bool {
        self.regs[..] == other.regs[..]
    }
}

// Lapic equality is reflexive, so we impl Eq.
impl Eq for LapicState {}

/// The PitState represents the state of the PIT (aka the Programmable Interval Timer).
/// The state is simply the state of its three channels.
#[repr(C)]
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct PitState {
    pub channels: [PitChannelState; 3],
    /// Hypervisor-specific flags for setting the pit state.
    pub flags: u32,
}

/// The PitRWMode enum represents the access mode of a PIT channel.
/// Reads and writes to the PIT happen over port-mapped I/O, which happens one byte at a time,
/// but the count values and latch values are two bytes. So the access mode controls which of the
/// two bytes will be read when.
#[repr(C)]
#[derive(enumn::N, Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum PitRWMode {
    /// None mode means that no access mode has been set.
    None = 0,
    /// Least mode means all reads/writes will read/write the least significant byte.
    Least = 1,
    /// Most mode means all reads/writes will read/write the most significant byte.
    Most = 2,
    /// Both mode means first the least significant byte will be read/written, then the
    /// next read/write will read/write the most significant byte.
    Both = 3,
}

/// Convenience implementation for converting from a u8
impl From<u8> for PitRWMode {
    fn from(item: u8) -> Self {
        PitRWMode::n(item).unwrap_or_else(|| {
            error!("Invalid PitRWMode value {}, setting to 0", item);
            PitRWMode::None
        })
    }
}

/// The PitRWState enum represents the state of reading from or writing to a channel.
/// It is related to PitRWMode; it mainly gives more detail about the state of the channel
/// with respect to PitRWMode::Both.
#[repr(C)]
#[derive(enumn::N, Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum PitRWState {
    /// None means that no access mode has been set.
    None = 0,
    /// LSB means that the channel is in PitRWMode::Least access mode.
    LSB = 1,
    /// MSB means that the channel is in PitRWMode::Most access mode.
    MSB = 2,
    /// Word0 means that the channel is in PitRWMode::Both mode, and the least significant byte
    /// has not been read/written yet.
    Word0 = 3,
    /// Word1 means that the channel is in PitRWMode::Both mode and the least significant byte
    /// has already been read/written, and the next byte to be read/written will be the most
    /// significant byte.
    Word1 = 4,
}

/// Convenience implementation for converting from a u8
impl From<u8> for PitRWState {
    fn from(item: u8) -> Self {
        PitRWState::n(item).unwrap_or_else(|| {
            error!("Invalid PitRWState value {}, setting to 0", item);
            PitRWState::None
        })
    }
}

/// The PitChannelState represents the state of one of the PIT's three counters.
#[repr(C)]
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct PitChannelState {
    /// The starting value for the counter.
    pub count: u32,
    /// Stores the channel count from the last time the count was latched.
    pub latched_count: u16,
    /// Indicates the PitRWState state of reading the latch value.
    pub count_latched: PitRWState,
    /// Indicates whether ReadBack status has been latched.
    pub status_latched: bool,
    /// Stores the channel status from the last time the status was latched. The status contains
    /// information about the access mode of this channel, but changing those bits in the status
    /// will not change the behavior of the pit.
    pub status: u8,
    /// Indicates the PitRWState state of reading the counter.
    pub read_state: PitRWState,
    /// Indicates the PitRWState state of writing the counter.
    pub write_state: PitRWState,
    /// Stores the value with which the counter was initialized. Counters are 16-
    /// bit values with an effective range of 1-65536 (65536 represented by 0).
    pub reload_value: u16,
    /// The command access mode of this channel.
    pub rw_mode: PitRWMode,
    /// The operation mode of this channel.
    pub mode: u8,
    /// Whether or not we are in bcd mode. Not supported by KVM or crosvm's PIT implementation.
    pub bcd: bool,
    /// Value of the gate input pin. This only applies to channel 2.
    pub gate: bool,
    /// Nanosecond timestamp of when the count value was loaded.
    pub count_load_time: u64,
}

// Convenience constructors for IrqRoutes
impl IrqRoute {
    pub fn ioapic_irq_route(irq_num: u32) -> IrqRoute {
        IrqRoute {
            gsi: irq_num,
            source: IrqSource::Irqchip {
                chip: IrqSourceChip::Ioapic,
                pin: irq_num,
            },
        }
    }

    pub fn pic_irq_route(id: IrqSourceChip, irq_num: u32) -> IrqRoute {
        IrqRoute {
            gsi: irq_num,
            source: IrqSource::Irqchip {
                chip: id,
                pin: irq_num % 8,
            },
        }
    }
}

/// State of a VCPU's general purpose registers.
#[repr(C)]
#[derive(Debug, Copy, Clone, Serialize, Deserialize)]
pub struct Regs {
    pub rax: u64,
    pub rbx: u64,
    pub rcx: u64,
    pub rdx: u64,
    pub rsi: u64,
    pub rdi: u64,
    pub rsp: u64,
    pub rbp: u64,
    pub r8: u64,
    pub r9: u64,
    pub r10: u64,
    pub r11: u64,
    pub r12: u64,
    pub r13: u64,
    pub r14: u64,
    pub r15: u64,
    pub rip: u64,
    pub rflags: u64,
}

impl Default for Regs {
    fn default() -> Self {
        Regs {
            rax: 0,
            rbx: 0,
            rcx: 0,
            rdx: 0,
            rsi: 0,
            rdi: 0,
            rsp: 0,
            rbp: 0,
            r8: 0,
            r9: 0,
            r10: 0,
            r11: 0,
            r12: 0,
            r13: 0,
            r14: 0,
            r15: 0,
            rip: 0xfff0, // Reset vector.
            rflags: 0x2, // Bit 1 (0x2) is always 1.
        }
    }
}

/// State of a memory segment.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct Segment {
    pub base: u64,
    /// Limit of the segment - always in bytes, regardless of the granularity (`g`) field.
    pub limit_bytes: u32,
    pub selector: u16,
    pub type_: u8,
    pub present: u8,
    pub dpl: u8,
    pub db: u8,
    pub s: u8,
    pub l: u8,
    pub g: u8,
    pub avl: u8,
}

/// State of a global descriptor table or interrupt descriptor table.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, Serialize, Deserialize)]
pub struct DescriptorTable {
    pub base: u64,
    pub limit: u16,
}

/// State of a VCPU's special registers.
#[repr(C)]
#[derive(Debug, Copy, Clone, Serialize, Deserialize)]
pub struct Sregs {
    pub cs: Segment,
    pub ds: Segment,
    pub es: Segment,
    pub fs: Segment,
    pub gs: Segment,
    pub ss: Segment,
    pub tr: Segment,
    pub ldt: Segment,
    pub gdt: DescriptorTable,
    pub idt: DescriptorTable,
    pub cr0: u64,
    pub cr2: u64,
    pub cr3: u64,
    pub cr4: u64,
    pub cr8: u64,
    pub efer: u64,
}

impl Default for Sregs {
    fn default() -> Self {
        // Intel SDM Vol. 3A, 3.4.5.1 ("Code- and Data-Segment Descriptor Types")
        const SEG_TYPE_DATA: u8 = 0b0000;
        const SEG_TYPE_DATA_WRITABLE: u8 = 0b0010;

        const SEG_TYPE_CODE: u8 = 0b1000;
        const SEG_TYPE_CODE_READABLE: u8 = 0b0010;

        const SEG_TYPE_ACCESSED: u8 = 0b0001;

        // Intel SDM Vol. 3A, 3.4.5 ("Segment Descriptors")
        const SEG_S_SYSTEM: u8 = 0; // System segment.
        const SEG_S_CODE_OR_DATA: u8 = 1; // Data/code segment.

        // 16-bit real-mode code segment (reset vector).
        let code_seg = Segment {
            base: 0xffff0000,
            limit_bytes: 0xffff,
            selector: 0xf000,
            type_: SEG_TYPE_CODE | SEG_TYPE_CODE_READABLE | SEG_TYPE_ACCESSED, // 11
            present: 1,
            s: SEG_S_CODE_OR_DATA,
            ..Default::default()
        };

        // 16-bit real-mode data segment.
        let data_seg = Segment {
            base: 0,
            limit_bytes: 0xffff,
            selector: 0,
            type_: SEG_TYPE_DATA | SEG_TYPE_DATA_WRITABLE | SEG_TYPE_ACCESSED, // 3
            present: 1,
            s: SEG_S_CODE_OR_DATA,
            ..Default::default()
        };

        // 16-bit TSS segment.
        let task_seg = Segment {
            base: 0,
            limit_bytes: 0xffff,
            selector: 0,
            type_: SEG_TYPE_CODE | SEG_TYPE_CODE_READABLE | SEG_TYPE_ACCESSED, // 11
            present: 1,
            s: SEG_S_SYSTEM,
            ..Default::default()
        };

        // Local descriptor table.
        let ldt = Segment {
            base: 0,
            limit_bytes: 0xffff,
            selector: 0,
            type_: SEG_TYPE_DATA | SEG_TYPE_DATA_WRITABLE, // 2
            present: 1,
            s: SEG_S_SYSTEM,
            ..Default::default()
        };

        // Global descriptor table.
        let gdt = DescriptorTable {
            base: 0,
            limit: 0xffff,
        };

        // Interrupt descriptor table.
        let idt = DescriptorTable {
            base: 0,
            limit: 0xffff,
        };

        let cr0 = (1 << 4) // CR0.ET (reserved, always 1)
            | (1 << 30); // CR0.CD (cache disable)

        Sregs {
            cs: code_seg,
            ds: data_seg,
            es: data_seg,
            fs: data_seg,
            gs: data_seg,
            ss: data_seg,
            tr: task_seg,
            ldt,
            gdt,
            idt,
            cr0,
            cr2: 0,
            cr3: 0,
            cr4: 0,
            cr8: 0,
            efer: 0,
        }
    }
}

/// x87 80-bit floating point value.
#[repr(C)]
#[derive(Copy, Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize)]
pub struct FpuReg {
    /// 64-bit mantissa.
    pub significand: u64,

    /// 15-bit biased exponent and sign bit.
    pub sign_exp: u16,
}

impl FpuReg {
    /// Convert an array of 8x16-byte arrays to an array of 8 `FpuReg`.
    ///
    /// Ignores any data in the upper 6 bytes of each element; the values represent 80-bit FPU
    /// registers, so the upper 48 bits are unused.
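    ///
    /// A round-trip sketch (the constant encodes the 80-bit x87 value 1.0: sign/exponent word
    /// 0x3fff with the explicit integer bit set in the significand):
    /// ```ignore
    /// let one = FpuReg { significand: 0x8000_0000_0000_0000, sign_exp: 0x3fff };
    /// let arrays = FpuReg::to_16byte_arrays(&[one; 8]);
    /// assert_eq!(FpuReg::from_16byte_arrays(&arrays), [one; 8]);
    /// ```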
    pub fn from_16byte_arrays(byte_arrays: &[[u8; 16]; 8]) -> [FpuReg; 8] {
        let mut regs = [FpuReg::default(); 8];
        for (dst, src) in regs.iter_mut().zip(byte_arrays.iter()) {
            let tbyte: [u8; 10] = src[0..10].try_into().unwrap();
            *dst = FpuReg::from(tbyte);
        }
        regs
    }

    /// Convert an array of 8 `FpuReg` into 8x16-byte arrays.
    pub fn to_16byte_arrays(regs: &[FpuReg; 8]) -> [[u8; 16]; 8] {
        let mut byte_arrays = [[0u8; 16]; 8];
        for (dst, src) in byte_arrays.iter_mut().zip(regs.iter()) {
            *dst = (*src).into();
        }
        byte_arrays
    }
}

impl From<[u8; 10]> for FpuReg {
    /// Construct a `FpuReg` from an 80-bit representation.
    fn from(value: [u8; 10]) -> FpuReg {
        // These array sub-slices can't fail, but there's no (safe) way to express that in Rust
        // without an `unwrap()`.
        let significand_bytes = value[0..8].try_into().unwrap();
        let significand = u64::from_le_bytes(significand_bytes);
        let sign_exp_bytes = value[8..10].try_into().unwrap();
        let sign_exp = u16::from_le_bytes(sign_exp_bytes);
        FpuReg {
            significand,
            sign_exp,
        }
    }
}

impl From<FpuReg> for [u8; 10] {
    /// Convert an `FpuReg` into its 80-bit "TBYTE" representation.
    fn from(value: FpuReg) -> [u8; 10] {
        let mut bytes = [0u8; 10];
        bytes[0..8].copy_from_slice(&value.significand.to_le_bytes());
        bytes[8..10].copy_from_slice(&value.sign_exp.to_le_bytes());
        bytes
    }
}

impl From<FpuReg> for [u8; 16] {
    /// Convert an `FpuReg` into its 80-bit representation plus 6 unused upper bytes.
    /// This is a convenience function for converting to hypervisor types.
    fn from(value: FpuReg) -> [u8; 16] {
        let mut bytes = [0u8; 16];
        bytes[0..8].copy_from_slice(&value.significand.to_le_bytes());
        bytes[8..10].copy_from_slice(&value.sign_exp.to_le_bytes());
        bytes
    }
}

/// State of a VCPU's floating point unit.
#[repr(C)]
#[derive(Debug, Copy, Clone, Serialize, Deserialize)]
pub struct Fpu {
    pub fpr: [FpuReg; 8],
    pub fcw: u16,
    pub fsw: u16,
    pub ftwx: u8,
    pub last_opcode: u16,
    pub last_ip: u64,
    pub last_dp: u64,
    pub xmm: [[u8; 16usize]; 16usize],
    pub mxcsr: u32,
}

impl Default for Fpu {
    fn default() -> Self {
        Fpu {
            fpr: Default::default(),
            fcw: 0x37f, // Intel SDM Vol. 1, 13.6
            fsw: 0,
            ftwx: 0,
            last_opcode: 0,
            last_ip: 0,
            last_dp: 0,
            xmm: Default::default(),
            mxcsr: 0x1f80, // Intel SDM Vol. 1, 11.6.4
        }
    }
}

/// State of a VCPU's debug registers.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, Serialize, Deserialize)]
pub struct DebugRegs {
    pub db: [u64; 4usize],
    pub dr6: u64,
    pub dr7: u64,
}

/// The hybrid type for Intel hybrid CPUs.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum CpuHybridType {
    /// Intel Atom.
    Atom,
    /// Intel Core.
    Core,
}

/// State of the VCPU's x87 FPU, MMX, XMM, YMM registers.
/// May contain more state depending on enabled extensions.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Xsave {
    data: Vec<u32>,

    // Actual length in bytes. May be smaller than `data.len() * 4` if a length that is not a
    // multiple of 4 bytes was requested.
    len: usize,
}

impl Xsave {
    /// Create a new buffer to store Xsave data.
    ///
    /// # Arguments
    /// * `len` - size in bytes.
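    ///
    /// A sketch of the rounding behavior: a length that is not a multiple of 4 is backed by an
    /// extra `u32`, while `len()` still reports the requested byte count.
    /// ```ignore
    /// let xsave = Xsave::new(10); // backed by 3 u32 words internally
    /// assert_eq!(xsave.len(), 10);
    /// ```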
    pub fn new(len: usize) -> Self {
        Xsave {
            data: vec![0; (len + 3) / 4],
            len,
        }
    }

    /// Returns a raw pointer to the underlying XSAVE data.
    pub fn as_ptr(&self) -> *const c_void {
        self.data.as_ptr() as *const c_void
    }

    /// Returns a mutable raw pointer to the underlying XSAVE data.
    pub fn as_mut_ptr(&mut self) -> *mut c_void {
        self.data.as_mut_ptr() as *mut c_void
    }

    /// Length in bytes of the XSAVE data.
    pub fn len(&self) -> usize {
        self.len
    }

    /// Returns true if the length of the XSAVE data is zero.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }
}