xref: /aosp_15_r20/external/crosvm/hypervisor/src/x86_64.rs (revision bb4ee6a4ae7042d18b07a98463b9c8b875e44b39)
1 // Copyright 2020 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 use std::arch::x86_64::CpuidResult;
6 #[cfg(any(unix, feature = "haxm", feature = "whpx"))]
7 use std::arch::x86_64::__cpuid;
8 use std::arch::x86_64::_rdtsc;
9 use std::collections::BTreeMap;
10 use std::collections::HashSet;
11 
12 use anyhow::Context;
13 use base::custom_serde::deserialize_seq_to_arr;
14 use base::custom_serde::serialize_arr;
15 use base::error;
16 use base::warn;
17 use base::Result;
18 use bit_field::*;
19 use downcast_rs::impl_downcast;
20 use libc::c_void;
21 use serde::Deserialize;
22 use serde::Serialize;
23 use vm_memory::GuestAddress;
24 
25 use crate::Hypervisor;
26 use crate::IrqRoute;
27 use crate::IrqSource;
28 use crate::IrqSourceChip;
29 use crate::Vcpu;
30 use crate::Vm;
31 
// AMD Family 15h core performance event-select (PERF_CTL) and counter
// (PERF_CTR) MSRs. These are allow-listed in `VcpuX86_64::restore` because
// older host kernels may not support restoring them.
const MSR_F15H_PERF_CTL0: u32 = 0xc0010200;
const MSR_F15H_PERF_CTL1: u32 = 0xc0010202;
const MSR_F15H_PERF_CTL2: u32 = 0xc0010204;
const MSR_F15H_PERF_CTL3: u32 = 0xc0010206;
const MSR_F15H_PERF_CTL4: u32 = 0xc0010208;
const MSR_F15H_PERF_CTL5: u32 = 0xc001020a;
const MSR_F15H_PERF_CTR0: u32 = 0xc0010201;
const MSR_F15H_PERF_CTR1: u32 = 0xc0010203;
const MSR_F15H_PERF_CTR2: u32 = 0xc0010205;
const MSR_F15H_PERF_CTR3: u32 = 0xc0010207;
const MSR_F15H_PERF_CTR4: u32 = 0xc0010209;
const MSR_F15H_PERF_CTR5: u32 = 0xc001020b;
// Intel performance monitoring capabilities MSR; also allow-listed in
// `VcpuX86_64::restore` for the same reason.
const MSR_IA32_PERF_CAPABILITIES: u32 = 0x00000345;
45 
/// A trait for managing cpuids for an x86_64 hypervisor and for checking its capabilities.
pub trait HypervisorX86_64: Hypervisor {
    /// Gets the set of CPUID values supported by this hypervisor on the host system.
    fn get_supported_cpuid(&self) -> Result<CpuId>;

    /// Gets the list of MSR indices supported by this hypervisor.
    fn get_msr_index_list(&self) -> Result<Vec<u32>>;
}
54 
/// A wrapper for using a VM on x86_64 and getting/setting its state.
pub trait VmX86_64: Vm {
    /// Gets the `HypervisorX86_64` that created this VM.
    fn get_hypervisor(&self) -> &dyn HypervisorX86_64;

    /// Create a Vcpu with the specified Vcpu ID.
    fn create_vcpu(&self, id: usize) -> Result<Box<dyn VcpuX86_64>>;

    /// Sets the address of the three-page region in the VM's address space.
    /// Based on the name, this region is used for the TSS; the exact layout is
    /// hypervisor-defined — see the concrete `Vm` implementation.
    fn set_tss_addr(&self, addr: GuestAddress) -> Result<()>;

    /// Sets the address of a one-page region in the VM's address space.
    /// Based on the name, this is the identity-map page used by some
    /// hypervisors (e.g. KVM) — see the concrete `Vm` implementation.
    fn set_identity_map_addr(&self, addr: GuestAddress) -> Result<()>;
}
69 
/// A wrapper around creating and using a VCPU on x86_64.
pub trait VcpuX86_64: Vcpu {
    /// Sets or clears the flag that requests the VCPU to exit when it becomes possible to inject
    /// interrupts into the guest.
    fn set_interrupt_window_requested(&self, requested: bool);

    /// Checks if we can inject an interrupt into the VCPU.
    fn ready_for_interrupt(&self) -> bool;

    /// Injects interrupt vector `irq` into the VCPU.
    ///
    /// This function should only be called when [`Self::ready_for_interrupt`] returns true.
    /// Otherwise the interrupt injection may fail or the next VCPU run may fail. However, if
    /// [`Self::interrupt`] returns [`Ok`], the implementation must guarantee that the interrupt
    /// isn't injected in an uninterruptible window (e.g. right after the mov ss instruction).
    ///
    /// The caller should avoid calling this function more than 1 time for one VMEXIT, because the
    /// hypervisor may behave differently: some hypervisors(e.g. WHPX, KVM) will only try to inject
    /// the last `irq` requested, while some other hypervisors(e.g. HAXM) may try to inject all
    /// `irq`s requested.
    fn interrupt(&self, irq: u8) -> Result<()>;

    /// Injects a non-maskable interrupt into the VCPU.
    fn inject_nmi(&self) -> Result<()>;

    /// Gets the VCPU general purpose registers.
    fn get_regs(&self) -> Result<Regs>;

    /// Sets the VCPU general purpose registers.
    fn set_regs(&self, regs: &Regs) -> Result<()>;

    /// Gets the VCPU special registers (segment, control, and table registers).
    fn get_sregs(&self) -> Result<Sregs>;

    /// Sets the VCPU special registers.
    fn set_sregs(&self, sregs: &Sregs) -> Result<()>;

    /// Gets the VCPU FPU registers.
    fn get_fpu(&self) -> Result<Fpu>;

    /// Sets the VCPU FPU registers.
    fn set_fpu(&self, fpu: &Fpu) -> Result<()>;

    /// Gets the VCPU debug registers.
    fn get_debugregs(&self) -> Result<DebugRegs>;

    /// Sets the VCPU debug registers.
    fn set_debugregs(&self, debugregs: &DebugRegs) -> Result<()>;

    /// Gets the VCPU extended control registers, as a map of XCR index to value.
    fn get_xcrs(&self) -> Result<BTreeMap<u32, u64>>;

    /// Sets a VCPU extended control register.
    fn set_xcr(&self, xcr: u32, value: u64) -> Result<()>;

    /// Gets the VCPU x87 FPU, MMX, XMM, YMM and MXCSR registers.
    fn get_xsave(&self) -> Result<Xsave>;

    /// Sets the VCPU x87 FPU, MMX, XMM, YMM and MXCSR registers.
    fn set_xsave(&self, xsave: &Xsave) -> Result<()>;

    /// Gets interrupt state (hypervisor specific) for this VCPU that must be
    /// saved/restored for snapshotting. The opaque JSON value is produced and
    /// consumed only by the same hypervisor implementation.
    fn get_interrupt_state(&self) -> Result<serde_json::Value>;

    /// Sets interrupt state (hypervisor specific) for this VCPU. Only used for
    /// snapshotting.
    fn set_interrupt_state(&self, data: serde_json::Value) -> Result<()>;

    /// Gets a single model-specific register's value.
    fn get_msr(&self, msr_index: u32) -> Result<u64>;

    /// Gets the model-specific registers. Returns all the MSRs for the VCPU.
    fn get_all_msrs(&self) -> Result<BTreeMap<u32, u64>>;

    /// Sets a single model-specific register's value.
    fn set_msr(&self, msr_index: u32, value: u64) -> Result<()>;

    /// Sets up the data returned by the CPUID instruction.
    fn set_cpuid(&self, cpuid: &CpuId) -> Result<()>;

    /// Sets up debug registers and configure vcpu for handling guest debug events.
    fn set_guest_debug(&self, addrs: &[GuestAddress], enable_singlestep: bool) -> Result<()>;

    /// This function should be called after `Vcpu::run` returns `VcpuExit::Cpuid`, and `entry`
    /// should represent the result of emulating the CPUID instruction. The `handle_cpuid` function
    /// will then set the appropriate registers on the vcpu.
    fn handle_cpuid(&mut self, entry: &CpuIdEntry) -> Result<()>;

    /// Gets the guest->host TSC offset.
    ///
    /// The default implementation uses [`VcpuX86_64::get_msr()`] to read the guest TSC.
    fn get_tsc_offset(&self) -> Result<u64> {
        // SAFETY:
        // Safe because _rdtsc takes no arguments
        let host_before_tsc = unsafe { _rdtsc() };

        // get guest TSC value from our hypervisor
        let guest_tsc = self.get_msr(crate::MSR_IA32_TSC)?;

        // SAFETY:
        // Safe because _rdtsc takes no arguments
        let host_after_tsc = unsafe { _rdtsc() };

        // Average the before and after host tsc to get the best value. The sum
        // is computed in u128 so it cannot overflow.
        let host_tsc = ((host_before_tsc as u128 + host_after_tsc as u128) / 2) as u64;

        // wrapping_sub because the guest TSC may legitimately be behind the
        // host TSC, making the offset "negative" in two's complement.
        Ok(guest_tsc.wrapping_sub(host_tsc))
    }

    /// Sets the guest->host TSC offset.
    ///
    /// The default implementation uses [`VcpuX86_64::set_tsc_value()`] to set the TSC value.
    ///
    /// It sets TSC_OFFSET (VMCS / CB field) by setting the TSC MSR to the current
    /// host TSC value plus the desired offset. We rely on the fact that hypervisors
    /// determine the value of TSC_OFFSET by computing TSC_OFFSET = `new_tsc_value - _rdtsc()` =
    /// `_rdtsc() + offset - _rdtsc()` ~= `offset`. Note that the ~= is important: this is an
    /// approximate operation, because the two _rdtsc() calls
    /// are separated by at least a few ticks.
    ///
    /// Note: TSC_OFFSET, host TSC, guest TSC, and TSC MSR are all different
    /// concepts.
    /// * When a guest executes rdtsc, the value (guest TSC) returned is host_tsc * TSC_MULTIPLIER +
    ///   TSC_OFFSET + TSC_ADJUST.
    /// * The TSC MSR is a special MSR that when written to by the host, will cause TSC_OFFSET to be
    ///   set accordingly by the hypervisor.
    /// * When the guest *writes* to TSC MSR, it actually changes the TSC_ADJUST MSR *for the
    ///   guest*. Generally this only happens if the guest is trying to re-zero or synchronize
    ///   TSCs.
    fn set_tsc_offset(&self, offset: u64) -> Result<()> {
        // SAFETY: _rdtsc takes no arguments.
        let host_tsc = unsafe { _rdtsc() };
        self.set_tsc_value(host_tsc.wrapping_add(offset))
    }

    /// Sets the guest TSC exactly to the provided value.
    ///
    /// The default implementation sets the guest's TSC by writing the value to the MSR directly.
    ///
    /// See [`VcpuX86_64::set_tsc_offset()`] for an explanation of how this value is actually read
    /// by the guest after being set.
    fn set_tsc_value(&self, value: u64) -> Result<()> {
        self.set_msr(crate::MSR_IA32_TSC, value)
    }

    /// Some hypervisors require special handling to restore timekeeping when
    /// a snapshot is restored. They are provided with a host TSC reference
    /// moment, guaranteed to be the same across all Vcpus, and the Vcpu's TSC
    /// offset at the moment it was snapshotted.
    fn restore_timekeeping(&self, host_tsc_reference_moment: u64, tsc_offset: u64) -> Result<()>;

    /// Snapshot vCPU state by collecting registers, MSRs, xsave area, and
    /// hypervisor-specific interrupt state into a serializable [`VcpuSnapshot`].
    fn snapshot(&self) -> anyhow::Result<VcpuSnapshot> {
        Ok(VcpuSnapshot {
            vcpu_id: self.id(),
            regs: self.get_regs()?,
            sregs: self.get_sregs()?,
            debug_regs: self.get_debugregs()?,
            xcrs: self.get_xcrs()?,
            msrs: self.get_all_msrs()?,
            xsave: self.get_xsave()?,
            hypervisor_data: self.get_interrupt_state()?,
            tsc_offset: self.get_tsc_offset()?,
        })
    }

    /// Restores this vCPU's state from `snapshot`, which must have been taken
    /// from the vCPU with the same id (asserted below). `host_tsc_reference_moment`
    /// is the common host TSC reference passed through to
    /// [`VcpuX86_64::restore_timekeeping`].
    fn restore(
        &mut self,
        snapshot: &VcpuSnapshot,
        host_tsc_reference_moment: u64,
    ) -> anyhow::Result<()> {
        // List of MSRs that may fail to restore due to lack of support in the host kernel.
        // Some hosts may be running older kernels which do not support all MSRs, but
        // get_all_msrs will still fetch the MSRs supported by the CPU. Trying to set those MSRs
        // will result in failures, so they will throw a warning instead.
        let msr_allowlist = HashSet::from([
            MSR_F15H_PERF_CTL0,
            MSR_F15H_PERF_CTL1,
            MSR_F15H_PERF_CTL2,
            MSR_F15H_PERF_CTL3,
            MSR_F15H_PERF_CTL4,
            MSR_F15H_PERF_CTL5,
            MSR_F15H_PERF_CTR0,
            MSR_F15H_PERF_CTR1,
            MSR_F15H_PERF_CTR2,
            MSR_F15H_PERF_CTR3,
            MSR_F15H_PERF_CTR4,
            MSR_F15H_PERF_CTR5,
            MSR_IA32_PERF_CAPABILITIES,
        ]);
        assert_eq!(snapshot.vcpu_id, self.id());
        self.set_regs(&snapshot.regs)?;
        self.set_sregs(&snapshot.sregs)?;
        self.set_debugregs(&snapshot.debug_regs)?;
        for (xcr_index, value) in &snapshot.xcrs {
            self.set_xcr(*xcr_index, *value)?;
        }

        for (msr_index, value) in snapshot.msrs.iter() {
            if self.get_msr(*msr_index) == Ok(*value) {
                continue; // no need to set MSR since the values are the same.
            }
            if let Err(e) = self.set_msr(*msr_index, *value) {
                if msr_allowlist.contains(msr_index) {
                    warn!(
                        "Failed to set MSR. MSR might not be supported in this kernel. Err: {}",
                        e
                    );
                } else {
                    return Err(e).context(
                        "Failed to set MSR. MSR might not be supported by the CPU or by the kernel,
                         and was not allow-listed.",
                    );
                }
            };
        }
        self.set_xsave(&snapshot.xsave)?;
        self.set_interrupt_state(snapshot.hypervisor_data.clone())?;
        self.restore_timekeeping(host_tsc_reference_moment, snapshot.tsc_offset)?;
        Ok(())
    }
}
293 
/// x86 specific vCPU snapshot.
///
/// Produced by [`VcpuX86_64::snapshot`] and consumed by [`VcpuX86_64::restore`].
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct VcpuSnapshot {
    /// Id of the vCPU this snapshot was taken from; restore asserts it matches.
    pub vcpu_id: usize,
    // General purpose registers.
    regs: Regs,
    // Special (segment/control/table) registers.
    sregs: Sregs,
    // Debug registers.
    debug_regs: DebugRegs,
    // Extended control registers, keyed by XCR index.
    xcrs: BTreeMap<u32, u64>,
    // All MSRs readable at snapshot time, keyed by MSR index.
    msrs: BTreeMap<u32, u64>,
    // x87/MMX/XMM/YMM/MXCSR state.
    xsave: Xsave,
    // Opaque hypervisor-specific interrupt state (see `get_interrupt_state`).
    hypervisor_data: serde_json::Value,
    // Guest->host TSC offset at snapshot time (see `get_tsc_offset`).
    tsc_offset: u64,
}
307 
// Allow `dyn VcpuX86_64` trait objects to be downcast to concrete types.
impl_downcast!(VcpuX86_64);

// TSC MSR index (IA32_TIME_STAMP_COUNTER).
pub const MSR_IA32_TSC: u32 = 0x00000010;
312 
/// Gets host cpu max physical address bits.
///
/// Queries CPUID leaf 0x80000008 when available; otherwise falls back to 36
/// bits.
#[cfg(any(unix, feature = "haxm", feature = "whpx"))]
pub(crate) fn host_phys_addr_bits() -> u8 {
    // SAFETY: trivially safe
    let max_ext_leaf = unsafe { __cpuid(0x80000000) }.eax;
    if max_ext_leaf < 0x80000008 {
        // Leaf 0x80000008 is unavailable on this CPU; report 36 bits.
        return 36;
    }
    // SAFETY: trivially safe
    let leaf = unsafe { __cpuid(0x80000008) };
    // Low 8 bits of 0x80000008 leaf: host physical address size in bits.
    leaf.eax as u8
}
327 
/// Initial state for x86_64 VCPUs.
///
/// All fields default to their `Default` values (`Regs::default` sets the
/// reset-vector rip/rflags; see its impl).
#[derive(Clone, Default)]
pub struct VcpuInitX86_64 {
    /// General-purpose registers.
    pub regs: Regs,

    /// Special registers.
    pub sregs: Sregs,

    /// Floating-point registers.
    pub fpu: Fpu,

    /// Machine-specific registers, keyed by MSR index.
    pub msrs: BTreeMap<u32, u64>,
}
343 
/// Hold the CPU feature configurations that are needed to setup a vCPU.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct CpuConfigX86_64 {
    /// whether to force using a calibrated TSC leaf (0x15).
    pub force_calibrated_tsc_leaf: bool,

    /// whether enabling host cpu topology.
    pub host_cpu_topology: bool,

    /// whether expose HWP feature to the guest.
    pub enable_hwp: bool,

    /// whether disabling SMT (Simultaneous Multithreading).
    pub no_smt: bool,

    /// whether enabling ITMT scheduler
    pub itmt: bool,

    /// whether setting hybrid CPU type
    pub hybrid_type: Option<CpuHybridType>,
}
365 
366 impl CpuConfigX86_64 {
new( force_calibrated_tsc_leaf: bool, host_cpu_topology: bool, enable_hwp: bool, no_smt: bool, itmt: bool, hybrid_type: Option<CpuHybridType>, ) -> Self367     pub fn new(
368         force_calibrated_tsc_leaf: bool,
369         host_cpu_topology: bool,
370         enable_hwp: bool,
371         no_smt: bool,
372         itmt: bool,
373         hybrid_type: Option<CpuHybridType>,
374     ) -> Self {
375         CpuConfigX86_64 {
376             force_calibrated_tsc_leaf,
377             host_cpu_topology,
378             enable_hwp,
379             no_smt,
380             itmt,
381             hybrid_type,
382         }
383     }
384 }
385 
/// A CpuId Entry contains supported feature information for the given processor.
/// This can be modified by the hypervisor to pass additional information to the guest kernel
/// about the hypervisor or vm. Information is returned in the eax, ebx, ecx and edx registers
/// by the cpu for a given function and index/subfunction (passed into the cpu via the eax and ecx
/// register respectively).
#[repr(C)]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct CpuIdEntry {
    pub function: u32,
    pub index: u32,
    // flags is needed for KVM.  We store it on CpuIdEntry to preserve the flags across
    // get_supported_cpuids() -> kvm_cpuid2 -> CpuId -> kvm_cpuid2 -> set_cpuid().
    pub flags: u32,
    pub cpuid: CpuidResult,
}

/// A container for the list of cpu id entries for the hypervisor and underlying cpu.
pub struct CpuId {
    pub cpu_id_entries: Vec<CpuIdEntry>,
}

impl CpuId {
    /// Builds an empty `CpuId` whose entry list has room for at least
    /// `initial_capacity` entries before reallocating.
    pub fn new(initial_capacity: usize) -> Self {
        Self {
            cpu_id_entries: Vec::with_capacity(initial_capacity),
        }
    }
}
415 
/// Interrupt destination mode: physical APIC ID vs. logical destination.
#[bitfield]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum DestinationMode {
    Physical = 0,
    Logical = 1,
}
422 
/// Interrupt trigger mode: edge- vs. level-triggered.
#[bitfield]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TriggerMode {
    Edge = 0,
    Level = 1,
}
429 
/// Interrupt delivery mode, a 3-bit field shared by MSI data messages and
/// IOAPIC redirection table entries.
#[bitfield]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DeliveryMode {
    Fixed = 0b000,      // Deliver to every destination processor listed.
    Lowest = 0b001,     // Deliver to the lowest-priority destination processor.
    SMI = 0b010,        // System management interrupt
    RemoteRead = 0b011, // This is no longer supported by intel.
    NMI = 0b100,        // Non maskable interrupt
    Init = 0b101,
    Startup = 0b110,
    External = 0b111,
}
442 
// These MSI structures are for Intel's implementation of MSI.  The PCI spec defines most of MSI,
// but the Intel spec defines the format of messages for raising interrupts.  The PCI spec defines
// three u32s -- the address, address_high, and data -- but Intel only makes use of the address and
// data.  The Intel portion of the specification is in Volume 3 section 10.11.
#[bitfield]
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct MsiAddressMessage {
    pub reserved: BitField2,
    #[bits = 1]
    pub destination_mode: DestinationMode,
    pub redirection_hint: BitField1,
    pub reserved_2: BitField8,
    /// Target APIC ID (physical) or logical destination, per `destination_mode`.
    pub destination_id: BitField8,
    // According to Intel's implementation of MSI, these bits must always be 0xfee.
    pub always_0xfee: BitField12,
}
459 
/// The data portion of an Intel MSI message (see the note on
/// `MsiAddressMessage` above for spec references).
#[bitfield]
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct MsiDataMessage {
    /// Interrupt vector to deliver.
    pub vector: BitField8,
    #[bits = 3]
    pub delivery_mode: DeliveryMode,
    pub reserved: BitField3,
    #[bits = 1]
    pub level: Level,
    #[bits = 1]
    pub trigger: TriggerMode,
    pub reserved2: BitField16,
}
473 
/// Delivery status of an interrupt: idle, or send pending.
#[bitfield]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DeliveryStatus {
    Idle = 0,
    Pending = 1,
}
480 
/// The level of a level-triggered interrupt: asserted or deasserted.
#[bitfield]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Level {
    Deassert = 0,
    Assert = 1,
}
488 
/// Represents a IOAPIC redirection table entry.
#[bitfield]
#[derive(Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct IoapicRedirectionTableEntry {
    /// Interrupt vector to raise on the destination.
    vector: BitField8,
    #[bits = 3]
    delivery_mode: DeliveryMode,
    #[bits = 1]
    dest_mode: DestinationMode,
    #[bits = 1]
    delivery_status: DeliveryStatus,
    /// Pin polarity bit.
    polarity: BitField1,
    /// Remote IRR bit, used for level-triggered interrupt bookkeeping.
    remote_irr: bool,
    #[bits = 1]
    trigger_mode: TriggerMode,
    interrupt_mask: bool, // true iff interrupts are masked.
    reserved: BitField39,
    /// Destination APIC ID (or logical destination, per `dest_mode`).
    dest_id: BitField8,
}
508 
/// Number of pins on the standard KVM/IOAPIC.
pub const NUM_IOAPIC_PINS: usize = 24;

/// Represents the state of the IOAPIC.
#[repr(C)]
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct IoapicState {
    /// base_address is the memory base address for this IOAPIC. It cannot be changed.
    pub base_address: u64,
    /// ioregsel register. Used for selecting which entry of the redirect table to read/write.
    pub ioregsel: u8,
    /// ioapicid register. Bits 24 - 27 contain the APIC ID for this device.
    pub ioapicid: u32,
    /// current_interrupt_level_bitmap represents a bitmap of the state of all of the irq lines
    pub current_interrupt_level_bitmap: u32,
    /// redirect_table contains the irq settings for each irq line
    // Custom (de)serializers are needed because serde cannot derive them for
    // arrays of this element type.
    #[serde(
        serialize_with = "serialize_arr",
        deserialize_with = "deserialize_seq_to_arr"
    )]
    pub redirect_table: [IoapicRedirectionTableEntry; NUM_IOAPIC_PINS],
}
531 
impl Default for IoapicState {
    /// All-zero state: every numeric field is 0 and every redirection entry
    /// has all bits clear.
    fn default() -> IoapicState {
        // SAFETY: trivially safe — IoapicState is repr(C) and composed solely
        // of integers and bitfield structs, for which an all-zero bit pattern
        // is a valid value.
        unsafe { std::mem::zeroed() }
    }
}
538 
/// Selects one of the two cascaded PICs.
#[repr(C)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PicSelect {
    Primary = 0,
    Secondary = 1,
}
545 
/// PIC initialization progress: which Initialization Command Word (ICW) the
/// PIC expects next.
#[repr(C)]
#[derive(enumn::N, Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
pub enum PicInitState {
    #[default]
    Icw1 = 0,
    Icw2 = 1,
    Icw3 = 2,
    Icw4 = 3,
}
555 
556 /// Convenience implementation for converting from a u8
557 impl From<u8> for PicInitState {
from(item: u8) -> Self558     fn from(item: u8) -> Self {
559         PicInitState::n(item).unwrap_or_else(|| {
560             error!("Invalid PicInitState {}, setting to 0", item);
561             PicInitState::Icw1
562         })
563     }
564 }
565 
/// Represents the state of the PIC.
#[repr(C)]
#[derive(Clone, Copy, Default, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct PicState {
    /// Edge detection.
    pub last_irr: u8,
    /// Interrupt Request Register.
    pub irr: u8,
    /// Interrupt Mask Register.
    pub imr: u8,
    /// Interrupt Service Register.
    pub isr: u8,
    /// Highest priority, for priority rotation.
    pub priority_add: u8,
    /// Base interrupt vector for this PIC's IRQs.
    pub irq_base: u8,
    /// Whether reads return the ISR (true) or the IRR (false).
    pub read_reg_select: bool,
    /// Poll mode flag.
    pub poll: bool,
    /// Special mask mode flag.
    pub special_mask: bool,
    /// Which initialization command word the PIC expects next.
    pub init_state: PicInitState,
    /// Automatic end-of-interrupt mode.
    pub auto_eoi: bool,
    /// Rotate priorities on automatic end-of-interrupt.
    pub rotate_on_auto_eoi: bool,
    /// Special fully nested mode (for cascaded configurations).
    pub special_fully_nested_mode: bool,
    /// PIC takes either 3 or 4 bytes of initialization command word during
    /// initialization. use_4_byte_icw is true if 4 bytes of ICW are needed.
    pub use_4_byte_icw: bool,
    /// "Edge/Level Control Registers", for edge trigger selection.
    /// When a particular bit is set, the corresponding IRQ is in level-triggered mode. Otherwise
    /// it is in edge-triggered mode.
    pub elcr: u8,
    /// Mask of which elcr bits are writable.
    pub elcr_mask: u8,
}
597 
/// The LapicState represents the state of an x86 CPU's Local APIC.
/// The Local APIC consists of 64 128-bit registers, but only the first 32-bits of each register
/// can be used, so this structure only stores the first 32-bits of each register.
#[repr(C)]
#[derive(Clone, Copy, Serialize, Deserialize)]
pub struct LapicState {
    // Custom (de)serializers are needed because serde cannot derive them for
    // a 64-element array.
    #[serde(
        serialize_with = "serialize_arr",
        deserialize_with = "deserialize_seq_to_arr"
    )]
    pub regs: [LapicRegister; 64],
}

/// The usable low 32 bits of one Local APIC register.
pub type LapicRegister = u32;
612 
613 // rust arrays longer than 32 need custom implementations of Debug
614 impl std::fmt::Debug for LapicState {
fmt(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result615     fn fmt(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
616         self.regs[..].fmt(formatter)
617     }
618 }
619 
620 // rust arrays longer than 32 need custom implementations of PartialEq
621 impl PartialEq for LapicState {
eq(&self, other: &LapicState) -> bool622     fn eq(&self, other: &LapicState) -> bool {
623         self.regs[..] == other.regs[..]
624     }
625 }
626 
627 // Lapic equality is reflexive, so we impl Eq
628 impl Eq for LapicState {}
629 
/// The PitState represents the state of the PIT (aka the Programmable Interval Timer).
/// The state is simply the state of its three channels.
#[repr(C)]
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct PitState {
    /// One state per PIT counter channel.
    pub channels: [PitChannelState; 3],
    /// Hypervisor-specific flags for setting the pit state.
    pub flags: u32,
}
639 
/// The PitRWMode enum represents the access mode of a PIT channel.
/// Reads and writes to the Pit happen over Port-mapped I/O, which happens one byte at a time,
/// but the count values and latch values are two bytes. So the access mode controls which of the
/// two bytes will be read when.
#[repr(C)]
#[derive(enumn::N, Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum PitRWMode {
    /// None mode means that no access mode has been set.
    None = 0,
    /// Least mode means all reads/writes will read/write the least significant byte.
    Least = 1,
    /// Most mode means all reads/writes will read/write the most significant byte.
    Most = 2,
    /// Both mode means first the least significant byte will be read/written, then the
    /// next read/write will read/write the most significant byte.
    Both = 3,
}
657 
658 /// Convenience implementation for converting from a u8
659 impl From<u8> for PitRWMode {
from(item: u8) -> Self660     fn from(item: u8) -> Self {
661         PitRWMode::n(item).unwrap_or_else(|| {
662             error!("Invalid PitRWMode value {}, setting to 0", item);
663             PitRWMode::None
664         })
665     }
666 }
667 
/// The PitRWState enum represents the state of reading to or writing from a channel.
/// This is related to the PitRWMode, it mainly gives more detail about the state of the channel
/// with respect to PitRWMode::Both.
#[repr(C)]
#[derive(enumn::N, Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum PitRWState {
    /// None mode means that no access mode has been set.
    None = 0,
    /// LSB means that the channel is in PitRWMode::Least access mode.
    LSB = 1,
    /// MSB means that the channel is in PitRWMode::Most access mode.
    MSB = 2,
    /// Word0 means that the channel is in PitRWMode::Both mode, and the least significant byte
    /// has not been read/written yet.
    Word0 = 3,
    /// Word1 means that the channel is in PitRWMode::Both mode and the least significant byte
    /// has already been read/written, and the next byte to be read/written will be the most
    /// significant byte.
    Word1 = 4,
}
688 
689 /// Convenience implementation for converting from a u8
690 impl From<u8> for PitRWState {
from(item: u8) -> Self691     fn from(item: u8) -> Self {
692         PitRWState::n(item).unwrap_or_else(|| {
693             error!("Invalid PitRWState value {}, setting to 0", item);
694             PitRWState::None
695         })
696     }
697 }
698 
/// The PitChannelState represents the state of one of the PIT's three counters.
#[repr(C)]
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct PitChannelState {
    /// The starting value for the counter.
    pub count: u32,
    /// Stores the channel count from the last time the count was latched.
    pub latched_count: u16,
    /// Indicates the PitRWState state of reading the latch value.
    pub count_latched: PitRWState,
    /// Indicates whether ReadBack status has been latched.
    pub status_latched: bool,
    /// Stores the channel status from the last time the status was latched. The status contains
    /// information about the access mode of this channel, but changing those bits in the status
    /// will not change the behavior of the pit.
    pub status: u8,
    /// Indicates the PitRWState state of reading the counter.
    pub read_state: PitRWState,
    /// Indicates the PitRWState state of writing the counter.
    pub write_state: PitRWState,
    /// Stores the value with which the counter was initialized. Counters are 16-
    /// bit values with an effective range of 1-65536 (65536 represented by 0).
    pub reload_value: u16,
    /// The command access mode of this channel.
    pub rw_mode: PitRWMode,
    /// The operation mode of this channel.
    pub mode: u8,
    /// Whether or not we are in bcd mode. Not supported by KVM or crosvm's PIT implementation.
    pub bcd: bool,
    /// Value of the gate input pin. This only applies to channel 2.
    pub gate: bool,
    /// Nanosecond timestamp of when the count value was loaded.
    pub count_load_time: u64,
}
733 
734 // Convenience constructors for IrqRoutes
735 impl IrqRoute {
ioapic_irq_route(irq_num: u32) -> IrqRoute736     pub fn ioapic_irq_route(irq_num: u32) -> IrqRoute {
737         IrqRoute {
738             gsi: irq_num,
739             source: IrqSource::Irqchip {
740                 chip: IrqSourceChip::Ioapic,
741                 pin: irq_num,
742             },
743         }
744     }
745 
pic_irq_route(id: IrqSourceChip, irq_num: u32) -> IrqRoute746     pub fn pic_irq_route(id: IrqSourceChip, irq_num: u32) -> IrqRoute {
747         IrqRoute {
748             gsi: irq_num,
749             source: IrqSource::Irqchip {
750                 chip: id,
751                 pin: irq_num % 8,
752             },
753         }
754     }
755 }
756 
/// State of a VCPU's general purpose registers.
#[repr(C)]
#[derive(Debug, Copy, Clone, Serialize, Deserialize)]
pub struct Regs {
    pub rax: u64,
    pub rbx: u64,
    pub rcx: u64,
    pub rdx: u64,
    pub rsi: u64,
    pub rdi: u64,
    pub rsp: u64,
    pub rbp: u64,
    pub r8: u64,
    pub r9: u64,
    pub r10: u64,
    pub r11: u64,
    pub r12: u64,
    pub r13: u64,
    pub r14: u64,
    pub r15: u64,
    /// Instruction pointer.
    pub rip: u64,
    /// Flags register.
    pub rflags: u64,
}
780 
781 impl Default for Regs {
default() -> Self782     fn default() -> Self {
783         Regs {
784             rax: 0,
785             rbx: 0,
786             rcx: 0,
787             rdx: 0,
788             rsi: 0,
789             rdi: 0,
790             rsp: 0,
791             rbp: 0,
792             r8: 0,
793             r9: 0,
794             r10: 0,
795             r11: 0,
796             r12: 0,
797             r13: 0,
798             r14: 0,
799             r15: 0,
800             rip: 0xfff0, // Reset vector.
801             rflags: 0x2, // Bit 1 (0x2) is always 1.
802         }
803     }
804 }
805 
/// State of a memory segment.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct Segment {
    /// Linear base address of the segment.
    pub base: u64,
    /// Limit of the segment - always in bytes, regardless of granularity (`g`) field.
    pub limit_bytes: u32,
    /// Segment selector value.
    pub selector: u16,
    /// Descriptor type bits (meaning depends on `s`; see `Sregs::default` for examples).
    pub type_: u8,
    /// Present flag (1 = segment is present).
    pub present: u8,
    /// Descriptor privilege level (ring 0-3).
    pub dpl: u8,
    /// Default operation size / big flag (D/B).
    pub db: u8,
    /// Descriptor kind: 0 = system segment, 1 = code or data segment.
    pub s: u8,
    /// 64-bit code segment flag (L).
    pub l: u8,
    /// Granularity flag; note `limit_bytes` is always in bytes regardless of this bit.
    pub g: u8,
    /// Available-for-software bit (AVL).
    pub avl: u8,
}
823 
/// State of a global descriptor table or interrupt descriptor table.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, Serialize, Deserialize)]
pub struct DescriptorTable {
    /// Linear base address of the table.
    pub base: u64,
    /// Table limit (size in bytes minus one, per x86 GDTR/IDTR convention).
    pub limit: u16,
}
831 
/// State of a VCPU's special registers.
#[repr(C)]
#[derive(Debug, Copy, Clone, Serialize, Deserialize)]
pub struct Sregs {
    // Segment registers.
    pub cs: Segment,
    pub ds: Segment,
    pub es: Segment,
    pub fs: Segment,
    pub gs: Segment,
    pub ss: Segment,
    /// Task register.
    pub tr: Segment,
    /// Local descriptor table register.
    pub ldt: Segment,
    /// Global descriptor table register.
    pub gdt: DescriptorTable,
    /// Interrupt descriptor table register.
    pub idt: DescriptorTable,
    // Control registers.
    pub cr0: u64,
    pub cr2: u64,
    pub cr3: u64,
    pub cr4: u64,
    pub cr8: u64,
    /// Extended Feature Enable Register (EFER) MSR value.
    pub efer: u64,
}
853 
impl Default for Sregs {
    /// Architectural power-on reset state of the special registers:
    /// real mode, CS pointing at the reset vector, caching disabled.
    fn default() -> Self {
        // Intel SDM Vol. 3A, 3.4.5.1 ("Code- and Data-Segment Descriptor Types")
        const SEG_TYPE_DATA: u8 = 0b0000;
        const SEG_TYPE_DATA_WRITABLE: u8 = 0b0010;

        const SEG_TYPE_CODE: u8 = 0b1000;
        const SEG_TYPE_CODE_READABLE: u8 = 0b0010;

        const SEG_TYPE_ACCESSED: u8 = 0b0001;

        // Intel SDM Vol. 3A, 3.4.5 ("Segment Descriptors")
        const SEG_S_SYSTEM: u8 = 0; // System segment.
        const SEG_S_CODE_OR_DATA: u8 = 1; // Data/code segment.

        // 16-bit real-mode code segment (reset vector).
        // base 0xffff0000 + rip 0xfff0 = physical reset address 0xfffffff0.
        let code_seg = Segment {
            base: 0xffff0000,
            limit_bytes: 0xffff,
            selector: 0xf000,
            type_: SEG_TYPE_CODE | SEG_TYPE_CODE_READABLE | SEG_TYPE_ACCESSED, // 11
            present: 1,
            s: SEG_S_CODE_OR_DATA,
            ..Default::default()
        };

        // 16-bit real-mode data segment.
        let data_seg = Segment {
            base: 0,
            limit_bytes: 0xffff,
            selector: 0,
            type_: SEG_TYPE_DATA | SEG_TYPE_DATA_WRITABLE | SEG_TYPE_ACCESSED, // 3
            present: 1,
            s: SEG_S_CODE_OR_DATA,
            ..Default::default()
        };

        // 16-bit TSS segment.
        // NOTE(review): with s = SEG_S_SYSTEM, type value 11 encodes a busy TSS
        // descriptor type — confirm this matches the SDM power-up register
        // state table for TR.
        let task_seg = Segment {
            base: 0,
            limit_bytes: 0xffff,
            selector: 0,
            type_: SEG_TYPE_CODE | SEG_TYPE_CODE_READABLE | SEG_TYPE_ACCESSED, // 11
            present: 1,
            s: SEG_S_SYSTEM,
            ..Default::default()
        };

        // Local descriptor table.
        let ldt = Segment {
            base: 0,
            limit_bytes: 0xffff,
            selector: 0,
            type_: SEG_TYPE_DATA | SEG_TYPE_DATA_WRITABLE, // 2
            present: 1,
            s: SEG_S_SYSTEM,
            ..Default::default()
        };

        // Global descriptor table.
        let gdt = DescriptorTable {
            base: 0,
            limit: 0xffff,
        };

        // Interrupt descriptor table.
        let idt = DescriptorTable {
            base: 0,
            limit: 0xffff,
        };

        let cr0 = (1 << 4) // CR0.ET (reserved, always 1)
                | (1 << 30); // CR0.CD (cache disable)

        Sregs {
            cs: code_seg,
            ds: data_seg,
            es: data_seg,
            fs: data_seg,
            gs: data_seg,
            ss: data_seg,
            tr: task_seg,
            ldt,
            gdt,
            idt,
            cr0,
            cr2: 0,
            cr3: 0,
            cr4: 0,
            cr8: 0,
            efer: 0,
        }
    }
}
948 
/// x87 80-bit floating point value.
///
/// Fields are stored in the same little-endian byte order as the x87 "TBYTE"
/// format (see the `From` conversions below).
#[repr(C)]
#[derive(Copy, Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize)]
pub struct FpuReg {
    /// 64-bit mantissa.
    pub significand: u64,

    /// 15-bit biased exponent and sign bit.
    pub sign_exp: u16,
}
959 
960 impl FpuReg {
961     /// Convert an array of 8x16-byte arrays to an array of 8 `FpuReg`.
962     ///
963     /// Ignores any data in the upper 6 bytes of each element; the values represent 80-bit FPU
964     /// registers, so the upper 48 bits are unused.
from_16byte_arrays(byte_arrays: &[[u8; 16]; 8]) -> [FpuReg; 8]965     pub fn from_16byte_arrays(byte_arrays: &[[u8; 16]; 8]) -> [FpuReg; 8] {
966         let mut regs = [FpuReg::default(); 8];
967         for (dst, src) in regs.iter_mut().zip(byte_arrays.iter()) {
968             let tbyte: [u8; 10] = src[0..10].try_into().unwrap();
969             *dst = FpuReg::from(tbyte);
970         }
971         regs
972     }
973 
974     /// Convert an array of 8 `FpuReg` into 8x16-byte arrays.
to_16byte_arrays(regs: &[FpuReg; 8]) -> [[u8; 16]; 8]975     pub fn to_16byte_arrays(regs: &[FpuReg; 8]) -> [[u8; 16]; 8] {
976         let mut byte_arrays = [[0u8; 16]; 8];
977         for (dst, src) in byte_arrays.iter_mut().zip(regs.iter()) {
978             *dst = (*src).into();
979         }
980         byte_arrays
981     }
982 }
983 
984 impl From<[u8; 10]> for FpuReg {
985     /// Construct a `FpuReg` from an 80-bit representation.
from(value: [u8; 10]) -> FpuReg986     fn from(value: [u8; 10]) -> FpuReg {
987         // These array sub-slices can't fail, but there's no (safe) way to express that in Rust
988         // without an `unwrap()`.
989         let significand_bytes = value[0..8].try_into().unwrap();
990         let significand = u64::from_le_bytes(significand_bytes);
991         let sign_exp_bytes = value[8..10].try_into().unwrap();
992         let sign_exp = u16::from_le_bytes(sign_exp_bytes);
993         FpuReg {
994             significand,
995             sign_exp,
996         }
997     }
998 }
999 
1000 impl From<FpuReg> for [u8; 10] {
1001     /// Convert an `FpuReg` into its 80-bit "TBYTE" representation.
from(value: FpuReg) -> [u8; 10]1002     fn from(value: FpuReg) -> [u8; 10] {
1003         let mut bytes = [0u8; 10];
1004         bytes[0..8].copy_from_slice(&value.significand.to_le_bytes());
1005         bytes[8..10].copy_from_slice(&value.sign_exp.to_le_bytes());
1006         bytes
1007     }
1008 }
1009 
1010 impl From<FpuReg> for [u8; 16] {
1011     /// Convert an `FpuReg` into its 80-bit representation plus 6 unused upper bytes.
1012     /// This is a convenience function for converting to hypervisor types.
from(value: FpuReg) -> [u8; 16]1013     fn from(value: FpuReg) -> [u8; 16] {
1014         let mut bytes = [0u8; 16];
1015         bytes[0..8].copy_from_slice(&value.significand.to_le_bytes());
1016         bytes[8..10].copy_from_slice(&value.sign_exp.to_le_bytes());
1017         bytes
1018     }
1019 }
1020 
/// State of a VCPU's floating point unit.
#[repr(C)]
#[derive(Debug, Copy, Clone, Serialize, Deserialize)]
pub struct Fpu {
    /// x87 data registers ST0-ST7.
    pub fpr: [FpuReg; 8],
    /// FPU control word.
    pub fcw: u16,
    /// FPU status word.
    pub fsw: u16,
    /// FPU tag word — presumably the abridged one-bit-per-register form; confirm
    /// against the hypervisor ABI this is copied to/from.
    pub ftwx: u8,
    /// Opcode of the last FPU instruction.
    pub last_opcode: u16,
    /// Instruction pointer of the last FPU instruction.
    pub last_ip: u64,
    /// Data pointer of the last FPU memory operand.
    pub last_dp: u64,
    /// XMM registers XMM0-XMM15.
    pub xmm: [[u8; 16usize]; 16usize],
    /// SSE control/status register.
    pub mxcsr: u32,
}
1035 
1036 impl Default for Fpu {
default() -> Self1037     fn default() -> Self {
1038         Fpu {
1039             fpr: Default::default(),
1040             fcw: 0x37f, // Intel SDM Vol. 1, 13.6
1041             fsw: 0,
1042             ftwx: 0,
1043             last_opcode: 0,
1044             last_ip: 0,
1045             last_dp: 0,
1046             xmm: Default::default(),
1047             mxcsr: 0x1f80, // Intel SDM Vol. 1, 11.6.4
1048         }
1049     }
1050 }
1051 
/// State of a VCPU's debug registers.
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, Serialize, Deserialize)]
pub struct DebugRegs {
    /// Breakpoint address registers DR0-DR3.
    pub db: [u64; 4usize],
    /// Debug status register (DR6).
    pub dr6: u64,
    /// Debug control register (DR7).
    pub dr7: u64,
}
1060 
/// The hybrid core type for an Intel hybrid CPU.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum CpuHybridType {
    /// Intel Atom (efficiency core).
    Atom,
    /// Intel Core (performance core).
    Core,
}
1069 
/// State of the VCPU's x87 FPU, MMX, XMM, YMM registers.
/// May contain more state depending on enabled extensions.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Xsave {
    // u32-backed storage; `new` rounds the requested byte length up to a whole
    // number of u32s, so this holds at least `len` bytes.
    data: Vec<u32>,

    // Actual length in bytes. May be smaller than data if a non-u32 multiple of bytes is
    // requested.
    len: usize,
}
1080 
1081 impl Xsave {
1082     /// Create a new buffer to store Xsave data.
1083     ///
1084     /// # Argments
1085     /// * `len` size in bytes.
new(len: usize) -> Self1086     pub fn new(len: usize) -> Self {
1087         Xsave {
1088             data: vec![0; (len + 3) / 4],
1089             len,
1090         }
1091     }
1092 
as_ptr(&self) -> *const c_void1093     pub fn as_ptr(&self) -> *const c_void {
1094         self.data.as_ptr() as *const c_void
1095     }
1096 
as_mut_ptr(&mut self) -> *mut c_void1097     pub fn as_mut_ptr(&mut self) -> *mut c_void {
1098         self.data.as_mut_ptr() as *mut c_void
1099     }
1100 
1101     /// Length in bytes of the XSAVE data.
len(&self) -> usize1102     pub fn len(&self) -> usize {
1103         self.len
1104     }
1105 
1106     /// Returns true is length of XSAVE data is zero
is_empty(&self) -> bool1107     pub fn is_empty(&self) -> bool {
1108         self.len() == 0
1109     }
1110 }
1111