xref: /aosp_15_r20/external/crosvm/x86_64/src/lib.rs (revision bb4ee6a4ae7042d18b07a98463b9c8b875e44b39)
1 // Copyright 2017 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 //! x86 architecture support.
6 
7 #![cfg(target_arch = "x86_64")]
8 
9 mod fdt;
10 
11 #[cfg(feature = "gdb")]
12 mod gdb;
13 
// `setup_data` entry type values; these are the discriminants of `SetupDataType`
// (`Dtb` and `RngSeed` respectively) and end up in the `type_` field of each entry header.
const SETUP_DTB: u32 = 2;
const SETUP_RNG_SEED: u32 = 9;
16 
17 #[allow(dead_code)]
18 #[allow(non_upper_case_globals)]
19 #[allow(non_camel_case_types)]
20 #[allow(non_snake_case)]
21 pub mod bootparam;
22 
23 #[allow(dead_code)]
24 #[allow(non_upper_case_globals)]
25 mod msr_index;
26 
27 #[allow(dead_code)]
28 #[allow(non_upper_case_globals)]
29 #[allow(non_camel_case_types)]
30 #[allow(clippy::all)]
31 mod mpspec;
32 
33 pub mod acpi;
34 mod bzimage;
35 pub mod cpuid;
36 mod gdt;
37 pub mod interrupts;
38 pub mod mptable;
39 pub mod regs;
40 pub mod smbios;
41 
42 use std::arch::x86_64::CpuidResult;
43 use std::collections::BTreeMap;
44 use std::fmt;
45 use std::fs::File;
46 use std::io;
47 use std::io::Write;
48 use std::mem;
49 use std::path::PathBuf;
50 use std::sync::mpsc;
51 use std::sync::Arc;
52 
53 use acpi_tables::aml;
54 use acpi_tables::aml::Aml;
55 use acpi_tables::sdt::SDT;
56 use anyhow::Context;
57 use arch::get_serial_cmdline;
58 use arch::serial::SerialDeviceInfo;
59 use arch::CpuSet;
60 use arch::DtbOverlay;
61 use arch::FdtPosition;
62 use arch::GetSerialCmdlineError;
63 use arch::MemoryRegionConfig;
64 use arch::PciConfig;
65 use arch::RunnableLinuxVm;
66 use arch::VmComponents;
67 use arch::VmImage;
68 use base::debug;
69 use base::info;
70 use base::warn;
71 #[cfg(any(target_os = "android", target_os = "linux"))]
72 use base::AsRawDescriptors;
73 use base::Event;
74 use base::FileGetLen;
75 use base::FileReadWriteAtVolatile;
76 use base::SendTube;
77 use base::Tube;
78 use base::TubeError;
79 use chrono::Utc;
80 pub use cpuid::adjust_cpuid;
81 pub use cpuid::CpuIdContext;
82 use devices::acpi::PM_WAKEUP_GPIO;
83 use devices::Bus;
84 use devices::BusDevice;
85 use devices::BusDeviceObj;
86 use devices::BusResumeDevice;
87 use devices::BusType;
88 use devices::Debugcon;
89 use devices::FwCfgParameters;
90 use devices::IrqChip;
91 use devices::IrqChipX86_64;
92 use devices::IrqEventSource;
93 use devices::PciAddress;
94 use devices::PciConfigIo;
95 use devices::PciConfigMmio;
96 use devices::PciDevice;
97 use devices::PciInterruptPin;
98 use devices::PciRoot;
99 use devices::PciRootCommand;
100 use devices::PciVirtualConfigMmio;
101 use devices::Pflash;
102 #[cfg(any(target_os = "android", target_os = "linux"))]
103 use devices::ProxyDevice;
104 use devices::Serial;
105 use devices::SerialHardware;
106 use devices::SerialParameters;
107 use devices::VirtualPmc;
108 use devices::FW_CFG_BASE_PORT;
109 use devices::FW_CFG_MAX_FILE_SLOTS;
110 use devices::FW_CFG_WIDTH;
111 use hypervisor::CpuConfigX86_64;
112 use hypervisor::Hypervisor;
113 use hypervisor::HypervisorX86_64;
114 use hypervisor::ProtectionType;
115 use hypervisor::VcpuInitX86_64;
116 use hypervisor::VcpuX86_64;
117 use hypervisor::Vm;
118 use hypervisor::VmCap;
119 use hypervisor::VmX86_64;
120 #[cfg(feature = "seccomp_trace")]
121 use jail::read_jail_addr;
122 #[cfg(windows)]
123 use jail::FakeMinijailStub as Minijail;
124 #[cfg(any(target_os = "android", target_os = "linux"))]
125 use minijail::Minijail;
126 use rand::rngs::OsRng;
127 use rand::RngCore;
128 use remain::sorted;
129 use resources::AddressRange;
130 use resources::SystemAllocator;
131 use resources::SystemAllocatorConfig;
132 use sync::Condvar;
133 use sync::Mutex;
134 use thiserror::Error;
135 use vm_control::BatControl;
136 use vm_control::BatteryType;
137 use vm_memory::GuestAddress;
138 use vm_memory::GuestMemory;
139 use vm_memory::GuestMemoryError;
140 use vm_memory::MemoryRegionOptions;
141 use vm_memory::MemoryRegionPurpose;
142 use zerocopy::AsBytes;
143 use zerocopy::FromBytes;
144 use zerocopy::FromZeroes;
145 
146 use crate::bootparam::boot_params;
147 use crate::bootparam::setup_header;
148 use crate::bootparam::XLF_CAN_BE_LOADED_ABOVE_4G;
149 use crate::cpuid::EDX_HYBRID_CPU_SHIFT;
150 
151 #[sorted]
152 #[derive(Error, Debug)]
153 pub enum Error {
154     #[error("error allocating a single gpe")]
155     AllocateGpe,
156     #[error("error allocating IO resource: {0}")]
157     AllocateIOResouce(resources::Error),
158     #[error("error allocating a single irq")]
159     AllocateIrq,
160     #[error("unable to clone an Event: {0}")]
161     CloneEvent(base::Error),
162     #[error("failed to clone IRQ chip: {0}")]
163     CloneIrqChip(base::Error),
164     #[cfg(any(target_os = "android", target_os = "linux"))]
165     #[error("failed to clone jail: {0}")]
166     CloneJail(minijail::Error),
167     #[error("unable to clone a Tube: {0}")]
168     CloneTube(TubeError),
169     #[error("the given kernel command line was invalid: {0}")]
170     Cmdline(kernel_cmdline::Error),
171     #[error("failed writing command line to guest memory")]
172     CommandLineCopy,
173     #[error("command line overflowed guest memory")]
174     CommandLineOverflow,
175     #[error("failed to configure hotplugged pci device: {0}")]
176     ConfigurePciDevice(arch::DeviceRegistrationError),
177     #[error("bad PCI ECAM configuration: {0}")]
178     ConfigurePciEcam(String),
179     #[error("bad PCI mem configuration: {0}")]
180     ConfigurePciMem(String),
181     #[error("failed to configure segment registers: {0}")]
182     ConfigureSegments(regs::Error),
183     #[error("error configuring the system")]
184     ConfigureSystem,
185     #[error("unable to create ACPI tables")]
186     CreateAcpi,
187     #[error("unable to create battery devices: {0}")]
188     CreateBatDevices(arch::DeviceRegistrationError),
189     #[error("could not create debugcon device: {0}")]
190     CreateDebugconDevice(devices::SerialError),
191     #[error("unable to make an Event: {0}")]
192     CreateEvent(base::Error),
193     #[error("failed to create fdt: {0}")]
194     CreateFdt(cros_fdt::Error),
195     #[error("failed to create fw_cfg device: {0}")]
196     CreateFwCfgDevice(devices::FwCfgError),
197     #[error("failed to create IOAPIC device: {0}")]
198     CreateIoapicDevice(base::Error),
199     #[error("failed to create a PCI root hub: {0}")]
200     CreatePciRoot(arch::DeviceRegistrationError),
201     #[error("unable to create PIT: {0}")]
202     CreatePit(base::Error),
203     #[error("unable to make PIT device: {0}")]
204     CreatePitDevice(devices::PitError),
205     #[cfg(any(target_os = "android", target_os = "linux"))]
206     #[error("unable to create proxy device: {0}")]
207     CreateProxyDevice(devices::ProxyError),
208     #[error("unable to create serial devices: {0}")]
209     CreateSerialDevices(arch::DeviceRegistrationError),
210     #[error("failed to create socket: {0}")]
211     CreateSocket(io::Error),
212     #[error("failed to create tube: {0}")]
213     CreateTube(base::TubeError),
214     #[error("failed to create VCPU: {0}")]
215     CreateVcpu(base::Error),
216     #[error("failed to create Virtio MMIO bus: {0}")]
217     CreateVirtioMmioBus(arch::DeviceRegistrationError),
218     #[error("invalid e820 setup params")]
219     E820Configuration,
220     #[error("failed to enable singlestep execution: {0}")]
221     EnableSinglestep(base::Error),
222     #[error("failed to enable split irqchip: {0}")]
223     EnableSplitIrqchip(base::Error),
224     #[error("failed to get serial cmdline: {0}")]
225     GetSerialCmdline(GetSerialCmdlineError),
226     #[error("failed to insert device onto bus: {0}")]
227     InsertBus(devices::BusError),
228     #[error("the kernel extends past the end of RAM")]
229     InvalidCpuConfig,
230     #[error("invalid CPU config parameters")]
231     KernelOffsetPastEnd,
232     #[error("error loading bios: {0}")]
233     LoadBios(io::Error),
234     #[error("error loading kernel bzImage: {0}")]
235     LoadBzImage(bzimage::Error),
236     #[error("error loading custom pVM firmware: {0}")]
237     LoadCustomPvmFw(arch::LoadImageError),
238     #[error("error loading initrd: {0}")]
239     LoadInitrd(arch::LoadImageError),
240     #[error("error loading Kernel: {0}")]
241     LoadKernel(kernel_loader::Error),
242     #[error("error loading pflash: {0}")]
243     LoadPflash(io::Error),
244     #[error("error translating address: Page not present")]
245     PageNotPresent,
246     #[error("pci mmio overlaps with pVM firmware memory")]
247     PciMmioOverlapPvmFw,
248     #[error("pVM firmware not supported when bios is used on x86_64")]
249     PvmFwBiosUnsupported,
250     #[error("error reading guest memory {0}")]
251     ReadingGuestMemory(vm_memory::GuestMemoryError),
252     #[error("single register read not supported on x86_64")]
253     ReadRegIsUnsupported,
254     #[error("error reading CPU registers {0}")]
255     ReadRegs(base::Error),
256     #[error("error registering an IrqFd: {0}")]
257     RegisterIrqfd(base::Error),
258     #[error("error registering virtual socket device: {0}")]
259     RegisterVsock(arch::DeviceRegistrationError),
260     #[error("error reserved pcie config mmio")]
261     ReservePcieCfgMmio(resources::Error),
262     #[error("failed to set a hardware breakpoint: {0}")]
263     SetHwBreakpoint(base::Error),
264     #[error("failed to set identity map addr: {0}")]
265     SetIdentityMapAddr(base::Error),
266     #[error("failed to set interrupts: {0}")]
267     SetLint(interrupts::Error),
268     #[error("failed to set tss addr: {0}")]
269     SetTssAddr(base::Error),
270     #[error("failed to set up cmos: {0}")]
271     SetupCmos(anyhow::Error),
272     #[error("failed to set up cpuid: {0}")]
273     SetupCpuid(cpuid::Error),
274     #[error("setup data too large")]
275     SetupDataTooLarge,
276     #[error("failed to set up FPU: {0}")]
277     SetupFpu(base::Error),
278     #[error("failed to set up guest memory: {0}")]
279     SetupGuestMemory(GuestMemoryError),
280     #[error("failed to set up mptable: {0}")]
281     SetupMptable(mptable::Error),
282     #[error("failed to set up MSRs: {0}")]
283     SetupMsrs(base::Error),
284     #[error("failed to set up page tables: {0}")]
285     SetupPageTables(regs::Error),
286     #[error("failed to set up pflash: {0}")]
287     SetupPflash(anyhow::Error),
288     #[error("failed to set up registers: {0}")]
289     SetupRegs(regs::Error),
290     #[error("failed to set up SMBIOS: {0}")]
291     SetupSmbios(smbios::Error),
292     #[error("failed to set up sregs: {0}")]
293     SetupSregs(base::Error),
294     #[error("failed to translate virtual address")]
295     TranslatingVirtAddr,
296     #[error("protected VMs not supported on x86_64")]
297     UnsupportedProtectionType,
298     #[error("single register write not supported on x86_64")]
299     WriteRegIsUnsupported,
300     #[error("error writing CPU registers {0}")]
301     WriteRegs(base::Error),
302     #[error("error writing guest memory {0}")]
303     WritingGuestMemory(GuestMemoryError),
304     #[error("error writing setup_data: {0}")]
305     WritingSetupData(GuestMemoryError),
306     #[error("the zero page extends past the end of guest_mem")]
307     ZeroPagePastRamEnd,
308     #[error("error writing the zero page of guest memory")]
309     ZeroPageSetup,
310 }
311 
/// Result type whose error is this crate's [`Error`].
pub type Result<T> = std::result::Result<T, Error>;

/// Unit type on which the x86_64 implementation of `arch::LinuxArch` is defined.
pub struct X8664arch;
315 
// Like `bootparam::setup_data` without the incomplete array field at the end, which allows us to
// safely implement Copy, Clone.
//
// `write_setup_data()` writes one of these headers to guest memory immediately before the
// payload bytes of each entry.
#[repr(C)]
#[derive(Copy, Clone, Default, FromZeroes, FromBytes, AsBytes)]
struct setup_data_hdr {
    // Guest address of the next entry in the list, or 0 for the final entry.
    pub next: u64,
    // Entry type; filled from a `SetupDataType` discriminant.
    pub type_: u32,
    // Length in bytes of the payload that follows this header.
    pub len: u32,
}
325 
/// Kind of payload carried by a [`SetupData`] entry.
///
/// The discriminant is the value written to the `type_` field of the entry header.
#[repr(u32)]
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum SetupDataType {
    Dtb = SETUP_DTB,
    RngSeed = SETUP_RNG_SEED,
}
332 
/// A single entry to be inserted in the bootparam `setup_data` linked list.
pub struct SetupData {
    /// Raw payload bytes, written to guest memory right after the entry header.
    pub data: Vec<u8>,
    /// Type tag stored in the entry header.
    pub type_: SetupDataType,
}
338 
/// Memory type of an e820 map entry.
///
/// The discriminant is the value copied into the `type_` field of the corresponding
/// `boot_params` e820 table entry.
#[derive(Copy, Clone, Debug)]
enum E820Type {
    Ram = 0x01,
    Reserved = 0x2,
}
344 
/// One region of the e820 memory map, before it is copied into `boot_params`.
#[derive(Copy, Clone, Debug)]
struct E820Entry {
    /// Guest address at which the region starts.
    pub address: GuestAddress,
    /// Length of the region in bytes.
    pub len: u64,
    /// Whether the region is RAM or reserved.
    pub mem_type: E820Type,
}
351 
const MB: u64 = 1024 * 1024;
const GB: u64 = 1024 * MB;

pub const BOOT_STACK_POINTER: u64 = 0x8000;
const START_OF_RAM_32BITS: u64 = 0;
const FIRST_ADDR_PAST_20BITS: u64 = MB;
const FIRST_ADDR_PAST_32BITS: u64 = 4 * GB;
// Keep the gap size a multiple of 256MB so that it is convenient to describe with MTRRs.
const MEM_32BIT_GAP_SIZE: u64 = 768 * MB;
const END_ADDR_BEFORE_32BITS: u64 = FIRST_ADDR_PAST_32BITS - MEM_32BIT_GAP_SIZE;
// Memory reserved for nand_bios/LAPIC/IOAPIC/HPET/...
const RESERVED_MEM_SIZE: u64 = 128 * MB;
const DEFAULT_PCI_MEM_END: u64 = FIRST_ADDR_PAST_32BITS - RESERVED_MEM_SIZE - 1;
// Reserve 64MB for the PCIe enhanced configuration space.
const DEFAULT_PCIE_CFG_MMIO_SIZE: u64 = 64 * MB;
const DEFAULT_PCIE_CFG_MMIO_END: u64 = FIRST_ADDR_PAST_32BITS - RESERVED_MEM_SIZE - 1;
const DEFAULT_PCIE_CFG_MMIO_START: u64 = DEFAULT_PCIE_CFG_MMIO_END - DEFAULT_PCIE_CFG_MMIO_SIZE + 1;
// Linux (with 4-level paging) has a physical memory limit of 46 bits (64 TiB).
const HIGH_MMIO_MAX_END: u64 = (1u64 << 46) - 1;
pub const KERNEL_32BIT_ENTRY_OFFSET: u64 = 0x0;
pub const KERNEL_64BIT_ENTRY_OFFSET: u64 = 0x200;
pub const ZERO_PAGE_OFFSET: u64 = 0x7000;
// BIOS max size is 16MB; this is used only when `unrestricted guest` is disabled.
const BIOS_MAX_SIZE: u64 = 16 * MB;

pub const KERNEL_START_OFFSET: u64 = 2 * MB;
const CMDLINE_OFFSET: u64 = 0x2_0000;
const CMDLINE_MAX_SIZE: u64 = 0x800; // including terminating zero
// The setup_data area starts right after the command line buffer and ends where the ACPI
// RSDP window begins.
const SETUP_DATA_START: u64 = CMDLINE_OFFSET + CMDLINE_MAX_SIZE;
const SETUP_DATA_END: u64 = ACPI_HI_RSDP_WINDOW_BASE;
const X86_64_SERIAL_1_3_IRQ: u32 = 4;
const X86_64_SERIAL_2_4_IRQ: u32 = 3;
// X86_64_SCI_IRQ is used to fill the ACPI FACP table.
// The SCI IRQ number should preferably be a legacy IRQ number, i.e. less than 16
// (most platforms actually use the fixed IRQ number 9). So IRQ number 5 is reserved
// for SCI, and the other devices start from the next available number.
pub const X86_64_SCI_IRQ: u32 = 5;
// The CMOS RTC uses IRQ 8; start allocating IRQs at 9.
pub const X86_64_IRQ_BASE: u32 = 9;
const ACPI_HI_RSDP_WINDOW_BASE: u64 = 0x000E_0000;

// pVM firmware memory. Should be within the low 4GB, so that it is identity-mapped
// by setup_page_tables() when a protected VM boots in long mode, since the pVM firmware is
// the VM entry point.
const PROTECTED_VM_FW_MAX_SIZE: u64 = 4 * MB;
const PROTECTED_VM_FW_START: u64 = END_ADDR_BEFORE_32BITS - PROTECTED_VM_FW_MAX_SIZE;
400 
/// CPU manufacturer, as returned by [`get_cpu_manufacturer`].
#[derive(Debug, PartialEq, Eq)]
pub enum CpuManufacturer {
    Intel,
    Amd,
    Unknown,
}
407 
/// Returns the CPU manufacturer reported by `cpuid::cpu_manufacturer()`.
pub fn get_cpu_manufacturer() -> CpuManufacturer {
    cpuid::cpu_manufacturer()
}
411 
/// Address ranges chosen by [`create_arch_memory_layout`] for the x86_64 guest.
pub struct ArchMemoryLayout {
    // the pci mmio range below 4G
    pci_mmio_before_32bit: AddressRange,
    // the pcie cfg mmio range
    pcie_cfg_mmio: AddressRange,
    // the pVM firmware memory (if running a protected VM)
    pvmfw_mem: Option<AddressRange>,
}
420 
create_arch_memory_layout( pci_config: &PciConfig, has_protected_vm_firmware: bool, ) -> Result<ArchMemoryLayout>421 pub fn create_arch_memory_layout(
422     pci_config: &PciConfig,
423     has_protected_vm_firmware: bool,
424 ) -> Result<ArchMemoryLayout> {
425     // the max bus number is 256 and each bus occupy 1MB, so the max pcie cfg mmio size = 256M
426     const MAX_PCIE_ECAM_SIZE: u64 = 256 * MB;
427     let pcie_cfg_mmio = match pci_config.ecam {
428         Some(MemoryRegionConfig {
429             start,
430             size: Some(size),
431         }) => AddressRange::from_start_and_size(start, size.min(MAX_PCIE_ECAM_SIZE)).unwrap(),
432         Some(MemoryRegionConfig { start, size: None }) => {
433             AddressRange::from_start_and_end(start, DEFAULT_PCIE_CFG_MMIO_END)
434         }
435         None => {
436             AddressRange::from_start_and_end(DEFAULT_PCIE_CFG_MMIO_START, DEFAULT_PCIE_CFG_MMIO_END)
437         }
438     };
439     if pcie_cfg_mmio.start % pcie_cfg_mmio.len().unwrap() != 0
440         || pcie_cfg_mmio.start % MB != 0
441         || pcie_cfg_mmio.len().unwrap() % MB != 0
442     {
443         return Err(Error::ConfigurePciEcam(
444             "base and len must be aligned to 1MB and base must be a multiple of len".to_string(),
445         ));
446     }
447     if pcie_cfg_mmio.end >= 0x1_0000_0000 {
448         return Err(Error::ConfigurePciEcam(
449             "end address can't go beyond 4G".to_string(),
450         ));
451     }
452 
453     let pci_mmio_before_32bit = match pci_config.mem {
454         Some(MemoryRegionConfig {
455             start,
456             size: Some(size),
457         }) => AddressRange::from_start_and_size(start, size)
458             .ok_or(Error::ConfigurePciMem("region overflowed".to_string()))?,
459         Some(MemoryRegionConfig { start, size: None }) => {
460             AddressRange::from_start_and_end(start, DEFAULT_PCI_MEM_END)
461         }
462         None => AddressRange::from_start_and_end(
463             pcie_cfg_mmio
464                 .start
465                 .min(FIRST_ADDR_PAST_32BITS - MEM_32BIT_GAP_SIZE),
466             DEFAULT_PCI_MEM_END,
467         ),
468     };
469 
470     let pvmfw_mem = if has_protected_vm_firmware {
471         let range = AddressRange {
472             start: PROTECTED_VM_FW_START,
473             end: PROTECTED_VM_FW_START + PROTECTED_VM_FW_MAX_SIZE - 1,
474         };
475         if !pci_mmio_before_32bit.intersect(range).is_empty() {
476             return Err(Error::PciMmioOverlapPvmFw);
477         }
478 
479         Some(range)
480     } else {
481         None
482     };
483 
484     Ok(ArchMemoryLayout {
485         pci_mmio_before_32bit,
486         pcie_cfg_mmio,
487         pvmfw_mem,
488     })
489 }
490 
max_ram_end_before_32bit( arch_memory_layout: &ArchMemoryLayout, has_protected_vm_firmware: bool, ) -> u64491 fn max_ram_end_before_32bit(
492     arch_memory_layout: &ArchMemoryLayout,
493     has_protected_vm_firmware: bool,
494 ) -> u64 {
495     let pci_start = arch_memory_layout.pci_mmio_before_32bit.start;
496     if has_protected_vm_firmware {
497         pci_start.min(PROTECTED_VM_FW_START)
498     } else {
499         pci_start
500     }
501 }
502 
503 /// The x86 reset vector for i386+ and x86_64 puts the processor into an "unreal mode" where it
504 /// can access the last 1 MB of the 32-bit address space in 16-bit mode, and starts the instruction
505 /// pointer at the effective physical address 0xFFFF_FFF0.
bios_start(bios_size: u64) -> GuestAddress506 fn bios_start(bios_size: u64) -> GuestAddress {
507     GuestAddress(FIRST_ADDR_PAST_32BITS - bios_size)
508 }
509 
identity_map_addr_start() -> GuestAddress510 fn identity_map_addr_start() -> GuestAddress {
511     // Set Identity map address 4 pages before the max BIOS size
512     GuestAddress(FIRST_ADDR_PAST_32BITS - BIOS_MAX_SIZE - 4 * 0x1000)
513 }
514 
tss_addr_start() -> GuestAddress515 fn tss_addr_start() -> GuestAddress {
516     // Set TSS address one page after identity map address
517     GuestAddress(identity_map_addr_start().offset() + 0x1000)
518 }
519 
tss_addr_end() -> GuestAddress520 fn tss_addr_end() -> GuestAddress {
521     // Set TSS address section to have 3 pages
522     GuestAddress(tss_addr_start().offset() + 0x3000)
523 }
524 
configure_system( guest_mem: &GuestMemory, cmdline_addr: GuestAddress, setup_data: Option<GuestAddress>, initrd: Option<(GuestAddress, usize)>, mut params: boot_params, e820_entries: &[E820Entry], ) -> Result<()>525 fn configure_system(
526     guest_mem: &GuestMemory,
527     cmdline_addr: GuestAddress,
528     setup_data: Option<GuestAddress>,
529     initrd: Option<(GuestAddress, usize)>,
530     mut params: boot_params,
531     e820_entries: &[E820Entry],
532 ) -> Result<()> {
533     const KERNEL_BOOT_FLAG_MAGIC: u16 = 0xaa55;
534     const KERNEL_HDR_MAGIC: u32 = 0x5372_6448;
535     const KERNEL_LOADER_OTHER: u8 = 0xff;
536     const KERNEL_MIN_ALIGNMENT_BYTES: u32 = 0x100_0000; // Must be non-zero.
537 
538     params.hdr.type_of_loader = KERNEL_LOADER_OTHER;
539     params.hdr.boot_flag = KERNEL_BOOT_FLAG_MAGIC;
540     params.hdr.header = KERNEL_HDR_MAGIC;
541     params.hdr.cmd_line_ptr = cmdline_addr.offset() as u32;
542     params.ext_cmd_line_ptr = (cmdline_addr.offset() >> 32) as u32;
543     params.hdr.kernel_alignment = KERNEL_MIN_ALIGNMENT_BYTES;
544     if let Some(setup_data) = setup_data {
545         params.hdr.setup_data = setup_data.offset();
546     }
547     if let Some((initrd_addr, initrd_size)) = initrd {
548         params.hdr.ramdisk_image = initrd_addr.offset() as u32;
549         params.ext_ramdisk_image = (initrd_addr.offset() >> 32) as u32;
550         params.hdr.ramdisk_size = initrd_size as u32;
551         params.ext_ramdisk_size = (initrd_size as u64 >> 32) as u32;
552     }
553 
554     if e820_entries.len() >= params.e820_table.len() {
555         return Err(Error::E820Configuration);
556     }
557 
558     for (src, dst) in e820_entries.iter().zip(params.e820_table.iter_mut()) {
559         dst.addr = src.address.offset();
560         dst.size = src.len;
561         dst.type_ = src.mem_type as u32;
562     }
563     params.e820_entries = e820_entries.len() as u8;
564 
565     let zero_page_addr = GuestAddress(ZERO_PAGE_OFFSET);
566     if !guest_mem.is_valid_range(zero_page_addr, mem::size_of::<boot_params>() as u64) {
567         return Err(Error::ZeroPagePastRamEnd);
568     }
569 
570     guest_mem
571         .write_obj_at_addr(params, zero_page_addr)
572         .map_err(|_| Error::ZeroPageSetup)?;
573 
574     Ok(())
575 }
576 
577 /// Write setup_data entries in guest memory and link them together with the `next` field.
578 ///
579 /// Returns the guest address of the first entry in the setup_data list, if any.
write_setup_data( guest_mem: &GuestMemory, setup_data_start: GuestAddress, setup_data_end: GuestAddress, setup_data: &[SetupData], ) -> Result<Option<GuestAddress>>580 fn write_setup_data(
581     guest_mem: &GuestMemory,
582     setup_data_start: GuestAddress,
583     setup_data_end: GuestAddress,
584     setup_data: &[SetupData],
585 ) -> Result<Option<GuestAddress>> {
586     let mut setup_data_list_head = None;
587 
588     // Place the first setup_data at the first 64-bit aligned offset following setup_data_start.
589     let mut setup_data_addr = setup_data_start.align(8).ok_or(Error::SetupDataTooLarge)?;
590 
591     let mut entry_iter = setup_data.iter().peekable();
592     while let Some(entry) = entry_iter.next() {
593         if setup_data_list_head.is_none() {
594             setup_data_list_head = Some(setup_data_addr);
595         }
596 
597         // Ensure the entry (header plus data) fits into guest memory.
598         let entry_size = (mem::size_of::<setup_data_hdr>() + entry.data.len()) as u64;
599         let entry_end = setup_data_addr
600             .checked_add(entry_size)
601             .ok_or(Error::SetupDataTooLarge)?;
602 
603         if entry_end >= setup_data_end {
604             return Err(Error::SetupDataTooLarge);
605         }
606 
607         let next_setup_data_addr = if entry_iter.peek().is_some() {
608             // Place the next setup_data at a 64-bit aligned address.
609             setup_data_addr
610                 .checked_add(entry_size)
611                 .and_then(|addr| addr.align(8))
612                 .ok_or(Error::SetupDataTooLarge)?
613         } else {
614             // This is the final entry. Terminate the list with next == 0.
615             GuestAddress(0)
616         };
617 
618         let hdr = setup_data_hdr {
619             next: next_setup_data_addr.offset(),
620             type_: entry.type_ as u32,
621             len: entry
622                 .data
623                 .len()
624                 .try_into()
625                 .map_err(|_| Error::SetupDataTooLarge)?,
626         };
627 
628         guest_mem
629             .write_obj_at_addr(hdr, setup_data_addr)
630             .map_err(Error::WritingSetupData)?;
631         guest_mem
632             .write_all_at_addr(
633                 &entry.data,
634                 setup_data_addr.unchecked_add(mem::size_of::<setup_data_hdr>() as u64),
635             )
636             .map_err(Error::WritingSetupData)?;
637 
638         setup_data_addr = next_setup_data_addr;
639     }
640 
641     Ok(setup_data_list_head)
642 }
643 
644 /// Generate a SETUP_RNG_SEED SetupData with random seed data.
setup_data_rng_seed() -> SetupData645 fn setup_data_rng_seed() -> SetupData {
646     let mut data = vec![0u8; 256];
647     OsRng.fill_bytes(&mut data);
648     SetupData {
649         data,
650         type_: SetupDataType::RngSeed,
651     }
652 }
653 
654 /// Add an e820 region to the e820 map.
add_e820_entry( e820_entries: &mut Vec<E820Entry>, range: AddressRange, mem_type: E820Type, ) -> Result<()>655 fn add_e820_entry(
656     e820_entries: &mut Vec<E820Entry>,
657     range: AddressRange,
658     mem_type: E820Type,
659 ) -> Result<()> {
660     e820_entries.push(E820Entry {
661         address: GuestAddress(range.start),
662         len: range.len().ok_or(Error::E820Configuration)?,
663         mem_type,
664     });
665 
666     Ok(())
667 }
668 
669 /// Generate a memory map in INT 0x15 AX=0xE820 format.
generate_e820_memory_map( arch_memory_layout: &ArchMemoryLayout, guest_mem: &GuestMemory, ram_below_1m: AddressRange, ram_below_4g: AddressRange, ram_above_4g: AddressRange, has_protected_vm_firmware: bool, ) -> Result<Vec<E820Entry>>670 fn generate_e820_memory_map(
671     arch_memory_layout: &ArchMemoryLayout,
672     guest_mem: &GuestMemory,
673     ram_below_1m: AddressRange,
674     ram_below_4g: AddressRange,
675     ram_above_4g: AddressRange,
676     has_protected_vm_firmware: bool,
677 ) -> Result<Vec<E820Entry>> {
678     let mut e820_entries = Vec::new();
679 
680     add_e820_entry(&mut e820_entries, ram_below_1m, E820Type::Ram)?;
681     add_e820_entry(&mut e820_entries, ram_below_4g, E820Type::Ram)?;
682     if !ram_above_4g.is_empty() {
683         add_e820_entry(&mut e820_entries, ram_above_4g, E820Type::Ram)?
684     }
685 
686     if has_protected_vm_firmware {
687         // After the pVM firmware jumped to the guest, the pVM firmware itself
688         // is no longer running, so its memory is reusable by the guest OS.
689         // So add this memory as RAM rather than Reserved.
690         let pvmfw_range = arch_memory_layout.pvmfw_mem.unwrap();
691         add_e820_entry(&mut e820_entries, pvmfw_range, E820Type::Ram)?;
692     }
693 
694     let pcie_cfg_mmio_range = arch_memory_layout.pcie_cfg_mmio;
695     add_e820_entry(&mut e820_entries, pcie_cfg_mmio_range, E820Type::Reserved)?;
696 
697     add_e820_entry(
698         &mut e820_entries,
699         X8664arch::get_pcie_vcfg_mmio_range(guest_mem, &pcie_cfg_mmio_range),
700         E820Type::Reserved,
701     )?;
702 
703     // Reserve memory section for Identity map and TSS
704     add_e820_entry(
705         &mut e820_entries,
706         AddressRange {
707             start: identity_map_addr_start().offset(),
708             end: tss_addr_end().offset() - 1,
709         },
710         E820Type::Reserved,
711     )?;
712 
713     Ok(e820_entries)
714 }
715 
716 /// Returns a Vec of the valid memory addresses.
717 /// These should be used to configure the GuestMemory structure for the platform.
718 /// For x86_64 all addresses are valid from the start of the kernel except a
719 /// carve out at the end of 32bit address space.
arch_memory_regions( arch_memory_layout: &ArchMemoryLayout, size: u64, bios_size: Option<u64>, has_protected_vm_firmware: bool, ) -> Vec<(GuestAddress, u64, MemoryRegionOptions)>720 pub fn arch_memory_regions(
721     arch_memory_layout: &ArchMemoryLayout,
722     size: u64,
723     bios_size: Option<u64>,
724     has_protected_vm_firmware: bool,
725 ) -> Vec<(GuestAddress, u64, MemoryRegionOptions)> {
726     let mut mem_size = size;
727     let mut regions = Vec::new();
728 
729     if has_protected_vm_firmware {
730         regions.push((
731             GuestAddress(PROTECTED_VM_FW_START),
732             PROTECTED_VM_FW_MAX_SIZE,
733             MemoryRegionOptions::new().purpose(MemoryRegionPurpose::ProtectedFirmwareRegion),
734         ));
735 
736         // pVM firmware memory is a part of normal guest memory, since it is reusable
737         // by the guest OS once the pVM firmware jumped to the guest. So count its size
738         // as a part of the total guest memory size.
739         if mem_size > PROTECTED_VM_FW_MAX_SIZE {
740             mem_size -= PROTECTED_VM_FW_MAX_SIZE;
741         }
742     }
743 
744     let mem_start = START_OF_RAM_32BITS;
745     let mem_end = GuestAddress(mem_size + mem_start);
746 
747     let first_addr_past_32bits = GuestAddress(FIRST_ADDR_PAST_32BITS);
748     let max_end_32bits = GuestAddress(max_ram_end_before_32bit(
749         arch_memory_layout,
750         has_protected_vm_firmware,
751     ));
752 
753     if mem_end <= max_end_32bits {
754         regions.push((GuestAddress(mem_start), mem_size, Default::default()));
755         if let Some(bios_size) = bios_size {
756             regions.push((bios_start(bios_size), bios_size, Default::default()));
757         }
758     } else {
759         regions.push((
760             GuestAddress(mem_start),
761             max_end_32bits.offset() - mem_start,
762             Default::default(),
763         ));
764         if let Some(bios_size) = bios_size {
765             regions.push((bios_start(bios_size), bios_size, Default::default()));
766         }
767         regions.push((
768             first_addr_past_32bits,
769             mem_end.offset_from(max_end_32bits),
770             Default::default(),
771         ));
772     }
773 
774     regions
775 }
776 
777 impl arch::LinuxArch for X8664arch {
778     type Error = Error;
779     type ArchMemoryLayout = ArchMemoryLayout;
780 
arch_memory_layout( components: &VmComponents, ) -> std::result::Result<Self::ArchMemoryLayout, Self::Error>781     fn arch_memory_layout(
782         components: &VmComponents,
783     ) -> std::result::Result<Self::ArchMemoryLayout, Self::Error> {
784         create_arch_memory_layout(
785             &components.pci_config,
786             components.hv_cfg.protection_type.runs_firmware(),
787         )
788     }
789 
guest_memory_layout( components: &VmComponents, arch_memory_layout: &Self::ArchMemoryLayout, _hypervisor: &impl Hypervisor, ) -> std::result::Result<Vec<(GuestAddress, u64, MemoryRegionOptions)>, Self::Error>790     fn guest_memory_layout(
791         components: &VmComponents,
792         arch_memory_layout: &Self::ArchMemoryLayout,
793         _hypervisor: &impl Hypervisor,
794     ) -> std::result::Result<Vec<(GuestAddress, u64, MemoryRegionOptions)>, Self::Error> {
795         let has_protected_vm_firmware = components.hv_cfg.protection_type.runs_firmware();
796 
797         let bios_size = match &components.vm_image {
798             VmImage::Bios(bios_file) => Some(bios_file.metadata().map_err(Error::LoadBios)?.len()),
799             VmImage::Kernel(_) => None,
800         };
801 
802         Ok(arch_memory_regions(
803             arch_memory_layout,
804             components.memory_size,
805             bios_size,
806             has_protected_vm_firmware,
807         ))
808     }
809 
get_system_allocator_config<V: Vm>( vm: &V, arch_memory_layout: &Self::ArchMemoryLayout, ) -> SystemAllocatorConfig810     fn get_system_allocator_config<V: Vm>(
811         vm: &V,
812         arch_memory_layout: &Self::ArchMemoryLayout,
813     ) -> SystemAllocatorConfig {
814         SystemAllocatorConfig {
815             io: Some(AddressRange {
816                 start: 0xc000,
817                 end: 0xffff,
818             }),
819             low_mmio: arch_memory_layout.pci_mmio_before_32bit,
820             high_mmio: Self::get_high_mmio_range(vm, arch_memory_layout),
821             platform_mmio: None,
822             first_irq: X86_64_IRQ_BASE,
823         }
824     }
825 
    /// Builds a runnable x86_64 Linux VM from `components`.
    ///
    /// In order, this function:
    /// 1. sets the identity-map and TSS addresses on `vm` and reserves the PCIe config MMIO
    ///    range in `system_allocator`;
    /// 2. creates the PCI root, the PCI config I/O and MMIO devices, and the virtio-mmio bus;
    /// 3. sets up the optional fw_cfg, i8042 and CMOS devices, then the serial, debugcon and
    ///    pflash devices;
    /// 4. sets up the ACPI devices and writes the MP table (unless the user supplied a FACP
    ///    table), the SMBIOS tables and the ACPI tables;
    /// 5. loads either the BIOS or the kernel image and prepares the registers of the first
    ///    vCPU plus the MSRs that are copied to every vCPU.
    ///
    /// `_fdt_position` and `_no_pmu` are not used by this implementation.
    ///
    /// # Errors
    ///
    /// Returns the `Error` variant of whichever step failed.
    ///
    /// # Panics
    ///
    /// Panics if inserting one of the PCI config devices into its bus fails, if the PCIe config
    /// ranges have no length, or if the protection type needs firmware loaded but
    /// `components.pvm_fw` is `None`.
    fn build_vm<V, Vcpu>(
        mut components: VmComponents,
        arch_memory_layout: &Self::ArchMemoryLayout,
        vm_evt_wrtube: &SendTube,
        system_allocator: &mut SystemAllocator,
        serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>,
        serial_jail: Option<Minijail>,
        battery: (Option<BatteryType>, Option<Minijail>),
        mut vm: V,
        ramoops_region: Option<arch::pstore::RamoopsRegion>,
        devs: Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>,
        irq_chip: &mut dyn IrqChipX86_64,
        vcpu_ids: &mut Vec<usize>,
        dump_device_tree_blob: Option<PathBuf>,
        debugcon_jail: Option<Minijail>,
        pflash_jail: Option<Minijail>,
        fw_cfg_jail: Option<Minijail>,
        #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
        guest_suspended_cvar: Option<Arc<(Mutex<bool>, Condvar)>>,
        device_tree_overlays: Vec<DtbOverlay>,
        _fdt_position: Option<FdtPosition>,
        _no_pmu: bool,
    ) -> std::result::Result<RunnableLinuxVm<V, Vcpu>, Self::Error>
    where
        V: VmX86_64,
        Vcpu: VcpuX86_64,
    {
        let mem = vm.get_memory().clone();

        let vcpu_count = components.vcpu_count;

        vm.set_identity_map_addr(identity_map_addr_start())
            .map_err(Error::SetIdentityMapAddr)?;

        vm.set_tss_addr(tss_addr_start())
            .map_err(Error::SetTssAddr)?;

        // Use IRQ info in ACPI if provided by the user.
        // `mptable` stays true (and the default SCI IRQ is kept) unless a FACP table is found
        // among the user-provided SDTs below.
        let mut mptable = true;
        let mut sci_irq = X86_64_SCI_IRQ;

        // Punch the PCIe config MMIO range out of the PCI low MMIO range so that it cannot be
        // allocated to any device.
        let pcie_cfg_mmio_range = arch_memory_layout.pcie_cfg_mmio;
        system_allocator
            .reserve_mmio(pcie_cfg_mmio_range)
            .map_err(Error::ReservePcieCfgMmio)?;

        for sdt in components.acpi_sdts.iter() {
            if sdt.is_signature(b"FACP") {
                // A user-supplied FACP table disables the MP table and dictates the SCI IRQ.
                mptable = false;
                let sci_irq_fadt: u16 = sdt.read(acpi::FADT_FIELD_SCI_INTERRUPT);
                sci_irq = sci_irq_fadt.into();
                if !system_allocator.reserve_irq(sci_irq) {
                    warn!("sci irq {} already reserved.", sci_irq);
                }
            }
        }

        let pcie_vcfg_range = Self::get_pcie_vcfg_mmio_range(&mem, &pcie_cfg_mmio_range);
        let mmio_bus = Arc::new(Bus::new(BusType::Mmio));
        let io_bus = Arc::new(Bus::new(BusType::Io));

        // Split the PCI devices off from the rest; the remainder is examined again below for
        // virtio-mmio devices.
        let (pci_devices, devs): (Vec<_>, Vec<_>) = devs
            .into_iter()
            .partition(|(dev, _)| dev.as_pci_device().is_some());

        // The `unwrap()` cannot fail: the partition above kept only devices for which
        // `as_pci_device()` returned `Some`.
        let pci_devices = pci_devices
            .into_iter()
            .map(|(dev, jail_orig)| (dev.into_pci_device().unwrap(), jail_orig))
            .collect();

        let (pci, pci_irqs, mut pid_debug_label_map, amls, gpe_scope_amls) =
            arch::generate_pci_root(
                pci_devices,
                irq_chip.as_irq_chip_mut(),
                mmio_bus.clone(),
                GuestAddress(pcie_cfg_mmio_range.start),
                12,
                io_bus.clone(),
                system_allocator,
                &mut vm,
                4, // Share the four pin interrupts (INTx#)
                Some(pcie_vcfg_range.start),
                #[cfg(feature = "swap")]
                swap_controller,
            )
            .map_err(Error::CreatePciRoot)?;

        let pci = Arc::new(Mutex::new(pci));
        pci.lock().enable_pcie_cfg_mmio(pcie_cfg_mmio_range.start);
        let pci_cfg = PciConfigIo::new(
            pci.clone(),
            components.break_linux_pci_config_io,
            vm_evt_wrtube.try_clone().map_err(Error::CloneTube)?,
        );
        let pci_bus = Arc::new(Mutex::new(pci_cfg));
        // PCI config access through I/O ports 0xcf8..=0xcff.
        io_bus.insert(pci_bus, 0xcf8, 0x8).unwrap();

        // PCI config access through the PCIe config MMIO range reserved above.
        let pcie_cfg_mmio = Arc::new(Mutex::new(PciConfigMmio::new(pci.clone(), 12)));
        let pcie_cfg_mmio_len = pcie_cfg_mmio_range.len().unwrap();
        mmio_bus
            .insert(pcie_cfg_mmio, pcie_cfg_mmio_range.start, pcie_cfg_mmio_len)
            .unwrap();

        let pcie_vcfg_mmio = Arc::new(Mutex::new(PciVirtualConfigMmio::new(pci.clone(), 13)));
        mmio_bus
            .insert(
                pcie_vcfg_mmio,
                pcie_vcfg_range.start,
                pcie_vcfg_range.len().unwrap(),
            )
            .unwrap();

        // Devices that are neither PCI nor virtio-mmio end up in `_others`, which is never used.
        let (virtio_mmio_devices, _others): (Vec<_>, Vec<_>) = devs
            .into_iter()
            .partition(|(dev, _)| dev.as_virtio_mmio_device().is_some());

        let virtio_mmio_devices = virtio_mmio_devices
            .into_iter()
            .map(|(dev, jail_orig)| (*(dev.into_virtio_mmio_device().unwrap()), jail_orig))
            .collect();
        // `generate_virtio_mmio_bus` takes ownership of the SDT list and hands back the list to
        // use from here on.
        let (mut virtio_mmio_pid, sdts) = arch::generate_virtio_mmio_bus(
            virtio_mmio_devices,
            irq_chip.as_irq_chip_mut(),
            &mmio_bus,
            system_allocator,
            &mut vm,
            components.acpi_sdts,
            #[cfg(feature = "swap")]
            swap_controller,
        )
        .map_err(Error::CreateVirtioMmioBus)?;
        components.acpi_sdts = sdts;
        pid_debug_label_map.append(&mut virtio_mmio_pid);

        // Event used to notify crosvm that guest OS is trying to suspend.
        let (suspend_tube_send, suspend_tube_recv) =
            Tube::directional_pair().map_err(Error::CreateTube)?;
        let suspend_tube_send = Arc::new(Mutex::new(suspend_tube_send));

        if components.fw_cfg_enable {
            Self::setup_fw_cfg_device(
                &io_bus,
                components.fw_cfg_parameters.clone(),
                components.bootorder_fw_cfg_blob.clone(),
                fw_cfg_jail,
                #[cfg(feature = "swap")]
                swap_controller,
            )?;
        }

        if !components.no_i8042 {
            Self::setup_legacy_i8042_device(
                &io_bus,
                irq_chip.pit_uses_speaker_port(),
                vm_evt_wrtube.try_clone().map_err(Error::CloneTube)?,
            )?;
        }
        // When the CMOS device is created, the host end of its tube is kept and later returned
        // to the caller via `vm_request_tubes`.
        let mut vm_request_tube = if !components.no_rtc {
            let (host_tube, device_tube) = Tube::pair()
                .context("create tube")
                .map_err(Error::SetupCmos)?;
            Self::setup_legacy_cmos_device(
                arch_memory_layout,
                &io_bus,
                irq_chip,
                device_tube,
                components.memory_size,
                components.hv_cfg.protection_type.runs_firmware(),
            )
            .map_err(Error::SetupCmos)?;
            Some(host_tube)
        } else {
            None
        };
        let serial_devices = Self::setup_serial_devices(
            components.hv_cfg.protection_type,
            irq_chip.as_irq_chip_mut(),
            &io_bus,
            serial_parameters,
            serial_jail,
            #[cfg(feature = "swap")]
            swap_controller,
        )?;
        Self::setup_debugcon_devices(
            components.hv_cfg.protection_type,
            &io_bus,
            serial_parameters,
            debugcon_jail,
            #[cfg(feature = "swap")]
            swap_controller,
        )?;

        // Size of the BIOS image on disk, or 0 when booting a kernel directly.
        let bios_size = if let VmImage::Bios(ref bios) = components.vm_image {
            bios.metadata().map_err(Error::LoadBios)?.len()
        } else {
            0
        };
        if let Some(pflash_image) = components.pflash_image {
            Self::setup_pflash(
                pflash_image,
                components.pflash_block_size,
                bios_size,
                &mmio_bus,
                pflash_jail,
                #[cfg(feature = "swap")]
                swap_controller,
            )?;
        }

        // Functions that use/create jails MUST be used before the call to
        // setup_acpi_devices below, as this moves us into a multiprocessing state
        // from which we can no longer fork.

        let mut resume_notify_devices = Vec::new();

        // Each bus occupies 1MB of MMIO for PCIe enhanced configuration.
        let max_bus = (pcie_cfg_mmio_len / 0x100000 - 1) as u8;
        let (mut acpi_dev_resource, bat_control) = Self::setup_acpi_devices(
            arch_memory_layout,
            pci.clone(),
            &mem,
            &io_bus,
            system_allocator,
            suspend_tube_send.clone(),
            vm_evt_wrtube.try_clone().map_err(Error::CloneTube)?,
            components.acpi_sdts,
            irq_chip.as_irq_chip_mut(),
            sci_irq,
            battery,
            &mmio_bus,
            max_bus,
            &mut resume_notify_devices,
            #[cfg(feature = "swap")]
            swap_controller,
            #[cfg(any(target_os = "android", target_os = "linux"))]
            components.ac_adapter,
            guest_suspended_cvar,
            &pci_irqs,
        )?;

        // Create customized SSDT table
        let sdt = acpi::create_customize_ssdt(pci.clone(), amls, gpe_scope_amls);
        if let Some(sdt) = sdt {
            acpi_dev_resource.sdts.push(sdt);
        }

        irq_chip
            .finalize_devices(system_allocator, &io_bus, &mmio_bus)
            .map_err(Error::RegisterIrqfd)?;

        // All of these bios generated tables are set manually for the benefit of the kernel boot
        // flow (since there's no BIOS to set it) and for the BIOS boot flow since crosvm doesn't
        // have a way to pass the BIOS these configs.
        // This works right now because the only guest BIOS used with crosvm (u-boot) ignores these
        // tables and the guest OS picks them up.
        // If another guest does need a way to pass these tables down to its BIOS, this approach
        // should be rethought.

        if mptable {
            // Note that this puts the mptable at 0x9FC00 in guest physical memory.
            mptable::setup_mptable(&mem, vcpu_count as u8, &pci_irqs)
                .map_err(Error::SetupMptable)?;
        }
        smbios::setup_smbios(&mem, &components.smbios, bios_size).map_err(Error::SetupSmbios)?;

        // The vCPU affinity is only forwarded to the ACPI tables when the guest is asked to
        // mirror the host CPU topology.
        let host_cpus = if components.host_cpu_topology {
            components.vcpu_affinity.clone()
        } else {
            None
        };

        // TODO (tjeznach) Write RSDP to bootconfig before writing to memory
        acpi::create_acpi_tables(
            &mem,
            vcpu_count as u8,
            sci_irq,
            0xcf9,
            6, // RST_CPU|SYS_RST
            &acpi_dev_resource,
            host_cpus,
            vcpu_ids,
            &pci_irqs,
            pcie_cfg_mmio_range.start,
            max_bus,
            components.force_s2idle,
        )
        .ok_or(Error::CreateAcpi)?;

        // Assemble the kernel command line: base parameters, serial console parameters, the
        // user's extra parameters and, if present, the ramoops region.
        let mut cmdline = Self::get_base_linux_cmdline();

        get_serial_cmdline(&mut cmdline, serial_parameters, "io", &serial_devices)
            .map_err(Error::GetSerialCmdline)?;

        for param in components.extra_kernel_params {
            cmdline.insert_str(&param).map_err(Error::Cmdline)?;
        }

        if let Some(ramoops_region) = ramoops_region {
            arch::pstore::add_ramoops_kernel_cmdline(&mut cmdline, &ramoops_region)
                .map_err(Error::Cmdline)?;
        }

        let pci_start = arch_memory_layout.pci_mmio_before_32bit.start;

        // One init state per vCPU; only entry 0 gets non-default registers below. `msrs` is
        // filled in by the boot path and copied to every vCPU afterwards.
        let mut vcpu_init = vec![VcpuInitX86_64::default(); vcpu_count];
        let mut msrs = BTreeMap::new();

        let protection_type = components.hv_cfg.protection_type;

        match components.vm_image {
            VmImage::Bios(ref mut bios) => {
                if protection_type.runs_firmware() {
                    return Err(Error::PvmFwBiosUnsupported);
                }

                // Allow a bios to hardcode CMDLINE_OFFSET and read the kernel command line from it.
                Self::load_cmdline(
                    &mem,
                    GuestAddress(CMDLINE_OFFSET),
                    cmdline,
                    CMDLINE_MAX_SIZE as usize - 1,
                )?;
                Self::load_bios(&mem, bios)?;
                regs::set_default_msrs(&mut msrs);
                // The default values for `Regs` and `Sregs` already set up the reset vector.
            }
            VmImage::Kernel(ref mut kernel_image) => {
                let (params, kernel_end, kernel_entry, cpu_mode, kernel_type) =
                    Self::load_kernel(&mem, kernel_image)?;

                info!("Loaded {} kernel", kernel_type);

                Self::setup_system_memory(
                    arch_memory_layout,
                    &mem,
                    cmdline,
                    components.initrd_image,
                    components.android_fstab,
                    kernel_end,
                    params,
                    dump_device_tree_blob,
                    device_tree_overlays,
                    protection_type.runs_firmware(),
                )?;

                if protection_type.needs_firmware_loaded() {
                    arch::load_image(
                        &mem,
                        &mut components
                            .pvm_fw
                            .expect("pvmfw must be available if ProtectionType loads it"),
                        GuestAddress(PROTECTED_VM_FW_START),
                        PROTECTED_VM_FW_MAX_SIZE,
                    )
                    .map_err(Error::LoadCustomPvmFw)?;
                }

                // With protected-VM firmware the first vCPU starts in the firmware rather than
                // at the kernel entry point.
                let entry_addr = if protection_type.runs_firmware() {
                    PROTECTED_VM_FW_START
                } else {
                    kernel_entry.offset()
                };

                vcpu_init[0].regs.rip = entry_addr;

                match kernel_type {
                    KernelType::BzImage | KernelType::Elf => {
                        // Configure the bootstrap VCPU for the Linux/x86 boot protocol.
                        // <https://www.kernel.org/doc/html/latest/x86/boot.html>
                        vcpu_init[0].regs.rsp = BOOT_STACK_POINTER;
                        vcpu_init[0].regs.rsi = ZERO_PAGE_OFFSET;
                    }
                }

                if protection_type.runs_firmware() {
                    // Pass pVM payload entry address to pVM firmware.
                    // NOTE: this ABI is subject to change. Possibly we will pass
                    // all the needed info (payload entry, start and size) in in-memory
                    // structures (e.g. DTB) instead.
                    vcpu_init[0].regs.rdi = kernel_entry.offset();
                }

                match cpu_mode {
                    CpuMode::LongMode => {
                        regs::set_long_mode_msrs(&mut msrs);

                        // Set up long mode and enable paging.
                        regs::configure_segments_and_sregs(&mem, &mut vcpu_init[0].sregs)
                            .map_err(Error::ConfigureSegments)?;
                        regs::setup_page_tables(&mem, &mut vcpu_init[0].sregs)
                            .map_err(Error::SetupPageTables)?;
                    }
                    CpuMode::FlatProtectedMode => {
                        regs::set_default_msrs(&mut msrs);

                        // Set up 32-bit protected mode with paging disabled.
                        regs::configure_segments_and_sregs_flat32(&mem, &mut vcpu_init[0].sregs)
                            .map_err(Error::ConfigureSegments)?;
                    }
                }

                // MTRR MSRs are only added on the kernel boot path.
                regs::set_mtrr_msrs(&mut msrs, &vm, pci_start);
            }
        }

        // Initialize MSRs for all VCPUs.
        for vcpu in vcpu_init.iter_mut() {
            vcpu.msrs = msrs.clone();
        }

        // Holds the CMOS host tube if that device was created, otherwise stays empty.
        let mut vm_request_tubes = Vec::new();
        if let Some(req_tube) = vm_request_tube.take() {
            vm_request_tubes.push(req_tube);
        }

        Ok(RunnableLinuxVm {
            vm,
            vcpu_count,
            vcpus: None,
            vcpu_affinity: components.vcpu_affinity,
            vcpu_init,
            no_smt: components.no_smt,
            irq_chip: irq_chip.try_box_clone().map_err(Error::CloneIrqChip)?,
            io_bus,
            mmio_bus,
            pid_debug_label_map,
            suspend_tube: (suspend_tube_send, suspend_tube_recv),
            resume_notify_devices,
            rt_cpus: components.rt_cpus,
            delay_rt: components.delay_rt,
            bat_control,
            pm: Some(acpi_dev_resource.pm),
            root_config: pci,
            #[cfg(any(target_os = "android", target_os = "linux"))]
            platform_devices: Vec::new(),
            hotplug_bus: BTreeMap::new(),
            devices_thread: None,
            vm_request_tubes,
        })
    }
1268 
configure_vcpu<V: Vm>( vm: &V, hypervisor: &dyn HypervisorX86_64, irq_chip: &mut dyn IrqChipX86_64, vcpu: &mut dyn VcpuX86_64, vcpu_init: VcpuInitX86_64, vcpu_id: usize, num_cpus: usize, cpu_config: Option<CpuConfigX86_64>, ) -> Result<()>1269     fn configure_vcpu<V: Vm>(
1270         vm: &V,
1271         hypervisor: &dyn HypervisorX86_64,
1272         irq_chip: &mut dyn IrqChipX86_64,
1273         vcpu: &mut dyn VcpuX86_64,
1274         vcpu_init: VcpuInitX86_64,
1275         vcpu_id: usize,
1276         num_cpus: usize,
1277         cpu_config: Option<CpuConfigX86_64>,
1278     ) -> Result<()> {
1279         let cpu_config = match cpu_config {
1280             Some(config) => config,
1281             None => return Err(Error::InvalidCpuConfig),
1282         };
1283         if !vm.check_capability(VmCap::EarlyInitCpuid) {
1284             cpuid::setup_cpuid(hypervisor, irq_chip, vcpu, vcpu_id, num_cpus, cpu_config)
1285                 .map_err(Error::SetupCpuid)?;
1286         }
1287 
1288         vcpu.set_regs(&vcpu_init.regs).map_err(Error::WriteRegs)?;
1289 
1290         vcpu.set_sregs(&vcpu_init.sregs)
1291             .map_err(Error::SetupSregs)?;
1292 
1293         vcpu.set_fpu(&vcpu_init.fpu).map_err(Error::SetupFpu)?;
1294 
1295         let vcpu_supported_var_mtrrs = regs::vcpu_supported_variable_mtrrs(vcpu);
1296         let num_var_mtrrs = regs::count_variable_mtrrs(&vcpu_init.msrs);
1297         let skip_mtrr_msrs = if num_var_mtrrs > vcpu_supported_var_mtrrs {
1298             warn!(
1299                 "Too many variable MTRR entries ({} required, {} supported),
1300                 please check pci_start addr, guest with pass through device may be very slow",
1301                 num_var_mtrrs, vcpu_supported_var_mtrrs,
1302             );
1303             // Filter out the MTRR entries from the MSR list.
1304             true
1305         } else {
1306             false
1307         };
1308 
1309         for (msr_index, value) in vcpu_init.msrs.into_iter() {
1310             if skip_mtrr_msrs && regs::is_mtrr_msr(msr_index) {
1311                 continue;
1312             }
1313 
1314             vcpu.set_msr(msr_index, value).map_err(Error::SetupMsrs)?;
1315         }
1316 
1317         interrupts::set_lint(vcpu_id, irq_chip).map_err(Error::SetLint)?;
1318 
1319         Ok(())
1320     }
1321 
register_pci_device<V: VmX86_64, Vcpu: VcpuX86_64>( linux: &mut RunnableLinuxVm<V, Vcpu>, device: Box<dyn PciDevice>, #[cfg(any(target_os = "android", target_os = "linux"))] minijail: Option<Minijail>, resources: &mut SystemAllocator, hp_control_tube: &mpsc::Sender<PciRootCommand>, #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>, ) -> Result<PciAddress>1322     fn register_pci_device<V: VmX86_64, Vcpu: VcpuX86_64>(
1323         linux: &mut RunnableLinuxVm<V, Vcpu>,
1324         device: Box<dyn PciDevice>,
1325         #[cfg(any(target_os = "android", target_os = "linux"))] minijail: Option<Minijail>,
1326         resources: &mut SystemAllocator,
1327         hp_control_tube: &mpsc::Sender<PciRootCommand>,
1328         #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
1329     ) -> Result<PciAddress> {
1330         arch::configure_pci_device(
1331             linux,
1332             device,
1333             #[cfg(any(target_os = "android", target_os = "linux"))]
1334             minijail,
1335             resources,
1336             hp_control_tube,
1337             #[cfg(feature = "swap")]
1338             swap_controller,
1339         )
1340         .map_err(Error::ConfigurePciDevice)
1341     }
1342 
get_host_cpu_frequencies_khz() -> Result<BTreeMap<usize, Vec<u32>>>1343     fn get_host_cpu_frequencies_khz() -> Result<BTreeMap<usize, Vec<u32>>> {
1344         Ok(BTreeMap::new())
1345     }
1346 
get_host_cpu_max_freq_khz() -> Result<BTreeMap<usize, u32>>1347     fn get_host_cpu_max_freq_khz() -> Result<BTreeMap<usize, u32>> {
1348         Ok(BTreeMap::new())
1349     }
1350 
get_host_cpu_capacity() -> Result<BTreeMap<usize, u32>>1351     fn get_host_cpu_capacity() -> Result<BTreeMap<usize, u32>> {
1352         Ok(BTreeMap::new())
1353     }
1354 
get_host_cpu_clusters() -> Result<Vec<CpuSet>>1355     fn get_host_cpu_clusters() -> Result<Vec<CpuSet>> {
1356         Ok(Vec::new())
1357     }
1358 }
1359 
// Status bit returned through CDW1, the first DWORD of the _OSC capabilities buffer. The
// `_OSC` method generated by `PciRootOSC` sets it when the caller's UUID is not the one it
// recognizes.
const OSC_STATUS_UNSUPPORT_UUID: u32 = 0x4;
// Bits of the PCI host bridge _OSC control field returned through CDW3, the DWORD at byte
// offset 8 of the capabilities buffer. `PciRootOSC` only references SHPC_HP and PCIE_AER
// (it clears them); the remaining bits are unused in this file, hence the `dead_code` allowances.
#[allow(dead_code)]
const PCI_HB_OSC_CONTROL_PCIE_HP: u32 = 0x1;
const PCI_HB_OSC_CONTROL_SHPC_HP: u32 = 0x2;
#[allow(dead_code)]
const PCI_HB_OSC_CONTROL_PCIE_PME: u32 = 0x4;
const PCI_HB_OSC_CONTROL_PCIE_AER: u32 = 0x8;
#[allow(dead_code)]
const PCI_HB_OSC_CONTROL_PCIE_CAP: u32 = 0x10;
1371 
1372 struct PciRootOSC {}
1373 
1374 // Method (_OSC, 4, NotSerialized)  // _OSC: Operating System Capabilities
1375 // {
1376 //     CreateDWordField (Arg3, Zero, CDW1)  // flag and return value
1377 //     If (Arg0 == ToUUID ("33db4d5b-1ff7-401c-9657-7441c03dd766"))
1378 //     {
1379 //         CreateDWordField (Arg3, 8, CDW3) // control field
1380 //         if ( 0 == (CDW1 & 0x01))  // Query flag ?
1381 //         {
1382 //              CDW3 &= !(SHPC_HP | AER)
1383 //         }
1384 //     } Else {
1385 //         CDW1 |= UNSUPPORT_UUID
1386 //     }
1387 //     Return (Arg3)
1388 // }
1389 impl Aml for PciRootOSC {
to_aml_bytes(&self, aml: &mut Vec<u8>)1390     fn to_aml_bytes(&self, aml: &mut Vec<u8>) {
1391         let osc_uuid = "33DB4D5B-1FF7-401C-9657-7441C03DD766";
1392         // virtual pcie root port supports hotplug, pme, and pcie cap register, clear all
1393         // the other bits.
1394         let mask = !(PCI_HB_OSC_CONTROL_SHPC_HP | PCI_HB_OSC_CONTROL_PCIE_AER);
1395         aml::Method::new(
1396             "_OSC".into(),
1397             4,
1398             false,
1399             vec![
1400                 &aml::CreateDWordField::new(
1401                     &aml::Name::new_field_name("CDW1"),
1402                     &aml::Arg(3),
1403                     &aml::ZERO,
1404                 ),
1405                 &aml::If::new(
1406                     &aml::Equal::new(&aml::Arg(0), &aml::Uuid::new(osc_uuid)),
1407                     vec![
1408                         &aml::CreateDWordField::new(
1409                             &aml::Name::new_field_name("CDW3"),
1410                             &aml::Arg(3),
1411                             &(8_u8),
1412                         ),
1413                         &aml::If::new(
1414                             &aml::Equal::new(
1415                                 &aml::ZERO,
1416                                 &aml::And::new(
1417                                     &aml::ZERO,
1418                                     &aml::Name::new_field_name("CDW1"),
1419                                     &aml::ONE,
1420                                 ),
1421                             ),
1422                             vec![&aml::And::new(
1423                                 &aml::Name::new_field_name("CDW3"),
1424                                 &mask,
1425                                 &aml::Name::new_field_name("CDW3"),
1426                             )],
1427                         ),
1428                     ],
1429                 ),
1430                 &aml::Else::new(vec![&aml::Or::new(
1431                     &aml::Name::new_field_name("CDW1"),
1432                     &OSC_STATUS_UNSUPPORT_UUID,
1433                     &aml::Name::new_field_name("CDW1"),
1434                 )]),
1435                 &aml::Return::new(&aml::Arg(3)),
1436             ],
1437         )
1438         .to_aml_bytes(aml)
1439     }
1440 }
1441 
/// CPU operating mode in which the first vCPU starts executing a loaded kernel.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum CpuMode {
    /// 32-bit protected mode with paging disabled.
    FlatProtectedMode,

    /// 64-bit long mode.
    LongMode,
}
1449 
/// Format of a loaded kernel image.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum KernelType {
    BzImage,
    Elf,
}

impl fmt::Display for KernelType {
    /// Writes the human-readable name of the kernel format (`bzImage` or `ELF`).
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let name = match self {
            KernelType::BzImage => "bzImage",
            KernelType::Elf => "ELF",
        };
        f.write_str(name)
    }
}
1464 
1465 impl X8664arch {
1466     /// Loads the bios from an open file.
1467     ///
1468     /// # Arguments
1469     ///
1470     /// * `mem` - The memory to be used by the guest.
1471     /// * `bios_image` - the File object for the specified bios
load_bios(mem: &GuestMemory, bios_image: &mut File) -> Result<()>1472     fn load_bios(mem: &GuestMemory, bios_image: &mut File) -> Result<()> {
1473         let bios_image_length = bios_image.get_len().map_err(Error::LoadBios)?;
1474         if bios_image_length >= FIRST_ADDR_PAST_32BITS {
1475             return Err(Error::LoadBios(io::Error::new(
1476                 io::ErrorKind::InvalidData,
1477                 format!(
1478                     "bios was {} bytes, expected less than {}",
1479                     bios_image_length, FIRST_ADDR_PAST_32BITS,
1480                 ),
1481             )));
1482         }
1483 
1484         let guest_slice = mem
1485             .get_slice_at_addr(bios_start(bios_image_length), bios_image_length as usize)
1486             .map_err(Error::SetupGuestMemory)?;
1487         bios_image
1488             .read_exact_at_volatile(guest_slice, 0)
1489             .map_err(Error::LoadBios)?;
1490         Ok(())
1491     }
1492 
setup_pflash( pflash_image: File, block_size: u32, bios_size: u64, mmio_bus: &Bus, jail: Option<Minijail>, #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>, ) -> Result<()>1493     fn setup_pflash(
1494         pflash_image: File,
1495         block_size: u32,
1496         bios_size: u64,
1497         mmio_bus: &Bus,
1498         jail: Option<Minijail>,
1499         #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
1500     ) -> Result<()> {
1501         let size = pflash_image.metadata().map_err(Error::LoadPflash)?.len();
1502         let start = FIRST_ADDR_PAST_32BITS - bios_size - size;
1503         let pflash_image = Box::new(pflash_image);
1504 
1505         #[cfg(any(target_os = "android", target_os = "linux"))]
1506         let fds = pflash_image.as_raw_descriptors();
1507 
1508         let pflash = Pflash::new(pflash_image, block_size).map_err(Error::SetupPflash)?;
1509         let pflash: Arc<Mutex<dyn BusDevice>> = match jail {
1510             #[cfg(any(target_os = "android", target_os = "linux"))]
1511             Some(jail) => Arc::new(Mutex::new(
1512                 ProxyDevice::new(
1513                     pflash,
1514                     jail,
1515                     fds,
1516                     #[cfg(feature = "swap")]
1517                     swap_controller,
1518                 )
1519                 .map_err(Error::CreateProxyDevice)?,
1520             )),
1521             #[cfg(windows)]
1522             Some(_) => unreachable!(),
1523             None => Arc::new(Mutex::new(pflash)),
1524         };
1525         mmio_bus
1526             .insert(pflash, start, size)
1527             .map_err(Error::InsertBus)?;
1528 
1529         Ok(())
1530     }
1531 
1532     /// Writes the command line string to the given memory slice.
1533     ///
1534     /// # Arguments
1535     ///
1536     /// * `guest_mem` - A u8 slice that will be partially overwritten by the command line.
1537     /// * `guest_addr` - The address in `guest_mem` at which to load the command line.
1538     /// * `cmdline` - The kernel command line.
1539     /// * `kernel_max_cmdline_len` - The maximum command line length (without NUL terminator)
1540     ///   supported by the kernel.
load_cmdline( guest_mem: &GuestMemory, guest_addr: GuestAddress, cmdline: kernel_cmdline::Cmdline, kernel_max_cmdline_len: usize, ) -> Result<()>1541     fn load_cmdline(
1542         guest_mem: &GuestMemory,
1543         guest_addr: GuestAddress,
1544         cmdline: kernel_cmdline::Cmdline,
1545         kernel_max_cmdline_len: usize,
1546     ) -> Result<()> {
1547         let mut cmdline_guest_mem_slice = guest_mem
1548             .get_slice_at_addr(guest_addr, CMDLINE_MAX_SIZE as usize)
1549             .map_err(|_| Error::CommandLineOverflow)?;
1550 
1551         let mut cmdline_bytes: Vec<u8> = cmdline
1552             .into_bytes_with_max_len(kernel_max_cmdline_len)
1553             .map_err(Error::Cmdline)?;
1554         cmdline_bytes.push(0u8); // Add NUL terminator.
1555 
1556         cmdline_guest_mem_slice
1557             .write_all(&cmdline_bytes)
1558             .map_err(|_| Error::CommandLineOverflow)?;
1559 
1560         Ok(())
1561     }
1562 
1563     /// Loads the kernel from an open file.
1564     ///
1565     /// # Arguments
1566     ///
1567     /// * `mem` - The memory to be used by the guest.
1568     /// * `kernel_image` - the File object for the specified kernel.
1569     ///
1570     /// # Returns
1571     ///
1572     /// On success, returns the Linux x86_64 boot protocol parameters, the first address past the
1573     /// end of the kernel, the entry point (initial `RIP` value), the initial CPU mode, and the type
1574     /// of kernel.
load_kernel( mem: &GuestMemory, kernel_image: &mut File, ) -> Result<(boot_params, u64, GuestAddress, CpuMode, KernelType)>1575     fn load_kernel(
1576         mem: &GuestMemory,
1577         kernel_image: &mut File,
1578     ) -> Result<(boot_params, u64, GuestAddress, CpuMode, KernelType)> {
1579         let kernel_start = GuestAddress(KERNEL_START_OFFSET);
1580         match kernel_loader::load_elf64(mem, kernel_start, kernel_image, 0) {
1581             Ok(loaded_kernel) => {
1582                 // ELF kernels don't contain a `boot_params` structure, so synthesize a default one.
1583                 let boot_params = boot_params {
1584                     hdr: setup_header {
1585                         cmdline_size: CMDLINE_MAX_SIZE as u32 - 1,
1586                         ..Default::default()
1587                     },
1588                     ..Default::default()
1589                 };
1590                 Ok((
1591                     boot_params,
1592                     loaded_kernel.address_range.end,
1593                     loaded_kernel.entry,
1594                     CpuMode::LongMode,
1595                     KernelType::Elf,
1596                 ))
1597             }
1598             Err(kernel_loader::Error::InvalidMagicNumber) => {
1599                 // The image failed to parse as ELF, so try to load it as a bzImage.
1600                 let (boot_params, bzimage_end, bzimage_entry, cpu_mode) =
1601                     bzimage::load_bzimage(mem, kernel_start, kernel_image)
1602                         .map_err(Error::LoadBzImage)?;
1603                 Ok((
1604                     boot_params,
1605                     bzimage_end,
1606                     bzimage_entry,
1607                     cpu_mode,
1608                     KernelType::BzImage,
1609                 ))
1610             }
1611             Err(e) => Err(Error::LoadKernel(e)),
1612         }
1613     }
1614 
1615     /// Configures the system memory space should be called once per vm before
1616     /// starting vcpu threads.
1617     ///
1618     /// # Arguments
1619     ///
1620     /// * `mem` - The memory to be used by the guest.
1621     /// * `cmdline` - the kernel commandline
1622     /// * `initrd_file` - an initial ramdisk image
setup_system_memory( arch_memory_layout: &ArchMemoryLayout, mem: &GuestMemory, cmdline: kernel_cmdline::Cmdline, initrd_file: Option<File>, android_fstab: Option<File>, kernel_end: u64, params: boot_params, dump_device_tree_blob: Option<PathBuf>, device_tree_overlays: Vec<DtbOverlay>, has_protected_vm_firmware: bool, ) -> Result<()>1623     pub fn setup_system_memory(
1624         arch_memory_layout: &ArchMemoryLayout,
1625         mem: &GuestMemory,
1626         cmdline: kernel_cmdline::Cmdline,
1627         initrd_file: Option<File>,
1628         android_fstab: Option<File>,
1629         kernel_end: u64,
1630         params: boot_params,
1631         dump_device_tree_blob: Option<PathBuf>,
1632         device_tree_overlays: Vec<DtbOverlay>,
1633         has_protected_vm_firmware: bool,
1634     ) -> Result<()> {
1635         // Some guest kernels expect a typical PC memory layout where the region between 640 KB and
1636         // 1 MB is reserved for device memory/ROMs and get confused if there is a RAM region
1637         // spanning this area, so we provide the traditional 640 KB low memory and 1 MB+
1638         // high memory regions.
1639         let ram_below_1m_end = 640 * 1024;
1640         let ram_below_1m = AddressRange {
1641             start: START_OF_RAM_32BITS,
1642             end: ram_below_1m_end - 1,
1643         };
1644 
1645         // GuestMemory::end_addr() returns the first address past the end, so subtract 1 to get the
1646         // inclusive end.
1647         let guest_mem_end = mem.end_addr().offset() - 1;
1648 
1649         // Find the end of the part of guest memory below 4G that is not pVM firmware memory.
1650         // This part of guest memory includes just one region, so just find the end of this region.
1651         let max_ram_end_below_4g =
1652             max_ram_end_before_32bit(arch_memory_layout, has_protected_vm_firmware) - 1;
1653         let guest_mem_end_below_4g = mem
1654             .regions()
1655             .map(|r| r.guest_addr.offset() + r.size as u64 - 1)
1656             .find(|&addr| addr <= max_ram_end_below_4g)
1657             .expect("no memory region below 4G");
1658 
1659         let ram_below_4g = AddressRange {
1660             start: FIRST_ADDR_PAST_20BITS,
1661             end: guest_mem_end_below_4g,
1662         };
1663         let ram_above_4g = AddressRange {
1664             start: FIRST_ADDR_PAST_32BITS,
1665             end: guest_mem_end,
1666         };
1667 
1668         let e820_entries = generate_e820_memory_map(
1669             arch_memory_layout,
1670             mem,
1671             ram_below_1m,
1672             ram_below_4g,
1673             ram_above_4g,
1674             has_protected_vm_firmware,
1675         )?;
1676 
1677         let kernel_max_cmdline_len = if params.hdr.cmdline_size == 0 {
1678             // Old kernels have a maximum length of 255 bytes, not including the NUL.
1679             255
1680         } else {
1681             params.hdr.cmdline_size as usize
1682         };
1683         debug!("kernel_max_cmdline_len={kernel_max_cmdline_len}");
1684         Self::load_cmdline(
1685             mem,
1686             GuestAddress(CMDLINE_OFFSET),
1687             cmdline,
1688             kernel_max_cmdline_len,
1689         )?;
1690 
1691         let mut setup_data = Vec::<SetupData>::new();
1692         if android_fstab.is_some() || !device_tree_overlays.is_empty() {
1693             let device_tree_blob =
1694                 fdt::create_fdt(android_fstab, dump_device_tree_blob, device_tree_overlays)
1695                     .map_err(Error::CreateFdt)?;
1696             setup_data.push(SetupData {
1697                 data: device_tree_blob,
1698                 type_: SetupDataType::Dtb,
1699             });
1700         }
1701 
1702         setup_data.push(setup_data_rng_seed());
1703 
1704         let setup_data = write_setup_data(
1705             mem,
1706             GuestAddress(SETUP_DATA_START),
1707             GuestAddress(SETUP_DATA_END),
1708             &setup_data,
1709         )?;
1710 
1711         let initrd = match initrd_file {
1712             Some(mut initrd_file) => {
1713                 let initrd_addr_max = if params.hdr.xloadflags & XLF_CAN_BE_LOADED_ABOVE_4G != 0 {
1714                     u64::MAX
1715                 } else if params.hdr.initrd_addr_max == 0 {
1716                     // Default initrd_addr_max for old kernels (see Documentation/x86/boot.txt).
1717                     0x37FFFFFF
1718                 } else {
1719                     u64::from(params.hdr.initrd_addr_max)
1720                 };
1721 
1722                 let (initrd_start, initrd_size) = arch::load_image_high(
1723                     mem,
1724                     &mut initrd_file,
1725                     GuestAddress(kernel_end),
1726                     GuestAddress(initrd_addr_max),
1727                     Some(|region| {
1728                         region.options.purpose != MemoryRegionPurpose::ProtectedFirmwareRegion
1729                     }),
1730                     base::pagesize() as u64,
1731                 )
1732                 .map_err(Error::LoadInitrd)?;
1733                 Some((initrd_start, initrd_size))
1734             }
1735             None => None,
1736         };
1737 
1738         configure_system(
1739             mem,
1740             GuestAddress(CMDLINE_OFFSET),
1741             setup_data,
1742             initrd,
1743             params,
1744             &e820_entries,
1745         )?;
1746         Ok(())
1747     }
1748 
get_pcie_vcfg_mmio_range(mem: &GuestMemory, pcie_cfg_mmio: &AddressRange) -> AddressRange1749     fn get_pcie_vcfg_mmio_range(mem: &GuestMemory, pcie_cfg_mmio: &AddressRange) -> AddressRange {
1750         // Put PCIe VCFG region at a 2MB boundary after physical memory or 4gb, whichever is
1751         // greater.
1752         let ram_end_round_2mb = (mem.end_addr().offset() + 2 * MB - 1) / (2 * MB) * (2 * MB);
1753         let start = std::cmp::max(ram_end_round_2mb, 4 * GB);
1754         // Each pci device's ECAM size is 4kb and its vcfg size is 8kb
1755         let end = start + pcie_cfg_mmio.len().unwrap() * 2 - 1;
1756         AddressRange { start, end }
1757     }
1758 
1759     /// Returns the high mmio range
get_high_mmio_range<V: Vm>(vm: &V, arch_memory_layout: &ArchMemoryLayout) -> AddressRange1760     fn get_high_mmio_range<V: Vm>(vm: &V, arch_memory_layout: &ArchMemoryLayout) -> AddressRange {
1761         let mem = vm.get_memory();
1762         let start = Self::get_pcie_vcfg_mmio_range(mem, &arch_memory_layout.pcie_cfg_mmio).end + 1;
1763 
1764         let phys_mem_end = (1u64 << vm.get_guest_phys_addr_bits()) - 1;
1765         let high_mmio_end = std::cmp::min(phys_mem_end, HIGH_MMIO_MAX_END);
1766 
1767         AddressRange {
1768             start,
1769             end: high_mmio_end,
1770         }
1771     }
1772 
1773     /// This returns a minimal kernel command for this architecture
get_base_linux_cmdline() -> kernel_cmdline::Cmdline1774     pub fn get_base_linux_cmdline() -> kernel_cmdline::Cmdline {
1775         let mut cmdline = kernel_cmdline::Cmdline::new();
1776         cmdline.insert_str("panic=-1").unwrap();
1777 
1778         cmdline
1779     }
1780 
1781     /// Sets up fw_cfg device.
1782     ///  # Arguments
1783     ///
1784     /// * `io_bus` - the IO bus object
1785     /// * `fw_cfg_parameters` - command-line specified data to add to device. May contain all None
1786     ///   fields if user did not specify data to add to the device
setup_fw_cfg_device( io_bus: &Bus, fw_cfg_parameters: Vec<FwCfgParameters>, bootorder_fw_cfg_blob: Vec<u8>, fw_cfg_jail: Option<Minijail>, #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>, ) -> Result<()>1787     fn setup_fw_cfg_device(
1788         io_bus: &Bus,
1789         fw_cfg_parameters: Vec<FwCfgParameters>,
1790         bootorder_fw_cfg_blob: Vec<u8>,
1791         fw_cfg_jail: Option<Minijail>,
1792         #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
1793     ) -> Result<()> {
1794         let fw_cfg = match devices::FwCfgDevice::new(FW_CFG_MAX_FILE_SLOTS, fw_cfg_parameters) {
1795             Ok(mut device) => {
1796                 // this condition will only be true if the user specified at least one bootindex
1797                 // option on the command line. If none were specified, bootorder_fw_cfg_blob will
1798                 // only have a null byte (null terminator)
1799                 if bootorder_fw_cfg_blob.len() > 1 {
1800                     // Add boot order file to the device. If the file is not present, firmware may
1801                     // not be able to boot.
1802                     if let Err(err) = device.add_file(
1803                         "bootorder",
1804                         bootorder_fw_cfg_blob,
1805                         devices::FwCfgItemType::GenericItem,
1806                     ) {
1807                         return Err(Error::CreateFwCfgDevice(err));
1808                     }
1809                 }
1810                 device
1811             }
1812             Err(err) => {
1813                 return Err(Error::CreateFwCfgDevice(err));
1814             }
1815         };
1816 
1817         let fw_cfg: Arc<Mutex<dyn BusDevice>> = match fw_cfg_jail.as_ref() {
1818             #[cfg(any(target_os = "android", target_os = "linux"))]
1819             Some(jail) => {
1820                 let jail_clone = jail.try_clone().map_err(Error::CloneJail)?;
1821                 #[cfg(feature = "seccomp_trace")]
1822                 debug!(
1823                     "seccomp_trace {{\"event\": \"minijail_clone\", \"src_jail_addr\": \"0x{:x}\", \"dst_jail_addr\": \"0x{:x}\"}}",
1824                     read_jail_addr(jail),
1825                     read_jail_addr(&jail_clone)
1826                 );
1827                 Arc::new(Mutex::new(
1828                     ProxyDevice::new(
1829                         fw_cfg,
1830                         jail_clone,
1831                         Vec::new(),
1832                         #[cfg(feature = "swap")]
1833                         swap_controller,
1834                     )
1835                     .map_err(Error::CreateProxyDevice)?,
1836                 ))
1837             }
1838             #[cfg(windows)]
1839             Some(_) => unreachable!(),
1840             None => Arc::new(Mutex::new(fw_cfg)),
1841         };
1842 
1843         io_bus
1844             .insert(fw_cfg, FW_CFG_BASE_PORT, FW_CFG_WIDTH)
1845             .map_err(Error::InsertBus)?;
1846 
1847         Ok(())
1848     }
1849 
1850     /// Sets up the legacy x86 i8042/KBD platform device
1851     ///
1852     /// # Arguments
1853     ///
1854     /// * - `io_bus` - the IO bus object
1855     /// * - `pit_uses_speaker_port` - does the PIT use port 0x61 for the PC speaker
1856     /// * - `vm_evt_wrtube` - the event object which should receive exit events
setup_legacy_i8042_device( io_bus: &Bus, pit_uses_speaker_port: bool, vm_evt_wrtube: SendTube, ) -> Result<()>1857     pub fn setup_legacy_i8042_device(
1858         io_bus: &Bus,
1859         pit_uses_speaker_port: bool,
1860         vm_evt_wrtube: SendTube,
1861     ) -> Result<()> {
1862         let i8042 = Arc::new(Mutex::new(devices::I8042Device::new(
1863             vm_evt_wrtube.try_clone().map_err(Error::CloneTube)?,
1864         )));
1865 
1866         if pit_uses_speaker_port {
1867             io_bus.insert(i8042, 0x062, 0x3).unwrap();
1868         } else {
1869             io_bus.insert(i8042, 0x061, 0x4).unwrap();
1870         }
1871 
1872         Ok(())
1873     }
1874 
1875     /// Sets up the legacy x86 CMOS/RTC platform device
1876     /// # Arguments
1877     ///
1878     /// * - `io_bus` - the IO bus object
1879     /// * - `mem_size` - the size in bytes of physical ram for the guest
setup_legacy_cmos_device( arch_memory_layout: &ArchMemoryLayout, io_bus: &Bus, irq_chip: &mut dyn IrqChipX86_64, vm_control: Tube, mem_size: u64, has_protected_vm_firmware: bool, ) -> anyhow::Result<()>1880     pub fn setup_legacy_cmos_device(
1881         arch_memory_layout: &ArchMemoryLayout,
1882         io_bus: &Bus,
1883         irq_chip: &mut dyn IrqChipX86_64,
1884         vm_control: Tube,
1885         mem_size: u64,
1886         has_protected_vm_firmware: bool,
1887     ) -> anyhow::Result<()> {
1888         let mem_regions = arch_memory_regions(
1889             arch_memory_layout,
1890             mem_size,
1891             None,
1892             has_protected_vm_firmware,
1893         );
1894 
1895         let mem_below_4g = mem_regions
1896             .iter()
1897             .filter(|r| r.0.offset() < FIRST_ADDR_PAST_32BITS)
1898             .map(|r| r.1)
1899             .sum();
1900 
1901         let mem_above_4g = mem_regions
1902             .iter()
1903             .filter(|r| r.0.offset() >= FIRST_ADDR_PAST_32BITS)
1904             .map(|r| r.1)
1905             .sum();
1906 
1907         let irq_evt = devices::IrqEdgeEvent::new().context("cmos irq")?;
1908         let cmos = devices::cmos::Cmos::new(
1909             mem_below_4g,
1910             mem_above_4g,
1911             Utc::now,
1912             vm_control,
1913             irq_evt.try_clone().context("cmos irq clone")?,
1914         )
1915         .context("create cmos")?;
1916 
1917         irq_chip
1918             .register_edge_irq_event(
1919                 devices::cmos::RTC_IRQ as u32,
1920                 &irq_evt,
1921                 IrqEventSource::from_device(&cmos),
1922             )
1923             .context("cmos register irq")?;
1924         io_bus
1925             .insert(Arc::new(Mutex::new(cmos)), 0x70, 0x2)
1926             .context("cmos insert irq")?;
1927 
1928         Ok(())
1929     }
1930 
1931     /// Sets up the acpi devices for this platform and
1932     /// return the resources which is used to set the ACPI tables.
1933     ///
1934     /// # Arguments
1935     ///
1936     /// * `io_bus` the I/O bus to add the devices to
1937     /// * `resources` the SystemAllocator to allocate IO and MMIO for acpi devices.
1938     /// * `suspend_tube` the tube object which used to suspend/resume the VM.
1939     /// * `sdts` ACPI system description tables
1940     /// * `irq_chip` the IrqChip object for registering irq events
1941     /// * `battery` indicate whether to create the battery
1942     /// * `mmio_bus` the MMIO bus to add the devices to
1943     /// * `pci_irqs` IRQ assignment of PCI devices. Tuples of (PCI address, gsi, PCI interrupt pin).
1944     ///   Note that this matches one of the return values of generate_pci_root.
setup_acpi_devices( arch_memory_layout: &ArchMemoryLayout, pci_root: Arc<Mutex<PciRoot>>, mem: &GuestMemory, io_bus: &Bus, resources: &mut SystemAllocator, suspend_tube: Arc<Mutex<SendTube>>, vm_evt_wrtube: SendTube, sdts: Vec<SDT>, irq_chip: &mut dyn IrqChip, sci_irq: u32, battery: (Option<BatteryType>, Option<Minijail>), #[cfg_attr(windows, allow(unused_variables))] mmio_bus: &Bus, max_bus: u8, resume_notify_devices: &mut Vec<Arc<Mutex<dyn BusResumeDevice>>>, #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>, #[cfg(any(target_os = "android", target_os = "linux"))] ac_adapter: bool, guest_suspended_cvar: Option<Arc<(Mutex<bool>, Condvar)>>, pci_irqs: &[(PciAddress, u32, PciInterruptPin)], ) -> Result<(acpi::AcpiDevResource, Option<BatControl>)>1945     pub fn setup_acpi_devices(
1946         arch_memory_layout: &ArchMemoryLayout,
1947         pci_root: Arc<Mutex<PciRoot>>,
1948         mem: &GuestMemory,
1949         io_bus: &Bus,
1950         resources: &mut SystemAllocator,
1951         suspend_tube: Arc<Mutex<SendTube>>,
1952         vm_evt_wrtube: SendTube,
1953         sdts: Vec<SDT>,
1954         irq_chip: &mut dyn IrqChip,
1955         sci_irq: u32,
1956         battery: (Option<BatteryType>, Option<Minijail>),
1957         #[cfg_attr(windows, allow(unused_variables))] mmio_bus: &Bus,
1958         max_bus: u8,
1959         resume_notify_devices: &mut Vec<Arc<Mutex<dyn BusResumeDevice>>>,
1960         #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
1961         #[cfg(any(target_os = "android", target_os = "linux"))] ac_adapter: bool,
1962         guest_suspended_cvar: Option<Arc<(Mutex<bool>, Condvar)>>,
1963         pci_irqs: &[(PciAddress, u32, PciInterruptPin)],
1964     ) -> Result<(acpi::AcpiDevResource, Option<BatControl>)> {
1965         // The AML data for the acpi devices
1966         let mut amls = Vec::new();
1967 
1968         let bat_control = if let Some(battery_type) = battery.0 {
1969             match battery_type {
1970                 #[cfg(any(target_os = "android", target_os = "linux"))]
1971                 BatteryType::Goldfish => {
1972                     let irq_num = resources.allocate_irq().ok_or(Error::CreateBatDevices(
1973                         arch::DeviceRegistrationError::AllocateIrq,
1974                     ))?;
1975                     let (control_tube, _mmio_base) = arch::sys::linux::add_goldfish_battery(
1976                         &mut amls,
1977                         battery.1,
1978                         mmio_bus,
1979                         irq_chip,
1980                         irq_num,
1981                         resources,
1982                         #[cfg(feature = "swap")]
1983                         swap_controller,
1984                     )
1985                     .map_err(Error::CreateBatDevices)?;
1986                     Some(BatControl {
1987                         type_: BatteryType::Goldfish,
1988                         control_tube,
1989                     })
1990                 }
1991                 #[cfg(windows)]
1992                 _ => None,
1993             }
1994         } else {
1995             None
1996         };
1997 
1998         let pm_alloc = resources.get_anon_alloc();
1999         let pm_iobase = match resources.io_allocator() {
2000             Some(io) => io
2001                 .allocate_with_align(
2002                     devices::acpi::ACPIPM_RESOURCE_LEN as u64,
2003                     pm_alloc,
2004                     "ACPIPM".to_string(),
2005                     4, // must be 32-bit aligned
2006                 )
2007                 .map_err(Error::AllocateIOResouce)?,
2008             None => 0x600,
2009         };
2010 
2011         let pcie_vcfg = aml::Name::new(
2012             "VCFG".into(),
2013             &Self::get_pcie_vcfg_mmio_range(mem, &arch_memory_layout.pcie_cfg_mmio).start,
2014         );
2015         pcie_vcfg.to_aml_bytes(&mut amls);
2016 
2017         let pm_sci_evt = devices::IrqLevelEvent::new().map_err(Error::CreateEvent)?;
2018 
2019         #[cfg(any(target_os = "android", target_os = "linux"))]
2020         let acdc = if ac_adapter {
2021             // Allocate GPE for AC adapter notfication
2022             let gpe = resources.allocate_gpe().ok_or(Error::AllocateGpe)?;
2023 
2024             let alloc = resources.get_anon_alloc();
2025             let mmio_base = resources
2026                 .allocate_mmio(
2027                     devices::ac_adapter::ACDC_VIRT_MMIO_SIZE,
2028                     alloc,
2029                     "AcAdapter".to_string(),
2030                     resources::AllocOptions::new().align(devices::ac_adapter::ACDC_VIRT_MMIO_SIZE),
2031                 )
2032                 .unwrap();
2033             let ac_adapter_dev = devices::ac_adapter::AcAdapter::new(mmio_base, gpe);
2034             let ac_dev = Arc::new(Mutex::new(ac_adapter_dev));
2035             mmio_bus
2036                 .insert(
2037                     ac_dev.clone(),
2038                     mmio_base,
2039                     devices::ac_adapter::ACDC_VIRT_MMIO_SIZE,
2040                 )
2041                 .unwrap();
2042 
2043             ac_dev.lock().to_aml_bytes(&mut amls);
2044             Some(ac_dev)
2045         } else {
2046             None
2047         };
2048         #[cfg(windows)]
2049         let acdc = None;
2050 
2051         //Virtual PMC
2052         if let Some(guest_suspended_cvar) = guest_suspended_cvar {
2053             let alloc = resources.get_anon_alloc();
2054             let mmio_base = resources
2055                 .allocate_mmio(
2056                     devices::pmc_virt::VPMC_VIRT_MMIO_SIZE,
2057                     alloc,
2058                     "VirtualPmc".to_string(),
2059                     resources::AllocOptions::new().align(devices::pmc_virt::VPMC_VIRT_MMIO_SIZE),
2060                 )
2061                 .unwrap();
2062 
2063             let pmc_virtio_mmio =
2064                 Arc::new(Mutex::new(VirtualPmc::new(mmio_base, guest_suspended_cvar)));
2065             mmio_bus
2066                 .insert(
2067                     pmc_virtio_mmio.clone(),
2068                     mmio_base,
2069                     devices::pmc_virt::VPMC_VIRT_MMIO_SIZE,
2070                 )
2071                 .unwrap();
2072             pmc_virtio_mmio.lock().to_aml_bytes(&mut amls);
2073         }
2074 
2075         let mut pmresource = devices::ACPIPMResource::new(
2076             pm_sci_evt.try_clone().map_err(Error::CloneEvent)?,
2077             suspend_tube,
2078             vm_evt_wrtube,
2079             acdc,
2080         );
2081         pmresource.to_aml_bytes(&mut amls);
2082         irq_chip
2083             .register_level_irq_event(
2084                 sci_irq,
2085                 &pm_sci_evt,
2086                 IrqEventSource::from_device(&pmresource),
2087             )
2088             .map_err(Error::RegisterIrqfd)?;
2089         pmresource.start();
2090 
2091         let mut crs_entries: Vec<Box<dyn Aml>> = vec![
2092             Box::new(aml::AddressSpace::new_bus_number(0x0u16, max_bus as u16)),
2093             Box::new(aml::IO::new(0xcf8, 0xcf8, 1, 0x8)),
2094         ];
2095         for r in resources.mmio_pools() {
2096             let entry: Box<dyn Aml> = match (u32::try_from(r.start), u32::try_from(r.end)) {
2097                 (Ok(start), Ok(end)) => Box::new(aml::AddressSpace::new_memory(
2098                     aml::AddressSpaceCachable::NotCacheable,
2099                     true,
2100                     start,
2101                     end,
2102                 )),
2103                 _ => Box::new(aml::AddressSpace::new_memory(
2104                     aml::AddressSpaceCachable::NotCacheable,
2105                     true,
2106                     r.start,
2107                     r.end,
2108                 )),
2109             };
2110             crs_entries.push(entry);
2111         }
2112 
2113         let prt_entries: Vec<aml::Package> = pci_irqs
2114             .iter()
2115             .map(|(pci_address, gsi, pci_intr_pin)| {
2116                 aml::Package::new(vec![
2117                     &pci_address.acpi_adr(),
2118                     &pci_intr_pin.to_mask(),
2119                     &aml::ZERO,
2120                     gsi,
2121                 ])
2122             })
2123             .collect();
2124 
2125         aml::Device::new(
2126             "_SB_.PC00".into(),
2127             vec![
2128                 &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A08")),
2129                 &aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A03")),
2130                 &aml::Name::new("_ADR".into(), &aml::ZERO),
2131                 &aml::Name::new("_SEG".into(), &aml::ZERO),
2132                 &aml::Name::new("_UID".into(), &aml::ZERO),
2133                 &aml::Name::new("SUPP".into(), &aml::ZERO),
2134                 &aml::Name::new(
2135                     "_CRS".into(),
2136                     &aml::ResourceTemplate::new(crs_entries.iter().map(|b| b.as_ref()).collect()),
2137                 ),
2138                 &PciRootOSC {},
2139                 &aml::Name::new(
2140                     "_PRT".into(),
2141                     &aml::Package::new(prt_entries.iter().map(|p| p as &dyn Aml).collect()),
2142                 ),
2143             ],
2144         )
2145         .to_aml_bytes(&mut amls);
2146 
2147         if let (Some(start), Some(len)) = (
2148             u32::try_from(arch_memory_layout.pcie_cfg_mmio.start).ok(),
2149             arch_memory_layout
2150                 .pcie_cfg_mmio
2151                 .len()
2152                 .and_then(|l| u32::try_from(l).ok()),
2153         ) {
2154             aml::Device::new(
2155                 "_SB_.MB00".into(),
2156                 vec![
2157                     &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0C02")),
2158                     &aml::Name::new(
2159                         "_CRS".into(),
2160                         &aml::ResourceTemplate::new(vec![&aml::Memory32Fixed::new(
2161                             true, start, len,
2162                         )]),
2163                     ),
2164                 ],
2165             )
2166             .to_aml_bytes(&mut amls);
2167         } else {
2168             warn!("Failed to create ACPI MMCFG region reservation");
2169         }
2170 
2171         let root_bus = pci_root.lock().get_root_bus();
2172         let addresses = root_bus.lock().get_downstream_devices();
2173         for address in addresses {
2174             if let Some(acpi_path) = pci_root.lock().acpi_path(&address) {
2175                 const DEEPEST_SLEEP_STATE: u32 = 3;
2176                 aml::Device::new(
2177                     (*acpi_path).into(),
2178                     vec![
2179                         &aml::Name::new("_ADR".into(), &address.acpi_adr()),
2180                         &aml::Name::new(
2181                             "_PRW".into(),
2182                             &aml::Package::new(vec![&PM_WAKEUP_GPIO, &DEEPEST_SLEEP_STATE]),
2183                         ),
2184                     ],
2185                 )
2186                 .to_aml_bytes(&mut amls);
2187             }
2188         }
2189 
2190         let pm = Arc::new(Mutex::new(pmresource));
2191         io_bus
2192             .insert(
2193                 pm.clone(),
2194                 pm_iobase,
2195                 devices::acpi::ACPIPM_RESOURCE_LEN as u64,
2196             )
2197             .unwrap();
2198         resume_notify_devices.push(pm.clone());
2199 
2200         Ok((
2201             acpi::AcpiDevResource {
2202                 amls,
2203                 pm_iobase,
2204                 pm,
2205                 sdts,
2206             },
2207             bat_control,
2208         ))
2209     }
2210 
2211     /// Sets up the serial devices for this platform. Returns a list of configured serial devices.
2212     ///
2213     /// # Arguments
2214     ///
2215     /// * - `irq_chip` the IrqChip object for registering irq events
2216     /// * - `io_bus` the I/O bus to add the devices to
2217     /// * - `serial_parameters` - definitions for how the serial devices should be configured
setup_serial_devices( protection_type: ProtectionType, irq_chip: &mut dyn IrqChip, io_bus: &Bus, serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>, serial_jail: Option<Minijail>, #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>, ) -> Result<Vec<SerialDeviceInfo>>2218     pub fn setup_serial_devices(
2219         protection_type: ProtectionType,
2220         irq_chip: &mut dyn IrqChip,
2221         io_bus: &Bus,
2222         serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>,
2223         serial_jail: Option<Minijail>,
2224         #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
2225     ) -> Result<Vec<SerialDeviceInfo>> {
2226         let com_evt_1_3 = devices::IrqEdgeEvent::new().map_err(Error::CreateEvent)?;
2227         let com_evt_2_4 = devices::IrqEdgeEvent::new().map_err(Error::CreateEvent)?;
2228 
2229         let serial_devices = arch::add_serial_devices(
2230             protection_type,
2231             io_bus,
2232             (X86_64_SERIAL_1_3_IRQ, com_evt_1_3.get_trigger()),
2233             (X86_64_SERIAL_2_4_IRQ, com_evt_2_4.get_trigger()),
2234             serial_parameters,
2235             serial_jail,
2236             #[cfg(feature = "swap")]
2237             swap_controller,
2238         )
2239         .map_err(Error::CreateSerialDevices)?;
2240 
2241         let source = IrqEventSource {
2242             device_id: Serial::device_id(),
2243             queue_id: 0,
2244             device_name: Serial::debug_label(),
2245         };
2246         irq_chip
2247             .register_edge_irq_event(X86_64_SERIAL_1_3_IRQ, &com_evt_1_3, source.clone())
2248             .map_err(Error::RegisterIrqfd)?;
2249         irq_chip
2250             .register_edge_irq_event(X86_64_SERIAL_2_4_IRQ, &com_evt_2_4, source)
2251             .map_err(Error::RegisterIrqfd)?;
2252 
2253         Ok(serial_devices)
2254     }
2255 
setup_debugcon_devices( protection_type: ProtectionType, io_bus: &Bus, serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>, debugcon_jail: Option<Minijail>, #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>, ) -> Result<()>2256     fn setup_debugcon_devices(
2257         protection_type: ProtectionType,
2258         io_bus: &Bus,
2259         serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>,
2260         debugcon_jail: Option<Minijail>,
2261         #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
2262     ) -> Result<()> {
2263         for param in serial_parameters.values() {
2264             if param.hardware != SerialHardware::Debugcon {
2265                 continue;
2266             }
2267 
2268             let mut preserved_fds = Vec::new();
2269             let con = param
2270                 .create_serial_device::<Debugcon>(
2271                     protection_type,
2272                     // Debugcon doesn't use the interrupt event
2273                     &Event::new().map_err(Error::CreateEvent)?,
2274                     &mut preserved_fds,
2275                 )
2276                 .map_err(Error::CreateDebugconDevice)?;
2277 
2278             let con: Arc<Mutex<dyn BusDevice>> = match debugcon_jail.as_ref() {
2279                 #[cfg(any(target_os = "android", target_os = "linux"))]
2280                 Some(jail) => {
2281                     let jail_clone = jail.try_clone().map_err(Error::CloneJail)?;
2282                     #[cfg(feature = "seccomp_trace")]
2283                     debug!(
2284                         "seccomp_trace {{\"event\": \"minijail_clone\", \"src_jail_addr\": \"0x{:x}\", \"dst_jail_addr\": \"0x{:x}\"}}",
2285                         read_jail_addr(jail),
2286                         read_jail_addr(&jail_clone)
2287                     );
2288                     Arc::new(Mutex::new(
2289                         ProxyDevice::new(
2290                             con,
2291                             jail_clone,
2292                             preserved_fds,
2293                             #[cfg(feature = "swap")]
2294                             swap_controller,
2295                         )
2296                         .map_err(Error::CreateProxyDevice)?,
2297                     ))
2298                 }
2299                 #[cfg(windows)]
2300                 Some(_) => unreachable!(),
2301                 None => Arc::new(Mutex::new(con)),
2302             };
2303             io_bus
2304                 .insert(con.clone(), param.debugcon_port.into(), 1)
2305                 .map_err(Error::InsertBus)?;
2306         }
2307 
2308         Ok(())
2309     }
2310 }
2311 
2312 #[sorted]
2313 #[derive(Error, Debug)]
2314 pub enum MsrError {
2315     #[error("CPU not support. Only intel CPUs support ITMT.")]
2316     CpuUnSupport,
2317     #[error("msr must be unique: {0}")]
2318     MsrDuplicate(u32),
2319 }
2320 
2321 #[derive(Error, Debug)]
2322 pub enum HybridSupportError {
2323     #[error("Host CPU doesn't support hybrid architecture.")]
2324     UnsupportedHostCpu,
2325 }
2326 
/// Bundles the two CPUID query functions behind function pointers.
///
/// The indirection lets callers pass either the real `__cpuid_count` / `__cpuid` intrinsics or
/// fake functions for tests.
pub struct CpuIdCall {
    /// Two-argument query: `__cpuid_count` or a fake function for test.
    cpuid_count: unsafe fn(u32, u32) -> CpuidResult,
    /// One-argument query: `__cpuid` or a fake function for test.
    cpuid: unsafe fn(u32) -> CpuidResult,
}

impl CpuIdCall {
    /// Builds a `CpuIdCall` from the given function pointers.
    ///
    /// # Arguments
    ///
    /// * `cpuid_count` - function stored as the two-argument query
    /// * `cpuid` - function stored as the one-argument query
    pub fn new(
        cpuid_count: unsafe fn(u32, u32) -> CpuidResult,
        cpuid: unsafe fn(u32) -> CpuidResult,
    ) -> CpuIdCall {
        Self { cpuid, cpuid_count }
    }
}
2343 
2344 /// Check if host supports hybrid CPU feature. The check include:
2345 ///     1. Check if CPUID.1AH exists. CPUID.1AH is hybrid information enumeration leaf.
2346 ///     2. Check if CPUID.07H.00H:EDX[bit 15] sets. This bit means the processor is identified as a
2347 ///        hybrid part.
2348 ///     3. Check if CPUID.1AH:EAX sets. The hybrid core type is set in EAX.
2349 ///
2350 /// # Arguments
2351 ///
2352 /// * - `cpuid` the wrapped cpuid functions used to get CPUID info.
check_host_hybrid_support(cpuid: &CpuIdCall) -> std::result::Result<(), HybridSupportError>2353 pub fn check_host_hybrid_support(cpuid: &CpuIdCall) -> std::result::Result<(), HybridSupportError> {
2354     // CPUID.0H.EAX returns maximum input value for basic CPUID information.
2355     //
2356     // SAFETY:
2357     // Safe because we pass 0 for this call and the host supports the
2358     // `cpuid` instruction.
2359     let mut cpuid_entry = unsafe { (cpuid.cpuid)(0x0) };
2360     if cpuid_entry.eax < 0x1A {
2361         return Err(HybridSupportError::UnsupportedHostCpu);
2362     }
2363     // SAFETY:
2364     // Safe because we pass 0x7 and 0 for this call and the host supports the
2365     // `cpuid` instruction.
2366     cpuid_entry = unsafe { (cpuid.cpuid_count)(0x7, 0) };
2367     if cpuid_entry.edx & 1 << EDX_HYBRID_CPU_SHIFT == 0 {
2368         return Err(HybridSupportError::UnsupportedHostCpu);
2369     }
2370     // From SDM, if a value entered for CPUID.EAX is less than or equal to the
2371     // maximum input value and the leaf is not supported on that processor then
2372     // 0 is returned in all the registers.
2373     // For the CPU with hybrid support, its CPUID.1AH.EAX shouldn't be zero.
2374     //
2375     // SAFETY:
2376     // Safe because we pass 0 for this call and the host supports the
2377     // `cpuid` instruction.
2378     cpuid_entry = unsafe { (cpuid.cpuid)(0x1A) };
2379     if cpuid_entry.eax == 0 {
2380         return Err(HybridSupportError::UnsupportedHostCpu);
2381     }
2382     Ok(())
2383 }
2384 
#[cfg(test)]
mod tests {
    use std::mem::size_of;

    use super::*;

    /// Equal to the `mem` (PCI MMIO) start address that `setup()` configures, so
    /// `TEST_MEMORY_SIZE - START_OF_RAM_32BITS` is the amount of RAM that ends exactly there.
    const TEST_MEMORY_SIZE: u64 = 2 * GB;

    /// Builds the memory layout shared by the tests: a 256 MB ECAM region at 3 GB and a PCI
    /// MMIO region starting at 2 GB with no explicit size.
    fn setup() -> ArchMemoryLayout {
        let pci_config = PciConfig {
            ecam: Some(MemoryRegionConfig {
                start: 3 * GB,
                size: Some(256 * MB),
            }),
            mem: Some(MemoryRegionConfig {
                start: 2 * GB,
                size: None,
            }),
        };
        create_arch_memory_layout(&pci_config, false).unwrap()
    }

    #[test]
    fn regions_lt_4gb_nobios() {
        let arch_memory_layout = setup();
        let regions = arch_memory_regions(
            &arch_memory_layout,
            512 * MB,
            /* bios_size */ None,
            /* has_protected_vm_firmware */ false,
        );
        assert_eq!(1, regions.len());
        assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
        assert_eq!(1u64 << 29, regions[0].1);
    }

    #[test]
    fn regions_gt_4gb_nobios() {
        let arch_memory_layout = setup();
        let size = 4 * GB + 0x8000;
        let regions = arch_memory_regions(
            &arch_memory_layout,
            size,
            /* bios_size */ None,
            /* has_protected_vm_firmware */ false,
        );
        assert_eq!(2, regions.len());
        assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
        assert_eq!(GuestAddress(4 * GB), regions[1].0);
        // The two regions together must account for all of the requested memory.
        assert_eq!(size, regions[0].1 + regions[1].1);
    }

    #[test]
    fn regions_lt_4gb_bios() {
        let arch_memory_layout = setup();
        let bios_len = MB;
        let regions = arch_memory_regions(
            &arch_memory_layout,
            512 * MB,
            Some(bios_len),
            /* has_protected_vm_firmware */ false,
        );
        assert_eq!(2, regions.len());
        assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
        assert_eq!(512 * MB, regions[0].1);
        // The BIOS region ends exactly at the 4 GB boundary.
        assert_eq!(
            GuestAddress(FIRST_ADDR_PAST_32BITS - bios_len),
            regions[1].0
        );
        assert_eq!(bios_len, regions[1].1);
    }

    #[test]
    fn regions_gt_4gb_bios() {
        let arch_memory_layout = setup();
        let bios_len = MB;
        let regions = arch_memory_regions(
            &arch_memory_layout,
            4 * GB + 0x8000,
            Some(bios_len),
            /* has_protected_vm_firmware */ false,
        );
        assert_eq!(3, regions.len());
        assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
        assert_eq!(
            GuestAddress(FIRST_ADDR_PAST_32BITS - bios_len),
            regions[1].0
        );
        assert_eq!(bios_len, regions[1].1);
        assert_eq!(GuestAddress(4 * GB), regions[2].0);
    }

    #[test]
    fn regions_eq_4gb_nobios() {
        let arch_memory_layout = setup();
        // Request exactly the amount of RAM that spans from START_OF_RAM_32BITS up to
        // TEST_MEMORY_SIZE, where the test layout's PCI MMIO region begins.
        let regions = arch_memory_regions(
            &arch_memory_layout,
            TEST_MEMORY_SIZE - START_OF_RAM_32BITS,
            /* bios_size */ None,
            /* has_protected_vm_firmware */ false,
        );
        assert_eq!(1, regions.len());
        assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
        assert_eq!(TEST_MEMORY_SIZE - START_OF_RAM_32BITS, regions[0].1);
    }

    #[test]
    fn regions_eq_4gb_bios() {
        let arch_memory_layout = setup();
        // Request exactly the amount of RAM that spans from START_OF_RAM_32BITS up to
        // TEST_MEMORY_SIZE, where the test layout's PCI MMIO region begins.
        let bios_len = MB;
        let regions = arch_memory_regions(
            &arch_memory_layout,
            TEST_MEMORY_SIZE - START_OF_RAM_32BITS,
            Some(bios_len),
            /* has_protected_vm_firmware */ false,
        );
        assert_eq!(2, regions.len());
        assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
        assert_eq!(TEST_MEMORY_SIZE - START_OF_RAM_32BITS, regions[0].1);
        assert_eq!(
            GuestAddress(FIRST_ADDR_PAST_32BITS - bios_len),
            regions[1].0
        );
        assert_eq!(bios_len, regions[1].1);
    }

    #[test]
    fn check_pci_mmio_layout() {
        let arch_memory_layout = setup();

        assert_eq!(arch_memory_layout.pci_mmio_before_32bit.start, 2 * GB);
        assert_eq!(arch_memory_layout.pcie_cfg_mmio.start, 3 * GB);
        assert_eq!(arch_memory_layout.pcie_cfg_mmio.len().unwrap(), 256 * MB);
    }

    #[test]
    fn check_32bit_gap_size_alignment() {
        let arch_memory_layout = setup();
        // pci_mmio_before_32bit is 256 MB aligned to be friendly for MTRR mappings.
        assert_eq!(
            arch_memory_layout.pci_mmio_before_32bit.start % (256 * MB),
            0
        );
    }

    #[test]
    fn write_setup_data_empty() {
        let mem = GuestMemory::new(&[(GuestAddress(0), 0x2_0000)]).unwrap();
        let setup_data = [];
        let setup_data_addr = write_setup_data(
            &mem,
            GuestAddress(0x1000),
            GuestAddress(0x2000),
            &setup_data,
        )
        .expect("write_setup_data");
        // With nothing to write there is no list head to report.
        assert_eq!(setup_data_addr, None);
    }

    #[test]
    fn write_setup_data_two_of_them() {
        let mem = GuestMemory::new(&[(GuestAddress(0), 0x2_0000)]).unwrap();

        // Within each entry this test reads the `next` pointer (u64) at offset 0, the length
        // (u32) at offset 12 and the payload at offset 16.
        let entry1_addr = GuestAddress(0x1000);
        let entry1_next_addr = entry1_addr;
        let entry1_len_addr = entry1_addr.checked_add(12).unwrap();
        let entry1_data_addr = entry1_addr.checked_add(16).unwrap();
        let entry1_data = [0x55u8; 13];
        let entry1_size = (size_of::<setup_data_hdr>() + entry1_data.len()) as u64;
        // Padding expected between the end of entry 1 and the start of entry 2.
        let entry1_align = 3;

        let entry2_addr = GuestAddress(entry1_addr.offset() + entry1_size + entry1_align);
        let entry2_next_addr = entry2_addr;
        let entry2_len_addr = entry2_addr.checked_add(12).unwrap();
        let entry2_data_addr = entry2_addr.checked_add(16).unwrap();
        let entry2_data = [0xAAu8; 9];

        let setup_data = [
            SetupData {
                data: entry1_data.to_vec(),
                type_: SetupDataType::Dtb,
            },
            SetupData {
                data: entry2_data.to_vec(),
                type_: SetupDataType::Dtb,
            },
        ];

        let setup_data_head_addr = write_setup_data(
            &mem,
            GuestAddress(0x1000),
            GuestAddress(0x2000),
            &setup_data,
        )
        .expect("write_setup_data");
        assert_eq!(setup_data_head_addr, Some(entry1_addr));

        // Entry 1 links to entry 2 and carries its own length and payload.
        assert_eq!(
            mem.read_obj_from_addr::<u64>(entry1_next_addr).unwrap(),
            entry2_addr.offset()
        );
        assert_eq!(
            mem.read_obj_from_addr::<u32>(entry1_len_addr).unwrap(),
            entry1_data.len() as u32
        );
        assert_eq!(
            mem.read_obj_from_addr::<[u8; 13]>(entry1_data_addr)
                .unwrap(),
            entry1_data
        );

        // Entry 2 is the last one, so its `next` pointer is zero.
        assert_eq!(mem.read_obj_from_addr::<u64>(entry2_next_addr).unwrap(), 0);
        assert_eq!(
            mem.read_obj_from_addr::<u32>(entry2_len_addr).unwrap(),
            entry2_data.len() as u32
        );
        assert_eq!(
            mem.read_obj_from_addr::<[u8; 9]>(entry2_data_addr).unwrap(),
            entry2_data
        );
    }

    #[test]
    fn cmdline_overflow() {
        const MEM_SIZE: u64 = 0x1000;
        let gm = GuestMemory::new(&[(GuestAddress(0x0), MEM_SIZE)]).unwrap();
        let mut cmdline = kernel_cmdline::Cmdline::new();
        cmdline.insert_str("12345").unwrap();
        // Only five bytes remain before the end of guest memory, which this test expects to be
        // too little for the five-character command line.
        let cmdline_address = GuestAddress(MEM_SIZE - 5);
        let err =
            X8664arch::load_cmdline(&gm, cmdline_address, cmdline, CMDLINE_MAX_SIZE as usize - 1)
                .unwrap_err();
        assert!(matches!(err, Error::CommandLineOverflow));
    }

    #[test]
    fn cmdline_write_end() {
        const MEM_SIZE: u64 = 0x1000;
        let gm = GuestMemory::new(&[(GuestAddress(0x0), MEM_SIZE)]).unwrap();
        let mut cmdline = kernel_cmdline::Cmdline::new();
        cmdline.insert_str("1234").unwrap();
        let cmdline_address = GuestAddress(45);
        X8664arch::load_cmdline(&gm, cmdline_address, cmdline, CMDLINE_MAX_SIZE as usize - 1)
            .unwrap();
        // The command line must be present byte-for-byte, followed by a NUL terminator.
        for (offset, expected) in b"1234\0".iter().enumerate() {
            let byte_addr = cmdline_address.unchecked_add(offset as u64);
            let val: u8 = gm.read_obj_from_addr(byte_addr).unwrap();
            assert_eq!(val, *expected);
        }
    }
}
2648