1 // Copyright 2017 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 //! x86 architecture support.
6
7 #![cfg(target_arch = "x86_64")]
8
9 mod fdt;
10
11 #[cfg(feature = "gdb")]
12 mod gdb;
13
// setup_data type tags from the Linux x86 boot protocol (SETUP_DTB carries a
// device tree blob; SETUP_RNG_SEED carries entropy for the guest kernel RNG).
const SETUP_DTB: u32 = 2;
const SETUP_RNG_SEED: u32 = 9;
16
17 #[allow(dead_code)]
18 #[allow(non_upper_case_globals)]
19 #[allow(non_camel_case_types)]
20 #[allow(non_snake_case)]
21 pub mod bootparam;
22
23 #[allow(dead_code)]
24 #[allow(non_upper_case_globals)]
25 mod msr_index;
26
27 #[allow(dead_code)]
28 #[allow(non_upper_case_globals)]
29 #[allow(non_camel_case_types)]
30 #[allow(clippy::all)]
31 mod mpspec;
32
33 pub mod acpi;
34 mod bzimage;
35 pub mod cpuid;
36 mod gdt;
37 pub mod interrupts;
38 pub mod mptable;
39 pub mod regs;
40 pub mod smbios;
41
42 use std::arch::x86_64::CpuidResult;
43 use std::collections::BTreeMap;
44 use std::fmt;
45 use std::fs::File;
46 use std::io;
47 use std::io::Write;
48 use std::mem;
49 use std::path::PathBuf;
50 use std::sync::mpsc;
51 use std::sync::Arc;
52
53 use acpi_tables::aml;
54 use acpi_tables::aml::Aml;
55 use acpi_tables::sdt::SDT;
56 use anyhow::Context;
57 use arch::get_serial_cmdline;
58 use arch::serial::SerialDeviceInfo;
59 use arch::CpuSet;
60 use arch::DtbOverlay;
61 use arch::FdtPosition;
62 use arch::GetSerialCmdlineError;
63 use arch::MemoryRegionConfig;
64 use arch::PciConfig;
65 use arch::RunnableLinuxVm;
66 use arch::VmComponents;
67 use arch::VmImage;
68 use base::debug;
69 use base::info;
70 use base::warn;
71 #[cfg(any(target_os = "android", target_os = "linux"))]
72 use base::AsRawDescriptors;
73 use base::Event;
74 use base::FileGetLen;
75 use base::FileReadWriteAtVolatile;
76 use base::SendTube;
77 use base::Tube;
78 use base::TubeError;
79 use chrono::Utc;
80 pub use cpuid::adjust_cpuid;
81 pub use cpuid::CpuIdContext;
82 use devices::acpi::PM_WAKEUP_GPIO;
83 use devices::Bus;
84 use devices::BusDevice;
85 use devices::BusDeviceObj;
86 use devices::BusResumeDevice;
87 use devices::BusType;
88 use devices::Debugcon;
89 use devices::FwCfgParameters;
90 use devices::IrqChip;
91 use devices::IrqChipX86_64;
92 use devices::IrqEventSource;
93 use devices::PciAddress;
94 use devices::PciConfigIo;
95 use devices::PciConfigMmio;
96 use devices::PciDevice;
97 use devices::PciInterruptPin;
98 use devices::PciRoot;
99 use devices::PciRootCommand;
100 use devices::PciVirtualConfigMmio;
101 use devices::Pflash;
102 #[cfg(any(target_os = "android", target_os = "linux"))]
103 use devices::ProxyDevice;
104 use devices::Serial;
105 use devices::SerialHardware;
106 use devices::SerialParameters;
107 use devices::VirtualPmc;
108 use devices::FW_CFG_BASE_PORT;
109 use devices::FW_CFG_MAX_FILE_SLOTS;
110 use devices::FW_CFG_WIDTH;
111 use hypervisor::CpuConfigX86_64;
112 use hypervisor::Hypervisor;
113 use hypervisor::HypervisorX86_64;
114 use hypervisor::ProtectionType;
115 use hypervisor::VcpuInitX86_64;
116 use hypervisor::VcpuX86_64;
117 use hypervisor::Vm;
118 use hypervisor::VmCap;
119 use hypervisor::VmX86_64;
120 #[cfg(feature = "seccomp_trace")]
121 use jail::read_jail_addr;
122 #[cfg(windows)]
123 use jail::FakeMinijailStub as Minijail;
124 #[cfg(any(target_os = "android", target_os = "linux"))]
125 use minijail::Minijail;
126 use rand::rngs::OsRng;
127 use rand::RngCore;
128 use remain::sorted;
129 use resources::AddressRange;
130 use resources::SystemAllocator;
131 use resources::SystemAllocatorConfig;
132 use sync::Condvar;
133 use sync::Mutex;
134 use thiserror::Error;
135 use vm_control::BatControl;
136 use vm_control::BatteryType;
137 use vm_memory::GuestAddress;
138 use vm_memory::GuestMemory;
139 use vm_memory::GuestMemoryError;
140 use vm_memory::MemoryRegionOptions;
141 use vm_memory::MemoryRegionPurpose;
142 use zerocopy::AsBytes;
143 use zerocopy::FromBytes;
144 use zerocopy::FromZeroes;
145
146 use crate::bootparam::boot_params;
147 use crate::bootparam::setup_header;
148 use crate::bootparam::XLF_CAN_BE_LOADED_ABOVE_4G;
149 use crate::cpuid::EDX_HYBRID_CPU_SHIFT;
150
151 #[sorted]
152 #[derive(Error, Debug)]
153 pub enum Error {
154 #[error("error allocating a single gpe")]
155 AllocateGpe,
156 #[error("error allocating IO resource: {0}")]
157 AllocateIOResouce(resources::Error),
158 #[error("error allocating a single irq")]
159 AllocateIrq,
160 #[error("unable to clone an Event: {0}")]
161 CloneEvent(base::Error),
162 #[error("failed to clone IRQ chip: {0}")]
163 CloneIrqChip(base::Error),
164 #[cfg(any(target_os = "android", target_os = "linux"))]
165 #[error("failed to clone jail: {0}")]
166 CloneJail(minijail::Error),
167 #[error("unable to clone a Tube: {0}")]
168 CloneTube(TubeError),
169 #[error("the given kernel command line was invalid: {0}")]
170 Cmdline(kernel_cmdline::Error),
171 #[error("failed writing command line to guest memory")]
172 CommandLineCopy,
173 #[error("command line overflowed guest memory")]
174 CommandLineOverflow,
175 #[error("failed to configure hotplugged pci device: {0}")]
176 ConfigurePciDevice(arch::DeviceRegistrationError),
177 #[error("bad PCI ECAM configuration: {0}")]
178 ConfigurePciEcam(String),
179 #[error("bad PCI mem configuration: {0}")]
180 ConfigurePciMem(String),
181 #[error("failed to configure segment registers: {0}")]
182 ConfigureSegments(regs::Error),
183 #[error("error configuring the system")]
184 ConfigureSystem,
185 #[error("unable to create ACPI tables")]
186 CreateAcpi,
187 #[error("unable to create battery devices: {0}")]
188 CreateBatDevices(arch::DeviceRegistrationError),
189 #[error("could not create debugcon device: {0}")]
190 CreateDebugconDevice(devices::SerialError),
191 #[error("unable to make an Event: {0}")]
192 CreateEvent(base::Error),
193 #[error("failed to create fdt: {0}")]
194 CreateFdt(cros_fdt::Error),
195 #[error("failed to create fw_cfg device: {0}")]
196 CreateFwCfgDevice(devices::FwCfgError),
197 #[error("failed to create IOAPIC device: {0}")]
198 CreateIoapicDevice(base::Error),
199 #[error("failed to create a PCI root hub: {0}")]
200 CreatePciRoot(arch::DeviceRegistrationError),
201 #[error("unable to create PIT: {0}")]
202 CreatePit(base::Error),
203 #[error("unable to make PIT device: {0}")]
204 CreatePitDevice(devices::PitError),
205 #[cfg(any(target_os = "android", target_os = "linux"))]
206 #[error("unable to create proxy device: {0}")]
207 CreateProxyDevice(devices::ProxyError),
208 #[error("unable to create serial devices: {0}")]
209 CreateSerialDevices(arch::DeviceRegistrationError),
210 #[error("failed to create socket: {0}")]
211 CreateSocket(io::Error),
212 #[error("failed to create tube: {0}")]
213 CreateTube(base::TubeError),
214 #[error("failed to create VCPU: {0}")]
215 CreateVcpu(base::Error),
216 #[error("failed to create Virtio MMIO bus: {0}")]
217 CreateVirtioMmioBus(arch::DeviceRegistrationError),
218 #[error("invalid e820 setup params")]
219 E820Configuration,
220 #[error("failed to enable singlestep execution: {0}")]
221 EnableSinglestep(base::Error),
222 #[error("failed to enable split irqchip: {0}")]
223 EnableSplitIrqchip(base::Error),
224 #[error("failed to get serial cmdline: {0}")]
225 GetSerialCmdline(GetSerialCmdlineError),
226 #[error("failed to insert device onto bus: {0}")]
227 InsertBus(devices::BusError),
228 #[error("the kernel extends past the end of RAM")]
229 InvalidCpuConfig,
230 #[error("invalid CPU config parameters")]
231 KernelOffsetPastEnd,
232 #[error("error loading bios: {0}")]
233 LoadBios(io::Error),
234 #[error("error loading kernel bzImage: {0}")]
235 LoadBzImage(bzimage::Error),
236 #[error("error loading custom pVM firmware: {0}")]
237 LoadCustomPvmFw(arch::LoadImageError),
238 #[error("error loading initrd: {0}")]
239 LoadInitrd(arch::LoadImageError),
240 #[error("error loading Kernel: {0}")]
241 LoadKernel(kernel_loader::Error),
242 #[error("error loading pflash: {0}")]
243 LoadPflash(io::Error),
244 #[error("error translating address: Page not present")]
245 PageNotPresent,
246 #[error("pci mmio overlaps with pVM firmware memory")]
247 PciMmioOverlapPvmFw,
248 #[error("pVM firmware not supported when bios is used on x86_64")]
249 PvmFwBiosUnsupported,
250 #[error("error reading guest memory {0}")]
251 ReadingGuestMemory(vm_memory::GuestMemoryError),
252 #[error("single register read not supported on x86_64")]
253 ReadRegIsUnsupported,
254 #[error("error reading CPU registers {0}")]
255 ReadRegs(base::Error),
256 #[error("error registering an IrqFd: {0}")]
257 RegisterIrqfd(base::Error),
258 #[error("error registering virtual socket device: {0}")]
259 RegisterVsock(arch::DeviceRegistrationError),
260 #[error("error reserved pcie config mmio")]
261 ReservePcieCfgMmio(resources::Error),
262 #[error("failed to set a hardware breakpoint: {0}")]
263 SetHwBreakpoint(base::Error),
264 #[error("failed to set identity map addr: {0}")]
265 SetIdentityMapAddr(base::Error),
266 #[error("failed to set interrupts: {0}")]
267 SetLint(interrupts::Error),
268 #[error("failed to set tss addr: {0}")]
269 SetTssAddr(base::Error),
270 #[error("failed to set up cmos: {0}")]
271 SetupCmos(anyhow::Error),
272 #[error("failed to set up cpuid: {0}")]
273 SetupCpuid(cpuid::Error),
274 #[error("setup data too large")]
275 SetupDataTooLarge,
276 #[error("failed to set up FPU: {0}")]
277 SetupFpu(base::Error),
278 #[error("failed to set up guest memory: {0}")]
279 SetupGuestMemory(GuestMemoryError),
280 #[error("failed to set up mptable: {0}")]
281 SetupMptable(mptable::Error),
282 #[error("failed to set up MSRs: {0}")]
283 SetupMsrs(base::Error),
284 #[error("failed to set up page tables: {0}")]
285 SetupPageTables(regs::Error),
286 #[error("failed to set up pflash: {0}")]
287 SetupPflash(anyhow::Error),
288 #[error("failed to set up registers: {0}")]
289 SetupRegs(regs::Error),
290 #[error("failed to set up SMBIOS: {0}")]
291 SetupSmbios(smbios::Error),
292 #[error("failed to set up sregs: {0}")]
293 SetupSregs(base::Error),
294 #[error("failed to translate virtual address")]
295 TranslatingVirtAddr,
296 #[error("protected VMs not supported on x86_64")]
297 UnsupportedProtectionType,
298 #[error("single register write not supported on x86_64")]
299 WriteRegIsUnsupported,
300 #[error("error writing CPU registers {0}")]
301 WriteRegs(base::Error),
302 #[error("error writing guest memory {0}")]
303 WritingGuestMemory(GuestMemoryError),
304 #[error("error writing setup_data: {0}")]
305 WritingSetupData(GuestMemoryError),
306 #[error("the zero page extends past the end of guest_mem")]
307 ZeroPagePastRamEnd,
308 #[error("error writing the zero page of guest memory")]
309 ZeroPageSetup,
310 }
311
/// Specialized `Result` type for x86_64 platform setup operations.
pub type Result<T> = std::result::Result<T, Error>;

/// Marker type carrying the x86_64 implementation of `arch::LinuxArch`.
pub struct X8664arch;
315
// Like `bootparam::setup_data` without the incomplete array field at the end, which allows us to
// safely implement Copy, Clone
#[repr(C)]
#[derive(Copy, Clone, Default, FromZeroes, FromBytes, AsBytes)]
struct setup_data_hdr {
    // Guest-physical address of the next setup_data entry, or 0 at the list tail.
    pub next: u64,
    // Entry type tag (one of the SETUP_* constants, e.g. SETUP_DTB, SETUP_RNG_SEED).
    pub type_: u32,
    // Length in bytes of the payload that immediately follows this header.
    pub len: u32,
}
325
/// Type tag for a `setup_data` entry, using the Linux boot protocol values.
#[repr(u32)]
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum SetupDataType {
    /// Device tree blob.
    Dtb = SETUP_DTB,
    /// Random seed for the guest kernel's RNG.
    RngSeed = SETUP_RNG_SEED,
}
332
/// A single entry to be inserted in the bootparam `setup_data` linked list.
pub struct SetupData {
    /// Raw payload bytes, written into guest memory right after the entry header.
    pub data: Vec<u8>,
    /// Entry type tag, written into the header's `type_` field.
    pub type_: SetupDataType,
}
338
// E820 region types (INT 0x15 AX=0xE820 memory map interface).
#[derive(Copy, Clone, Debug)]
enum E820Type {
    // Usable RAM.
    Ram = 0x01,
    // Reserved region, not usable by the guest OS.
    Reserved = 0x2,
}

// One region of the e820 memory map passed to the guest via `boot_params`.
#[derive(Copy, Clone, Debug)]
struct E820Entry {
    // Guest-physical start address of the region.
    pub address: GuestAddress,
    // Region length in bytes.
    pub len: u64,
    // Region type (RAM or reserved).
    pub mem_type: E820Type,
}
351
const MB: u64 = 1 << 20;
const GB: u64 = 1 << 30;

pub const BOOT_STACK_POINTER: u64 = 0x8000;
const START_OF_RAM_32BITS: u64 = 0;
const FIRST_ADDR_PAST_20BITS: u64 = 1 << 20;
const FIRST_ADDR_PAST_32BITS: u64 = 1 << 32;
// Keep the 32-bit MMIO gap aligned to 256MB for convenient MTRR configuration.
const MEM_32BIT_GAP_SIZE: u64 = 768 * MB;
const END_ADDR_BEFORE_32BITS: u64 = FIRST_ADDR_PAST_32BITS - MEM_32BIT_GAP_SIZE;
// Reserved memory for nand_bios/LAPIC/IOAPIC/HPET/.....
const RESERVED_MEM_SIZE: u64 = 0x800_0000;
const DEFAULT_PCI_MEM_END: u64 = FIRST_ADDR_PAST_32BITS - RESERVED_MEM_SIZE - 1;
// Reserve 64MB for PCIe enhanced configuration (ECAM).
const DEFAULT_PCIE_CFG_MMIO_SIZE: u64 = 0x400_0000;
const DEFAULT_PCIE_CFG_MMIO_END: u64 = FIRST_ADDR_PAST_32BITS - RESERVED_MEM_SIZE - 1;
const DEFAULT_PCIE_CFG_MMIO_START: u64 = DEFAULT_PCIE_CFG_MMIO_END - DEFAULT_PCIE_CFG_MMIO_SIZE + 1;
// Linux (with 4-level paging) has a physical memory limit of 46 bits (64 TiB).
const HIGH_MMIO_MAX_END: u64 = (1u64 << 46) - 1;
pub const KERNEL_32BIT_ENTRY_OFFSET: u64 = 0x0;
pub const KERNEL_64BIT_ENTRY_OFFSET: u64 = 0x200;
pub const ZERO_PAGE_OFFSET: u64 = 0x7000;
// Set BIOS max size to 16M: this is used only when `unrestricted guest` is disabled
const BIOS_MAX_SIZE: u64 = 0x1000000;

pub const KERNEL_START_OFFSET: u64 = 0x20_0000;
const CMDLINE_OFFSET: u64 = 0x2_0000;
const CMDLINE_MAX_SIZE: u64 = 0x800; // including terminating zero
// setup_data entries live between the end of the command line and the RSDP window.
const SETUP_DATA_START: u64 = CMDLINE_OFFSET + CMDLINE_MAX_SIZE;
const SETUP_DATA_END: u64 = ACPI_HI_RSDP_WINDOW_BASE;
const X86_64_SERIAL_1_3_IRQ: u32 = 4;
const X86_64_SERIAL_2_4_IRQ: u32 = 3;
// X86_64_SCI_IRQ is used to fill the ACPI FACP table.
// The SCI IRQ should be a legacy IRQ number (less than 16); most platforms
// use the fixed IRQ number 9. Reserve IRQ number 5 for SCI here and let the
// other devices allocate starting from the next available number.
pub const X86_64_SCI_IRQ: u32 = 5;
// The CMOS RTC uses IRQ 8; start allocating IRQs at 9.
pub const X86_64_IRQ_BASE: u32 = 9;
const ACPI_HI_RSDP_WINDOW_BASE: u64 = 0x000E_0000;

// pVM firmware memory. Should be within the low 4GB, so that it is identity-mapped
// by setup_page_tables() when a protected VM boots in long mode, since the pVM firmware is
// the VM entry point.
const PROTECTED_VM_FW_MAX_SIZE: u64 = 0x40_0000;
const PROTECTED_VM_FW_START: u64 = END_ADDR_BEFORE_32BITS - PROTECTED_VM_FW_MAX_SIZE;
400
/// CPU vendor, as determined from CPUID.
#[derive(Debug, PartialEq, Eq)]
pub enum CpuManufacturer {
    Intel,
    Amd,
    Unknown,
}

/// Returns the manufacturer of the host CPU (delegates to `cpuid::cpu_manufacturer`).
pub fn get_cpu_manufacturer() -> CpuManufacturer {
    cpuid::cpu_manufacturer()
}
411
/// Address-space layout decisions made once at VM-creation time for x86_64.
pub struct ArchMemoryLayout {
    // the pci mmio range below 4G
    pci_mmio_before_32bit: AddressRange,
    // the pcie cfg mmio range (ECAM)
    pcie_cfg_mmio: AddressRange,
    // the pVM firmware memory (if running a protected VM)
    pvmfw_mem: Option<AddressRange>,
}
420
create_arch_memory_layout( pci_config: &PciConfig, has_protected_vm_firmware: bool, ) -> Result<ArchMemoryLayout>421 pub fn create_arch_memory_layout(
422 pci_config: &PciConfig,
423 has_protected_vm_firmware: bool,
424 ) -> Result<ArchMemoryLayout> {
425 // the max bus number is 256 and each bus occupy 1MB, so the max pcie cfg mmio size = 256M
426 const MAX_PCIE_ECAM_SIZE: u64 = 256 * MB;
427 let pcie_cfg_mmio = match pci_config.ecam {
428 Some(MemoryRegionConfig {
429 start,
430 size: Some(size),
431 }) => AddressRange::from_start_and_size(start, size.min(MAX_PCIE_ECAM_SIZE)).unwrap(),
432 Some(MemoryRegionConfig { start, size: None }) => {
433 AddressRange::from_start_and_end(start, DEFAULT_PCIE_CFG_MMIO_END)
434 }
435 None => {
436 AddressRange::from_start_and_end(DEFAULT_PCIE_CFG_MMIO_START, DEFAULT_PCIE_CFG_MMIO_END)
437 }
438 };
439 if pcie_cfg_mmio.start % pcie_cfg_mmio.len().unwrap() != 0
440 || pcie_cfg_mmio.start % MB != 0
441 || pcie_cfg_mmio.len().unwrap() % MB != 0
442 {
443 return Err(Error::ConfigurePciEcam(
444 "base and len must be aligned to 1MB and base must be a multiple of len".to_string(),
445 ));
446 }
447 if pcie_cfg_mmio.end >= 0x1_0000_0000 {
448 return Err(Error::ConfigurePciEcam(
449 "end address can't go beyond 4G".to_string(),
450 ));
451 }
452
453 let pci_mmio_before_32bit = match pci_config.mem {
454 Some(MemoryRegionConfig {
455 start,
456 size: Some(size),
457 }) => AddressRange::from_start_and_size(start, size)
458 .ok_or(Error::ConfigurePciMem("region overflowed".to_string()))?,
459 Some(MemoryRegionConfig { start, size: None }) => {
460 AddressRange::from_start_and_end(start, DEFAULT_PCI_MEM_END)
461 }
462 None => AddressRange::from_start_and_end(
463 pcie_cfg_mmio
464 .start
465 .min(FIRST_ADDR_PAST_32BITS - MEM_32BIT_GAP_SIZE),
466 DEFAULT_PCI_MEM_END,
467 ),
468 };
469
470 let pvmfw_mem = if has_protected_vm_firmware {
471 let range = AddressRange {
472 start: PROTECTED_VM_FW_START,
473 end: PROTECTED_VM_FW_START + PROTECTED_VM_FW_MAX_SIZE - 1,
474 };
475 if !pci_mmio_before_32bit.intersect(range).is_empty() {
476 return Err(Error::PciMmioOverlapPvmFw);
477 }
478
479 Some(range)
480 } else {
481 None
482 };
483
484 Ok(ArchMemoryLayout {
485 pci_mmio_before_32bit,
486 pcie_cfg_mmio,
487 pvmfw_mem,
488 })
489 }
490
fn max_ram_end_before_32bit(
    arch_memory_layout: &ArchMemoryLayout,
    has_protected_vm_firmware: bool,
) -> u64 {
    // RAM below 4GB must stop where the low PCI MMIO hole begins…
    let pci_start = arch_memory_layout.pci_mmio_before_32bit.start;
    if has_protected_vm_firmware {
        // …or where the pVM firmware region begins, whichever comes first.
        pci_start.min(PROTECTED_VM_FW_START)
    } else {
        pci_start
    }
}
502
503 /// The x86 reset vector for i386+ and x86_64 puts the processor into an "unreal mode" where it
504 /// can access the last 1 MB of the 32-bit address space in 16-bit mode, and starts the instruction
505 /// pointer at the effective physical address 0xFFFF_FFF0.
bios_start(bios_size: u64) -> GuestAddress506 fn bios_start(bios_size: u64) -> GuestAddress {
507 GuestAddress(FIRST_ADDR_PAST_32BITS - bios_size)
508 }
509
identity_map_addr_start() -> GuestAddress510 fn identity_map_addr_start() -> GuestAddress {
511 // Set Identity map address 4 pages before the max BIOS size
512 GuestAddress(FIRST_ADDR_PAST_32BITS - BIOS_MAX_SIZE - 4 * 0x1000)
513 }
514
tss_addr_start() -> GuestAddress515 fn tss_addr_start() -> GuestAddress {
516 // Set TSS address one page after identity map address
517 GuestAddress(identity_map_addr_start().offset() + 0x1000)
518 }
519
tss_addr_end() -> GuestAddress520 fn tss_addr_end() -> GuestAddress {
521 // Set TSS address section to have 3 pages
522 GuestAddress(tss_addr_start().offset() + 0x3000)
523 }
524
configure_system( guest_mem: &GuestMemory, cmdline_addr: GuestAddress, setup_data: Option<GuestAddress>, initrd: Option<(GuestAddress, usize)>, mut params: boot_params, e820_entries: &[E820Entry], ) -> Result<()>525 fn configure_system(
526 guest_mem: &GuestMemory,
527 cmdline_addr: GuestAddress,
528 setup_data: Option<GuestAddress>,
529 initrd: Option<(GuestAddress, usize)>,
530 mut params: boot_params,
531 e820_entries: &[E820Entry],
532 ) -> Result<()> {
533 const KERNEL_BOOT_FLAG_MAGIC: u16 = 0xaa55;
534 const KERNEL_HDR_MAGIC: u32 = 0x5372_6448;
535 const KERNEL_LOADER_OTHER: u8 = 0xff;
536 const KERNEL_MIN_ALIGNMENT_BYTES: u32 = 0x100_0000; // Must be non-zero.
537
538 params.hdr.type_of_loader = KERNEL_LOADER_OTHER;
539 params.hdr.boot_flag = KERNEL_BOOT_FLAG_MAGIC;
540 params.hdr.header = KERNEL_HDR_MAGIC;
541 params.hdr.cmd_line_ptr = cmdline_addr.offset() as u32;
542 params.ext_cmd_line_ptr = (cmdline_addr.offset() >> 32) as u32;
543 params.hdr.kernel_alignment = KERNEL_MIN_ALIGNMENT_BYTES;
544 if let Some(setup_data) = setup_data {
545 params.hdr.setup_data = setup_data.offset();
546 }
547 if let Some((initrd_addr, initrd_size)) = initrd {
548 params.hdr.ramdisk_image = initrd_addr.offset() as u32;
549 params.ext_ramdisk_image = (initrd_addr.offset() >> 32) as u32;
550 params.hdr.ramdisk_size = initrd_size as u32;
551 params.ext_ramdisk_size = (initrd_size as u64 >> 32) as u32;
552 }
553
554 if e820_entries.len() >= params.e820_table.len() {
555 return Err(Error::E820Configuration);
556 }
557
558 for (src, dst) in e820_entries.iter().zip(params.e820_table.iter_mut()) {
559 dst.addr = src.address.offset();
560 dst.size = src.len;
561 dst.type_ = src.mem_type as u32;
562 }
563 params.e820_entries = e820_entries.len() as u8;
564
565 let zero_page_addr = GuestAddress(ZERO_PAGE_OFFSET);
566 if !guest_mem.is_valid_range(zero_page_addr, mem::size_of::<boot_params>() as u64) {
567 return Err(Error::ZeroPagePastRamEnd);
568 }
569
570 guest_mem
571 .write_obj_at_addr(params, zero_page_addr)
572 .map_err(|_| Error::ZeroPageSetup)?;
573
574 Ok(())
575 }
576
/// Write setup_data entries in guest memory and link them together with the `next` field.
///
/// Returns the guest address of the first entry in the setup_data list, if any.
fn write_setup_data(
    guest_mem: &GuestMemory,
    setup_data_start: GuestAddress,
    setup_data_end: GuestAddress,
    setup_data: &[SetupData],
) -> Result<Option<GuestAddress>> {
    let mut setup_data_list_head = None;

    // Place the first setup_data at the first 64-bit aligned offset following setup_data_start.
    let mut setup_data_addr = setup_data_start.align(8).ok_or(Error::SetupDataTooLarge)?;

    let mut entry_iter = setup_data.iter().peekable();
    while let Some(entry) = entry_iter.next() {
        if setup_data_list_head.is_none() {
            // Remember the head of the list so the caller can link it into boot_params.
            setup_data_list_head = Some(setup_data_addr);
        }

        // Ensure the entry (header plus data) fits into guest memory.
        let entry_size = (mem::size_of::<setup_data_hdr>() + entry.data.len()) as u64;
        let entry_end = setup_data_addr
            .checked_add(entry_size)
            .ok_or(Error::SetupDataTooLarge)?;

        if entry_end >= setup_data_end {
            return Err(Error::SetupDataTooLarge);
        }

        // The `next` pointer must be known before the header is written, so it
        // is computed first by peeking at whether another entry follows.
        let next_setup_data_addr = if entry_iter.peek().is_some() {
            // Place the next setup_data at a 64-bit aligned address.
            setup_data_addr
                .checked_add(entry_size)
                .and_then(|addr| addr.align(8))
                .ok_or(Error::SetupDataTooLarge)?
        } else {
            // This is the final entry. Terminate the list with next == 0.
            GuestAddress(0)
        };

        let hdr = setup_data_hdr {
            next: next_setup_data_addr.offset(),
            type_: entry.type_ as u32,
            len: entry
                .data
                .len()
                .try_into()
                .map_err(|_| Error::SetupDataTooLarge)?,
        };

        // Write the header, then the payload immediately after it.
        guest_mem
            .write_obj_at_addr(hdr, setup_data_addr)
            .map_err(Error::WritingSetupData)?;
        guest_mem
            .write_all_at_addr(
                &entry.data,
                setup_data_addr.unchecked_add(mem::size_of::<setup_data_hdr>() as u64),
            )
            .map_err(Error::WritingSetupData)?;

        // Advance the cursor to where the next entry (if any) will be written.
        setup_data_addr = next_setup_data_addr;
    }

    Ok(setup_data_list_head)
}
643
644 /// Generate a SETUP_RNG_SEED SetupData with random seed data.
setup_data_rng_seed() -> SetupData645 fn setup_data_rng_seed() -> SetupData {
646 let mut data = vec![0u8; 256];
647 OsRng.fill_bytes(&mut data);
648 SetupData {
649 data,
650 type_: SetupDataType::RngSeed,
651 }
652 }
653
654 /// Add an e820 region to the e820 map.
add_e820_entry( e820_entries: &mut Vec<E820Entry>, range: AddressRange, mem_type: E820Type, ) -> Result<()>655 fn add_e820_entry(
656 e820_entries: &mut Vec<E820Entry>,
657 range: AddressRange,
658 mem_type: E820Type,
659 ) -> Result<()> {
660 e820_entries.push(E820Entry {
661 address: GuestAddress(range.start),
662 len: range.len().ok_or(Error::E820Configuration)?,
663 mem_type,
664 });
665
666 Ok(())
667 }
668
669 /// Generate a memory map in INT 0x15 AX=0xE820 format.
generate_e820_memory_map( arch_memory_layout: &ArchMemoryLayout, guest_mem: &GuestMemory, ram_below_1m: AddressRange, ram_below_4g: AddressRange, ram_above_4g: AddressRange, has_protected_vm_firmware: bool, ) -> Result<Vec<E820Entry>>670 fn generate_e820_memory_map(
671 arch_memory_layout: &ArchMemoryLayout,
672 guest_mem: &GuestMemory,
673 ram_below_1m: AddressRange,
674 ram_below_4g: AddressRange,
675 ram_above_4g: AddressRange,
676 has_protected_vm_firmware: bool,
677 ) -> Result<Vec<E820Entry>> {
678 let mut e820_entries = Vec::new();
679
680 add_e820_entry(&mut e820_entries, ram_below_1m, E820Type::Ram)?;
681 add_e820_entry(&mut e820_entries, ram_below_4g, E820Type::Ram)?;
682 if !ram_above_4g.is_empty() {
683 add_e820_entry(&mut e820_entries, ram_above_4g, E820Type::Ram)?
684 }
685
686 if has_protected_vm_firmware {
687 // After the pVM firmware jumped to the guest, the pVM firmware itself
688 // is no longer running, so its memory is reusable by the guest OS.
689 // So add this memory as RAM rather than Reserved.
690 let pvmfw_range = arch_memory_layout.pvmfw_mem.unwrap();
691 add_e820_entry(&mut e820_entries, pvmfw_range, E820Type::Ram)?;
692 }
693
694 let pcie_cfg_mmio_range = arch_memory_layout.pcie_cfg_mmio;
695 add_e820_entry(&mut e820_entries, pcie_cfg_mmio_range, E820Type::Reserved)?;
696
697 add_e820_entry(
698 &mut e820_entries,
699 X8664arch::get_pcie_vcfg_mmio_range(guest_mem, &pcie_cfg_mmio_range),
700 E820Type::Reserved,
701 )?;
702
703 // Reserve memory section for Identity map and TSS
704 add_e820_entry(
705 &mut e820_entries,
706 AddressRange {
707 start: identity_map_addr_start().offset(),
708 end: tss_addr_end().offset() - 1,
709 },
710 E820Type::Reserved,
711 )?;
712
713 Ok(e820_entries)
714 }
715
716 /// Returns a Vec of the valid memory addresses.
717 /// These should be used to configure the GuestMemory structure for the platform.
718 /// For x86_64 all addresses are valid from the start of the kernel except a
719 /// carve out at the end of 32bit address space.
arch_memory_regions( arch_memory_layout: &ArchMemoryLayout, size: u64, bios_size: Option<u64>, has_protected_vm_firmware: bool, ) -> Vec<(GuestAddress, u64, MemoryRegionOptions)>720 pub fn arch_memory_regions(
721 arch_memory_layout: &ArchMemoryLayout,
722 size: u64,
723 bios_size: Option<u64>,
724 has_protected_vm_firmware: bool,
725 ) -> Vec<(GuestAddress, u64, MemoryRegionOptions)> {
726 let mut mem_size = size;
727 let mut regions = Vec::new();
728
729 if has_protected_vm_firmware {
730 regions.push((
731 GuestAddress(PROTECTED_VM_FW_START),
732 PROTECTED_VM_FW_MAX_SIZE,
733 MemoryRegionOptions::new().purpose(MemoryRegionPurpose::ProtectedFirmwareRegion),
734 ));
735
736 // pVM firmware memory is a part of normal guest memory, since it is reusable
737 // by the guest OS once the pVM firmware jumped to the guest. So count its size
738 // as a part of the total guest memory size.
739 if mem_size > PROTECTED_VM_FW_MAX_SIZE {
740 mem_size -= PROTECTED_VM_FW_MAX_SIZE;
741 }
742 }
743
744 let mem_start = START_OF_RAM_32BITS;
745 let mem_end = GuestAddress(mem_size + mem_start);
746
747 let first_addr_past_32bits = GuestAddress(FIRST_ADDR_PAST_32BITS);
748 let max_end_32bits = GuestAddress(max_ram_end_before_32bit(
749 arch_memory_layout,
750 has_protected_vm_firmware,
751 ));
752
753 if mem_end <= max_end_32bits {
754 regions.push((GuestAddress(mem_start), mem_size, Default::default()));
755 if let Some(bios_size) = bios_size {
756 regions.push((bios_start(bios_size), bios_size, Default::default()));
757 }
758 } else {
759 regions.push((
760 GuestAddress(mem_start),
761 max_end_32bits.offset() - mem_start,
762 Default::default(),
763 ));
764 if let Some(bios_size) = bios_size {
765 regions.push((bios_start(bios_size), bios_size, Default::default()));
766 }
767 regions.push((
768 first_addr_past_32bits,
769 mem_end.offset_from(max_end_32bits),
770 Default::default(),
771 ));
772 }
773
774 regions
775 }
776
777 impl arch::LinuxArch for X8664arch {
778 type Error = Error;
779 type ArchMemoryLayout = ArchMemoryLayout;
780
    fn arch_memory_layout(
        components: &VmComponents,
    ) -> std::result::Result<Self::ArchMemoryLayout, Self::Error> {
        // Delegate to the free function; the pVM firmware region is only laid
        // out when the configured protection type actually runs firmware.
        create_arch_memory_layout(
            &components.pci_config,
            components.hv_cfg.protection_type.runs_firmware(),
        )
    }
789
guest_memory_layout( components: &VmComponents, arch_memory_layout: &Self::ArchMemoryLayout, _hypervisor: &impl Hypervisor, ) -> std::result::Result<Vec<(GuestAddress, u64, MemoryRegionOptions)>, Self::Error>790 fn guest_memory_layout(
791 components: &VmComponents,
792 arch_memory_layout: &Self::ArchMemoryLayout,
793 _hypervisor: &impl Hypervisor,
794 ) -> std::result::Result<Vec<(GuestAddress, u64, MemoryRegionOptions)>, Self::Error> {
795 let has_protected_vm_firmware = components.hv_cfg.protection_type.runs_firmware();
796
797 let bios_size = match &components.vm_image {
798 VmImage::Bios(bios_file) => Some(bios_file.metadata().map_err(Error::LoadBios)?.len()),
799 VmImage::Kernel(_) => None,
800 };
801
802 Ok(arch_memory_regions(
803 arch_memory_layout,
804 components.memory_size,
805 bios_size,
806 has_protected_vm_firmware,
807 ))
808 }
809
    fn get_system_allocator_config<V: Vm>(
        vm: &V,
        arch_memory_layout: &Self::ArchMemoryLayout,
    ) -> SystemAllocatorConfig {
        SystemAllocatorConfig {
            // Allocate device I/O ports from the 0xc000-0xffff range.
            io: Some(AddressRange {
                start: 0xc000,
                end: 0xffff,
            }),
            low_mmio: arch_memory_layout.pci_mmio_before_32bit,
            high_mmio: Self::get_high_mmio_range(vm, arch_memory_layout),
            platform_mmio: None,
            // IRQs below X86_64_IRQ_BASE are reserved (SCI, RTC, legacy serial).
            first_irq: X86_64_IRQ_BASE,
        }
    }
825
    fn build_vm<V, Vcpu>(
        mut components: VmComponents,
        arch_memory_layout: &Self::ArchMemoryLayout,
        vm_evt_wrtube: &SendTube,
        system_allocator: &mut SystemAllocator,
        serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>,
        serial_jail: Option<Minijail>,
        battery: (Option<BatteryType>, Option<Minijail>),
        mut vm: V,
        ramoops_region: Option<arch::pstore::RamoopsRegion>,
        devs: Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>,
        irq_chip: &mut dyn IrqChipX86_64,
        vcpu_ids: &mut Vec<usize>,
        dump_device_tree_blob: Option<PathBuf>,
        debugcon_jail: Option<Minijail>,
        pflash_jail: Option<Minijail>,
        fw_cfg_jail: Option<Minijail>,
        #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
        guest_suspended_cvar: Option<Arc<(Mutex<bool>, Condvar)>>,
        device_tree_overlays: Vec<DtbOverlay>,
        _fdt_position: Option<FdtPosition>,
        _no_pmu: bool,
    ) -> std::result::Result<RunnableLinuxVm<V, Vcpu>, Self::Error>
    where
        V: VmX86_64,
        Vcpu: VcpuX86_64,
    {
        let mem = vm.get_memory().clone();

        let vcpu_count = components.vcpu_count;

        // Reserve the guest-physical pages the hypervisor needs for the identity
        // map and TSS (used when emulating real mode).
        vm.set_identity_map_addr(identity_map_addr_start())
            .map_err(Error::SetIdentityMapAddr)?;

        vm.set_tss_addr(tss_addr_start())
            .map_err(Error::SetTssAddr)?;

        // Use IRQ info in ACPI if provided by the user.
        let mut mptable = true;
        let mut sci_irq = X86_64_SCI_IRQ;

        // Punch the PCIe config MMIO window out of PCI low MMIO so that it can't be
        // allocated to any device.
        let pcie_cfg_mmio_range = arch_memory_layout.pcie_cfg_mmio;
        system_allocator
            .reserve_mmio(pcie_cfg_mmio_range)
            .map_err(Error::ReservePcieCfgMmio)?;

        // A user-supplied FADT ("FACP") carries its own SCI interrupt; in that case
        // skip generating an MP table and honor the FADT's SCI IRQ instead.
        for sdt in components.acpi_sdts.iter() {
            if sdt.is_signature(b"FACP") {
                mptable = false;
                let sci_irq_fadt: u16 = sdt.read(acpi::FADT_FIELD_SCI_INTERRUPT);
                sci_irq = sci_irq_fadt.into();
                if !system_allocator.reserve_irq(sci_irq) {
                    warn!("sci irq {} already reserved.", sci_irq);
                }
            }
        }

        let pcie_vcfg_range = Self::get_pcie_vcfg_mmio_range(&mem, &pcie_cfg_mmio_range);
        let mmio_bus = Arc::new(Bus::new(BusType::Mmio));
        let io_bus = Arc::new(Bus::new(BusType::Io));

        // Separate the PCI devices from the rest of the device list.
        let (pci_devices, devs): (Vec<_>, Vec<_>) = devs
            .into_iter()
            .partition(|(dev, _)| dev.as_pci_device().is_some());

        let pci_devices = pci_devices
            .into_iter()
            .map(|(dev, jail_orig)| (dev.into_pci_device().unwrap(), jail_orig))
            .collect();

        let (pci, pci_irqs, mut pid_debug_label_map, amls, gpe_scope_amls) =
            arch::generate_pci_root(
                pci_devices,
                irq_chip.as_irq_chip_mut(),
                mmio_bus.clone(),
                GuestAddress(pcie_cfg_mmio_range.start),
                12,
                io_bus.clone(),
                system_allocator,
                &mut vm,
                4, // Share the four pin interrupts (INTx#)
                Some(pcie_vcfg_range.start),
                #[cfg(feature = "swap")]
                swap_controller,
            )
            .map_err(Error::CreatePciRoot)?;

        let pci = Arc::new(Mutex::new(pci));
        pci.lock().enable_pcie_cfg_mmio(pcie_cfg_mmio_range.start);
        let pci_cfg = PciConfigIo::new(
            pci.clone(),
            components.break_linux_pci_config_io,
            vm_evt_wrtube.try_clone().map_err(Error::CloneTube)?,
        );
        let pci_bus = Arc::new(Mutex::new(pci_cfg));
        // Legacy PCI configuration access via I/O ports 0xcf8-0xcff.
        io_bus.insert(pci_bus, 0xcf8, 0x8).unwrap();

        let pcie_cfg_mmio = Arc::new(Mutex::new(PciConfigMmio::new(pci.clone(), 12)));
        let pcie_cfg_mmio_len = pcie_cfg_mmio_range.len().unwrap();
        mmio_bus
            .insert(pcie_cfg_mmio, pcie_cfg_mmio_range.start, pcie_cfg_mmio_len)
            .unwrap();

        let pcie_vcfg_mmio = Arc::new(Mutex::new(PciVirtualConfigMmio::new(pci.clone(), 13)));
        mmio_bus
            .insert(
                pcie_vcfg_mmio,
                pcie_vcfg_range.start,
                pcie_vcfg_range.len().unwrap(),
            )
            .unwrap();

        // From the remaining (non-PCI) devices, register the virtio-mmio ones.
        let (virtio_mmio_devices, _others): (Vec<_>, Vec<_>) = devs
            .into_iter()
            .partition(|(dev, _)| dev.as_virtio_mmio_device().is_some());

        let virtio_mmio_devices = virtio_mmio_devices
            .into_iter()
            .map(|(dev, jail_orig)| (*(dev.into_virtio_mmio_device().unwrap()), jail_orig))
            .collect();
        let (mut virtio_mmio_pid, sdts) = arch::generate_virtio_mmio_bus(
            virtio_mmio_devices,
            irq_chip.as_irq_chip_mut(),
            &mmio_bus,
            system_allocator,
            &mut vm,
            components.acpi_sdts,
            #[cfg(feature = "swap")]
            swap_controller,
        )
        .map_err(Error::CreateVirtioMmioBus)?;
        components.acpi_sdts = sdts;
        pid_debug_label_map.append(&mut virtio_mmio_pid);

        // Event used to notify crosvm that guest OS is trying to suspend.
        let (suspend_tube_send, suspend_tube_recv) =
            Tube::directional_pair().map_err(Error::CreateTube)?;
        let suspend_tube_send = Arc::new(Mutex::new(suspend_tube_send));

        if components.fw_cfg_enable {
            Self::setup_fw_cfg_device(
                &io_bus,
                components.fw_cfg_parameters.clone(),
                components.bootorder_fw_cfg_blob.clone(),
                fw_cfg_jail,
                #[cfg(feature = "swap")]
                swap_controller,
            )?;
        }

        if !components.no_i8042 {
            Self::setup_legacy_i8042_device(
                &io_bus,
                irq_chip.pit_uses_speaker_port(),
                vm_evt_wrtube.try_clone().map_err(Error::CloneTube)?,
            )?;
        }
        // The CMOS/RTC device needs a control tube; keep the host end so it can be
        // handed back to the caller in `vm_request_tubes` below.
        let mut vm_request_tube = if !components.no_rtc {
            let (host_tube, device_tube) = Tube::pair()
                .context("create tube")
                .map_err(Error::SetupCmos)?;
            Self::setup_legacy_cmos_device(
                arch_memory_layout,
                &io_bus,
                irq_chip,
                device_tube,
                components.memory_size,
                components.hv_cfg.protection_type.runs_firmware(),
            )
            .map_err(Error::SetupCmos)?;
            Some(host_tube)
        } else {
            None
        };
        let serial_devices = Self::setup_serial_devices(
            components.hv_cfg.protection_type,
            irq_chip.as_irq_chip_mut(),
            &io_bus,
            serial_parameters,
            serial_jail,
            #[cfg(feature = "swap")]
            swap_controller,
        )?;
        Self::setup_debugcon_devices(
            components.hv_cfg.protection_type,
            &io_bus,
            serial_parameters,
            debugcon_jail,
            #[cfg(feature = "swap")]
            swap_controller,
        )?;

        let bios_size = if let VmImage::Bios(ref bios) = components.vm_image {
            bios.metadata().map_err(Error::LoadBios)?.len()
        } else {
            0
        };
        if let Some(pflash_image) = components.pflash_image {
            Self::setup_pflash(
                pflash_image,
                components.pflash_block_size,
                bios_size,
                &mmio_bus,
                pflash_jail,
                #[cfg(feature = "swap")]
                swap_controller,
            )?;
        }

        // Functions that use/create jails MUST be used before the call to
        // setup_acpi_devices below, as this move us into a multiprocessing state
        // from which we can no longer fork.

        let mut resume_notify_devices = Vec::new();

        // Each PCI bus occupies 1 MB of the PCIe enhanced configuration (ECAM)
        // region, so the highest bus number follows from the region's size.
        let max_bus = (pcie_cfg_mmio_len / 0x100000 - 1) as u8;
        let (mut acpi_dev_resource, bat_control) = Self::setup_acpi_devices(
            arch_memory_layout,
            pci.clone(),
            &mem,
            &io_bus,
            system_allocator,
            suspend_tube_send.clone(),
            vm_evt_wrtube.try_clone().map_err(Error::CloneTube)?,
            components.acpi_sdts,
            irq_chip.as_irq_chip_mut(),
            sci_irq,
            battery,
            &mmio_bus,
            max_bus,
            &mut resume_notify_devices,
            #[cfg(feature = "swap")]
            swap_controller,
            #[cfg(any(target_os = "android", target_os = "linux"))]
            components.ac_adapter,
            guest_suspended_cvar,
            &pci_irqs,
        )?;

        // Create customized SSDT table
        let sdt = acpi::create_customize_ssdt(pci.clone(), amls, gpe_scope_amls);
        if let Some(sdt) = sdt {
            acpi_dev_resource.sdts.push(sdt);
        }

        irq_chip
            .finalize_devices(system_allocator, &io_bus, &mmio_bus)
            .map_err(Error::RegisterIrqfd)?;

        // All of these bios generated tables are set manually for the benefit of the kernel boot
        // flow (since there's no BIOS to set it) and for the BIOS boot flow since crosvm doesn't
        // have a way to pass the BIOS these configs.
        // This works right now because the only guest BIOS used with crosvm (u-boot) ignores these
        // tables and the guest OS picks them up.
        // If another guest does need a way to pass these tables down to its BIOS, this approach
        // should be rethought.

        if mptable {
            // Note that this puts the mptable at 0x9FC00 in guest physical memory.
            mptable::setup_mptable(&mem, vcpu_count as u8, &pci_irqs)
                .map_err(Error::SetupMptable)?;
        }
        smbios::setup_smbios(&mem, &components.smbios, bios_size).map_err(Error::SetupSmbios)?;

        let host_cpus = if components.host_cpu_topology {
            components.vcpu_affinity.clone()
        } else {
            None
        };

        // TODO (tjeznach) Write RSDP to bootconfig before writing to memory
        acpi::create_acpi_tables(
            &mem,
            vcpu_count as u8,
            sci_irq,
            0xcf9,
            6, // RST_CPU|SYS_RST
            &acpi_dev_resource,
            host_cpus,
            vcpu_ids,
            &pci_irqs,
            pcie_cfg_mmio_range.start,
            max_bus,
            components.force_s2idle,
        )
        .ok_or(Error::CreateAcpi)?;

        let mut cmdline = Self::get_base_linux_cmdline();

        get_serial_cmdline(&mut cmdline, serial_parameters, "io", &serial_devices)
            .map_err(Error::GetSerialCmdline)?;

        for param in components.extra_kernel_params {
            cmdline.insert_str(&param).map_err(Error::Cmdline)?;
        }

        if let Some(ramoops_region) = ramoops_region {
            arch::pstore::add_ramoops_kernel_cmdline(&mut cmdline, &ramoops_region)
                .map_err(Error::Cmdline)?;
        }

        let pci_start = arch_memory_layout.pci_mmio_before_32bit.start;

        let mut vcpu_init = vec![VcpuInitX86_64::default(); vcpu_count];
        let mut msrs = BTreeMap::new();

        let protection_type = components.hv_cfg.protection_type;

        match components.vm_image {
            VmImage::Bios(ref mut bios) => {
                if protection_type.runs_firmware() {
                    return Err(Error::PvmFwBiosUnsupported);
                }

                // Allow a bios to hardcode CMDLINE_OFFSET and read the kernel command line from it.
                Self::load_cmdline(
                    &mem,
                    GuestAddress(CMDLINE_OFFSET),
                    cmdline,
                    CMDLINE_MAX_SIZE as usize - 1,
                )?;
                Self::load_bios(&mem, bios)?;
                regs::set_default_msrs(&mut msrs);
                // The default values for `Regs` and `Sregs` already set up the reset vector.
            }
            VmImage::Kernel(ref mut kernel_image) => {
                let (params, kernel_end, kernel_entry, cpu_mode, kernel_type) =
                    Self::load_kernel(&mem, kernel_image)?;

                info!("Loaded {} kernel", kernel_type);

                Self::setup_system_memory(
                    arch_memory_layout,
                    &mem,
                    cmdline,
                    components.initrd_image,
                    components.android_fstab,
                    kernel_end,
                    params,
                    dump_device_tree_blob,
                    device_tree_overlays,
                    protection_type.runs_firmware(),
                )?;

                if protection_type.needs_firmware_loaded() {
                    arch::load_image(
                        &mem,
                        &mut components
                            .pvm_fw
                            .expect("pvmfw must be available if ProtectionType loads it"),
                        GuestAddress(PROTECTED_VM_FW_START),
                        PROTECTED_VM_FW_MAX_SIZE,
                    )
                    .map_err(Error::LoadCustomPvmFw)?;
                }

                // When pVM firmware is present, the guest boots into the firmware
                // rather than directly into the kernel.
                let entry_addr = if protection_type.runs_firmware() {
                    PROTECTED_VM_FW_START
                } else {
                    kernel_entry.offset()
                };

                vcpu_init[0].regs.rip = entry_addr;

                match kernel_type {
                    KernelType::BzImage | KernelType::Elf => {
                        // Configure the bootstrap VCPU for the Linux/x86 boot protocol.
                        // <https://www.kernel.org/doc/html/latest/x86/boot.html>
                        vcpu_init[0].regs.rsp = BOOT_STACK_POINTER;
                        vcpu_init[0].regs.rsi = ZERO_PAGE_OFFSET;
                    }
                }

                if protection_type.runs_firmware() {
                    // Pass pVM payload entry address to pVM firmware.
                    // NOTE: this ABI is subject to change. Possibly we will pass
                    // all the needed info (payload entry, start and size) in in-memory
                    // structures (e.g. DTB) instead.
                    vcpu_init[0].regs.rdi = kernel_entry.offset();
                }

                match cpu_mode {
                    CpuMode::LongMode => {
                        regs::set_long_mode_msrs(&mut msrs);

                        // Set up long mode and enable paging.
                        regs::configure_segments_and_sregs(&mem, &mut vcpu_init[0].sregs)
                            .map_err(Error::ConfigureSegments)?;
                        regs::setup_page_tables(&mem, &mut vcpu_init[0].sregs)
                            .map_err(Error::SetupPageTables)?;
                    }
                    CpuMode::FlatProtectedMode => {
                        regs::set_default_msrs(&mut msrs);

                        // Set up 32-bit protected mode with paging disabled.
                        regs::configure_segments_and_sregs_flat32(&mem, &mut vcpu_init[0].sregs)
                            .map_err(Error::ConfigureSegments)?;
                    }
                }

                regs::set_mtrr_msrs(&mut msrs, &vm, pci_start);
            }
        }

        // Initialize MSRs for all VCPUs.
        for vcpu in vcpu_init.iter_mut() {
            vcpu.msrs = msrs.clone();
        }

        let mut vm_request_tubes = Vec::new();
        if let Some(req_tube) = vm_request_tube.take() {
            vm_request_tubes.push(req_tube);
        }

        Ok(RunnableLinuxVm {
            vm,
            vcpu_count,
            vcpus: None,
            vcpu_affinity: components.vcpu_affinity,
            vcpu_init,
            no_smt: components.no_smt,
            irq_chip: irq_chip.try_box_clone().map_err(Error::CloneIrqChip)?,
            io_bus,
            mmio_bus,
            pid_debug_label_map,
            suspend_tube: (suspend_tube_send, suspend_tube_recv),
            resume_notify_devices,
            rt_cpus: components.rt_cpus,
            delay_rt: components.delay_rt,
            bat_control,
            pm: Some(acpi_dev_resource.pm),
            root_config: pci,
            #[cfg(any(target_os = "android", target_os = "linux"))]
            platform_devices: Vec::new(),
            hotplug_bus: BTreeMap::new(),
            devices_thread: None,
            vm_request_tubes,
        })
    }
1268
configure_vcpu<V: Vm>( vm: &V, hypervisor: &dyn HypervisorX86_64, irq_chip: &mut dyn IrqChipX86_64, vcpu: &mut dyn VcpuX86_64, vcpu_init: VcpuInitX86_64, vcpu_id: usize, num_cpus: usize, cpu_config: Option<CpuConfigX86_64>, ) -> Result<()>1269 fn configure_vcpu<V: Vm>(
1270 vm: &V,
1271 hypervisor: &dyn HypervisorX86_64,
1272 irq_chip: &mut dyn IrqChipX86_64,
1273 vcpu: &mut dyn VcpuX86_64,
1274 vcpu_init: VcpuInitX86_64,
1275 vcpu_id: usize,
1276 num_cpus: usize,
1277 cpu_config: Option<CpuConfigX86_64>,
1278 ) -> Result<()> {
1279 let cpu_config = match cpu_config {
1280 Some(config) => config,
1281 None => return Err(Error::InvalidCpuConfig),
1282 };
1283 if !vm.check_capability(VmCap::EarlyInitCpuid) {
1284 cpuid::setup_cpuid(hypervisor, irq_chip, vcpu, vcpu_id, num_cpus, cpu_config)
1285 .map_err(Error::SetupCpuid)?;
1286 }
1287
1288 vcpu.set_regs(&vcpu_init.regs).map_err(Error::WriteRegs)?;
1289
1290 vcpu.set_sregs(&vcpu_init.sregs)
1291 .map_err(Error::SetupSregs)?;
1292
1293 vcpu.set_fpu(&vcpu_init.fpu).map_err(Error::SetupFpu)?;
1294
1295 let vcpu_supported_var_mtrrs = regs::vcpu_supported_variable_mtrrs(vcpu);
1296 let num_var_mtrrs = regs::count_variable_mtrrs(&vcpu_init.msrs);
1297 let skip_mtrr_msrs = if num_var_mtrrs > vcpu_supported_var_mtrrs {
1298 warn!(
1299 "Too many variable MTRR entries ({} required, {} supported),
1300 please check pci_start addr, guest with pass through device may be very slow",
1301 num_var_mtrrs, vcpu_supported_var_mtrrs,
1302 );
1303 // Filter out the MTRR entries from the MSR list.
1304 true
1305 } else {
1306 false
1307 };
1308
1309 for (msr_index, value) in vcpu_init.msrs.into_iter() {
1310 if skip_mtrr_msrs && regs::is_mtrr_msr(msr_index) {
1311 continue;
1312 }
1313
1314 vcpu.set_msr(msr_index, value).map_err(Error::SetupMsrs)?;
1315 }
1316
1317 interrupts::set_lint(vcpu_id, irq_chip).map_err(Error::SetLint)?;
1318
1319 Ok(())
1320 }
1321
    fn register_pci_device<V: VmX86_64, Vcpu: VcpuX86_64>(
        linux: &mut RunnableLinuxVm<V, Vcpu>,
        device: Box<dyn PciDevice>,
        #[cfg(any(target_os = "android", target_os = "linux"))] minijail: Option<Minijail>,
        resources: &mut SystemAllocator,
        hp_control_tube: &mpsc::Sender<PciRootCommand>,
        #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
    ) -> Result<PciAddress> {
        // Thin wrapper: all the work happens in the arch-common helper; this only
        // adapts the error type to this module's `Error`.
        arch::configure_pci_device(
            linux,
            device,
            #[cfg(any(target_os = "android", target_os = "linux"))]
            minijail,
            resources,
            hp_control_tube,
            #[cfg(feature = "swap")]
            swap_controller,
        )
        .map_err(Error::ConfigurePciDevice)
    }
1342
get_host_cpu_frequencies_khz() -> Result<BTreeMap<usize, Vec<u32>>>1343 fn get_host_cpu_frequencies_khz() -> Result<BTreeMap<usize, Vec<u32>>> {
1344 Ok(BTreeMap::new())
1345 }
1346
get_host_cpu_max_freq_khz() -> Result<BTreeMap<usize, u32>>1347 fn get_host_cpu_max_freq_khz() -> Result<BTreeMap<usize, u32>> {
1348 Ok(BTreeMap::new())
1349 }
1350
get_host_cpu_capacity() -> Result<BTreeMap<usize, u32>>1351 fn get_host_cpu_capacity() -> Result<BTreeMap<usize, u32>> {
1352 Ok(BTreeMap::new())
1353 }
1354
get_host_cpu_clusters() -> Result<Vec<CpuSet>>1355 fn get_host_cpu_clusters() -> Result<Vec<CpuSet>> {
1356 Ok(Vec::new())
1357 }
1358 }
1359
// _OSC status bit returned in CDW1 (the first dword of Arg3): the OS passed a
// UUID this platform does not recognize.
const OSC_STATUS_UNSUPPORT_UUID: u32 = 0x4;
// PCI host bridge _OSC control bits returned in CDW3.
#[allow(dead_code)]
const PCI_HB_OSC_CONTROL_PCIE_HP: u32 = 0x1;
const PCI_HB_OSC_CONTROL_SHPC_HP: u32 = 0x2;
#[allow(dead_code)]
const PCI_HB_OSC_CONTROL_PCIE_PME: u32 = 0x4;
const PCI_HB_OSC_CONTROL_PCIE_AER: u32 = 0x8;
#[allow(dead_code)]
const PCI_HB_OSC_CONTROL_PCIE_CAP: u32 = 0x10;

// Marker type whose `Aml` impl (below) emits the PCI host bridge _OSC method.
struct PciRootOSC {}
1373
1374 // Method (_OSC, 4, NotSerialized) // _OSC: Operating System Capabilities
1375 // {
1376 // CreateDWordField (Arg3, Zero, CDW1) // flag and return value
1377 // If (Arg0 == ToUUID ("33db4d5b-1ff7-401c-9657-7441c03dd766"))
1378 // {
1379 // CreateDWordField (Arg3, 8, CDW3) // control field
1380 // if ( 0 == (CDW1 & 0x01)) // Query flag ?
1381 // {
1382 // CDW3 &= !(SHPC_HP | AER)
1383 // }
1384 // } Else {
1385 // CDW1 |= UNSUPPORT_UUID
1386 // }
1387 // Return (Arg3)
1388 // }
impl Aml for PciRootOSC {
    // Appends the _OSC method's AML encoding to `aml`; see the ASL pseudocode in
    // the comment above for the method being generated.
    fn to_aml_bytes(&self, aml: &mut Vec<u8>) {
        // UUID identifying the PCI host bridge _OSC interface.
        let osc_uuid = "33DB4D5B-1FF7-401C-9657-7441C03DD766";
        // virtual pcie root port supports hotplug, pme, and pcie cap register, clear all
        // the other bits.
        let mask = !(PCI_HB_OSC_CONTROL_SHPC_HP | PCI_HB_OSC_CONTROL_PCIE_AER);
        aml::Method::new(
            "_OSC".into(),
            4,     // _OSC takes four arguments
            false, // NotSerialized
            vec![
                // CDW1: the dword at offset 0 of Arg3 (status / query-flag field).
                &aml::CreateDWordField::new(
                    &aml::Name::new_field_name("CDW1"),
                    &aml::Arg(3),
                    &aml::ZERO,
                ),
                &aml::If::new(
                    &aml::Equal::new(&aml::Arg(0), &aml::Uuid::new(osc_uuid)),
                    vec![
                        // CDW3: the dword at offset 8 of Arg3 (control field).
                        &aml::CreateDWordField::new(
                            &aml::Name::new_field_name("CDW3"),
                            &aml::Arg(3),
                            &(8_u8),
                        ),
                        // If this is not a query call (bit 0 of CDW1 is clear)...
                        &aml::If::new(
                            &aml::Equal::new(
                                &aml::ZERO,
                                &aml::And::new(
                                    &aml::ZERO,
                                    &aml::Name::new_field_name("CDW1"),
                                    &aml::ONE,
                                ),
                            ),
                            // ...clear the unsupported control bits: CDW3 &= mask.
                            vec![&aml::And::new(
                                &aml::Name::new_field_name("CDW3"),
                                &mask,
                                &aml::Name::new_field_name("CDW3"),
                            )],
                        ),
                    ],
                ),
                // Unknown UUID: flag it in the status dword (CDW1 |= UNSUPPORT_UUID).
                &aml::Else::new(vec![&aml::Or::new(
                    &aml::Name::new_field_name("CDW1"),
                    &OSC_STATUS_UNSUPPORT_UUID,
                    &aml::Name::new_field_name("CDW1"),
                )]),
                // _OSC returns the (possibly modified) Arg3 buffer.
                &aml::Return::new(&aml::Arg(3)),
            ],
        )
        .to_aml_bytes(aml)
    }
}
1441
/// Initial vCPU execution mode, as selected by the kernel loader.
pub enum CpuMode {
    /// 32-bit protected mode with paging disabled.
    FlatProtectedMode,

    /// 64-bit long mode.
    LongMode,
}
1449
/// The format of the kernel image loaded into the guest.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum KernelType {
    BzImage,
    Elf,
}

impl fmt::Display for KernelType {
    /// Writes the conventional human-readable name of the kernel format.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let name = match self {
            KernelType::BzImage => "bzImage",
            KernelType::Elf => "ELF",
        };
        f.write_str(name)
    }
}
1464
1465 impl X8664arch {
1466 /// Loads the bios from an open file.
1467 ///
1468 /// # Arguments
1469 ///
1470 /// * `mem` - The memory to be used by the guest.
1471 /// * `bios_image` - the File object for the specified bios
load_bios(mem: &GuestMemory, bios_image: &mut File) -> Result<()>1472 fn load_bios(mem: &GuestMemory, bios_image: &mut File) -> Result<()> {
1473 let bios_image_length = bios_image.get_len().map_err(Error::LoadBios)?;
1474 if bios_image_length >= FIRST_ADDR_PAST_32BITS {
1475 return Err(Error::LoadBios(io::Error::new(
1476 io::ErrorKind::InvalidData,
1477 format!(
1478 "bios was {} bytes, expected less than {}",
1479 bios_image_length, FIRST_ADDR_PAST_32BITS,
1480 ),
1481 )));
1482 }
1483
1484 let guest_slice = mem
1485 .get_slice_at_addr(bios_start(bios_image_length), bios_image_length as usize)
1486 .map_err(Error::SetupGuestMemory)?;
1487 bios_image
1488 .read_exact_at_volatile(guest_slice, 0)
1489 .map_err(Error::LoadBios)?;
1490 Ok(())
1491 }
1492
    // Creates the parallel-flash (pflash) device backed by `pflash_image` and maps
    // it on the MMIO bus directly below the BIOS (which itself ends at 4 GB).
    fn setup_pflash(
        pflash_image: File,
        block_size: u32,
        bios_size: u64,
        mmio_bus: &Bus,
        jail: Option<Minijail>,
        #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
    ) -> Result<()> {
        let size = pflash_image.metadata().map_err(Error::LoadPflash)?.len();
        let start = FIRST_ADDR_PAST_32BITS - bios_size - size;
        let pflash_image = Box::new(pflash_image);

        // Collect the backing file's descriptors before the device is moved into the
        // jailed proxy process, which needs to keep them.
        #[cfg(any(target_os = "android", target_os = "linux"))]
        let fds = pflash_image.as_raw_descriptors();

        let pflash = Pflash::new(pflash_image, block_size).map_err(Error::SetupPflash)?;
        // Wrap the device in a sandboxed proxy process when a jail was provided.
        let pflash: Arc<Mutex<dyn BusDevice>> = match jail {
            #[cfg(any(target_os = "android", target_os = "linux"))]
            Some(jail) => Arc::new(Mutex::new(
                ProxyDevice::new(
                    pflash,
                    jail,
                    fds,
                    #[cfg(feature = "swap")]
                    swap_controller,
                )
                .map_err(Error::CreateProxyDevice)?,
            )),
            // Jailing is not supported on Windows.
            #[cfg(windows)]
            Some(_) => unreachable!(),
            None => Arc::new(Mutex::new(pflash)),
        };
        mmio_bus
            .insert(pflash, start, size)
            .map_err(Error::InsertBus)?;

        Ok(())
    }
1531
1532 /// Writes the command line string to the given memory slice.
1533 ///
1534 /// # Arguments
1535 ///
1536 /// * `guest_mem` - A u8 slice that will be partially overwritten by the command line.
1537 /// * `guest_addr` - The address in `guest_mem` at which to load the command line.
1538 /// * `cmdline` - The kernel command line.
1539 /// * `kernel_max_cmdline_len` - The maximum command line length (without NUL terminator)
1540 /// supported by the kernel.
load_cmdline( guest_mem: &GuestMemory, guest_addr: GuestAddress, cmdline: kernel_cmdline::Cmdline, kernel_max_cmdline_len: usize, ) -> Result<()>1541 fn load_cmdline(
1542 guest_mem: &GuestMemory,
1543 guest_addr: GuestAddress,
1544 cmdline: kernel_cmdline::Cmdline,
1545 kernel_max_cmdline_len: usize,
1546 ) -> Result<()> {
1547 let mut cmdline_guest_mem_slice = guest_mem
1548 .get_slice_at_addr(guest_addr, CMDLINE_MAX_SIZE as usize)
1549 .map_err(|_| Error::CommandLineOverflow)?;
1550
1551 let mut cmdline_bytes: Vec<u8> = cmdline
1552 .into_bytes_with_max_len(kernel_max_cmdline_len)
1553 .map_err(Error::Cmdline)?;
1554 cmdline_bytes.push(0u8); // Add NUL terminator.
1555
1556 cmdline_guest_mem_slice
1557 .write_all(&cmdline_bytes)
1558 .map_err(|_| Error::CommandLineOverflow)?;
1559
1560 Ok(())
1561 }
1562
1563 /// Loads the kernel from an open file.
1564 ///
1565 /// # Arguments
1566 ///
1567 /// * `mem` - The memory to be used by the guest.
1568 /// * `kernel_image` - the File object for the specified kernel.
1569 ///
1570 /// # Returns
1571 ///
1572 /// On success, returns the Linux x86_64 boot protocol parameters, the first address past the
1573 /// end of the kernel, the entry point (initial `RIP` value), the initial CPU mode, and the type
1574 /// of kernel.
    fn load_kernel(
        mem: &GuestMemory,
        kernel_image: &mut File,
    ) -> Result<(boot_params, u64, GuestAddress, CpuMode, KernelType)> {
        let kernel_start = GuestAddress(KERNEL_START_OFFSET);
        // Try ELF first; only fall back to bzImage when the ELF magic is absent.
        match kernel_loader::load_elf64(mem, kernel_start, kernel_image, 0) {
            Ok(loaded_kernel) => {
                // ELF kernels don't contain a `boot_params` structure, so synthesize a default one.
                let boot_params = boot_params {
                    hdr: setup_header {
                        // Advertise the full command line size (minus the NUL terminator).
                        cmdline_size: CMDLINE_MAX_SIZE as u32 - 1,
                        ..Default::default()
                    },
                    ..Default::default()
                };
                Ok((
                    boot_params,
                    loaded_kernel.address_range.end,
                    loaded_kernel.entry,
                    CpuMode::LongMode,
                    KernelType::Elf,
                ))
            }
            Err(kernel_loader::Error::InvalidMagicNumber) => {
                // The image failed to parse as ELF, so try to load it as a bzImage.
                let (boot_params, bzimage_end, bzimage_entry, cpu_mode) =
                    bzimage::load_bzimage(mem, kernel_start, kernel_image)
                        .map_err(Error::LoadBzImage)?;
                Ok((
                    boot_params,
                    bzimage_end,
                    bzimage_entry,
                    cpu_mode,
                    KernelType::BzImage,
                ))
            }
            // Any other ELF loader failure is fatal.
            Err(e) => Err(Error::LoadKernel(e)),
        }
    }
1614
    /// Configures the guest system memory layout. Should be called once per VM,
    /// before starting the vCPU threads.
1617 ///
1618 /// # Arguments
1619 ///
1620 /// * `mem` - The memory to be used by the guest.
1621 /// * `cmdline` - the kernel commandline
1622 /// * `initrd_file` - an initial ramdisk image
    pub fn setup_system_memory(
        arch_memory_layout: &ArchMemoryLayout,
        mem: &GuestMemory,
        cmdline: kernel_cmdline::Cmdline,
        initrd_file: Option<File>,
        android_fstab: Option<File>,
        kernel_end: u64,
        params: boot_params,
        dump_device_tree_blob: Option<PathBuf>,
        device_tree_overlays: Vec<DtbOverlay>,
        has_protected_vm_firmware: bool,
    ) -> Result<()> {
        // Some guest kernels expect a typical PC memory layout where the region between 640 KB and
        // 1 MB is reserved for device memory/ROMs and get confused if there is a RAM region
        // spanning this area, so we provide the traditional 640 KB low memory and 1 MB+
        // high memory regions.
        let ram_below_1m_end = 640 * 1024;
        let ram_below_1m = AddressRange {
            start: START_OF_RAM_32BITS,
            end: ram_below_1m_end - 1,
        };

        // GuestMemory::end_addr() returns the first address past the end, so subtract 1 to get the
        // inclusive end.
        let guest_mem_end = mem.end_addr().offset() - 1;

        // Find the end of the part of guest memory below 4G that is not pVM firmware memory.
        // This part of guest memory includes just one region, so just find the end of this region.
        let max_ram_end_below_4g =
            max_ram_end_before_32bit(arch_memory_layout, has_protected_vm_firmware) - 1;
        let guest_mem_end_below_4g = mem
            .regions()
            .map(|r| r.guest_addr.offset() + r.size as u64 - 1)
            .find(|&addr| addr <= max_ram_end_below_4g)
            .expect("no memory region below 4G");

        let ram_below_4g = AddressRange {
            start: FIRST_ADDR_PAST_20BITS,
            end: guest_mem_end_below_4g,
        };
        let ram_above_4g = AddressRange {
            start: FIRST_ADDR_PAST_32BITS,
            end: guest_mem_end,
        };

        // Describe RAM and reserved regions to the guest via the e820 map.
        let e820_entries = generate_e820_memory_map(
            arch_memory_layout,
            mem,
            ram_below_1m,
            ram_below_4g,
            ram_above_4g,
            has_protected_vm_firmware,
        )?;

        let kernel_max_cmdline_len = if params.hdr.cmdline_size == 0 {
            // Old kernels have a maximum length of 255 bytes, not including the NUL.
            255
        } else {
            params.hdr.cmdline_size as usize
        };
        debug!("kernel_max_cmdline_len={kernel_max_cmdline_len}");
        Self::load_cmdline(
            mem,
            GuestAddress(CMDLINE_OFFSET),
            cmdline,
            kernel_max_cmdline_len,
        )?;

        // Build the setup_data list: an optional device tree blob, then an RNG seed.
        let mut setup_data = Vec::<SetupData>::new();
        if android_fstab.is_some() || !device_tree_overlays.is_empty() {
            let device_tree_blob =
                fdt::create_fdt(android_fstab, dump_device_tree_blob, device_tree_overlays)
                    .map_err(Error::CreateFdt)?;
            setup_data.push(SetupData {
                data: device_tree_blob,
                type_: SetupDataType::Dtb,
            });
        }

        setup_data.push(setup_data_rng_seed());

        let setup_data = write_setup_data(
            mem,
            GuestAddress(SETUP_DATA_START),
            GuestAddress(SETUP_DATA_END),
            &setup_data,
        )?;

        let initrd = match initrd_file {
            Some(mut initrd_file) => {
                // The kernel header advertises how high the initrd may be placed.
                let initrd_addr_max = if params.hdr.xloadflags & XLF_CAN_BE_LOADED_ABOVE_4G != 0 {
                    u64::MAX
                } else if params.hdr.initrd_addr_max == 0 {
                    // Default initrd_addr_max for old kernels (see Documentation/x86/boot.txt).
                    0x37FFFFFF
                } else {
                    u64::from(params.hdr.initrd_addr_max)
                };

                let (initrd_start, initrd_size) = arch::load_image_high(
                    mem,
                    &mut initrd_file,
                    GuestAddress(kernel_end),
                    GuestAddress(initrd_addr_max),
                    // Never place the initrd inside the pVM firmware region.
                    Some(|region| {
                        region.options.purpose != MemoryRegionPurpose::ProtectedFirmwareRegion
                    }),
                    base::pagesize() as u64,
                )
                .map_err(Error::LoadInitrd)?;
                Some((initrd_start, initrd_size))
            }
            None => None,
        };

        // Fill in the boot_params "zero page" with everything computed above.
        configure_system(
            mem,
            GuestAddress(CMDLINE_OFFSET),
            setup_data,
            initrd,
            params,
            &e820_entries,
        )?;
        Ok(())
    }
1748
get_pcie_vcfg_mmio_range(mem: &GuestMemory, pcie_cfg_mmio: &AddressRange) -> AddressRange1749 fn get_pcie_vcfg_mmio_range(mem: &GuestMemory, pcie_cfg_mmio: &AddressRange) -> AddressRange {
1750 // Put PCIe VCFG region at a 2MB boundary after physical memory or 4gb, whichever is
1751 // greater.
1752 let ram_end_round_2mb = (mem.end_addr().offset() + 2 * MB - 1) / (2 * MB) * (2 * MB);
1753 let start = std::cmp::max(ram_end_round_2mb, 4 * GB);
1754 // Each pci device's ECAM size is 4kb and its vcfg size is 8kb
1755 let end = start + pcie_cfg_mmio.len().unwrap() * 2 - 1;
1756 AddressRange { start, end }
1757 }
1758
1759 /// Returns the high mmio range
get_high_mmio_range<V: Vm>(vm: &V, arch_memory_layout: &ArchMemoryLayout) -> AddressRange1760 fn get_high_mmio_range<V: Vm>(vm: &V, arch_memory_layout: &ArchMemoryLayout) -> AddressRange {
1761 let mem = vm.get_memory();
1762 let start = Self::get_pcie_vcfg_mmio_range(mem, &arch_memory_layout.pcie_cfg_mmio).end + 1;
1763
1764 let phys_mem_end = (1u64 << vm.get_guest_phys_addr_bits()) - 1;
1765 let high_mmio_end = std::cmp::min(phys_mem_end, HIGH_MMIO_MAX_END);
1766
1767 AddressRange {
1768 start,
1769 end: high_mmio_end,
1770 }
1771 }
1772
1773 /// This returns a minimal kernel command for this architecture
get_base_linux_cmdline() -> kernel_cmdline::Cmdline1774 pub fn get_base_linux_cmdline() -> kernel_cmdline::Cmdline {
1775 let mut cmdline = kernel_cmdline::Cmdline::new();
1776 cmdline.insert_str("panic=-1").unwrap();
1777
1778 cmdline
1779 }
1780
1781 /// Sets up fw_cfg device.
1782 /// # Arguments
1783 ///
1784 /// * `io_bus` - the IO bus object
1785 /// * `fw_cfg_parameters` - command-line specified data to add to device. May contain all None
1786 /// fields if user did not specify data to add to the device
    fn setup_fw_cfg_device(
        io_bus: &Bus,
        fw_cfg_parameters: Vec<FwCfgParameters>,
        bootorder_fw_cfg_blob: Vec<u8>,
        fw_cfg_jail: Option<Minijail>,
        #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
    ) -> Result<()> {
        let fw_cfg = match devices::FwCfgDevice::new(FW_CFG_MAX_FILE_SLOTS, fw_cfg_parameters) {
            Ok(mut device) => {
                // this condition will only be true if the user specified at least one bootindex
                // option on the command line. If none were specified, bootorder_fw_cfg_blob will
                // only have a null byte (null terminator)
                if bootorder_fw_cfg_blob.len() > 1 {
                    // Add boot order file to the device. If the file is not present, firmware may
                    // not be able to boot.
                    if let Err(err) = device.add_file(
                        "bootorder",
                        bootorder_fw_cfg_blob,
                        devices::FwCfgItemType::GenericItem,
                    ) {
                        return Err(Error::CreateFwCfgDevice(err));
                    }
                }
                device
            }
            Err(err) => {
                return Err(Error::CreateFwCfgDevice(err));
            }
        };

        // Wrap the device in a sandboxed proxy process when a jail was provided.
        let fw_cfg: Arc<Mutex<dyn BusDevice>> = match fw_cfg_jail.as_ref() {
            #[cfg(any(target_os = "android", target_os = "linux"))]
            Some(jail) => {
                let jail_clone = jail.try_clone().map_err(Error::CloneJail)?;
                #[cfg(feature = "seccomp_trace")]
                debug!(
                    "seccomp_trace {{\"event\": \"minijail_clone\", \"src_jail_addr\": \"0x{:x}\", \"dst_jail_addr\": \"0x{:x}\"}}",
                    read_jail_addr(jail),
                    read_jail_addr(&jail_clone)
                );
                Arc::new(Mutex::new(
                    ProxyDevice::new(
                        fw_cfg,
                        jail_clone,
                        Vec::new(),
                        #[cfg(feature = "swap")]
                        swap_controller,
                    )
                    .map_err(Error::CreateProxyDevice)?,
                ))
            }
            // Jailing is not supported on Windows.
            #[cfg(windows)]
            Some(_) => unreachable!(),
            None => Arc::new(Mutex::new(fw_cfg)),
        };

        io_bus
            .insert(fw_cfg, FW_CFG_BASE_PORT, FW_CFG_WIDTH)
            .map_err(Error::InsertBus)?;

        Ok(())
    }
1849
1850 /// Sets up the legacy x86 i8042/KBD platform device
1851 ///
1852 /// # Arguments
1853 ///
1854 /// * - `io_bus` - the IO bus object
1855 /// * - `pit_uses_speaker_port` - does the PIT use port 0x61 for the PC speaker
1856 /// * - `vm_evt_wrtube` - the event object which should receive exit events
setup_legacy_i8042_device( io_bus: &Bus, pit_uses_speaker_port: bool, vm_evt_wrtube: SendTube, ) -> Result<()>1857 pub fn setup_legacy_i8042_device(
1858 io_bus: &Bus,
1859 pit_uses_speaker_port: bool,
1860 vm_evt_wrtube: SendTube,
1861 ) -> Result<()> {
1862 let i8042 = Arc::new(Mutex::new(devices::I8042Device::new(
1863 vm_evt_wrtube.try_clone().map_err(Error::CloneTube)?,
1864 )));
1865
1866 if pit_uses_speaker_port {
1867 io_bus.insert(i8042, 0x062, 0x3).unwrap();
1868 } else {
1869 io_bus.insert(i8042, 0x061, 0x4).unwrap();
1870 }
1871
1872 Ok(())
1873 }
1874
1875 /// Sets up the legacy x86 CMOS/RTC platform device
1876 /// # Arguments
1877 ///
1878 /// * - `io_bus` - the IO bus object
1879 /// * - `mem_size` - the size in bytes of physical ram for the guest
setup_legacy_cmos_device( arch_memory_layout: &ArchMemoryLayout, io_bus: &Bus, irq_chip: &mut dyn IrqChipX86_64, vm_control: Tube, mem_size: u64, has_protected_vm_firmware: bool, ) -> anyhow::Result<()>1880 pub fn setup_legacy_cmos_device(
1881 arch_memory_layout: &ArchMemoryLayout,
1882 io_bus: &Bus,
1883 irq_chip: &mut dyn IrqChipX86_64,
1884 vm_control: Tube,
1885 mem_size: u64,
1886 has_protected_vm_firmware: bool,
1887 ) -> anyhow::Result<()> {
1888 let mem_regions = arch_memory_regions(
1889 arch_memory_layout,
1890 mem_size,
1891 None,
1892 has_protected_vm_firmware,
1893 );
1894
1895 let mem_below_4g = mem_regions
1896 .iter()
1897 .filter(|r| r.0.offset() < FIRST_ADDR_PAST_32BITS)
1898 .map(|r| r.1)
1899 .sum();
1900
1901 let mem_above_4g = mem_regions
1902 .iter()
1903 .filter(|r| r.0.offset() >= FIRST_ADDR_PAST_32BITS)
1904 .map(|r| r.1)
1905 .sum();
1906
1907 let irq_evt = devices::IrqEdgeEvent::new().context("cmos irq")?;
1908 let cmos = devices::cmos::Cmos::new(
1909 mem_below_4g,
1910 mem_above_4g,
1911 Utc::now,
1912 vm_control,
1913 irq_evt.try_clone().context("cmos irq clone")?,
1914 )
1915 .context("create cmos")?;
1916
1917 irq_chip
1918 .register_edge_irq_event(
1919 devices::cmos::RTC_IRQ as u32,
1920 &irq_evt,
1921 IrqEventSource::from_device(&cmos),
1922 )
1923 .context("cmos register irq")?;
1924 io_bus
1925 .insert(Arc::new(Mutex::new(cmos)), 0x70, 0x2)
1926 .context("cmos insert irq")?;
1927
1928 Ok(())
1929 }
1930
/// Sets up the acpi devices for this platform and
/// return the resources which is used to set the ACPI tables.
///
/// # Arguments
///
/// * `io_bus` the I/O bus to add the devices to
/// * `resources` the SystemAllocator to allocate IO and MMIO for acpi devices.
/// * `suspend_tube` the tube object which used to suspend/resume the VM.
/// * `sdts` ACPI system description tables
/// * `irq_chip` the IrqChip object for registering irq events
/// * `battery` indicate whether to create the battery
/// * `mmio_bus` the MMIO bus to add the devices to
/// * `pci_irqs` IRQ assignment of PCI devices. Tuples of (PCI address, gsi, PCI interrupt pin).
///   Note that this matches one of the return values of generate_pci_root.
pub fn setup_acpi_devices(
    arch_memory_layout: &ArchMemoryLayout,
    pci_root: Arc<Mutex<PciRoot>>,
    mem: &GuestMemory,
    io_bus: &Bus,
    resources: &mut SystemAllocator,
    suspend_tube: Arc<Mutex<SendTube>>,
    vm_evt_wrtube: SendTube,
    sdts: Vec<SDT>,
    irq_chip: &mut dyn IrqChip,
    sci_irq: u32,
    battery: (Option<BatteryType>, Option<Minijail>),
    #[cfg_attr(windows, allow(unused_variables))] mmio_bus: &Bus,
    max_bus: u8,
    resume_notify_devices: &mut Vec<Arc<Mutex<dyn BusResumeDevice>>>,
    #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
    #[cfg(any(target_os = "android", target_os = "linux"))] ac_adapter: bool,
    guest_suspended_cvar: Option<Arc<(Mutex<bool>, Condvar)>>,
    pci_irqs: &[(PciAddress, u32, PciInterruptPin)],
) -> Result<(acpi::AcpiDevResource, Option<BatControl>)> {
    // The AML data for the acpi devices
    let mut amls = Vec::new();

    // Optionally create the battery device. Goldfish batteries are only
    // supported on Linux/Android hosts; on Windows the match arm is unreachable
    // in practice and yields no battery control.
    let bat_control = if let Some(battery_type) = battery.0 {
        match battery_type {
            #[cfg(any(target_os = "android", target_os = "linux"))]
            BatteryType::Goldfish => {
                let irq_num = resources.allocate_irq().ok_or(Error::CreateBatDevices(
                    arch::DeviceRegistrationError::AllocateIrq,
                ))?;
                let (control_tube, _mmio_base) = arch::sys::linux::add_goldfish_battery(
                    &mut amls,
                    battery.1,
                    mmio_bus,
                    irq_chip,
                    irq_num,
                    resources,
                    #[cfg(feature = "swap")]
                    swap_controller,
                )
                .map_err(Error::CreateBatDevices)?;
                Some(BatControl {
                    type_: BatteryType::Goldfish,
                    control_tube,
                })
            }
            #[cfg(windows)]
            _ => None,
        }
    } else {
        None
    };

    // Allocate I/O ports for the ACPI PM register block; fall back to the fixed
    // 0x600 base when no I/O allocator is available.
    let pm_alloc = resources.get_anon_alloc();
    let pm_iobase = match resources.io_allocator() {
        Some(io) => io
            .allocate_with_align(
                devices::acpi::ACPIPM_RESOURCE_LEN as u64,
                pm_alloc,
                "ACPIPM".to_string(),
                4, // must be 32-bit aligned
            )
            .map_err(Error::AllocateIOResouce)?,
        None => 0x600,
    };

    // Publish the PCIe virtual config MMIO base to the guest AML as "VCFG".
    let pcie_vcfg = aml::Name::new(
        "VCFG".into(),
        &Self::get_pcie_vcfg_mmio_range(mem, &arch_memory_layout.pcie_cfg_mmio).start,
    );
    pcie_vcfg.to_aml_bytes(&mut amls);

    let pm_sci_evt = devices::IrqLevelEvent::new().map_err(Error::CreateEvent)?;

    // Optional virtual AC adapter device (Linux/Android only): allocated its
    // own GPE and MMIO window, then described in AML.
    #[cfg(any(target_os = "android", target_os = "linux"))]
    let acdc = if ac_adapter {
        // Allocate GPE for AC adapter notification
        let gpe = resources.allocate_gpe().ok_or(Error::AllocateGpe)?;

        let alloc = resources.get_anon_alloc();
        let mmio_base = resources
            .allocate_mmio(
                devices::ac_adapter::ACDC_VIRT_MMIO_SIZE,
                alloc,
                "AcAdapter".to_string(),
                resources::AllocOptions::new().align(devices::ac_adapter::ACDC_VIRT_MMIO_SIZE),
            )
            .unwrap();
        let ac_adapter_dev = devices::ac_adapter::AcAdapter::new(mmio_base, gpe);
        let ac_dev = Arc::new(Mutex::new(ac_adapter_dev));
        mmio_bus
            .insert(
                ac_dev.clone(),
                mmio_base,
                devices::ac_adapter::ACDC_VIRT_MMIO_SIZE,
            )
            .unwrap();

        ac_dev.lock().to_aml_bytes(&mut amls);
        Some(ac_dev)
    } else {
        None
    };
    #[cfg(windows)]
    let acdc = None;

    // Virtual PMC: only created when a guest-suspended condvar was provided.
    if let Some(guest_suspended_cvar) = guest_suspended_cvar {
        let alloc = resources.get_anon_alloc();
        let mmio_base = resources
            .allocate_mmio(
                devices::pmc_virt::VPMC_VIRT_MMIO_SIZE,
                alloc,
                "VirtualPmc".to_string(),
                resources::AllocOptions::new().align(devices::pmc_virt::VPMC_VIRT_MMIO_SIZE),
            )
            .unwrap();

        let pmc_virtio_mmio =
            Arc::new(Mutex::new(VirtualPmc::new(mmio_base, guest_suspended_cvar)));
        mmio_bus
            .insert(
                pmc_virtio_mmio.clone(),
                mmio_base,
                devices::pmc_virt::VPMC_VIRT_MMIO_SIZE,
            )
            .unwrap();
        pmc_virtio_mmio.lock().to_aml_bytes(&mut amls);
    }

    // The PM resource owns the SCI event and the suspend/exit tubes; its SCI is
    // registered as a level-triggered interrupt before the worker starts.
    let mut pmresource = devices::ACPIPMResource::new(
        pm_sci_evt.try_clone().map_err(Error::CloneEvent)?,
        suspend_tube,
        vm_evt_wrtube,
        acdc,
    );
    pmresource.to_aml_bytes(&mut amls);
    irq_chip
        .register_level_irq_event(
            sci_irq,
            &pm_sci_evt,
            IrqEventSource::from_device(&pmresource),
        )
        .map_err(Error::RegisterIrqfd)?;
    pmresource.start();

    // Build the root bridge _CRS: the bus number range, the PCI config I/O
    // ports (0xcf8), and one memory descriptor per MMIO pool. Ranges that fit
    // in 32 bits use 32-bit descriptors; otherwise 64-bit ones.
    let mut crs_entries: Vec<Box<dyn Aml>> = vec![
        Box::new(aml::AddressSpace::new_bus_number(0x0u16, max_bus as u16)),
        Box::new(aml::IO::new(0xcf8, 0xcf8, 1, 0x8)),
    ];
    for r in resources.mmio_pools() {
        let entry: Box<dyn Aml> = match (u32::try_from(r.start), u32::try_from(r.end)) {
            (Ok(start), Ok(end)) => Box::new(aml::AddressSpace::new_memory(
                aml::AddressSpaceCachable::NotCacheable,
                true,
                start,
                end,
            )),
            _ => Box::new(aml::AddressSpace::new_memory(
                aml::AddressSpaceCachable::NotCacheable,
                true,
                r.start,
                r.end,
            )),
        };
        crs_entries.push(entry);
    }

    // _PRT: one routing package per PCI device of (address, pin mask, 0, gsi).
    let prt_entries: Vec<aml::Package> = pci_irqs
        .iter()
        .map(|(pci_address, gsi, pci_intr_pin)| {
            aml::Package::new(vec![
                &pci_address.acpi_adr(),
                &pci_intr_pin.to_mask(),
                &aml::ZERO,
                gsi,
            ])
        })
        .collect();

    // Emit the PCI root bridge device (_SB_.PC00) with its _CRS and _PRT.
    aml::Device::new(
        "_SB_.PC00".into(),
        vec![
            &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A08")),
            &aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A03")),
            &aml::Name::new("_ADR".into(), &aml::ZERO),
            &aml::Name::new("_SEG".into(), &aml::ZERO),
            &aml::Name::new("_UID".into(), &aml::ZERO),
            &aml::Name::new("SUPP".into(), &aml::ZERO),
            &aml::Name::new(
                "_CRS".into(),
                &aml::ResourceTemplate::new(crs_entries.iter().map(|b| b.as_ref()).collect()),
            ),
            &PciRootOSC {},
            &aml::Name::new(
                "_PRT".into(),
                &aml::Package::new(prt_entries.iter().map(|p| p as &dyn Aml).collect()),
            ),
        ],
    )
    .to_aml_bytes(&mut amls);

    // Reserve the PCIe MMCFG (ECAM) region with a motherboard resource device
    // (_SB_.MB00) so the guest OS does not hand it out to other devices. This
    // descriptor is 32-bit only, so skip it (with a warning) if the region does
    // not fit.
    if let (Some(start), Some(len)) = (
        u32::try_from(arch_memory_layout.pcie_cfg_mmio.start).ok(),
        arch_memory_layout
            .pcie_cfg_mmio
            .len()
            .and_then(|l| u32::try_from(l).ok()),
    ) {
        aml::Device::new(
            "_SB_.MB00".into(),
            vec![
                &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0C02")),
                &aml::Name::new(
                    "_CRS".into(),
                    &aml::ResourceTemplate::new(vec![&aml::Memory32Fixed::new(
                        true, start, len,
                    )]),
                ),
            ],
        )
        .to_aml_bytes(&mut amls);
    } else {
        warn!("Failed to create ACPI MMCFG region reservation");
    }

    // Give each downstream PCI device that has an ACPI path a _PRW entry
    // referencing the wakeup GPE, marking it wake-capable.
    let root_bus = pci_root.lock().get_root_bus();
    let addresses = root_bus.lock().get_downstream_devices();
    for address in addresses {
        if let Some(acpi_path) = pci_root.lock().acpi_path(&address) {
            const DEEPEST_SLEEP_STATE: u32 = 3;
            aml::Device::new(
                (*acpi_path).into(),
                vec![
                    &aml::Name::new("_ADR".into(), &address.acpi_adr()),
                    &aml::Name::new(
                        "_PRW".into(),
                        &aml::Package::new(vec![&PM_WAKEUP_GPIO, &DEEPEST_SLEEP_STATE]),
                    ),
                ],
            )
            .to_aml_bytes(&mut amls);
        }
    }

    // Insert the PM register block on the I/O bus and make sure the device is
    // notified on VM resume.
    let pm = Arc::new(Mutex::new(pmresource));
    io_bus
        .insert(
            pm.clone(),
            pm_iobase,
            devices::acpi::ACPIPM_RESOURCE_LEN as u64,
        )
        .unwrap();
    resume_notify_devices.push(pm.clone());

    Ok((
        acpi::AcpiDevResource {
            amls,
            pm_iobase,
            pm,
            sdts,
        },
        bat_control,
    ))
}
2210
2211 /// Sets up the serial devices for this platform. Returns a list of configured serial devices.
2212 ///
2213 /// # Arguments
2214 ///
2215 /// * - `irq_chip` the IrqChip object for registering irq events
2216 /// * - `io_bus` the I/O bus to add the devices to
2217 /// * - `serial_parameters` - definitions for how the serial devices should be configured
setup_serial_devices( protection_type: ProtectionType, irq_chip: &mut dyn IrqChip, io_bus: &Bus, serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>, serial_jail: Option<Minijail>, #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>, ) -> Result<Vec<SerialDeviceInfo>>2218 pub fn setup_serial_devices(
2219 protection_type: ProtectionType,
2220 irq_chip: &mut dyn IrqChip,
2221 io_bus: &Bus,
2222 serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>,
2223 serial_jail: Option<Minijail>,
2224 #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
2225 ) -> Result<Vec<SerialDeviceInfo>> {
2226 let com_evt_1_3 = devices::IrqEdgeEvent::new().map_err(Error::CreateEvent)?;
2227 let com_evt_2_4 = devices::IrqEdgeEvent::new().map_err(Error::CreateEvent)?;
2228
2229 let serial_devices = arch::add_serial_devices(
2230 protection_type,
2231 io_bus,
2232 (X86_64_SERIAL_1_3_IRQ, com_evt_1_3.get_trigger()),
2233 (X86_64_SERIAL_2_4_IRQ, com_evt_2_4.get_trigger()),
2234 serial_parameters,
2235 serial_jail,
2236 #[cfg(feature = "swap")]
2237 swap_controller,
2238 )
2239 .map_err(Error::CreateSerialDevices)?;
2240
2241 let source = IrqEventSource {
2242 device_id: Serial::device_id(),
2243 queue_id: 0,
2244 device_name: Serial::debug_label(),
2245 };
2246 irq_chip
2247 .register_edge_irq_event(X86_64_SERIAL_1_3_IRQ, &com_evt_1_3, source.clone())
2248 .map_err(Error::RegisterIrqfd)?;
2249 irq_chip
2250 .register_edge_irq_event(X86_64_SERIAL_2_4_IRQ, &com_evt_2_4, source)
2251 .map_err(Error::RegisterIrqfd)?;
2252
2253 Ok(serial_devices)
2254 }
2255
/// Sets up debugcon devices for this platform.
///
/// Scans `serial_parameters` for entries configured as `SerialHardware::Debugcon` and inserts
/// one device on the I/O bus per entry, optionally wrapped in a jailed `ProxyDevice`.
///
/// # Arguments
///
/// * - `protection_type` - passed through to serial device creation
/// * - `io_bus` - the I/O bus to add the devices to
/// * - `serial_parameters` - serial device definitions; non-debugcon entries are skipped
/// * - `debugcon_jail` - optional minijail to isolate each device process
fn setup_debugcon_devices(
    protection_type: ProtectionType,
    io_bus: &Bus,
    serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>,
    debugcon_jail: Option<Minijail>,
    #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
) -> Result<()> {
    for param in serial_parameters.values() {
        if param.hardware != SerialHardware::Debugcon {
            continue;
        }

        let mut preserved_fds = Vec::new();
        let con = param
            .create_serial_device::<Debugcon>(
                protection_type,
                // Debugcon doesn't use the interrupt event
                &Event::new().map_err(Error::CreateEvent)?,
                &mut preserved_fds,
            )
            .map_err(Error::CreateDebugconDevice)?;

        // With a jail (Linux/Android), run the device behind a ProxyDevice in
        // its own minijail process; otherwise insert it on the bus directly.
        let con: Arc<Mutex<dyn BusDevice>> = match debugcon_jail.as_ref() {
            #[cfg(any(target_os = "android", target_os = "linux"))]
            Some(jail) => {
                let jail_clone = jail.try_clone().map_err(Error::CloneJail)?;
                #[cfg(feature = "seccomp_trace")]
                debug!(
                    "seccomp_trace {{\"event\": \"minijail_clone\", \"src_jail_addr\": \"0x{:x}\", \"dst_jail_addr\": \"0x{:x}\"}}",
                    read_jail_addr(jail),
                    read_jail_addr(&jail_clone)
                );
                Arc::new(Mutex::new(
                    ProxyDevice::new(
                        con,
                        jail_clone,
                        preserved_fds,
                        #[cfg(feature = "swap")]
                        swap_controller,
                    )
                    .map_err(Error::CreateProxyDevice)?,
                ))
            }
            #[cfg(windows)]
            Some(_) => unreachable!(),
            None => Arc::new(Mutex::new(con)),
        };
        // Each debugcon occupies a single byte-wide port.
        io_bus
            .insert(con.clone(), param.debugcon_port.into(), 1)
            .map_err(Error::InsertBus)?;
    }

    Ok(())
}
2310 }
2311
/// Errors that can occur while configuring guest model-specific registers (MSRs).
#[sorted]
#[derive(Error, Debug)]
pub enum MsrError {
    /// ITMT configuration was requested on a non-Intel host CPU.
    #[error("CPU not support. Only intel CPUs support ITMT.")]
    CpuUnSupport,
    /// The same MSR index was specified more than once.
    #[error("msr must be unique: {0}")]
    MsrDuplicate(u32),
}
2320
/// Errors from probing the host for hybrid CPU architecture support.
#[derive(Error, Debug)]
pub enum HybridSupportError {
    /// The host CPU does not advertise the hybrid CPUID leaves.
    #[error("Host CPU doesn't support hybrid architecture.")]
    UnsupportedHostCpu,
}
2326
/// The wrapper for CPUID call functions.
///
/// Holding the intrinsics as function pointers lets tests substitute fake
/// CPUID implementations for the real `__cpuid`/`__cpuid_count`.
pub struct CpuIdCall {
    /// __cpuid_count or a fake function for test.
    cpuid_count: unsafe fn(u32, u32) -> CpuidResult,
    /// __cpuid or a fake function for test.
    cpuid: unsafe fn(u32) -> CpuidResult,
}
2334
2335 impl CpuIdCall {
new( cpuid_count: unsafe fn(u32, u32) -> CpuidResult, cpuid: unsafe fn(u32) -> CpuidResult, ) -> CpuIdCall2336 pub fn new(
2337 cpuid_count: unsafe fn(u32, u32) -> CpuidResult,
2338 cpuid: unsafe fn(u32) -> CpuidResult,
2339 ) -> CpuIdCall {
2340 CpuIdCall { cpuid_count, cpuid }
2341 }
2342 }
2343
2344 /// Check if host supports hybrid CPU feature. The check include:
2345 /// 1. Check if CPUID.1AH exists. CPUID.1AH is hybrid information enumeration leaf.
2346 /// 2. Check if CPUID.07H.00H:EDX[bit 15] sets. This bit means the processor is identified as a
2347 /// hybrid part.
2348 /// 3. Check if CPUID.1AH:EAX sets. The hybrid core type is set in EAX.
2349 ///
2350 /// # Arguments
2351 ///
2352 /// * - `cpuid` the wrapped cpuid functions used to get CPUID info.
check_host_hybrid_support(cpuid: &CpuIdCall) -> std::result::Result<(), HybridSupportError>2353 pub fn check_host_hybrid_support(cpuid: &CpuIdCall) -> std::result::Result<(), HybridSupportError> {
2354 // CPUID.0H.EAX returns maximum input value for basic CPUID information.
2355 //
2356 // SAFETY:
2357 // Safe because we pass 0 for this call and the host supports the
2358 // `cpuid` instruction.
2359 let mut cpuid_entry = unsafe { (cpuid.cpuid)(0x0) };
2360 if cpuid_entry.eax < 0x1A {
2361 return Err(HybridSupportError::UnsupportedHostCpu);
2362 }
2363 // SAFETY:
2364 // Safe because we pass 0x7 and 0 for this call and the host supports the
2365 // `cpuid` instruction.
2366 cpuid_entry = unsafe { (cpuid.cpuid_count)(0x7, 0) };
2367 if cpuid_entry.edx & 1 << EDX_HYBRID_CPU_SHIFT == 0 {
2368 return Err(HybridSupportError::UnsupportedHostCpu);
2369 }
2370 // From SDM, if a value entered for CPUID.EAX is less than or equal to the
2371 // maximum input value and the leaf is not supported on that processor then
2372 // 0 is returned in all the registers.
2373 // For the CPU with hybrid support, its CPUID.1AH.EAX shouldn't be zero.
2374 //
2375 // SAFETY:
2376 // Safe because we pass 0 for this call and the host supports the
2377 // `cpuid` instruction.
2378 cpuid_entry = unsafe { (cpuid.cpuid)(0x1A) };
2379 if cpuid_entry.eax == 0 {
2380 return Err(HybridSupportError::UnsupportedHostCpu);
2381 }
2382 Ok(())
2383 }
2384
#[cfg(test)]
mod tests {
    use std::mem::size_of;

    use super::*;

    const TEST_MEMORY_SIZE: u64 = 2 * GB;

    /// Builds the memory layout shared by the tests: ECAM at 3 GiB (256 MiB)
    /// and low PCI MMIO starting at 2 GiB.
    fn setup() -> ArchMemoryLayout {
        let pci_config = PciConfig {
            ecam: Some(MemoryRegionConfig {
                start: 3 * GB,
                size: Some(256 * MB),
            }),
            mem: Some(MemoryRegionConfig {
                start: 2 * GB,
                size: None,
            }),
        };
        create_arch_memory_layout(&pci_config, false).unwrap()
    }

    #[test]
    fn regions_lt_4gb_nobios() {
        let arch_memory_layout = setup();
        let regions = arch_memory_regions(
            &arch_memory_layout,
            512 * MB,
            /* bios_size */ None,
            /* has_protected_vm_firmware */ false,
        );
        assert_eq!(1, regions.len());
        assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
        assert_eq!(1u64 << 29, regions[0].1);
    }

    #[test]
    fn regions_gt_4gb_nobios() {
        let arch_memory_layout = setup();
        let size = 4 * GB + 0x8000;
        let regions = arch_memory_regions(
            &arch_memory_layout,
            size,
            /* bios_size */ None,
            /* has_protected_vm_firmware */ false,
        );
        assert_eq!(2, regions.len());
        assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
        assert_eq!(GuestAddress(4 * GB), regions[1].0);
        assert_eq!(4 * GB + 0x8000, regions[0].1 + regions[1].1);
    }

    #[test]
    fn regions_lt_4gb_bios() {
        let arch_memory_layout = setup();
        let bios_len = 1 * MB;
        let regions = arch_memory_regions(
            &arch_memory_layout,
            512 * MB,
            Some(bios_len),
            /* has_protected_vm_firmware */ false,
        );
        assert_eq!(2, regions.len());
        assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
        assert_eq!(512 * MB, regions[0].1);
        // The BIOS region is mapped so that it ends exactly at 4 GiB.
        assert_eq!(
            GuestAddress(FIRST_ADDR_PAST_32BITS - bios_len),
            regions[1].0
        );
        assert_eq!(bios_len, regions[1].1);
    }

    #[test]
    fn regions_gt_4gb_bios() {
        let arch_memory_layout = setup();
        let bios_len = 1 * MB;
        let regions = arch_memory_regions(
            &arch_memory_layout,
            4 * GB + 0x8000,
            Some(bios_len),
            /* has_protected_vm_firmware */ false,
        );
        assert_eq!(3, regions.len());
        assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
        assert_eq!(
            GuestAddress(FIRST_ADDR_PAST_32BITS - bios_len),
            regions[1].0
        );
        assert_eq!(bios_len, regions[1].1);
        assert_eq!(GuestAddress(4 * GB), regions[2].0);
    }

    #[test]
    fn regions_eq_4gb_nobios() {
        let arch_memory_layout = setup();
        // Test with exact size of 4GB - the overhead.
        let regions = arch_memory_regions(
            &arch_memory_layout,
            TEST_MEMORY_SIZE - START_OF_RAM_32BITS,
            /* bios_size */ None,
            /* has_protected_vm_firmware */ false,
        );
        assert_eq!(1, regions.len());
        assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
        assert_eq!(TEST_MEMORY_SIZE - START_OF_RAM_32BITS, regions[0].1);
    }

    #[test]
    fn regions_eq_4gb_bios() {
        let arch_memory_layout = setup();
        // Test with exact size of 4GB - the overhead.
        let bios_len = 1 * MB;
        let regions = arch_memory_regions(
            &arch_memory_layout,
            TEST_MEMORY_SIZE - START_OF_RAM_32BITS,
            Some(bios_len),
            /* has_protected_vm_firmware */ false,
        );
        assert_eq!(2, regions.len());
        assert_eq!(GuestAddress(START_OF_RAM_32BITS), regions[0].0);
        assert_eq!(TEST_MEMORY_SIZE - START_OF_RAM_32BITS, regions[0].1);
        assert_eq!(
            GuestAddress(FIRST_ADDR_PAST_32BITS - bios_len),
            regions[1].0
        );
        assert_eq!(bios_len, regions[1].1);
    }

    #[test]
    fn check_pci_mmio_layout() {
        let arch_memory_layout = setup();

        assert_eq!(arch_memory_layout.pci_mmio_before_32bit.start, 2 * GB);
        assert_eq!(arch_memory_layout.pcie_cfg_mmio.start, 3 * GB);
        assert_eq!(arch_memory_layout.pcie_cfg_mmio.len().unwrap(), 256 * MB);
    }

    #[test]
    fn check_32bit_gap_size_alignment() {
        let arch_memory_layout = setup();
        // pci_mmio_before_32bit is 256 MB aligned to be friendly for MTRR mappings.
        assert_eq!(
            arch_memory_layout.pci_mmio_before_32bit.start % (256 * MB),
            0
        );
    }

    #[test]
    fn write_setup_data_empty() {
        let mem = GuestMemory::new(&[(GuestAddress(0), 0x2_0000)]).unwrap();
        let setup_data = [];
        let setup_data_addr = write_setup_data(
            &mem,
            GuestAddress(0x1000),
            GuestAddress(0x2000),
            &setup_data,
        )
        .expect("write_setup_data");
        // No entries means no setup_data list head.
        assert_eq!(setup_data_addr, None);
    }

    #[test]
    fn write_setup_data_two_of_them() {
        let mem = GuestMemory::new(&[(GuestAddress(0), 0x2_0000)]).unwrap();

        let entry1_addr = GuestAddress(0x1000);
        let entry1_next_addr = entry1_addr;
        let entry1_len_addr = entry1_addr.checked_add(12).unwrap();
        let entry1_data_addr = entry1_addr.checked_add(16).unwrap();
        let entry1_data = [0x55u8; 13];
        let entry1_size = (size_of::<setup_data_hdr>() + entry1_data.len()) as u64;
        let entry1_align = 3;

        let entry2_addr = GuestAddress(entry1_addr.offset() + entry1_size + entry1_align);
        let entry2_next_addr = entry2_addr;
        let entry2_len_addr = entry2_addr.checked_add(12).unwrap();
        let entry2_data_addr = entry2_addr.checked_add(16).unwrap();
        let entry2_data = [0xAAu8; 9];

        let setup_data = [
            SetupData {
                data: entry1_data.to_vec(),
                type_: SetupDataType::Dtb,
            },
            SetupData {
                data: entry2_data.to_vec(),
                type_: SetupDataType::Dtb,
            },
        ];

        let setup_data_head_addr = write_setup_data(
            &mem,
            GuestAddress(0x1000),
            GuestAddress(0x2000),
            &setup_data,
        )
        .expect("write_setup_data");
        assert_eq!(setup_data_head_addr, Some(entry1_addr));

        // Entry 1's `next` field links to entry 2, and its length/data match.
        assert_eq!(
            mem.read_obj_from_addr::<u64>(entry1_next_addr).unwrap(),
            entry2_addr.offset()
        );
        assert_eq!(
            mem.read_obj_from_addr::<u32>(entry1_len_addr).unwrap(),
            entry1_data.len() as u32
        );
        assert_eq!(
            mem.read_obj_from_addr::<[u8; 13]>(entry1_data_addr)
                .unwrap(),
            entry1_data
        );

        // Entry 2 terminates the list (next == 0).
        assert_eq!(mem.read_obj_from_addr::<u64>(entry2_next_addr).unwrap(), 0);
        assert_eq!(
            mem.read_obj_from_addr::<u32>(entry2_len_addr).unwrap(),
            entry2_data.len() as u32
        );
        assert_eq!(
            mem.read_obj_from_addr::<[u8; 9]>(entry2_data_addr).unwrap(),
            entry2_data
        );
    }

    #[test]
    fn cmdline_overflow() {
        const MEM_SIZE: u64 = 0x1000;
        let gm = GuestMemory::new(&[(GuestAddress(0x0), MEM_SIZE)]).unwrap();
        let mut cmdline = kernel_cmdline::Cmdline::new();
        cmdline.insert_str("12345").unwrap();
        // Five bytes of cmdline plus the NUL terminator do not fit in the last
        // five bytes of guest memory.
        let cmdline_address = GuestAddress(MEM_SIZE - 5);
        let err =
            X8664arch::load_cmdline(&gm, cmdline_address, cmdline, CMDLINE_MAX_SIZE as usize - 1)
                .unwrap_err();
        assert!(matches!(err, Error::CommandLineOverflow));
    }

    #[test]
    fn cmdline_write_end() {
        const MEM_SIZE: u64 = 0x1000;
        let gm = GuestMemory::new(&[(GuestAddress(0x0), MEM_SIZE)]).unwrap();
        let mut cmdline = kernel_cmdline::Cmdline::new();
        cmdline.insert_str("1234").unwrap();
        let mut cmdline_address = GuestAddress(45);
        X8664arch::load_cmdline(&gm, cmdline_address, cmdline, CMDLINE_MAX_SIZE as usize - 1)
            .unwrap();
        let val: u8 = gm.read_obj_from_addr(cmdline_address).unwrap();
        assert_eq!(val, b'1');
        cmdline_address = cmdline_address.unchecked_add(1);
        let val: u8 = gm.read_obj_from_addr(cmdline_address).unwrap();
        assert_eq!(val, b'2');
        cmdline_address = cmdline_address.unchecked_add(1);
        let val: u8 = gm.read_obj_from_addr(cmdline_address).unwrap();
        assert_eq!(val, b'3');
        cmdline_address = cmdline_address.unchecked_add(1);
        let val: u8 = gm.read_obj_from_addr(cmdline_address).unwrap();
        assert_eq!(val, b'4');
        // The command line is NUL-terminated in guest memory.
        cmdline_address = cmdline_address.unchecked_add(1);
        let val: u8 = gm.read_obj_from_addr(cmdline_address).unwrap();
        assert_eq!(val, b'\0');
    }
}
2648