xref: /aosp_15_r20/external/crosvm/arch/src/lib.rs (revision bb4ee6a4ae7042d18b07a98463b9c8b875e44b39)
1 // Copyright 2018 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 //! Virtual machine architecture support code.
6 
7 pub mod android;
8 pub mod fdt;
9 pub mod pstore;
10 pub mod serial;
11 
12 pub mod sys;
13 
14 use std::collections::BTreeMap;
15 use std::error::Error as StdError;
16 use std::fs::File;
17 use std::io;
18 use std::ops::Deref;
19 use std::path::PathBuf;
20 use std::str::FromStr;
21 use std::sync::mpsc;
22 use std::sync::mpsc::SendError;
23 use std::sync::Arc;
24 
25 use acpi_tables::sdt::SDT;
26 use base::syslog;
27 use base::AsRawDescriptor;
28 use base::AsRawDescriptors;
29 use base::FileGetLen;
30 use base::FileReadWriteAtVolatile;
31 use base::RecvTube;
32 use base::SendTube;
33 use base::Tube;
34 use devices::virtio::VirtioDevice;
35 use devices::BarRange;
36 use devices::Bus;
37 use devices::BusDevice;
38 use devices::BusDeviceObj;
39 use devices::BusError;
40 use devices::BusResumeDevice;
41 use devices::FwCfgParameters;
42 use devices::GpeScope;
43 use devices::HotPlugBus;
44 use devices::IrqChip;
45 use devices::IrqEventSource;
46 use devices::PciAddress;
47 use devices::PciBus;
48 use devices::PciDevice;
49 use devices::PciDeviceError;
50 use devices::PciInterruptPin;
51 use devices::PciRoot;
52 use devices::PciRootCommand;
53 use devices::PreferredIrq;
54 #[cfg(any(target_os = "android", target_os = "linux"))]
55 use devices::ProxyDevice;
56 use devices::SerialHardware;
57 use devices::SerialParameters;
58 use devices::VirtioMmioDevice;
59 pub use fdt::apply_device_tree_overlays;
60 pub use fdt::DtbOverlay;
61 #[cfg(feature = "gdb")]
62 use gdbstub::arch::Arch;
63 use hypervisor::IoEventAddress;
64 use hypervisor::MemCacheType;
65 use hypervisor::Vm;
66 #[cfg(windows)]
67 use jail::FakeMinijailStub as Minijail;
68 #[cfg(any(target_os = "android", target_os = "linux"))]
69 use minijail::Minijail;
70 use remain::sorted;
71 use resources::SystemAllocator;
72 use resources::SystemAllocatorConfig;
73 use serde::de::Visitor;
74 use serde::Deserialize;
75 use serde::Serialize;
76 use serde_keyvalue::FromKeyValues;
77 pub use serial::add_serial_devices;
78 pub use serial::get_serial_cmdline;
79 pub use serial::set_default_serial_parameters;
80 pub use serial::GetSerialCmdlineError;
81 pub use serial::SERIAL_ADDR;
82 use sync::Condvar;
83 use sync::Mutex;
84 #[cfg(any(target_os = "android", target_os = "linux"))]
85 pub use sys::linux::PlatformBusResources;
86 use thiserror::Error;
87 use uuid::Uuid;
88 use vm_control::BatControl;
89 use vm_control::BatteryType;
90 use vm_control::PmResource;
91 use vm_memory::GuestAddress;
92 use vm_memory::GuestMemory;
93 use vm_memory::GuestMemoryError;
94 use vm_memory::MemoryRegionInformation;
95 use vm_memory::MemoryRegionOptions;
96 
97 cfg_if::cfg_if! {
98     if #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] {
99         pub use devices::IrqChipAArch64 as IrqChipArch;
100         #[cfg(feature = "gdb")]
101         pub use gdbstub_arch::aarch64::AArch64 as GdbArch;
102         pub use hypervisor::CpuConfigAArch64 as CpuConfigArch;
103         pub use hypervisor::Hypervisor as HypervisorArch;
104         pub use hypervisor::VcpuAArch64 as VcpuArch;
105         pub use hypervisor::VcpuInitAArch64 as VcpuInitArch;
106         pub use hypervisor::VmAArch64 as VmArch;
107     } else if #[cfg(target_arch = "riscv64")] {
108         pub use devices::IrqChipRiscv64 as IrqChipArch;
109         #[cfg(feature = "gdb")]
110         pub use gdbstub_arch::riscv::Riscv64 as GdbArch;
111         pub use hypervisor::CpuConfigRiscv64 as CpuConfigArch;
112         pub use hypervisor::Hypervisor as HypervisorArch;
113         pub use hypervisor::VcpuInitRiscv64 as VcpuInitArch;
114         pub use hypervisor::VcpuRiscv64 as VcpuArch;
115         pub use hypervisor::VmRiscv64 as VmArch;
116     } else if #[cfg(target_arch = "x86_64")] {
117         pub use devices::IrqChipX86_64 as IrqChipArch;
118         #[cfg(feature = "gdb")]
119         pub use gdbstub_arch::x86::X86_64_SSE as GdbArch;
120         pub use hypervisor::CpuConfigX86_64 as CpuConfigArch;
121         pub use hypervisor::HypervisorX86_64 as HypervisorArch;
122         pub use hypervisor::VcpuInitX86_64 as VcpuInitArch;
123         pub use hypervisor::VcpuX86_64 as VcpuArch;
124         pub use hypervisor::VmX86_64 as VmArch;
125     }
126 }
127 
/// Image file handed to the VM, tagged by the kind of image it holds.
///
/// NOTE(review): how each variant is loaded is decided by the architecture-specific code, which is
/// not visible in this file.
128 pub enum VmImage {
    /// Open file holding a kernel image.
129     Kernel(File),
    /// Open file holding a BIOS image.
130     Bios(File),
131 }
132 
/// Pstore configuration, made of a backing path and a size.
///
/// When (de)serialized, field names use kebab-case and unknown fields are rejected.
133 #[derive(Clone, Debug, Deserialize, Serialize, FromKeyValues, PartialEq, Eq)]
134 #[serde(deny_unknown_fields, rename_all = "kebab-case")]
135 pub struct Pstore {
    /// Path of the backing file.
136     pub path: PathBuf,
    /// Size of the pstore area; presumably in bytes - TODO confirm.
137     pub size: u32,
138 }
139 
/// Selects where the FDT is placed relative to RAM and the payload.
///
/// Variant names are (de)serialized in kebab-case (`start`, `end`, `after-payload`).
///
/// NOTE(review): "FDT" presumably refers to the flattened device tree handled by the `fdt` module
/// - confirm.
140 #[derive(Clone, Copy, Debug, Serialize, Deserialize, FromKeyValues)]
141 #[serde(deny_unknown_fields, rename_all = "kebab-case")]
142 pub enum FdtPosition {
143     /// At the start of RAM.
144     Start,
145     /// Near the end of RAM.
146     End,
147     /// After the payload, with some padding for alignment.
148     AfterPayload,
149 }
150 
/// A collection of CPU core indices.
///
/// Indices are kept exactly as supplied: the constructor neither sorts nor de-duplicates them.
#[derive(Clone, Debug, Default, PartialEq, Eq, PartialOrd, Ord)]
pub struct CpuSet(Vec<usize>);

impl CpuSet {
    /// Builds a `CpuSet` holding every index yielded by `cpus`, in iteration order.
    pub fn new<I: IntoIterator<Item = usize>>(cpus: I) -> Self {
        let mut cores = Vec::new();
        cores.extend(cpus);
        CpuSet(cores)
    }

    /// Returns a borrowing iterator over the stored core indices.
    pub fn iter(&self) -> std::slice::Iter<'_, usize> {
        self.0.as_slice().iter()
    }
}
164 
165 impl FromIterator<usize> for CpuSet {
from_iter<T>(iter: T) -> Self where T: IntoIterator<Item = usize>,166     fn from_iter<T>(iter: T) -> Self
167     where
168         T: IntoIterator<Item = usize>,
169     {
170         CpuSet::new(iter)
171     }
172 }
173 
174 /// The SVE config for Vcpus.
///
/// Only compiled for arm/aarch64 targets. Field names are (de)serialized in kebab-case and unknown
/// fields are rejected.
175 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
176 #[derive(Copy, Clone, Debug, Default, PartialEq, Eq, Deserialize, Serialize)]
177 #[serde(deny_unknown_fields, rename_all = "kebab-case")]
178 pub struct SveConfig {
179     /// Whether SVE is used. The derived `Default` leaves this `false`.
180     pub enable: bool,
181 }
182 
/// Parses one CPU specification and appends every index it covers to `cpuset`.
///
/// `s` is either a single index (`"3"`) or an inclusive range written `"low-high"` (`"2-5"`).
///
/// Returns an error message when an index is not a non-negative integer or when the range is
/// descending. `cpuset` is left untouched on error.
fn parse_cpu_range(s: &str, cpuset: &mut Vec<usize>) -> Result<(), String> {
    let parse_index = |text: &str| -> Result<usize, String> {
        text.parse::<usize>().map_err(|_| {
            format!(
                "invalid CPU index {} - index must be a non-negative integer",
                text
            )
        })
    };

    let covered = if let Some((low, high)) = s.split_once('-') {
        // Both bounds are parsed before the ordering check, so a malformed bound is reported
        // in preference to a descending range.
        let low = parse_index(low)?;
        let high = parse_index(high)?;

        if low > high {
            return Err(format!(
                "invalid CPU range {} - ranges must be from low to high",
                s
            ));
        }
        low..=high
    } else {
        let only = parse_index(s)?;
        only..=only
    };

    cpuset.extend(covered);

    Ok(())
}
216 
217 impl FromStr for CpuSet {
218     type Err = String;
219 
from_str(s: &str) -> Result<Self, Self::Err>220     fn from_str(s: &str) -> Result<Self, Self::Err> {
221         let mut cpuset = Vec::new();
222         for part in s.split(',') {
223             parse_cpu_range(part, &mut cpuset)?;
224         }
225         Ok(CpuSet::new(cpuset))
226     }
227 }
228 
229 impl Deref for CpuSet {
230     type Target = Vec<usize>;
231 
deref(&self) -> &Self::Target232     fn deref(&self) -> &Self::Target {
233         &self.0
234     }
235 }
236 
237 impl IntoIterator for CpuSet {
238     type Item = usize;
239     type IntoIter = std::vec::IntoIter<Self::Item>;
240 
into_iter(self) -> Self::IntoIter241     fn into_iter(self) -> Self::IntoIter {
242         self.0.into_iter()
243     }
244 }
245 
246 /// Deserializes a `CpuSet` from a sequence which elements can either be integers, or strings
247 /// representing CPU ranges (e.g. `5-8`).
248 impl<'de> Deserialize<'de> for CpuSet {
deserialize<D>(deserializer: D) -> Result<Self, D::Error> where D: serde::Deserializer<'de>,249     fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
250     where
251         D: serde::Deserializer<'de>,
252     {
253         struct CpuSetVisitor;
254         impl<'de> Visitor<'de> for CpuSetVisitor {
255             type Value = CpuSet;
256 
257             fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
258                 formatter.write_str("CpuSet")
259             }
260 
261             fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
262             where
263                 A: serde::de::SeqAccess<'de>,
264             {
265                 #[derive(Deserialize)]
266                 #[serde(untagged)]
267                 enum CpuSetValue<'a> {
268                     Single(usize),
269                     Range(&'a str),
270                 }
271 
272                 let mut cpus = Vec::new();
273                 while let Some(cpuset) = seq.next_element::<CpuSetValue>()? {
274                     match cpuset {
275                         CpuSetValue::Single(cpu) => cpus.push(cpu),
276                         CpuSetValue::Range(range) => {
277                             parse_cpu_range(range, &mut cpus).map_err(serde::de::Error::custom)?;
278                         }
279                     }
280                 }
281 
282                 Ok(CpuSet::new(cpus))
283             }
284         }
285 
286         deserializer.deserialize_seq(CpuSetVisitor)
287     }
288 }
289 
290 /// Serializes a `CpuSet` into a sequence of integers and strings representing CPU ranges.
291 impl Serialize for CpuSet {
serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> where S: serde::Serializer,292     fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
293     where
294         S: serde::Serializer,
295     {
296         use serde::ser::SerializeSeq;
297 
298         let mut seq = serializer.serialize_seq(None)?;
299 
300         // Factorize ranges into "a-b" strings.
301         let mut serialize_range = |start: usize, end: usize| -> Result<(), S::Error> {
302             if start == end {
303                 seq.serialize_element(&start)?;
304             } else {
305                 seq.serialize_element(&format!("{}-{}", start, end))?;
306             }
307 
308             Ok(())
309         };
310 
311         // Current range.
312         let mut range = None;
313         for core in &self.0 {
314             range = match range {
315                 None => Some((core, core)),
316                 Some((start, end)) if *end == *core - 1 => Some((start, core)),
317                 Some((start, end)) => {
318                     serialize_range(*start, *end)?;
319                     Some((core, core))
320                 }
321             };
322         }
323 
324         if let Some((start, end)) = range {
325             serialize_range(*start, *end)?;
326         }
327 
328         seq.end()
329     }
330 }
331 
332 /// Mapping of guest VCPU threads to host CPU cores.
///
/// No `#[serde(...)]` attributes are applied, so the derived (de)serialization uses serde's
/// default enum representation and the variant names as written.
333 #[derive(Clone, Debug, Deserialize, PartialEq, Eq, Serialize)]
334 pub enum VcpuAffinity {
335     /// All VCPU threads will be pinned to the same set of host CPU cores.
336     Global(CpuSet),
337     /// Each VCPU may be pinned to a set of host CPU cores.
338     /// The map key is a guest VCPU index, and the corresponding value is the set of
339     /// host CPU indices that the VCPU thread will be allowed to run on.
340     /// If a VCPU index is not present in the map, its affinity will not be set.
341     PerVcpu(BTreeMap<usize, CpuSet>),
342 }
343 
344 /// Memory region with optional size.
345 #[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize, FromKeyValues)]
346 pub struct MemoryRegionConfig {
    /// Start of the region. NOTE(review): presumably a guest physical address - confirm.
347     pub start: u64,
    /// Size of the region, or `None` when no size was given. How a missing size is resolved is
    /// up to the consumer and is not visible here.
348     pub size: Option<u64>,
349 }
350 
351 /// General PCI config.
///
/// Every region is optional; the derived `Default` leaves all of them `None`. The set of fields
/// differs per target architecture (see the `#[cfg]` attributes below).
352 #[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Serialize, Deserialize, FromKeyValues)]
353 pub struct PciConfig {
354     /// Region for the PCI Configuration Access Mechanism (CAM). Only present on arm/aarch64.
355     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
356     pub cam: Option<MemoryRegionConfig>,
357     /// Region for the PCIe Enhanced Configuration Access Mechanism (ECAM). Only present on x86_64.
358     #[cfg(target_arch = "x86_64")]
359     pub ecam: Option<MemoryRegionConfig>,
360     /// Region for non-prefetchable PCI device memory below 4G.
361     pub mem: Option<MemoryRegionConfig>,
362 }
363 
364 /// Holds the pieces needed to build a VM. Passed to `build_vm` in the `LinuxArch` trait below to
365 /// create a `RunnableLinuxVm`.
///
/// The `#[sorted]` attribute (from the `remain` crate) requires the fields to stay in alphabetical
/// order, so a new field must be inserted at its sorted position. Several fields exist only for
/// particular architectures or operating systems; see the `#[cfg]` attribute on each.
366 #[sorted]
367 pub struct VmComponents {
368     #[cfg(all(target_arch = "x86_64", unix))]
369     pub ac_adapter: bool,
370     pub acpi_sdts: Vec<SDT>,
371     pub android_fstab: Option<File>,
372     pub boot_cpu: usize,
373     pub bootorder_fw_cfg_blob: Vec<u8>,
374     #[cfg(target_arch = "x86_64")]
375     pub break_linux_pci_config_io: bool,
    // Keyed by `usize`; presumably a CPU index, as in the other per-CPU maps here - TODO confirm.
376     pub cpu_capacity: BTreeMap<usize, u32>,
377     pub cpu_clusters: Vec<CpuSet>,
378     #[cfg(all(
379         any(target_arch = "arm", target_arch = "aarch64"),
380         any(target_os = "android", target_os = "linux")
381     ))]
382     pub cpu_frequencies: BTreeMap<usize, Vec<u32>>,
383     pub delay_rt: bool,
384     pub dynamic_power_coefficient: BTreeMap<usize, u32>,
385     pub extra_kernel_params: Vec<String>,
386     #[cfg(target_arch = "x86_64")]
387     pub force_s2idle: bool,
388     pub fw_cfg_enable: bool,
389     pub fw_cfg_parameters: Vec<FwCfgParameters>,
390     pub host_cpu_topology: bool,
391     pub hugepages: bool,
392     pub hv_cfg: hypervisor::Config,
393     pub initrd_image: Option<File>,
394     pub itmt: bool,
    // Presumably expressed in bytes - TODO confirm against the callers that fill this in.
395     pub memory_size: u64,
396     pub no_i8042: bool,
397     pub no_rtc: bool,
398     pub no_smt: bool,
399     #[cfg(all(
400         any(target_arch = "arm", target_arch = "aarch64"),
401         any(target_os = "android", target_os = "linux")
402     ))]
403     pub normalized_cpu_capacities: BTreeMap<usize, u32>,
404     pub pci_config: PciConfig,
405     pub pflash_block_size: u32,
406     pub pflash_image: Option<File>,
407     pub pstore: Option<Pstore>,
408     /// A file to load as pVM firmware. Must be `Some` iff
409     /// `hv_cfg.protection_type == ProtectionType::UnprotectedWithFirmware`.
410     pub pvm_fw: Option<File>,
411     pub rt_cpus: CpuSet,
412     #[cfg(target_arch = "x86_64")]
413     pub smbios: SmbiosOptions,
414     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
415     pub sve_config: SveConfig,
416     pub swiotlb: Option<u64>,
    /// Optional mapping of VCPU threads to host cores; see `VcpuAffinity`.
417     pub vcpu_affinity: Option<VcpuAffinity>,
418     pub vcpu_count: usize,
419     #[cfg(all(
420         any(target_arch = "arm", target_arch = "aarch64"),
421         any(target_os = "android", target_os = "linux")
422     ))]
423     pub vcpu_domain_paths: BTreeMap<usize, PathBuf>,
424     #[cfg(all(
425         any(target_arch = "arm", target_arch = "aarch64"),
426         any(target_os = "android", target_os = "linux")
427     ))]
428     pub vcpu_domains: BTreeMap<usize, u32>,
429     #[cfg(all(
430         any(target_arch = "arm", target_arch = "aarch64"),
431         any(target_os = "android", target_os = "linux")
432     ))]
433     pub virt_cpufreq_v2: bool,
    /// Kernel or BIOS image file for the VM; see `VmImage`.
434     pub vm_image: VmImage,
435 }
436 
437 /// Holds the elements needed to run a Linux VM. Created by `build_vm`.
///
/// Generic over the architecture-specific VM type `V` and VCPU type `Vcpu`. The `#[sorted]`
/// attribute (from the `remain` crate) requires the fields to stay in alphabetical order.
438 #[sorted]
439 pub struct RunnableLinuxVm<V: VmArch, Vcpu: VcpuArch> {
440     pub bat_control: Option<BatControl>,
441     pub delay_rt: bool,
442     pub devices_thread: Option<std::thread::JoinHandle<()>>,
    // Keyed by `u8`; presumably a PCI bus number - TODO confirm.
443     pub hotplug_bus: BTreeMap<u8, Arc<Mutex<dyn HotPlugBus>>>,
444     pub io_bus: Arc<Bus>,
445     pub irq_chip: Box<dyn IrqChipArch>,
446     pub mmio_bus: Arc<Bus>,
447     pub no_smt: bool,
    // Keyed by `u32`; presumably a process id, going by the field name - TODO confirm.
448     pub pid_debug_label_map: BTreeMap<u32, String>,
449     #[cfg(any(target_os = "android", target_os = "linux"))]
450     pub platform_devices: Vec<Arc<Mutex<dyn BusDevice>>>,
451     pub pm: Option<Arc<Mutex<dyn PmResource + Send>>>,
452     /// Devices to be notified before the system resumes from the S3 suspended state.
453     pub resume_notify_devices: Vec<Arc<Mutex<dyn BusResumeDevice>>>,
454     pub root_config: Arc<Mutex<PciRoot>>,
455     pub rt_cpus: CpuSet,
    /// Sending half (shared behind a mutex) and receiving half of the suspend tube.
456     pub suspend_tube: (Arc<Mutex<SendTube>>, RecvTube),
457     pub vcpu_affinity: Option<VcpuAffinity>,
458     pub vcpu_count: usize,
459     pub vcpu_init: Vec<VcpuInitArch>,
460     /// If vcpus is None, then it's the responsibility of the vcpu thread to create vcpus.
461     /// If it's Some, then `build_vm` already created the vcpus.
462     pub vcpus: Option<Vec<Vcpu>>,
463     pub vm: V,
464     pub vm_request_tubes: Vec<Tube>,
465 }
466 
467 /// The device and optional jail.
468 pub struct VirtioDeviceStub {
    /// The virtio device, boxed as a trait object.
469     pub dev: Box<dyn VirtioDevice>,
    /// Jail associated with the device, if any. On Windows builds `Minijail` is an alias for
    /// `jail::FakeMinijailStub` (see the imports at the top of this file).
470     pub jail: Option<Minijail>,
471 }
472 
473 /// Trait which is implemented for each Linux Architecture in order to
474 /// set up the memory, cpus, and system devices and to boot the kernel.
475 pub trait LinuxArch {
    /// Error type returned by the fallible operations of this trait.
476     type Error: StdError;
    /// Architecture-specific memory layout description, produced by `arch_memory_layout` and
    /// passed back into the later setup steps.
477     type ArchMemoryLayout;
478 
479     /// Decide architecture specific memory layout details to be used by later stages of the VM
480     /// setup.
    ///
    /// # Arguments
    ///
    /// * `components` - Parts used to determine the memory layout.
arch_memory_layout( components: &VmComponents, ) -> std::result::Result<Self::ArchMemoryLayout, Self::Error>481     fn arch_memory_layout(
482         components: &VmComponents,
483     ) -> std::result::Result<Self::ArchMemoryLayout, Self::Error>;
484 
485     /// Returns a Vec of the valid memory regions as tuples of address, length and region
486     /// options. These should be used to configure the `GuestMemory` structure for the platform.
487     ///
488     /// # Arguments
489     ///
490     /// * `components` - Parts used to determine the memory layout.
    /// * `arch_memory_layout` - Architecture-specific layout of type `Self::ArchMemoryLayout`.
    /// * `hypervisor` - The hypervisor in use. NOTE(review): what is queried from it is
    ///   implementation-specific and not visible here.
guest_memory_layout( components: &VmComponents, arch_memory_layout: &Self::ArchMemoryLayout, hypervisor: &impl hypervisor::Hypervisor, ) -> std::result::Result<Vec<(GuestAddress, u64, MemoryRegionOptions)>, Self::Error>491     fn guest_memory_layout(
492         components: &VmComponents,
493         arch_memory_layout: &Self::ArchMemoryLayout,
494         hypervisor: &impl hypervisor::Hypervisor,
495     ) -> std::result::Result<Vec<(GuestAddress, u64, MemoryRegionOptions)>, Self::Error>;
496 
497     /// Gets the configuration for a new `SystemAllocator` that fits the given `Vm`'s memory layout.
498     ///
499     /// This is the per-architecture template for constructing the `SystemAllocator`. Platform
500     /// agnostic modifications may be made to this configuration, but the final `SystemAllocator`
501     /// will be at least as strict as this configuration.
502     ///
503     /// # Arguments
504     ///
505     /// * `vm` - The virtual machine to be used as a template for the `SystemAllocator`.
    /// * `arch_memory_layout` - Architecture-specific layout of type `Self::ArchMemoryLayout`.
get_system_allocator_config<V: Vm>( vm: &V, arch_memory_layout: &Self::ArchMemoryLayout, ) -> SystemAllocatorConfig506     fn get_system_allocator_config<V: Vm>(
507         vm: &V,
508         arch_memory_layout: &Self::ArchMemoryLayout,
509     ) -> SystemAllocatorConfig;
510 
511     /// Takes `VmComponents` and generates a `RunnableLinuxVm`.
512     ///
513     /// # Arguments
514     ///
515     /// * `components` - Parts to use to build the VM.
    /// * `arch_memory_layout` - Architecture-specific layout of type `Self::ArchMemoryLayout`.
516     /// * `vm_evt_wrtube` - Tube used by sub-devices to request that crosvm exit because guest wants
517     ///   to stop/shut down or requested reset.
518     /// * `system_allocator` - Allocator created by this trait's implementation of
519     ///   `get_system_allocator_config`.
520     /// * `serial_parameters` - Definitions for how the serial devices should be configured.
521     /// * `serial_jail` - Jail used for serial devices created here.
522     /// * `battery` - Defines what battery device will be created.
523     /// * `vm` - A VM implementation to build upon.
524     /// * `ramoops_region` - Region allocated for ramoops.
525     /// * `devices` - The devices to be built into the VM.
526     /// * `irq_chip` - The IRQ chip implementation for the VM.
    /// * `vcpu_ids` - NOTE(review): undocumented; taken by mutable reference, so presumably
    ///   filled in or adjusted by the implementation - confirm.
    /// * `dump_device_tree_blob` - Optional path. NOTE(review): presumably where the generated
    ///   device tree blob is dumped - confirm.
527     /// * `debugcon_jail` - Jail used for debugcon devices created here.
528     /// * `pflash_jail` - Jail used for pflash device created here. Only on x86_64.
529     /// * `fw_cfg_jail` - Jail used for fw_cfg device created here. Only on x86_64.
    /// * `swap_controller` - Only present when the `swap` feature is enabled.
    /// * `guest_suspended_cvar` - Optional shared `(Mutex<bool>, Condvar)` pair. NOTE(review):
    ///   presumably signals guest suspension, going by the name - confirm.
530     /// * `device_tree_overlays` - Device tree overlay binaries
    /// * `fdt_position` - Optional `FdtPosition` choice for the FDT.
    /// * `no_pmu` - NOTE(review): undocumented; presumably disables the PMU for the guest -
    ///   confirm.
build_vm<V, Vcpu>( components: VmComponents, arch_memory_layout: &Self::ArchMemoryLayout, vm_evt_wrtube: &SendTube, system_allocator: &mut SystemAllocator, serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>, serial_jail: Option<Minijail>, battery: (Option<BatteryType>, Option<Minijail>), vm: V, ramoops_region: Option<pstore::RamoopsRegion>, devices: Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>, irq_chip: &mut dyn IrqChipArch, vcpu_ids: &mut Vec<usize>, dump_device_tree_blob: Option<PathBuf>, debugcon_jail: Option<Minijail>, #[cfg(target_arch = "x86_64")] pflash_jail: Option<Minijail>, #[cfg(target_arch = "x86_64")] fw_cfg_jail: Option<Minijail>, #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>, guest_suspended_cvar: Option<Arc<(Mutex<bool>, Condvar)>>, device_tree_overlays: Vec<DtbOverlay>, fdt_position: Option<FdtPosition>, no_pmu: bool, ) -> std::result::Result<RunnableLinuxVm<V, Vcpu>, Self::Error> where V: VmArch, Vcpu: VcpuArch531     fn build_vm<V, Vcpu>(
532         components: VmComponents,
533         arch_memory_layout: &Self::ArchMemoryLayout,
534         vm_evt_wrtube: &SendTube,
535         system_allocator: &mut SystemAllocator,
536         serial_parameters: &BTreeMap<(SerialHardware, u8), SerialParameters>,
537         serial_jail: Option<Minijail>,
538         battery: (Option<BatteryType>, Option<Minijail>),
539         vm: V,
540         ramoops_region: Option<pstore::RamoopsRegion>,
541         devices: Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>,
542         irq_chip: &mut dyn IrqChipArch,
543         vcpu_ids: &mut Vec<usize>,
544         dump_device_tree_blob: Option<PathBuf>,
545         debugcon_jail: Option<Minijail>,
546         #[cfg(target_arch = "x86_64")] pflash_jail: Option<Minijail>,
547         #[cfg(target_arch = "x86_64")] fw_cfg_jail: Option<Minijail>,
548         #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
549         guest_suspended_cvar: Option<Arc<(Mutex<bool>, Condvar)>>,
550         device_tree_overlays: Vec<DtbOverlay>,
551         fdt_position: Option<FdtPosition>,
552         no_pmu: bool,
553     ) -> std::result::Result<RunnableLinuxVm<V, Vcpu>, Self::Error>
554     where
555         V: VmArch,
556         Vcpu: VcpuArch;
557 
558     /// Configures the vcpu and should be called once per vcpu from the vcpu's thread.
559     ///
560     /// # Arguments
561     ///
562     /// * `vm` - The virtual machine object.
563     /// * `hypervisor` - The `Hypervisor` that created the vcpu.
564     /// * `irq_chip` - The `IrqChip` associated with this vm.
565     /// * `vcpu` - The VCPU object to configure.
566     /// * `vcpu_init` - The data required to initialize VCPU registers and other state.
567     /// * `vcpu_id` - The id of the given `vcpu`.
568     /// * `num_cpus` - Number of virtual CPUs the guest will have.
569     /// * `cpu_config` - CPU feature configurations.
configure_vcpu<V: Vm>( vm: &V, hypervisor: &dyn HypervisorArch, irq_chip: &mut dyn IrqChipArch, vcpu: &mut dyn VcpuArch, vcpu_init: VcpuInitArch, vcpu_id: usize, num_cpus: usize, cpu_config: Option<CpuConfigArch>, ) -> Result<(), Self::Error>570     fn configure_vcpu<V: Vm>(
571         vm: &V,
572         hypervisor: &dyn HypervisorArch,
573         irq_chip: &mut dyn IrqChipArch,
574         vcpu: &mut dyn VcpuArch,
575         vcpu_init: VcpuInitArch,
576         vcpu_id: usize,
577         num_cpus: usize,
578         cpu_config: Option<CpuConfigArch>,
579     ) -> Result<(), Self::Error>;
580 
581     /// Configures and adds a PCI device to the VM, returning the `PciAddress` of the device.
    ///
    /// The `minijail` argument exists only on Android/Linux builds and `swap_controller` only
    /// when the `swap` feature is enabled.
register_pci_device<V: VmArch, Vcpu: VcpuArch>( linux: &mut RunnableLinuxVm<V, Vcpu>, device: Box<dyn PciDevice>, #[cfg(any(target_os = "android", target_os = "linux"))] minijail: Option<Minijail>, resources: &mut SystemAllocator, hp_control_tube: &mpsc::Sender<PciRootCommand>, #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>, ) -> Result<PciAddress, Self::Error>582     fn register_pci_device<V: VmArch, Vcpu: VcpuArch>(
583         linux: &mut RunnableLinuxVm<V, Vcpu>,
584         device: Box<dyn PciDevice>,
585         #[cfg(any(target_os = "android", target_os = "linux"))] minijail: Option<Minijail>,
586         resources: &mut SystemAllocator,
587         hp_control_tube: &mpsc::Sender<PciRootCommand>,
588         #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
589     ) -> Result<PciAddress, Self::Error>;
590 
591     /// Returns frequency map for each of the host's logical cores.
    ///
    /// Judging by the `_khz` suffix, frequencies are presumably in kHz - TODO confirm.
get_host_cpu_frequencies_khz() -> Result<BTreeMap<usize, Vec<u32>>, Self::Error>592     fn get_host_cpu_frequencies_khz() -> Result<BTreeMap<usize, Vec<u32>>, Self::Error>;
593 
594     /// Returns max-freq map of the host's logical cores.
get_host_cpu_max_freq_khz() -> Result<BTreeMap<usize, u32>, Self::Error>595     fn get_host_cpu_max_freq_khz() -> Result<BTreeMap<usize, u32>, Self::Error>;
596 
597     /// Returns capacity map of the host's logical cores.
get_host_cpu_capacity() -> Result<BTreeMap<usize, u32>, Self::Error>598     fn get_host_cpu_capacity() -> Result<BTreeMap<usize, u32>, Self::Error>;
599 
600     /// Returns cluster masks for each of the host's logical cores.
get_host_cpu_clusters() -> Result<Vec<CpuSet>, Self::Error>601     fn get_host_cpu_clusters() -> Result<Vec<CpuSet>, Self::Error>;
602 }
603 
/// Debugger operations on a vCPU of type `T`, expressed in terms of the register types of the
/// selected `GdbArch`.
///
/// Only compiled when the `gdb` feature is enabled.
604 #[cfg(feature = "gdb")]
605 pub trait GdbOps<T: VcpuArch> {
    /// Error type returned by every operation of this trait.
606     type Error: StdError;
607 
608     /// Reads vCPU's registers.
read_registers(vcpu: &T) -> Result<<GdbArch as Arch>::Registers, Self::Error>609     fn read_registers(vcpu: &T) -> Result<<GdbArch as Arch>::Registers, Self::Error>;
610 
611     /// Writes vCPU's registers.
write_registers(vcpu: &T, regs: &<GdbArch as Arch>::Registers) -> Result<(), Self::Error>612     fn write_registers(vcpu: &T, regs: &<GdbArch as Arch>::Registers) -> Result<(), Self::Error>;
613 
614     /// Reads `len` bytes from the guest memory, starting at `vaddr`.
read_memory( vcpu: &T, guest_mem: &GuestMemory, vaddr: GuestAddress, len: usize, ) -> Result<Vec<u8>, Self::Error>615     fn read_memory(
616         vcpu: &T,
617         guest_mem: &GuestMemory,
618         vaddr: GuestAddress,
619         len: usize,
620     ) -> Result<Vec<u8>, Self::Error>;
621 
622     /// Writes the bytes of `buf` to the guest memory, starting at `vaddr`.
write_memory( vcpu: &T, guest_mem: &GuestMemory, vaddr: GuestAddress, buf: &[u8], ) -> Result<(), Self::Error>623     fn write_memory(
624         vcpu: &T,
625         guest_mem: &GuestMemory,
626         vaddr: GuestAddress,
627         buf: &[u8],
628     ) -> Result<(), Self::Error>;
629 
630     /// Reads the bytes of the guest register identified by `reg_id`.
read_register(vcpu: &T, reg_id: <GdbArch as Arch>::RegId) -> Result<Vec<u8>, Self::Error>631     fn read_register(vcpu: &T, reg_id: <GdbArch as Arch>::RegId) -> Result<Vec<u8>, Self::Error>;
632 
633     /// Writes the bytes of `data` to the guest register identified by `reg_id`.
write_register( vcpu: &T, reg_id: <GdbArch as Arch>::RegId, data: &[u8], ) -> Result<(), Self::Error>634     fn write_register(
635         vcpu: &T,
636         reg_id: <GdbArch as Arch>::RegId,
637         data: &[u8],
638     ) -> Result<(), Self::Error>;
639 
640     /// Make the next vCPU's run single-step.
enable_singlestep(vcpu: &T) -> Result<(), Self::Error>641     fn enable_singlestep(vcpu: &T) -> Result<(), Self::Error>;
642 
643     /// Get maximum number of hardware breakpoints.
get_max_hw_breakpoints(vcpu: &T) -> Result<usize, Self::Error>644     fn get_max_hw_breakpoints(vcpu: &T) -> Result<usize, Self::Error>;
645 
646     /// Set hardware breakpoints at the given addresses.
set_hw_breakpoints(vcpu: &T, breakpoints: &[GuestAddress]) -> Result<(), Self::Error>647     fn set_hw_breakpoints(vcpu: &T, breakpoints: &[GuestAddress]) -> Result<(), Self::Error>;
648 }
649 
650 /// Errors for device manager.
///
/// The `#[sorted]` attribute (from the `remain` crate) requires the variants to stay in
/// alphabetical order. Display messages come from the `#[error(...)]` attributes.
651 #[sorted]
652 #[derive(Error, Debug)]
653 pub enum DeviceRegistrationError {
654     /// No more MMIO space available.
655     #[error("no more addresses are available")]
656     AddrsExhausted,
657     /// Could not allocate device address space for the device.
658     #[error("Allocating device addresses: {0}")]
659     AllocateDeviceAddrs(PciDeviceError),
660     /// Could not allocate IO space for the device.
661     #[error("Allocating IO addresses: {0}")]
662     AllocateIoAddrs(PciDeviceError),
663     /// Could not allocate MMIO or IO resource for the device.
664     #[error("Allocating IO resource: {0}")]
665     AllocateIoResource(resources::Error),
666     /// Could not allocate an IRQ number.
667     #[error("Allocating IRQ number")]
668     AllocateIrq,
669     /// Could not allocate IRQ resource for the device.
670     #[cfg(any(target_os = "android", target_os = "linux"))]
671     #[error("Allocating IRQ resource: {0}")]
672     AllocateIrqResource(devices::vfio::VfioError),
673     /// Broken PCI topology.
674     #[error("pci topology is broken")]
675     BrokenPciTopology,
676     /// Unable to clone a jail for the device.
677     #[cfg(any(target_os = "android", target_os = "linux"))]
678     #[error("failed to clone jail: {0}")]
679     CloneJail(minijail::Error),
680     /// Appending to kernel command line failed.
681     #[error("unable to add device to kernel command line: {0}")]
682     Cmdline(kernel_cmdline::Error),
683     /// Configure window size failed.
684     #[error("failed to configure window size: {0}")]
685     ConfigureWindowSize(PciDeviceError),
686     /// Unable to create a pipe.
687     #[error("failed to create pipe: {0}")]
688     CreatePipe(base::Error),
689     /// Unable to create a PCI root.
690     #[error("failed to create pci root: {0}")]
691     CreateRoot(anyhow::Error),
692     /// Unable to create serial device from serial parameters.
693     #[error("failed to create serial device: {0}")]
694     CreateSerialDevice(devices::SerialError),
695     /// Unable to create tube.
696     #[error("failed to create tube: {0}")]
697     CreateTube(base::TubeError),
698     /// Could not clone an event.
699     #[error("failed to clone event: {0}")]
700     EventClone(base::Error),
701     /// Could not create an event.
702     #[error("failed to create event: {0}")]
703     EventCreate(base::Error),
704     /// Failed to generate ACPI content.
705     #[error("failed to generate ACPI content")]
706     GenerateAcpi,
707     /// No more IRQs are available.
708     #[error("no more IRQs are available")]
709     IrqsExhausted,
710     /// VFIO device is missing a DT symbol.
711     #[error("cannot match VFIO device to DT node due to a missing symbol")]
712     MissingDeviceTreeSymbol,
713     /// Missing a required serial device.
714     #[error("missing required serial device {0}")]
715     MissingRequiredSerialDevice(u8),
716     /// Could not add a device to the mmio bus.
717     #[error("failed to add to mmio bus: {0}")]
718     MmioInsert(BusError),
719     /// Failed to insert device into PCI root.
720     #[error("failed to insert device into PCI root: {0}")]
721     PciRootAddDevice(PciDeviceError),
722     #[cfg(any(target_os = "android", target_os = "linux"))]
723     /// Failed to initialize proxy device for jailed device.
724     #[error("failed to create proxy device: {0}")]
725     ProxyDeviceCreation(devices::ProxyError),
726     #[cfg(any(target_os = "android", target_os = "linux"))]
727     /// Failed to register battery device.
728     #[error("failed to register battery device to VM: {0}")]
729     RegisterBattery(devices::BatteryError),
730     /// Could not register PCI device to the PCI root bus. The underlying `SendError` is kept in
    /// the variant but is not included in the display message.
731     #[error("failed to register PCI device to pci root bus")]
732     RegisterDevice(SendError<PciRootCommand>),
733     /// Could not register PCI device capabilities.
734     #[error("could not register PCI device capabilities: {0}")]
735     RegisterDeviceCapabilities(PciDeviceError),
736     /// Failed to register ioevent with VM.
737     #[error("failed to register ioevent to VM: {0}")]
738     RegisterIoevent(base::Error),
739     /// Failed to register irq event with VM.
740     #[error("failed to register irq event to VM: {0}")]
741     RegisterIrqfd(base::Error),
742     /// Could not setup VFIO platform IRQ for the device.
743     #[error("Setting up VFIO platform IRQ: {0}")]
744     SetupVfioPlatformIrq(anyhow::Error),
745 }
746 
747 /// Config a PCI device for used by this vm.
configure_pci_device<V: VmArch, Vcpu: VcpuArch>( linux: &mut RunnableLinuxVm<V, Vcpu>, mut device: Box<dyn PciDevice>, #[cfg(any(target_os = "android", target_os = "linux"))] jail: Option<Minijail>, resources: &mut SystemAllocator, hp_control_tube: &mpsc::Sender<PciRootCommand>, #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>, ) -> Result<PciAddress, DeviceRegistrationError>748 pub fn configure_pci_device<V: VmArch, Vcpu: VcpuArch>(
749     linux: &mut RunnableLinuxVm<V, Vcpu>,
750     mut device: Box<dyn PciDevice>,
751     #[cfg(any(target_os = "android", target_os = "linux"))] jail: Option<Minijail>,
752     resources: &mut SystemAllocator,
753     hp_control_tube: &mpsc::Sender<PciRootCommand>,
754     #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
755 ) -> Result<PciAddress, DeviceRegistrationError> {
756     // Allocate PCI device address before allocating BARs.
757     let pci_address = device
758         .allocate_address(resources)
759         .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
760 
761     // Allocate ranges that may need to be in the low MMIO region (MmioType::Low).
762     let mmio_ranges = device
763         .allocate_io_bars(resources)
764         .map_err(DeviceRegistrationError::AllocateIoAddrs)?;
765 
766     // Allocate device ranges that may be in low or high MMIO after low-only ranges.
767     let device_ranges = device
768         .allocate_device_bars(resources)
769         .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
770 
771     // If device is a pcie bridge, add its pci bus to pci root
772     if let Some(pci_bus) = device.get_new_pci_bus() {
773         hp_control_tube
774             .send(PciRootCommand::AddBridge(pci_bus))
775             .map_err(DeviceRegistrationError::RegisterDevice)?;
776         let bar_ranges = Vec::new();
777         device
778             .configure_bridge_window(resources, &bar_ranges)
779             .map_err(DeviceRegistrationError::ConfigureWindowSize)?;
780     }
781 
782     // Do not suggest INTx for hot-plug devices.
783     let intx_event = devices::IrqLevelEvent::new().map_err(DeviceRegistrationError::EventCreate)?;
784 
785     if let PreferredIrq::Fixed { pin, gsi } = device.preferred_irq() {
786         resources.reserve_irq(gsi);
787 
788         device.assign_irq(
789             intx_event
790                 .try_clone()
791                 .map_err(DeviceRegistrationError::EventClone)?,
792             pin,
793             gsi,
794         );
795 
796         linux
797             .irq_chip
798             .as_irq_chip_mut()
799             .register_level_irq_event(gsi, &intx_event, IrqEventSource::from_device(&device))
800             .map_err(DeviceRegistrationError::RegisterIrqfd)?;
801     }
802 
803     let mut keep_rds = device.keep_rds();
804     syslog::push_descriptors(&mut keep_rds);
805     cros_tracing::push_descriptors!(&mut keep_rds);
806     metrics::push_descriptors(&mut keep_rds);
807 
808     device
809         .register_device_capabilities()
810         .map_err(DeviceRegistrationError::RegisterDeviceCapabilities)?;
811 
812     #[cfg(any(target_os = "android", target_os = "linux"))]
813     let arced_dev: Arc<Mutex<dyn BusDevice>> = if let Some(jail) = jail {
814         let proxy = ProxyDevice::new(
815             device,
816             jail,
817             keep_rds,
818             #[cfg(feature = "swap")]
819             swap_controller,
820         )
821         .map_err(DeviceRegistrationError::ProxyDeviceCreation)?;
822         linux
823             .pid_debug_label_map
824             .insert(proxy.pid() as u32, proxy.debug_label());
825         Arc::new(Mutex::new(proxy))
826     } else {
827         device.on_sandboxed();
828         Arc::new(Mutex::new(device))
829     };
830 
831     #[cfg(windows)]
832     let arced_dev = {
833         device.on_sandboxed();
834         Arc::new(Mutex::new(device))
835     };
836 
837     #[cfg(any(target_os = "android", target_os = "linux"))]
838     hp_control_tube
839         .send(PciRootCommand::Add(pci_address, arced_dev.clone()))
840         .map_err(DeviceRegistrationError::RegisterDevice)?;
841 
842     for range in &mmio_ranges {
843         linux
844             .mmio_bus
845             .insert(arced_dev.clone(), range.addr, range.size)
846             .map_err(DeviceRegistrationError::MmioInsert)?;
847     }
848 
849     for range in &device_ranges {
850         linux
851             .mmio_bus
852             .insert(arced_dev.clone(), range.addr, range.size)
853             .map_err(DeviceRegistrationError::MmioInsert)?;
854     }
855 
856     Ok(pci_address)
857 }
858 
859 /// Creates Virtio MMIO devices for use by this Vm.
generate_virtio_mmio_bus( devices: Vec<(VirtioMmioDevice, Option<Minijail>)>, irq_chip: &mut dyn IrqChip, mmio_bus: &Bus, resources: &mut SystemAllocator, vm: &mut impl Vm, sdts: Vec<SDT>, #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>, ) -> Result<(BTreeMap<u32, String>, Vec<SDT>), DeviceRegistrationError>860 pub fn generate_virtio_mmio_bus(
861     devices: Vec<(VirtioMmioDevice, Option<Minijail>)>,
862     irq_chip: &mut dyn IrqChip,
863     mmio_bus: &Bus,
864     resources: &mut SystemAllocator,
865     vm: &mut impl Vm,
866     sdts: Vec<SDT>,
867     #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
868 ) -> Result<(BTreeMap<u32, String>, Vec<SDT>), DeviceRegistrationError> {
869     #[cfg_attr(windows, allow(unused_mut))]
870     let mut pid_labels = BTreeMap::new();
871 
872     // sdts can be updated only on x86 platforms.
873     #[cfg(target_arch = "x86_64")]
874     let mut sdts = sdts;
875     for dev_value in devices.into_iter() {
876         #[cfg(any(target_os = "android", target_os = "linux"))]
877         let (mut device, jail) = dev_value;
878         #[cfg(windows)]
879         let (mut device, _) = dev_value;
880 
881         let ranges = device
882             .allocate_regions(resources)
883             .map_err(DeviceRegistrationError::AllocateIoResource)?;
884 
885         let mut keep_rds = device.keep_rds();
886         syslog::push_descriptors(&mut keep_rds);
887         cros_tracing::push_descriptors!(&mut keep_rds);
888         metrics::push_descriptors(&mut keep_rds);
889 
890         let irq_num = resources
891             .allocate_irq()
892             .ok_or(DeviceRegistrationError::AllocateIrq)?;
893         let irq_evt = devices::IrqEdgeEvent::new().map_err(DeviceRegistrationError::EventCreate)?;
894         irq_chip
895             .register_edge_irq_event(irq_num, &irq_evt, IrqEventSource::from_device(&device))
896             .map_err(DeviceRegistrationError::RegisterIrqfd)?;
897         device.assign_irq(&irq_evt, irq_num);
898         keep_rds.extend(irq_evt.as_raw_descriptors());
899 
900         for (event, addr, datamatch) in device.ioevents() {
901             let io_addr = IoEventAddress::Mmio(addr);
902             vm.register_ioevent(event, io_addr, datamatch)
903                 .map_err(DeviceRegistrationError::RegisterIoevent)?;
904             keep_rds.push(event.as_raw_descriptor());
905         }
906 
907         #[cfg(target_arch = "x86_64")]
908         {
909             sdts = device
910                 .generate_acpi(sdts)
911                 .ok_or(DeviceRegistrationError::GenerateAcpi)?;
912         }
913 
914         #[cfg(any(target_os = "android", target_os = "linux"))]
915         let arced_dev: Arc<Mutex<dyn BusDevice>> = if let Some(jail) = jail {
916             let proxy = ProxyDevice::new(
917                 device,
918                 jail,
919                 keep_rds,
920                 #[cfg(feature = "swap")]
921                 swap_controller,
922             )
923             .map_err(DeviceRegistrationError::ProxyDeviceCreation)?;
924             pid_labels.insert(proxy.pid() as u32, proxy.debug_label());
925             Arc::new(Mutex::new(proxy))
926         } else {
927             device.on_sandboxed();
928             Arc::new(Mutex::new(device))
929         };
930 
931         #[cfg(windows)]
932         let arced_dev = {
933             device.on_sandboxed();
934             Arc::new(Mutex::new(device))
935         };
936 
937         for range in &ranges {
938             mmio_bus
939                 .insert(arced_dev.clone(), range.0, range.1)
940                 .map_err(DeviceRegistrationError::MmioInsert)?;
941         }
942     }
943     Ok((pid_labels, sdts))
944 }
945 
946 // Generate pci topology starting from parent bus
generate_pci_topology( parent_bus: Arc<Mutex<PciBus>>, resources: &mut SystemAllocator, io_ranges: &mut BTreeMap<usize, Vec<BarRange>>, device_ranges: &mut BTreeMap<usize, Vec<BarRange>>, device_addrs: &[PciAddress], devices: &mut Vec<(Box<dyn PciDevice>, Option<Minijail>)>, ) -> Result<(Vec<BarRange>, u8), DeviceRegistrationError>947 fn generate_pci_topology(
948     parent_bus: Arc<Mutex<PciBus>>,
949     resources: &mut SystemAllocator,
950     io_ranges: &mut BTreeMap<usize, Vec<BarRange>>,
951     device_ranges: &mut BTreeMap<usize, Vec<BarRange>>,
952     device_addrs: &[PciAddress],
953     devices: &mut Vec<(Box<dyn PciDevice>, Option<Minijail>)>,
954 ) -> Result<(Vec<BarRange>, u8), DeviceRegistrationError> {
955     let mut bar_ranges = Vec::new();
956     let bus_num = parent_bus.lock().get_bus_num();
957     let mut subordinate_bus = bus_num;
958     for (dev_idx, addr) in device_addrs.iter().enumerate() {
959         // Only target for devices that located on this bus
960         if addr.bus == bus_num {
961             // If this device is a pci bridge (a.k.a., it has a pci bus structure),
962             // create its topology recursively
963             if let Some(child_bus) = devices[dev_idx].0.get_new_pci_bus() {
964                 let (child_bar_ranges, child_sub_bus) = generate_pci_topology(
965                     child_bus.clone(),
966                     resources,
967                     io_ranges,
968                     device_ranges,
969                     device_addrs,
970                     devices,
971                 )?;
972                 let device = &mut devices[dev_idx].0;
973                 parent_bus
974                     .lock()
975                     .add_child_bus(child_bus.clone())
976                     .map_err(|_| DeviceRegistrationError::BrokenPciTopology)?;
977                 let bridge_window = device
978                     .configure_bridge_window(resources, &child_bar_ranges)
979                     .map_err(DeviceRegistrationError::ConfigureWindowSize)?;
980                 bar_ranges.extend(bridge_window);
981 
982                 let ranges = device
983                     .allocate_io_bars(resources)
984                     .map_err(DeviceRegistrationError::AllocateIoAddrs)?;
985                 io_ranges.insert(dev_idx, ranges.clone());
986                 bar_ranges.extend(ranges);
987 
988                 let ranges = device
989                     .allocate_device_bars(resources)
990                     .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
991                 device_ranges.insert(dev_idx, ranges.clone());
992                 bar_ranges.extend(ranges);
993 
994                 device.set_subordinate_bus(child_sub_bus);
995 
996                 subordinate_bus = std::cmp::max(subordinate_bus, child_sub_bus);
997             }
998         }
999     }
1000 
1001     for (dev_idx, addr) in device_addrs.iter().enumerate() {
1002         if addr.bus == bus_num {
1003             let device = &mut devices[dev_idx].0;
1004             // Allocate MMIO for non-bridge devices
1005             if device.get_new_pci_bus().is_none() {
1006                 let ranges = device
1007                     .allocate_io_bars(resources)
1008                     .map_err(DeviceRegistrationError::AllocateIoAddrs)?;
1009                 io_ranges.insert(dev_idx, ranges.clone());
1010                 bar_ranges.extend(ranges);
1011 
1012                 let ranges = device
1013                     .allocate_device_bars(resources)
1014                     .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
1015                 device_ranges.insert(dev_idx, ranges.clone());
1016                 bar_ranges.extend(ranges);
1017             }
1018         }
1019     }
1020     Ok((bar_ranges, subordinate_bus))
1021 }
1022 
1023 /// Ensure all PCI devices have an assigned PCI address.
assign_pci_addresses( devices: &mut [(Box<dyn BusDeviceObj>, Option<Minijail>)], resources: &mut SystemAllocator, ) -> Result<(), DeviceRegistrationError>1024 pub fn assign_pci_addresses(
1025     devices: &mut [(Box<dyn BusDeviceObj>, Option<Minijail>)],
1026     resources: &mut SystemAllocator,
1027 ) -> Result<(), DeviceRegistrationError> {
1028     // First allocate devices with a preferred address.
1029     for pci_device in devices
1030         .iter_mut()
1031         .filter_map(|(device, _jail)| device.as_pci_device_mut())
1032         .filter(|pci_device| pci_device.preferred_address().is_some())
1033     {
1034         let _ = pci_device
1035             .allocate_address(resources)
1036             .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
1037     }
1038 
1039     // Then allocate addresses for the remaining devices.
1040     for pci_device in devices
1041         .iter_mut()
1042         .filter_map(|(device, _jail)| device.as_pci_device_mut())
1043         .filter(|pci_device| pci_device.preferred_address().is_none())
1044     {
1045         let _ = pci_device
1046             .allocate_address(resources)
1047             .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
1048     }
1049 
1050     Ok(())
1051 }
1052 
/// Creates a root PCI device for use by this Vm.
///
/// The work is done in phases:
///
/// 1. Every device gets a PCI address allocated from `resources`.
/// 2. The bus topology is generated and BARs are allocated (`generate_pci_topology`).
/// 3. The `PciRoot` is created on top of `mmio_bus` and `io_bus`.
/// 4. Legacy INTx interrupts are assigned and registered with `irq_chip`.
/// 5. Each device (jailed devices first) has its capabilities registered, is optionally wrapped
///    in a `ProxyDevice`, is added to the root, and has its BAR ranges inserted into `mmio_bus`.
///
/// On success the returned tuple contains, in order:
///
/// * the `PciRoot`,
/// * one `(address, gsi, pin)` entry per device that was assigned an INTx interrupt,
/// * a map from proxy process id to the device's debug label,
/// * the non-empty ACPI method bytes returned by `generate_acpi_methods()`, keyed by PCI address
///   (only collected when `vcfg_base` is `Some`),
/// * the non-empty GPE scope AML bytes, keyed by PCI address.
pub fn generate_pci_root(
    mut devices: Vec<(Box<dyn PciDevice>, Option<Minijail>)>,
    irq_chip: &mut dyn IrqChip,
    mmio_bus: Arc<Bus>,
    mmio_base: GuestAddress,
    mmio_register_bit_num: usize,
    io_bus: Arc<Bus>,
    resources: &mut SystemAllocator,
    vm: &mut impl Vm,
    max_irqs: usize,
    vcfg_base: Option<u64>,
    #[cfg(feature = "swap")] swap_controller: &mut Option<swap::SwapController>,
) -> Result<
    (
        PciRoot,
        Vec<(PciAddress, u32, PciInterruptPin)>,
        BTreeMap<u32, String>,
        BTreeMap<PciAddress, Vec<u8>>,
        BTreeMap<PciAddress, Vec<u8>>,
    ),
    DeviceRegistrationError,
> {
    // `device_addrs[i]` is the PCI address of `devices[i]`; the index is used as the device key
    // throughout the rest of this function.
    let mut device_addrs = Vec::new();

    for (device, _jail) in devices.iter_mut() {
        let address = device
            .allocate_address(resources)
            .map_err(DeviceRegistrationError::AllocateDeviceAddrs)?;
        device_addrs.push(address);
    }

    // BAR ranges per device index, filled in by `generate_pci_topology`.
    let mut device_ranges = BTreeMap::new();
    let mut io_ranges = BTreeMap::new();
    let root_bus = Arc::new(Mutex::new(PciBus::new(0, 0, false)));

    // The BAR list and subordinate bus number returned for the root bus are not needed here.
    generate_pci_topology(
        root_bus.clone(),
        resources,
        &mut io_ranges,
        &mut device_ranges,
        &device_addrs,
        &mut devices,
    )?;

    // The root only holds weak references to the buses.
    let mut root = PciRoot::new(
        vm,
        Arc::downgrade(&mmio_bus),
        mmio_base,
        mmio_register_bit_num,
        Arc::downgrade(&io_bus),
        root_bus,
    )
    .map_err(DeviceRegistrationError::CreateRoot)?;
    #[cfg_attr(windows, allow(unused_mut))]
    let mut pid_labels = BTreeMap::new();

    // Allocate legacy INTx
    let mut pci_irqs = Vec::new();
    // Pool of IRQ numbers allocated so far for `PreferredIrq::Any` devices (at most `max_irqs`).
    let mut irqs: Vec<u32> = Vec::new();

    // Mapping of (bus, dev, pin) -> IRQ number.
    let mut dev_pin_irq = BTreeMap::new();

    for (dev_idx, (device, _jail)) in devices.iter_mut().enumerate() {
        let pci_address = device_addrs[dev_idx];

        let irq = match device.preferred_irq() {
            PreferredIrq::Fixed { pin, gsi } => {
                // The device reported a preferred IRQ, so use that rather than allocating one.
                resources.reserve_irq(gsi);
                Some((pin, gsi))
            }
            PreferredIrq::Any => {
                // The device did not provide a preferred IRQ but requested one, so allocate one.

                // Choose a pin based on the slot's function number. Function 0 must always use
                // INTA# for single-function devices per the PCI spec, and we choose to use INTA#
                // for function 0 on multifunction devices and distribute the remaining functions
                // evenly across the other pins.
                let pin = match pci_address.func % 4 {
                    0 => PciInterruptPin::IntA,
                    1 => PciInterruptPin::IntB,
                    2 => PciInterruptPin::IntC,
                    _ => PciInterruptPin::IntD,
                };

                // If an IRQ number has already been assigned for a different function with this
                // (bus, device, pin) combination, use it. Otherwise allocate a new one and insert
                // it into the map.
                let pin_key = (pci_address.bus, pci_address.dev, pin);
                let irq_num = if let Some(irq_num) = dev_pin_irq.get(&pin_key) {
                    *irq_num
                } else {
                    // If we have allocated fewer than `max_irqs` total, add a new irq to the `irqs`
                    // pool. Otherwise, share one of the existing `irqs`.
                    let irq_num = if irqs.len() < max_irqs {
                        let irq_num = resources
                            .allocate_irq()
                            .ok_or(DeviceRegistrationError::AllocateIrq)?;
                        irqs.push(irq_num);
                        irq_num
                    } else {
                        // Pick one of the existing IRQs to share, using `dev_idx` to distribute IRQ
                        // sharing evenly across devices.
                        // NOTE(review): `max_irqs == 0` would make this a remainder-by-zero
                        // panic; callers presumably always pass a non-zero value - confirm.
                        irqs[dev_idx % max_irqs]
                    };

                    dev_pin_irq.insert(pin_key, irq_num);
                    irq_num
                };
                Some((pin, irq_num))
            }
            PreferredIrq::None => {
                // The device does not want an INTx# IRQ.
                None
            }
        };

        if let Some((pin, gsi)) = irq {
            let intx_event =
                devices::IrqLevelEvent::new().map_err(DeviceRegistrationError::EventCreate)?;

            // The device receives a clone; the original event is registered with the IRQ chip.
            device.assign_irq(
                intx_event
                    .try_clone()
                    .map_err(DeviceRegistrationError::EventClone)?,
                pin,
                gsi,
            );

            irq_chip
                .register_level_irq_event(gsi, &intx_event, IrqEventSource::from_device(device))
                .map_err(DeviceRegistrationError::RegisterIrqfd)?;

            pci_irqs.push((pci_address, gsi, pin));
        }
    }

    // To prevent issues where device's on_sandbox may spawn thread before all
    // sandboxed devices are sandboxed we partition iterator to go over sandboxed
    // first. This is needed on linux platforms. On windows, this is a no-op since
    // jails are always None, even for sandboxed devices.
    let devices = {
        // `enumerate()` runs before the partition so every device keeps its original index,
        // which is the key into `device_addrs`, `io_ranges` and `device_ranges`.
        let (sandboxed, non_sandboxed): (Vec<_>, Vec<_>) = devices
            .into_iter()
            .enumerate()
            .partition(|(_, (_, jail))| jail.is_some());
        sandboxed.into_iter().chain(non_sandboxed)
    };

    // ACPI method bytes and GPE scope AML bytes, both keyed by PCI address.
    let mut amls = BTreeMap::new();
    let mut gpe_scope_amls = BTreeMap::new();
    for (dev_idx, dev_value) in devices {
        #[cfg(any(target_os = "android", target_os = "linux"))]
        let (mut device, jail) = dev_value;
        #[cfg(windows)]
        let (mut device, _) = dev_value;
        let address = device_addrs[dev_idx];

        // Descriptors passed to `ProxyDevice::new` when the device is jailed; guest memory
        // descriptors are included here.
        let mut keep_rds = device.keep_rds();
        syslog::push_descriptors(&mut keep_rds);
        cros_tracing::push_descriptors!(&mut keep_rds);
        metrics::push_descriptors(&mut keep_rds);
        keep_rds.append(&mut vm.get_memory().as_raw_descriptors());

        // A device without recorded ranges simply gets empty lists.
        let ranges = io_ranges.remove(&dev_idx).unwrap_or_default();
        let device_ranges = device_ranges.remove(&dev_idx).unwrap_or_default();
        device
            .register_device_capabilities()
            .map_err(DeviceRegistrationError::RegisterDeviceCapabilities)?;

        if let Some(vcfg_base) = vcfg_base {
            let (methods, shm) = device.generate_acpi_methods();
            if !methods.is_empty() {
                amls.insert(address, methods);
            }
            if let Some((offset, mmap)) = shm {
                // NOTE(review): the result of `add_memory_region` is deliberately discarded, so
                // a failure to map this region is silent - confirm this is intended.
                let _ = vm.add_memory_region(
                    GuestAddress(vcfg_base + offset as u64),
                    Box::new(mmap),
                    false,
                    false,
                    MemCacheType::CacheCoherent,
                );
            }
        }
        // Must happen before `device` is moved into the proxy / `Arc` below.
        let gpe_nr = device.set_gpe(resources);

        #[cfg(any(target_os = "android", target_os = "linux"))]
        let arced_dev: Arc<Mutex<dyn BusDevice>> = if let Some(jail) = jail {
            let proxy = ProxyDevice::new(
                device,
                jail,
                keep_rds,
                #[cfg(feature = "swap")]
                swap_controller,
            )
            .map_err(DeviceRegistrationError::ProxyDeviceCreation)?;
            pid_labels.insert(proxy.pid() as u32, proxy.debug_label());
            Arc::new(Mutex::new(proxy))
        } else {
            device.on_sandboxed();
            Arc::new(Mutex::new(device))
        };
        #[cfg(windows)]
        let arced_dev = {
            device.on_sandboxed();
            Arc::new(Mutex::new(device))
        };
        root.add_device(address, arced_dev.clone(), vm)
            .map_err(DeviceRegistrationError::PciRootAddDevice)?;
        // Map the I/O BAR ranges first, then the device BAR ranges.
        for range in &ranges {
            mmio_bus
                .insert(arced_dev.clone(), range.addr, range.size)
                .map_err(DeviceRegistrationError::MmioInsert)?;
        }

        for range in &device_ranges {
            mmio_bus
                .insert(arced_dev.clone(), range.addr, range.size)
                .map_err(DeviceRegistrationError::MmioInsert)?;
        }

        // GPE scope AML is only generated when the device got a GPE number and the root knows
        // an ACPI path for its address.
        if let Some(gpe_nr) = gpe_nr {
            if let Some(acpi_path) = root.acpi_path(&address) {
                let mut gpe_aml = Vec::new();

                GpeScope {}.cast_to_aml_bytes(
                    &mut gpe_aml,
                    gpe_nr,
                    format!("\\{}", acpi_path).as_str(),
                );
                if !gpe_aml.is_empty() {
                    gpe_scope_amls.insert(address, gpe_aml);
                }
            }
        }
    }

    Ok((root, pci_irqs, pid_labels, amls, gpe_scope_amls))
}
1295 
/// Errors for image loading.
///
/// Returned by `load_image` and `load_image_high`.
#[sorted]
#[derive(Error, Debug)]
pub enum LoadImageError {
    /// The requested alignment is not a power of two; carries the rejected alignment.
    #[error("Alignment not a power of two: {0}")]
    BadAlignment(u64),
    /// Querying the length of the image file failed.
    #[error("Getting image size failed: {0}")]
    GetLen(io::Error),
    /// Obtaining the destination slice in guest memory failed.
    #[error("GuestMemory get slice failed: {0}")]
    GuestMemorySlice(GuestMemoryError),
    /// The image exceeds the available space (or `usize::MAX`); carries the image size in bytes.
    #[error("Image size too large: {0}")]
    ImageSizeTooLarge(u64),
    /// No guest memory region satisfies the address, alignment and size constraints.
    #[error("No suitable memory region found")]
    NoSuitableMemoryRegion,
    /// Reading the image contents into guest memory failed.
    #[error("Reading image into memory failed: {0}")]
    ReadToMemory(io::Error),
    /// The image file has a length of zero bytes.
    #[error("Cannot load zero-sized image")]
    ZeroSizedImage,
}
1315 
1316 /// Load an image from a file into guest memory.
1317 ///
1318 /// # Arguments
1319 ///
1320 /// * `guest_mem` - The memory to be used by the guest.
1321 /// * `guest_addr` - The starting address to load the image in the guest memory.
1322 /// * `max_size` - The amount of space in bytes available in the guest memory for the image.
1323 /// * `image` - The file containing the image to be loaded.
1324 ///
1325 /// The size in bytes of the loaded image is returned.
load_image<F>( guest_mem: &GuestMemory, image: &mut F, guest_addr: GuestAddress, max_size: u64, ) -> Result<usize, LoadImageError> where F: FileReadWriteAtVolatile + FileGetLen,1326 pub fn load_image<F>(
1327     guest_mem: &GuestMemory,
1328     image: &mut F,
1329     guest_addr: GuestAddress,
1330     max_size: u64,
1331 ) -> Result<usize, LoadImageError>
1332 where
1333     F: FileReadWriteAtVolatile + FileGetLen,
1334 {
1335     let size = image.get_len().map_err(LoadImageError::GetLen)?;
1336 
1337     if size > usize::MAX as u64 || size > max_size {
1338         return Err(LoadImageError::ImageSizeTooLarge(size));
1339     }
1340 
1341     // This is safe due to the bounds check above.
1342     let size = size as usize;
1343 
1344     let guest_slice = guest_mem
1345         .get_slice_at_addr(guest_addr, size)
1346         .map_err(LoadImageError::GuestMemorySlice)?;
1347     image
1348         .read_exact_at_volatile(guest_slice, 0)
1349         .map_err(LoadImageError::ReadToMemory)?;
1350 
1351     Ok(size)
1352 }
1353 
1354 /// Load an image from a file into guest memory at the highest possible address.
1355 ///
1356 /// # Arguments
1357 ///
1358 /// * `guest_mem` - The memory to be used by the guest.
1359 /// * `image` - The file containing the image to be loaded.
1360 /// * `min_guest_addr` - The minimum address of the start of the image.
1361 /// * `max_guest_addr` - The address to load the last byte of the image.
1362 /// * `region_filter` - The optional filter function for determining if the given guest memory
1363 ///   region is suitable for loading the image into it.
1364 /// * `align` - The minimum alignment of the start address of the image in bytes (must be a power of
1365 ///   two).
1366 ///
1367 /// The guest address and size in bytes of the loaded image are returned.
load_image_high<F>( guest_mem: &GuestMemory, image: &mut F, min_guest_addr: GuestAddress, max_guest_addr: GuestAddress, region_filter: Option<fn(&MemoryRegionInformation) -> bool>, align: u64, ) -> Result<(GuestAddress, usize), LoadImageError> where F: FileReadWriteAtVolatile + FileGetLen,1368 pub fn load_image_high<F>(
1369     guest_mem: &GuestMemory,
1370     image: &mut F,
1371     min_guest_addr: GuestAddress,
1372     max_guest_addr: GuestAddress,
1373     region_filter: Option<fn(&MemoryRegionInformation) -> bool>,
1374     align: u64,
1375 ) -> Result<(GuestAddress, usize), LoadImageError>
1376 where
1377     F: FileReadWriteAtVolatile + FileGetLen,
1378 {
1379     if !align.is_power_of_two() {
1380         return Err(LoadImageError::BadAlignment(align));
1381     }
1382 
1383     let max_size = max_guest_addr.offset_from(min_guest_addr) & !(align - 1);
1384     let size = image.get_len().map_err(LoadImageError::GetLen)?;
1385 
1386     if size == 0 {
1387         return Err(LoadImageError::ZeroSizedImage);
1388     }
1389 
1390     if size > usize::MAX as u64 || size > max_size {
1391         return Err(LoadImageError::ImageSizeTooLarge(size));
1392     }
1393 
1394     // Sort the list of guest memory regions by address so we can iterate over them in reverse order
1395     // (high to low).
1396     let mut regions: Vec<_> = guest_mem
1397         .regions()
1398         .filter(region_filter.unwrap_or(|_| true))
1399         .collect();
1400     regions.sort_unstable_by(|a, b| a.guest_addr.cmp(&b.guest_addr));
1401 
1402     // Find the highest valid address inside a guest memory region that satisfies the requested
1403     // alignment and min/max address requirements while having enough space for the image.
1404     let guest_addr = regions
1405         .into_iter()
1406         .rev()
1407         .filter_map(|r| {
1408             // Highest address within this region.
1409             let rgn_max_addr = r
1410                 .guest_addr
1411                 .checked_add((r.size as u64).checked_sub(1)?)?
1412                 .min(max_guest_addr);
1413             // Lowest aligned address within this region.
1414             let rgn_start_aligned = r.guest_addr.align(align)?;
1415             // Hypothetical address of the image if loaded at the end of the region.
1416             let image_addr = rgn_max_addr.checked_sub(size - 1)? & !(align - 1);
1417 
1418             // Would the image fit within the region?
1419             if image_addr >= rgn_start_aligned {
1420                 Some(image_addr)
1421             } else {
1422                 None
1423             }
1424         })
1425         .find(|&addr| addr >= min_guest_addr)
1426         .ok_or(LoadImageError::NoSuitableMemoryRegion)?;
1427 
1428     // This is safe due to the bounds check above.
1429     let size = size as usize;
1430 
1431     let guest_slice = guest_mem
1432         .get_slice_at_addr(guest_addr, size)
1433         .map_err(LoadImageError::GuestMemorySlice)?;
1434     image
1435         .read_exact_at_volatile(guest_slice, 0)
1436         .map_err(LoadImageError::ReadToMemory)?;
1437 
1438     Ok((guest_addr, size))
1439 }
1440 
/// SMBIOS table configuration
///
/// Every field is optional; `Default` yields a configuration with nothing set. When
/// (de)serialized, field names use kebab-case (e.g. `bios-vendor`) and unknown fields are
/// rejected.
#[derive(Clone, Debug, Default, Serialize, Deserialize, FromKeyValues, PartialEq, Eq)]
#[serde(deny_unknown_fields, rename_all = "kebab-case")]
pub struct SmbiosOptions {
    /// BIOS vendor name.
    pub bios_vendor: Option<String>,

    /// BIOS version number (free-form string).
    pub bios_version: Option<String>,

    /// System manufacturer name.
    pub manufacturer: Option<String>,

    /// System product name.
    pub product_name: Option<String>,

    /// System serial number (free-form string).
    pub serial_number: Option<String>,

    /// System UUID.
    pub uuid: Option<Uuid>,

    /// Additional OEM strings to add to SMBIOS table.
    ///
    /// Falls back to the default (an empty list) when absent from the input.
    #[serde(default)]
    pub oem_strings: Vec<String>,
}
1467 
#[cfg(test)]
mod tests {
    use serde_keyvalue::from_key_values;
    use tempfile::tempfile;

    use super::*;

    /// Checks that `json` deserializes to `cpus` and that `cpus` serializes back to `json`.
    fn assert_cpuset_json_round_trip(json: &str, cpus: CpuSet) {
        let decoded: CpuSet = serde_json::from_str(json).unwrap();
        assert_eq!(decoded, cpus);
        assert_eq!(serde_json::to_string(&cpus).unwrap(), json);
    }

    #[test]
    fn parse_pstore() {
        // Both keys present: parsing succeeds and yields the given values.
        let parsed: Pstore = from_key_values("path=/some/path,size=16384").unwrap();
        let expected = Pstore {
            path: "/some/path".into(),
            size: 16384,
        };
        assert_eq!(parsed, expected);

        // Missing `size`, missing `path`, and missing both must each be rejected.
        for incomplete in ["path=/some/path", "size=16384", ""] {
            assert!(from_key_values::<Pstore>(incomplete).is_err());
        }
    }

    #[test]
    fn deserialize_cpuset_serde_kv() {
        // (key-value input, expected set): plain list, single range, mixed list and range.
        let cases = [
            ("[0,4,7]", CpuSet::new(vec![0, 4, 7])),
            ("[9-12]", CpuSet::new(vec![9, 10, 11, 12])),
            (
                "[0,4,7,9-12,15]",
                CpuSet::new(vec![0, 4, 7, 9, 10, 11, 12, 15]),
            ),
        ];

        for (input, expected) in cases {
            let parsed: CpuSet = from_key_values(input).unwrap();
            assert_eq!(parsed, expected);
        }
    }

    #[test]
    fn deserialize_serialize_cpuset_json() {
        // Individual CPUs are written as bare numbers.
        assert_cpuset_json_round_trip("[0,4,7]", CpuSet::new(vec![0, 4, 7]));

        // A contiguous run is written as a quoted "first-last" range.
        assert_cpuset_json_round_trip(r#"["9-12"]"#, CpuSet::new(vec![9, 10, 11, 12]));

        // Both forms may appear in the same list.
        assert_cpuset_json_round_trip(
            r#"[0,4,7,"9-12",15]"#,
            CpuSet::new(vec![0, 4, 7, 9, 10, 11, 12, 15]),
        );
    }

    #[test]
    fn load_image_high_max_4g() {
        const IMAGE_LEN: u64 = 1234;
        const ALIGNMENT: u64 = 0x8000;

        let guest_mem = GuestMemory::new(&[
            (GuestAddress(0x0000_0000), 0x4000_0000), // 0x00000000..0x40000000
            (GuestAddress(0x8000_0000), 0x4000_0000), // 0x80000000..0xC0000000
        ])
        .unwrap();

        let mut image_file = tempfile().unwrap();
        image_file.set_len(IMAGE_LEN).unwrap();

        let lowest_allowed = GuestAddress(0x8000);
        // The upper bound lies beyond the end of the highest guest memory region.
        let highest_allowed = GuestAddress(0xFFFF_FFFF);

        let (loaded_addr, loaded_len) = load_image_high(
            &guest_mem,
            &mut image_file,
            lowest_allowed,
            highest_allowed,
            None,
            ALIGNMENT,
        )
        .unwrap();

        // The image lands at the highest aligned address of the upper region that still leaves
        // room for all IMAGE_LEN bytes.
        assert_eq!(loaded_addr, GuestAddress(0xBFFF_8000));
        assert_eq!(loaded_addr.offset() % ALIGNMENT, 0);
        assert_eq!(loaded_len, IMAGE_LEN as usize);
    }
}
1557