xref: /aosp_15_r20/external/crosvm/src/crosvm/sys/linux.rs (revision bb4ee6a4ae7042d18b07a98463b9c8b875e44b39)
1 // Copyright 2022 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #[cfg(target_os = "android")]
6 mod android;
7 pub mod cmdline;
8 pub mod config;
9 mod device_helpers;
10 pub(crate) mod ext2;
11 #[cfg(feature = "gpu")]
12 pub(crate) mod gpu;
13 #[cfg(feature = "pci-hotplug")]
14 pub(crate) mod jail_warden;
15 #[cfg(feature = "pci-hotplug")]
16 pub(crate) mod pci_hotplug_helpers;
17 #[cfg(feature = "pci-hotplug")]
18 pub(crate) mod pci_hotplug_manager;
19 mod vcpu;
20 
21 #[cfg(all(feature = "pvclock", target_arch = "aarch64"))]
22 use std::arch::asm;
23 use std::cmp::max;
24 use std::collections::BTreeMap;
25 use std::collections::BTreeSet;
26 #[cfg(feature = "registered_events")]
27 use std::collections::HashMap;
28 #[cfg(feature = "registered_events")]
29 use std::collections::HashSet;
30 use std::convert::TryInto;
31 use std::ffi::CString;
32 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
33 use std::fs::create_dir_all;
34 use std::fs::File;
35 use std::fs::OpenOptions;
36 #[cfg(feature = "registered_events")]
37 use std::hash::Hash;
38 use std::io::stdin;
39 use std::iter;
40 use std::mem;
41 #[cfg(target_arch = "x86_64")]
42 use std::ops::RangeInclusive;
43 use std::os::unix::prelude::OpenOptionsExt;
44 use std::os::unix::process::ExitStatusExt;
45 use std::path::Path;
46 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
47 use std::path::PathBuf;
48 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
49 use std::process;
50 #[cfg(feature = "registered_events")]
51 use std::rc::Rc;
52 use std::sync::mpsc;
53 use std::sync::Arc;
54 use std::sync::Barrier;
55 use std::thread::JoinHandle;
56 
57 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
58 use aarch64::AArch64 as Arch;
59 use acpi_tables::sdt::SDT;
60 use anyhow::anyhow;
61 use anyhow::bail;
62 use anyhow::Context;
63 use anyhow::Result;
64 use arch::DtbOverlay;
65 use arch::IrqChipArch;
66 use arch::LinuxArch;
67 use arch::RunnableLinuxVm;
68 use arch::VcpuAffinity;
69 use arch::VcpuArch;
70 use arch::VirtioDeviceStub;
71 use arch::VmArch;
72 use arch::VmComponents;
73 use arch::VmImage;
74 use argh::FromArgs;
75 use base::ReadNotifier;
76 #[cfg(feature = "balloon")]
77 use base::UnixSeqpacket;
78 use base::UnixSeqpacketListener;
79 use base::UnlinkUnixSeqpacketListener;
80 use base::*;
81 use cros_async::Executor;
82 use device_helpers::*;
83 use devices::create_devices_worker_thread;
84 use devices::serial_device::SerialHardware;
85 #[cfg(all(feature = "pvclock", target_arch = "x86_64"))]
86 use devices::tsc::get_tsc_sync_mitigations;
87 use devices::vfio::VfioContainerManager;
88 #[cfg(feature = "gpu")]
89 use devices::virtio;
90 #[cfg(any(feature = "video-decoder", feature = "video-encoder"))]
91 use devices::virtio::device_constants::video::VideoDeviceType;
92 #[cfg(feature = "gpu")]
93 use devices::virtio::gpu::EventDevice;
94 #[cfg(target_arch = "x86_64")]
95 use devices::virtio::memory_mapper::MemoryMapper;
96 use devices::virtio::memory_mapper::MemoryMapperTrait;
97 use devices::virtio::vhost::user::VhostUserConnectionTrait;
98 use devices::virtio::vhost::user::VhostUserListener;
99 #[cfg(feature = "balloon")]
100 use devices::virtio::BalloonFeatures;
101 #[cfg(feature = "pci-hotplug")]
102 use devices::virtio::NetParameters;
103 #[cfg(feature = "pci-hotplug")]
104 use devices::virtio::NetParametersMode;
105 use devices::virtio::VirtioDevice;
106 use devices::virtio::VirtioDeviceType;
107 use devices::virtio::VirtioTransportType;
108 use devices::Bus;
109 use devices::BusDeviceObj;
110 use devices::BusType;
111 use devices::CoIommuDev;
112 #[cfg(feature = "usb")]
113 use devices::DeviceProvider;
114 #[cfg(target_arch = "x86_64")]
115 use devices::HotPlugBus;
116 #[cfg(target_arch = "x86_64")]
117 use devices::HotPlugKey;
118 use devices::IommuDevType;
119 use devices::IrqEventIndex;
120 use devices::IrqEventSource;
121 #[cfg(feature = "pci-hotplug")]
122 use devices::NetResourceCarrier;
123 #[cfg(target_arch = "x86_64")]
124 use devices::PciAddress;
125 #[cfg(target_arch = "x86_64")]
126 use devices::PciBridge;
127 use devices::PciDevice;
128 #[cfg(target_arch = "x86_64")]
129 use devices::PciMmioMapper;
130 #[cfg(target_arch = "x86_64")]
131 use devices::PciRoot;
132 #[cfg(target_arch = "x86_64")]
133 use devices::PciRootCommand;
134 #[cfg(target_arch = "x86_64")]
135 use devices::PcieDownstreamPort;
136 #[cfg(target_arch = "x86_64")]
137 use devices::PcieHostPort;
138 #[cfg(target_arch = "x86_64")]
139 use devices::PcieRootPort;
140 #[cfg(target_arch = "x86_64")]
141 use devices::PcieUpstreamPort;
142 use devices::PvPanicCode;
143 use devices::PvPanicPciDevice;
144 #[cfg(feature = "pci-hotplug")]
145 use devices::ResourceCarrier;
146 use devices::StubPciDevice;
147 use devices::VirtioMmioDevice;
148 use devices::VirtioPciDevice;
149 #[cfg(feature = "usb")]
150 use devices::XhciController;
151 #[cfg(feature = "gpu")]
152 use gpu::*;
153 #[cfg(target_arch = "riscv64")]
154 use hypervisor::CpuConfigRiscv64;
155 #[cfg(target_arch = "x86_64")]
156 use hypervisor::CpuConfigX86_64;
157 use hypervisor::Hypervisor;
158 use hypervisor::HypervisorCap;
159 use hypervisor::MemCacheType;
160 use hypervisor::ProtectionType;
161 use hypervisor::Vm;
162 use hypervisor::VmCap;
163 use jail::*;
164 #[cfg(feature = "pci-hotplug")]
165 use jail_warden::JailWarden;
166 #[cfg(feature = "pci-hotplug")]
167 use jail_warden::JailWardenImpl;
168 #[cfg(feature = "pci-hotplug")]
169 use jail_warden::PermissiveJailWarden;
170 use libc;
171 use metrics::MetricsController;
172 use minijail::Minijail;
173 #[cfg(feature = "pci-hotplug")]
174 use pci_hotplug_manager::PciHotPlugManager;
175 use resources::AddressRange;
176 use resources::Alloc;
177 use resources::SystemAllocator;
178 #[cfg(target_arch = "riscv64")]
179 use riscv64::Riscv64 as Arch;
180 use rutabaga_gfx::RutabagaGralloc;
181 use rutabaga_gfx::RutabagaGrallocBackendFlags;
182 use smallvec::SmallVec;
183 #[cfg(feature = "swap")]
184 use swap::SwapController;
185 use sync::Condvar;
186 use sync::Mutex;
187 use vm_control::api::VmMemoryClient;
188 use vm_control::*;
189 use vm_memory::GuestAddress;
190 use vm_memory::GuestMemory;
191 use vm_memory::MemoryPolicy;
192 use vm_memory::MemoryRegionOptions;
193 #[cfg(target_arch = "x86_64")]
194 use x86_64::X8664arch as Arch;
195 
196 use crate::crosvm::config::Config;
197 use crate::crosvm::config::Executable;
198 use crate::crosvm::config::FileBackedMappingParameters;
199 use crate::crosvm::config::HypervisorKind;
200 use crate::crosvm::config::InputDeviceOption;
201 use crate::crosvm::config::IrqChipKind;
202 use crate::crosvm::config::DEFAULT_TOUCH_DEVICE_HEIGHT;
203 use crate::crosvm::config::DEFAULT_TOUCH_DEVICE_WIDTH;
204 #[cfg(feature = "gdb")]
205 use crate::crosvm::gdb::gdb_thread;
206 #[cfg(feature = "gdb")]
207 use crate::crosvm::gdb::GdbStub;
208 #[cfg(target_arch = "x86_64")]
209 use crate::crosvm::ratelimit::Ratelimit;
210 use crate::crosvm::sys::cmdline::DevicesCommand;
211 use crate::crosvm::sys::config::SharedDir;
212 use crate::crosvm::sys::config::SharedDirKind;
213 use crate::crosvm::sys::platform::vcpu::VcpuPidTid;
214 
/// Path of the KVM hypervisor device node.
const KVM_PATH: &str = "/dev/kvm";
/// Path of the GenieZone hypervisor device node.
#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
#[cfg(feature = "geniezone")]
const GENIEZONE_PATH: &str = "/dev/gzvm";
/// Path of the Gunyah hypervisor device node.
// Changed from `static` to `const` for consistency with the sibling path
// constants above; for a `&'static str` the two only differ in address
// identity, which nothing here relies on.
#[cfg(all(any(target_arch = "arm", target_arch = "aarch64"), feature = "gunyah"))]
const GUNYAH_PATH: &str = "/dev/gunyah";
221 
/// Builds every virtio device requested by `cfg` and returns them as
/// [`VirtioDeviceStub`]s (device plus optional minijail).
///
/// One stub is created per enabled device: wayland, video decoder/encoder,
/// GPU, virtio-console serial ports, disks, SCSI, pmem and pmem-ext2, rng,
/// pvclock, vTPM proxy, virtio-input, balloon, net, sound/media, vsock,
/// vhost-scmi, vhost-user-fs, shared dirs (fs/9p), and vhost-user frontends.
///
/// * `add_control_tube` — callback through which the host-side end of each
///   device control tube is registered with the caller (main loop).
/// * `vm_evt_wrtube` — forwarded to the GPU device; unused without the `gpu`
///   feature.
/// * `balloon_inflate_tube` — optional tube handed to the balloon device.
/// * `worker_process_pids` — forwarded to the pmem-ext2 device creator, which
///   may add worker process IDs to it (see `create_pmem_ext2_device`).
/// * `registered_evt_q` — cloned for the balloon device when the
///   `registered_events` feature is enabled.
///
/// Returns the assembled stubs, or the first device-creation error.
fn create_virtio_devices(
    cfg: &Config,
    vm: &mut impl VmArch,
    resources: &mut SystemAllocator,
    add_control_tube: &mut impl FnMut(AnyControlTube),
    #[cfg_attr(not(feature = "gpu"), allow(unused_variables))] vm_evt_wrtube: &SendTube,
    #[cfg(feature = "balloon")] balloon_inflate_tube: Option<Tube>,
    worker_process_pids: &mut BTreeSet<Pid>,
    #[cfg(feature = "gpu")] render_server_fd: Option<SafeDescriptor>,
    #[cfg(feature = "gpu")] has_vfio_gfx_device: bool,
    #[cfg(feature = "registered_events")] registered_evt_q: &SendTube,
) -> DeviceResult<Vec<VirtioDeviceStub>> {
    let mut devs = Vec::new();

    // GPU-side ends of resource bridges (wayland, video devices) collected
    // here and handed to the GPU device below.
    #[cfg(any(feature = "gpu", feature = "video-decoder", feature = "video-encoder"))]
    let mut resource_bridges = Vec::<Tube>::new();

    if !cfg.wayland_socket_paths.is_empty() {
        #[cfg_attr(not(feature = "gpu"), allow(unused_mut))]
        let mut wl_resource_bridge = None::<Tube>;

        // When a GPU is configured, wire a resource bridge between the
        // wayland device and the GPU device.
        #[cfg(feature = "gpu")]
        {
            if cfg.gpu_parameters.is_some() {
                let (wl_socket, gpu_socket) = Tube::pair().context("failed to create tube")?;
                resource_bridges.push(gpu_socket);
                wl_resource_bridge = Some(wl_socket);
            }
        }

        devs.push(create_wayland_device(
            cfg.protection_type,
            &cfg.jail_config,
            &cfg.wayland_socket_paths,
            wl_resource_bridge,
        )?);
    }

    // Pair up tubes for each video decoder now; the GPU-side ends go into
    // `resource_bridges`, the device-side ends are consumed further below.
    #[cfg(feature = "video-decoder")]
    let video_dec_cfg = cfg
        .video_dec
        .iter()
        .map(|config| {
            let (video_tube, gpu_tube) =
                Tube::pair().expect("failed to create tube for video decoder");
            resource_bridges.push(gpu_tube);
            (video_tube, config.backend)
        })
        .collect::<Vec<_>>();

    // Same pairing for video encoders.
    #[cfg(feature = "video-encoder")]
    let video_enc_cfg = cfg
        .video_enc
        .iter()
        .map(|config| {
            let (video_tube, gpu_tube) =
                Tube::pair().expect("failed to create tube for video encoder");
            resource_bridges.push(gpu_tube);
            (video_tube, config.backend)
        })
        .collect::<Vec<_>>();

    #[cfg(feature = "gpu")]
    {
        if let Some(gpu_parameters) = &cfg.gpu_parameters {
            let mut event_devices = Vec::new();
            if cfg.display_window_mouse {
                // Size the synthetic multi-touch device to the first display's
                // virtual size unless an explicit MultiTouch option overrides it.
                let display_param = if gpu_parameters.display_params.is_empty() {
                    Default::default()
                } else {
                    gpu_parameters.display_params[0].clone()
                };
                let (gpu_display_w, gpu_display_h) = display_param.get_virtual_display_size();

                let (event_device_socket, virtio_dev_socket) =
                    StreamChannel::pair(BlockingMode::Nonblocking, FramingMode::Byte)
                        .context("failed to create socket")?;
                let mut multi_touch_width = gpu_display_w;
                let mut multi_touch_height = gpu_display_h;
                let mut multi_touch_name = None;
                // Only the first MultiTouch option is consulted (note the `break`).
                for input in &cfg.virtio_input {
                    if let InputDeviceOption::MultiTouch {
                        width,
                        height,
                        name,
                        ..
                    } = input
                    {
                        if let Some(width) = width {
                            multi_touch_width = *width;
                        }
                        if let Some(height) = height {
                            multi_touch_height = *height;
                        }
                        if let Some(name) = name {
                            multi_touch_name = Some(name.as_str());
                        }
                        break;
                    }
                }
                let dev = virtio::input::new_multi_touch(
                    // u32::MAX is the least likely to collide with the indices generated above for
                    // the multi_touch options, which begin at 0.
                    u32::MAX,
                    virtio_dev_socket,
                    multi_touch_width,
                    multi_touch_height,
                    multi_touch_name,
                    virtio::base_features(cfg.protection_type),
                )
                .context("failed to set up mouse device")?;
                devs.push(VirtioDeviceStub {
                    dev: Box::new(dev),
                    jail: simple_jail(&cfg.jail_config, "input_device")?,
                });
                event_devices.push(EventDevice::touchscreen(event_device_socket));
            }
            if cfg.display_window_keyboard {
                let (event_device_socket, virtio_dev_socket) =
                    StreamChannel::pair(BlockingMode::Nonblocking, FramingMode::Byte)
                        .context("failed to create socket")?;
                let dev = virtio::input::new_keyboard(
                    // u32::MAX is the least likely to collide with the indices generated above for
                    // the multi_touch options, which begin at 0.
                    u32::MAX,
                    virtio_dev_socket,
                    virtio::base_features(cfg.protection_type),
                )
                .context("failed to set up keyboard device")?;
                devs.push(VirtioDeviceStub {
                    dev: Box::new(dev),
                    jail: simple_jail(&cfg.jail_config, "input_device")?,
                });
                event_devices.push(EventDevice::keyboard(event_device_socket));
            }

            let (gpu_control_host_tube, gpu_control_device_tube) =
                Tube::pair().context("failed to create gpu tube")?;
            add_control_tube(DeviceControlTube::Gpu(gpu_control_host_tube).into());
            devs.push(create_gpu_device(
                cfg,
                vm_evt_wrtube,
                gpu_control_device_tube,
                resource_bridges,
                render_server_fd,
                has_vfio_gfx_device,
                event_devices,
            )?);
        }
    }

    // Serial parameters configured as virtio-console become virtio devices;
    // other serial hardware is handled elsewhere.
    for (_, param) in cfg
        .serial_parameters
        .iter()
        .filter(|(_k, v)| v.hardware == SerialHardware::VirtioConsole)
    {
        let dev = param.create_virtio_device_and_jail(cfg.protection_type, &cfg.jail_config)?;
        devs.push(dev);
    }

    for disk in &cfg.disks {
        let (disk_host_tube, disk_device_tube) = Tube::pair().context("failed to create tube")?;
        add_control_tube(DeviceControlTube::Disk(disk_host_tube).into());
        let disk_config = DiskConfig::new(disk, Some(disk_device_tube));
        devs.push(
            disk_config.create_virtio_device_and_jail(cfg.protection_type, &cfg.jail_config)?,
        );
    }

    // All SCSI disks share a single virtio-scsi controller device.
    if !cfg.scsis.is_empty() {
        let scsi_config = ScsiConfig(&cfg.scsis);
        devs.push(
            scsi_config.create_virtio_device_and_jail(cfg.protection_type, &cfg.jail_config)?,
        );
    }

    for (index, pmem_disk) in cfg.pmems.iter().enumerate() {
        let (pmem_host_tube, pmem_device_tube) = Tube::pair().context("failed to create tube")?;
        add_control_tube(TaggedControlTube::VmMsync(pmem_host_tube).into());
        devs.push(create_pmem_device(
            cfg.protection_type,
            &cfg.jail_config,
            vm,
            resources,
            pmem_disk,
            index,
            pmem_device_tube,
        )?);
    }

    for (index, pmem_ext2) in cfg.pmem_ext2.iter().enumerate() {
        // Prepare a `VmMemoryClient` for pmem-ext2 device to send a request for mmap() and memory
        // registration.
        let (pmem_ext2_host_tube, pmem_ext2_device_tube) =
            Tube::pair().context("failed to create tube")?;
        let vm_memory_client = VmMemoryClient::new(pmem_ext2_device_tube);
        add_control_tube(
            VmMemoryTube {
                tube: pmem_ext2_host_tube,
                expose_with_viommu: false,
            }
            .into(),
        );
        let (pmem_host_tube, pmem_device_tube) = Tube::pair().context("failed to create tube")?;
        add_control_tube(TaggedControlTube::VmMsync(pmem_host_tube).into());
        devs.push(create_pmem_ext2_device(
            cfg.protection_type,
            &cfg.jail_config,
            resources,
            pmem_ext2,
            index,
            vm_memory_client,
            pmem_device_tube,
            worker_process_pids,
        )?);
    }

    if cfg.rng {
        devs.push(create_rng_device(cfg.protection_type, &cfg.jail_config)?);
    }

    #[cfg(feature = "pvclock")]
    if cfg.pvclock {
        // pvclock gets a tube for handling suspend/resume requests from the main thread.
        let (host_suspend_tube, suspend_tube) = Tube::pair().context("failed to create tube")?;
        add_control_tube(DeviceControlTube::PvClock(host_suspend_tube).into());

        // The guest clock frequency is derived per-architecture below:
        // the host TSC frequency on x86_64, CNTFRQ_EL0 on aarch64.
        let frequency: u64;
        #[cfg(target_arch = "x86_64")]
        {
            let tsc_state = devices::tsc::tsc_state()?;
            let tsc_sync_mitigations =
                get_tsc_sync_mitigations(&tsc_state, cfg.vcpu_count.unwrap_or(1));
            if tsc_state.core_grouping.size() > 1 {
                // Host TSCs are not in sync. Log what mitigations are applied.
                warn!(
                    "Host TSCs are not in sync, applying the following mitigations: {:?}",
                    tsc_sync_mitigations
                );
            }
            frequency = tsc_state.frequency;
        }
        #[cfg(target_arch = "aarch64")]
        {
            let mut x: u64;
            // SAFETY: This instruction has no side effect apart from storing the current timestamp
            //         frequency into the specified register.
            unsafe {
                asm!("mrs {x}, cntfrq_el0",
                    x = out(reg) x,
                );
            }
            frequency = x;

            // If unset, KVM defaults to an offset that is calculated from VM boot time. Explicitly
            // set it to zero on boot. When updating the offset, we always set it to the total
            // amount of time the VM has been suspended.
            vm.set_counter_offset(0)
                .context("failed to set up pvclock")?;
        }
        let dev = create_pvclock_device(
            cfg.protection_type,
            &cfg.jail_config,
            frequency,
            suspend_tube,
        )?;
        devs.push(dev);
        info!("virtio-pvclock is enabled for this vm");
    }

    #[cfg(feature = "vtpm")]
    {
        if cfg.vtpm_proxy {
            devs.push(create_vtpm_proxy_device(
                cfg.protection_type,
                &cfg.jail_config,
            )?);
        }
    }

    // Per-class counters so that multiple input devices of the same kind get
    // distinct, sequential device indices (starting at 0 per kind).
    let mut keyboard_idx = 0;
    let mut mouse_idx = 0;
    let mut rotary_idx = 0;
    let mut switches_idx = 0;
    let mut multi_touch_idx = 0;
    let mut single_touch_idx = 0;
    let mut trackpad_idx = 0;
    let mut multi_touch_trackpad_idx = 0;
    let mut custom_idx = 0;
    for input in &cfg.virtio_input {
        let input_dev = match input {
            InputDeviceOption::Evdev { path } => {
                create_vinput_device(cfg.protection_type, &cfg.jail_config, path.as_path())?
            }
            InputDeviceOption::Keyboard { path } => {
                let dev = create_keyboard_device(
                    cfg.protection_type,
                    &cfg.jail_config,
                    path.as_path(),
                    keyboard_idx,
                )?;
                keyboard_idx += 1;
                dev
            }
            InputDeviceOption::Mouse { path } => {
                let dev = create_mouse_device(
                    cfg.protection_type,
                    &cfg.jail_config,
                    path.as_path(),
                    mouse_idx,
                )?;
                mouse_idx += 1;
                dev
            }
            InputDeviceOption::MultiTouch {
                path,
                width,
                height,
                name,
            } => {
                let mut width = *width;
                let mut height = *height;
                // Only the first multi-touch device falls back to the display
                // input size when width/height are unspecified.
                if multi_touch_idx == 0 {
                    if width.is_none() {
                        width = cfg.display_input_width;
                    }
                    if height.is_none() {
                        height = cfg.display_input_height;
                    }
                }
                let dev = create_multi_touch_device(
                    cfg.protection_type,
                    &cfg.jail_config,
                    path.as_path(),
                    width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
                    height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
                    name.as_deref(),
                    multi_touch_idx,
                )?;
                multi_touch_idx += 1;
                dev
            }
            InputDeviceOption::Rotary { path } => {
                let dev = create_rotary_device(
                    cfg.protection_type,
                    &cfg.jail_config,
                    path.as_path(),
                    rotary_idx,
                )?;
                rotary_idx += 1;
                dev
            }
            InputDeviceOption::SingleTouch {
                path,
                width,
                height,
                name,
            } => {
                let mut width = *width;
                let mut height = *height;
                // Same first-device fallback as MultiTouch above.
                if single_touch_idx == 0 {
                    if width.is_none() {
                        width = cfg.display_input_width;
                    }
                    if height.is_none() {
                        height = cfg.display_input_height;
                    }
                }
                let dev = create_single_touch_device(
                    cfg.protection_type,
                    &cfg.jail_config,
                    path.as_path(),
                    width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
                    height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
                    name.as_deref(),
                    single_touch_idx,
                )?;
                single_touch_idx += 1;
                dev
            }
            InputDeviceOption::Switches { path } => {
                let dev = create_switches_device(
                    cfg.protection_type,
                    &cfg.jail_config,
                    path.as_path(),
                    switches_idx,
                )?;
                switches_idx += 1;
                dev
            }
            InputDeviceOption::Trackpad {
                path,
                width,
                height,
                name,
            } => {
                let dev = create_trackpad_device(
                    cfg.protection_type,
                    &cfg.jail_config,
                    path.as_path(),
                    width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
                    height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
                    name.as_deref(),
                    trackpad_idx,
                )?;
                trackpad_idx += 1;
                dev
            }
            InputDeviceOption::MultiTouchTrackpad {
                path,
                width,
                height,
                name,
            } => {
                let dev = create_multitouch_trackpad_device(
                    cfg.protection_type,
                    &cfg.jail_config,
                    path.as_path(),
                    width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
                    height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
                    name.as_deref(),
                    multi_touch_trackpad_idx,
                )?;
                multi_touch_trackpad_idx += 1;
                dev
            }
            InputDeviceOption::Custom { path, config_path } => {
                let dev = create_custom_device(
                    cfg.protection_type,
                    &cfg.jail_config,
                    path.as_path(),
                    custom_idx,
                    config_path.clone(),
                )?;
                custom_idx += 1;
                dev
            }
        };
        devs.push(input_dev);
    }

    #[cfg(feature = "balloon")]
    if cfg.balloon {
        // Either connect to an external balloon control socket, or create an
        // internal tube pair and register the host end with the main loop.
        let balloon_device_tube = if let Some(ref path) = cfg.balloon_control {
            Tube::new_from_unix_seqpacket(UnixSeqpacket::connect(path).with_context(|| {
                format!(
                    "failed to connect to balloon control socket {}",
                    path.display(),
                )
            })?)?
        } else {
            // Balloon gets a special socket so balloon requests can be forwarded
            // from the main process.
            let (host, device) = Tube::pair().context("failed to create tube")?;
            add_control_tube(DeviceControlTube::Balloon(host).into());
            device
        };

        // Pack the enabled feature bits into a u64 feature mask.
        let balloon_features = (cfg.balloon_page_reporting as u64)
            << BalloonFeatures::PageReporting as u64
            | (cfg.balloon_ws_reporting as u64) << BalloonFeatures::WSReporting as u64;

        let init_balloon_size = if let Some(init_memory) = cfg.init_memory {
            let init_memory_bytes = init_memory.saturating_mul(1024 * 1024);
            let total_memory_bytes = vm.get_memory().memory_size();

            if init_memory_bytes > total_memory_bytes {
                bail!(
                    "initial memory {} cannot be greater than total memory {}",
                    init_memory,
                    total_memory_bytes / (1024 * 1024),
                );
            }

            // The initial balloon size is the total memory size minus the initial memory size.
            total_memory_bytes - init_memory_bytes
        } else {
            // No --init-mem specified; start with balloon completely deflated.
            0
        };

        devs.push(create_balloon_device(
            cfg.protection_type,
            &cfg.jail_config,
            balloon_device_tube,
            balloon_inflate_tube,
            init_balloon_size,
            balloon_features,
            #[cfg(feature = "registered_events")]
            Some(
                registered_evt_q
                    .try_clone()
                    .context("failed to clone registered_evt_q tube")?,
            ),
            cfg.balloon_ws_num_bins,
        )?);
    }

    #[cfg(feature = "net")]
    for opt in &cfg.net {
        let dev = opt.create_virtio_device_and_jail(cfg.protection_type, &cfg.jail_config)?;
        devs.push(dev);
    }

    #[cfg(feature = "audio")]
    {
        for (card_index, virtio_snd) in cfg.virtio_snds.iter().enumerate() {
            // Each sound device gets its position as a distinct card index.
            let mut snd_params = virtio_snd.clone();
            snd_params.card_index = card_index;
            devs.push(create_virtio_snd_device(
                cfg.protection_type,
                &cfg.jail_config,
                snd_params,
            )?);
        }
    }

    #[cfg(any(target_os = "android", target_os = "linux"))]
    #[cfg(feature = "media")]
    {
        for v4l2_device in &cfg.v4l2_proxy {
            devs.push(create_v4l2_device(cfg.protection_type, v4l2_device)?);
        }
    }

    #[cfg(feature = "media")]
    if cfg.simple_media_device {
        devs.push(create_simple_media_device(cfg.protection_type)?);
    }

    // Consume the device-side video tubes paired up earlier.
    #[cfg(feature = "video-decoder")]
    {
        for (tube, backend) in video_dec_cfg {
            register_video_device(
                backend,
                &mut devs,
                tube,
                cfg.protection_type,
                &cfg.jail_config,
                VideoDeviceType::Decoder,
            )?;
        }
    }

    #[cfg(feature = "video-encoder")]
    {
        for (tube, backend) in video_enc_cfg {
            register_video_device(
                backend,
                &mut devs,
                tube,
                cfg.protection_type,
                &cfg.jail_config,
                VideoDeviceType::Encoder,
            )?;
        }
    }

    if let Some(vsock_config) = &cfg.vsock {
        devs.push(
            vsock_config.create_virtio_device_and_jail(cfg.protection_type, &cfg.jail_config)?,
        );
    }

    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    {
        if cfg.vhost_scmi {
            devs.push(create_vhost_scmi_device(
                cfg.protection_type,
                &cfg.jail_config,
                cfg.vhost_scmi_device.clone(),
            )?);
        }
    }
    for vhost_user_fs in &cfg.vhost_user_fs {
        devs.push(create_vhost_user_fs_device(
            cfg.protection_type,
            vhost_user_fs,
        )?);
    }

    for shared_dir in &cfg.shared_dirs {
        let SharedDir {
            src,
            tag,
            kind,
            ugid,
            uid_map,
            gid_map,
            fs_cfg,
            p9_cfg,
        } = shared_dir;

        let dev = match kind {
            SharedDirKind::FS => {
                let (host_tube, device_tube) = Tube::pair().context("failed to create tube")?;
                add_control_tube(TaggedControlTube::Fs(host_tube).into());

                create_fs_device(
                    cfg.protection_type,
                    &cfg.jail_config,
                    *ugid,
                    uid_map,
                    gid_map,
                    src,
                    tag,
                    fs_cfg.clone(),
                    device_tube,
                )?
            }
            SharedDirKind::P9 => create_9p_device(
                cfg.protection_type,
                &cfg.jail_config,
                *ugid,
                uid_map,
                gid_map,
                src,
                tag,
                p9_cfg.clone(),
            )?,
        };
        devs.push(dev);
    }

    #[cfg(feature = "audio")]
    if let Some(path) = &cfg.sound {
        devs.push(create_sound_device(
            path,
            cfg.protection_type,
            &cfg.jail_config,
        )?);
    }

    for opt in &cfg.vhost_user {
        devs.push(create_vhost_user_frontend(
            cfg.protection_type,
            opt,
            cfg.vhost_user_connect_timeout_ms,
        )?);
    }

    Ok(devs)
}
865 
/// Builds the complete set of guest bus devices together with the optional
/// minijail each should run in.
///
/// Order matters: VFIO (and, if needed, coiommu) devices are created first,
/// since they may raise RLIMIT_MEMLOCK and register IOMMU endpoints; then the
/// virtio device stubs from `create_virtio_devices` are wrapped in their PCI or
/// MMIO transports; finally the xHCI controller, stub PCI devices, and the
/// pvpanic device are appended.
fn create_devices(
    cfg: &Config,
    vm: &mut impl VmArch,
    resources: &mut SystemAllocator,
    add_control_tube: &mut impl FnMut(AnyControlTube),
    vm_evt_wrtube: &SendTube,
    iommu_attached_endpoints: &mut BTreeMap<u32, Arc<Mutex<Box<dyn MemoryMapperTrait>>>>,
    #[cfg(feature = "usb")] usb_provider: DeviceProvider,
    #[cfg(feature = "gpu")] render_server_fd: Option<SafeDescriptor>,
    iova_max_addr: &mut Option<u64>,
    #[cfg(feature = "registered_events")] registered_evt_q: &SendTube,
    vfio_container_manager: &mut VfioContainerManager,
    // Stores a set of PIDs of child processes that are supposed to exit cleanly.
    worker_process_pids: &mut BTreeSet<Pid>,
) -> DeviceResult<Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>> {
    let mut devices: Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)> = Vec::new();
    #[cfg(feature = "balloon")]
    let mut balloon_inflate_tube: Option<Tube> = None;
    #[cfg(feature = "gpu")]
    let mut has_vfio_gfx_device = false;
    if !cfg.vfio.is_empty() {
        let mut coiommu_attached_endpoints = Vec::new();

        for vfio_dev in &cfg.vfio {
            let (dev, jail, viommu_mapper) = create_vfio_device(
                &cfg.jail_config,
                vm,
                resources,
                add_control_tube,
                &vfio_dev.path,
                false,
                None,
                vfio_dev.guest_address,
                Some(&mut coiommu_attached_endpoints),
                vfio_dev.iommu,
                vfio_dev.dt_symbol.clone(),
                vfio_container_manager,
            )?;
            match dev {
                VfioDeviceVariant::Pci(vfio_pci_device) => {
                    // Track the largest IOVA any VFIO PCI device may use, so the
                    // caller can size IOMMU structures accordingly.
                    *iova_max_addr = Some(max(
                        vfio_pci_device.get_max_iova(),
                        iova_max_addr.unwrap_or(0),
                    ));

                    #[cfg(feature = "gpu")]
                    if vfio_pci_device.is_gfx() {
                        has_vfio_gfx_device = true;
                    }

                    // Devices behind the virtio-iommu are keyed by their PCI
                    // address (as u32) in the endpoint map.
                    if let Some(viommu_mapper) = viommu_mapper {
                        iommu_attached_endpoints.insert(
                            vfio_pci_device
                                .pci_address()
                                .context("not initialized")?
                                .to_u32(),
                            Arc::new(Mutex::new(Box::new(viommu_mapper))),
                        );
                    }

                    devices.push((Box::new(vfio_pci_device), jail));
                }
                VfioDeviceVariant::Platform(vfio_plat_dev) => {
                    devices.push((Box::new(vfio_plat_dev), jail));
                }
            }
        }

        // VFIO-backed DMA requires pinning guest memory, so bump the memlock
        // rlimit by the size of guest memory (saturating) if the current soft
        // limit is too low.
        if !coiommu_attached_endpoints.is_empty() || !iommu_attached_endpoints.is_empty() {
            let mut buf = mem::MaybeUninit::<libc::rlimit64>::zeroed();
            // SAFETY: trivially safe
            let res = unsafe { libc::getrlimit64(libc::RLIMIT_MEMLOCK, buf.as_mut_ptr()) };
            if res == 0 {
                // SAFETY: safe because getrlimit64 has returned success.
                let limit = unsafe { buf.assume_init() };
                let rlim_new = limit.rlim_cur.saturating_add(vm.get_memory().memory_size());
                let rlim_max = max(limit.rlim_max, rlim_new);
                if limit.rlim_cur < rlim_new {
                    let limit_arg = libc::rlimit64 {
                        rlim_cur: rlim_new,
                        rlim_max,
                    };
                    // SAFETY: trivially safe
                    let res = unsafe { libc::setrlimit64(libc::RLIMIT_MEMLOCK, &limit_arg) };
                    if res != 0 {
                        bail!("Set rlimit failed");
                    }
                }
            } else {
                bail!("Get rlimit failed");
            }
        }
        // With the balloon feature the coiommu tube is wired up to the balloon
        // device below; without it there is no tube at all.
        #[cfg(feature = "balloon")]
        let coiommu_tube: Option<Tube>;
        #[cfg(not(feature = "balloon"))]
        let coiommu_tube: Option<Tube> = None;
        if !coiommu_attached_endpoints.is_empty() {
            let vfio_container = vfio_container_manager
                .get_container(IommuDevType::CoIommu, None as Option<&Path>)
                .context("failed to get vfio container")?;
            let (coiommu_host_tube, coiommu_device_tube) =
                Tube::pair().context("failed to create coiommu tube")?;
            add_control_tube(
                VmMemoryTube {
                    tube: coiommu_host_tube,
                    expose_with_viommu: false,
                }
                .into(),
            );
            let vcpu_count = cfg.vcpu_count.unwrap_or(1) as u64;
            // Pair the coiommu device with the balloon device so unpinning can
            // be coordinated with inflation.
            #[cfg(feature = "balloon")]
            match Tube::pair() {
                Ok((x, y)) => {
                    coiommu_tube = Some(x);
                    balloon_inflate_tube = Some(y);
                }
                Err(x) => return Err(x).context("failed to create coiommu tube"),
            }
            let dev = CoIommuDev::new(
                vm.get_memory().clone(),
                vfio_container,
                VmMemoryClient::new(coiommu_device_tube),
                coiommu_tube,
                coiommu_attached_endpoints,
                vcpu_count,
                cfg.coiommu_param.unwrap_or_default(),
            )
            .context("failed to create coiommu device")?;

            devices.push((
                Box::new(dev),
                simple_jail(&cfg.jail_config, "coiommu_device")?,
            ));
        }
    }

    let stubs = create_virtio_devices(
        cfg,
        vm,
        resources,
        add_control_tube,
        vm_evt_wrtube,
        #[cfg(feature = "balloon")]
        balloon_inflate_tube,
        worker_process_pids,
        #[cfg(feature = "gpu")]
        render_server_fd,
        #[cfg(feature = "gpu")]
        has_vfio_gfx_device,
        #[cfg(feature = "registered_events")]
        registered_evt_q,
    )?;

    // Wrap each virtio device stub in its transport (PCI or MMIO), creating the
    // control tubes (MSI, shared memory, ioevent, VM control) each PCI device
    // needs.
    for stub in stubs {
        match stub.dev.transport_type() {
            VirtioTransportType::Pci => {
                let (msi_host_tube, msi_device_tube) =
                    Tube::pair().context("failed to create tube")?;
                add_control_tube(AnyControlTube::IrqTube(msi_host_tube));

                // Only devices exposing a shared memory region get a VmMemory
                // tube for mapping it.
                let shared_memory_tube = if stub.dev.get_shared_memory_region().is_some() {
                    let (host_tube, device_tube) =
                        Tube::pair().context("failed to create shared memory tube")?;
                    add_control_tube(
                        VmMemoryTube {
                            tube: host_tube,
                            expose_with_viommu: stub.dev.expose_shmem_descriptors_with_viommu(),
                        }
                        .into(),
                    );
                    Some(device_tube)
                } else {
                    None
                };

                let (ioevent_host_tube, ioevent_device_tube) =
                    Tube::pair().context("failed to create ioevent tube")?;
                add_control_tube(
                    VmMemoryTube {
                        tube: ioevent_host_tube,
                        expose_with_viommu: false,
                    }
                    .into(),
                );

                let (host_tube, device_tube) =
                    Tube::pair().context("failed to create device control tube")?;
                add_control_tube(TaggedControlTube::Vm(host_tube).into());

                let dev = VirtioPciDevice::new(
                    vm.get_memory().clone(),
                    stub.dev,
                    msi_device_tube,
                    cfg.disable_virtio_intx,
                    shared_memory_tube.map(VmMemoryClient::new),
                    VmMemoryClient::new(ioevent_device_tube),
                    device_tube,
                )
                .context("failed to create virtio pci dev")?;

                devices.push((Box::new(dev) as Box<dyn BusDeviceObj>, stub.jail));
            }
            VirtioTransportType::Mmio => {
                let dev = VirtioMmioDevice::new(vm.get_memory().clone(), stub.dev, false)
                    .context("failed to create virtio mmio dev")?;
                devices.push((Box::new(dev) as Box<dyn BusDeviceObj>, stub.jail));
            }
        }
    }

    #[cfg(feature = "usb")]
    if cfg.usb {
        // Create xhci controller.
        let usb_controller = Box::new(XhciController::new(
            vm.get_memory().clone(),
            Box::new(usb_provider),
        ));
        devices.push((
            usb_controller,
            simple_jail(&cfg.jail_config, "xhci_device")?,
        ));
    }

    for params in &cfg.stub_pci_devices {
        // Stub devices don't need jailing since they don't do anything.
        devices.push((Box::new(StubPciDevice::new(params)), None));
    }

    // pvpanic reports guest panics to the VM event tube; it is always present.
    devices.push((
        Box::new(PvPanicPciDevice::new(vm_evt_wrtube.try_clone()?)),
        None,
    ));

    Ok(devices)
}
1101 
create_file_backed_mappings( cfg: &Config, vm: &mut impl Vm, resources: &mut SystemAllocator, ) -> Result<()>1102 fn create_file_backed_mappings(
1103     cfg: &Config,
1104     vm: &mut impl Vm,
1105     resources: &mut SystemAllocator,
1106 ) -> Result<()> {
1107     for mapping in &cfg.file_backed_mappings {
1108         let file = OpenOptions::new()
1109             .read(true)
1110             .write(mapping.writable)
1111             .custom_flags(if mapping.sync { libc::O_SYNC } else { 0 })
1112             .open(&mapping.path)
1113             .context("failed to open file for file-backed mapping")?;
1114         let prot = if mapping.writable {
1115             Protection::read_write()
1116         } else {
1117             Protection::read()
1118         };
1119         let size = mapping
1120             .size
1121             .try_into()
1122             .context("Invalid size for file-backed mapping")?;
1123         let memory_mapping = MemoryMappingBuilder::new(size)
1124             .from_file(&file)
1125             .offset(mapping.offset)
1126             .protection(prot)
1127             .build()
1128             .context("failed to map backing file for file-backed mapping")?;
1129 
1130         let mapping_range = AddressRange::from_start_and_size(mapping.address, mapping.size)
1131             .context("failed to convert to AddressRange")?;
1132         match resources.mmio_allocator_any().allocate_at(
1133             mapping_range,
1134             Alloc::FileBacked(mapping.address),
1135             "file-backed mapping".to_owned(),
1136         ) {
1137             // OutOfSpace just means that this mapping is not in the MMIO regions at all, so don't
1138             // consider it an error.
1139             // TODO(b/222769529): Reserve this region in a global memory address space allocator
1140             // once we have that so nothing else can accidentally overlap with it.
1141             Ok(()) | Err(resources::Error::OutOfSpace) => {}
1142             e => e.context("failed to allocate guest address for file-backed mapping")?,
1143         }
1144 
1145         vm.add_memory_region(
1146             GuestAddress(mapping.address),
1147             Box::new(memory_mapping),
1148             !mapping.writable,
1149             /* log_dirty_pages = */ false,
1150             MemCacheType::CacheCoherent,
1151         )
1152         .context("failed to configure file-backed mapping")?;
1153     }
1154 
1155     Ok(())
1156 }
1157 
#[cfg(target_arch = "x86_64")]
/// Collection of devices related to PCI hotplug.
struct HotPlugStub {
    /// Map from bus index to hotplug bus.
    hotplug_buses: BTreeMap<u8, Arc<Mutex<dyn HotPlugBus>>>,
    /// Bus ranges of devices for virtio-iommu.
    iommu_bus_ranges: Vec<RangeInclusive<u32>>,
    /// Map from GPE index to GpeNotify devices.
    gpe_notify_devs: BTreeMap<u32, Arc<Mutex<dyn GpeNotify>>>,
    /// Map from bus index to PmeNotify devices.
    pme_notify_devs: BTreeMap<u8, Arc<Mutex<dyn PmeNotify>>>,
}
1170 
1171 #[cfg(target_arch = "x86_64")]
1172 impl HotPlugStub {
1173     /// Constructs empty HotPlugStub.
new() -> Self1174     fn new() -> Self {
1175         Self {
1176             hotplug_buses: BTreeMap::new(),
1177             iommu_bus_ranges: Vec::new(),
1178             gpe_notify_devs: BTreeMap::new(),
1179             pme_notify_devs: BTreeMap::new(),
1180         }
1181     }
1182 }
1183 
#[cfg(target_arch = "x86_64")]
/// Creates PCIe root ports backed purely by virtual devices.
///
/// The user did not specify a host PCIe root port to link these virtual root
/// ports to, so empty buses are located and fully virtual root ports are
/// created: one non-hotplug root port for every occupied non-root bus, plus
/// `hp_bus_count` hotplug-capable root ports on otherwise empty buses.
///
/// Returns the resulting `HotPlugStub` (hotplug buses, iommu bus ranges, and
/// PME notify devices), or an error if fewer than `hp_bus_count` empty buses
/// are available.
fn create_pure_virtual_pcie_root_port(
    sys_allocator: &mut SystemAllocator,
    add_control_tube: &mut impl FnMut(AnyControlTube),
    devices: &mut Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>,
    hp_bus_count: u8,
) -> Result<HotPlugStub> {
    let mut hp_sec_buses = Vec::new();
    let mut hp_stub = HotPlugStub::new();
    // Create Pcie Root Port for non-root buses, each non-root bus device will be
    // connected behind a virtual pcie root port.
    for i in 1..255 {
        if sys_allocator.pci_bus_empty(i) {
            // Remember the first `hp_bus_count` empty buses for the hotplug
            // root ports created below.
            if hp_sec_buses.len() < hp_bus_count.into() {
                hp_sec_buses.push(i);
            }
            continue;
        }
        // Non-hotplug root port (second argument `false`) for an occupied bus.
        let pcie_root_port = Arc::new(Mutex::new(PcieRootPort::new(i, false)));
        hp_stub
            .pme_notify_devs
            .insert(i, pcie_root_port.clone() as Arc<Mutex<dyn PmeNotify>>);
        let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
        add_control_tube(AnyControlTube::IrqTube(msi_host_tube));
        let pci_bridge = Box::new(PciBridge::new(pcie_root_port.clone(), msi_device_tube));
        // no ipc is used if the root port disables hotplug
        devices.push((pci_bridge, None));
    }

    // Create Pcie Root Port for hot-plug
    if hp_sec_buses.len() < hp_bus_count.into() {
        return Err(anyhow!("no more addresses are available"));
    }

    for hp_sec_bus in hp_sec_buses {
        // Hotplug-capable root port (second argument `true`).
        let pcie_root_port = Arc::new(Mutex::new(PcieRootPort::new(hp_sec_bus, true)));
        hp_stub.pme_notify_devs.insert(
            hp_sec_bus,
            pcie_root_port.clone() as Arc<Mutex<dyn PmeNotify>>,
        );
        let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
        add_control_tube(AnyControlTube::IrqTube(msi_host_tube));
        let pci_bridge = Box::new(PciBridge::new(pcie_root_port.clone(), msi_device_tube));

        // Record the address range covered by this bridge (secondary through
        // subordinate bus) for virtio-iommu.
        hp_stub.iommu_bus_ranges.push(RangeInclusive::new(
            PciAddress {
                bus: pci_bridge.get_secondary_num(),
                dev: 0,
                func: 0,
            }
            .to_u32(),
            // NOTE(review): dev 32 / func 8 are one past the architectural
            // maxima (31 / 7); presumably this makes the upper bound cover the
            // entire subordinate bus after `to_u32()` encoding — confirm
            // against PciAddress::to_u32.
            PciAddress {
                bus: pci_bridge.get_subordinate_num(),
                dev: 32,
                func: 8,
            }
            .to_u32(),
        ));

        devices.push((pci_bridge, None));
        hp_stub
            .hotplug_buses
            .insert(hp_sec_bus, pcie_root_port as Arc<Mutex<dyn HotPlugBus>>);
    }
    Ok(hp_stub)
}
1254 
/// Translates the parsed `Config` into the `VmComponents` used to build the VM:
/// opens the kernel/BIOS, initrd, pvm_fw and pflash images, computes memory and
/// swiotlb sizes, and (on ARM) derives per-vCPU frequency/capacity data and
/// cgroup-based frequency domains for virt-cpufreq.
fn setup_vm_components(cfg: &Config) -> Result<VmComponents> {
    let initrd_image = if let Some(initrd_path) = &cfg.initrd_path {
        Some(
            open_file_or_duplicate(initrd_path, OpenOptions::new().read(true))
                .with_context(|| format!("failed to open initrd {}", initrd_path.display()))?,
        )
    } else {
        None
    };
    let pvm_fw_image = if let Some(pvm_fw_path) = &cfg.pvm_fw {
        Some(
            open_file_or_duplicate(pvm_fw_path, OpenOptions::new().read(true))
                .with_context(|| format!("failed to open pvm_fw {}", pvm_fw_path.display()))?,
        )
    } else {
        None
    };

    let vm_image = match cfg.executable_path {
        Some(Executable::Kernel(ref kernel_path)) => VmImage::Kernel(
            open_file_or_duplicate(kernel_path, OpenOptions::new().read(true)).with_context(
                || format!("failed to open kernel image {}", kernel_path.display()),
            )?,
        ),
        Some(Executable::Bios(ref bios_path)) => VmImage::Bios(
            open_file_or_duplicate(bios_path, OpenOptions::new().read(true))
                .with_context(|| format!("failed to open bios {}", bios_path.display()))?,
        ),
        // Config validation elsewhere is expected to guarantee an executable;
        // reaching this arm is a bug.
        _ => panic!("Did not receive a bios or kernel, should be impossible."),
    };

    // swiotlb: explicit size (in MiB) if given; otherwise none for unprotected
    // VMs and a 64 MiB default for protected ones.
    let swiotlb = if let Some(size) = cfg.swiotlb {
        Some(
            size.checked_mul(1024 * 1024)
                .ok_or_else(|| anyhow!("requested swiotlb size too large"))?,
        )
    } else if matches!(cfg.protection_type, ProtectionType::Unprotected) {
        None
    } else {
        Some(64 * 1024 * 1024)
    };

    let (pflash_image, pflash_block_size) = if let Some(pflash_parameters) = &cfg.pflash_parameters
    {
        (
            Some(
                open_file_or_duplicate(
                    &pflash_parameters.path,
                    OpenOptions::new().read(true).write(true),
                )
                .with_context(|| {
                    format!("failed to open pflash {}", pflash_parameters.path.display())
                })?,
            ),
            pflash_parameters.block_size,
        )
    } else {
        (None, 0)
    };

    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    let mut cpu_frequencies = BTreeMap::new();
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    let mut normalized_cpu_capacities = BTreeMap::new();

    // if --enable-fw-cfg or --fw-cfg was given, we want to enable fw_cfg
    let fw_cfg_enable = cfg.enable_fw_cfg || !cfg.fw_cfg_parameters.is_empty();
    let (cpu_clusters, cpu_capacity) = if cfg.host_cpu_topology {
        (
            Arch::get_host_cpu_clusters()?,
            Arch::get_host_cpu_capacity()?,
        )
    } else {
        (cfg.cpu_clusters.clone(), cfg.cpu_capacity.clone())
    };

    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    let mut vcpu_domain_paths = BTreeMap::new();
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    let mut vcpu_domains = BTreeMap::new();

    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    if cfg.virt_cpufreq || cfg.virt_cpufreq_v2 {
        // Explicitly configured per-vCPU frequencies win; otherwise derive them
        // from the host CPUs each vCPU is affined to.
        if !cfg.cpu_frequencies_khz.is_empty() {
            cpu_frequencies = cfg.cpu_frequencies_khz.clone();
        } else {
            match Arch::get_host_cpu_frequencies_khz() {
                Ok(host_cpu_frequencies) => {
                    for cpu_id in 0..cfg.vcpu_count.unwrap_or(1) {
                        let vcpu_affinity = match cfg.vcpu_affinity.clone() {
                            Some(VcpuAffinity::Global(v)) => v,
                            Some(VcpuAffinity::PerVcpu(mut m)) => {
                                m.remove(&cpu_id).unwrap_or_default()
                            }
                            None => {
                                panic!("There must be some vcpu_affinity setting with VirtCpufreq enabled!")
                            }
                        };

                        // Check that the physical CPUs that the vCPU is affined to all share the
                        // same frequency domain.
                        if let Some(freq_domain) = host_cpu_frequencies.get(&vcpu_affinity[0]) {
                            for cpu in vcpu_affinity.iter() {
                                if let Some(frequencies) = host_cpu_frequencies.get(cpu) {
                                    if frequencies != freq_domain {
                                        panic!("Affined CPUs do not share a frequency domain!");
                                    }
                                }
                            }
                            cpu_frequencies.insert(cpu_id, freq_domain.clone());
                        } else {
                            panic!("No frequency domain for cpu:{}", cpu_id);
                        }
                    }
                }
                Err(e) => {
                    warn!("Unable to get host cpu frequencies {:#}", e);
                }
            }
        }

        if !cpu_frequencies.is_empty() {
            let mut max_freqs = Vec::new();

            for (_cpu, frequencies) in cpu_frequencies.iter() {
                max_freqs.push(*frequencies.iter().max().ok_or(Error::new(libc::EINVAL))?)
            }

            let host_max_freqs = Arch::get_host_cpu_max_freq_khz()?;
            let largest_host_max_freq = host_max_freqs
                .values()
                .max()
                .ok_or(Error::new(libc::EINVAL))?;

            // Normalize each vCPU's capacity by the ratio of its max frequency
            // to the fastest host CPU's max frequency.
            for (cpu_id, max_freq) in max_freqs.iter().enumerate() {
                let normalized_cpu_capacity = (u64::from(*cpu_capacity.get(&cpu_id).unwrap())
                    * u64::from(*max_freq))
                .checked_div(u64::from(*largest_host_max_freq))
                .ok_or(Error::new(libc::EINVAL))?;
                normalized_cpu_capacities.insert(
                    cpu_id,
                    u32::try_from(normalized_cpu_capacity).map_err(|_| Error::new(libc::EINVAL))?,
                );
            }

            if !cfg.cpu_freq_domains.is_empty() {
                let cgroup_path = cfg
                    .vcpu_cgroup_path
                    .clone()
                    .context("cpu_freq_domains requires vcpu_cgroup_path")?;

                // cgroup.controllers only exists in cgroups v2 hierarchies.
                if !cgroup_path.join("cgroup.controllers").exists() {
                    panic!("CGroupsV2 must be enabled for cpu freq domain support!");
                }

                // Assign parent crosvm process to top level cgroup
                let cgroup_procs_path = cgroup_path.join("cgroup.procs");
                std::fs::write(
                    cgroup_procs_path.clone(),
                    process::id().to_string().as_bytes(),
                )
                .with_context(|| {
                    format!(
                        "failed to create vcpu-cgroup-path {}",
                        cgroup_procs_path.display(),
                    )
                })?;

                // One cgroup subtree per frequency domain; vCPU threads are
                // later placed into the domain covering their core index.
                for (freq_domain_idx, cpus) in cfg.cpu_freq_domains.iter().enumerate() {
                    let vcpu_domain_path =
                        cgroup_path.join(format!("vcpu-domain{}", freq_domain_idx));
                    // Create subtree for domain
                    create_dir_all(&vcpu_domain_path)?;

                    // Set vcpu_domain cgroup type as 'threaded' to get thread level granularity
                    // controls
                    let cgroup_type_path = cgroup_path.join(vcpu_domain_path.join("cgroup.type"));
                    std::fs::write(cgroup_type_path.clone(), b"threaded").with_context(|| {
                        format!(
                            "failed to create vcpu-cgroup-path {}",
                            cgroup_type_path.display(),
                        )
                    })?;
                    for core_idx in cpus.iter() {
                        vcpu_domain_paths.insert(*core_idx, vcpu_domain_path.clone());
                        vcpu_domains.insert(*core_idx, freq_domain_idx as u32);
                    }
                }
            }
        }
    }

    Ok(VmComponents {
        #[cfg(target_arch = "x86_64")]
        ac_adapter: cfg.ac_adapter,
        #[cfg(target_arch = "x86_64")]
        break_linux_pci_config_io: cfg.break_linux_pci_config_io,
        // Memory is configured in MiB; default is 256 MiB.
        memory_size: cfg
            .memory
            .unwrap_or(256)
            .checked_mul(1024 * 1024)
            .ok_or_else(|| anyhow!("requested memory size too large"))?,
        swiotlb,
        fw_cfg_enable,
        bootorder_fw_cfg_blob: Vec::new(),
        vcpu_count: cfg.vcpu_count.unwrap_or(1),
        vcpu_affinity: cfg.vcpu_affinity.clone(),
        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        vcpu_domains,
        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        vcpu_domain_paths,
        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        cpu_frequencies,
        fw_cfg_parameters: cfg.fw_cfg_parameters.clone(),
        cpu_clusters,
        cpu_capacity,
        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        normalized_cpu_capacities,
        no_smt: cfg.no_smt,
        hugepages: cfg.hugepages,
        hv_cfg: hypervisor::Config {
            #[cfg(target_arch = "aarch64")]
            mte: cfg.mte,
            protection_type: cfg.protection_type,
        },
        vm_image,
        android_fstab: cfg
            .android_fstab
            .as_ref()
            .map(|x| {
                File::open(x)
                    .with_context(|| format!("failed to open android fstab file {}", x.display()))
            })
            .map_or(Ok(None), |v| v.map(Some))?,
        pstore: cfg.pstore.clone(),
        pflash_block_size,
        pflash_image,
        initrd_image,
        extra_kernel_params: cfg.params.clone(),
        acpi_sdts: cfg
            .acpi_tables
            .iter()
            .map(|path| {
                SDT::from_file(path)
                    .with_context(|| format!("failed to open ACPI file {}", path.display()))
            })
            .collect::<Result<Vec<SDT>>>()?,
        rt_cpus: cfg.rt_cpus.clone(),
        delay_rt: cfg.delay_rt,
        no_i8042: cfg.no_i8042,
        no_rtc: cfg.no_rtc,
        #[cfg(target_arch = "x86_64")]
        smbios: cfg.smbios.clone(),
        host_cpu_topology: cfg.host_cpu_topology,
        itmt: cfg.itmt,
        #[cfg(target_arch = "x86_64")]
        force_s2idle: cfg.force_s2idle,
        pvm_fw: pvm_fw_image,
        pci_config: cfg.pci_config,
        dynamic_power_coefficient: cfg.dynamic_power_coefficient.clone(),
        boot_cpu: cfg.boot_cpu,
        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        virt_cpufreq_v2: cfg.virt_cpufreq_v2,
        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        sve_config: cfg.sve.unwrap_or_default(),
    })
}
1522 
/// Describes how a VM run terminated; returned to the caller so it can decide
/// on the process exit code (and, presumably, restart behavior — the exact
/// semantics live where this enum is matched).
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum ExitState {
    Reset,
    Stop,
    Crash,
    GuestPanic,
    WatchdogReset,
}
1531 // Remove ranges in `guest_mem_layout` that overlap with ranges in `file_backed_mappings`.
1532 // Returns the updated guest memory layout.
punch_holes_in_guest_mem_layout_for_mappings( guest_mem_layout: Vec<(GuestAddress, u64, MemoryRegionOptions)>, file_backed_mappings: &[FileBackedMappingParameters], ) -> Vec<(GuestAddress, u64, MemoryRegionOptions)>1533 fn punch_holes_in_guest_mem_layout_for_mappings(
1534     guest_mem_layout: Vec<(GuestAddress, u64, MemoryRegionOptions)>,
1535     file_backed_mappings: &[FileBackedMappingParameters],
1536 ) -> Vec<(GuestAddress, u64, MemoryRegionOptions)> {
1537     // Create a set containing (start, end) pairs with exclusive end (end = start + size; the byte
1538     // at end is not included in the range).
1539     let mut layout_set = BTreeSet::new();
1540     for (addr, size, options) in &guest_mem_layout {
1541         layout_set.insert((addr.offset(), addr.offset() + size, *options));
1542     }
1543 
1544     for mapping in file_backed_mappings {
1545         let mapping_start = mapping.address;
1546         let mapping_end = mapping_start + mapping.size;
1547 
1548         // Repeatedly split overlapping guest memory regions until no overlaps remain.
1549         while let Some((range_start, range_end, options)) = layout_set
1550             .iter()
1551             .find(|&&(range_start, range_end, _)| {
1552                 mapping_start < range_end && mapping_end > range_start
1553             })
1554             .cloned()
1555         {
1556             layout_set.remove(&(range_start, range_end, options));
1557 
1558             if range_start < mapping_start {
1559                 layout_set.insert((range_start, mapping_start, options));
1560             }
1561             if range_end > mapping_end {
1562                 layout_set.insert((mapping_end, range_end, options));
1563             }
1564         }
1565     }
1566 
1567     // Build the final guest memory layout from the modified layout_set.
1568     layout_set
1569         .iter()
1570         .map(|(start, end, options)| (GuestAddress(*start), end - start, *options))
1571         .collect()
1572 }
1573 
create_guest_memory( cfg: &Config, components: &VmComponents, arch_memory_layout: &<Arch as LinuxArch>::ArchMemoryLayout, hypervisor: &impl Hypervisor, ) -> Result<GuestMemory>1574 fn create_guest_memory(
1575     cfg: &Config,
1576     components: &VmComponents,
1577     arch_memory_layout: &<Arch as LinuxArch>::ArchMemoryLayout,
1578     hypervisor: &impl Hypervisor,
1579 ) -> Result<GuestMemory> {
1580     let guest_mem_layout = Arch::guest_memory_layout(components, arch_memory_layout, hypervisor)
1581         .context("failed to create guest memory layout")?;
1582 
1583     let guest_mem_layout =
1584         punch_holes_in_guest_mem_layout_for_mappings(guest_mem_layout, &cfg.file_backed_mappings);
1585 
1586     let guest_mem = GuestMemory::new_with_options(&guest_mem_layout)
1587         .context("failed to create guest memory")?;
1588     let mut mem_policy = MemoryPolicy::empty();
1589     if components.hugepages {
1590         mem_policy |= MemoryPolicy::USE_HUGEPAGES;
1591     }
1592 
1593     if cfg.lock_guest_memory {
1594         mem_policy |= MemoryPolicy::LOCK_GUEST_MEMORY;
1595     }
1596     guest_mem.set_memory_policy(mem_policy);
1597 
1598     if cfg.unmap_guest_memory_on_fork {
1599         // Note that this isn't compatible with sandboxing. We could potentially fix that by
1600         // delaying the call until after the sandboxed devices are forked. However, the main use
1601         // for this is in conjunction with protected VMs, where most of the guest memory has been
1602         // unshared with the host. We'd need to be confident that the guest memory is unshared with
1603         // the host only after the `use_dontfork` call and those details will vary by hypervisor.
1604         // So, for now we keep things simple to be safe.
1605         guest_mem.use_dontfork().context("use_dontfork failed")?;
1606     }
1607 
1608     Ok(guest_mem)
1609 }
1610 
#[cfg(all(target_arch = "aarch64", feature = "geniezone"))]
/// Creates a GenieZone-backed VM from `cfg`/`components` and runs it to
/// completion via `run_vm`.
fn run_gz(device_path: Option<&Path>, cfg: Config, components: VmComponents) -> Result<ExitState> {
    use devices::GeniezoneKernelIrqChip;
    use hypervisor::geniezone::Geniezone;
    use hypervisor::geniezone::GeniezoneVcpu;
    use hypervisor::geniezone::GeniezoneVm;

    let path = device_path.unwrap_or(Path::new(GENIEZONE_PATH));
    let gzvm = Geniezone::new_with_path(path)
        .with_context(|| format!("failed to open GenieZone device {}", path.display()))?;

    let arch_memory_layout =
        Arch::arch_memory_layout(&components).context("failed to create arch memory layout")?;
    let guest_mem = create_guest_memory(&cfg, &components, &arch_memory_layout, &gzvm)?;

    // The vmm-swap monitor must be launched before the main VM process forks
    // any devices; only start it when a swap directory was configured.
    #[cfg(feature = "swap")]
    let swap_controller = cfg
        .swap_dir
        .as_ref()
        .map(|swap_dir| {
            SwapController::launch(guest_mem.clone(), swap_dir, &cfg.jail_config)
                .context("launch vmm-swap monitor process")
        })
        .transpose()?;

    let vm =
        GeniezoneVm::new(&gzvm, guest_mem, components.hv_cfg).context("failed to create vm")?;

    // Check that the VM was actually created in protected mode as expected.
    if cfg.protection_type.isolates_memory() && !vm.check_capability(VmCap::Protected) {
        bail!("Failed to create protected VM");
    }
    let vm_clone = vm.try_clone().context("failed to clone vm")?;

    // GenieZone only supports the in-kernel irqchip; there is never an ioapic
    // host tube to hand to the control loop.
    let ioapic_host_tube = None;
    let mut irq_chip = match cfg.irq_chip.unwrap_or(IrqChipKind::Kernel) {
        IrqChipKind::Split => bail!("Geniezone does not support split irqchip mode"),
        IrqChipKind::Userspace => bail!("Geniezone does not support userspace irqchip mode"),
        IrqChipKind::Kernel => GeniezoneKernelIrqChip::new(vm_clone, components.vcpu_count)
            .context("failed to create IRQ chip")?,
    };

    run_vm::<GeniezoneVcpu, GeniezoneVm>(
        cfg,
        components,
        &arch_memory_layout,
        vm,
        &mut irq_chip,
        ioapic_host_tube,
        #[cfg(feature = "swap")]
        swap_controller,
    )
}
1667 
/// Creates a KVM-backed VM from `cfg`/`components` and runs it to completion
/// via `run_vm`. `device_path` overrides the default `/dev/kvm` node.
fn run_kvm(device_path: Option<&Path>, cfg: Config, components: VmComponents) -> Result<ExitState> {
    use devices::KvmKernelIrqChip;
    #[cfg(target_arch = "x86_64")]
    use devices::KvmSplitIrqChip;
    use hypervisor::kvm::Kvm;
    use hypervisor::kvm::KvmVcpu;
    use hypervisor::kvm::KvmVm;

    let device_path = device_path.unwrap_or(Path::new(KVM_PATH));
    let kvm = Kvm::new_with_path(device_path)
        .with_context(|| format!("failed to open KVM device {}", device_path.display()))?;

    let arch_memory_layout =
        Arch::arch_memory_layout(&components).context("failed to create arch memory layout")?;
    let guest_mem = create_guest_memory(&cfg, &components, &arch_memory_layout, &kvm)?;

    // Launch the vmm-swap monitor process before the VM is created, but only
    // when a swap directory was configured.
    #[cfg(feature = "swap")]
    let swap_controller = if let Some(swap_dir) = cfg.swap_dir.as_ref() {
        Some(
            SwapController::launch(guest_mem.clone(), swap_dir, &cfg.jail_config)
                .context("launch vmm-swap monitor process")?,
        )
    } else {
        None
    };

    let vm = KvmVm::new(&kvm, guest_mem, components.hv_cfg).context("failed to create vm")?;

    // ITMT scheduling requires the guest to not see the host's
    // MSR_PLATFORM_INFO values.
    #[cfg(target_arch = "x86_64")]
    if cfg.itmt {
        vm.set_platform_info_read_access(false)
            .context("failed to disable MSR_PLATFORM_INFO read access")?;
    }

    // Check that the VM was actually created in protected mode as expected.
    // This check is only needed on aarch64. On x86_64, protected VM creation will fail
    // if protected mode is not supported.
    #[cfg(not(target_arch = "x86_64"))]
    if cfg.protection_type.isolates_memory() && !vm.check_capability(VmCap::Protected) {
        bail!("Failed to create protected VM");
    }
    let vm_clone = vm.try_clone().context("failed to clone vm")?;

    // Local wrapper so the two irqchip flavors (split is x86_64-only) can share
    // one variable and be passed to `run_vm` as `&mut dyn IrqChipArch`.
    enum KvmIrqChip {
        #[cfg(target_arch = "x86_64")]
        Split(KvmSplitIrqChip),
        Kernel(KvmKernelIrqChip),
    }

    impl KvmIrqChip {
        fn as_mut(&mut self) -> &mut dyn IrqChipArch {
            match self {
                #[cfg(target_arch = "x86_64")]
                KvmIrqChip::Split(i) => i,
                KvmIrqChip::Kernel(i) => i,
            }
        }
    }

    // Only the split irqchip produces an ioapic host tube; the kernel irqchip
    // handles interrupts entirely in-kernel.
    let ioapic_host_tube;
    let mut irq_chip = match cfg.irq_chip.unwrap_or(IrqChipKind::Kernel) {
        IrqChipKind::Userspace => {
            bail!("KVM userspace irqchip mode not implemented");
        }
        IrqChipKind::Split => {
            #[cfg(not(target_arch = "x86_64"))]
            bail!("KVM split irqchip mode only supported on x86 processors");
            #[cfg(target_arch = "x86_64")]
            {
                let (host_tube, ioapic_device_tube) =
                    Tube::pair().context("failed to create tube")?;
                ioapic_host_tube = Some(host_tube);
                KvmIrqChip::Split(
                    KvmSplitIrqChip::new(
                        vm_clone,
                        components.vcpu_count,
                        ioapic_device_tube,
                        // NOTE(review): 24 appears to be the IOAPIC pin count —
                        // confirm against KvmSplitIrqChip::new.
                        Some(24),
                    )
                    .context("failed to create IRQ chip")?,
                )
            }
        }
        IrqChipKind::Kernel => {
            ioapic_host_tube = None;
            KvmIrqChip::Kernel(
                KvmKernelIrqChip::new(vm_clone, components.vcpu_count)
                    .context("failed to create IRQ chip")?,
            )
        }
    };

    run_vm::<KvmVcpu, KvmVm>(
        cfg,
        components,
        &arch_memory_layout,
        vm,
        irq_chip.as_mut(),
        ioapic_host_tube,
        #[cfg(feature = "swap")]
        swap_controller,
    )
}
1771 
#[cfg(all(any(target_arch = "arm", target_arch = "aarch64"), feature = "gunyah"))]
/// Creates a Gunyah-backed VM from `cfg`/`components` and runs it to
/// completion via `run_vm`.
fn run_gunyah(
    device_path: Option<&Path>,
    cfg: Config,
    components: VmComponents,
) -> Result<ExitState> {
    use devices::GunyahIrqChip;
    use hypervisor::gunyah::Gunyah;
    use hypervisor::gunyah::GunyahVcpu;
    use hypervisor::gunyah::GunyahVm;

    let path = device_path.unwrap_or(Path::new(GUNYAH_PATH));
    let gunyah = Gunyah::new_with_path(path)
        .with_context(|| format!("failed to open Gunyah device {}", path.display()))?;

    let arch_memory_layout =
        Arch::arch_memory_layout(&components).context("failed to create arch memory layout")?;
    let guest_mem = create_guest_memory(&cfg, &components, &arch_memory_layout, &gunyah)?;

    // Launch the vmm-swap monitor only when a swap directory was configured.
    #[cfg(feature = "swap")]
    let swap_controller = match cfg.swap_dir.as_ref() {
        Some(swap_dir) => Some(
            SwapController::launch(guest_mem.clone(), swap_dir, &cfg.jail_config)
                .context("launch vmm-swap monitor process")?,
        ),
        None => None,
    };

    let vm = GunyahVm::new(&gunyah, guest_mem, components.hv_cfg).context("failed to create vm")?;

    // Check that the VM was actually created in protected mode as expected.
    if cfg.protection_type.isolates_memory() && !vm.check_capability(VmCap::Protected) {
        bail!("Failed to create protected VM");
    }

    let vm_clone = vm.try_clone()?;

    // Gunyah has a single irqchip flavor and no ioapic host tube.
    run_vm::<GunyahVcpu, GunyahVm>(
        cfg,
        components,
        &arch_memory_layout,
        vm,
        &mut GunyahIrqChip::new(vm_clone)?,
        None,
        #[cfg(feature = "swap")]
        swap_controller,
    )
}
1821 
1822 /// Choose a default hypervisor if no `--hypervisor` option was specified.
get_default_hypervisor() -> Option<HypervisorKind>1823 fn get_default_hypervisor() -> Option<HypervisorKind> {
1824     let kvm_path = Path::new(KVM_PATH);
1825     if kvm_path.exists() {
1826         return Some(HypervisorKind::Kvm {
1827             device: Some(kvm_path.to_path_buf()),
1828         });
1829     }
1830 
1831     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1832     #[cfg(feature = "geniezone")]
1833     {
1834         let gz_path = Path::new(GENIEZONE_PATH);
1835         if gz_path.exists() {
1836             return Some(HypervisorKind::Geniezone {
1837                 device: Some(gz_path.to_path_buf()),
1838             });
1839         }
1840     }
1841 
1842     #[cfg(all(
1843         unix,
1844         any(target_arch = "arm", target_arch = "aarch64"),
1845         feature = "gunyah"
1846     ))]
1847     {
1848         let gunyah_path = Path::new(GUNYAH_PATH);
1849         if gunyah_path.exists() {
1850             return Some(HypervisorKind::Gunyah {
1851                 device: Some(gunyah_path.to_path_buf()),
1852             });
1853         }
1854     }
1855 
1856     None
1857 }
1858 
run_config(cfg: Config) -> Result<ExitState>1859 pub fn run_config(cfg: Config) -> Result<ExitState> {
1860     let components = setup_vm_components(&cfg)?;
1861 
1862     let hypervisor = cfg
1863         .hypervisor
1864         .clone()
1865         .or_else(get_default_hypervisor)
1866         .context("no enabled hypervisor")?;
1867 
1868     debug!("creating hypervisor: {:?}", hypervisor);
1869 
1870     match hypervisor {
1871         HypervisorKind::Kvm { device } => run_kvm(device.as_deref(), cfg, components),
1872         #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1873         #[cfg(feature = "geniezone")]
1874         HypervisorKind::Geniezone { device } => run_gz(device.as_deref(), cfg, components),
1875         #[cfg(all(
1876             unix,
1877             any(target_arch = "arm", target_arch = "aarch64"),
1878             feature = "gunyah"
1879         ))]
1880         HypervisorKind::Gunyah { device } => run_gunyah(device.as_deref(), cfg, components),
1881     }
1882 }
1883 
/// Hypervisor-independent VM setup and run loop entry.
///
/// Wires up control tubes, devices, the IOMMU, PCI (incl. hotplug on x86_64),
/// builds the architecture-specific VM via `Arch::build_vm`, and finally hands
/// everything to `run_control`, whose exit state is returned.
fn run_vm<Vcpu, V>(
    cfg: Config,
    #[allow(unused_mut)] mut components: VmComponents,
    arch_memory_layout: &<Arch as LinuxArch>::ArchMemoryLayout,
    mut vm: V,
    irq_chip: &mut dyn IrqChipArch,
    ioapic_host_tube: Option<Tube>,
    #[cfg(feature = "swap")] mut swap_controller: Option<SwapController>,
) -> Result<ExitState>
where
    Vcpu: VcpuArch + 'static,
    V: VmArch + 'static,
{
    if cfg.jail_config.is_some() {
        // Printing something to the syslog before entering minijail so that libc's syslogger has a
        // chance to open files necessary for its operation, like `/etc/localtime`. After jailing,
        // access to those files will not be possible.
        info!("crosvm entering multiprocess mode");
    }

    let (metrics_send, metrics_recv) = Tube::directional_pair().context("metrics tube")?;
    metrics::initialize(metrics_send);

    #[cfg(all(feature = "pci-hotplug", feature = "swap"))]
    let swap_device_helper = match &swap_controller {
        Some(swap_controller) => Some(swap_controller.create_device_helper()?),
        None => None,
    };
    // pci-hotplug is only implemented for x86_64 for now, attempting to use it on other platform
    // would crash.
    #[cfg(all(feature = "pci-hotplug", not(target_arch = "x86_64")))]
    if cfg.pci_hotplug_slots.is_some() {
        bail!("pci-hotplug is not implemented for non x86_64 architecture");
    }
    // hotplug_manager must be created before vm is started since it forks jail warden process.
    #[cfg(feature = "pci-hotplug")]
    // TODO(293801301): Remove unused_mut after aarch64 support
    #[allow(unused_mut)]
    let mut hotplug_manager = if cfg.pci_hotplug_slots.is_some() {
        Some(PciHotPlugManager::new(
            vm.get_memory().clone(),
            &cfg,
            #[cfg(feature = "swap")]
            swap_device_helper,
        )?)
    } else {
        None
    };

    #[cfg(feature = "usb")]
    let (usb_control_tube, usb_provider) =
        DeviceProvider::new().context("failed to create usb provider")?;

    // Masking signals is inherently dangerous, since this can persist across clones/execs. Do this
    // before any jailed devices have been spawned, so that we can catch any of them that fail very
    // quickly.
    let sigchld_fd = SignalFd::new(libc::SIGCHLD).context("failed to create signalfd")?;

    let control_server_socket = match &cfg.socket_path {
        Some(path) => Some(UnlinkUnixSeqpacketListener(
            UnixSeqpacketListener::bind(path).context("failed to create control server")?,
        )),
        None => None,
    };

    // Every control tube created during setup is funneled through this closure
    // so they all end up in one list handed to run_control().
    let mut all_control_tubes = Vec::new();
    let mut add_control_tube = |t| all_control_tubes.push(t);

    if let Some(ioapic_host_tube) = ioapic_host_tube {
        add_control_tube(AnyControlTube::IrqTube(ioapic_host_tube));
    }

    // (battery type, optional jail) tuple consumed by Arch::build_vm below.
    let battery = if cfg.battery_config.is_some() {
        #[cfg_attr(
            not(feature = "power-monitor-powerd"),
            allow(clippy::manual_map, clippy::needless_match, unused_mut)
        )]
        let jail = if let Some(jail_config) = &cfg.jail_config {
            let mut config = SandboxConfig::new(jail_config, "battery");
            #[cfg(feature = "power-monitor-powerd")]
            {
                config.bind_mounts = true;
            }
            let mut jail =
                create_sandbox_minijail(&jail_config.pivot_root, MAX_OPEN_FILES_DEFAULT, &config)?;

            // Setup a bind mount to the system D-Bus socket if the powerd monitor is used.
            #[cfg(feature = "power-monitor-powerd")]
            {
                let system_bus_socket_path = Path::new("/run/dbus/system_bus_socket");
                jail.mount_bind(system_bus_socket_path, system_bus_socket_path, true)?;
            }
            Some(jail)
        } else {
            None
        };
        (cfg.battery_config.as_ref().map(|c| c.type_), jail)
    } else {
        (cfg.battery_config.as_ref().map(|c| c.type_), None)
    };

    let (vm_evt_wrtube, vm_evt_rdtube) =
        Tube::directional_pair().context("failed to create vm event tube")?;

    let pstore_size = components.pstore.as_ref().map(|pstore| pstore.size as u64);
    let mut sys_allocator = SystemAllocator::new(
        Arch::get_system_allocator_config(&vm, arch_memory_layout),
        pstore_size,
        &cfg.mmio_address_ranges,
    )
    .context("failed to create system allocator")?;

    // The ramoops (pstore) region, if requested, is carved out of the
    // allocator's reserved region.
    let ramoops_region = match &components.pstore {
        Some(pstore) => Some(
            arch::pstore::create_memory_region(
                &mut vm,
                sys_allocator.reserved_region().unwrap(),
                pstore,
            )
            .context("failed to allocate pstore region")?,
        ),
        None => None,
    };

    create_file_backed_mappings(&cfg, &mut vm, &mut sys_allocator)?;

    #[cfg(feature = "gpu")]
    // Hold on to the render server jail so it keeps running until we exit run_vm()
    let (_render_server_jail, render_server_fd) =
        if let Some(parameters) = &cfg.gpu_render_server_parameters {
            let (jail, fd) = start_gpu_render_server(&cfg, parameters)?;
            (Some(ScopedMinijail(jail)), Some(fd))
        } else {
            (None, None)
        };

    let mut iommu_attached_endpoints: BTreeMap<u32, Arc<Mutex<Box<dyn MemoryMapperTrait>>>> =
        BTreeMap::new();
    let mut iova_max_addr: Option<u64> = None;

    let mut vfio_container_manager = VfioContainerManager::new();

    #[cfg(feature = "registered_events")]
    let (reg_evt_wrtube, reg_evt_rdtube) =
        Tube::directional_pair().context("failed to create registered event tube")?;

    let mut worker_process_pids = BTreeSet::new();

    // Instantiate every configured device; this also populates the IOMMU
    // endpoint map, iova_max_addr, and the control-tube list as side effects.
    let mut devices = create_devices(
        &cfg,
        &mut vm,
        &mut sys_allocator,
        &mut add_control_tube,
        &vm_evt_wrtube,
        &mut iommu_attached_endpoints,
        #[cfg(feature = "usb")]
        usb_provider,
        #[cfg(feature = "gpu")]
        render_server_fd,
        &mut iova_max_addr,
        #[cfg(feature = "registered_events")]
        &reg_evt_wrtube,
        &mut vfio_container_manager,
        &mut worker_process_pids,
    )?;

    #[cfg(feature = "pci-hotplug")]
    // TODO(293801301): Remove unused_variables after aarch64 support
    #[allow(unused_variables)]
    let pci_hotplug_slots = cfg.pci_hotplug_slots;
    #[cfg(not(feature = "pci-hotplug"))]
    #[allow(unused_variables)]
    let pci_hotplug_slots: Option<u8> = None;
    #[cfg(target_arch = "x86_64")]
    let hp_stub = create_pure_virtual_pcie_root_port(
        &mut sys_allocator,
        &mut add_control_tube,
        &mut devices,
        pci_hotplug_slots.unwrap_or(1),
    )?;

    arch::assign_pci_addresses(&mut devices, &mut sys_allocator)?;

    let pci_devices: Vec<&dyn PciDevice> = devices
        .iter()
        .filter_map(|d| (d.0).as_pci_device())
        .collect();

    // Collect (virtio device, PCI address) pairs; devices without an assigned
    // PCI address are dropped by the zip(...).next() dance.
    let virtio_devices: Vec<(&dyn VirtioDevice, devices::PciAddress)> = pci_devices
        .into_iter()
        .flat_map(|s| {
            if let Some(virtio_pci_device) = s.as_virtio_pci_device() {
                std::iter::zip(
                    Some(virtio_pci_device.virtio_device()),
                    virtio_pci_device.pci_address(),
                )
                .next()
            } else {
                None
            }
        })
        .collect();

    let mut open_firmware_device_paths: Vec<(Vec<u8>, usize)> = virtio_devices
        .iter()
        .flat_map(|s| (s.0).bootorder_fw_cfg(s.1.dev))
        .collect();

    // order the OpenFirmware device paths, in ascending order, by their boot_index
    open_firmware_device_paths.sort_by(|a, b| (a.1).cmp(&(b.1)));

    // "/pci@iocf8/" is x86 specific and represents the root at the system bus port
    let mut bootorder_fw_cfg_blob =
        open_firmware_device_paths
            .into_iter()
            .fold(Vec::new(), |a, b| {
                a.into_iter()
                    .chain("/pci@i0cf8/".as_bytes().iter().copied())
                    .chain(b.0)
                    .chain("\n".as_bytes().iter().copied())
                    .collect()
            });

    // the "bootorder" file is expected to end with a null terminator
    bootorder_fw_cfg_blob.push(0);

    components.bootorder_fw_cfg_blob = bootorder_fw_cfg_blob;

    // if the bootindex argument was given, we want to make sure that fw_cfg is enabled so the
    // "bootorder" file can be accessed by the guest.
    components.fw_cfg_enable |= components.bootorder_fw_cfg_blob.len() > 1;

    let (translate_response_senders, request_rx) = setup_virtio_access_platform(
        &mut sys_allocator,
        &mut iommu_attached_endpoints,
        &mut devices,
    )?;

    #[cfg(target_arch = "x86_64")]
    let iommu_bus_ranges = hp_stub.iommu_bus_ranges;
    #[cfg(not(target_arch = "x86_64"))]
    let iommu_bus_ranges = Vec::new();

    // Create the virtio-iommu device only when something is attached to it (or
    // hotplug isolation needs it); its host tube is passed to run_control().
    let iommu_host_tube = if !iommu_attached_endpoints.is_empty()
        || (cfg.vfio_isolate_hotplug && !iommu_bus_ranges.is_empty())
    {
        let (iommu_host_tube, iommu_device_tube) = Tube::pair().context("failed to create tube")?;
        let iommu_dev = create_iommu_device(
            cfg.protection_type,
            &cfg.jail_config,
            iova_max_addr.unwrap_or(u64::MAX),
            iommu_attached_endpoints,
            iommu_bus_ranges,
            translate_response_senders,
            request_rx,
            iommu_device_tube,
        )?;

        let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
        add_control_tube(AnyControlTube::IrqTube(msi_host_tube));
        let (ioevent_host_tube, ioevent_device_tube) =
            Tube::pair().context("failed to create ioevent tube")?;
        add_control_tube(
            VmMemoryTube {
                tube: ioevent_host_tube,
                expose_with_viommu: false,
            }
            .into(),
        );
        let (host_tube, device_tube) =
            Tube::pair().context("failed to create device control tube")?;
        add_control_tube(TaggedControlTube::Vm(host_tube).into());
        let mut dev = VirtioPciDevice::new(
            vm.get_memory().clone(),
            iommu_dev.dev,
            msi_device_tube,
            cfg.disable_virtio_intx,
            None,
            VmMemoryClient::new(ioevent_device_tube),
            device_tube,
        )
        .context("failed to create virtio pci dev")?;
        // early reservation for viommu.
        dev.allocate_address(&mut sys_allocator)
            .context("failed to allocate resources early for virtio pci dev")?;
        let dev = Box::new(dev);
        devices.push((dev, iommu_dev.jail));
        Some(iommu_host_tube)
    } else {
        None
    };

    // Let each PCI device amend the ACPI tables; the tables are threaded
    // through components.acpi_sdts from one device to the next.
    #[cfg(target_arch = "x86_64")]
    for device in devices
        .iter_mut()
        .filter_map(|(dev, _)| dev.as_pci_device_mut())
    {
        let sdts = device
            .generate_acpi(components.acpi_sdts)
            .or_else(|| {
                error!("ACPI table generation error");
                None
            })
            .ok_or_else(|| anyhow!("failed to generate ACPI table"))?;
        components.acpi_sdts = sdts;
    }

    // KVM_CREATE_VCPU uses apic id for x86 and uses cpu id for others.
    let mut vcpu_ids = Vec::new();

    // Used by s2idle support to signal when the guest has suspended.
    let guest_suspended_cvar = if cfg.force_s2idle {
        Some(Arc::new((Mutex::new(false), Condvar::new())))
    } else {
        None
    };

    let dt_overlays = cfg
        .device_tree_overlay
        .iter()
        .map(|o| {
            Ok(DtbOverlay {
                file: open_file_or_duplicate(o.path.as_path(), OpenOptions::new().read(true))
                    .with_context(|| {
                        format!("failed to open device tree overlay {}", o.path.display())
                    })?,
                do_filter: o.filter_devs,
            })
        })
        .collect::<Result<Vec<DtbOverlay>>>()?;

    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    let vcpu_domain_paths = components.vcpu_domain_paths.clone();

    // Hand everything to the architecture layer to assemble the runnable VM.
    let mut linux = Arch::build_vm::<V, Vcpu>(
        components,
        arch_memory_layout,
        &vm_evt_wrtube,
        &mut sys_allocator,
        &cfg.serial_parameters,
        simple_jail(&cfg.jail_config, "serial_device")?,
        battery,
        vm,
        ramoops_region,
        devices,
        irq_chip,
        &mut vcpu_ids,
        cfg.dump_device_tree_blob.clone(),
        simple_jail(&cfg.jail_config, "serial_device")?,
        #[cfg(target_arch = "x86_64")]
        simple_jail(&cfg.jail_config, "block_device")?,
        #[cfg(target_arch = "x86_64")]
        simple_jail(&cfg.jail_config, "fw_cfg_device")?,
        #[cfg(feature = "swap")]
        &mut swap_controller,
        guest_suspended_cvar.clone(),
        dt_overlays,
        cfg.fdt_position,
        cfg.no_pmu,
    )
    .context("the architecture failed to build the vm")?;

    for tube in linux.vm_request_tubes.drain(..) {
        add_control_tube(TaggedControlTube::Vm(tube).into());
    }

    // x86_64-only: register hotplug buses / power notifiers and spawn the
    // dedicated pci_root worker thread (see start_pci_root_worker for why).
    #[cfg(target_arch = "x86_64")]
    let (hp_control_tube, hp_worker_tube) = mpsc::channel();
    #[cfg(all(feature = "pci-hotplug", target_arch = "x86_64"))]
    if let Some(hotplug_manager) = &mut hotplug_manager {
        hotplug_manager.set_rootbus_controller(hp_control_tube.clone())?;
    }
    #[cfg(target_arch = "x86_64")]
    let hp_thread = {
        for (bus_num, hp_bus) in hp_stub.hotplug_buses.into_iter() {
            #[cfg(feature = "pci-hotplug")]
            if let Some(hotplug_manager) = &mut hotplug_manager {
                hotplug_manager.add_port(hp_bus)?;
            } else {
                linux.hotplug_bus.insert(bus_num, hp_bus);
            }
            #[cfg(not(feature = "pci-hotplug"))]
            linux.hotplug_bus.insert(bus_num, hp_bus);
        }

        if let Some(pm) = &linux.pm {
            for (gpe, notify_dev) in hp_stub.gpe_notify_devs.into_iter() {
                pm.lock().register_gpe_notify_dev(gpe, notify_dev);
            }
            for (bus, notify_dev) in hp_stub.pme_notify_devs.into_iter() {
                pm.lock().register_pme_notify_dev(bus, notify_dev);
            }
        }

        let (hp_vm_mem_host_tube, hp_vm_mem_worker_tube) =
            Tube::pair().context("failed to create tube")?;
        add_control_tube(
            VmMemoryTube {
                tube: hp_vm_mem_host_tube,
                expose_with_viommu: false,
            }
            .into(),
        );

        let supports_readonly_mapping = linux.vm.supports_readonly_mapping();
        let pci_root = linux.root_config.clone();
        std::thread::Builder::new()
            .name("pci_root".to_string())
            .spawn(move || {
                start_pci_root_worker(
                    supports_readonly_mapping,
                    pci_root,
                    hp_worker_tube,
                    hp_vm_mem_worker_tube,
                )
            })?
    };

    let flags = RutabagaGrallocBackendFlags::new().disable_vulkano();
    let gralloc = RutabagaGralloc::new(flags).context("failed to create gralloc")?;

    // Enter the main control loop; its exit state is our exit state.
    run_control(
        linux,
        sys_allocator,
        cfg,
        control_server_socket,
        all_control_tubes,
        #[cfg(feature = "usb")]
        usb_control_tube,
        vm_evt_rdtube,
        vm_evt_wrtube,
        sigchld_fd,
        gralloc,
        vcpu_ids,
        iommu_host_tube,
        #[cfg(target_arch = "x86_64")]
        hp_control_tube,
        #[cfg(target_arch = "x86_64")]
        hp_thread,
        #[cfg(feature = "pci-hotplug")]
        hotplug_manager,
        #[cfg(feature = "swap")]
        swap_controller,
        #[cfg(feature = "registered_events")]
        reg_evt_rdtube,
        guest_suspended_cvar,
        metrics_recv,
        vfio_container_manager,
        worker_process_pids,
        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        vcpu_domain_paths,
    )
}
2336 
// Hotplug commands can deadlock when they acquire the PCI root lock from the
// VM control thread. The deadlock arises when the VM control thread (thread A)
// is handling a hotplug command and tries to take the PCI root lock, but the
// lock is already held by a device on thread B, which is itself blocked
// sending a VM control request that thread A must handle and waiting for the
// response. Thread A can never make progress, so the two threads deadlock.
// To resolve this, we add this worker thread and push all work that locks the
// PCI root onto it.
2345 #[cfg(target_arch = "x86_64")]
start_pci_root_worker( supports_readonly_mapping: bool, pci_root: Arc<Mutex<PciRoot>>, hp_device_tube: mpsc::Receiver<PciRootCommand>, vm_control_tube: Tube, )2346 fn start_pci_root_worker(
2347     supports_readonly_mapping: bool,
2348     pci_root: Arc<Mutex<PciRoot>>,
2349     hp_device_tube: mpsc::Receiver<PciRootCommand>,
2350     vm_control_tube: Tube,
2351 ) {
2352     struct PciMmioMapperTube {
2353         supports_readonly_mapping: bool,
2354         vm_control_tube: Tube,
2355         registered_regions: BTreeMap<u32, VmMemoryRegionId>,
2356         next_id: u32,
2357     }
2358 
2359     impl PciMmioMapper for PciMmioMapperTube {
2360         fn supports_readonly_mapping(&self) -> bool {
2361             self.supports_readonly_mapping
2362         }
2363 
2364         fn add_mapping(&mut self, addr: GuestAddress, shmem: &SharedMemory) -> anyhow::Result<u32> {
2365             let shmem = shmem
2366                 .try_clone()
2367                 .context("failed to create new SharedMemory")?;
2368             self.vm_control_tube
2369                 .send(&VmMemoryRequest::RegisterMemory {
2370                     source: VmMemorySource::SharedMemory(shmem),
2371                     dest: VmMemoryDestination::GuestPhysicalAddress(addr.0),
2372                     prot: Protection::read(),
2373                     cache: MemCacheType::CacheCoherent,
2374                 })
2375                 .context("failed to send request")?;
2376             match self.vm_control_tube.recv::<VmMemoryResponse>() {
2377                 Ok(VmMemoryResponse::RegisterMemory { region_id, .. }) => {
2378                     let cur_id = self.next_id;
2379                     self.registered_regions.insert(cur_id, region_id);
2380                     self.next_id += 1;
2381                     Ok(cur_id)
2382                 }
2383                 res => bail!("Bad response: {:?}", res),
2384             }
2385         }
2386     }
2387 
2388     let mut mapper = PciMmioMapperTube {
2389         supports_readonly_mapping,
2390         vm_control_tube,
2391         registered_regions: BTreeMap::new(),
2392         next_id: 0,
2393     };
2394 
2395     loop {
2396         match hp_device_tube.recv() {
2397             Ok(cmd) => match cmd {
2398                 PciRootCommand::Add(addr, device) => {
2399                     if let Err(e) = pci_root.lock().add_device(addr, device, &mut mapper) {
2400                         error!("failed to add hotplugged device to PCI root port: {}", e);
2401                     }
2402                 }
2403                 PciRootCommand::AddBridge(pci_bus) => {
2404                     if let Err(e) = pci_root.lock().add_bridge(pci_bus) {
2405                         error!("failed to add hotplugged bridge to PCI root port: {}", e);
2406                     }
2407                 }
2408                 PciRootCommand::Remove(addr) => {
2409                     pci_root.lock().remove_device(addr);
2410                 }
2411                 PciRootCommand::Kill => break,
2412             },
2413             Err(e) => {
2414                 error!("Error: pci root worker channel closed: {}", e);
2415                 break;
2416             }
2417         }
2418     }
2419 }
2420 
2421 #[cfg(target_arch = "x86_64")]
get_hp_bus<V: VmArch, Vcpu: VcpuArch>( linux: &RunnableLinuxVm<V, Vcpu>, host_addr: PciAddress, ) -> Result<Arc<Mutex<dyn HotPlugBus>>>2422 fn get_hp_bus<V: VmArch, Vcpu: VcpuArch>(
2423     linux: &RunnableLinuxVm<V, Vcpu>,
2424     host_addr: PciAddress,
2425 ) -> Result<Arc<Mutex<dyn HotPlugBus>>> {
2426     for (_, hp_bus) in linux.hotplug_bus.iter() {
2427         if hp_bus.lock().is_match(host_addr).is_some() {
2428             return Ok(hp_bus.clone());
2429         }
2430     }
2431     Err(anyhow!("Failed to find a suitable hotplug bus"))
2432 }
2433 
#[cfg(target_arch = "x86_64")]
/// Hotplugs the device described by `device` into a running VM.
///
/// Upstream/downstream ports are registered as emulated PCI bridges backed by
/// the host port, and the new bridge's secondary bus is recorded in
/// `linux.hotplug_bus` for later hotplug operations. Endpoints are opened as
/// VFIO devices and, when `iommu_host_tube` is set, attached to the
/// virtio-iommu. Finally the device is announced on its hotplug bus and, if
/// `device.hp_interrupt` is set, a hot plug interrupt is sent to the guest.
fn add_hotplug_device<V: VmArch, Vcpu: VcpuArch>(
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    cfg: &Config,
    add_control_tube: &mut impl FnMut(AnyControlTube),
    hp_control_tube: &mpsc::Sender<PciRootCommand>,
    iommu_host_tube: Option<&Tube>,
    device: &HotPlugDeviceInfo,
    #[cfg(feature = "swap")] swap_controller: &mut Option<SwapController>,
    vfio_container_manager: &mut VfioContainerManager,
) -> Result<()> {
    let host_addr = PciAddress::from_path(&device.path)
        .context("failed to parse hotplug device's PCI address")?;
    // The guest-side bus the new device will be plugged into.
    let hp_bus = get_hp_bus(linux, host_addr)?;

    let (hotplug_key, pci_address) = match device.device_type {
        HotPlugDeviceType::UpstreamPort | HotPlugDeviceType::DownstreamPort => {
            // Ports need a VM control tube and an MSI (IRQ) tube; the host
            // ends are handed to the control loop via `add_control_tube`.
            let (vm_host_tube, vm_device_tube) = Tube::pair().context("failed to create tube")?;
            add_control_tube(TaggedControlTube::Vm(vm_host_tube).into());
            let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
            add_control_tube(AnyControlTube::IrqTube(msi_host_tube));
            let pcie_host = PcieHostPort::new(device.path.as_path(), vm_device_tube)?;
            let (hotplug_key, pci_bridge) = match device.device_type {
                HotPlugDeviceType::UpstreamPort => {
                    let hotplug_key = HotPlugKey::HostUpstreamPort { host_addr };
                    let pcie_upstream_port = Arc::new(Mutex::new(PcieUpstreamPort::new_from_host(
                        pcie_host, true,
                    )?));
                    let pci_bridge =
                        Box::new(PciBridge::new(pcie_upstream_port.clone(), msi_device_tube));
                    // Make the bridge's secondary bus reachable for future
                    // hotplug operations beneath this port.
                    linux
                        .hotplug_bus
                        .insert(pci_bridge.get_secondary_num(), pcie_upstream_port);
                    (hotplug_key, pci_bridge)
                }
                HotPlugDeviceType::DownstreamPort => {
                    let hotplug_key = HotPlugKey::HostDownstreamPort { host_addr };
                    let pcie_downstream_port = Arc::new(Mutex::new(
                        PcieDownstreamPort::new_from_host(pcie_host, true)?,
                    ));
                    let pci_bridge = Box::new(PciBridge::new(
                        pcie_downstream_port.clone(),
                        msi_device_tube,
                    ));
                    linux
                        .hotplug_bus
                        .insert(pci_bridge.get_secondary_num(), pcie_downstream_port);
                    (hotplug_key, pci_bridge)
                }
                // The outer match arm restricts us to the two port types.
                _ => {
                    bail!("Impossible to reach here")
                }
            };
            let pci_address = Arch::register_pci_device(
                linux,
                pci_bridge,
                None,
                sys_allocator,
                hp_control_tube,
                #[cfg(feature = "swap")]
                swap_controller,
            )?;

            (hotplug_key, pci_address)
        }
        HotPlugDeviceType::EndPoint => {
            let hotplug_key = HotPlugKey::HostVfio { host_addr };
            let (vfio_device, jail, viommu_mapper) = create_vfio_device(
                &cfg.jail_config,
                &linux.vm,
                sys_allocator,
                add_control_tube,
                &device.path,
                true,
                None,
                None,
                None,
                // Route DMA through the virtio-iommu only when a tube to it
                // was provided (i.e. hotplug isolation is enabled).
                if iommu_host_tube.is_some() {
                    IommuDevType::VirtioIommu
                } else {
                    IommuDevType::NoIommu
                },
                None,
                vfio_container_manager,
            )?;
            let vfio_pci_device = match vfio_device {
                VfioDeviceVariant::Pci(pci) => Box::new(pci),
                VfioDeviceVariant::Platform(_) => bail!("vfio platform hotplug not supported"),
            };
            let pci_address = Arch::register_pci_device(
                linux,
                vfio_pci_device,
                jail,
                sys_allocator,
                hp_control_tube,
                #[cfg(feature = "swap")]
                swap_controller,
            )?;
            // Register the endpoint with the virtio-iommu so its DMA mappings
            // are managed per-device.
            if let Some(iommu_host_tube) = iommu_host_tube {
                let endpoint_addr = pci_address.to_u32();
                let vfio_wrapper = viommu_mapper.context("expected mapper")?;
                let descriptor = vfio_wrapper.clone_as_raw_descriptor()?;
                let request =
                    VirtioIOMMURequest::VfioCommand(VirtioIOMMUVfioCommand::VfioDeviceAdd {
                        endpoint_addr,
                        wrapper_id: vfio_wrapper.id(),
                        container: {
                            // SAFETY:
                            // Safe because the descriptor is uniquely owned by `descriptor`.
                            unsafe { File::from_raw_descriptor(descriptor) }
                        },
                    });
                match virtio_iommu_request(iommu_host_tube, &request)
                    .map_err(|_| VirtioIOMMUVfioError::SocketFailed)?
                {
                    VirtioIOMMUResponse::VfioResponse(VirtioIOMMUVfioResult::Ok) => (),
                    resp => bail!("Unexpected message response: {:?}", resp),
                }
            }

            (hotplug_key, pci_address)
        }
    };
    hp_bus.lock().add_hotplug_device(hotplug_key, pci_address);
    // Optionally notify the guest with a hot plug interrupt.
    if device.hp_interrupt {
        hp_bus.lock().hot_plug(pci_address)?;
    }
    Ok(())
}
2564 
#[cfg(feature = "pci-hotplug")]
/// Hotplugs a virtio-net device described by `net_param` into a running VM.
///
/// Creates the MSI, ioevent, and VM-control tube pairs the device needs,
/// registers the host ends with the control loop via `add_control_tube`, then
/// hands the assembled `NetResourceCarrier` to the `PciHotPlugManager`.
/// On success, returns the guest PCI bus number assigned to the device.
fn add_hotplug_net<V: VmArch, Vcpu: VcpuArch>(
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    add_control_tube: &mut impl FnMut(AnyControlTube),
    hotplug_manager: &mut PciHotPlugManager,
    net_param: NetParameters,
) -> Result<u8> {
    // Interrupt (MSI) tube: host end to the control loop, device end kept.
    let (msi_host_tube, msi_device_tube) = Tube::pair().context("create tube")?;
    add_control_tube(AnyControlTube::IrqTube(msi_host_tube));
    // ioevent tube; the device side is wrapped in a VmMemoryClient.
    let (ioevent_host_tube, ioevent_device_tube) = Tube::pair().context("create tube")?;
    let ioevent_vm_memory_client = VmMemoryClient::new(ioevent_device_tube);
    add_control_tube(
        VmMemoryTube {
            tube: ioevent_host_tube,
            expose_with_viommu: false,
        }
        .into(),
    );
    // General VM control tube for the new device.
    let (vm_control_host_tube, vm_control_device_tube) = Tube::pair().context("create tube")?;
    add_control_tube(TaggedControlTube::Vm(vm_control_host_tube).into());
    let net_carrier_device = NetResourceCarrier::new(
        net_param,
        msi_device_tube,
        ioevent_vm_memory_client,
        vm_control_device_tube,
    );
    hotplug_manager.hotplug_device(
        vec![ResourceCarrier::VirtioNet(net_carrier_device)],
        linux,
        sys_allocator,
    )
}
2598 
#[cfg(feature = "pci-hotplug")]
/// Dispatches a network hotplug control command to the matching handler and
/// returns the resulting `VmResponse`.
fn handle_hotplug_net_command<V: VmArch, Vcpu: VcpuArch>(
    net_cmd: NetControlCommand,
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    add_control_tube: &mut impl FnMut(AnyControlTube),
    hotplug_manager: &mut PciHotPlugManager,
) -> VmResponse {
    match net_cmd {
        // Plug a new virtio-net device backed by the named host TAP interface.
        NetControlCommand::AddTap(tap_name) => handle_hotplug_net_add(
            linux,
            sys_allocator,
            add_control_tube,
            hotplug_manager,
            &tap_name,
        ),
        // Unplug the device previously placed on the given guest PCI bus.
        NetControlCommand::RemoveTap(bus) => {
            handle_hotplug_net_remove(linux, sys_allocator, hotplug_manager, bus)
        }
    }
}
2620 
#[cfg(feature = "pci-hotplug")]
/// Handles a request to hotplug a virtio-net device backed by an existing
/// host TAP interface named `tap_name`.
///
/// On success the response carries the guest PCI bus the device landed on;
/// on failure the error is reported back as a string response.
fn handle_hotplug_net_add<V: VmArch, Vcpu: VcpuArch>(
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    add_control_tube: &mut impl FnMut(AnyControlTube),
    hotplug_manager: &mut PciHotPlugManager,
    tap_name: &str,
) -> VmResponse {
    // Attach by TAP name with defaults everywhere else (no MAC override, no
    // vhost-net, single queue pair, split virtqueues, auto PCI address).
    let net_param = NetParameters {
        mode: NetParametersMode::TapName {
            tap_name: tap_name.to_owned(),
            mac: None,
        },
        vhost_net: None,
        vq_pairs: None,
        packed_queue: false,
        pci_address: None,
    };
    match add_hotplug_net(
        linux,
        sys_allocator,
        add_control_tube,
        hotplug_manager,
        net_param,
    ) {
        Ok(pci_bus) => VmResponse::PciHotPlugResponse { bus: pci_bus },
        Err(e) => VmResponse::ErrString(format!("{:?}", e)),
    }
}
2653 
#[cfg(feature = "pci-hotplug")]
/// Removes the hotplugged device on guest PCI bus `bus`, converting the
/// outcome into a `VmResponse`.
fn handle_hotplug_net_remove<V: VmArch, Vcpu: VcpuArch>(
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    hotplug_manager: &mut PciHotPlugManager,
    bus: u8,
) -> VmResponse {
    hotplug_manager
        .remove_hotplug_device(bus, linux, sys_allocator)
        .map_or_else(
            |e| VmResponse::ErrString(format!("{:?}", e)),
            |_| VmResponse::Ok,
        )
}
2666 
2667 #[cfg(target_arch = "x86_64")]
remove_hotplug_bridge<V: VmArch, Vcpu: VcpuArch>( linux: &RunnableLinuxVm<V, Vcpu>, sys_allocator: &mut SystemAllocator, buses_to_remove: &mut Vec<u8>, hotplug_key: HotPlugKey, child_bus: u8, ) -> Result<()>2668 fn remove_hotplug_bridge<V: VmArch, Vcpu: VcpuArch>(
2669     linux: &RunnableLinuxVm<V, Vcpu>,
2670     sys_allocator: &mut SystemAllocator,
2671     buses_to_remove: &mut Vec<u8>,
2672     hotplug_key: HotPlugKey,
2673     child_bus: u8,
2674 ) -> Result<()> {
2675     for (bus_num, hp_bus) in linux.hotplug_bus.iter() {
2676         let mut hp_bus_lock = hp_bus.lock();
2677         if let Some(pci_addr) = hp_bus_lock.get_hotplug_device(hotplug_key) {
2678             sys_allocator.release_pci(pci_addr.bus, pci_addr.dev, pci_addr.func);
2679             hp_bus_lock.hot_unplug(pci_addr)?;
2680             buses_to_remove.push(child_bus);
2681             if hp_bus_lock.is_empty() {
2682                 if let Some(hotplug_key) = hp_bus_lock.get_hotplug_key() {
2683                     remove_hotplug_bridge(
2684                         linux,
2685                         sys_allocator,
2686                         buses_to_remove,
2687                         hotplug_key,
2688                         *bus_num,
2689                     )?;
2690                 }
2691             }
2692             return Ok(());
2693         }
2694     }
2695 
2696     Err(anyhow!(
2697         "Can not find device {:?} on hotplug buses",
2698         hotplug_key
2699     ))
2700 }
2701 
#[cfg(target_arch = "x86_64")]
/// Hot-unplugs the device described by `device`, detaching it from the
/// virtio-iommu when one is in use, and tears down any emulated bridges that
/// become empty as a result.
fn remove_hotplug_device<V: VmArch, Vcpu: VcpuArch>(
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    iommu_host_tube: Option<&Tube>,
    device: &HotPlugDeviceInfo,
) -> Result<()> {
    let host_addr = PciAddress::from_path(&device.path)?;
    // Reconstruct the key the device was registered under when it was added.
    let hotplug_key = match device.device_type {
        HotPlugDeviceType::UpstreamPort => HotPlugKey::HostUpstreamPort { host_addr },
        HotPlugDeviceType::DownstreamPort => HotPlugKey::HostDownstreamPort { host_addr },
        HotPlugDeviceType::EndPoint => HotPlugKey::HostVfio { host_addr },
    };

    // Find the bus hosting the device; clone the Arc (and copy the bus
    // number) so the iteration borrow ends before we mutate anything.
    let hp_bus = linux
        .hotplug_bus
        .iter()
        .find(|(_, hp_bus)| {
            let hp_bus = hp_bus.lock();
            hp_bus.get_hotplug_device(hotplug_key).is_some()
        })
        .map(|(bus_num, hp_bus)| (*bus_num, hp_bus.clone()));

    if let Some((bus_num, hp_bus)) = hp_bus {
        let mut buses_to_remove = Vec::new();
        let mut removed_key = None;
        let mut hp_bus_lock = hp_bus.lock();
        if let Some(pci_addr) = hp_bus_lock.get_hotplug_device(hotplug_key) {
            // Detach the endpoint from the virtio-iommu before unplugging.
            if let Some(iommu_host_tube) = iommu_host_tube {
                let request =
                    VirtioIOMMURequest::VfioCommand(VirtioIOMMUVfioCommand::VfioDeviceDel {
                        endpoint_addr: pci_addr.to_u32(),
                    });
                match virtio_iommu_request(iommu_host_tube, &request)
                    .map_err(|_| VirtioIOMMUVfioError::SocketFailed)?
                {
                    VirtioIOMMUResponse::VfioResponse(VirtioIOMMUVfioResult::Ok) => (),
                    resp => bail!("Unexpected message response: {:?}", resp),
                }
            }
            // Check whether every sibling downstream port (one sharing the
            // same host bus) is already empty. ("simbling" below is a
            // long-standing typo for "sibling", kept in identifiers.)
            let mut empty_simbling = true;
            if let Some(HotPlugKey::HostDownstreamPort { host_addr }) =
                hp_bus_lock.get_hotplug_key()
            {
                let addr_alias = host_addr;
                for (simbling_bus_num, hp_bus) in linux.hotplug_bus.iter() {
                    if *simbling_bus_num != bus_num {
                        let hp_bus_lock = hp_bus.lock();
                        let hotplug_key = hp_bus_lock.get_hotplug_key();
                        if let Some(HotPlugKey::HostDownstreamPort { host_addr }) = hotplug_key {
                            if addr_alias.bus == host_addr.bus && !hp_bus_lock.is_empty() {
                                empty_simbling = false;
                                break;
                            }
                        }
                    }
                }
            }

            // If all sibling downstream ports are empty, do not send hot unplug event for this
            // downstream port. Root port will send one plug out interrupt and remove all
            // the remaining devices
            if !empty_simbling {
                hp_bus_lock.hot_unplug(pci_addr)?;
            }

            sys_allocator.release_pci(pci_addr.bus, pci_addr.dev, pci_addr.func);
            // When the bus is (or becomes) empty, remove its own bridge too.
            if empty_simbling || hp_bus_lock.is_empty() {
                if let Some(hotplug_key) = hp_bus_lock.get_hotplug_key() {
                    removed_key = Some(hotplug_key);
                    remove_hotplug_bridge(
                        linux,
                        sys_allocator,
                        &mut buses_to_remove,
                        hotplug_key,
                        bus_num,
                    )?;
                }
            }
        }

        // Some types of TBT device has a few empty downstream ports. The emulated bridges
        // of these ports won't be removed since no vfio device is connected to our emulated
        // bridges. So we explicitly check all sibling bridges of the removed bridge here,
        // and remove them if the bridge has no child device connected.
        if let Some(HotPlugKey::HostDownstreamPort { host_addr }) = removed_key {
            let addr_alias = host_addr;
            for (simbling_bus_num, hp_bus) in linux.hotplug_bus.iter() {
                if *simbling_bus_num != bus_num {
                    let hp_bus_lock = hp_bus.lock();
                    let hotplug_key = hp_bus_lock.get_hotplug_key();
                    if let Some(HotPlugKey::HostDownstreamPort { host_addr }) = hotplug_key {
                        if addr_alias.bus == host_addr.bus && hp_bus_lock.is_empty() {
                            remove_hotplug_bridge(
                                linux,
                                sys_allocator,
                                &mut buses_to_remove,
                                hotplug_key.unwrap(),
                                *simbling_bus_num,
                            )?;
                        }
                    }
                }
            }
        }
        // Drop every bus collected by the (possibly recursive) bridge removal.
        for bus in buses_to_remove.iter() {
            linux.hotplug_bus.remove(bus);
        }
        return Ok(());
    }

    Err(anyhow!(
        "Can not find device {:?} on hotplug buses",
        hotplug_key
    ))
}
2818 
/// Requests a guest-cooperative suspend and waits (up to 15 seconds) for the
/// guest to report it has entered the suspended state, then triggers the
/// host-side suspend event and finally sends `response` back over `tube`.
///
/// `guest_suspended_cvar` pairs a "guest is suspended" flag with a condvar;
/// the flag is set and the condvar signalled elsewhere (presumably by the
/// guest-suspend notification path — confirm against the signalling site).
pub fn trigger_vm_suspend_and_wait_for_entry(
    guest_suspended_cvar: Arc<(Mutex<bool>, Condvar)>,
    tube: &SendTube,
    response: vm_control::VmResponse,
    suspend_tube: Arc<Mutex<SendTube>>,
    pm: Option<Arc<Mutex<dyn PmResource + Send>>>,
) {
    let (lock, cvar) = &*guest_suspended_cvar;
    let mut guest_suspended = lock.lock();

    // Reset the flag so we only observe a suspension that happens after this
    // request.
    *guest_suspended = false;

    // During suspend also emulate sleepbtn, which allows to suspend VM (if running e.g. acpid and
    // reacts on sleep button events)
    if let Some(pm) = pm {
        pm.lock().slpbtn_evt();
    } else {
        error!("generating sleepbtn during suspend not supported");
    }

    // Wait for notification about guest suspension, if not received after 15sec,
    // proceed anyway.
    let result = cvar.wait_timeout(guest_suspended, std::time::Duration::from_secs(15));
    guest_suspended = result.0;

    if result.1.timed_out() {
        warn!("Guest suspension timeout - proceeding anyway");
    } else if *guest_suspended {
        info!("Guest suspended");
    }

    // Fire the host-side suspend event regardless of whether the guest
    // acknowledged in time.
    if let Err(e) = suspend_tube.lock().send(&true) {
        error!("failed to trigger suspend event: {}", e);
    }
    // Now we ready to send response over the tube and communicate that VM suspend has finished
    if let Err(e) = tube.send(&response) {
        error!("failed to send VmResponse: {}", e);
    }
}
2858 
#[cfg(feature = "pvclock")]
#[derive(Debug)]
/// The action requested by the pvclock device to perform on the main thread.
/// On non-AArch64 targets there is currently no variant (see
/// `send_pvclock_cmd`, which returns `None` there).
enum PvClockAction {
    #[cfg(target_arch = "aarch64")]
    /// Update the counter offset with VmAarch64::set_counter_offset.
    /// Carries the total number of ticks spent suspended.
    SetCounterOffset(u64),
}
2867 
#[cfg(feature = "pvclock")]
/// Sends `command` to the pvclock device over `tube` and interprets the reply.
///
/// Returns `Ok(Some(action))` when the main thread must perform a follow-up
/// action (currently only updating the AArch64 counter offset on resume),
/// `Ok(None)` when nothing further is needed, and an error when the device
/// reported a failure or the tube round-trip failed.
fn send_pvclock_cmd(tube: &Tube, command: PvClockCommand) -> Result<Option<PvClockAction>> {
    tube.send(&command)
        .with_context(|| format!("failed to send pvclock command {:?}", command))?;
    let resp = tube
        .recv::<PvClockCommandResponse>()
        .context("failed to receive pvclock command response")?;
    match resp {
        PvClockCommandResponse::Err(e) => {
            bail!("pvclock encountered error on {:?}: {}", command, e);
        }
        PvClockCommandResponse::DeviceInactive => {
            // Non-fatal: log and carry on without an action.
            warn!("Tried to send {command:?} but pvclock device was inactive");
            Ok(None)
        }
        PvClockCommandResponse::Resumed {
            total_suspended_ticks,
        } => {
            info!("{command:?} completed with {total_suspended_ticks} total_suspended_ticks");
            cfg_if::cfg_if! {
                if #[cfg(target_arch = "aarch64")] {
                    // The main thread applies this via VmAarch64::set_counter_offset.
                    Ok(Some(PvClockAction::SetCounterOffset(total_suspended_ticks)))
                } else {
                    // For non-AArch64 platforms this is handled by directly updating the offset in
                    // shared memory in the pvclock device worker.
                    Ok(None)
                }
            }
        }
        PvClockCommandResponse::Ok => {
            info!("{command:?} completed with {resp:?}");
            Ok(None)
        }
    }
}
2903 
2904 #[cfg(target_arch = "x86_64")]
handle_hotplug_command<V: VmArch, Vcpu: VcpuArch>( linux: &mut RunnableLinuxVm<V, Vcpu>, sys_allocator: &mut SystemAllocator, cfg: &Config, add_control_tube: &mut impl FnMut(AnyControlTube), hp_control_tube: &mpsc::Sender<PciRootCommand>, iommu_host_tube: Option<&Tube>, device: &HotPlugDeviceInfo, add: bool, #[cfg(feature = "swap")] swap_controller: &mut Option<SwapController>, vfio_container_manager: &mut VfioContainerManager, ) -> VmResponse2905 fn handle_hotplug_command<V: VmArch, Vcpu: VcpuArch>(
2906     linux: &mut RunnableLinuxVm<V, Vcpu>,
2907     sys_allocator: &mut SystemAllocator,
2908     cfg: &Config,
2909     add_control_tube: &mut impl FnMut(AnyControlTube),
2910     hp_control_tube: &mpsc::Sender<PciRootCommand>,
2911     iommu_host_tube: Option<&Tube>,
2912     device: &HotPlugDeviceInfo,
2913     add: bool,
2914     #[cfg(feature = "swap")] swap_controller: &mut Option<SwapController>,
2915     vfio_container_manager: &mut VfioContainerManager,
2916 ) -> VmResponse {
2917     let iommu_host_tube = if cfg.vfio_isolate_hotplug {
2918         iommu_host_tube
2919     } else {
2920         None
2921     };
2922 
2923     let ret = if add {
2924         add_hotplug_device(
2925             linux,
2926             sys_allocator,
2927             cfg,
2928             add_control_tube,
2929             hp_control_tube,
2930             iommu_host_tube,
2931             device,
2932             #[cfg(feature = "swap")]
2933             swap_controller,
2934             vfio_container_manager,
2935         )
2936     } else {
2937         remove_hotplug_device(linux, sys_allocator, iommu_host_tube, device)
2938     };
2939 
2940     match ret {
2941         Ok(()) => VmResponse::Ok,
2942         Err(e) => {
2943             error!("handle_hotplug_command failure: {}", e);
2944             VmResponse::Err(base::Error::new(libc::EINVAL))
2945         }
2946     }
2947 }
2948 
/// Borrowed state handed to the VM control-loop request handlers (e.g.
/// `process_vm_request`), bundling every tube and resource a request might
/// need to touch.
struct ControlLoopState<'a, V: VmArch, Vcpu: VcpuArch> {
    linux: &'a mut RunnableLinuxVm<V, Vcpu>,
    cfg: &'a Config,
    sys_allocator: &'a Arc<Mutex<SystemAllocator>>,
    // Registered control tubes, keyed by their control-loop token/index.
    control_tubes: &'a BTreeMap<usize, TaggedControlTube>,
    disk_host_tubes: &'a [Tube],
    #[cfg(feature = "gpu")]
    gpu_control_tube: Option<&'a Tube>,
    #[cfg(feature = "usb")]
    usb_control_tube: &'a Tube,
    #[cfg(target_arch = "x86_64")]
    iommu_host_tube: &'a Option<Arc<Mutex<Tube>>>,
    // Channel to the pci_root worker thread (see start_pci_root_worker).
    #[cfg(target_arch = "x86_64")]
    hp_control_tube: &'a mpsc::Sender<PciRootCommand>,
    // Flag + condvar used by trigger_vm_suspend_and_wait_for_entry.
    guest_suspended_cvar: &'a Option<Arc<(Mutex<bool>, Condvar)>>,
    #[cfg(feature = "pci-hotplug")]
    hotplug_manager: &'a mut Option<PciHotPlugManager>,
    #[cfg(feature = "swap")]
    swap_controller: &'a mut Option<SwapController>,
    // Join handle plus control channel for each vCPU thread.
    vcpu_handles: &'a [(JoinHandle<()>, mpsc::Sender<vm_control::VcpuControl>)],
    #[cfg(feature = "balloon")]
    balloon_tube: Option<&'a mut BalloonTube>,
    device_ctrl_tube: &'a Tube,
    irq_handler_control: &'a Tube,
    #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
    vm_memory_handler_control: &'a Tube,
    #[cfg(feature = "registered_events")]
    registered_evt_tubes: &'a mut HashMap<RegisteredEvent, HashSet<AddressedProtoTube>>,
    #[cfg(feature = "pvclock")]
    pvclock_host_tube: Option<Arc<Tube>>,
    vfio_container_manager: &'a mut VfioContainerManager,
    // NOTE(review): presumably clock state captured at suspend and restored
    // on resume — confirm against the suspend/resume handlers.
    suspended_pvclock_state: &'a mut Option<hypervisor::ClockState>,
    // Per-vCPU (pid, tid) pairs, keyed by vCPU index.
    vcpus_pid_tid: &'a BTreeMap<usize, (u32, u32)>,
}
2983 
/// Outcome of handling one `VmRequest` in `process_vm_request`.
struct VmRequestResult {
    // Response to send back on the requesting tube; `None` means the reply
    // is delivered elsewhere (e.g. by the s2idle wait thread or directly to
    // another control tube) or intentionally omitted.
    response: Option<VmResponse>,
    // When true, the control loop should shut down.
    exit: bool,
}
2988 
2989 impl VmRequestResult {
new(response: Option<VmResponse>, exit: bool) -> Self2990     fn new(response: Option<VmResponse>, exit: bool) -> Self {
2991         VmRequestResult { response, exit }
2992     }
2993 }
2994 
/// Handles a single `VmRequest` received on control tube `id`/`tube`.
///
/// Returns a `VmRequestResult` whose `response` (if `Some`) the caller must
/// send back on `tube`, and whose `exit` flag tells the control loop to shut
/// down. Tubes created by hotplug operations are appended to `add_tubes`;
/// on x86_64 / pci-hotplug builds, IRQ and VM-memory tubes created the same
/// way are forwarded to their handler threads before returning.
fn process_vm_request<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
    state: &mut ControlLoopState<V, Vcpu>,
    id: usize,
    tube: &Tube,
    request: VmRequest,
    #[cfg_attr(
        not(any(target_arch = "x86_64", feature = "pci-hotplug")),
        allow(unused_variables, clippy::ptr_arg)
    )]
    add_tubes: &mut Vec<TaggedControlTube>,
) -> Result<VmRequestResult> {
    #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
    let mut add_irq_control_tubes = Vec::new();
    #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
    let mut add_vm_memory_control_tubes = Vec::new();

    // Sorts tubes produced by hotplug operations into the vector for the
    // worker that will ultimately service each of them.
    #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
    let mut add_control_tube = |t| match t {
        AnyControlTube::DeviceControlTube(_) => {
            panic!("hotplugging DeviceControlTube not supported yet")
        }
        AnyControlTube::IrqTube(t) => add_irq_control_tubes.push(t),
        AnyControlTube::TaggedControlTube(t) => add_tubes.push(t),
        AnyControlTube::VmMemoryTube(t) => add_vm_memory_control_tubes.push(t),
    };

    let response = match request {
        VmRequest::Exit => {
            // Short-circuit: acknowledge and signal the loop to stop.
            return Ok(VmRequestResult::new(Some(VmResponse::Ok), true));
        }
        VmRequest::HotPlugVfioCommand { device, add } => {
            // VFIO hotplug is only implemented on x86_64.
            #[cfg(target_arch = "x86_64")]
            {
                handle_hotplug_command(
                    state.linux,
                    &mut state.sys_allocator.lock(),
                    state.cfg,
                    &mut add_control_tube,
                    state.hp_control_tube,
                    state.iommu_host_tube.as_ref().map(|t| t.lock()).as_deref(),
                    &device,
                    add,
                    #[cfg(feature = "swap")]
                    state.swap_controller,
                    state.vfio_container_manager,
                )
            }

            #[cfg(not(target_arch = "x86_64"))]
            {
                // Suppress warnings.
                let _ = (device, add);
                let _ = &state.vfio_container_manager;
                VmResponse::Ok
            }
        }
        #[cfg(feature = "pci-hotplug")]
        VmRequest::HotPlugNetCommand(net_cmd) => {
            if let Some(hotplug_manager) = state.hotplug_manager.as_mut() {
                handle_hotplug_net_command(
                    net_cmd,
                    state.linux,
                    &mut state.sys_allocator.lock(),
                    &mut add_control_tube,
                    hotplug_manager,
                )
            } else {
                VmResponse::ErrString("PCI hotplug is not enabled.".to_owned())
            }
        }
        #[cfg(feature = "registered_events")]
        VmRequest::RegisterListener { socket_addr, event } => {
            // Reuse the existing tube for this address if one is already
            // registered (for any event); only connect when necessary.
            let (registered_tube, already_registered) =
                find_registered_tube(state.registered_evt_tubes, &socket_addr, event);

            if !already_registered {
                let addr_tube = make_addr_tube_from_maybe_existing(registered_tube, socket_addr)?;

                if let Some(tubes) = state.registered_evt_tubes.get_mut(&event) {
                    tubes.insert(addr_tube);
                } else {
                    state
                        .registered_evt_tubes
                        .insert(event, vec![addr_tube].into_iter().collect());
                }
            }
            VmResponse::Ok
        }
        #[cfg(feature = "registered_events")]
        VmRequest::UnregisterListener { socket_addr, event } => {
            // Drop this address from the one event, then prune empty sets.
            if let Some(tubes) = state.registered_evt_tubes.get_mut(&event) {
                tubes.retain(|t| t.socket_addr != socket_addr);
            }
            state
                .registered_evt_tubes
                .retain(|_, tubes| !tubes.is_empty());
            VmResponse::Ok
        }
        #[cfg(feature = "registered_events")]
        VmRequest::Unregister { socket_addr } => {
            // Drop this address from every event, then prune empty sets.
            for (_, tubes) in state.registered_evt_tubes.iter_mut() {
                tubes.retain(|t| t.socket_addr != socket_addr);
            }
            state
                .registered_evt_tubes
                .retain(|_, tubes| !tubes.is_empty());
            VmResponse::Ok
        }
        #[cfg(feature = "balloon")]
        VmRequest::BalloonCommand(cmd) => {
            if let Some(tube) = state.balloon_tube.as_mut() {
                let Some((r, key)) = tube.send_cmd(cmd, Some(id)) else {
                    return Ok(VmRequestResult::new(None, false));
                };
                // The balloon tube can hand back a queued response destined
                // for a different control tube; deliver it there directly and
                // send nothing on the current tube.
                if key != id {
                    let Some(TaggedControlTube::Vm(tube)) = state.control_tubes.get(&key) else {
                        return Ok(VmRequestResult::new(None, false));
                    };
                    if let Err(e) = tube.send(&r) {
                        error!("failed to send VmResponse: {}", e);
                    }
                    return Ok(VmRequestResult::new(None, false));
                }
                r
            } else {
                VmResponse::Err(base::Error::new(libc::ENOTSUP))
            }
        }
        VmRequest::VcpuPidTid => VmResponse::VcpuPidTidResponse {
            pid_tid_map: state.vcpus_pid_tid.clone(),
        },
        VmRequest::Throttle(vcpu, cycles) => {
            // Fire-and-forget: no response is sent for throttle requests.
            vcpu::kick_vcpu(
                &state.vcpu_handles.get(vcpu),
                state.linux.irq_chip.as_irq_chip(),
                VcpuControl::Throttle(cycles),
            );
            return Ok(VmRequestResult::new(None, false));
        }
        // All remaining request kinds are delegated to `VmRequest::execute`,
        // with pvclock bookkeeping wrapped around resume/suspend.
        _ => {
            if !state.cfg.force_s2idle {
                #[cfg(feature = "pvclock")]
                if let Some(ref pvclock_host_tube) = state.pvclock_host_tube {
                    // Update clock offset when pvclock is used.
                    if let VmRequest::ResumeVcpus = request {
                        let cmd = PvClockCommand::Resume;
                        match send_pvclock_cmd(pvclock_host_tube, cmd.clone()) {
                            Ok(action) => {
                                info!("{:?} command successfully processed", cmd);
                                if let Some(action) = action {
                                    match action {
                                        #[cfg(target_arch = "aarch64")]
                                        PvClockAction::SetCounterOffset(offset) => {
                                            state.linux.vm.set_counter_offset(offset)?;
                                        }
                                    }
                                }
                            }
                            Err(e) => error!("{:?} command failed: {:#}", cmd, e),
                        };
                    }
                }
            }
            // On a transition back to Running, give devices a chance to react
            // before the vCPUs are actually kicked.
            let kick_all_vcpus = |msg| {
                if let VcpuControl::RunState(VmRunMode::Running) = msg {
                    for dev in &state.linux.resume_notify_devices {
                        dev.lock().resume_imminent();
                    }
                }
                vcpu::kick_all_vcpus(state.vcpu_handles, state.linux.irq_chip.as_irq_chip(), msg);
            };
            let response = request.execute(
                &state.linux.vm,
                state.disk_host_tubes,
                &mut state.linux.pm,
                #[cfg(feature = "gpu")]
                state.gpu_control_tube,
                #[cfg(not(feature = "gpu"))]
                None,
                #[cfg(feature = "usb")]
                Some(state.usb_control_tube),
                #[cfg(not(feature = "usb"))]
                None,
                &mut state.linux.bat_control,
                kick_all_vcpus,
                |index, msg| {
                    vcpu::kick_vcpu(
                        &state.vcpu_handles.get(index),
                        state.linux.irq_chip.as_irq_chip(),
                        msg,
                    )
                },
                state.cfg.force_s2idle,
                #[cfg(feature = "swap")]
                state.swap_controller.as_ref(),
                state.device_ctrl_tube,
                state.vcpu_handles.len(),
                state.irq_handler_control,
                || state.linux.irq_chip.snapshot(state.linux.vcpu_count),
                state.suspended_pvclock_state,
            );
            if state.cfg.force_s2idle {
                if let VmRequest::SuspendVcpus = request {
                    // Spawn s2idle wait thread.
                    let send_tube = tube.try_clone_send_tube().unwrap();
                    let suspend_tube = state.linux.suspend_tube.0.clone();
                    let guest_suspended_cvar = state.guest_suspended_cvar.clone();
                    let delayed_response = response.clone();
                    let pm = state.linux.pm.clone();

                    std::thread::Builder::new()
                        .name("s2idle_wait".to_owned())
                        .spawn(move || {
                            trigger_vm_suspend_and_wait_for_entry(
                                guest_suspended_cvar.unwrap(),
                                &send_tube,
                                delayed_response,
                                suspend_tube,
                                pm,
                            )
                        })
                        .context("failed to spawn s2idle_wait thread")?;

                    // For s2idle, omit the response since it will be sent by
                    // s2idle_wait thread when suspension actually happens.
                    return Ok(VmRequestResult::new(None, false));
                }
            } else {
                #[cfg(feature = "pvclock")]
                if let Some(ref pvclock_host_tube) = state.pvclock_host_tube {
                    // Record the time after VCPUs are suspended to track suspension duration.
                    if let VmRequest::SuspendVcpus = request {
                        let cmd = PvClockCommand::Suspend;
                        match send_pvclock_cmd(pvclock_host_tube, cmd.clone()) {
                            Ok(action) => {
                                info!("{:?} command successfully processed", cmd);
                                if let Some(action) = action {
                                    error!("Unexpected action {:?} requested for suspend", action);
                                }
                            }
                            Err(e) => error!("{:?} command failed: {:#}", cmd, e),
                        };
                    }
                }
            }
            response
        }
    };

    // Hand any IRQ / VM-memory tubes created during hotplug to their
    // respective handler threads before replying.
    cfg_if::cfg_if! {
        if #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))] {
            if !add_irq_control_tubes.is_empty() {
                state
                    .irq_handler_control
                    .send(&IrqHandlerRequest::AddIrqControlTubes(
                        add_irq_control_tubes,
                    ))?;
            }
            if !add_vm_memory_control_tubes.is_empty() {
                state
                    .vm_memory_handler_control
                    .send(&VmMemoryHandlerRequest::AddControlTubes(
                        add_vm_memory_control_tubes,
                    ))?;
            }
        }
    }

    Ok(VmRequestResult::new(Some(response), false))
}
3265 
process_vm_control_event<V: VmArch + 'static, Vcpu: VcpuArch + 'static>( state: &mut ControlLoopState<V, Vcpu>, id: usize, socket: &TaggedControlTube, ) -> Result<(bool, Vec<usize>, Vec<TaggedControlTube>)>3266 fn process_vm_control_event<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
3267     state: &mut ControlLoopState<V, Vcpu>,
3268     id: usize,
3269     socket: &TaggedControlTube,
3270 ) -> Result<(bool, Vec<usize>, Vec<TaggedControlTube>)> {
3271     let mut vm_control_ids_to_remove = Vec::new();
3272     let mut add_tubes = Vec::new();
3273     match socket {
3274         TaggedControlTube::Vm(tube) => match tube.recv::<VmRequest>() {
3275             Ok(request) => {
3276                 let res = process_vm_request(state, id, tube, request, &mut add_tubes)?;
3277 
3278                 if let Some(response) = res.response {
3279                     if let Err(e) = tube.send(&response) {
3280                         error!("failed to send VmResponse: {}", e);
3281                     }
3282                 }
3283 
3284                 if res.exit {
3285                     return Ok((true, Vec::new(), Vec::new()));
3286                 }
3287             }
3288             Err(e) => {
3289                 if let TubeError::Disconnected = e {
3290                     vm_control_ids_to_remove.push(id);
3291                 } else {
3292                     error!("failed to recv VmRequest: {}", e);
3293                 }
3294             }
3295         },
3296         TaggedControlTube::VmMsync(tube) => match tube.recv::<VmMemoryMappingRequest>() {
3297             Ok(request) => {
3298                 let response = request.execute(&mut state.linux.vm);
3299                 if let Err(e) = tube.send(&response) {
3300                     error!("failed to send VmMsyncResponse: {}", e);
3301                 }
3302             }
3303             Err(e) => {
3304                 if let TubeError::Disconnected = e {
3305                     vm_control_ids_to_remove.push(id);
3306                 } else {
3307                     error!("failed to recv VmMsyncRequest: {}", e);
3308                 }
3309             }
3310         },
3311         TaggedControlTube::Fs(tube) => match tube.recv::<FsMappingRequest>() {
3312             Ok(request) => {
3313                 let response =
3314                     request.execute(&mut state.linux.vm, &mut state.sys_allocator.lock());
3315                 if let Err(e) = tube.send(&response) {
3316                     error!("failed to send VmResponse: {}", e);
3317                 }
3318             }
3319             Err(e) => {
3320                 if let TubeError::Disconnected = e {
3321                     vm_control_ids_to_remove.push(id);
3322                 } else {
3323                     error!("failed to recv VmResponse: {}", e);
3324                 }
3325             }
3326         },
3327     }
3328 
3329     Ok((false, vm_control_ids_to_remove, add_tubes))
3330 }
3331 
/// A `ProtoTube` paired with the socket address it is connected to.
///
/// Registrations for multiple events on the same address share one
/// underlying tube via `Rc`; equality and hashing are keyed on
/// `socket_addr` only.
#[cfg(feature = "registered_events")]
struct AddressedProtoTube {
    tube: Rc<ProtoTube>,
    socket_addr: String,
}
3337 
#[cfg(feature = "registered_events")]
impl PartialEq for AddressedProtoTube {
    // Entries are equal when they name the same socket address, regardless
    // of which `Rc<ProtoTube>` instance they hold.
    fn eq(&self, other: &Self) -> bool {
        self.socket_addr.eq(&other.socket_addr)
    }
}
3344 
#[cfg(feature = "registered_events")]
// String equality on `socket_addr` is a total equivalence, so `Eq` holds.
impl Eq for AddressedProtoTube {}
3347 
#[cfg(feature = "registered_events")]
impl Hash for AddressedProtoTube {
    // Hash only the socket address, mirroring the `PartialEq` impl so the
    // Eq/Hash contract (equal values hash identically) is upheld for use in
    // `HashSet`s.
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        self.socket_addr.hash(state);
    }
}
3354 
#[cfg(feature = "registered_events")]
impl AddressedProtoTube {
    /// Sends a protobuf message over the shared underlying `ProtoTube`.
    pub fn send<M: protobuf::Message>(&self, msg: &M) -> Result<(), base::TubeError> {
        self.tube.send_proto(msg)
    }
}
3361 
/// Scans the registration map for `socket_addr`.
///
/// Returns the underlying tube for that address if one exists under any
/// event (so it can be shared), and whether the address is already
/// registered for exactly `event`.
#[cfg(feature = "registered_events")]
fn find_registered_tube<'a>(
    registered_tubes: &'a HashMap<RegisteredEvent, HashSet<AddressedProtoTube>>,
    socket_addr: &str,
    event: RegisteredEvent,
) -> (Option<&'a Rc<ProtoTube>>, bool) {
    let mut found_tube: Option<&Rc<ProtoTube>> = None;
    let mut already_registered = false;
    'search: for (registered_event, tubes) in registered_tubes {
        for candidate in tubes {
            if candidate.socket_addr != socket_addr {
                continue;
            }
            if *registered_event == event {
                already_registered = true;
                break 'search;
            }
            // Every tube for one address is an Rc to the same underlying
            // tube, so any match will do — but keep scanning the remaining
            // events to detect an existing registration for `event`.
            found_tube = Some(&candidate.tube);
        }
    }
    (found_tube, already_registered)
}
3388 
/// Builds an `AddressedProtoTube` for `addr`, reusing `tube` when the
/// address already has a connected tube, otherwise connecting a fresh
/// `ProtoTube` to the listening socket at `addr`.
///
/// # Errors
///
/// Fails when no existing tube was supplied and either connecting to `addr`
/// or wrapping the socket in a `ProtoTube` fails.
#[cfg(feature = "registered_events")]
fn make_addr_tube_from_maybe_existing(
    tube: Option<&Rc<ProtoTube>>,
    addr: String,
) -> Result<AddressedProtoTube> {
    if let Some(registered_tube) = tube {
        // Share the existing underlying tube; only the Rc is cloned.
        Ok(AddressedProtoTube {
            tube: registered_tube.clone(),
            socket_addr: addr,
        })
    } else {
        // Borrow `addr` for the connect call instead of cloning it; the
        // owned String is still available for the error context and the
        // returned struct.
        let sock = UnixSeqpacket::connect(&addr).with_context(|| {
            format!("failed to connect to registered listening socket {}", addr)
        })?;
        let tube = ProtoTube::new_from_unix_seqpacket(sock)?;
        Ok(AddressedProtoTube {
            tube: Rc::new(tube),
            socket_addr: addr,
        })
    }
}
3410 
run_control<V: VmArch + 'static, Vcpu: VcpuArch + 'static>( mut linux: RunnableLinuxVm<V, Vcpu>, sys_allocator: SystemAllocator, cfg: Config, control_server_socket: Option<UnlinkUnixSeqpacketListener>, all_control_tubes: Vec<AnyControlTube>, #[cfg(feature = "usb")] usb_control_tube: Tube, vm_evt_rdtube: RecvTube, vm_evt_wrtube: SendTube, sigchld_fd: SignalFd, gralloc: RutabagaGralloc, vcpu_ids: Vec<usize>, iommu_host_tube: Option<Tube>, #[cfg(target_arch = "x86_64")] hp_control_tube: mpsc::Sender<PciRootCommand>, #[cfg(target_arch = "x86_64")] hp_thread: std::thread::JoinHandle<()>, #[cfg(feature = "pci-hotplug")] mut hotplug_manager: Option<PciHotPlugManager>, #[allow(unused_mut)] #[cfg(feature = "swap")] mut swap_controller: Option<SwapController>, #[cfg(feature = "registered_events")] reg_evt_rdtube: RecvTube, guest_suspended_cvar: Option<Arc<(Mutex<bool>, Condvar)>>, metrics_tube: RecvTube, mut vfio_container_manager: VfioContainerManager, mut worker_process_pids: BTreeSet<Pid>, #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] vcpu_domain_paths: BTreeMap< usize, PathBuf, >, ) -> Result<ExitState>3411 fn run_control<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
3412     mut linux: RunnableLinuxVm<V, Vcpu>,
3413     sys_allocator: SystemAllocator,
3414     cfg: Config,
3415     control_server_socket: Option<UnlinkUnixSeqpacketListener>,
3416     all_control_tubes: Vec<AnyControlTube>,
3417     #[cfg(feature = "usb")] usb_control_tube: Tube,
3418     vm_evt_rdtube: RecvTube,
3419     vm_evt_wrtube: SendTube,
3420     sigchld_fd: SignalFd,
3421     gralloc: RutabagaGralloc,
3422     vcpu_ids: Vec<usize>,
3423     iommu_host_tube: Option<Tube>,
3424     #[cfg(target_arch = "x86_64")] hp_control_tube: mpsc::Sender<PciRootCommand>,
3425     #[cfg(target_arch = "x86_64")] hp_thread: std::thread::JoinHandle<()>,
3426     #[cfg(feature = "pci-hotplug")] mut hotplug_manager: Option<PciHotPlugManager>,
3427     #[allow(unused_mut)] // mut is required x86 only
3428     #[cfg(feature = "swap")]
3429     mut swap_controller: Option<SwapController>,
3430     #[cfg(feature = "registered_events")] reg_evt_rdtube: RecvTube,
3431     guest_suspended_cvar: Option<Arc<(Mutex<bool>, Condvar)>>,
3432     metrics_tube: RecvTube,
3433     mut vfio_container_manager: VfioContainerManager,
3434     // A set of PID of child processes whose clean exit is expected and can be ignored.
3435     mut worker_process_pids: BTreeSet<Pid>,
3436     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] vcpu_domain_paths: BTreeMap<
3437         usize,
3438         PathBuf,
3439     >,
3440 ) -> Result<ExitState> {
3441     // Split up `all_control_tubes`.
3442     #[cfg(feature = "balloon")]
3443     let mut balloon_host_tube = None;
3444     let mut disk_host_tubes = Vec::new();
3445     #[cfg(feature = "gpu")]
3446     let mut gpu_control_tube = None;
3447     #[cfg(feature = "pvclock")]
3448     let mut pvclock_host_tube = None;
3449     let mut irq_control_tubes = Vec::new();
3450     let mut vm_memory_control_tubes = Vec::new();
3451     let mut control_tubes = Vec::new();
3452     for t in all_control_tubes {
3453         match t {
3454             #[cfg(feature = "balloon")]
3455             AnyControlTube::DeviceControlTube(DeviceControlTube::Balloon(t)) => {
3456                 assert!(balloon_host_tube.is_none());
3457                 balloon_host_tube = Some(t)
3458             }
3459             AnyControlTube::DeviceControlTube(DeviceControlTube::Disk(t)) => {
3460                 disk_host_tubes.push(t)
3461             }
3462             #[cfg(feature = "gpu")]
3463             AnyControlTube::DeviceControlTube(DeviceControlTube::Gpu(t)) => {
3464                 assert!(gpu_control_tube.is_none());
3465                 gpu_control_tube = Some(t)
3466             }
3467             #[cfg(feature = "pvclock")]
3468             AnyControlTube::DeviceControlTube(DeviceControlTube::PvClock(t)) => {
3469                 assert!(pvclock_host_tube.is_none());
3470                 pvclock_host_tube = Some(Arc::new(t))
3471             }
3472             AnyControlTube::IrqTube(t) => irq_control_tubes.push(t),
3473             AnyControlTube::TaggedControlTube(t) => control_tubes.push(t),
3474             AnyControlTube::VmMemoryTube(t) => vm_memory_control_tubes.push(t),
3475         }
3476     }
3477 
3478     #[cfg(feature = "gdb")]
3479     let (to_gdb_channel, gdb) = if let Some(port) = cfg.gdb {
3480         // GDB needs a control socket to interrupt vcpus.
3481         let (gdb_host_tube, gdb_control_tube) = Tube::pair().context("failed to create tube")?;
3482         control_tubes.push(TaggedControlTube::Vm(gdb_host_tube));
3483         // Create a channel for GDB thread.
3484         let (to_gdb_channel, from_vcpu_channel) = mpsc::channel();
3485         (
3486             Some(to_gdb_channel),
3487             Some((port, gdb_control_tube, from_vcpu_channel)),
3488         )
3489     } else {
3490         (None, None)
3491     };
3492 
3493     #[derive(EventToken)]
3494     enum Token {
3495         VmEvent,
3496         Suspend,
3497         ChildSignal,
3498         VmControlServer,
3499         VmControl {
3500             id: usize,
3501         },
3502         #[cfg(feature = "registered_events")]
3503         RegisteredEvent,
3504         #[cfg(feature = "balloon")]
3505         BalloonTube,
3506     }
3507     stdin()
3508         .set_raw_mode()
3509         .expect("failed to set terminal raw mode");
3510 
3511     let sys_allocator_mutex = Arc::new(Mutex::new(sys_allocator));
3512     let iommu_host_tube = iommu_host_tube.map(|t| Arc::new(Mutex::new(t)));
3513 
3514     let wait_ctx = WaitContext::build_with(&[
3515         (&linux.suspend_tube.1, Token::Suspend),
3516         (&sigchld_fd, Token::ChildSignal),
3517         (&vm_evt_rdtube, Token::VmEvent),
3518         #[cfg(feature = "registered_events")]
3519         (&reg_evt_rdtube, Token::RegisteredEvent),
3520     ])
3521     .context("failed to build wait context")?;
3522 
3523     if let Some(socket_server) = &control_server_socket {
3524         wait_ctx
3525             .add(socket_server, Token::VmControlServer)
3526             .context("failed to add descriptor to wait context")?;
3527     }
3528     let mut control_tubes = BTreeMap::from_iter(control_tubes.into_iter().enumerate());
3529     let mut next_control_id = control_tubes.len();
3530     for (id, socket) in control_tubes.iter() {
3531         wait_ctx
3532             .add(socket.as_ref(), Token::VmControl { id: *id })
3533             .context("failed to add descriptor to wait context")?;
3534     }
3535 
3536     #[cfg(feature = "balloon")]
3537     let mut balloon_tube = balloon_host_tube
3538         .map(|tube| -> Result<BalloonTube> {
3539             wait_ctx
3540                 .add(&tube, Token::BalloonTube)
3541                 .context("failed to add descriptor to wait context")?;
3542             Ok(BalloonTube::new(tube))
3543         })
3544         .transpose()
3545         .context("failed to create balloon tube")?;
3546 
3547     if cfg.jail_config.is_some() {
3548         // Before starting VCPUs, in case we started with some capabilities, drop them all.
3549         drop_capabilities().context("failed to drop process capabilities")?;
3550     }
3551 
3552     let (device_ctrl_tube, device_ctrl_resp) = Tube::pair().context("failed to create tube")?;
3553     // Create devices thread, and restore if a restore file exists.
3554     linux.devices_thread = match create_devices_worker_thread(
3555         linux.vm.get_memory().clone(),
3556         linux.io_bus.clone(),
3557         linux.mmio_bus.clone(),
3558         device_ctrl_resp,
3559     ) {
3560         Ok(join_handle) => Some(join_handle),
3561         Err(e) => {
3562             return Err(anyhow!("Failed to start devices thread: {}", e));
3563         }
3564     };
3565 
3566     let mut vcpu_handles = Vec::with_capacity(linux.vcpu_count);
3567     let vcpu_thread_barrier = Arc::new(Barrier::new(linux.vcpu_count + 1));
3568 
3569     if !linux
3570         .vm
3571         .get_hypervisor()
3572         .check_capability(HypervisorCap::ImmediateExit)
3573     {
3574         return Err(anyhow!(
3575             "missing required hypervisor capability ImmediateExit"
3576         ));
3577     }
3578 
3579     vcpu::setup_vcpu_signal_handler()?;
3580 
3581     let vcpus: Vec<Option<_>> = match linux.vcpus.take() {
3582         Some(vec) => vec.into_iter().map(Some).collect(),
3583         None => iter::repeat_with(|| None).take(linux.vcpu_count).collect(),
3584     };
3585     // Enable core scheduling before creating vCPUs so that the cookie will be
3586     // shared by all vCPU threads.
3587     // TODO(b/199312402): Avoid enabling core scheduling for the crosvm process
3588     // itself for even better performance. Only vCPUs need the feature.
3589     if cfg.core_scheduling && cfg.per_vm_core_scheduling {
3590         if let Err(e) = enable_core_scheduling() {
3591             error!("Failed to enable core scheduling: {}", e);
3592         }
3593     }
3594 
3595     // The tasks file only exist on sysfs if CgroupV1 hierachies are enabled
3596     let vcpu_cgroup_tasks_file = match &cfg.vcpu_cgroup_path {
3597         None => None,
3598         Some(cgroup_path) => {
3599             // Move main process to cgroup_path
3600             match File::create(cgroup_path.join("tasks")) {
3601                 Ok(file) => Some(file),
3602                 Err(_) => {
3603                     info!(
3604                         "Unable to open tasks file in cgroup: {}, trying CgroupV2",
3605                         cgroup_path.display()
3606                     );
3607                     None
3608                 }
3609             }
3610         }
3611     };
3612 
3613     // vCPU freq domains are currently only supported with CgroupsV2.
3614     let mut vcpu_cgroup_v2_files: std::collections::BTreeMap<usize, File> = BTreeMap::new();
3615     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
3616     for (vcpu_id, vcpu_domain_path) in vcpu_domain_paths.iter() {
3617         let vcpu_cgroup_v2_file = File::create(vcpu_domain_path.join("cgroup.threads"))
3618             .with_context(|| {
3619                 format!(
3620                     "failed to create vcpu-cgroup-path {}",
3621                     vcpu_domain_path.join("cgroup.threads").display(),
3622                 )
3623             })?;
3624         vcpu_cgroup_v2_files.insert(*vcpu_id, vcpu_cgroup_v2_file);
3625     }
3626 
3627     #[cfg(target_arch = "x86_64")]
3628     let bus_lock_ratelimit_ctrl: Arc<Mutex<Ratelimit>> = Arc::new(Mutex::new(Ratelimit::new()));
3629     #[cfg(target_arch = "x86_64")]
3630     if cfg.bus_lock_ratelimit > 0 {
3631         let bus_lock_ratelimit = cfg.bus_lock_ratelimit;
3632         if linux.vm.check_capability(VmCap::BusLockDetect) {
3633             info!("Hypervisor support bus lock detect");
3634             linux
3635                 .vm
3636                 .enable_capability(VmCap::BusLockDetect, 0)
3637                 .expect("kvm: Failed to enable bus lock detection cap");
3638             info!("Hypervisor enabled bus lock detect");
3639             bus_lock_ratelimit_ctrl
3640                 .lock()
3641                 .ratelimit_set_speed(bus_lock_ratelimit);
3642         } else {
3643             bail!("Kvm: bus lock detection unsuported");
3644         }
3645     }
3646 
3647     #[cfg(target_os = "android")]
3648     android::set_process_profiles(&cfg.task_profiles)?;
3649 
3650     #[allow(unused_mut)]
3651     let mut run_mode = if cfg.suspended {
3652         // Sleep devices before creating vcpus.
3653         device_ctrl_tube
3654             .send(&DeviceControlCommand::SleepDevices)
3655             .context("send command to devices control socket")?;
3656         match device_ctrl_tube
3657             .recv()
3658             .context("receive from devices control socket")?
3659         {
3660             VmResponse::Ok => (),
3661             resp => bail!("device sleep failed: {}", resp),
3662         }
3663         VmRunMode::Suspending
3664     } else {
3665         VmRunMode::Running
3666     };
3667     #[cfg(feature = "gdb")]
3668     if to_gdb_channel.is_some() {
3669         // Wait until a GDB client attaches
3670         run_mode = VmRunMode::Breakpoint;
3671     }
3672     // If we are restoring from a snapshot, then start suspended.
3673     let (run_mode, post_restore_run_mode) = if cfg.restore_path.is_some() {
3674         (VmRunMode::Suspending, run_mode)
3675     } else {
3676         (run_mode, run_mode)
3677     };
3678 
3679     // Architecture-specific code must supply a vcpu_init element for each VCPU.
3680     assert_eq!(vcpus.len(), linux.vcpu_init.len());
3681 
3682     let (vcpu_pid_tid_sender, vcpu_pid_tid_receiver) = mpsc::channel();
3683     for ((cpu_id, vcpu), vcpu_init) in vcpus.into_iter().enumerate().zip(linux.vcpu_init.drain(..))
3684     {
3685         let vcpu_cgroup_file: Option<File>;
3686         if let Some(cgroup_file) = &vcpu_cgroup_tasks_file {
3687             vcpu_cgroup_file = Some(cgroup_file.try_clone().unwrap())
3688         } else if !cfg.cpu_freq_domains.is_empty() {
3689             vcpu_cgroup_file = Some(
3690                 (vcpu_cgroup_v2_files.remove(&cpu_id).unwrap())
3691                     .try_clone()
3692                     .unwrap(),
3693             )
3694         } else {
3695             vcpu_cgroup_file = None
3696         };
3697 
3698         let (to_vcpu_channel, from_main_channel) = mpsc::channel();
3699         let vcpu_affinity = match linux.vcpu_affinity.clone() {
3700             Some(VcpuAffinity::Global(v)) => v,
3701             Some(VcpuAffinity::PerVcpu(mut m)) => m.remove(&cpu_id).unwrap_or_default(),
3702             None => Default::default(),
3703         };
3704 
3705         #[cfg(target_arch = "x86_64")]
3706         let vcpu_hybrid_type = if !cfg.vcpu_hybrid_type.is_empty() {
3707             Some(*cfg.vcpu_hybrid_type.get(&cpu_id).unwrap())
3708         } else {
3709             None
3710         };
3711 
3712         #[cfg(target_arch = "x86_64")]
3713         let cpu_config = Some(CpuConfigX86_64::new(
3714             cfg.force_calibrated_tsc_leaf,
3715             cfg.host_cpu_topology,
3716             cfg.enable_hwp,
3717             cfg.no_smt,
3718             cfg.itmt,
3719             vcpu_hybrid_type,
3720         ));
3721         #[cfg(target_arch = "x86_64")]
3722         let bus_lock_ratelimit_ctrl = Arc::clone(&bus_lock_ratelimit_ctrl);
3723 
3724         #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
3725         let cpu_config = None;
3726 
3727         #[cfg(target_arch = "riscv64")]
3728         let cpu_config = Some(CpuConfigRiscv64::new(vcpu_init.fdt_address));
3729 
3730         let handle = vcpu::run_vcpu(
3731             cpu_id,
3732             vcpu_ids[cpu_id],
3733             vcpu,
3734             vcpu_init,
3735             linux.vm.try_clone().context("failed to clone vm")?,
3736             linux
3737                 .irq_chip
3738                 .try_box_clone()
3739                 .context("failed to clone irqchip")?,
3740             linux.vcpu_count,
3741             linux.rt_cpus.contains(&cpu_id),
3742             vcpu_affinity,
3743             linux.delay_rt,
3744             vcpu_thread_barrier.clone(),
3745             (*linux.io_bus).clone(),
3746             (*linux.mmio_bus).clone(),
3747             vm_evt_wrtube
3748                 .try_clone()
3749                 .context("failed to clone vm event tube")?,
3750             from_main_channel,
3751             #[cfg(feature = "gdb")]
3752             to_gdb_channel.clone(),
3753             cfg.core_scheduling,
3754             cfg.per_vm_core_scheduling,
3755             cpu_config,
3756             match vcpu_cgroup_file {
3757                 None => None,
3758                 Some(ref f) => Some(
3759                     f.try_clone()
3760                         .context("failed to clone vcpu cgroup tasks file")?,
3761                 ),
3762             },
3763             #[cfg(target_arch = "x86_64")]
3764             bus_lock_ratelimit_ctrl,
3765             run_mode,
3766             cfg.boost_uclamp,
3767             vcpu_pid_tid_sender.clone(),
3768         )?;
3769         vcpu_handles.push((handle, to_vcpu_channel));
3770     }
3771 
3772     let mut vcpus_pid_tid = BTreeMap::new();
3773     for _ in 0..vcpu_handles.len() {
3774         let vcpu_pid_tid: VcpuPidTid = vcpu_pid_tid_receiver
3775             .recv()
3776             .context("failed receiving vcpu pid/tid")?;
3777         if vcpus_pid_tid
3778             .insert(
3779                 vcpu_pid_tid.vcpu_id,
3780                 (vcpu_pid_tid.process_id, vcpu_pid_tid.thread_id),
3781             )
3782             .is_some()
3783         {
3784             return Err(anyhow!(
3785                 "Vcpu {} returned more than 1 PID and TID",
3786                 vcpu_pid_tid.vcpu_id
3787             ));
3788         }
3789     }
3790 
3791     #[cfg(feature = "gdb")]
3792     // Spawn GDB thread.
3793     if let Some((gdb_port_num, gdb_control_tube, from_vcpu_channel)) = gdb {
3794         let to_vcpu_channels = vcpu_handles
3795             .iter()
3796             .map(|(_handle, channel)| channel.clone())
3797             .collect();
3798         let target = GdbStub::new(gdb_control_tube, to_vcpu_channels, from_vcpu_channel);
3799         std::thread::Builder::new()
3800             .name("gdb".to_owned())
3801             .spawn(move || gdb_thread(target, gdb_port_num))
3802             .context("failed to spawn GDB thread")?;
3803     };
3804 
3805     let (irq_handler_control, irq_handler_control_for_thread) = Tube::pair()?;
3806     let sys_allocator_for_thread = sys_allocator_mutex.clone();
3807     let irq_chip_for_thread = linux.irq_chip.try_box_clone()?;
3808     let irq_handler_thread = std::thread::Builder::new()
3809         .name("irq_handler_thread".into())
3810         .spawn(move || {
3811             irq_handler_thread(
3812                 irq_control_tubes,
3813                 irq_chip_for_thread,
3814                 sys_allocator_for_thread,
3815                 irq_handler_control_for_thread,
3816             )
3817         })
3818         .unwrap();
3819 
3820     let (vm_memory_handler_control, vm_memory_handler_control_for_thread) = Tube::pair()?;
3821     let vm_memory_handler_thread = std::thread::Builder::new()
3822         .name("vm_memory_handler_thread".into())
3823         .spawn({
3824             let vm = linux.vm.try_clone().context("failed to clone Vm")?;
3825             let sys_allocator_mutex = sys_allocator_mutex.clone();
3826             let iommu_client = iommu_host_tube
3827                 .as_ref()
3828                 .map(|t| VmMemoryRequestIommuClient::new(t.clone()));
3829             move || {
3830                 vm_memory_handler_thread(
3831                     vm_memory_control_tubes,
3832                     vm,
3833                     sys_allocator_mutex,
3834                     gralloc,
3835                     iommu_client,
3836                     vm_memory_handler_control_for_thread,
3837                 )
3838             }
3839         })
3840         .unwrap();
3841 
3842     vcpu_thread_barrier.wait();
3843 
3844     // See comment on `VmRequest::execute`.
3845     let mut suspended_pvclock_state: Option<hypervisor::ClockState> = None;
3846 
3847     // Restore VM (if applicable).
3848     // Must happen after the vCPU barrier to avoid deadlock.
3849     if let Some(path) = &cfg.restore_path {
3850         vm_control::do_restore(
3851             path,
3852             |msg| vcpu::kick_all_vcpus(&vcpu_handles, linux.irq_chip.as_irq_chip(), msg),
3853             |msg, index| {
3854                 vcpu::kick_vcpu(&vcpu_handles.get(index), linux.irq_chip.as_irq_chip(), msg)
3855             },
3856             &irq_handler_control,
3857             &device_ctrl_tube,
3858             linux.vcpu_count,
3859             |image| {
3860                 linux
3861                     .irq_chip
3862                     .try_box_clone()?
3863                     .restore(image, linux.vcpu_count)
3864             },
3865             /* require_encrypted= */ false,
3866             &mut suspended_pvclock_state,
3867         )?;
3868         // Allow the vCPUs to start for real.
3869         vcpu::kick_all_vcpus(
3870             &vcpu_handles,
3871             linux.irq_chip.as_irq_chip(),
3872             VcpuControl::RunState(post_restore_run_mode),
3873         )
3874     }
3875 
3876     #[cfg(feature = "swap")]
3877     if let Some(swap_controller) = &swap_controller {
3878         swap_controller
3879             .on_static_devices_setup_complete()
3880             .context("static device setup complete")?;
3881     }
3882 
3883     let metrics_thread = if metrics::is_initialized() {
3884         Some(
3885             std::thread::Builder::new()
3886                 .name("metrics_thread".into())
3887                 .spawn(move || {
3888                     if let Err(e) = MetricsController::new(vec![metrics_tube]).run() {
3889                         error!("Metrics controller error: {:?}", e);
3890                     }
3891                 })
3892                 .context("metrics thread failed")?,
3893         )
3894     } else {
3895         None
3896     };
3897 
3898     let mut exit_state = ExitState::Stop;
3899     let mut pvpanic_code = PvPanicCode::Unknown;
3900     #[cfg(feature = "registered_events")]
3901     let mut registered_evt_tubes: HashMap<RegisteredEvent, HashSet<AddressedProtoTube>> =
3902         HashMap::new();
3903 
3904     'wait: loop {
3905         let events = {
3906             match wait_ctx.wait() {
3907                 Ok(v) => v,
3908                 Err(e) => {
3909                     error!("failed to poll: {}", e);
3910                     break;
3911                 }
3912             }
3913         };
3914 
3915         let mut vm_control_ids_to_remove = Vec::new();
3916         for event in events.iter().filter(|e| e.is_readable) {
3917             match event.token {
3918                 #[cfg(feature = "registered_events")]
3919                 Token::RegisteredEvent => match reg_evt_rdtube.recv::<RegisteredEventWithData>() {
3920                     Ok(reg_evt) => {
3921                         let evt = reg_evt.into_event();
3922                         let mut tubes_to_remove: Vec<String> = Vec::new();
3923                         if let Some(tubes) = registered_evt_tubes.get_mut(&evt) {
3924                             for tube in tubes.iter() {
3925                                 if let Err(e) = tube.send(&reg_evt.into_proto()) {
3926                                     warn!(
3927                                         "failed to send registered event {:?} to {}, removing from \
3928                                          registrations: {}",
3929                                         reg_evt, tube.socket_addr, e
3930                                     );
3931                                     tubes_to_remove.push(tube.socket_addr.clone());
3932                                 }
3933                             }
3934                         }
3935                         for tube_addr in tubes_to_remove {
3936                             for tubes in registered_evt_tubes.values_mut() {
3937                                 tubes.retain(|t| t.socket_addr != tube_addr);
3938                             }
3939                         }
3940                         registered_evt_tubes.retain(|_, tubes| !tubes.is_empty());
3941                     }
3942                     Err(e) => {
3943                         warn!("failed to recv RegisteredEvent: {}", e);
3944                     }
3945                 },
3946                 Token::VmEvent => {
3947                     let mut break_to_wait: bool = true;
3948                     match vm_evt_rdtube.recv::<VmEventType>() {
3949                         Ok(vm_event) => match vm_event {
3950                             VmEventType::Exit => {
3951                                 info!("vcpu requested shutdown");
3952                                 exit_state = ExitState::Stop;
3953                             }
3954                             VmEventType::Reset => {
3955                                 info!("vcpu requested reset");
3956                                 exit_state = ExitState::Reset;
3957                             }
3958                             VmEventType::Crash => {
3959                                 info!("vcpu crashed");
3960                                 exit_state = ExitState::Crash;
3961                             }
3962                             VmEventType::Panic(panic_code) => {
3963                                 pvpanic_code = PvPanicCode::from_u8(panic_code);
3964                                 info!("Guest reported panic [Code: {}]", pvpanic_code);
3965                                 break_to_wait = false;
3966                             }
3967                             VmEventType::WatchdogReset => {
3968                                 info!("vcpu stall detected");
3969                                 exit_state = ExitState::WatchdogReset;
3970                             }
3971                         },
3972                         Err(e) => {
3973                             warn!("failed to recv VmEvent: {}", e);
3974                         }
3975                     }
3976                     if break_to_wait {
3977                         if pvpanic_code == PvPanicCode::Panicked {
3978                             exit_state = ExitState::GuestPanic;
3979                         }
3980                         break 'wait;
3981                     }
3982                 }
3983                 Token::Suspend => match linux.suspend_tube.1.recv::<bool>() {
3984                     Ok(is_suspend_request) => {
3985                         let mode = if is_suspend_request {
3986                             VmRunMode::Suspending
3987                         } else {
3988                             for dev in &linux.resume_notify_devices {
3989                                 dev.lock().resume_imminent();
3990                             }
3991                             VmRunMode::Running
3992                         };
3993                         info!("VM requested {}", mode);
3994                         vcpu::kick_all_vcpus(
3995                             &vcpu_handles,
3996                             linux.irq_chip.as_irq_chip(),
3997                             VcpuControl::RunState(mode),
3998                         );
3999                     }
4000                     Err(err) => {
4001                         warn!("Failed to read suspend tube {:?}", err);
4002                     }
4003                 },
4004                 Token::ChildSignal => {
4005                     // Print all available siginfo structs, then exit the loop if child process has
4006                     // been exited except CLD_STOPPED and CLD_CONTINUED. the two should be ignored
4007                     // here since they are used by the vmm-swap feature.
4008                     let mut do_exit = false;
4009                     while let Some(siginfo) =
4010                         sigchld_fd.read().context("failed to read signalfd")?
4011                     {
4012                         let pid = siginfo.ssi_pid;
4013                         let pid_label = match linux.pid_debug_label_map.get(&pid) {
4014                             Some(label) => format!("{} (pid {})", label, pid),
4015                             None => format!("pid {}", pid),
4016                         };
4017 
4018                         // TODO(kawasin): this is a temporary exception until device suspension.
4019                         #[cfg(feature = "swap")]
4020                         if siginfo.ssi_code == libc::CLD_STOPPED
4021                             || siginfo.ssi_code == libc::CLD_CONTINUED
4022                         {
4023                             continue;
4024                         }
4025 
4026                         // Ignore clean exits of non-tracked child processes when running without
4027                         // sandboxing. The virtio gpu process launches a render server for
4028                         // pass-through graphics. Host GPU drivers have been observed to fork
4029                         // child processes that exit cleanly which should not be considered a
4030                         // crash. When running with sandboxing, this should be handled by the
4031                         // device's process handler.
4032                         if cfg.jail_config.is_none()
4033                             && !linux.pid_debug_label_map.contains_key(&pid)
4034                             && siginfo.ssi_signo == libc::SIGCHLD as u32
4035                             && siginfo.ssi_code == libc::CLD_EXITED
4036                             && siginfo.ssi_status == 0
4037                         {
4038                             continue;
4039                         }
4040 
4041                         // Allow clean exits of a child process in `worker_process_pids`.
4042                         if siginfo.ssi_signo == libc::SIGCHLD as u32
4043                             && siginfo.ssi_code == libc::CLD_EXITED
4044                             && siginfo.ssi_status == 0
4045                             && worker_process_pids.remove(&(pid as Pid))
4046                         {
4047                             info!("child {pid} exited successfully");
4048                             continue;
4049                         }
4050 
4051                         error!(
4052                             "child {} exited: signo {}, status {}, code {}",
4053                             pid_label, siginfo.ssi_signo, siginfo.ssi_status, siginfo.ssi_code
4054                         );
4055                         do_exit = true;
4056                     }
4057                     if do_exit {
4058                         exit_state = ExitState::Crash;
4059                         break 'wait;
4060                     }
4061                 }
4062                 Token::VmControlServer => {
4063                     if let Some(socket_server) = &control_server_socket {
4064                         match socket_server.accept() {
4065                             Ok(socket) => {
4066                                 let id = next_control_id;
4067                                 next_control_id += 1;
4068                                 wait_ctx
4069                                     .add(&socket, Token::VmControl { id })
4070                                     .context("failed to add descriptor to wait context")?;
4071                                 control_tubes.insert(
4072                                     id,
4073                                     TaggedControlTube::Vm(Tube::new_from_unix_seqpacket(socket)?),
4074                                 );
4075                             }
4076                             Err(e) => error!("failed to accept socket: {}", e),
4077                         }
4078                     }
4079                 }
4080                 Token::VmControl { id } => {
4081                     if let Some(socket) = control_tubes.get(&id) {
4082                         let mut state = ControlLoopState {
4083                             linux: &mut linux,
4084                             cfg: &cfg,
4085                             sys_allocator: &sys_allocator_mutex,
4086                             control_tubes: &control_tubes,
4087                             disk_host_tubes: &disk_host_tubes[..],
4088                             #[cfg(feature = "gpu")]
4089                             gpu_control_tube: gpu_control_tube.as_ref(),
4090                             #[cfg(feature = "usb")]
4091                             usb_control_tube: &usb_control_tube,
4092                             #[cfg(target_arch = "x86_64")]
4093                             iommu_host_tube: &iommu_host_tube,
4094                             #[cfg(target_arch = "x86_64")]
4095                             hp_control_tube: &hp_control_tube,
4096                             guest_suspended_cvar: &guest_suspended_cvar,
4097                             #[cfg(feature = "pci-hotplug")]
4098                             hotplug_manager: &mut hotplug_manager,
4099                             #[cfg(feature = "swap")]
4100                             swap_controller: &mut swap_controller,
4101                             vcpu_handles: &vcpu_handles,
4102                             #[cfg(feature = "balloon")]
4103                             balloon_tube: balloon_tube.as_mut(),
4104                             device_ctrl_tube: &device_ctrl_tube,
4105                             irq_handler_control: &irq_handler_control,
4106                             #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
4107                             vm_memory_handler_control: &vm_memory_handler_control,
4108                             #[cfg(feature = "registered_events")]
4109                             registered_evt_tubes: &mut registered_evt_tubes,
4110                             #[cfg(feature = "pvclock")]
4111                             pvclock_host_tube: pvclock_host_tube.clone(),
4112                             vfio_container_manager: &mut vfio_container_manager,
4113                             suspended_pvclock_state: &mut suspended_pvclock_state,
4114                             vcpus_pid_tid: &vcpus_pid_tid,
4115                         };
4116                         let (exit_requested, mut ids_to_remove, add_tubes) =
4117                             process_vm_control_event(&mut state, id, socket)?;
4118                         if exit_requested {
4119                             break 'wait;
4120                         }
4121                         vm_control_ids_to_remove.append(&mut ids_to_remove);
4122                         for socket in add_tubes {
4123                             let id = next_control_id;
4124                             next_control_id += 1;
4125                             wait_ctx
4126                                 .add(socket.as_ref(), Token::VmControl { id })
4127                                 .context(
4128                                     "failed to add hotplug vfio-pci descriptor to wait context",
4129                                 )?;
4130                             control_tubes.insert(id, socket);
4131                         }
4132                     }
4133                 }
4134                 #[cfg(feature = "balloon")]
4135                 Token::BalloonTube => {
4136                     match balloon_tube.as_mut().expect("missing balloon tube").recv() {
4137                         Ok(resp) => {
4138                             for (resp, idx) in resp {
4139                                 if let Some(TaggedControlTube::Vm(tube)) = control_tubes.get(&idx) {
4140                                     if let Err(e) = tube.send(&resp) {
4141                                         error!("failed to send VmResponse: {}", e);
4142                                     }
4143                                 } else {
4144                                     error!("Bad tube index {}", idx);
4145                                 }
4146                             }
4147                         }
4148                         Err(err) => {
4149                             error!("Error processing balloon tube {:?}", err)
4150                         }
4151                     }
4152                 }
4153             }
4154         }
4155 
4156         remove_hungup_and_drained_tubes(
4157             &events,
4158             &wait_ctx,
4159             &mut control_tubes,
4160             vm_control_ids_to_remove,
4161             |token: &Token| {
4162                 if let Token::VmControl { id } = token {
4163                     return Some(*id);
4164                 }
4165                 None
4166             },
4167         )?;
4168     }
4169 
4170     vcpu::kick_all_vcpus(
4171         &vcpu_handles,
4172         linux.irq_chip.as_irq_chip(),
4173         VcpuControl::RunState(VmRunMode::Exiting),
4174     );
4175     for (handle, _) in vcpu_handles {
4176         if let Err(e) = handle.join() {
4177             error!("failed to join vcpu thread: {:?}", e);
4178         }
4179     }
4180 
4181     // After joining all vcpu threads, unregister the process-wide signal handler.
4182     if let Err(e) = vcpu::remove_vcpu_signal_handler() {
4183         error!("failed to remove vcpu thread signal handler: {:#}", e);
4184     }
4185 
4186     // Stop the vmm-swap monitor process.
4187     #[cfg(feature = "swap")]
4188     drop(swap_controller);
4189 
4190     // Stop pci root worker thread
4191     #[cfg(target_arch = "x86_64")]
4192     {
4193         let _ = hp_control_tube.send(PciRootCommand::Kill);
4194         if let Err(e) = hp_thread.join() {
4195             error!("failed to join hotplug thread: {:?}", e);
4196         }
4197     }
4198 
4199     if linux.devices_thread.is_some() {
4200         if let Err(e) = device_ctrl_tube.send(&DeviceControlCommand::Exit) {
4201             error!("failed to stop device control loop: {}", e);
4202         };
4203         if let Some(thread) = linux.devices_thread.take() {
4204             if let Err(e) = thread.join() {
4205                 error!("failed to exit devices thread: {:?}", e);
4206             }
4207         }
4208     }
4209 
4210     // Shut down the VM Memory handler thread.
4211     if let Err(e) = vm_memory_handler_control.send(&VmMemoryHandlerRequest::Exit) {
4212         error!(
4213             "failed to request exit from VM Memory handler thread: {}",
4214             e
4215         );
4216     }
4217     if let Err(e) = vm_memory_handler_thread.join() {
4218         error!("failed to exit VM Memory handler thread: {:?}", e);
4219     }
4220 
4221     // Shut down the IRQ handler thread.
4222     if let Err(e) = irq_handler_control.send(&IrqHandlerRequest::Exit) {
4223         error!("failed to request exit from IRQ handler thread: {}", e);
4224     }
4225     if let Err(e) = irq_handler_thread.join() {
4226         error!("failed to exit irq handler thread: {:?}", e);
4227     }
4228 
4229     // At this point, the only remaining `Arc` references to the `Bus` objects should be the ones
4230     // inside `linux`. If the checks below fail, then some other thread is probably still running
4231     // and needs to be explicitly stopped before dropping `linux` to ensure devices actually get
4232     // cleaned up.
4233     match Arc::try_unwrap(std::mem::replace(
4234         &mut linux.mmio_bus,
4235         Arc::new(Bus::new(BusType::Mmio)),
4236     )) {
4237         Ok(_) => {}
4238         Err(_) => panic!("internal error: mmio_bus had more than one reference at shutdown"),
4239     }
4240     match Arc::try_unwrap(std::mem::replace(
4241         &mut linux.io_bus,
4242         Arc::new(Bus::new(BusType::Io)),
4243     )) {
4244         Ok(_) => {}
4245         Err(_) => panic!("internal error: io_bus had more than one reference at shutdown"),
4246     }
4247 
4248     // Explicitly drop the VM structure here to allow the devices to clean up before the
4249     // control sockets are closed when this function exits.
4250     mem::drop(linux);
4251 
4252     // Drop the hotplug manager to tell the warden process to exit before we try to join
4253     // the metrics thread.
4254     #[cfg(feature = "pci-hotplug")]
4255     mem::drop(hotplug_manager);
4256 
4257     // All our children should have exited by now, so closing our fd should
4258     // terminate metrics. Then join so that everything gets flushed.
4259     metrics::get_destructor().cleanup();
4260     if let Some(metrics_thread) = metrics_thread {
4261         if let Err(e) = metrics_thread.join() {
4262             error!("failed to exit irq handler thread: {:?}", e);
4263         }
4264     }
4265 
4266     stdin()
4267         .set_canon_mode()
4268         .expect("failed to restore canonical mode for terminal");
4269 
4270     Ok(exit_state)
4271 }
4272 
/// Wait-context tokens for the IRQ handler thread, identifying which event
/// source became readable in its `WaitContext`.
#[derive(EventToken)]
enum IrqHandlerToken {
    /// An IRQ event registered from the irq chip fired. `index` is the event's
    /// index as reported by `irq_chip.irq_event_tokens()`.
    IrqFd { index: IrqEventIndex },
    /// A device IRQ control tube has a pending request; `id` keys into the
    /// handler's `irq_control_tubes` map.
    VmIrq { id: usize },
    /// The irq chip's delayed-IRQ trigger event fired (only registered when
    /// `irq_chip.irq_delayed_event_token()` returns one).
    DelayedIrqFd,
    /// The main thread sent an `IrqHandlerRequest` on the handler control tube
    /// (e.g. Exit, AddIrqControlTubes, RefreshIrqEventTokens).
    HandlerControl,
}
4280 
4281 /// Handles IRQs and requests from devices to add additional IRQ lines.
irq_handler_thread( irq_control_tubes: Vec<Tube>, mut irq_chip: Box<dyn IrqChipArch + 'static>, sys_allocator_mutex: Arc<Mutex<SystemAllocator>>, handler_control: Tube, ) -> anyhow::Result<()>4282 fn irq_handler_thread(
4283     irq_control_tubes: Vec<Tube>,
4284     mut irq_chip: Box<dyn IrqChipArch + 'static>,
4285     sys_allocator_mutex: Arc<Mutex<SystemAllocator>>,
4286     handler_control: Tube,
4287 ) -> anyhow::Result<()> {
4288     let wait_ctx = WaitContext::build_with(&[(
4289         handler_control.get_read_notifier(),
4290         IrqHandlerToken::HandlerControl,
4291     )])
4292     .context("failed to build wait context")?;
4293 
4294     if let Some(delayed_ioapic_irq_trigger) = irq_chip.irq_delayed_event_token()? {
4295         wait_ctx
4296             .add(&delayed_ioapic_irq_trigger, IrqHandlerToken::DelayedIrqFd)
4297             .context("failed to add descriptor to wait context")?;
4298     }
4299 
4300     let mut irq_event_tokens = irq_chip
4301         .irq_event_tokens()
4302         .context("failed get event tokens from irqchip")?;
4303 
4304     for (index, _gsi, evt) in irq_event_tokens.iter() {
4305         wait_ctx
4306             .add(evt, IrqHandlerToken::IrqFd { index: *index })
4307             .context("failed to add irq chip event tokens to wait context")?;
4308     }
4309 
4310     let mut irq_control_tubes = BTreeMap::from_iter(irq_control_tubes.into_iter().enumerate());
4311     let mut next_control_id = irq_control_tubes.len();
4312     for (id, socket) in irq_control_tubes.iter() {
4313         wait_ctx
4314             .add(
4315                 socket.get_read_notifier(),
4316                 IrqHandlerToken::VmIrq { id: *id },
4317             )
4318             .context("irq control tubes to wait context")?;
4319     }
4320 
4321     'wait: loop {
4322         let events = {
4323             match wait_ctx.wait() {
4324                 Ok(v) => v,
4325                 Err(e) => {
4326                     error!("failed to poll: {}", e);
4327                     break 'wait;
4328                 }
4329             }
4330         };
4331         let token_count = events.len();
4332         let mut vm_irq_tubes_to_remove = Vec::new();
4333         let mut notify_control_on_iteration_end = false;
4334 
4335         for event in events.iter().filter(|e| e.is_readable) {
4336             match event.token {
4337                 IrqHandlerToken::HandlerControl => {
4338                     match handler_control.recv::<IrqHandlerRequest>() {
4339                         Ok(request) => {
4340                             match request {
4341                                 IrqHandlerRequest::Exit => break 'wait,
4342                                 IrqHandlerRequest::AddIrqControlTubes(tubes) => {
4343                                     for socket in tubes {
4344                                         let id = next_control_id;
4345                                         next_control_id += 1;
4346                                         wait_ctx
4347                                         .add(
4348                                             socket.get_read_notifier(),
4349                                             IrqHandlerToken::VmIrq { id },
4350                                         )
4351                                         .context("failed to add new IRQ control Tube to wait context")?;
4352                                         irq_control_tubes.insert(id, socket);
4353                                     }
4354                                 }
4355                                 IrqHandlerRequest::RefreshIrqEventTokens => {
4356                                     for (_index, _gsi, evt) in irq_event_tokens.iter() {
4357                                         wait_ctx.delete(evt).context(
4358                                             "failed to remove irq chip event \
4359                                                 token from wait context",
4360                                         )?;
4361                                     }
4362 
4363                                     irq_event_tokens = irq_chip
4364                                         .irq_event_tokens()
4365                                         .context("failed get event tokens from irqchip")?;
4366                                     for (index, _gsi, evt) in irq_event_tokens.iter() {
4367                                         wait_ctx
4368                                             .add(evt, IrqHandlerToken::IrqFd { index: *index })
4369                                             .context(
4370                                                 "failed to add irq chip event \
4371                                                 tokens to wait context",
4372                                             )?;
4373                                     }
4374 
4375                                     if let Err(e) = handler_control
4376                                         .send(&IrqHandlerResponse::IrqEventTokenRefreshComplete)
4377                                     {
4378                                         error!(
4379                                             "failed to notify IRQ event token refresh \
4380                                             was completed: {}",
4381                                             e
4382                                         );
4383                                     }
4384                                 }
4385                                 IrqHandlerRequest::WakeAndNotifyIteration => {
4386                                     notify_control_on_iteration_end = true;
4387                                 }
4388                             }
4389                         }
4390                         Err(e) => {
4391                             if let TubeError::Disconnected = e {
4392                                 panic!("irq handler control tube disconnected.");
4393                             } else {
4394                                 error!("failed to recv IrqHandlerRequest: {}", e);
4395                             }
4396                         }
4397                     }
4398                 }
4399                 IrqHandlerToken::VmIrq { id } => {
4400                     if let Some(tube) = irq_control_tubes.get(&id) {
4401                         handle_irq_tube_request(
4402                             &sys_allocator_mutex,
4403                             &mut irq_chip,
4404                             &mut vm_irq_tubes_to_remove,
4405                             &wait_ctx,
4406                             tube,
4407                             id,
4408                         );
4409                     }
4410                 }
4411                 IrqHandlerToken::IrqFd { index } => {
4412                     if let Err(e) = irq_chip.service_irq_event(index) {
4413                         error!("failed to signal irq {}: {}", index, e);
4414                     }
4415                 }
4416                 IrqHandlerToken::DelayedIrqFd => {
4417                     if let Err(e) = irq_chip.process_delayed_irq_events() {
4418                         warn!("can't deliver delayed irqs: {}", e);
4419                     }
4420                 }
4421             }
4422         }
4423 
4424         if notify_control_on_iteration_end {
4425             if let Err(e) = handler_control.send(&IrqHandlerResponse::HandlerIterationComplete(
4426                 token_count - 1,
4427             )) {
4428                 error!(
4429                     "failed to notify on iteration completion (snapshotting may fail): {}",
4430                     e
4431                 );
4432             }
4433         }
4434 
4435         remove_hungup_and_drained_tubes(
4436             &events,
4437             &wait_ctx,
4438             &mut irq_control_tubes,
4439             vm_irq_tubes_to_remove,
4440             |token: &IrqHandlerToken| {
4441                 if let IrqHandlerToken::VmIrq { id } = token {
4442                     return Some(*id);
4443                 }
4444                 None
4445             },
4446         )?;
4447         if events.iter().any(|e| {
4448             e.is_hungup && !e.is_readable && matches!(e.token, IrqHandlerToken::HandlerControl)
4449         }) {
4450             error!("IRQ handler control hung up but did not request an exit.");
4451             break 'wait;
4452         }
4453     }
4454     Ok(())
4455 }
4456 
handle_irq_tube_request( sys_allocator_mutex: &Arc<Mutex<SystemAllocator>>, irq_chip: &mut Box<dyn IrqChipArch + 'static>, vm_irq_tubes_to_remove: &mut Vec<usize>, wait_ctx: &WaitContext<IrqHandlerToken>, tube: &Tube, tube_index: usize, )4457 fn handle_irq_tube_request(
4458     sys_allocator_mutex: &Arc<Mutex<SystemAllocator>>,
4459     irq_chip: &mut Box<dyn IrqChipArch + 'static>,
4460     vm_irq_tubes_to_remove: &mut Vec<usize>,
4461     wait_ctx: &WaitContext<IrqHandlerToken>,
4462     tube: &Tube,
4463     tube_index: usize,
4464 ) {
4465     match tube.recv::<VmIrqRequest>() {
4466         Ok(request) => {
4467             let response = {
4468                 request.execute(
4469                     |setup| match setup {
4470                         IrqSetup::Event(irq, ev, device_id, queue_id, device_name) => {
4471                             let irq_evt = devices::IrqEdgeEvent::from_event(ev.try_clone()?);
4472                             let source = IrqEventSource {
4473                                 device_id: device_id.try_into().expect("Invalid device_id"),
4474                                 queue_id,
4475                                 device_name,
4476                             };
4477                             if let Some(event_index) =
4478                                 irq_chip.register_edge_irq_event(irq, &irq_evt, source)?
4479                             {
4480                                 if let Err(e) =
4481                                     wait_ctx.add(ev, IrqHandlerToken::IrqFd { index: event_index })
4482                                 {
4483                                     warn!("failed to add IrqFd to poll context: {}", e);
4484                                     return Err(e);
4485                                 }
4486                             }
4487                             Ok(())
4488                         }
4489                         IrqSetup::Route(route) => irq_chip.route_irq(route),
4490                         IrqSetup::UnRegister(irq, ev) => {
4491                             let irq_evt = devices::IrqEdgeEvent::from_event(ev.try_clone()?);
4492                             irq_chip.unregister_edge_irq_event(irq, &irq_evt)
4493                         }
4494                     },
4495                     &mut sys_allocator_mutex.lock(),
4496                 )
4497             };
4498             if let Err(e) = tube.send(&response) {
4499                 error!("failed to send VmIrqResponse: {}", e);
4500             }
4501         }
4502         Err(e) => {
4503             if let TubeError::Disconnected = e {
4504                 vm_irq_tubes_to_remove.push(tube_index);
4505             } else {
4506                 error!("failed to recv VmIrqRequest: {}", e);
4507             }
4508         }
4509     }
4510 }
4511 
/// Commands to control the VM Memory handler thread.
#[derive(serde::Serialize, serde::Deserialize)]
pub enum VmMemoryHandlerRequest {
    /// Register additional memory control tubes for the handler to service.
    /// No response is sent for this command.
    AddControlTubes(Vec<VmMemoryTube>),
    /// Ask the handler thread to break out of its wait loop and terminate.
    /// No response is sent for this command.
    Exit,
}
4520 
/// Event loop that services `VmMemoryRequest`s arriving on a set of control
/// tubes, plus `VmMemoryHandlerRequest`s (add-tubes / exit) on
/// `handler_control`.
///
/// Runs until an `Exit` request is received, the handler control tube hangs
/// up, or polling fails.
fn vm_memory_handler_thread(
    control_tubes: Vec<VmMemoryTube>,
    mut vm: impl Vm,
    sys_allocator_mutex: Arc<Mutex<SystemAllocator>>,
    mut gralloc: RutabagaGralloc,
    mut iommu_client: Option<VmMemoryRequestIommuClient>,
    handler_control: Tube,
) -> anyhow::Result<()> {
    #[derive(EventToken)]
    enum Token {
        VmControl { id: usize },
        HandlerControl,
    }

    let wait_ctx =
        WaitContext::build_with(&[(handler_control.get_read_notifier(), Token::HandlerControl)])
            .context("failed to build wait context")?;
    // Tubes are keyed by a monotonically increasing id; ids are never reused,
    // so `next_control_id` only ever grows even as tubes are removed.
    let mut control_tubes = BTreeMap::from_iter(control_tubes.into_iter().enumerate());
    let mut next_control_id = control_tubes.len();
    for (id, socket) in control_tubes.iter() {
        wait_ctx
            .add(socket.as_ref(), Token::VmControl { id: *id })
            .context("failed to add descriptor to wait context")?;
    }

    let mut region_state: VmMemoryRegionState = Default::default();

    'wait: loop {
        let events = {
            match wait_ctx.wait() {
                Ok(v) => v,
                Err(e) => {
                    error!("failed to poll: {}", e);
                    break;
                }
            }
        };

        // Ids whose tubes returned `Disconnected` on this iteration; they are
        // removed after the event batch is fully processed.
        let mut vm_control_ids_to_remove = Vec::new();
        for event in events.iter().filter(|e| e.is_readable) {
            match event.token {
                Token::HandlerControl => match handler_control.recv::<VmMemoryHandlerRequest>() {
                    Ok(request) => match request {
                        VmMemoryHandlerRequest::Exit => break 'wait,
                        VmMemoryHandlerRequest::AddControlTubes(tubes) => {
                            for socket in tubes {
                                let id = next_control_id;
                                next_control_id += 1;
                                wait_ctx
                                    .add(socket.get_read_notifier(), Token::VmControl { id })
                                    .context(
                                        "failed to add new vm memory control Tube to wait context",
                                    )?;
                                control_tubes.insert(id, socket);
                            }
                        }
                    },
                    Err(e) => {
                        if let TubeError::Disconnected = e {
                            // The handler control tube should outlive this
                            // thread; losing it is unrecoverable.
                            panic!("vm memory control tube disconnected.");
                        } else {
                            error!("failed to recv VmMemoryHandlerRequest: {}", e);
                        }
                    }
                },
                Token::VmControl { id } => {
                    if let Some(VmMemoryTube {
                        tube,
                        expose_with_viommu,
                    }) = control_tubes.get(&id)
                    {
                        match tube.recv::<VmMemoryRequest>() {
                            Ok(request) => {
                                let response = request.execute(
                                    tube,
                                    &mut vm,
                                    &mut sys_allocator_mutex.lock(),
                                    &mut gralloc,
                                    // Only tubes flagged for viommu exposure
                                    // get access to the iommu client.
                                    if *expose_with_viommu {
                                        iommu_client.as_mut()
                                    } else {
                                        None
                                    },
                                    &mut region_state,
                                );
                                if let Err(e) = tube.send(&response) {
                                    error!("failed to send VmMemoryControlResponse: {}", e);
                                }
                            }
                            Err(e) => {
                                if let TubeError::Disconnected = e {
                                    vm_control_ids_to_remove.push(id);
                                } else {
                                    error!("failed to recv VmMemoryControlRequest: {}", e);
                                }
                            }
                        }
                    }
                }
            }
        }

        remove_hungup_and_drained_tubes(
            &events,
            &wait_ctx,
            &mut control_tubes,
            vm_control_ids_to_remove,
            |token: &Token| {
                if let Token::VmControl { id } = token {
                    return Some(*id);
                }
                None
            },
        )?;
        // A hangup on the handler control tube without a preceding Exit
        // request means the main thread died unexpectedly; stop the loop.
        if events
            .iter()
            .any(|e| e.is_hungup && !e.is_readable && matches!(e.token, Token::HandlerControl))
        {
            error!("vm memory handler control hung up but did not request an exit.");
            break 'wait;
        }
    }
    Ok(())
}
4645 
4646 /// When control tubes hang up, we want to make sure that we've fully drained
4647 /// the underlying socket before removing it. This function also handles
4648 /// removing closed sockets in such a way that avoids phantom events.
4649 ///
4650 /// `tube_ids_to_remove` is the set of ids that we already know should
4651 /// be removed (e.g. from getting a disconnect error on read).
remove_hungup_and_drained_tubes<T, U>( events: &SmallVec<[TriggeredEvent<T>; 16]>, wait_ctx: &WaitContext<T>, tubes: &mut BTreeMap<usize, U>, mut tube_ids_to_remove: Vec<usize>, get_tube_id: fn(token: &T) -> Option<usize>, ) -> anyhow::Result<()> where T: EventToken, U: ReadNotifier,4652 fn remove_hungup_and_drained_tubes<T, U>(
4653     events: &SmallVec<[TriggeredEvent<T>; 16]>,
4654     wait_ctx: &WaitContext<T>,
4655     tubes: &mut BTreeMap<usize, U>,
4656     mut tube_ids_to_remove: Vec<usize>,
4657     get_tube_id: fn(token: &T) -> Option<usize>,
4658 ) -> anyhow::Result<()>
4659 where
4660     T: EventToken,
4661     U: ReadNotifier,
4662 {
4663     // It's possible more data is readable and buffered while the socket is hungup,
4664     // so don't delete the tube from the poll context until we're sure all the
4665     // data is read.
4666     // Below case covers a condition where we have received a hungup event and the tube is not
4667     // readable.
4668     // In case of readable tube, once all data is read, any attempt to read more data on hungup
4669     // tube should fail. On such failure, we get Disconnected error and ids gets added to
4670     // tube_ids_to_remove by the time we reach here.
4671     for event in events.iter().filter(|e| e.is_hungup && !e.is_readable) {
4672         if let Some(id) = get_tube_id(&event.token) {
4673             tube_ids_to_remove.push(id);
4674         }
4675     }
4676 
4677     tube_ids_to_remove.dedup();
4678     for id in tube_ids_to_remove {
4679         // Delete the socket from the `wait_ctx` synchronously. Otherwise, the kernel will do
4680         // this automatically when the FD inserted into the `wait_ctx` is closed after this
4681         // if-block, but this removal can be deferred unpredictably. In some instances where the
4682         // system is under heavy load, we can even get events returned by `wait_ctx` for an FD
4683         // that has already been closed. Because the token associated with that spurious event
4684         // now belongs to a different socket, the control loop will start to interact with
4685         // sockets that might not be ready to use. This can cause incorrect hangup detection or
4686         // blocking on a socket that will never be ready. See also: crbug.com/1019986
4687         if let Some(socket) = tubes.remove(&id) {
4688             wait_ctx
4689                 .delete(socket.get_read_notifier())
4690                 .context("failed to remove descriptor from wait context")?;
4691         }
4692     }
4693     Ok(())
4694 }
4695 
4696 /// Start and jail a vhost-user device according to its configuration and a vhost listener string.
4697 ///
4698 /// The jailing business is nasty and potentially unsafe if done from the wrong context - do not
4699 /// call outside of `start_devices`!
4700 ///
4701 /// Returns the pid of the jailed device process.
jail_and_start_vu_device<T: VirtioDeviceBuilder>( jail_config: &Option<JailConfig>, params: T, vhost: &str, name: &str, ) -> anyhow::Result<(libc::pid_t, Option<Box<dyn std::any::Any>>)>4702 fn jail_and_start_vu_device<T: VirtioDeviceBuilder>(
4703     jail_config: &Option<JailConfig>,
4704     params: T,
4705     vhost: &str,
4706     name: &str,
4707 ) -> anyhow::Result<(libc::pid_t, Option<Box<dyn std::any::Any>>)> {
4708     let mut keep_rds = Vec::new();
4709 
4710     base::syslog::push_descriptors(&mut keep_rds);
4711     cros_tracing::push_descriptors!(&mut keep_rds);
4712     metrics::push_descriptors(&mut keep_rds);
4713 
4714     let jail_type = VirtioDeviceType::VhostUser;
4715 
4716     // Create a jail from the configuration. If the configuration is `None`, `create_jail` will also
4717     // return `None` so fall back to an empty (i.e. non-constrained) Minijail.
4718     let jail = params
4719         .create_jail(jail_config, jail_type)
4720         .with_context(|| format!("failed to create jail for {}", name))?
4721         .ok_or(())
4722         .or_else(|_| Minijail::new())
4723         .with_context(|| format!("failed to create empty jail for {}", name))?;
4724 
4725     // Create the device in the parent process, so the child does not need any privileges necessary
4726     // to do it (only runtime capabilities are required).
4727     let device = params
4728         .create_vhost_user_device(&mut keep_rds)
4729         .context("failed to create vhost-user device")?;
4730     let mut listener =
4731         VhostUserListener::new(vhost).context("failed to create the vhost listener")?;
4732     keep_rds.push(listener.as_raw_descriptor());
4733     let parent_resources = listener.take_parent_process_resources();
4734 
4735     // Executor must be created before jail in order to prevent the jailed process from creating
4736     // unrestricted io_urings.
4737     let ex = Executor::new().context("Failed to create an Executor")?;
4738     keep_rds.extend(ex.as_raw_descriptors());
4739 
4740     // Deduplicate the FDs since minijail expects them to be unique.
4741     keep_rds.sort_unstable();
4742     keep_rds.dedup();
4743 
4744     // SAFETY:
4745     // Safe because we are keeping all the descriptors needed for the child to function.
4746     match unsafe { jail.fork(Some(&keep_rds)).context("error while forking")? } {
4747         0 => {
4748             // In the child process.
4749 
4750             // Free memory for the resources managed by the parent, without running drop() on them.
4751             // The parent will do it as we exit.
4752             let _ = std::mem::ManuallyDrop::new(parent_resources);
4753 
4754             // Make sure the child process does not survive its parent.
4755             // SAFETY: trivially safe
4756             if unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) } < 0 {
4757                 panic!("call to prctl(PR_SET_DEATHSIG, SIGKILL) failed. Aborting child process.");
4758             }
4759 
4760             // Set the name for the thread.
4761             const MAX_LEN: usize = 15; // pthread_setname_np() limit on Linux
4762             let debug_label_trimmed = &name.as_bytes()[..std::cmp::min(MAX_LEN, name.len())];
4763             let thread_name = CString::new(debug_label_trimmed).unwrap();
4764             // SAFETY:
4765             // Safe because we trimmed the name to 15 characters (and pthread_setname_np will return
4766             // an error if we don't anyway).
4767             let _ = unsafe { libc::pthread_setname_np(libc::pthread_self(), thread_name.as_ptr()) };
4768 
4769             // Run the device loop and terminate the child process once it exits.
4770             let res = match listener.run_device(ex, device) {
4771                 Ok(()) => 0,
4772                 Err(e) => {
4773                     error!("error while running device {}: {:#}", name, e);
4774                     1
4775                 }
4776             };
4777             // SAFETY: trivially safe
4778             unsafe { libc::exit(res) };
4779         }
4780         pid => {
4781             // In the parent process. We will drop the device and listener when exiting this method.
4782             // This is fine as ownership for both has been transferred to the child process and they
4783             // will keep living there. We just retain `parent_resources` for things we are supposed
4784             // to clean up ourselves.
4785 
4786             info!("process for device {} (PID {}) started", &name, pid);
4787             #[cfg(feature = "seccomp_trace")]
4788             debug!(
4789                     "seccomp_trace {{\"event\": \"minijail_fork\", \"pid\": {}, \"name\": \"{}\", \"jail_addr\": \"0x{:x}\"}}",
4790                     pid,
4791                     &name,
4792                     read_jail_addr(&jail)
4793                 );
4794             Ok((pid, parent_resources))
4795         }
4796     }
4797 }
4798 
process_vhost_user_control_request(tube: Tube, disk_host_tubes: &[Tube]) -> Result<()>4799 fn process_vhost_user_control_request(tube: Tube, disk_host_tubes: &[Tube]) -> Result<()> {
4800     let command = tube
4801         .recv::<VmRequest>()
4802         .context("failed to receive VmRequest")?;
4803     let resp = match command {
4804         VmRequest::DiskCommand {
4805             disk_index,
4806             ref command,
4807         } => match &disk_host_tubes.get(disk_index) {
4808             Some(tube) => handle_disk_command(command, tube),
4809             None => VmResponse::Err(base::Error::new(libc::ENODEV)),
4810         },
4811         request => {
4812             error!(
4813                 "Request {:?} currently not supported in vhost user backend",
4814                 request
4815             );
4816             VmResponse::Err(base::Error::new(libc::EPERM))
4817         }
4818     };
4819 
4820     tube.send(&resp).context("failed to send VmResponse")?;
4821     Ok(())
4822 }
4823 
start_vhost_user_control_server( control_server_socket: UnlinkUnixSeqpacketListener, disk_host_tubes: Vec<Tube>, )4824 fn start_vhost_user_control_server(
4825     control_server_socket: UnlinkUnixSeqpacketListener,
4826     disk_host_tubes: Vec<Tube>,
4827 ) {
4828     info!("Start vhost-user control server");
4829     loop {
4830         match control_server_socket.accept() {
4831             Ok(socket) => {
4832                 let tube = match Tube::new_from_unix_seqpacket(socket) {
4833                     Ok(tube) => tube,
4834                     Err(e) => {
4835                         error!("failed to open tube: {:#}", e);
4836                         return;
4837                     }
4838                 };
4839                 if let Err(e) = process_vhost_user_control_request(tube, &disk_host_tubes) {
4840                     error!("failed to process control request: {:#}", e);
4841                 }
4842             }
4843             Err(e) => {
4844                 error!("failed to establish connection: {}", e);
4845             }
4846         }
4847     }
4848 }
4849 
/// Entry point for the `crosvm devices` command: spawns one jailed process
/// per requested vhost-user device (serial, block, vsock, net), optionally
/// serves a control socket for disk commands, and waits for every device
/// process to exit before returning.
pub fn start_devices(opts: DevicesCommand) -> anyhow::Result<()> {
    if let Some(async_executor) = opts.async_executor {
        Executor::set_default_executor_kind(async_executor)
            .context("Failed to set the default async executor")?;
    }

    // Book-keeping entry for one spawned device process (keyed by pid below).
    struct DeviceJailInfo {
        // Unique name for the device, in the form `foomatic-0`.
        name: String,
        // Parent-side resources that must be kept alive (and dropped here)
        // for the lifetime of the child process.
        _drop_resources: Option<Box<dyn std::any::Any>>,
    }

    // Spawns the `i`-th device of type `T` and records it in `devices_jails`.
    fn add_device<T: VirtioDeviceBuilder>(
        i: usize,
        device_params: T,
        vhost: &str,
        jail_config: &Option<JailConfig>,
        devices_jails: &mut BTreeMap<libc::pid_t, DeviceJailInfo>,
    ) -> anyhow::Result<()> {
        let name = format!("{}-{}", T::NAME, i);

        let (pid, _drop_resources) =
            jail_and_start_vu_device::<T>(jail_config, device_params, vhost, &name)?;

        devices_jails.insert(
            pid,
            DeviceJailInfo {
                name,
                _drop_resources,
            },
        );

        Ok(())
    }

    let mut devices_jails: BTreeMap<libc::pid_t, DeviceJailInfo> = BTreeMap::new();

    // `None` disables jailing entirely (--disable-sandbox).
    let jail = if opts.disable_sandbox {
        None
    } else {
        Some(opts.jail)
    };

    // Create control server socket
    let control_server_socket = opts.control_socket.map(|path| {
        UnlinkUnixSeqpacketListener(
            UnixSeqpacketListener::bind(path).expect("Could not bind socket"),
        )
    });

    // Create serial devices.
    for (i, params) in opts.serial.iter().enumerate() {
        let serial_config = &params.device;
        add_device(i, serial_config, &params.vhost, &jail, &mut devices_jails)?;
    }

    let mut disk_host_tubes = Vec::new();
    let control_socket_exists = control_server_socket.is_some();
    // Create block devices. Each disk gets a tube pair only when a control
    // socket exists to issue disk commands through.
    for (i, params) in opts.block.iter().enumerate() {
        let tube = if control_socket_exists {
            let (host_tube, device_tube) = Tube::pair().context("failed to create tube")?;
            disk_host_tubes.push(host_tube);
            Some(device_tube)
        } else {
            None
        };
        let disk_config = DiskConfig::new(&params.device, tube);
        add_device(i, disk_config, &params.vhost, &jail, &mut devices_jails)?;
    }

    // Create vsock devices.
    for (i, params) in opts.vsock.iter().enumerate() {
        add_device(i, &params.device, &params.vhost, &jail, &mut devices_jails)?;
    }

    // Create network devices.
    #[cfg(feature = "net")]
    for (i, params) in opts.net.iter().enumerate() {
        add_device(i, &params.device, &params.vhost, &jail, &mut devices_jails)?;
    }

    // No device created, that's probably not intended - print the help in that case.
    if devices_jails.is_empty() {
        let err = DevicesCommand::from_args(
            &[&std::env::args().next().unwrap_or(String::from("crosvm"))],
            &["--help"],
        )
        .unwrap_err();
        println!("{}", err.output);
        return Ok(());
    }

    let ex = Executor::new()?;
    if let Some(control_server_socket) = control_server_socket {
        // Start the control server in the parent process.
        ex.spawn_blocking(move || {
            start_vhost_user_control_server(control_server_socket, disk_host_tubes)
        })
        .detach();
    }

    // Now wait for all device processes to return.
    // NOTE(review): pid -1 presumably means "any child" as with waitpid(2) —
    // confirm against `base::linux::wait_for_pid`.
    while !devices_jails.is_empty() {
        match base::linux::wait_for_pid(-1, 0) {
            Err(e) => panic!("error waiting for child process to complete: {:#}", e),
            Ok((Some(pid), wait_status)) => match devices_jails.remove_entry(&pid) {
                Some((_, info)) => {
                    if let Some(status) = wait_status.code() {
                        info!(
                            "process for device {} (PID {}) exited with code {}",
                            &info.name, pid, status
                        );
                    } else if let Some(signal) = wait_status.signal() {
                        warn!(
                            "process for device {} (PID {}) has been killed by signal {:?}",
                            &info.name, pid, signal,
                        );
                    }
                }
                None => error!("pid {} is not one of our device processes", pid),
            },
            // `wait_for_pid` will necessarily return a PID because we asked to it wait for one to
            // complete.
            Ok((None, _)) => unreachable!(),
        }
    }

    info!("all device processes have exited");

    Ok(())
}
4982 
/// Setup crash reporting for a process. Each process MUST provide a unique `product_type` to avoid
/// making crash reports incomprehensible.
#[cfg(feature = "crash-report")]
pub fn setup_emulator_crash_reporting(_cfg: &Config) -> anyhow::Result<String> {
    // The emulator process always reports under the "emulator" product type;
    // all other attributes are left unset.
    let attributes = crash_report::CrashReportAttributes {
        product_type: "emulator".to_owned(),
        pipe_name: None,
        report_uuid: None,
        product_name: None,
        product_version: None,
    };
    crash_report::setup_crash_reporting(attributes)
}
4995 
#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use super::*;

    /// Builds a `FileBackedMappingParameters` covering `addr..addr + len`,
    /// with the path empty and all other fields at their default values.
    fn test_file_backed_mapping(addr: u64, len: u64) -> FileBackedMappingParameters {
        FileBackedMappingParameters {
            address: addr,
            size: len,
            path: PathBuf::new(),
            offset: 0,
            writable: false,
            sync: false,
            align: false,
        }
    }

    #[test]
    fn guest_mem_file_backed_mappings_overlap() {
        // Every case starts from the same two-region guest memory layout:
        // a low region covering [0, 0xD000_0000) and a high region covering
        // [0x1_0000_0000, 0x1_0008_0000).
        let base_layout = || {
            vec![
                (GuestAddress(0), 0xD000_0000, Default::default()),
                (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
            ]
        };

        // Base case: no file mappings; output layout should be identical.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(base_layout(), &[]),
            base_layout()
        );

        // File mapping that does not overlap guest memory leaves the layout
        // untouched.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                base_layout(),
                &[test_file_backed_mapping(0xD000_0000, 0x1000)]
            ),
            base_layout()
        );

        // File mapping at the start of the low address space region trims the
        // region's front.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                base_layout(),
                &[test_file_backed_mapping(0, 0x2000)]
            ),
            vec![
                (
                    GuestAddress(0x2000),
                    0xD000_0000 - 0x2000,
                    Default::default()
                ),
                (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
            ]
        );

        // File mapping at the end of the low address space region trims the
        // region's tail.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                base_layout(),
                &[test_file_backed_mapping(0xD000_0000 - 0x2000, 0x2000)]
            ),
            vec![
                (GuestAddress(0), 0xD000_0000 - 0x2000, Default::default()),
                (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
            ]
        );

        // File mapping fully contained within the middle of the low address
        // space region splits it in two.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                base_layout(),
                &[test_file_backed_mapping(0x1000, 0x2000)]
            ),
            vec![
                (GuestAddress(0), 0x1000, Default::default()),
                (
                    GuestAddress(0x3000),
                    0xD000_0000 - 0x3000,
                    Default::default()
                ),
                (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
            ]
        );

        // File mapping at the start of the high address space region.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                base_layout(),
                &[test_file_backed_mapping(0x1_0000_0000, 0x2000)]
            ),
            vec![
                (GuestAddress(0), 0xD000_0000, Default::default()),
                (
                    GuestAddress(0x1_0000_2000),
                    0x8_0000 - 0x2000,
                    Default::default()
                ),
            ]
        );

        // File mapping at the end of the high address space region.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                base_layout(),
                &[test_file_backed_mapping(0x1_0008_0000 - 0x2000, 0x2000)]
            ),
            vec![
                (GuestAddress(0), 0xD000_0000, Default::default()),
                (
                    GuestAddress(0x1_0000_0000),
                    0x8_0000 - 0x2000,
                    Default::default()
                ),
            ]
        );

        // File mapping fully contained within the middle of the high address
        // space region splits it in two.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                base_layout(),
                &[test_file_backed_mapping(0x1_0000_1000, 0x2000)]
            ),
            vec![
                (GuestAddress(0), 0xD000_0000, Default::default()),
                (GuestAddress(0x1_0000_0000), 0x1000, Default::default()),
                (
                    GuestAddress(0x1_0000_3000),
                    0x8_0000 - 0x3000,
                    Default::default()
                ),
            ]
        );

        // File mapping overlapping two guest memory regions: the low region's
        // tail is trimmed and the high region's front is trimmed.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                base_layout(),
                &[test_file_backed_mapping(0xA000_0000, 0x60002000)]
            ),
            vec![
                (GuestAddress(0), 0xA000_0000, Default::default()),
                (
                    GuestAddress(0x1_0000_2000),
                    0x8_0000 - 0x2000,
                    Default::default()
                ),
            ]
        );
    }
}
5180