1 // Copyright 2022 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #[cfg(target_os = "android")]
6 mod android;
7 pub mod cmdline;
8 pub mod config;
9 mod device_helpers;
10 pub(crate) mod ext2;
11 #[cfg(feature = "gpu")]
12 pub(crate) mod gpu;
13 #[cfg(feature = "pci-hotplug")]
14 pub(crate) mod jail_warden;
15 #[cfg(feature = "pci-hotplug")]
16 pub(crate) mod pci_hotplug_helpers;
17 #[cfg(feature = "pci-hotplug")]
18 pub(crate) mod pci_hotplug_manager;
19 mod vcpu;
20
21 #[cfg(all(feature = "pvclock", target_arch = "aarch64"))]
22 use std::arch::asm;
23 use std::cmp::max;
24 use std::collections::BTreeMap;
25 use std::collections::BTreeSet;
26 #[cfg(feature = "registered_events")]
27 use std::collections::HashMap;
28 #[cfg(feature = "registered_events")]
29 use std::collections::HashSet;
30 use std::convert::TryInto;
31 use std::ffi::CString;
32 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
33 use std::fs::create_dir_all;
34 use std::fs::File;
35 use std::fs::OpenOptions;
36 #[cfg(feature = "registered_events")]
37 use std::hash::Hash;
38 use std::io::stdin;
39 use std::iter;
40 use std::mem;
41 #[cfg(target_arch = "x86_64")]
42 use std::ops::RangeInclusive;
43 use std::os::unix::prelude::OpenOptionsExt;
44 use std::os::unix::process::ExitStatusExt;
45 use std::path::Path;
46 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
47 use std::path::PathBuf;
48 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
49 use std::process;
50 #[cfg(feature = "registered_events")]
51 use std::rc::Rc;
52 use std::sync::mpsc;
53 use std::sync::Arc;
54 use std::sync::Barrier;
55 use std::thread::JoinHandle;
56
57 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
58 use aarch64::AArch64 as Arch;
59 use acpi_tables::sdt::SDT;
60 use anyhow::anyhow;
61 use anyhow::bail;
62 use anyhow::Context;
63 use anyhow::Result;
64 use arch::DtbOverlay;
65 use arch::IrqChipArch;
66 use arch::LinuxArch;
67 use arch::RunnableLinuxVm;
68 use arch::VcpuAffinity;
69 use arch::VcpuArch;
70 use arch::VirtioDeviceStub;
71 use arch::VmArch;
72 use arch::VmComponents;
73 use arch::VmImage;
74 use argh::FromArgs;
75 use base::ReadNotifier;
76 #[cfg(feature = "balloon")]
77 use base::UnixSeqpacket;
78 use base::UnixSeqpacketListener;
79 use base::UnlinkUnixSeqpacketListener;
80 use base::*;
81 use cros_async::Executor;
82 use device_helpers::*;
83 use devices::create_devices_worker_thread;
84 use devices::serial_device::SerialHardware;
85 #[cfg(all(feature = "pvclock", target_arch = "x86_64"))]
86 use devices::tsc::get_tsc_sync_mitigations;
87 use devices::vfio::VfioContainerManager;
88 #[cfg(feature = "gpu")]
89 use devices::virtio;
90 #[cfg(any(feature = "video-decoder", feature = "video-encoder"))]
91 use devices::virtio::device_constants::video::VideoDeviceType;
92 #[cfg(feature = "gpu")]
93 use devices::virtio::gpu::EventDevice;
94 #[cfg(target_arch = "x86_64")]
95 use devices::virtio::memory_mapper::MemoryMapper;
96 use devices::virtio::memory_mapper::MemoryMapperTrait;
97 use devices::virtio::vhost::user::VhostUserConnectionTrait;
98 use devices::virtio::vhost::user::VhostUserListener;
99 #[cfg(feature = "balloon")]
100 use devices::virtio::BalloonFeatures;
101 #[cfg(feature = "pci-hotplug")]
102 use devices::virtio::NetParameters;
103 #[cfg(feature = "pci-hotplug")]
104 use devices::virtio::NetParametersMode;
105 use devices::virtio::VirtioDevice;
106 use devices::virtio::VirtioDeviceType;
107 use devices::virtio::VirtioTransportType;
108 use devices::Bus;
109 use devices::BusDeviceObj;
110 use devices::BusType;
111 use devices::CoIommuDev;
112 #[cfg(feature = "usb")]
113 use devices::DeviceProvider;
114 #[cfg(target_arch = "x86_64")]
115 use devices::HotPlugBus;
116 #[cfg(target_arch = "x86_64")]
117 use devices::HotPlugKey;
118 use devices::IommuDevType;
119 use devices::IrqEventIndex;
120 use devices::IrqEventSource;
121 #[cfg(feature = "pci-hotplug")]
122 use devices::NetResourceCarrier;
123 #[cfg(target_arch = "x86_64")]
124 use devices::PciAddress;
125 #[cfg(target_arch = "x86_64")]
126 use devices::PciBridge;
127 use devices::PciDevice;
128 #[cfg(target_arch = "x86_64")]
129 use devices::PciMmioMapper;
130 #[cfg(target_arch = "x86_64")]
131 use devices::PciRoot;
132 #[cfg(target_arch = "x86_64")]
133 use devices::PciRootCommand;
134 #[cfg(target_arch = "x86_64")]
135 use devices::PcieDownstreamPort;
136 #[cfg(target_arch = "x86_64")]
137 use devices::PcieHostPort;
138 #[cfg(target_arch = "x86_64")]
139 use devices::PcieRootPort;
140 #[cfg(target_arch = "x86_64")]
141 use devices::PcieUpstreamPort;
142 use devices::PvPanicCode;
143 use devices::PvPanicPciDevice;
144 #[cfg(feature = "pci-hotplug")]
145 use devices::ResourceCarrier;
146 use devices::StubPciDevice;
147 use devices::VirtioMmioDevice;
148 use devices::VirtioPciDevice;
149 #[cfg(feature = "usb")]
150 use devices::XhciController;
151 #[cfg(feature = "gpu")]
152 use gpu::*;
153 #[cfg(target_arch = "riscv64")]
154 use hypervisor::CpuConfigRiscv64;
155 #[cfg(target_arch = "x86_64")]
156 use hypervisor::CpuConfigX86_64;
157 use hypervisor::Hypervisor;
158 use hypervisor::HypervisorCap;
159 use hypervisor::MemCacheType;
160 use hypervisor::ProtectionType;
161 use hypervisor::Vm;
162 use hypervisor::VmCap;
163 use jail::*;
164 #[cfg(feature = "pci-hotplug")]
165 use jail_warden::JailWarden;
166 #[cfg(feature = "pci-hotplug")]
167 use jail_warden::JailWardenImpl;
168 #[cfg(feature = "pci-hotplug")]
169 use jail_warden::PermissiveJailWarden;
170 use libc;
171 use metrics::MetricsController;
172 use minijail::Minijail;
173 #[cfg(feature = "pci-hotplug")]
174 use pci_hotplug_manager::PciHotPlugManager;
175 use resources::AddressRange;
176 use resources::Alloc;
177 use resources::SystemAllocator;
178 #[cfg(target_arch = "riscv64")]
179 use riscv64::Riscv64 as Arch;
180 use rutabaga_gfx::RutabagaGralloc;
181 use rutabaga_gfx::RutabagaGrallocBackendFlags;
182 use smallvec::SmallVec;
183 #[cfg(feature = "swap")]
184 use swap::SwapController;
185 use sync::Condvar;
186 use sync::Mutex;
187 use vm_control::api::VmMemoryClient;
188 use vm_control::*;
189 use vm_memory::GuestAddress;
190 use vm_memory::GuestMemory;
191 use vm_memory::MemoryPolicy;
192 use vm_memory::MemoryRegionOptions;
193 #[cfg(target_arch = "x86_64")]
194 use x86_64::X8664arch as Arch;
195
196 use crate::crosvm::config::Config;
197 use crate::crosvm::config::Executable;
198 use crate::crosvm::config::FileBackedMappingParameters;
199 use crate::crosvm::config::HypervisorKind;
200 use crate::crosvm::config::InputDeviceOption;
201 use crate::crosvm::config::IrqChipKind;
202 use crate::crosvm::config::DEFAULT_TOUCH_DEVICE_HEIGHT;
203 use crate::crosvm::config::DEFAULT_TOUCH_DEVICE_WIDTH;
204 #[cfg(feature = "gdb")]
205 use crate::crosvm::gdb::gdb_thread;
206 #[cfg(feature = "gdb")]
207 use crate::crosvm::gdb::GdbStub;
208 #[cfg(target_arch = "x86_64")]
209 use crate::crosvm::ratelimit::Ratelimit;
210 use crate::crosvm::sys::cmdline::DevicesCommand;
211 use crate::crosvm::sys::config::SharedDir;
212 use crate::crosvm::sys::config::SharedDirKind;
213 use crate::crosvm::sys::platform::vcpu::VcpuPidTid;
214
// Device-node paths for the hypervisor backends crosvm can open on Linux.
// KVM is always available as a backend on this platform.
const KVM_PATH: &str = "/dev/kvm";
// GenieZone (MediaTek) hypervisor node; only meaningful on ARM builds with the
// "geniezone" feature enabled.
#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
#[cfg(feature = "geniezone")]
const GENIEZONE_PATH: &str = "/dev/gzvm";
// Gunyah (Qualcomm) hypervisor node; only meaningful on ARM builds with the
// "gunyah" feature enabled.
#[cfg(all(any(target_arch = "arm", target_arch = "aarch64"), feature = "gunyah"))]
static GUNYAH_PATH: &str = "/dev/gunyah";
221
/// Builds the full list of virtio devices requested by `cfg`.
///
/// Each device is returned as a [`VirtioDeviceStub`] (device plus optional
/// minijail) for the caller to wrap in a PCI or MMIO transport. Host-side
/// control tubes created along the way are handed to `add_control_tube` so the
/// main loop can service them.
///
/// * `vm_evt_wrtube` - tube for devices to report VM-level events (used by GPU).
/// * `balloon_inflate_tube` - pre-created inflate tube when coiommu is in use.
/// * `worker_process_pids` - collects PIDs of helper processes (e.g. pmem-ext2
///   mkfs workers) that are expected to exit cleanly.
/// * `render_server_fd` - already-open connection to the GPU render server.
/// * `has_vfio_gfx_device` - whether a VFIO graphics device was passed through,
///   which the GPU device needs to know about.
/// * `registered_evt_q` - tube for registered-event notifications (balloon).
fn create_virtio_devices(
    cfg: &Config,
    vm: &mut impl VmArch,
    resources: &mut SystemAllocator,
    add_control_tube: &mut impl FnMut(AnyControlTube),
    #[cfg_attr(not(feature = "gpu"), allow(unused_variables))] vm_evt_wrtube: &SendTube,
    #[cfg(feature = "balloon")] balloon_inflate_tube: Option<Tube>,
    worker_process_pids: &mut BTreeSet<Pid>,
    #[cfg(feature = "gpu")] render_server_fd: Option<SafeDescriptor>,
    #[cfg(feature = "gpu")] has_vfio_gfx_device: bool,
    #[cfg(feature = "registered_events")] registered_evt_q: &SendTube,
) -> DeviceResult<Vec<VirtioDeviceStub>> {
    let mut devs = Vec::new();

    // Tubes that let wayland/video devices exchange resources with the GPU
    // device; collected here and handed to create_gpu_device() below.
    #[cfg(any(feature = "gpu", feature = "video-decoder", feature = "video-encoder"))]
    let mut resource_bridges = Vec::<Tube>::new();

    if !cfg.wayland_socket_paths.is_empty() {
        #[cfg_attr(not(feature = "gpu"), allow(unused_mut))]
        let mut wl_resource_bridge = None::<Tube>;

        #[cfg(feature = "gpu")]
        {
            // Only wire a wayland<->gpu bridge when a GPU device will exist.
            if cfg.gpu_parameters.is_some() {
                let (wl_socket, gpu_socket) = Tube::pair().context("failed to create tube")?;
                resource_bridges.push(gpu_socket);
                wl_resource_bridge = Some(wl_socket);
            }
        }

        devs.push(create_wayland_device(
            cfg.protection_type,
            &cfg.jail_config,
            &cfg.wayland_socket_paths,
            wl_resource_bridge,
        )?);
    }

    // One (tube, backend) pair per configured video decoder; the GPU side of
    // each tube becomes a resource bridge.
    #[cfg(feature = "video-decoder")]
    let video_dec_cfg = cfg
        .video_dec
        .iter()
        .map(|config| {
            let (video_tube, gpu_tube) =
                Tube::pair().expect("failed to create tube for video decoder");
            resource_bridges.push(gpu_tube);
            (video_tube, config.backend)
        })
        .collect::<Vec<_>>();

    // Same wiring for video encoders.
    #[cfg(feature = "video-encoder")]
    let video_enc_cfg = cfg
        .video_enc
        .iter()
        .map(|config| {
            let (video_tube, gpu_tube) =
                Tube::pair().expect("failed to create tube for video encoder");
            resource_bridges.push(gpu_tube);
            (video_tube, config.backend)
        })
        .collect::<Vec<_>>();

    #[cfg(feature = "gpu")]
    {
        if let Some(gpu_parameters) = &cfg.gpu_parameters {
            // Event devices forward host display input events into the guest.
            let mut event_devices = Vec::new();
            if cfg.display_window_mouse {
                let display_param = if gpu_parameters.display_params.is_empty() {
                    Default::default()
                } else {
                    gpu_parameters.display_params[0].clone()
                };
                let (gpu_display_w, gpu_display_h) = display_param.get_virtual_display_size();

                let (event_device_socket, virtio_dev_socket) =
                    StreamChannel::pair(BlockingMode::Nonblocking, FramingMode::Byte)
                        .context("failed to create socket")?;
                // Default the touch geometry to the display size, but let the
                // first configured multi-touch input override it.
                let mut multi_touch_width = gpu_display_w;
                let mut multi_touch_height = gpu_display_h;
                let mut multi_touch_name = None;
                for input in &cfg.virtio_input {
                    if let InputDeviceOption::MultiTouch {
                        width,
                        height,
                        name,
                        ..
                    } = input
                    {
                        if let Some(width) = width {
                            multi_touch_width = *width;
                        }
                        if let Some(height) = height {
                            multi_touch_height = *height;
                        }
                        if let Some(name) = name {
                            multi_touch_name = Some(name.as_str());
                        }
                        break;
                    }
                }
                let dev = virtio::input::new_multi_touch(
                    // u32::MAX is the least likely to collide with the indices generated above for
                    // the multi_touch options, which begin at 0.
                    u32::MAX,
                    virtio_dev_socket,
                    multi_touch_width,
                    multi_touch_height,
                    multi_touch_name,
                    virtio::base_features(cfg.protection_type),
                )
                .context("failed to set up mouse device")?;
                devs.push(VirtioDeviceStub {
                    dev: Box::new(dev),
                    jail: simple_jail(&cfg.jail_config, "input_device")?,
                });
                event_devices.push(EventDevice::touchscreen(event_device_socket));
            }
            if cfg.display_window_keyboard {
                let (event_device_socket, virtio_dev_socket) =
                    StreamChannel::pair(BlockingMode::Nonblocking, FramingMode::Byte)
                        .context("failed to create socket")?;
                let dev = virtio::input::new_keyboard(
                    // u32::MAX is the least likely to collide with the indices generated above for
                    // the multi_touch options, which begin at 0.
                    u32::MAX,
                    virtio_dev_socket,
                    virtio::base_features(cfg.protection_type),
                )
                .context("failed to set up keyboard device")?;
                devs.push(VirtioDeviceStub {
                    dev: Box::new(dev),
                    jail: simple_jail(&cfg.jail_config, "input_device")?,
                });
                event_devices.push(EventDevice::keyboard(event_device_socket));
            }

            // The GPU device consumes all resource bridges plus the event
            // devices created above.
            let (gpu_control_host_tube, gpu_control_device_tube) =
                Tube::pair().context("failed to create gpu tube")?;
            add_control_tube(DeviceControlTube::Gpu(gpu_control_host_tube).into());
            devs.push(create_gpu_device(
                cfg,
                vm_evt_wrtube,
                gpu_control_device_tube,
                resource_bridges,
                render_server_fd,
                has_vfio_gfx_device,
                event_devices,
            )?);
        }
    }

    // Serial ports configured as virtio-console devices.
    for (_, param) in cfg
        .serial_parameters
        .iter()
        .filter(|(_k, v)| v.hardware == SerialHardware::VirtioConsole)
    {
        let dev = param.create_virtio_device_and_jail(cfg.protection_type, &cfg.jail_config)?;
        devs.push(dev);
    }

    // Block devices; each gets a control tube for runtime disk commands.
    for disk in &cfg.disks {
        let (disk_host_tube, disk_device_tube) = Tube::pair().context("failed to create tube")?;
        add_control_tube(DeviceControlTube::Disk(disk_host_tube).into());
        let disk_config = DiskConfig::new(disk, Some(disk_device_tube));
        devs.push(
            disk_config.create_virtio_device_and_jail(cfg.protection_type, &cfg.jail_config)?,
        );
    }

    // All SCSI disks share a single virtio-scsi controller.
    if !cfg.scsis.is_empty() {
        let scsi_config = ScsiConfig(&cfg.scsis);
        devs.push(
            scsi_config.create_virtio_device_and_jail(cfg.protection_type, &cfg.jail_config)?,
        );
    }

    for (index, pmem_disk) in cfg.pmems.iter().enumerate() {
        let (pmem_host_tube, pmem_device_tube) = Tube::pair().context("failed to create tube")?;
        add_control_tube(TaggedControlTube::VmMsync(pmem_host_tube).into());
        devs.push(create_pmem_device(
            cfg.protection_type,
            &cfg.jail_config,
            vm,
            resources,
            pmem_disk,
            index,
            pmem_device_tube,
        )?);
    }

    for (index, pmem_ext2) in cfg.pmem_ext2.iter().enumerate() {
        // Prepare a `VmMemoryClient` for pmem-ext2 device to send a request for mmap() and memory
        // registration.
        let (pmem_ext2_host_tube, pmem_ext2_device_tube) =
            Tube::pair().context("failed to create tube")?;
        let vm_memory_client = VmMemoryClient::new(pmem_ext2_device_tube);
        add_control_tube(
            VmMemoryTube {
                tube: pmem_ext2_host_tube,
                expose_with_viommu: false,
            }
            .into(),
        );
        let (pmem_host_tube, pmem_device_tube) = Tube::pair().context("failed to create tube")?;
        add_control_tube(TaggedControTube::VmMsync(pmem_host_tube).into());
        devs.push(create_pmem_ext2_device(
            cfg.protection_type,
            &cfg.jail_config,
            resources,
            pmem_ext2,
            index,
            vm_memory_client,
            pmem_device_tube,
            worker_process_pids,
        )?);
    }

    if cfg.rng {
        devs.push(create_rng_device(cfg.protection_type, &cfg.jail_config)?);
    }

    #[cfg(feature = "pvclock")]
    if cfg.pvclock {
        // pvclock gets a tube for handling suspend/resume requests from the main thread.
        let (host_suspend_tube, suspend_tube) = Tube::pair().context("failed to create tube")?;
        add_control_tube(DeviceControlTube::PvClock(host_suspend_tube).into());

        // Clock frequency reported to the guest; source is arch-specific.
        let frequency: u64;
        #[cfg(target_arch = "x86_64")]
        {
            let tsc_state = devices::tsc::tsc_state()?;
            let tsc_sync_mitigations =
                get_tsc_sync_mitigations(&tsc_state, cfg.vcpu_count.unwrap_or(1));
            if tsc_state.core_grouping.size() > 1 {
                // Host TSCs are not in sync. Log what mitigations are applied.
                warn!(
                    "Host TSCs are not in sync, applying the following mitigations: {:?}",
                    tsc_sync_mitigations
                );
            }
            frequency = tsc_state.frequency;
        }
        #[cfg(target_arch = "aarch64")]
        {
            let mut x: u64;
            // SAFETY: This instruction has no side effect apart from storing the current timestamp
            // frequency into the specified register.
            unsafe {
                asm!("mrs {x}, cntfrq_el0",
                    x = out(reg) x,
                );
            }
            frequency = x;

            // If unset, KVM defaults to an offset that is calculated from VM boot time. Explicitly
            // set it to zero on boot. When updating the offset, we always set it to the total
            // amount of time the VM has been suspended.
            vm.set_counter_offset(0)
                .context("failed to set up pvclock")?;
        }
        let dev = create_pvclock_device(
            cfg.protection_type,
            &cfg.jail_config,
            frequency,
            suspend_tube,
        )?;
        devs.push(dev);
        info!("virtio-pvclock is enabled for this vm");
    }

    #[cfg(feature = "vtpm")]
    {
        if cfg.vtpm_proxy {
            devs.push(create_vtpm_proxy_device(
                cfg.protection_type,
                &cfg.jail_config,
            )?);
        }
    }

    // Per-kind counters so each input device of a given kind gets a unique,
    // stable index (used for naming/identification).
    let mut keyboard_idx = 0;
    let mut mouse_idx = 0;
    let mut rotary_idx = 0;
    let mut switches_idx = 0;
    let mut multi_touch_idx = 0;
    let mut single_touch_idx = 0;
    let mut trackpad_idx = 0;
    let mut multi_touch_trackpad_idx = 0;
    let mut custom_idx = 0;
    for input in &cfg.virtio_input {
        let input_dev = match input {
            InputDeviceOption::Evdev { path } => {
                create_vinput_device(cfg.protection_type, &cfg.jail_config, path.as_path())?
            }
            InputDeviceOption::Keyboard { path } => {
                let dev = create_keyboard_device(
                    cfg.protection_type,
                    &cfg.jail_config,
                    path.as_path(),
                    keyboard_idx,
                )?;
                keyboard_idx += 1;
                dev
            }
            InputDeviceOption::Mouse { path } => {
                let dev = create_mouse_device(
                    cfg.protection_type,
                    &cfg.jail_config,
                    path.as_path(),
                    mouse_idx,
                )?;
                mouse_idx += 1;
                dev
            }
            InputDeviceOption::MultiTouch {
                path,
                width,
                height,
                name,
            } => {
                let mut width = *width;
                let mut height = *height;
                // Only the first multi-touch device inherits the display input
                // size as its default geometry.
                if multi_touch_idx == 0 {
                    if width.is_none() {
                        width = cfg.display_input_width;
                    }
                    if height.is_none() {
                        height = cfg.display_input_height;
                    }
                }
                let dev = create_multi_touch_device(
                    cfg.protection_type,
                    &cfg.jail_config,
                    path.as_path(),
                    width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
                    height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
                    name.as_deref(),
                    multi_touch_idx,
                )?;
                multi_touch_idx += 1;
                dev
            }
            InputDeviceOption::Rotary { path } => {
                let dev = create_rotary_device(
                    cfg.protection_type,
                    &cfg.jail_config,
                    path.as_path(),
                    rotary_idx,
                )?;
                rotary_idx += 1;
                dev
            }
            InputDeviceOption::SingleTouch {
                path,
                width,
                height,
                name,
            } => {
                let mut width = *width;
                let mut height = *height;
                // As with multi-touch: only the first device defaults to the
                // display input size.
                if single_touch_idx == 0 {
                    if width.is_none() {
                        width = cfg.display_input_width;
                    }
                    if height.is_none() {
                        height = cfg.display_input_height;
                    }
                }
                let dev = create_single_touch_device(
                    cfg.protection_type,
                    &cfg.jail_config,
                    path.as_path(),
                    width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
                    height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
                    name.as_deref(),
                    single_touch_idx,
                )?;
                single_touch_idx += 1;
                dev
            }
            InputDeviceOption::Switches { path } => {
                let dev = create_switches_device(
                    cfg.protection_type,
                    &cfg.jail_config,
                    path.as_path(),
                    switches_idx,
                )?;
                switches_idx += 1;
                dev
            }
            InputDeviceOption::Trackpad {
                path,
                width,
                height,
                name,
            } => {
                let dev = create_trackpad_device(
                    cfg.protection_type,
                    &cfg.jail_config,
                    path.as_path(),
                    width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
                    height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
                    name.as_deref(),
                    trackpad_idx,
                )?;
                trackpad_idx += 1;
                dev
            }
            InputDeviceOption::MultiTouchTrackpad {
                path,
                width,
                height,
                name,
            } => {
                let dev = create_multitouch_trackpad_device(
                    cfg.protection_type,
                    &cfg.jail_config,
                    path.as_path(),
                    width.unwrap_or(DEFAULT_TOUCH_DEVICE_WIDTH),
                    height.unwrap_or(DEFAULT_TOUCH_DEVICE_HEIGHT),
                    name.as_deref(),
                    multi_touch_trackpad_idx,
                )?;
                multi_touch_trackpad_idx += 1;
                dev
            }
            InputDeviceOption::Custom { path, config_path } => {
                let dev = create_custom_device(
                    cfg.protection_type,
                    &cfg.jail_config,
                    path.as_path(),
                    custom_idx,
                    config_path.clone(),
                )?;
                custom_idx += 1;
                dev
            }
        };
        devs.push(input_dev);
    }

    #[cfg(feature = "balloon")]
    if cfg.balloon {
        // Either connect to an external balloon control socket, or create an
        // internal tube pair serviced by the main process.
        let balloon_device_tube = if let Some(ref path) = cfg.balloon_control {
            Tube::new_from_unix_seqpacket(UnixSeqpacket::connect(path).with_context(|| {
                format!(
                    "failed to connect to balloon control socket {}",
                    path.display(),
                )
            })?)?
        } else {
            // Balloon gets a special socket so balloon requests can be forwarded
            // from the main process.
            let (host, device) = Tube::pair().context("failed to create tube")?;
            add_control_tube(DeviceControlTube::Balloon(host).into());
            device
        };

        // Build the feature bitmask from the enabled reporting options.
        let balloon_features = (cfg.balloon_page_reporting as u64)
            << BalloonFeatures::PageReporting as u64
            | (cfg.balloon_ws_reporting as u64) << BalloonFeatures::WSReporting as u64;

        let init_balloon_size = if let Some(init_memory) = cfg.init_memory {
            // init_memory is in MiB; convert to bytes for the comparison.
            let init_memory_bytes = init_memory.saturating_mul(1024 * 1024);
            let total_memory_bytes = vm.get_memory().memory_size();

            if init_memory_bytes > total_memory_bytes {
                bail!(
                    "initial memory {} cannot be greater than total memory {}",
                    init_memory,
                    total_memory_bytes / (1024 * 1024),
                );
            }

            // The initial balloon size is the total memory size minus the initial memory size.
            total_memory_bytes - init_memory_bytes
        } else {
            // No --init-mem specified; start with balloon completely deflated.
            0
        };

        devs.push(create_balloon_device(
            cfg.protection_type,
            &cfg.jail_config,
            balloon_device_tube,
            balloon_inflate_tube,
            init_balloon_size,
            balloon_features,
            #[cfg(feature = "registered_events")]
            Some(
                registered_evt_q
                    .try_clone()
                    .context("failed to clone registered_evt_q tube")?,
            ),
            cfg.balloon_ws_num_bins,
        )?);
    }

    #[cfg(feature = "net")]
    for opt in &cfg.net {
        let dev = opt.create_virtio_device_and_jail(cfg.protection_type, &cfg.jail_config)?;
        devs.push(dev);
    }

    #[cfg(feature = "audio")]
    {
        // Each virtio-snd instance is assigned its card index by position.
        for (card_index, virtio_snd) in cfg.virtio_snds.iter().enumerate() {
            let mut snd_params = virtio_snd.clone();
            snd_params.card_index = card_index;
            devs.push(create_virtio_snd_device(
                cfg.protection_type,
                &cfg.jail_config,
                snd_params,
            )?);
        }
    }

    #[cfg(any(target_os = "android", target_os = "linux"))]
    #[cfg(feature = "media")]
    {
        for v4l2_device in &cfg.v4l2_proxy {
            devs.push(create_v4l2_device(cfg.protection_type, v4l2_device)?);
        }
    }

    #[cfg(feature = "media")]
    if cfg.simple_media_device {
        devs.push(create_simple_media_device(cfg.protection_type)?);
    }

    #[cfg(feature = "video-decoder")]
    {
        for (tube, backend) in video_dec_cfg {
            register_video_device(
                backend,
                &mut devs,
                tube,
                cfg.protection_type,
                &cfg.jail_config,
                VideoDeviceType::Decoder,
            )?;
        }
    }

    #[cfg(feature = "video-encoder")]
    {
        for (tube, backend) in video_enc_cfg {
            register_video_device(
                backend,
                &mut devs,
                tube,
                cfg.protection_type,
                &cfg.jail_config,
                VideoDeviceType::Encoder,
            )?;
        }
    }

    if let Some(vsock_config) = &cfg.vsock {
        devs.push(
            vsock_config.create_virtio_device_and_jail(cfg.protection_type, &cfg.jail_config)?,
        );
    }

    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    {
        if cfg.vhost_scmi {
            devs.push(create_vhost_scmi_device(
                cfg.protection_type,
                &cfg.jail_config,
                cfg.vhost_scmi_device.clone(),
            )?);
        }
    }
    for vhost_user_fs in &cfg.vhost_user_fs {
        devs.push(create_vhost_user_fs_device(
            cfg.protection_type,
            vhost_user_fs,
        )?);
    }

    // Shared directories become either virtio-fs or 9p devices.
    for shared_dir in &cfg.shared_dirs {
        let SharedDir {
            src,
            tag,
            kind,
            ugid,
            uid_map,
            gid_map,
            fs_cfg,
            p9_cfg,
        } = shared_dir;

        let dev = match kind {
            SharedDirKind::FS => {
                let (host_tube, device_tube) = Tube::pair().context("failed to create tube")?;
                add_control_tube(TaggedControlTube::Fs(host_tube).into());

                create_fs_device(
                    cfg.protection_type,
                    &cfg.jail_config,
                    *ugid,
                    uid_map,
                    gid_map,
                    src,
                    tag,
                    fs_cfg.clone(),
                    device_tube,
                )?
            }
            SharedDirKind::P9 => create_9p_device(
                cfg.protection_type,
                &cfg.jail_config,
                *ugid,
                uid_map,
                gid_map,
                src,
                tag,
                p9_cfg.clone(),
            )?,
        };
        devs.push(dev);
    }

    #[cfg(feature = "audio")]
    if let Some(path) = &cfg.sound {
        devs.push(create_sound_device(
            path,
            cfg.protection_type,
            &cfg.jail_config,
        )?);
    }

    // Frontends for externally-hosted vhost-user device backends.
    for opt in &cfg.vhost_user {
        devs.push(create_vhost_user_frontend(
            cfg.protection_type,
            opt,
            cfg.vhost_user_connect_timeout_ms,
        )?);
    }

    Ok(devs)
}
865
/// Builds every bus device for the VM: VFIO passthrough devices, the coiommu
/// device (if any endpoints need it), all virtio devices (via
/// [`create_virtio_devices`]) wrapped in their PCI/MMIO transports, the xHCI
/// USB controller, stub PCI devices, and the pvpanic device.
///
/// Returns each device paired with its optional minijail. Host-side control
/// tubes are handed to `add_control_tube`; `iommu_attached_endpoints` and
/// `iova_max_addr` are populated from the VFIO devices encountered.
fn create_devices(
    cfg: &Config,
    vm: &mut impl VmArch,
    resources: &mut SystemAllocator,
    add_control_tube: &mut impl FnMut(AnyControlTube),
    vm_evt_wrtube: &SendTube,
    iommu_attached_endpoints: &mut BTreeMap<u32, Arc<Mutex<Box<dyn MemoryMapperTrait>>>>,
    #[cfg(feature = "usb")] usb_provider: DeviceProvider,
    #[cfg(feature = "gpu")] render_server_fd: Option<SafeDescriptor>,
    iova_max_addr: &mut Option<u64>,
    #[cfg(feature = "registered_events")] registered_evt_q: &SendTube,
    vfio_container_manager: &mut VfioContainerManager,
    // Stores a set of PIDs of child processes that are supposed to exit cleanly.
    worker_process_pids: &mut BTreeSet<Pid>,
) -> DeviceResult<Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>> {
    let mut devices: Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)> = Vec::new();
    #[cfg(feature = "balloon")]
    let mut balloon_inflate_tube: Option<Tube> = None;
    #[cfg(feature = "gpu")]
    let mut has_vfio_gfx_device = false;
    if !cfg.vfio.is_empty() {
        let mut coiommu_attached_endpoints = Vec::new();

        for vfio_dev in &cfg.vfio {
            let (dev, jail, viommu_mapper) = create_vfio_device(
                &cfg.jail_config,
                vm,
                resources,
                add_control_tube,
                &vfio_dev.path,
                false,
                None,
                vfio_dev.guest_address,
                Some(&mut coiommu_attached_endpoints),
                vfio_dev.iommu,
                vfio_dev.dt_symbol.clone(),
                vfio_container_manager,
            )?;
            match dev {
                VfioDeviceVariant::Pci(vfio_pci_device) => {
                    // Track the largest IOVA any passthrough device may use.
                    *iova_max_addr = Some(max(
                        vfio_pci_device.get_max_iova(),
                        iova_max_addr.unwrap_or(0),
                    ));

                    #[cfg(feature = "gpu")]
                    if vfio_pci_device.is_gfx() {
                        has_vfio_gfx_device = true;
                    }

                    if let Some(viommu_mapper) = viommu_mapper {
                        iommu_attached_endpoints.insert(
                            vfio_pci_device
                                .pci_address()
                                .context("not initialized")?
                                .to_u32(),
                            Arc::new(Mutex::new(Box::new(viommu_mapper))),
                        );
                    }

                    devices.push((Box::new(vfio_pci_device), jail));
                }
                VfioDeviceVariant::Platform(vfio_plat_dev) => {
                    devices.push((Box::new(vfio_plat_dev), jail));
                }
            }
        }

        if !coiommu_attached_endpoints.is_empty() || !iommu_attached_endpoints.is_empty() {
            // IOMMU-attached devices pin guest memory, so raise RLIMIT_MEMLOCK
            // by the full guest memory size to allow the pinning.
            let mut buf = mem::MaybeUninit::<libc::rlimit64>::zeroed();
            // SAFETY: trivially safe
            let res = unsafe { libc::getrlimit64(libc::RLIMIT_MEMLOCK, buf.as_mut_ptr()) };
            if res == 0 {
                // SAFETY: safe because getrlimit64 has returned success.
                let limit = unsafe { buf.assume_init() };
                let rlim_new = limit.rlim_cur.saturating_add(vm.get_memory().memory_size());
                let rlim_max = max(limit.rlim_max, rlim_new);
                if limit.rlim_cur < rlim_new {
                    let limit_arg = libc::rlimit64 {
                        rlim_cur: rlim_new,
                        rlim_max,
                    };
                    // SAFETY: trivially safe
                    let res = unsafe { libc::setrlimit64(libc::RLIMIT_MEMLOCK, &limit_arg) };
                    if res != 0 {
                        bail!("Set rlimit failed");
                    }
                }
            } else {
                bail!("Get rlimit failed");
            }
        }
        // Without the balloon feature the coiommu device simply gets no
        // unpin-coordination tube.
        #[cfg(feature = "balloon")]
        let coiommu_tube: Option<Tube>;
        #[cfg(not(feature = "balloon"))]
        let coiommu_tube: Option<Tube> = None;
        if !coiommu_attached_endpoints.is_empty() {
            let vfio_container = vfio_container_manager
                .get_container(IommuDevType::CoIommu, None as Option<&Path>)
                .context("failed to get vfio container")?;
            let (coiommu_host_tube, coiommu_device_tube) =
                Tube::pair().context("failed to create coiommu tube")?;
            add_control_tube(
                VmMemoryTube {
                    tube: coiommu_host_tube,
                    expose_with_viommu: false,
                }
                .into(),
            );
            let vcpu_count = cfg.vcpu_count.unwrap_or(1) as u64;
            // Pair the coiommu device with the balloon: one end goes to the
            // coiommu, the other becomes the balloon's inflate tube.
            #[cfg(feature = "balloon")]
            match Tube::pair() {
                Ok((x, y)) => {
                    coiommu_tube = Some(x);
                    balloon_inflate_tube = Some(y);
                }
                Err(x) => return Err(x).context("failed to create coiommu tube"),
            }
            let dev = CoIommuDev::new(
                vm.get_memory().clone(),
                vfio_container,
                VmMemoryClient::new(coiommu_device_tube),
                coiommu_tube,
                coiommu_attached_endpoints,
                vcpu_count,
                cfg.coiommu_param.unwrap_or_default(),
            )
            .context("failed to create coiommu device")?;

            devices.push((
                Box::new(dev),
                simple_jail(&cfg.jail_config, "coiommu_device")?,
            ));
        }
    }

    let stubs = create_virtio_devices(
        cfg,
        vm,
        resources,
        add_control_tube,
        vm_evt_wrtube,
        #[cfg(feature = "balloon")]
        balloon_inflate_tube,
        worker_process_pids,
        #[cfg(feature = "gpu")]
        render_server_fd,
        #[cfg(feature = "gpu")]
        has_vfio_gfx_device,
        #[cfg(feature = "registered_events")]
        registered_evt_q,
    )?;

    // Wrap each virtio device stub in its transport (PCI or MMIO).
    for stub in stubs {
        match stub.dev.transport_type() {
            VirtioTransportType::Pci => {
                let (msi_host_tube, msi_device_tube) =
                    Tube::pair().context("failed to create tube")?;
                add_control_tube(AnyControlTube::IrqTube(msi_host_tube));

                // Only devices exposing a shared memory region need a VM memory
                // tube for mapping it.
                let shared_memory_tube = if stub.dev.get_shared_memory_region().is_some() {
                    let (host_tube, device_tube) =
                        Tube::pair().context("failed to create shared memory tube")?;
                    add_control_tube(
                        VmMemoryTube {
                            tube: host_tube,
                            expose_with_viommu: stub.dev.expose_shmem_descriptors_with_viommu(),
                        }
                        .into(),
                    );
                    Some(device_tube)
                } else {
                    None
                };

                let (ioevent_host_tube, ioevent_device_tube) =
                    Tube::pair().context("failed to create ioevent tube")?;
                add_control_tube(
                    VmMemoryTube {
                        tube: ioevent_host_tube,
                        expose_with_viommu: false,
                    }
                    .into(),
                );

                let (host_tube, device_tube) =
                    Tube::pair().context("failed to create device control tube")?;
                add_control_tube(TaggedControlTube::Vm(host_tube).into());

                let dev = VirtioPciDevice::new(
                    vm.get_memory().clone(),
                    stub.dev,
                    msi_device_tube,
                    cfg.disable_virtio_intx,
                    shared_memory_tube.map(VmMemoryClient::new),
                    VmMemoryClient::new(ioevent_device_tube),
                    device_tube,
                )
                .context("failed to create virtio pci dev")?;

                devices.push((Box::new(dev) as Box<dyn BusDeviceObj>, stub.jail));
            }
            VirtioTransportType::Mmio => {
                let dev = VirtioMmioDevice::new(vm.get_memory().clone(), stub.dev, false)
                    .context("failed to create virtio mmio dev")?;
                devices.push((Box::new(dev) as Box<dyn BusDeviceObj>, stub.jail));
            }
        }
    }

    #[cfg(feature = "usb")]
    if cfg.usb {
        // Create xhci controller.
        let usb_controller = Box::new(XhciController::new(
            vm.get_memory().clone(),
            Box::new(usb_provider),
        ));
        devices.push((
            usb_controller,
            simple_jail(&cfg.jail_config, "xhci_device")?,
        ));
    }

    for params in &cfg.stub_pci_devices {
        // Stub devices don't need jailing since they don't do anything.
        devices.push((Box::new(StubPciDevice::new(params)), None));
    }

    // pvpanic lets the guest report panics to the host via the VM event tube.
    devices.push((
        Box::new(PvPanicPciDevice::new(vm_evt_wrtube.try_clone()?)),
        None,
    ));

    Ok(devices)
}
1101
create_file_backed_mappings( cfg: &Config, vm: &mut impl Vm, resources: &mut SystemAllocator, ) -> Result<()>1102 fn create_file_backed_mappings(
1103 cfg: &Config,
1104 vm: &mut impl Vm,
1105 resources: &mut SystemAllocator,
1106 ) -> Result<()> {
1107 for mapping in &cfg.file_backed_mappings {
1108 let file = OpenOptions::new()
1109 .read(true)
1110 .write(mapping.writable)
1111 .custom_flags(if mapping.sync { libc::O_SYNC } else { 0 })
1112 .open(&mapping.path)
1113 .context("failed to open file for file-backed mapping")?;
1114 let prot = if mapping.writable {
1115 Protection::read_write()
1116 } else {
1117 Protection::read()
1118 };
1119 let size = mapping
1120 .size
1121 .try_into()
1122 .context("Invalid size for file-backed mapping")?;
1123 let memory_mapping = MemoryMappingBuilder::new(size)
1124 .from_file(&file)
1125 .offset(mapping.offset)
1126 .protection(prot)
1127 .build()
1128 .context("failed to map backing file for file-backed mapping")?;
1129
1130 let mapping_range = AddressRange::from_start_and_size(mapping.address, mapping.size)
1131 .context("failed to convert to AddressRange")?;
1132 match resources.mmio_allocator_any().allocate_at(
1133 mapping_range,
1134 Alloc::FileBacked(mapping.address),
1135 "file-backed mapping".to_owned(),
1136 ) {
1137 // OutOfSpace just means that this mapping is not in the MMIO regions at all, so don't
1138 // consider it an error.
1139 // TODO(b/222769529): Reserve this region in a global memory address space allocator
1140 // once we have that so nothing else can accidentally overlap with it.
1141 Ok(()) | Err(resources::Error::OutOfSpace) => {}
1142 e => e.context("failed to allocate guest address for file-backed mapping")?,
1143 }
1144
1145 vm.add_memory_region(
1146 GuestAddress(mapping.address),
1147 Box::new(memory_mapping),
1148 !mapping.writable,
1149 /* log_dirty_pages = */ false,
1150 MemCacheType::CacheCoherent,
1151 )
1152 .context("failed to configure file-backed mapping")?;
1153 }
1154
1155 Ok(())
1156 }
1157
#[cfg(target_arch = "x86_64")]
/// Collection of devices related to PCI hotplug.
struct HotPlugStub {
    /// Map from bus index to hotplug bus.
    hotplug_buses: BTreeMap<u8, Arc<Mutex<dyn HotPlugBus>>>,
    /// Bus ranges of devices for virtio-iommu.
    iommu_bus_ranges: Vec<RangeInclusive<u32>>,
    /// Map from gpe index to GpeNotify devices.
    gpe_notify_devs: BTreeMap<u32, Arc<Mutex<dyn GpeNotify>>>,
    /// Map from bus index to PmeNotify devices.
    pme_notify_devs: BTreeMap<u8, Arc<Mutex<dyn PmeNotify>>>,
}
1170
1171 #[cfg(target_arch = "x86_64")]
1172 impl HotPlugStub {
1173 /// Constructs empty HotPlugStub.
new() -> Self1174 fn new() -> Self {
1175 Self {
1176 hotplug_buses: BTreeMap::new(),
1177 iommu_bus_ranges: Vec::new(),
1178 gpe_notify_devs: BTreeMap::new(),
1179 pme_notify_devs: BTreeMap::new(),
1180 }
1181 }
1182 }
1183
1184 #[cfg(target_arch = "x86_64")]
1185 /// Creates PCIE root port with only virtual devices.
1186 ///
1187 /// user doesn't specify host pcie root port which link to this virtual pcie rp,
1188 /// find the empty bus and create a total virtual pcie rp
create_pure_virtual_pcie_root_port( sys_allocator: &mut SystemAllocator, add_control_tube: &mut impl FnMut(AnyControlTube), devices: &mut Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>, hp_bus_count: u8, ) -> Result<HotPlugStub>1189 fn create_pure_virtual_pcie_root_port(
1190 sys_allocator: &mut SystemAllocator,
1191 add_control_tube: &mut impl FnMut(AnyControlTube),
1192 devices: &mut Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>,
1193 hp_bus_count: u8,
1194 ) -> Result<HotPlugStub> {
1195 let mut hp_sec_buses = Vec::new();
1196 let mut hp_stub = HotPlugStub::new();
1197 // Create Pcie Root Port for non-root buses, each non-root bus device will be
1198 // connected behind a virtual pcie root port.
1199 for i in 1..255 {
1200 if sys_allocator.pci_bus_empty(i) {
1201 if hp_sec_buses.len() < hp_bus_count.into() {
1202 hp_sec_buses.push(i);
1203 }
1204 continue;
1205 }
1206 let pcie_root_port = Arc::new(Mutex::new(PcieRootPort::new(i, false)));
1207 hp_stub
1208 .pme_notify_devs
1209 .insert(i, pcie_root_port.clone() as Arc<Mutex<dyn PmeNotify>>);
1210 let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
1211 add_control_tube(AnyControlTube::IrqTube(msi_host_tube));
1212 let pci_bridge = Box::new(PciBridge::new(pcie_root_port.clone(), msi_device_tube));
1213 // no ipc is used if the root port disables hotplug
1214 devices.push((pci_bridge, None));
1215 }
1216
1217 // Create Pcie Root Port for hot-plug
1218 if hp_sec_buses.len() < hp_bus_count.into() {
1219 return Err(anyhow!("no more addresses are available"));
1220 }
1221
1222 for hp_sec_bus in hp_sec_buses {
1223 let pcie_root_port = Arc::new(Mutex::new(PcieRootPort::new(hp_sec_bus, true)));
1224 hp_stub.pme_notify_devs.insert(
1225 hp_sec_bus,
1226 pcie_root_port.clone() as Arc<Mutex<dyn PmeNotify>>,
1227 );
1228 let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
1229 add_control_tube(AnyControlTube::IrqTube(msi_host_tube));
1230 let pci_bridge = Box::new(PciBridge::new(pcie_root_port.clone(), msi_device_tube));
1231
1232 hp_stub.iommu_bus_ranges.push(RangeInclusive::new(
1233 PciAddress {
1234 bus: pci_bridge.get_secondary_num(),
1235 dev: 0,
1236 func: 0,
1237 }
1238 .to_u32(),
1239 PciAddress {
1240 bus: pci_bridge.get_subordinate_num(),
1241 dev: 32,
1242 func: 8,
1243 }
1244 .to_u32(),
1245 ));
1246
1247 devices.push((pci_bridge, None));
1248 hp_stub
1249 .hotplug_buses
1250 .insert(hp_sec_bus, pcie_root_port as Arc<Mutex<dyn HotPlugBus>>);
1251 }
1252 Ok(hp_stub)
1253 }
1254
/// Assembles the `VmComponents` for the VM to be created from `cfg`.
///
/// Opens the kernel/BIOS, initrd, pvmfw, and pflash images, computes the
/// memory and swiotlb sizes, and (on Arm) derives per-vCPU frequency-domain
/// data used by virtual cpufreq.
fn setup_vm_components(cfg: &Config) -> Result<VmComponents> {
    let initrd_image = if let Some(initrd_path) = &cfg.initrd_path {
        Some(
            open_file_or_duplicate(initrd_path, OpenOptions::new().read(true))
                .with_context(|| format!("failed to open initrd {}", initrd_path.display()))?,
        )
    } else {
        None
    };
    let pvm_fw_image = if let Some(pvm_fw_path) = &cfg.pvm_fw {
        Some(
            open_file_or_duplicate(pvm_fw_path, OpenOptions::new().read(true))
                .with_context(|| format!("failed to open pvm_fw {}", pvm_fw_path.display()))?,
        )
    } else {
        None
    };

    // Earlier config validation is expected to guarantee that either a kernel
    // or a BIOS path is present, hence the panic on the fallthrough arm.
    let vm_image = match cfg.executable_path {
        Some(Executable::Kernel(ref kernel_path)) => VmImage::Kernel(
            open_file_or_duplicate(kernel_path, OpenOptions::new().read(true)).with_context(
                || format!("failed to open kernel image {}", kernel_path.display()),
            )?,
        ),
        Some(Executable::Bios(ref bios_path)) => VmImage::Bios(
            open_file_or_duplicate(bios_path, OpenOptions::new().read(true))
                .with_context(|| format!("failed to open bios {}", bios_path.display()))?,
        ),
        _ => panic!("Did not receive a bios or kernel, should be impossible."),
    };

    // swiotlb size: explicit config value (MiB -> bytes), otherwise none for
    // unprotected VMs and a 64 MiB default for protected ones.
    let swiotlb = if let Some(size) = cfg.swiotlb {
        Some(
            size.checked_mul(1024 * 1024)
                .ok_or_else(|| anyhow!("requested swiotlb size too large"))?,
        )
    } else if matches!(cfg.protection_type, ProtectionType::Unprotected) {
        None
    } else {
        Some(64 * 1024 * 1024)
    };

    // pflash is opened read/write; a block size of 0 is used when no pflash is
    // configured.
    let (pflash_image, pflash_block_size) = if let Some(pflash_parameters) = &cfg.pflash_parameters
    {
        (
            Some(
                open_file_or_duplicate(
                    &pflash_parameters.path,
                    OpenOptions::new().read(true).write(true),
                )
                .with_context(|| {
                    format!("failed to open pflash {}", pflash_parameters.path.display())
                })?,
            ),
            pflash_parameters.block_size,
        )
    } else {
        (None, 0)
    };

    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    let mut cpu_frequencies = BTreeMap::new();
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    let mut normalized_cpu_capacities = BTreeMap::new();

    // if --enable-fw-cfg or --fw-cfg was given, we want to enable fw_cfg
    let fw_cfg_enable = cfg.enable_fw_cfg || !cfg.fw_cfg_parameters.is_empty();
    let (cpu_clusters, cpu_capacity) = if cfg.host_cpu_topology {
        (
            Arch::get_host_cpu_clusters()?,
            Arch::get_host_cpu_capacity()?,
        )
    } else {
        (cfg.cpu_clusters.clone(), cfg.cpu_capacity.clone())
    };

    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    let mut vcpu_domain_paths = BTreeMap::new();
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    let mut vcpu_domains = BTreeMap::new();

    // Build per-vCPU frequency tables for virtual cpufreq, either straight
    // from the config or derived from the host's frequency domains.
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    if cfg.virt_cpufreq || cfg.virt_cpufreq_v2 {
        if !cfg.cpu_frequencies_khz.is_empty() {
            cpu_frequencies = cfg.cpu_frequencies_khz.clone();
        } else {
            match Arch::get_host_cpu_frequencies_khz() {
                Ok(host_cpu_frequencies) => {
                    for cpu_id in 0..cfg.vcpu_count.unwrap_or(1) {
                        let vcpu_affinity = match cfg.vcpu_affinity.clone() {
                            Some(VcpuAffinity::Global(v)) => v,
                            Some(VcpuAffinity::PerVcpu(mut m)) => {
                                m.remove(&cpu_id).unwrap_or_default()
                            }
                            None => {
                                panic!("There must be some vcpu_affinity setting with VirtCpufreq enabled!")
                            }
                        };

                        // Check that the physical CPUs that the vCPU is affined to all share the
                        // same frequency domain.
                        if let Some(freq_domain) = host_cpu_frequencies.get(&vcpu_affinity[0]) {
                            for cpu in vcpu_affinity.iter() {
                                if let Some(frequencies) = host_cpu_frequencies.get(cpu) {
                                    if frequencies != freq_domain {
                                        panic!("Affined CPUs do not share a frequency domain!");
                                    }
                                }
                            }
                            cpu_frequencies.insert(cpu_id, freq_domain.clone());
                        } else {
                            panic!("No frequency domain for cpu:{}", cpu_id);
                        }
                    }
                }
                Err(e) => {
                    warn!("Unable to get host cpu frequencies {:#}", e);
                }
            }
        }

        // Scale each vCPU's capacity by its max frequency relative to the
        // fastest host core: capacity * max_freq / largest_host_max_freq.
        if !cpu_frequencies.is_empty() {
            let mut max_freqs = Vec::new();

            for (_cpu, frequencies) in cpu_frequencies.iter() {
                max_freqs.push(*frequencies.iter().max().ok_or(Error::new(libc::EINVAL))?)
            }

            let host_max_freqs = Arch::get_host_cpu_max_freq_khz()?;
            let largest_host_max_freq = host_max_freqs
                .values()
                .max()
                .ok_or(Error::new(libc::EINVAL))?;

            for (cpu_id, max_freq) in max_freqs.iter().enumerate() {
                let normalized_cpu_capacity = (u64::from(*cpu_capacity.get(&cpu_id).unwrap())
                    * u64::from(*max_freq))
                .checked_div(u64::from(*largest_host_max_freq))
                .ok_or(Error::new(libc::EINVAL))?;
                normalized_cpu_capacities.insert(
                    cpu_id,
                    u32::try_from(normalized_cpu_capacity).map_err(|_| Error::new(libc::EINVAL))?,
                );
            }

            // Set up a cgroup-v2 subtree per frequency domain so vCPU threads
            // can later be assigned per-domain controls.
            if !cfg.cpu_freq_domains.is_empty() {
                let cgroup_path = cfg
                    .vcpu_cgroup_path
                    .clone()
                    .context("cpu_freq_domains requires vcpu_cgroup_path")?;

                // cgroup.controllers only exists on cgroups v2 hierarchies.
                if !cgroup_path.join("cgroup.controllers").exists() {
                    panic!("CGroupsV2 must be enabled for cpu freq domain support!");
                }

                // Assign parent crosvm process to top level cgroup
                let cgroup_procs_path = cgroup_path.join("cgroup.procs");
                std::fs::write(
                    cgroup_procs_path.clone(),
                    process::id().to_string().as_bytes(),
                )
                .with_context(|| {
                    format!(
                        "failed to create vcpu-cgroup-path {}",
                        cgroup_procs_path.display(),
                    )
                })?;

                for (freq_domain_idx, cpus) in cfg.cpu_freq_domains.iter().enumerate() {
                    let vcpu_domain_path =
                        cgroup_path.join(format!("vcpu-domain{}", freq_domain_idx));
                    // Create subtree for domain
                    create_dir_all(&vcpu_domain_path)?;

                    // Set vcpu_domain cgroup type as 'threaded' to get thread level granularity
                    // controls
                    let cgroup_type_path = cgroup_path.join(vcpu_domain_path.join("cgroup.type"));
                    std::fs::write(cgroup_type_path.clone(), b"threaded").with_context(|| {
                        format!(
                            "failed to create vcpu-cgroup-path {}",
                            cgroup_type_path.display(),
                        )
                    })?;
                    // Record the domain path and index for every vCPU in it.
                    for core_idx in cpus.iter() {
                        vcpu_domain_paths.insert(*core_idx, vcpu_domain_path.clone());
                        vcpu_domains.insert(*core_idx, freq_domain_idx as u32);
                    }
                }
            }
        }
    }

    Ok(VmComponents {
        #[cfg(target_arch = "x86_64")]
        ac_adapter: cfg.ac_adapter,
        #[cfg(target_arch = "x86_64")]
        break_linux_pci_config_io: cfg.break_linux_pci_config_io,
        // Guest memory size in bytes; the config value is in MiB.
        memory_size: cfg
            .memory
            .unwrap_or(256)
            .checked_mul(1024 * 1024)
            .ok_or_else(|| anyhow!("requested memory size too large"))?,
        swiotlb,
        fw_cfg_enable,
        bootorder_fw_cfg_blob: Vec::new(),
        vcpu_count: cfg.vcpu_count.unwrap_or(1),
        vcpu_affinity: cfg.vcpu_affinity.clone(),
        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        vcpu_domains,
        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        vcpu_domain_paths,
        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        cpu_frequencies,
        fw_cfg_parameters: cfg.fw_cfg_parameters.clone(),
        cpu_clusters,
        cpu_capacity,
        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        normalized_cpu_capacities,
        no_smt: cfg.no_smt,
        hugepages: cfg.hugepages,
        hv_cfg: hypervisor::Config {
            #[cfg(target_arch = "aarch64")]
            mte: cfg.mte,
            protection_type: cfg.protection_type,
        },
        vm_image,
        android_fstab: cfg
            .android_fstab
            .as_ref()
            .map(|x| {
                File::open(x)
                    .with_context(|| format!("failed to open android fstab file {}", x.display()))
            })
            .map_or(Ok(None), |v| v.map(Some))?,
        pstore: cfg.pstore.clone(),
        pflash_block_size,
        pflash_image,
        initrd_image,
        extra_kernel_params: cfg.params.clone(),
        acpi_sdts: cfg
            .acpi_tables
            .iter()
            .map(|path| {
                SDT::from_file(path)
                    .with_context(|| format!("failed to open ACPI file {}", path.display()))
            })
            .collect::<Result<Vec<SDT>>>()?,
        rt_cpus: cfg.rt_cpus.clone(),
        delay_rt: cfg.delay_rt,
        no_i8042: cfg.no_i8042,
        no_rtc: cfg.no_rtc,
        #[cfg(target_arch = "x86_64")]
        smbios: cfg.smbios.clone(),
        host_cpu_topology: cfg.host_cpu_topology,
        itmt: cfg.itmt,
        #[cfg(target_arch = "x86_64")]
        force_s2idle: cfg.force_s2idle,
        pvm_fw: pvm_fw_image,
        pci_config: cfg.pci_config,
        dynamic_power_coefficient: cfg.dynamic_power_coefficient.clone(),
        boot_cpu: cfg.boot_cpu,
        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        virt_cpufreq_v2: cfg.virt_cpufreq_v2,
        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        sve_config: cfg.sve.unwrap_or_default(),
    })
}
1522
/// Final state of the VM when it stops running.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum ExitState {
    /// The VM stopped due to a reset request.
    Reset,
    /// The VM stopped normally.
    Stop,
    /// The VM stopped due to a crash.
    Crash,
    /// The VM stopped due to a guest panic.
    GuestPanic,
    /// The VM stopped due to a watchdog-triggered reset.
    WatchdogReset,
}
1531 // Remove ranges in `guest_mem_layout` that overlap with ranges in `file_backed_mappings`.
1532 // Returns the updated guest memory layout.
punch_holes_in_guest_mem_layout_for_mappings( guest_mem_layout: Vec<(GuestAddress, u64, MemoryRegionOptions)>, file_backed_mappings: &[FileBackedMappingParameters], ) -> Vec<(GuestAddress, u64, MemoryRegionOptions)>1533 fn punch_holes_in_guest_mem_layout_for_mappings(
1534 guest_mem_layout: Vec<(GuestAddress, u64, MemoryRegionOptions)>,
1535 file_backed_mappings: &[FileBackedMappingParameters],
1536 ) -> Vec<(GuestAddress, u64, MemoryRegionOptions)> {
1537 // Create a set containing (start, end) pairs with exclusive end (end = start + size; the byte
1538 // at end is not included in the range).
1539 let mut layout_set = BTreeSet::new();
1540 for (addr, size, options) in &guest_mem_layout {
1541 layout_set.insert((addr.offset(), addr.offset() + size, *options));
1542 }
1543
1544 for mapping in file_backed_mappings {
1545 let mapping_start = mapping.address;
1546 let mapping_end = mapping_start + mapping.size;
1547
1548 // Repeatedly split overlapping guest memory regions until no overlaps remain.
1549 while let Some((range_start, range_end, options)) = layout_set
1550 .iter()
1551 .find(|&&(range_start, range_end, _)| {
1552 mapping_start < range_end && mapping_end > range_start
1553 })
1554 .cloned()
1555 {
1556 layout_set.remove(&(range_start, range_end, options));
1557
1558 if range_start < mapping_start {
1559 layout_set.insert((range_start, mapping_start, options));
1560 }
1561 if range_end > mapping_end {
1562 layout_set.insert((mapping_end, range_end, options));
1563 }
1564 }
1565 }
1566
1567 // Build the final guest memory layout from the modified layout_set.
1568 layout_set
1569 .iter()
1570 .map(|(start, end, options)| (GuestAddress(*start), end - start, *options))
1571 .collect()
1572 }
1573
create_guest_memory( cfg: &Config, components: &VmComponents, arch_memory_layout: &<Arch as LinuxArch>::ArchMemoryLayout, hypervisor: &impl Hypervisor, ) -> Result<GuestMemory>1574 fn create_guest_memory(
1575 cfg: &Config,
1576 components: &VmComponents,
1577 arch_memory_layout: &<Arch as LinuxArch>::ArchMemoryLayout,
1578 hypervisor: &impl Hypervisor,
1579 ) -> Result<GuestMemory> {
1580 let guest_mem_layout = Arch::guest_memory_layout(components, arch_memory_layout, hypervisor)
1581 .context("failed to create guest memory layout")?;
1582
1583 let guest_mem_layout =
1584 punch_holes_in_guest_mem_layout_for_mappings(guest_mem_layout, &cfg.file_backed_mappings);
1585
1586 let guest_mem = GuestMemory::new_with_options(&guest_mem_layout)
1587 .context("failed to create guest memory")?;
1588 let mut mem_policy = MemoryPolicy::empty();
1589 if components.hugepages {
1590 mem_policy |= MemoryPolicy::USE_HUGEPAGES;
1591 }
1592
1593 if cfg.lock_guest_memory {
1594 mem_policy |= MemoryPolicy::LOCK_GUEST_MEMORY;
1595 }
1596 guest_mem.set_memory_policy(mem_policy);
1597
1598 if cfg.unmap_guest_memory_on_fork {
1599 // Note that this isn't compatible with sandboxing. We could potentially fix that by
1600 // delaying the call until after the sandboxed devices are forked. However, the main use
1601 // for this is in conjunction with protected VMs, where most of the guest memory has been
1602 // unshared with the host. We'd need to be confident that the guest memory is unshared with
1603 // the host only after the `use_dontfork` call and those details will vary by hypervisor.
1604 // So, for now we keep things simple to be safe.
1605 guest_mem.use_dontfork().context("use_dontfork failed")?;
1606 }
1607
1608 Ok(guest_mem)
1609 }
1610
#[cfg(all(target_arch = "aarch64", feature = "geniezone"))]
/// Starts the VM with the GenieZone hypervisor.
fn run_gz(device_path: Option<&Path>, cfg: Config, components: VmComponents) -> Result<ExitState> {
    use devices::GeniezoneKernelIrqChip;
    use hypervisor::geniezone::Geniezone;
    use hypervisor::geniezone::GeniezoneVcpu;
    use hypervisor::geniezone::GeniezoneVm;

    let device_path = device_path.unwrap_or(Path::new(GENIEZONE_PATH));
    let gzvm = Geniezone::new_with_path(device_path)
        .with_context(|| format!("failed to open GenieZone device {}", device_path.display()))?;

    let arch_memory_layout =
        Arch::arch_memory_layout(&components).context("failed to create arch memory layout")?;
    let guest_mem = create_guest_memory(&cfg, &components, &arch_memory_layout, &gzvm)?;

    #[cfg(feature = "swap")]
    let swap_controller = match cfg.swap_dir.as_ref() {
        Some(swap_dir) => Some(
            SwapController::launch(guest_mem.clone(), swap_dir, &cfg.jail_config)
                .context("launch vmm-swap monitor process")?,
        ),
        None => None,
    };

    let vm =
        GeniezoneVm::new(&gzvm, guest_mem, components.hv_cfg).context("failed to create vm")?;

    // Check that the VM was actually created in protected mode as expected.
    if cfg.protection_type.isolates_memory() && !vm.check_capability(VmCap::Protected) {
        bail!("Failed to create protected VM");
    }
    let vm_clone = vm.try_clone().context("failed to clone vm")?;

    // Only the in-kernel irqchip is supported here, so there is never an
    // ioapic tube to hand to run_vm.
    let ioapic_host_tube: Option<Tube> = None;
    let mut irq_chip = match cfg.irq_chip.unwrap_or(IrqChipKind::Kernel) {
        IrqChipKind::Split => bail!("Geniezone does not support split irqchip mode"),
        IrqChipKind::Userspace => bail!("Geniezone does not support userspace irqchip mode"),
        IrqChipKind::Kernel => GeniezoneKernelIrqChip::new(vm_clone, components.vcpu_count)
            .context("failed to create IRQ chip")?,
    };

    run_vm::<GeniezoneVcpu, GeniezoneVm>(
        cfg,
        components,
        &arch_memory_layout,
        vm,
        &mut irq_chip,
        ioapic_host_tube,
        #[cfg(feature = "swap")]
        swap_controller,
    )
}
1667
/// Starts the VM with the KVM hypervisor.
fn run_kvm(device_path: Option<&Path>, cfg: Config, components: VmComponents) -> Result<ExitState> {
    use devices::KvmKernelIrqChip;
    #[cfg(target_arch = "x86_64")]
    use devices::KvmSplitIrqChip;
    use hypervisor::kvm::Kvm;
    use hypervisor::kvm::KvmVcpu;
    use hypervisor::kvm::KvmVm;

    let device_path = device_path.unwrap_or(Path::new(KVM_PATH));
    let kvm = Kvm::new_with_path(device_path)
        .with_context(|| format!("failed to open KVM device {}", device_path.display()))?;

    let arch_memory_layout =
        Arch::arch_memory_layout(&components).context("failed to create arch memory layout")?;
    let guest_mem = create_guest_memory(&cfg, &components, &arch_memory_layout, &kvm)?;

    // The vmm-swap monitor process is only launched when a swap directory is
    // configured.
    #[cfg(feature = "swap")]
    let swap_controller = if let Some(swap_dir) = cfg.swap_dir.as_ref() {
        Some(
            SwapController::launch(guest_mem.clone(), swap_dir, &cfg.jail_config)
                .context("launch vmm-swap monitor process")?,
        )
    } else {
        None
    };

    let vm = KvmVm::new(&kvm, guest_mem, components.hv_cfg).context("failed to create vm")?;

    // With itmt, disable the guest's read access to MSR_PLATFORM_INFO.
    #[cfg(target_arch = "x86_64")]
    if cfg.itmt {
        vm.set_platform_info_read_access(false)
            .context("failed to disable MSR_PLATFORM_INFO read access")?;
    }

    // Check that the VM was actually created in protected mode as expected.
    // This check is only needed on aarch64. On x86_64, protected VM creation will fail
    // if protected mode is not supported.
    #[cfg(not(target_arch = "x86_64"))]
    if cfg.protection_type.isolates_memory() && !vm.check_capability(VmCap::Protected) {
        bail!("Failed to create protected VM");
    }
    let vm_clone = vm.try_clone().context("failed to clone vm")?;

    // Holds whichever irqchip flavor was selected while exposing a unified
    // `&mut dyn IrqChipArch` via `as_mut`.
    enum KvmIrqChip {
        #[cfg(target_arch = "x86_64")]
        Split(KvmSplitIrqChip),
        Kernel(KvmKernelIrqChip),
    }

    impl KvmIrqChip {
        // Returns the wrapped irqchip as a trait object.
        fn as_mut(&mut self) -> &mut dyn IrqChipArch {
            match self {
                #[cfg(target_arch = "x86_64")]
                KvmIrqChip::Split(i) => i,
                KvmIrqChip::Kernel(i) => i,
            }
        }
    }

    // Split mode (x86_64 only) keeps the IOAPIC in userspace, so it needs a
    // tube pair to talk to it; kernel mode needs none.
    let ioapic_host_tube;
    let mut irq_chip = match cfg.irq_chip.unwrap_or(IrqChipKind::Kernel) {
        IrqChipKind::Userspace => {
            bail!("KVM userspace irqchip mode not implemented");
        }
        IrqChipKind::Split => {
            #[cfg(not(target_arch = "x86_64"))]
            bail!("KVM split irqchip mode only supported on x86 processors");
            #[cfg(target_arch = "x86_64")]
            {
                let (host_tube, ioapic_device_tube) =
                    Tube::pair().context("failed to create tube")?;
                ioapic_host_tube = Some(host_tube);
                KvmIrqChip::Split(
                    KvmSplitIrqChip::new(
                        vm_clone,
                        components.vcpu_count,
                        ioapic_device_tube,
                        // NOTE(review): 24 looks like the IOAPIC pin count —
                        // confirm against KvmSplitIrqChip::new.
                        Some(24),
                    )
                    .context("failed to create IRQ chip")?,
                )
            }
        }
        IrqChipKind::Kernel => {
            ioapic_host_tube = None;
            KvmIrqChip::Kernel(
                KvmKernelIrqChip::new(vm_clone, components.vcpu_count)
                    .context("failed to create IRQ chip")?,
            )
        }
    };

    run_vm::<KvmVcpu, KvmVm>(
        cfg,
        components,
        &arch_memory_layout,
        vm,
        irq_chip.as_mut(),
        ioapic_host_tube,
        #[cfg(feature = "swap")]
        swap_controller,
    )
}
1771
#[cfg(all(any(target_arch = "arm", target_arch = "aarch64"), feature = "gunyah"))]
/// Starts the VM with the Gunyah hypervisor.
fn run_gunyah(
    device_path: Option<&Path>,
    cfg: Config,
    components: VmComponents,
) -> Result<ExitState> {
    use devices::GunyahIrqChip;
    use hypervisor::gunyah::Gunyah;
    use hypervisor::gunyah::GunyahVcpu;
    use hypervisor::gunyah::GunyahVm;

    let device_path = device_path.unwrap_or(Path::new(GUNYAH_PATH));
    let gunyah = Gunyah::new_with_path(device_path)
        .with_context(|| format!("failed to open Gunyah device {}", device_path.display()))?;

    let arch_memory_layout =
        Arch::arch_memory_layout(&components).context("failed to create arch memory layout")?;
    let guest_mem = create_guest_memory(&cfg, &components, &arch_memory_layout, &gunyah)?;

    #[cfg(feature = "swap")]
    let swap_controller = match cfg.swap_dir.as_ref() {
        Some(swap_dir) => Some(
            SwapController::launch(guest_mem.clone(), swap_dir, &cfg.jail_config)
                .context("launch vmm-swap monitor process")?,
        ),
        None => None,
    };

    let vm = GunyahVm::new(&gunyah, guest_mem, components.hv_cfg).context("failed to create vm")?;

    // Check that the VM was actually created in protected mode as expected.
    if cfg.protection_type.isolates_memory() && !vm.check_capability(VmCap::Protected) {
        bail!("Failed to create protected VM");
    }

    let vm_clone = vm.try_clone()?;

    run_vm::<GunyahVcpu, GunyahVm>(
        cfg,
        components,
        &arch_memory_layout,
        vm,
        &mut GunyahIrqChip::new(vm_clone)?,
        None,
        #[cfg(feature = "swap")]
        swap_controller,
    )
}
1821
1822 /// Choose a default hypervisor if no `--hypervisor` option was specified.
get_default_hypervisor() -> Option<HypervisorKind>1823 fn get_default_hypervisor() -> Option<HypervisorKind> {
1824 let kvm_path = Path::new(KVM_PATH);
1825 if kvm_path.exists() {
1826 return Some(HypervisorKind::Kvm {
1827 device: Some(kvm_path.to_path_buf()),
1828 });
1829 }
1830
1831 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1832 #[cfg(feature = "geniezone")]
1833 {
1834 let gz_path = Path::new(GENIEZONE_PATH);
1835 if gz_path.exists() {
1836 return Some(HypervisorKind::Geniezone {
1837 device: Some(gz_path.to_path_buf()),
1838 });
1839 }
1840 }
1841
1842 #[cfg(all(
1843 unix,
1844 any(target_arch = "arm", target_arch = "aarch64"),
1845 feature = "gunyah"
1846 ))]
1847 {
1848 let gunyah_path = Path::new(GUNYAH_PATH);
1849 if gunyah_path.exists() {
1850 return Some(HypervisorKind::Gunyah {
1851 device: Some(gunyah_path.to_path_buf()),
1852 });
1853 }
1854 }
1855
1856 None
1857 }
1858
run_config(cfg: Config) -> Result<ExitState>1859 pub fn run_config(cfg: Config) -> Result<ExitState> {
1860 let components = setup_vm_components(&cfg)?;
1861
1862 let hypervisor = cfg
1863 .hypervisor
1864 .clone()
1865 .or_else(get_default_hypervisor)
1866 .context("no enabled hypervisor")?;
1867
1868 debug!("creating hypervisor: {:?}", hypervisor);
1869
1870 match hypervisor {
1871 HypervisorKind::Kvm { device } => run_kvm(device.as_deref(), cfg, components),
1872 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1873 #[cfg(feature = "geniezone")]
1874 HypervisorKind::Geniezone { device } => run_gz(device.as_deref(), cfg, components),
1875 #[cfg(all(
1876 unix,
1877 any(target_arch = "arm", target_arch = "aarch64"),
1878 feature = "gunyah"
1879 ))]
1880 HypervisorKind::Gunyah { device } => run_gunyah(device.as_deref(), cfg, components),
1881 }
1882 }
1883
run_vm<Vcpu, V>( cfg: Config, #[allow(unused_mut)] mut components: VmComponents, arch_memory_layout: &<Arch as LinuxArch>::ArchMemoryLayout, mut vm: V, irq_chip: &mut dyn IrqChipArch, ioapic_host_tube: Option<Tube>, #[cfg(feature = "swap")] mut swap_controller: Option<SwapController>, ) -> Result<ExitState> where Vcpu: VcpuArch + 'static, V: VmArch + 'static,1884 fn run_vm<Vcpu, V>(
1885 cfg: Config,
1886 #[allow(unused_mut)] mut components: VmComponents,
1887 arch_memory_layout: &<Arch as LinuxArch>::ArchMemoryLayout,
1888 mut vm: V,
1889 irq_chip: &mut dyn IrqChipArch,
1890 ioapic_host_tube: Option<Tube>,
1891 #[cfg(feature = "swap")] mut swap_controller: Option<SwapController>,
1892 ) -> Result<ExitState>
1893 where
1894 Vcpu: VcpuArch + 'static,
1895 V: VmArch + 'static,
1896 {
1897 if cfg.jail_config.is_some() {
1898 // Printing something to the syslog before entering minijail so that libc's syslogger has a
1899 // chance to open files necessary for its operation, like `/etc/localtime`. After jailing,
1900 // access to those files will not be possible.
1901 info!("crosvm entering multiprocess mode");
1902 }
1903
1904 let (metrics_send, metrics_recv) = Tube::directional_pair().context("metrics tube")?;
1905 metrics::initialize(metrics_send);
1906
1907 #[cfg(all(feature = "pci-hotplug", feature = "swap"))]
1908 let swap_device_helper = match &swap_controller {
1909 Some(swap_controller) => Some(swap_controller.create_device_helper()?),
1910 None => None,
1911 };
1912 // pci-hotplug is only implemented for x86_64 for now, attempting to use it on other platform
1913 // would crash.
1914 #[cfg(all(feature = "pci-hotplug", not(target_arch = "x86_64")))]
1915 if cfg.pci_hotplug_slots.is_some() {
1916 bail!("pci-hotplug is not implemented for non x86_64 architecture");
1917 }
1918 // hotplug_manager must be created before vm is started since it forks jail warden process.
1919 #[cfg(feature = "pci-hotplug")]
1920 // TODO(293801301): Remove unused_mut after aarch64 support
1921 #[allow(unused_mut)]
1922 let mut hotplug_manager = if cfg.pci_hotplug_slots.is_some() {
1923 Some(PciHotPlugManager::new(
1924 vm.get_memory().clone(),
1925 &cfg,
1926 #[cfg(feature = "swap")]
1927 swap_device_helper,
1928 )?)
1929 } else {
1930 None
1931 };
1932
1933 #[cfg(feature = "usb")]
1934 let (usb_control_tube, usb_provider) =
1935 DeviceProvider::new().context("failed to create usb provider")?;
1936
1937 // Masking signals is inherently dangerous, since this can persist across clones/execs. Do this
1938 // before any jailed devices have been spawned, so that we can catch any of them that fail very
1939 // quickly.
1940 let sigchld_fd = SignalFd::new(libc::SIGCHLD).context("failed to create signalfd")?;
1941
1942 let control_server_socket = match &cfg.socket_path {
1943 Some(path) => Some(UnlinkUnixSeqpacketListener(
1944 UnixSeqpacketListener::bind(path).context("failed to create control server")?,
1945 )),
1946 None => None,
1947 };
1948
1949 let mut all_control_tubes = Vec::new();
1950 let mut add_control_tube = |t| all_control_tubes.push(t);
1951
1952 if let Some(ioapic_host_tube) = ioapic_host_tube {
1953 add_control_tube(AnyControlTube::IrqTube(ioapic_host_tube));
1954 }
1955
1956 let battery = if cfg.battery_config.is_some() {
1957 #[cfg_attr(
1958 not(feature = "power-monitor-powerd"),
1959 allow(clippy::manual_map, clippy::needless_match, unused_mut)
1960 )]
1961 let jail = if let Some(jail_config) = &cfg.jail_config {
1962 let mut config = SandboxConfig::new(jail_config, "battery");
1963 #[cfg(feature = "power-monitor-powerd")]
1964 {
1965 config.bind_mounts = true;
1966 }
1967 let mut jail =
1968 create_sandbox_minijail(&jail_config.pivot_root, MAX_OPEN_FILES_DEFAULT, &config)?;
1969
1970 // Setup a bind mount to the system D-Bus socket if the powerd monitor is used.
1971 #[cfg(feature = "power-monitor-powerd")]
1972 {
1973 let system_bus_socket_path = Path::new("/run/dbus/system_bus_socket");
1974 jail.mount_bind(system_bus_socket_path, system_bus_socket_path, true)?;
1975 }
1976 Some(jail)
1977 } else {
1978 None
1979 };
1980 (cfg.battery_config.as_ref().map(|c| c.type_), jail)
1981 } else {
1982 (cfg.battery_config.as_ref().map(|c| c.type_), None)
1983 };
1984
1985 let (vm_evt_wrtube, vm_evt_rdtube) =
1986 Tube::directional_pair().context("failed to create vm event tube")?;
1987
1988 let pstore_size = components.pstore.as_ref().map(|pstore| pstore.size as u64);
1989 let mut sys_allocator = SystemAllocator::new(
1990 Arch::get_system_allocator_config(&vm, arch_memory_layout),
1991 pstore_size,
1992 &cfg.mmio_address_ranges,
1993 )
1994 .context("failed to create system allocator")?;
1995
1996 let ramoops_region = match &components.pstore {
1997 Some(pstore) => Some(
1998 arch::pstore::create_memory_region(
1999 &mut vm,
2000 sys_allocator.reserved_region().unwrap(),
2001 pstore,
2002 )
2003 .context("failed to allocate pstore region")?,
2004 ),
2005 None => None,
2006 };
2007
2008 create_file_backed_mappings(&cfg, &mut vm, &mut sys_allocator)?;
2009
2010 #[cfg(feature = "gpu")]
2011 // Hold on to the render server jail so it keeps running until we exit run_vm()
2012 let (_render_server_jail, render_server_fd) =
2013 if let Some(parameters) = &cfg.gpu_render_server_parameters {
2014 let (jail, fd) = start_gpu_render_server(&cfg, parameters)?;
2015 (Some(ScopedMinijail(jail)), Some(fd))
2016 } else {
2017 (None, None)
2018 };
2019
2020 let mut iommu_attached_endpoints: BTreeMap<u32, Arc<Mutex<Box<dyn MemoryMapperTrait>>>> =
2021 BTreeMap::new();
2022 let mut iova_max_addr: Option<u64> = None;
2023
2024 let mut vfio_container_manager = VfioContainerManager::new();
2025
2026 #[cfg(feature = "registered_events")]
2027 let (reg_evt_wrtube, reg_evt_rdtube) =
2028 Tube::directional_pair().context("failed to create registered event tube")?;
2029
2030 let mut worker_process_pids = BTreeSet::new();
2031
2032 let mut devices = create_devices(
2033 &cfg,
2034 &mut vm,
2035 &mut sys_allocator,
2036 &mut add_control_tube,
2037 &vm_evt_wrtube,
2038 &mut iommu_attached_endpoints,
2039 #[cfg(feature = "usb")]
2040 usb_provider,
2041 #[cfg(feature = "gpu")]
2042 render_server_fd,
2043 &mut iova_max_addr,
2044 #[cfg(feature = "registered_events")]
2045 ®_evt_wrtube,
2046 &mut vfio_container_manager,
2047 &mut worker_process_pids,
2048 )?;
2049
2050 #[cfg(feature = "pci-hotplug")]
2051 // TODO(293801301): Remove unused_variables after aarch64 support
2052 #[allow(unused_variables)]
2053 let pci_hotplug_slots = cfg.pci_hotplug_slots;
2054 #[cfg(not(feature = "pci-hotplug"))]
2055 #[allow(unused_variables)]
2056 let pci_hotplug_slots: Option<u8> = None;
2057 #[cfg(target_arch = "x86_64")]
2058 let hp_stub = create_pure_virtual_pcie_root_port(
2059 &mut sys_allocator,
2060 &mut add_control_tube,
2061 &mut devices,
2062 pci_hotplug_slots.unwrap_or(1),
2063 )?;
2064
2065 arch::assign_pci_addresses(&mut devices, &mut sys_allocator)?;
2066
2067 let pci_devices: Vec<&dyn PciDevice> = devices
2068 .iter()
2069 .filter_map(|d| (d.0).as_pci_device())
2070 .collect();
2071
2072 let virtio_devices: Vec<(&dyn VirtioDevice, devices::PciAddress)> = pci_devices
2073 .into_iter()
2074 .flat_map(|s| {
2075 if let Some(virtio_pci_device) = s.as_virtio_pci_device() {
2076 std::iter::zip(
2077 Some(virtio_pci_device.virtio_device()),
2078 virtio_pci_device.pci_address(),
2079 )
2080 .next()
2081 } else {
2082 None
2083 }
2084 })
2085 .collect();
2086
2087 let mut open_firmware_device_paths: Vec<(Vec<u8>, usize)> = virtio_devices
2088 .iter()
2089 .flat_map(|s| (s.0).bootorder_fw_cfg(s.1.dev))
2090 .collect();
2091
2092 // order the OpenFirmware device paths, in ascending order, by their boot_index
2093 open_firmware_device_paths.sort_by(|a, b| (a.1).cmp(&(b.1)));
2094
2095 // "/pci@iocf8/" is x86 specific and represents the root at the system bus port
2096 let mut bootorder_fw_cfg_blob =
2097 open_firmware_device_paths
2098 .into_iter()
2099 .fold(Vec::new(), |a, b| {
2100 a.into_iter()
2101 .chain("/pci@i0cf8/".as_bytes().iter().copied())
2102 .chain(b.0)
2103 .chain("\n".as_bytes().iter().copied())
2104 .collect()
2105 });
2106
2107 // the "bootorder" file is expected to end with a null terminator
2108 bootorder_fw_cfg_blob.push(0);
2109
2110 components.bootorder_fw_cfg_blob = bootorder_fw_cfg_blob;
2111
2112 // if the bootindex argument was given, we want to make sure that fw_cfg is enabled so the
2113 // "bootorder" file can be accessed by the guest.
2114 components.fw_cfg_enable |= components.bootorder_fw_cfg_blob.len() > 1;
2115
2116 let (translate_response_senders, request_rx) = setup_virtio_access_platform(
2117 &mut sys_allocator,
2118 &mut iommu_attached_endpoints,
2119 &mut devices,
2120 )?;
2121
2122 #[cfg(target_arch = "x86_64")]
2123 let iommu_bus_ranges = hp_stub.iommu_bus_ranges;
2124 #[cfg(not(target_arch = "x86_64"))]
2125 let iommu_bus_ranges = Vec::new();
2126
2127 let iommu_host_tube = if !iommu_attached_endpoints.is_empty()
2128 || (cfg.vfio_isolate_hotplug && !iommu_bus_ranges.is_empty())
2129 {
2130 let (iommu_host_tube, iommu_device_tube) = Tube::pair().context("failed to create tube")?;
2131 let iommu_dev = create_iommu_device(
2132 cfg.protection_type,
2133 &cfg.jail_config,
2134 iova_max_addr.unwrap_or(u64::MAX),
2135 iommu_attached_endpoints,
2136 iommu_bus_ranges,
2137 translate_response_senders,
2138 request_rx,
2139 iommu_device_tube,
2140 )?;
2141
2142 let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
2143 add_control_tube(AnyControlTube::IrqTube(msi_host_tube));
2144 let (ioevent_host_tube, ioevent_device_tube) =
2145 Tube::pair().context("failed to create ioevent tube")?;
2146 add_control_tube(
2147 VmMemoryTube {
2148 tube: ioevent_host_tube,
2149 expose_with_viommu: false,
2150 }
2151 .into(),
2152 );
2153 let (host_tube, device_tube) =
2154 Tube::pair().context("failed to create device control tube")?;
2155 add_control_tube(TaggedControlTube::Vm(host_tube).into());
2156 let mut dev = VirtioPciDevice::new(
2157 vm.get_memory().clone(),
2158 iommu_dev.dev,
2159 msi_device_tube,
2160 cfg.disable_virtio_intx,
2161 None,
2162 VmMemoryClient::new(ioevent_device_tube),
2163 device_tube,
2164 )
2165 .context("failed to create virtio pci dev")?;
2166 // early reservation for viommu.
2167 dev.allocate_address(&mut sys_allocator)
2168 .context("failed to allocate resources early for virtio pci dev")?;
2169 let dev = Box::new(dev);
2170 devices.push((dev, iommu_dev.jail));
2171 Some(iommu_host_tube)
2172 } else {
2173 None
2174 };
2175
2176 #[cfg(target_arch = "x86_64")]
2177 for device in devices
2178 .iter_mut()
2179 .filter_map(|(dev, _)| dev.as_pci_device_mut())
2180 {
2181 let sdts = device
2182 .generate_acpi(components.acpi_sdts)
2183 .or_else(|| {
2184 error!("ACPI table generation error");
2185 None
2186 })
2187 .ok_or_else(|| anyhow!("failed to generate ACPI table"))?;
2188 components.acpi_sdts = sdts;
2189 }
2190
2191 // KVM_CREATE_VCPU uses apic id for x86 and uses cpu id for others.
2192 let mut vcpu_ids = Vec::new();
2193
2194 let guest_suspended_cvar = if cfg.force_s2idle {
2195 Some(Arc::new((Mutex::new(false), Condvar::new())))
2196 } else {
2197 None
2198 };
2199
2200 let dt_overlays = cfg
2201 .device_tree_overlay
2202 .iter()
2203 .map(|o| {
2204 Ok(DtbOverlay {
2205 file: open_file_or_duplicate(o.path.as_path(), OpenOptions::new().read(true))
2206 .with_context(|| {
2207 format!("failed to open device tree overlay {}", o.path.display())
2208 })?,
2209 do_filter: o.filter_devs,
2210 })
2211 })
2212 .collect::<Result<Vec<DtbOverlay>>>()?;
2213
2214 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
2215 let vcpu_domain_paths = components.vcpu_domain_paths.clone();
2216
2217 let mut linux = Arch::build_vm::<V, Vcpu>(
2218 components,
2219 arch_memory_layout,
2220 &vm_evt_wrtube,
2221 &mut sys_allocator,
2222 &cfg.serial_parameters,
2223 simple_jail(&cfg.jail_config, "serial_device")?,
2224 battery,
2225 vm,
2226 ramoops_region,
2227 devices,
2228 irq_chip,
2229 &mut vcpu_ids,
2230 cfg.dump_device_tree_blob.clone(),
2231 simple_jail(&cfg.jail_config, "serial_device")?,
2232 #[cfg(target_arch = "x86_64")]
2233 simple_jail(&cfg.jail_config, "block_device")?,
2234 #[cfg(target_arch = "x86_64")]
2235 simple_jail(&cfg.jail_config, "fw_cfg_device")?,
2236 #[cfg(feature = "swap")]
2237 &mut swap_controller,
2238 guest_suspended_cvar.clone(),
2239 dt_overlays,
2240 cfg.fdt_position,
2241 cfg.no_pmu,
2242 )
2243 .context("the architecture failed to build the vm")?;
2244
2245 for tube in linux.vm_request_tubes.drain(..) {
2246 add_control_tube(TaggedControlTube::Vm(tube).into());
2247 }
2248
2249 #[cfg(target_arch = "x86_64")]
2250 let (hp_control_tube, hp_worker_tube) = mpsc::channel();
2251 #[cfg(all(feature = "pci-hotplug", target_arch = "x86_64"))]
2252 if let Some(hotplug_manager) = &mut hotplug_manager {
2253 hotplug_manager.set_rootbus_controller(hp_control_tube.clone())?;
2254 }
2255 #[cfg(target_arch = "x86_64")]
2256 let hp_thread = {
2257 for (bus_num, hp_bus) in hp_stub.hotplug_buses.into_iter() {
2258 #[cfg(feature = "pci-hotplug")]
2259 if let Some(hotplug_manager) = &mut hotplug_manager {
2260 hotplug_manager.add_port(hp_bus)?;
2261 } else {
2262 linux.hotplug_bus.insert(bus_num, hp_bus);
2263 }
2264 #[cfg(not(feature = "pci-hotplug"))]
2265 linux.hotplug_bus.insert(bus_num, hp_bus);
2266 }
2267
2268 if let Some(pm) = &linux.pm {
2269 for (gpe, notify_dev) in hp_stub.gpe_notify_devs.into_iter() {
2270 pm.lock().register_gpe_notify_dev(gpe, notify_dev);
2271 }
2272 for (bus, notify_dev) in hp_stub.pme_notify_devs.into_iter() {
2273 pm.lock().register_pme_notify_dev(bus, notify_dev);
2274 }
2275 }
2276
2277 let (hp_vm_mem_host_tube, hp_vm_mem_worker_tube) =
2278 Tube::pair().context("failed to create tube")?;
2279 add_control_tube(
2280 VmMemoryTube {
2281 tube: hp_vm_mem_host_tube,
2282 expose_with_viommu: false,
2283 }
2284 .into(),
2285 );
2286
2287 let supports_readonly_mapping = linux.vm.supports_readonly_mapping();
2288 let pci_root = linux.root_config.clone();
2289 std::thread::Builder::new()
2290 .name("pci_root".to_string())
2291 .spawn(move || {
2292 start_pci_root_worker(
2293 supports_readonly_mapping,
2294 pci_root,
2295 hp_worker_tube,
2296 hp_vm_mem_worker_tube,
2297 )
2298 })?
2299 };
2300
2301 let flags = RutabagaGrallocBackendFlags::new().disable_vulkano();
2302 let gralloc = RutabagaGralloc::new(flags).context("failed to create gralloc")?;
2303
2304 run_control(
2305 linux,
2306 sys_allocator,
2307 cfg,
2308 control_server_socket,
2309 all_control_tubes,
2310 #[cfg(feature = "usb")]
2311 usb_control_tube,
2312 vm_evt_rdtube,
2313 vm_evt_wrtube,
2314 sigchld_fd,
2315 gralloc,
2316 vcpu_ids,
2317 iommu_host_tube,
2318 #[cfg(target_arch = "x86_64")]
2319 hp_control_tube,
2320 #[cfg(target_arch = "x86_64")]
2321 hp_thread,
2322 #[cfg(feature = "pci-hotplug")]
2323 hotplug_manager,
2324 #[cfg(feature = "swap")]
2325 swap_controller,
2326 #[cfg(feature = "registered_events")]
2327 reg_evt_rdtube,
2328 guest_suspended_cvar,
2329 metrics_recv,
2330 vfio_container_manager,
2331 worker_process_pids,
2332 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
2333 vcpu_domain_paths,
2334 )
2335 }
2336
// Hotplug commands can deadlock when they acquire the PCI root lock on the VM
// control thread. Deadlock occurs when the VM control thread (thread A) is
// handling a hotplug command and tries to take the PCI root lock while that
// lock is already held by a device on thread B, which in turn is sending a VM
// control request to be handled by thread A and is waiting for the response.
// Thread A is blocked on the lock, so neither thread can make progress. To
// resolve this, we add this worker thread and push all work that locks the PCI
// root to it.
2345 #[cfg(target_arch = "x86_64")]
start_pci_root_worker( supports_readonly_mapping: bool, pci_root: Arc<Mutex<PciRoot>>, hp_device_tube: mpsc::Receiver<PciRootCommand>, vm_control_tube: Tube, )2346 fn start_pci_root_worker(
2347 supports_readonly_mapping: bool,
2348 pci_root: Arc<Mutex<PciRoot>>,
2349 hp_device_tube: mpsc::Receiver<PciRootCommand>,
2350 vm_control_tube: Tube,
2351 ) {
2352 struct PciMmioMapperTube {
2353 supports_readonly_mapping: bool,
2354 vm_control_tube: Tube,
2355 registered_regions: BTreeMap<u32, VmMemoryRegionId>,
2356 next_id: u32,
2357 }
2358
2359 impl PciMmioMapper for PciMmioMapperTube {
2360 fn supports_readonly_mapping(&self) -> bool {
2361 self.supports_readonly_mapping
2362 }
2363
2364 fn add_mapping(&mut self, addr: GuestAddress, shmem: &SharedMemory) -> anyhow::Result<u32> {
2365 let shmem = shmem
2366 .try_clone()
2367 .context("failed to create new SharedMemory")?;
2368 self.vm_control_tube
2369 .send(&VmMemoryRequest::RegisterMemory {
2370 source: VmMemorySource::SharedMemory(shmem),
2371 dest: VmMemoryDestination::GuestPhysicalAddress(addr.0),
2372 prot: Protection::read(),
2373 cache: MemCacheType::CacheCoherent,
2374 })
2375 .context("failed to send request")?;
2376 match self.vm_control_tube.recv::<VmMemoryResponse>() {
2377 Ok(VmMemoryResponse::RegisterMemory { region_id, .. }) => {
2378 let cur_id = self.next_id;
2379 self.registered_regions.insert(cur_id, region_id);
2380 self.next_id += 1;
2381 Ok(cur_id)
2382 }
2383 res => bail!("Bad response: {:?}", res),
2384 }
2385 }
2386 }
2387
2388 let mut mapper = PciMmioMapperTube {
2389 supports_readonly_mapping,
2390 vm_control_tube,
2391 registered_regions: BTreeMap::new(),
2392 next_id: 0,
2393 };
2394
2395 loop {
2396 match hp_device_tube.recv() {
2397 Ok(cmd) => match cmd {
2398 PciRootCommand::Add(addr, device) => {
2399 if let Err(e) = pci_root.lock().add_device(addr, device, &mut mapper) {
2400 error!("failed to add hotplugged device to PCI root port: {}", e);
2401 }
2402 }
2403 PciRootCommand::AddBridge(pci_bus) => {
2404 if let Err(e) = pci_root.lock().add_bridge(pci_bus) {
2405 error!("failed to add hotplugged bridge to PCI root port: {}", e);
2406 }
2407 }
2408 PciRootCommand::Remove(addr) => {
2409 pci_root.lock().remove_device(addr);
2410 }
2411 PciRootCommand::Kill => break,
2412 },
2413 Err(e) => {
2414 error!("Error: pci root worker channel closed: {}", e);
2415 break;
2416 }
2417 }
2418 }
2419 }
2420
2421 #[cfg(target_arch = "x86_64")]
get_hp_bus<V: VmArch, Vcpu: VcpuArch>( linux: &RunnableLinuxVm<V, Vcpu>, host_addr: PciAddress, ) -> Result<Arc<Mutex<dyn HotPlugBus>>>2422 fn get_hp_bus<V: VmArch, Vcpu: VcpuArch>(
2423 linux: &RunnableLinuxVm<V, Vcpu>,
2424 host_addr: PciAddress,
2425 ) -> Result<Arc<Mutex<dyn HotPlugBus>>> {
2426 for (_, hp_bus) in linux.hotplug_bus.iter() {
2427 if hp_bus.lock().is_match(host_addr).is_some() {
2428 return Ok(hp_bus.clone());
2429 }
2430 }
2431 Err(anyhow!("Failed to find a suitable hotplug bus"))
2432 }
2433
/// Hotplugs the PCIe port or VFIO endpoint described by `device` into the guest.
///
/// For upstream/downstream ports, an emulated PCI bridge backed by the host
/// port is created and its secondary bus is registered as a new hotplug bus.
/// For endpoints, a VFIO device is created and, when a virtio-iommu is present
/// (`iommu_host_tube` is `Some`), the endpoint is attached to it. Finally the
/// device is recorded on its parent hotplug bus and, if `device.hp_interrupt`
/// is set, the hotplug interrupt is raised so the guest notices it.
#[cfg(target_arch = "x86_64")]
fn add_hotplug_device<V: VmArch, Vcpu: VcpuArch>(
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    cfg: &Config,
    add_control_tube: &mut impl FnMut(AnyControlTube),
    hp_control_tube: &mpsc::Sender<PciRootCommand>,
    iommu_host_tube: Option<&Tube>,
    device: &HotPlugDeviceInfo,
    #[cfg(feature = "swap")] swap_controller: &mut Option<SwapController>,
    vfio_container_manager: &mut VfioContainerManager,
) -> Result<()> {
    let host_addr = PciAddress::from_path(&device.path)
        .context("failed to parse hotplug device's PCI address")?;
    // The parent hotplug bus the new device will be plugged into.
    let hp_bus = get_hp_bus(linux, host_addr)?;

    let (hotplug_key, pci_address) = match device.device_type {
        HotPlugDeviceType::UpstreamPort | HotPlugDeviceType::DownstreamPort => {
            // Control tubes connecting the new bridge to the VM control loop.
            let (vm_host_tube, vm_device_tube) = Tube::pair().context("failed to create tube")?;
            add_control_tube(TaggedControlTube::Vm(vm_host_tube).into());
            let (msi_host_tube, msi_device_tube) = Tube::pair().context("failed to create tube")?;
            add_control_tube(AnyControlTube::IrqTube(msi_host_tube));
            let pcie_host = PcieHostPort::new(device.path.as_path(), vm_device_tube)?;
            let (hotplug_key, pci_bridge) = match device.device_type {
                HotPlugDeviceType::UpstreamPort => {
                    let hotplug_key = HotPlugKey::HostUpstreamPort { host_addr };
                    let pcie_upstream_port = Arc::new(Mutex::new(PcieUpstreamPort::new_from_host(
                        pcie_host, true,
                    )?));
                    let pci_bridge =
                        Box::new(PciBridge::new(pcie_upstream_port.clone(), msi_device_tube));
                    // The bridge's secondary bus becomes a hotplug bus of its own
                    // so devices behind it can be (un)plugged later.
                    linux
                        .hotplug_bus
                        .insert(pci_bridge.get_secondary_num(), pcie_upstream_port);
                    (hotplug_key, pci_bridge)
                }
                HotPlugDeviceType::DownstreamPort => {
                    let hotplug_key = HotPlugKey::HostDownstreamPort { host_addr };
                    let pcie_downstream_port = Arc::new(Mutex::new(
                        PcieDownstreamPort::new_from_host(pcie_host, true)?,
                    ));
                    let pci_bridge = Box::new(PciBridge::new(
                        pcie_downstream_port.clone(),
                        msi_device_tube,
                    ));
                    linux
                        .hotplug_bus
                        .insert(pci_bridge.get_secondary_num(), pcie_downstream_port);
                    (hotplug_key, pci_bridge)
                }
                _ => {
                    // Unreachable: the outer match arm restricted the type to
                    // the two port variants handled above.
                    bail!("Impossible to reach here")
                }
            };
            let pci_address = Arch::register_pci_device(
                linux,
                pci_bridge,
                None,
                sys_allocator,
                hp_control_tube,
                #[cfg(feature = "swap")]
                swap_controller,
            )?;

            (hotplug_key, pci_address)
        }
        HotPlugDeviceType::EndPoint => {
            let hotplug_key = HotPlugKey::HostVfio { host_addr };
            // Create the VFIO device; route DMA through the virtio-iommu when
            // one is configured, otherwise run without an IOMMU.
            let (vfio_device, jail, viommu_mapper) = create_vfio_device(
                &cfg.jail_config,
                &linux.vm,
                sys_allocator,
                add_control_tube,
                &device.path,
                true,
                None,
                None,
                None,
                if iommu_host_tube.is_some() {
                    IommuDevType::VirtioIommu
                } else {
                    IommuDevType::NoIommu
                },
                None,
                vfio_container_manager,
            )?;
            let vfio_pci_device = match vfio_device {
                VfioDeviceVariant::Pci(pci) => Box::new(pci),
                VfioDeviceVariant::Platform(_) => bail!("vfio platform hotplug not supported"),
            };
            let pci_address = Arch::register_pci_device(
                linux,
                vfio_pci_device,
                jail,
                sys_allocator,
                hp_control_tube,
                #[cfg(feature = "swap")]
                swap_controller,
            )?;
            // Register the endpoint with the virtio-iommu after the device has
            // been assigned its guest PCI address.
            if let Some(iommu_host_tube) = iommu_host_tube {
                let endpoint_addr = pci_address.to_u32();
                let vfio_wrapper = viommu_mapper.context("expected mapper")?;
                let descriptor = vfio_wrapper.clone_as_raw_descriptor()?;
                let request =
                    VirtioIOMMURequest::VfioCommand(VirtioIOMMUVfioCommand::VfioDeviceAdd {
                        endpoint_addr,
                        wrapper_id: vfio_wrapper.id(),
                        container: {
                            // SAFETY:
                            // Safe because the descriptor is uniquely owned by `descriptor`.
                            unsafe { File::from_raw_descriptor(descriptor) }
                        },
                    });
                match virtio_iommu_request(iommu_host_tube, &request)
                    .map_err(|_| VirtioIOMMUVfioError::SocketFailed)?
                {
                    VirtioIOMMUResponse::VfioResponse(VirtioIOMMUVfioResult::Ok) => (),
                    resp => bail!("Unexpected message response: {:?}", resp),
                }
            }

            (hotplug_key, pci_address)
        }
    };
    // Record the device on its parent bus and optionally raise the hotplug
    // interrupt so the guest enumerates it.
    hp_bus.lock().add_hotplug_device(hotplug_key, pci_address);
    if device.hp_interrupt {
        hp_bus.lock().hot_plug(pci_address)?;
    }
    Ok(())
}
2564
/// Hotplugs a virtio-net device described by `net_param` and returns the guest
/// bus number it was plugged into.
#[cfg(feature = "pci-hotplug")]
fn add_hotplug_net<V: VmArch, Vcpu: VcpuArch>(
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    add_control_tube: &mut impl FnMut(AnyControlTube),
    hotplug_manager: &mut PciHotPlugManager,
    net_param: NetParameters,
) -> Result<u8> {
    // MSI routing tube pair: host side to the control loop, device side to the
    // new virtio-net device.
    let (msi_host_tube, msi_device_tube) = Tube::pair().context("create tube")?;
    add_control_tube(AnyControlTube::IrqTube(msi_host_tube));

    // ioevent tube pair; the device side is wrapped in a VmMemoryClient.
    let (ioevent_host_tube, ioevent_device_tube) = Tube::pair().context("create tube")?;
    add_control_tube(
        VmMemoryTube {
            tube: ioevent_host_tube,
            expose_with_viommu: false,
        }
        .into(),
    );
    let ioevent_vm_memory_client = VmMemoryClient::new(ioevent_device_tube);

    // VM control tube pair for the device.
    let (vm_control_host_tube, vm_control_device_tube) = Tube::pair().context("create tube")?;
    add_control_tube(TaggedControlTube::Vm(vm_control_host_tube).into());

    // Bundle the tubes with the net parameters and hand the carrier to the
    // hotplug manager, which performs the actual plug and reports the bus.
    let carrier = NetResourceCarrier::new(
        net_param,
        msi_device_tube,
        ioevent_vm_memory_client,
        vm_control_device_tube,
    );
    hotplug_manager.hotplug_device(
        vec![ResourceCarrier::VirtioNet(carrier)],
        linux,
        sys_allocator,
    )
}
2598
/// Dispatches a network hotplug control command to the matching helper and
/// returns its `VmResponse`.
#[cfg(feature = "pci-hotplug")]
fn handle_hotplug_net_command<V: VmArch, Vcpu: VcpuArch>(
    net_cmd: NetControlCommand,
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    add_control_tube: &mut impl FnMut(AnyControlTube),
    hotplug_manager: &mut PciHotPlugManager,
) -> VmResponse {
    match net_cmd {
        // Plug a new virtio-net device backed by an existing host tap.
        NetControlCommand::AddTap(tap_name) => handle_hotplug_net_add(
            linux,
            sys_allocator,
            add_control_tube,
            hotplug_manager,
            &tap_name,
        ),
        // Unplug the device that lives on the given guest bus.
        NetControlCommand::RemoveTap(bus) => {
            handle_hotplug_net_remove(linux, sys_allocator, hotplug_manager, bus)
        }
    }
}
2620
/// Hotplugs a virtio-net device backed by the host tap interface `tap_name`,
/// converting the result into a `VmResponse` for the control client.
#[cfg(feature = "pci-hotplug")]
fn handle_hotplug_net_add<V: VmArch, Vcpu: VcpuArch>(
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    add_control_tube: &mut impl FnMut(AnyControlTube),
    hotplug_manager: &mut PciHotPlugManager,
    tap_name: &str,
) -> VmResponse {
    // Build NetParameters for an existing tap identified by name; all other
    // options take their defaults.
    let net_param = NetParameters {
        mode: NetParametersMode::TapName {
            tap_name: tap_name.to_owned(),
            mac: None,
        },
        vhost_net: None,
        vq_pairs: None,
        packed_queue: false,
        pci_address: None,
    };
    match add_hotplug_net(
        linux,
        sys_allocator,
        add_control_tube,
        hotplug_manager,
        net_param,
    ) {
        Ok(pci_bus) => VmResponse::PciHotPlugResponse { bus: pci_bus },
        Err(e) => VmResponse::ErrString(format!("{:?}", e)),
    }
}
2653
/// Unplugs the hotplugged network device on guest bus `bus`, mapping the
/// outcome onto a `VmResponse`.
#[cfg(feature = "pci-hotplug")]
fn handle_hotplug_net_remove<V: VmArch, Vcpu: VcpuArch>(
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    hotplug_manager: &mut PciHotPlugManager,
    bus: u8,
) -> VmResponse {
    // Delegate to the hotplug manager; only the error case carries detail.
    if let Err(e) = hotplug_manager.remove_hotplug_device(bus, linux, sys_allocator) {
        VmResponse::ErrString(format!("{:?}", e))
    } else {
        VmResponse::Ok
    }
}
2666
2667 #[cfg(target_arch = "x86_64")]
remove_hotplug_bridge<V: VmArch, Vcpu: VcpuArch>( linux: &RunnableLinuxVm<V, Vcpu>, sys_allocator: &mut SystemAllocator, buses_to_remove: &mut Vec<u8>, hotplug_key: HotPlugKey, child_bus: u8, ) -> Result<()>2668 fn remove_hotplug_bridge<V: VmArch, Vcpu: VcpuArch>(
2669 linux: &RunnableLinuxVm<V, Vcpu>,
2670 sys_allocator: &mut SystemAllocator,
2671 buses_to_remove: &mut Vec<u8>,
2672 hotplug_key: HotPlugKey,
2673 child_bus: u8,
2674 ) -> Result<()> {
2675 for (bus_num, hp_bus) in linux.hotplug_bus.iter() {
2676 let mut hp_bus_lock = hp_bus.lock();
2677 if let Some(pci_addr) = hp_bus_lock.get_hotplug_device(hotplug_key) {
2678 sys_allocator.release_pci(pci_addr.bus, pci_addr.dev, pci_addr.func);
2679 hp_bus_lock.hot_unplug(pci_addr)?;
2680 buses_to_remove.push(child_bus);
2681 if hp_bus_lock.is_empty() {
2682 if let Some(hotplug_key) = hp_bus_lock.get_hotplug_key() {
2683 remove_hotplug_bridge(
2684 linux,
2685 sys_allocator,
2686 buses_to_remove,
2687 hotplug_key,
2688 *bus_num,
2689 )?;
2690 }
2691 }
2692 return Ok(());
2693 }
2694 }
2695
2696 Err(anyhow!(
2697 "Can not find device {:?} on hotplug buses",
2698 hotplug_key
2699 ))
2700 }
2701
/// Unplugs the hotplugged device (or PCIe port) identified by `device`.
///
/// Finds the emulated hotplug bus that holds the device, detaches the endpoint
/// from the virtio-iommu when one is attached, sends the hot unplug interrupt
/// where required, releases the guest PCI resources, and tears down any
/// emulated bridges that become empty as a consequence.
#[cfg(target_arch = "x86_64")]
fn remove_hotplug_device<V: VmArch, Vcpu: VcpuArch>(
    linux: &mut RunnableLinuxVm<V, Vcpu>,
    sys_allocator: &mut SystemAllocator,
    iommu_host_tube: Option<&Tube>,
    device: &HotPlugDeviceInfo,
) -> Result<()> {
    let host_addr = PciAddress::from_path(&device.path)?;
    // Reconstruct the key the device was registered under in
    // add_hotplug_device().
    let hotplug_key = match device.device_type {
        HotPlugDeviceType::UpstreamPort => HotPlugKey::HostUpstreamPort { host_addr },
        HotPlugDeviceType::DownstreamPort => HotPlugKey::HostDownstreamPort { host_addr },
        HotPlugDeviceType::EndPoint => HotPlugKey::HostVfio { host_addr },
    };

    // Find the (bus number, bus) pair that currently hosts this device.
    let hp_bus = linux
        .hotplug_bus
        .iter()
        .find(|(_, hp_bus)| {
            let hp_bus = hp_bus.lock();
            hp_bus.get_hotplug_device(hotplug_key).is_some()
        })
        .map(|(bus_num, hp_bus)| (*bus_num, hp_bus.clone()));

    if let Some((bus_num, hp_bus)) = hp_bus {
        let mut buses_to_remove = Vec::new();
        let mut removed_key = None;
        let mut hp_bus_lock = hp_bus.lock();
        if let Some(pci_addr) = hp_bus_lock.get_hotplug_device(hotplug_key) {
            // Tell the virtio-iommu to drop this endpoint before the device
            // disappears from the bus.
            if let Some(iommu_host_tube) = iommu_host_tube {
                let request =
                    VirtioIOMMURequest::VfioCommand(VirtioIOMMUVfioCommand::VfioDeviceDel {
                        endpoint_addr: pci_addr.to_u32(),
                    });
                match virtio_iommu_request(iommu_host_tube, &request)
                    .map_err(|_| VirtioIOMMUVfioError::SocketFailed)?
                {
                    VirtioIOMMUResponse::VfioResponse(VirtioIOMMUVfioResult::Ok) => (),
                    resp => bail!("Unexpected message response: {:?}", resp),
                }
            }
            // Check whether any sibling downstream port (same host bus, other
            // guest bus) still has devices attached.
            let mut empty_simbling = true;
            if let Some(HotPlugKey::HostDownstreamPort { host_addr }) =
                hp_bus_lock.get_hotplug_key()
            {
                let addr_alias = host_addr;
                for (simbling_bus_num, hp_bus) in linux.hotplug_bus.iter() {
                    if *simbling_bus_num != bus_num {
                        let hp_bus_lock = hp_bus.lock();
                        let hotplug_key = hp_bus_lock.get_hotplug_key();
                        if let Some(HotPlugKey::HostDownstreamPort { host_addr }) = hotplug_key {
                            if addr_alias.bus == host_addr.bus && !hp_bus_lock.is_empty() {
                                empty_simbling = false;
                                break;
                            }
                        }
                    }
                }
            }

            // If all sibling downstream ports are empty, do not send a hot
            // unplug event for this downstream port. The root port will send
            // one plug out interrupt and remove all the remaining devices.
            if !empty_simbling {
                hp_bus_lock.hot_unplug(pci_addr)?;
            }

            sys_allocator.release_pci(pci_addr.bus, pci_addr.dev, pci_addr.func);
            // Tear down the now-empty bridge chain above this device.
            if empty_simbling || hp_bus_lock.is_empty() {
                if let Some(hotplug_key) = hp_bus_lock.get_hotplug_key() {
                    removed_key = Some(hotplug_key);
                    remove_hotplug_bridge(
                        linux,
                        sys_allocator,
                        &mut buses_to_remove,
                        hotplug_key,
                        bus_num,
                    )?;
                }
            }
        }

        // Some types of TBT (Thunderbolt) devices have a few empty downstream
        // ports. The emulated bridges of these ports won't be removed since no
        // VFIO device is connected to our emulated bridges. So we explicitly
        // check all sibling bridges of the removed bridge here, and remove any
        // bridge that has no child device connected.
        if let Some(HotPlugKey::HostDownstreamPort { host_addr }) = removed_key {
            let addr_alias = host_addr;
            for (simbling_bus_num, hp_bus) in linux.hotplug_bus.iter() {
                if *simbling_bus_num != bus_num {
                    let hp_bus_lock = hp_bus.lock();
                    let hotplug_key = hp_bus_lock.get_hotplug_key();
                    if let Some(HotPlugKey::HostDownstreamPort { host_addr }) = hotplug_key {
                        if addr_alias.bus == host_addr.bus && hp_bus_lock.is_empty() {
                            remove_hotplug_bridge(
                                linux,
                                sys_allocator,
                                &mut buses_to_remove,
                                hotplug_key.unwrap(),
                                *simbling_bus_num,
                            )?;
                        }
                    }
                }
            }
        }
        // Finally forget every bus that was unplugged above.
        for bus in buses_to_remove.iter() {
            linux.hotplug_bus.remove(bus);
        }
        return Ok(());
    }

    Err(anyhow!(
        "Can not find device {:?} on hotplug buses",
        hotplug_key
    ))
}
2818
trigger_vm_suspend_and_wait_for_entry( guest_suspended_cvar: Arc<(Mutex<bool>, Condvar)>, tube: &SendTube, response: vm_control::VmResponse, suspend_tube: Arc<Mutex<SendTube>>, pm: Option<Arc<Mutex<dyn PmResource + Send>>>, )2819 pub fn trigger_vm_suspend_and_wait_for_entry(
2820 guest_suspended_cvar: Arc<(Mutex<bool>, Condvar)>,
2821 tube: &SendTube,
2822 response: vm_control::VmResponse,
2823 suspend_tube: Arc<Mutex<SendTube>>,
2824 pm: Option<Arc<Mutex<dyn PmResource + Send>>>,
2825 ) {
2826 let (lock, cvar) = &*guest_suspended_cvar;
2827 let mut guest_suspended = lock.lock();
2828
2829 *guest_suspended = false;
2830
2831 // During suspend also emulate sleepbtn, which allows to suspend VM (if running e.g. acpid and
2832 // reacts on sleep button events)
2833 if let Some(pm) = pm {
2834 pm.lock().slpbtn_evt();
2835 } else {
2836 error!("generating sleepbtn during suspend not supported");
2837 }
2838
2839 // Wait for notification about guest suspension, if not received after 15sec,
2840 // proceed anyway.
2841 let result = cvar.wait_timeout(guest_suspended, std::time::Duration::from_secs(15));
2842 guest_suspended = result.0;
2843
2844 if result.1.timed_out() {
2845 warn!("Guest suspension timeout - proceeding anyway");
2846 } else if *guest_suspended {
2847 info!("Guest suspended");
2848 }
2849
2850 if let Err(e) = suspend_tube.lock().send(&true) {
2851 error!("failed to trigger suspend event: {}", e);
2852 }
2853 // Now we ready to send response over the tube and communicate that VM suspend has finished
2854 if let Err(e) = tube.send(&response) {
2855 error!("failed to send VmResponse: {}", e);
2856 }
2857 }
2858
#[cfg(feature = "pvclock")]
#[derive(Debug)]
/// The action requested by the pvclock device to perform on the main thread.
enum PvClockAction {
    #[cfg(target_arch = "aarch64")]
    /// Update the counter offset with VmAarch64::set_counter_offset.
    /// The payload is the total number of ticks the guest spent suspended
    /// (see `send_pvclock_cmd`'s handling of `PvClockCommandResponse::Resumed`).
    SetCounterOffset(u64),
}
2867
#[cfg(feature = "pvclock")]
/// Sends `command` to the pvclock device over `tube` and translates the reply
/// into an optional `PvClockAction` for the main thread to perform.
fn send_pvclock_cmd(tube: &Tube, command: PvClockCommand) -> Result<Option<PvClockAction>> {
    tube.send(&command)
        .with_context(|| format!("failed to send pvclock command {:?}", command))?;
    let response: PvClockCommandResponse = tube
        .recv()
        .context("failed to receive pvclock command response")?;
    match response {
        PvClockCommandResponse::Err(e) => {
            bail!("pvclock encountered error on {:?}: {}", command, e);
        }
        PvClockCommandResponse::DeviceInactive => {
            warn!("Tried to send {command:?} but pvclock device was inactive");
            Ok(None)
        }
        PvClockCommandResponse::Resumed {
            total_suspended_ticks,
        } => {
            info!("{command:?} completed with {total_suspended_ticks} total_suspended_ticks");
            cfg_if::cfg_if! {
                if #[cfg(target_arch = "aarch64")] {
                    // On AArch64 the counter offset has to be applied from the main
                    // thread via the hypervisor.
                    Ok(Some(PvClockAction::SetCounterOffset(total_suspended_ticks)))
                } else {
                    // For non-AArch64 platforms this is handled by directly updating the offset in
                    // shared memory in the pvclock device worker.
                    Ok(None)
                }
            }
        }
        PvClockCommandResponse::Ok => {
            info!("{:?} completed with {:?}", command, response);
            Ok(None)
        }
    }
}
2903
2904 #[cfg(target_arch = "x86_64")]
handle_hotplug_command<V: VmArch, Vcpu: VcpuArch>( linux: &mut RunnableLinuxVm<V, Vcpu>, sys_allocator: &mut SystemAllocator, cfg: &Config, add_control_tube: &mut impl FnMut(AnyControlTube), hp_control_tube: &mpsc::Sender<PciRootCommand>, iommu_host_tube: Option<&Tube>, device: &HotPlugDeviceInfo, add: bool, #[cfg(feature = "swap")] swap_controller: &mut Option<SwapController>, vfio_container_manager: &mut VfioContainerManager, ) -> VmResponse2905 fn handle_hotplug_command<V: VmArch, Vcpu: VcpuArch>(
2906 linux: &mut RunnableLinuxVm<V, Vcpu>,
2907 sys_allocator: &mut SystemAllocator,
2908 cfg: &Config,
2909 add_control_tube: &mut impl FnMut(AnyControlTube),
2910 hp_control_tube: &mpsc::Sender<PciRootCommand>,
2911 iommu_host_tube: Option<&Tube>,
2912 device: &HotPlugDeviceInfo,
2913 add: bool,
2914 #[cfg(feature = "swap")] swap_controller: &mut Option<SwapController>,
2915 vfio_container_manager: &mut VfioContainerManager,
2916 ) -> VmResponse {
2917 let iommu_host_tube = if cfg.vfio_isolate_hotplug {
2918 iommu_host_tube
2919 } else {
2920 None
2921 };
2922
2923 let ret = if add {
2924 add_hotplug_device(
2925 linux,
2926 sys_allocator,
2927 cfg,
2928 add_control_tube,
2929 hp_control_tube,
2930 iommu_host_tube,
2931 device,
2932 #[cfg(feature = "swap")]
2933 swap_controller,
2934 vfio_container_manager,
2935 )
2936 } else {
2937 remove_hotplug_device(linux, sys_allocator, iommu_host_tube, device)
2938 };
2939
2940 match ret {
2941 Ok(()) => VmResponse::Ok,
2942 Err(e) => {
2943 error!("handle_hotplug_command failure: {}", e);
2944 VmResponse::Err(base::Error::new(libc::EINVAL))
2945 }
2946 }
2947 }
2948
/// Borrowed state threaded through the VM control-loop request handlers
/// (`process_vm_request` / `process_vm_control_event`).
struct ControlLoopState<'a, V: VmArch, Vcpu: VcpuArch> {
    linux: &'a mut RunnableLinuxVm<V, Vcpu>,
    cfg: &'a Config,
    sys_allocator: &'a Arc<Mutex<SystemAllocator>>,
    // Control tubes keyed by their wait-context id (used e.g. to route balloon
    // responses back to the original requester).
    control_tubes: &'a BTreeMap<usize, TaggedControlTube>,
    disk_host_tubes: &'a [Tube],
    #[cfg(feature = "gpu")]
    gpu_control_tube: Option<&'a Tube>,
    #[cfg(feature = "usb")]
    usb_control_tube: &'a Tube,
    #[cfg(target_arch = "x86_64")]
    iommu_host_tube: &'a Option<Arc<Mutex<Tube>>>,
    #[cfg(target_arch = "x86_64")]
    hp_control_tube: &'a mpsc::Sender<PciRootCommand>,
    // Used by the force-s2idle suspend path to wait for the guest to report
    // that it actually entered suspend.
    guest_suspended_cvar: &'a Option<Arc<(Mutex<bool>, Condvar)>>,
    #[cfg(feature = "pci-hotplug")]
    hotplug_manager: &'a mut Option<PciHotPlugManager>,
    #[cfg(feature = "swap")]
    swap_controller: &'a mut Option<SwapController>,
    // Join handle plus per-vCPU control channel, indexed by vCPU id.
    vcpu_handles: &'a [(JoinHandle<()>, mpsc::Sender<vm_control::VcpuControl>)],
    #[cfg(feature = "balloon")]
    balloon_tube: Option<&'a mut BalloonTube>,
    device_ctrl_tube: &'a Tube,
    irq_handler_control: &'a Tube,
    #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
    vm_memory_handler_control: &'a Tube,
    #[cfg(feature = "registered_events")]
    registered_evt_tubes: &'a mut HashMap<RegisteredEvent, HashSet<AddressedProtoTube>>,
    #[cfg(feature = "pvclock")]
    pvclock_host_tube: Option<Arc<Tube>>,
    vfio_container_manager: &'a mut VfioContainerManager,
    suspended_pvclock_state: &'a mut Option<hypervisor::ClockState>,
    // vCPU id -> (process id, thread id); reported via `VmRequest::VcpuPidTid`.
    vcpus_pid_tid: &'a BTreeMap<usize, (u32, u32)>,
}
2983
/// Outcome of handling a single `VmRequest` in the control loop.
struct VmRequestResult {
    // Response to send back to the requester; `None` when the reply is deferred
    // or was already delivered elsewhere (e.g. the s2idle wait thread).
    response: Option<VmResponse>,
    // `true` when the control loop should exit.
    exit: bool,
}
2988
2989 impl VmRequestResult {
new(response: Option<VmResponse>, exit: bool) -> Self2990 fn new(response: Option<VmResponse>, exit: bool) -> Self {
2991 VmRequestResult { response, exit }
2992 }
2993 }
2994
process_vm_request<V: VmArch + 'static, Vcpu: VcpuArch + 'static>( state: &mut ControlLoopState<V, Vcpu>, id: usize, tube: &Tube, request: VmRequest, #[cfg_attr( not(any(target_arch = "x86_64", feature = "pci-hotplug")), allow(unused_variables, clippy::ptr_arg) )] add_tubes: &mut Vec<TaggedControlTube>, ) -> Result<VmRequestResult>2995 fn process_vm_request<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
2996 state: &mut ControlLoopState<V, Vcpu>,
2997 id: usize,
2998 tube: &Tube,
2999 request: VmRequest,
3000 #[cfg_attr(
3001 not(any(target_arch = "x86_64", feature = "pci-hotplug")),
3002 allow(unused_variables, clippy::ptr_arg)
3003 )]
3004 add_tubes: &mut Vec<TaggedControlTube>,
3005 ) -> Result<VmRequestResult> {
3006 #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
3007 let mut add_irq_control_tubes = Vec::new();
3008 #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
3009 let mut add_vm_memory_control_tubes = Vec::new();
3010
3011 #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
3012 let mut add_control_tube = |t| match t {
3013 AnyControlTube::DeviceControlTube(_) => {
3014 panic!("hotplugging DeviceControlTube not supported yet")
3015 }
3016 AnyControlTube::IrqTube(t) => add_irq_control_tubes.push(t),
3017 AnyControlTube::TaggedControlTube(t) => add_tubes.push(t),
3018 AnyControlTube::VmMemoryTube(t) => add_vm_memory_control_tubes.push(t),
3019 };
3020
3021 let response = match request {
3022 VmRequest::Exit => {
3023 return Ok(VmRequestResult::new(Some(VmResponse::Ok), true));
3024 }
3025 VmRequest::HotPlugVfioCommand { device, add } => {
3026 #[cfg(target_arch = "x86_64")]
3027 {
3028 handle_hotplug_command(
3029 state.linux,
3030 &mut state.sys_allocator.lock(),
3031 state.cfg,
3032 &mut add_control_tube,
3033 state.hp_control_tube,
3034 state.iommu_host_tube.as_ref().map(|t| t.lock()).as_deref(),
3035 &device,
3036 add,
3037 #[cfg(feature = "swap")]
3038 state.swap_controller,
3039 state.vfio_container_manager,
3040 )
3041 }
3042
3043 #[cfg(not(target_arch = "x86_64"))]
3044 {
3045 // Suppress warnings.
3046 let _ = (device, add);
3047 let _ = &state.vfio_container_manager;
3048 VmResponse::Ok
3049 }
3050 }
3051 #[cfg(feature = "pci-hotplug")]
3052 VmRequest::HotPlugNetCommand(net_cmd) => {
3053 if let Some(hotplug_manager) = state.hotplug_manager.as_mut() {
3054 handle_hotplug_net_command(
3055 net_cmd,
3056 state.linux,
3057 &mut state.sys_allocator.lock(),
3058 &mut add_control_tube,
3059 hotplug_manager,
3060 )
3061 } else {
3062 VmResponse::ErrString("PCI hotplug is not enabled.".to_owned())
3063 }
3064 }
3065 #[cfg(feature = "registered_events")]
3066 VmRequest::RegisterListener { socket_addr, event } => {
3067 let (registered_tube, already_registered) =
3068 find_registered_tube(state.registered_evt_tubes, &socket_addr, event);
3069
3070 if !already_registered {
3071 let addr_tube = make_addr_tube_from_maybe_existing(registered_tube, socket_addr)?;
3072
3073 if let Some(tubes) = state.registered_evt_tubes.get_mut(&event) {
3074 tubes.insert(addr_tube);
3075 } else {
3076 state
3077 .registered_evt_tubes
3078 .insert(event, vec![addr_tube].into_iter().collect());
3079 }
3080 }
3081 VmResponse::Ok
3082 }
3083 #[cfg(feature = "registered_events")]
3084 VmRequest::UnregisterListener { socket_addr, event } => {
3085 if let Some(tubes) = state.registered_evt_tubes.get_mut(&event) {
3086 tubes.retain(|t| t.socket_addr != socket_addr);
3087 }
3088 state
3089 .registered_evt_tubes
3090 .retain(|_, tubes| !tubes.is_empty());
3091 VmResponse::Ok
3092 }
3093 #[cfg(feature = "registered_events")]
3094 VmRequest::Unregister { socket_addr } => {
3095 for (_, tubes) in state.registered_evt_tubes.iter_mut() {
3096 tubes.retain(|t| t.socket_addr != socket_addr);
3097 }
3098 state
3099 .registered_evt_tubes
3100 .retain(|_, tubes| !tubes.is_empty());
3101 VmResponse::Ok
3102 }
3103 #[cfg(feature = "balloon")]
3104 VmRequest::BalloonCommand(cmd) => {
3105 if let Some(tube) = state.balloon_tube.as_mut() {
3106 let Some((r, key)) = tube.send_cmd(cmd, Some(id)) else {
3107 return Ok(VmRequestResult::new(None, false));
3108 };
3109 if key != id {
3110 let Some(TaggedControlTube::Vm(tube)) = state.control_tubes.get(&key) else {
3111 return Ok(VmRequestResult::new(None, false));
3112 };
3113 if let Err(e) = tube.send(&r) {
3114 error!("failed to send VmResponse: {}", e);
3115 }
3116 return Ok(VmRequestResult::new(None, false));
3117 }
3118 r
3119 } else {
3120 VmResponse::Err(base::Error::new(libc::ENOTSUP))
3121 }
3122 }
3123 VmRequest::VcpuPidTid => VmResponse::VcpuPidTidResponse {
3124 pid_tid_map: state.vcpus_pid_tid.clone(),
3125 },
3126 VmRequest::Throttle(vcpu, cycles) => {
3127 vcpu::kick_vcpu(
3128 &state.vcpu_handles.get(vcpu),
3129 state.linux.irq_chip.as_irq_chip(),
3130 VcpuControl::Throttle(cycles),
3131 );
3132 return Ok(VmRequestResult::new(None, false));
3133 }
3134 _ => {
3135 if !state.cfg.force_s2idle {
3136 #[cfg(feature = "pvclock")]
3137 if let Some(ref pvclock_host_tube) = state.pvclock_host_tube {
3138 // Update clock offset when pvclock is used.
3139 if let VmRequest::ResumeVcpus = request {
3140 let cmd = PvClockCommand::Resume;
3141 match send_pvclock_cmd(pvclock_host_tube, cmd.clone()) {
3142 Ok(action) => {
3143 info!("{:?} command successfully processed", cmd);
3144 if let Some(action) = action {
3145 match action {
3146 #[cfg(target_arch = "aarch64")]
3147 PvClockAction::SetCounterOffset(offset) => {
3148 state.linux.vm.set_counter_offset(offset)?;
3149 }
3150 }
3151 }
3152 }
3153 Err(e) => error!("{:?} command failed: {:#}", cmd, e),
3154 };
3155 }
3156 }
3157 }
3158 let kick_all_vcpus = |msg| {
3159 if let VcpuControl::RunState(VmRunMode::Running) = msg {
3160 for dev in &state.linux.resume_notify_devices {
3161 dev.lock().resume_imminent();
3162 }
3163 }
3164 vcpu::kick_all_vcpus(state.vcpu_handles, state.linux.irq_chip.as_irq_chip(), msg);
3165 };
3166 let response = request.execute(
3167 &state.linux.vm,
3168 state.disk_host_tubes,
3169 &mut state.linux.pm,
3170 #[cfg(feature = "gpu")]
3171 state.gpu_control_tube,
3172 #[cfg(not(feature = "gpu"))]
3173 None,
3174 #[cfg(feature = "usb")]
3175 Some(state.usb_control_tube),
3176 #[cfg(not(feature = "usb"))]
3177 None,
3178 &mut state.linux.bat_control,
3179 kick_all_vcpus,
3180 |index, msg| {
3181 vcpu::kick_vcpu(
3182 &state.vcpu_handles.get(index),
3183 state.linux.irq_chip.as_irq_chip(),
3184 msg,
3185 )
3186 },
3187 state.cfg.force_s2idle,
3188 #[cfg(feature = "swap")]
3189 state.swap_controller.as_ref(),
3190 state.device_ctrl_tube,
3191 state.vcpu_handles.len(),
3192 state.irq_handler_control,
3193 || state.linux.irq_chip.snapshot(state.linux.vcpu_count),
3194 state.suspended_pvclock_state,
3195 );
3196 if state.cfg.force_s2idle {
3197 if let VmRequest::SuspendVcpus = request {
3198 // Spawn s2idle wait thread.
3199 let send_tube = tube.try_clone_send_tube().unwrap();
3200 let suspend_tube = state.linux.suspend_tube.0.clone();
3201 let guest_suspended_cvar = state.guest_suspended_cvar.clone();
3202 let delayed_response = response.clone();
3203 let pm = state.linux.pm.clone();
3204
3205 std::thread::Builder::new()
3206 .name("s2idle_wait".to_owned())
3207 .spawn(move || {
3208 trigger_vm_suspend_and_wait_for_entry(
3209 guest_suspended_cvar.unwrap(),
3210 &send_tube,
3211 delayed_response,
3212 suspend_tube,
3213 pm,
3214 )
3215 })
3216 .context("failed to spawn s2idle_wait thread")?;
3217
3218 // For s2idle, omit the response since it will be sent by
3219 // s2idle_wait thread when suspension actually happens.
3220 return Ok(VmRequestResult::new(None, false));
3221 }
3222 } else {
3223 #[cfg(feature = "pvclock")]
3224 if let Some(ref pvclock_host_tube) = state.pvclock_host_tube {
3225 // Record the time after VCPUs are suspended to track suspension duration.
3226 if let VmRequest::SuspendVcpus = request {
3227 let cmd = PvClockCommand::Suspend;
3228 match send_pvclock_cmd(pvclock_host_tube, cmd.clone()) {
3229 Ok(action) => {
3230 info!("{:?} command successfully processed", cmd);
3231 if let Some(action) = action {
3232 error!("Unexpected action {:?} requested for suspend", action);
3233 }
3234 }
3235 Err(e) => error!("{:?} command failed: {:#}", cmd, e),
3236 };
3237 }
3238 }
3239 }
3240 response
3241 }
3242 };
3243
3244 cfg_if::cfg_if! {
3245 if #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))] {
3246 if !add_irq_control_tubes.is_empty() {
3247 state
3248 .irq_handler_control
3249 .send(&IrqHandlerRequest::AddIrqControlTubes(
3250 add_irq_control_tubes,
3251 ))?;
3252 }
3253 if !add_vm_memory_control_tubes.is_empty() {
3254 state
3255 .vm_memory_handler_control
3256 .send(&VmMemoryHandlerRequest::AddControlTubes(
3257 add_vm_memory_control_tubes,
3258 ))?;
3259 }
3260 }
3261 }
3262
3263 Ok(VmRequestResult::new(Some(response), false))
3264 }
3265
process_vm_control_event<V: VmArch + 'static, Vcpu: VcpuArch + 'static>( state: &mut ControlLoopState<V, Vcpu>, id: usize, socket: &TaggedControlTube, ) -> Result<(bool, Vec<usize>, Vec<TaggedControlTube>)>3266 fn process_vm_control_event<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
3267 state: &mut ControlLoopState<V, Vcpu>,
3268 id: usize,
3269 socket: &TaggedControlTube,
3270 ) -> Result<(bool, Vec<usize>, Vec<TaggedControlTube>)> {
3271 let mut vm_control_ids_to_remove = Vec::new();
3272 let mut add_tubes = Vec::new();
3273 match socket {
3274 TaggedControlTube::Vm(tube) => match tube.recv::<VmRequest>() {
3275 Ok(request) => {
3276 let res = process_vm_request(state, id, tube, request, &mut add_tubes)?;
3277
3278 if let Some(response) = res.response {
3279 if let Err(e) = tube.send(&response) {
3280 error!("failed to send VmResponse: {}", e);
3281 }
3282 }
3283
3284 if res.exit {
3285 return Ok((true, Vec::new(), Vec::new()));
3286 }
3287 }
3288 Err(e) => {
3289 if let TubeError::Disconnected = e {
3290 vm_control_ids_to_remove.push(id);
3291 } else {
3292 error!("failed to recv VmRequest: {}", e);
3293 }
3294 }
3295 },
3296 TaggedControlTube::VmMsync(tube) => match tube.recv::<VmMemoryMappingRequest>() {
3297 Ok(request) => {
3298 let response = request.execute(&mut state.linux.vm);
3299 if let Err(e) = tube.send(&response) {
3300 error!("failed to send VmMsyncResponse: {}", e);
3301 }
3302 }
3303 Err(e) => {
3304 if let TubeError::Disconnected = e {
3305 vm_control_ids_to_remove.push(id);
3306 } else {
3307 error!("failed to recv VmMsyncRequest: {}", e);
3308 }
3309 }
3310 },
3311 TaggedControlTube::Fs(tube) => match tube.recv::<FsMappingRequest>() {
3312 Ok(request) => {
3313 let response =
3314 request.execute(&mut state.linux.vm, &mut state.sys_allocator.lock());
3315 if let Err(e) = tube.send(&response) {
3316 error!("failed to send VmResponse: {}", e);
3317 }
3318 }
3319 Err(e) => {
3320 if let TubeError::Disconnected = e {
3321 vm_control_ids_to_remove.push(id);
3322 } else {
3323 error!("failed to recv VmResponse: {}", e);
3324 }
3325 }
3326 },
3327 }
3328
3329 Ok((false, vm_control_ids_to_remove, add_tubes))
3330 }
3331
#[cfg(feature = "registered_events")]
/// A `ProtoTube` paired with the socket address it serves. All tubes for the
/// same address share one underlying `ProtoTube` via `Rc` (see
/// `find_registered_tube`), and equality/hashing use only `socket_addr`.
struct AddressedProtoTube {
    tube: Rc<ProtoTube>,
    socket_addr: String,
}
3337
#[cfg(feature = "registered_events")]
impl PartialEq for AddressedProtoTube {
    // Two addressed tubes are equal iff they target the same socket address;
    // the underlying tube is deliberately ignored.
    fn eq(&self, other: &Self) -> bool {
        self.socket_addr == other.socket_addr
    }
}
3344
#[cfg(feature = "registered_events")]
// `eq` is a full equivalence relation over `socket_addr`, so `Eq` is sound.
impl Eq for AddressedProtoTube {}
3347
#[cfg(feature = "registered_events")]
impl Hash for AddressedProtoTube {
    // Hash only the socket address, keeping the hash consistent with `PartialEq`.
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        self.socket_addr.hash(state);
    }
}
3354
#[cfg(feature = "registered_events")]
impl AddressedProtoTube {
    /// Sends a protobuf message over the underlying shared `ProtoTube`.
    pub fn send<M: protobuf::Message>(&self, msg: &M) -> Result<(), base::TubeError> {
        self.tube.send_proto(msg)
    }
}
3361
#[cfg(feature = "registered_events")]
/// Searches the registered event tubes for `socket_addr`.
///
/// Returns the existing `ProtoTube` connected to that address (if any) and
/// whether the address is already registered for `event` specifically.
fn find_registered_tube<'a>(
    registered_tubes: &'a HashMap<RegisteredEvent, HashSet<AddressedProtoTube>>,
    socket_addr: &str,
    event: RegisteredEvent,
) -> (Option<&'a Rc<ProtoTube>>, bool) {
    let mut existing_tube: Option<&Rc<ProtoTube>> = None;
    for (registered_event, addr_tubes) in registered_tubes {
        let Some(matching) = addr_tubes.iter().find(|t| t.socket_addr == socket_addr) else {
            continue;
        };
        if *registered_event == event {
            // The address is already registered for this exact event.
            return (existing_tube, true);
        }
        // All tubes for the same address are Rcs to the same underlying tube,
        // so any match will do -- but keep scanning in case `event` itself is
        // registered for this address.
        existing_tube = Some(&matching.tube);
    }
    (existing_tube, false)
}
3388
#[cfg(feature = "registered_events")]
/// Returns an `AddressedProtoTube` for `addr`, reusing `tube` when one already
/// exists for that address, otherwise connecting a fresh `ProtoTube`.
///
/// Errors if connecting to `addr` or wrapping the socket fails.
fn make_addr_tube_from_maybe_existing(
    tube: Option<&Rc<ProtoTube>>,
    addr: String,
) -> Result<AddressedProtoTube> {
    if let Some(registered_tube) = tube {
        Ok(AddressedProtoTube {
            tube: registered_tube.clone(),
            socket_addr: addr,
        })
    } else {
        // Borrow `addr` for the connect call instead of cloning it; the owned
        // String is still needed below for the returned struct.
        let sock = UnixSeqpacket::connect(&addr).with_context(|| {
            format!("failed to connect to registered listening socket {}", addr)
        })?;
        let tube = ProtoTube::new_from_unix_seqpacket(sock)?;
        Ok(AddressedProtoTube {
            tube: Rc::new(tube),
            socket_addr: addr,
        })
    }
}
3410
run_control<V: VmArch + 'static, Vcpu: VcpuArch + 'static>( mut linux: RunnableLinuxVm<V, Vcpu>, sys_allocator: SystemAllocator, cfg: Config, control_server_socket: Option<UnlinkUnixSeqpacketListener>, all_control_tubes: Vec<AnyControlTube>, #[cfg(feature = "usb")] usb_control_tube: Tube, vm_evt_rdtube: RecvTube, vm_evt_wrtube: SendTube, sigchld_fd: SignalFd, gralloc: RutabagaGralloc, vcpu_ids: Vec<usize>, iommu_host_tube: Option<Tube>, #[cfg(target_arch = "x86_64")] hp_control_tube: mpsc::Sender<PciRootCommand>, #[cfg(target_arch = "x86_64")] hp_thread: std::thread::JoinHandle<()>, #[cfg(feature = "pci-hotplug")] mut hotplug_manager: Option<PciHotPlugManager>, #[allow(unused_mut)] #[cfg(feature = "swap")] mut swap_controller: Option<SwapController>, #[cfg(feature = "registered_events")] reg_evt_rdtube: RecvTube, guest_suspended_cvar: Option<Arc<(Mutex<bool>, Condvar)>>, metrics_tube: RecvTube, mut vfio_container_manager: VfioContainerManager, mut worker_process_pids: BTreeSet<Pid>, #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] vcpu_domain_paths: BTreeMap< usize, PathBuf, >, ) -> Result<ExitState>3411 fn run_control<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
3412 mut linux: RunnableLinuxVm<V, Vcpu>,
3413 sys_allocator: SystemAllocator,
3414 cfg: Config,
3415 control_server_socket: Option<UnlinkUnixSeqpacketListener>,
3416 all_control_tubes: Vec<AnyControlTube>,
3417 #[cfg(feature = "usb")] usb_control_tube: Tube,
3418 vm_evt_rdtube: RecvTube,
3419 vm_evt_wrtube: SendTube,
3420 sigchld_fd: SignalFd,
3421 gralloc: RutabagaGralloc,
3422 vcpu_ids: Vec<usize>,
3423 iommu_host_tube: Option<Tube>,
3424 #[cfg(target_arch = "x86_64")] hp_control_tube: mpsc::Sender<PciRootCommand>,
3425 #[cfg(target_arch = "x86_64")] hp_thread: std::thread::JoinHandle<()>,
3426 #[cfg(feature = "pci-hotplug")] mut hotplug_manager: Option<PciHotPlugManager>,
3427 #[allow(unused_mut)] // mut is required x86 only
3428 #[cfg(feature = "swap")]
3429 mut swap_controller: Option<SwapController>,
3430 #[cfg(feature = "registered_events")] reg_evt_rdtube: RecvTube,
3431 guest_suspended_cvar: Option<Arc<(Mutex<bool>, Condvar)>>,
3432 metrics_tube: RecvTube,
3433 mut vfio_container_manager: VfioContainerManager,
3434 // A set of PID of child processes whose clean exit is expected and can be ignored.
3435 mut worker_process_pids: BTreeSet<Pid>,
3436 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] vcpu_domain_paths: BTreeMap<
3437 usize,
3438 PathBuf,
3439 >,
3440 ) -> Result<ExitState> {
3441 // Split up `all_control_tubes`.
3442 #[cfg(feature = "balloon")]
3443 let mut balloon_host_tube = None;
3444 let mut disk_host_tubes = Vec::new();
3445 #[cfg(feature = "gpu")]
3446 let mut gpu_control_tube = None;
3447 #[cfg(feature = "pvclock")]
3448 let mut pvclock_host_tube = None;
3449 let mut irq_control_tubes = Vec::new();
3450 let mut vm_memory_control_tubes = Vec::new();
3451 let mut control_tubes = Vec::new();
3452 for t in all_control_tubes {
3453 match t {
3454 #[cfg(feature = "balloon")]
3455 AnyControlTube::DeviceControlTube(DeviceControlTube::Balloon(t)) => {
3456 assert!(balloon_host_tube.is_none());
3457 balloon_host_tube = Some(t)
3458 }
3459 AnyControlTube::DeviceControlTube(DeviceControlTube::Disk(t)) => {
3460 disk_host_tubes.push(t)
3461 }
3462 #[cfg(feature = "gpu")]
3463 AnyControlTube::DeviceControlTube(DeviceControlTube::Gpu(t)) => {
3464 assert!(gpu_control_tube.is_none());
3465 gpu_control_tube = Some(t)
3466 }
3467 #[cfg(feature = "pvclock")]
3468 AnyControlTube::DeviceControlTube(DeviceControlTube::PvClock(t)) => {
3469 assert!(pvclock_host_tube.is_none());
3470 pvclock_host_tube = Some(Arc::new(t))
3471 }
3472 AnyControlTube::IrqTube(t) => irq_control_tubes.push(t),
3473 AnyControlTube::TaggedControlTube(t) => control_tubes.push(t),
3474 AnyControlTube::VmMemoryTube(t) => vm_memory_control_tubes.push(t),
3475 }
3476 }
3477
3478 #[cfg(feature = "gdb")]
3479 let (to_gdb_channel, gdb) = if let Some(port) = cfg.gdb {
3480 // GDB needs a control socket to interrupt vcpus.
3481 let (gdb_host_tube, gdb_control_tube) = Tube::pair().context("failed to create tube")?;
3482 control_tubes.push(TaggedControlTube::Vm(gdb_host_tube));
3483 // Create a channel for GDB thread.
3484 let (to_gdb_channel, from_vcpu_channel) = mpsc::channel();
3485 (
3486 Some(to_gdb_channel),
3487 Some((port, gdb_control_tube, from_vcpu_channel)),
3488 )
3489 } else {
3490 (None, None)
3491 };
3492
3493 #[derive(EventToken)]
3494 enum Token {
3495 VmEvent,
3496 Suspend,
3497 ChildSignal,
3498 VmControlServer,
3499 VmControl {
3500 id: usize,
3501 },
3502 #[cfg(feature = "registered_events")]
3503 RegisteredEvent,
3504 #[cfg(feature = "balloon")]
3505 BalloonTube,
3506 }
3507 stdin()
3508 .set_raw_mode()
3509 .expect("failed to set terminal raw mode");
3510
3511 let sys_allocator_mutex = Arc::new(Mutex::new(sys_allocator));
3512 let iommu_host_tube = iommu_host_tube.map(|t| Arc::new(Mutex::new(t)));
3513
3514 let wait_ctx = WaitContext::build_with(&[
3515 (&linux.suspend_tube.1, Token::Suspend),
3516 (&sigchld_fd, Token::ChildSignal),
3517 (&vm_evt_rdtube, Token::VmEvent),
3518 #[cfg(feature = "registered_events")]
3519 (®_evt_rdtube, Token::RegisteredEvent),
3520 ])
3521 .context("failed to build wait context")?;
3522
3523 if let Some(socket_server) = &control_server_socket {
3524 wait_ctx
3525 .add(socket_server, Token::VmControlServer)
3526 .context("failed to add descriptor to wait context")?;
3527 }
3528 let mut control_tubes = BTreeMap::from_iter(control_tubes.into_iter().enumerate());
3529 let mut next_control_id = control_tubes.len();
3530 for (id, socket) in control_tubes.iter() {
3531 wait_ctx
3532 .add(socket.as_ref(), Token::VmControl { id: *id })
3533 .context("failed to add descriptor to wait context")?;
3534 }
3535
3536 #[cfg(feature = "balloon")]
3537 let mut balloon_tube = balloon_host_tube
3538 .map(|tube| -> Result<BalloonTube> {
3539 wait_ctx
3540 .add(&tube, Token::BalloonTube)
3541 .context("failed to add descriptor to wait context")?;
3542 Ok(BalloonTube::new(tube))
3543 })
3544 .transpose()
3545 .context("failed to create balloon tube")?;
3546
3547 if cfg.jail_config.is_some() {
3548 // Before starting VCPUs, in case we started with some capabilities, drop them all.
3549 drop_capabilities().context("failed to drop process capabilities")?;
3550 }
3551
3552 let (device_ctrl_tube, device_ctrl_resp) = Tube::pair().context("failed to create tube")?;
3553 // Create devices thread, and restore if a restore file exists.
3554 linux.devices_thread = match create_devices_worker_thread(
3555 linux.vm.get_memory().clone(),
3556 linux.io_bus.clone(),
3557 linux.mmio_bus.clone(),
3558 device_ctrl_resp,
3559 ) {
3560 Ok(join_handle) => Some(join_handle),
3561 Err(e) => {
3562 return Err(anyhow!("Failed to start devices thread: {}", e));
3563 }
3564 };
3565
3566 let mut vcpu_handles = Vec::with_capacity(linux.vcpu_count);
3567 let vcpu_thread_barrier = Arc::new(Barrier::new(linux.vcpu_count + 1));
3568
3569 if !linux
3570 .vm
3571 .get_hypervisor()
3572 .check_capability(HypervisorCap::ImmediateExit)
3573 {
3574 return Err(anyhow!(
3575 "missing required hypervisor capability ImmediateExit"
3576 ));
3577 }
3578
3579 vcpu::setup_vcpu_signal_handler()?;
3580
3581 let vcpus: Vec<Option<_>> = match linux.vcpus.take() {
3582 Some(vec) => vec.into_iter().map(Some).collect(),
3583 None => iter::repeat_with(|| None).take(linux.vcpu_count).collect(),
3584 };
3585 // Enable core scheduling before creating vCPUs so that the cookie will be
3586 // shared by all vCPU threads.
3587 // TODO(b/199312402): Avoid enabling core scheduling for the crosvm process
3588 // itself for even better performance. Only vCPUs need the feature.
3589 if cfg.core_scheduling && cfg.per_vm_core_scheduling {
3590 if let Err(e) = enable_core_scheduling() {
3591 error!("Failed to enable core scheduling: {}", e);
3592 }
3593 }
3594
3595 // The tasks file only exist on sysfs if CgroupV1 hierachies are enabled
3596 let vcpu_cgroup_tasks_file = match &cfg.vcpu_cgroup_path {
3597 None => None,
3598 Some(cgroup_path) => {
3599 // Move main process to cgroup_path
3600 match File::create(cgroup_path.join("tasks")) {
3601 Ok(file) => Some(file),
3602 Err(_) => {
3603 info!(
3604 "Unable to open tasks file in cgroup: {}, trying CgroupV2",
3605 cgroup_path.display()
3606 );
3607 None
3608 }
3609 }
3610 }
3611 };
3612
3613 // vCPU freq domains are currently only supported with CgroupsV2.
3614 let mut vcpu_cgroup_v2_files: std::collections::BTreeMap<usize, File> = BTreeMap::new();
3615 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
3616 for (vcpu_id, vcpu_domain_path) in vcpu_domain_paths.iter() {
3617 let vcpu_cgroup_v2_file = File::create(vcpu_domain_path.join("cgroup.threads"))
3618 .with_context(|| {
3619 format!(
3620 "failed to create vcpu-cgroup-path {}",
3621 vcpu_domain_path.join("cgroup.threads").display(),
3622 )
3623 })?;
3624 vcpu_cgroup_v2_files.insert(*vcpu_id, vcpu_cgroup_v2_file);
3625 }
3626
3627 #[cfg(target_arch = "x86_64")]
3628 let bus_lock_ratelimit_ctrl: Arc<Mutex<Ratelimit>> = Arc::new(Mutex::new(Ratelimit::new()));
3629 #[cfg(target_arch = "x86_64")]
3630 if cfg.bus_lock_ratelimit > 0 {
3631 let bus_lock_ratelimit = cfg.bus_lock_ratelimit;
3632 if linux.vm.check_capability(VmCap::BusLockDetect) {
3633 info!("Hypervisor support bus lock detect");
3634 linux
3635 .vm
3636 .enable_capability(VmCap::BusLockDetect, 0)
3637 .expect("kvm: Failed to enable bus lock detection cap");
3638 info!("Hypervisor enabled bus lock detect");
3639 bus_lock_ratelimit_ctrl
3640 .lock()
3641 .ratelimit_set_speed(bus_lock_ratelimit);
3642 } else {
3643 bail!("Kvm: bus lock detection unsuported");
3644 }
3645 }
3646
3647 #[cfg(target_os = "android")]
3648 android::set_process_profiles(&cfg.task_profiles)?;
3649
3650 #[allow(unused_mut)]
3651 let mut run_mode = if cfg.suspended {
3652 // Sleep devices before creating vcpus.
3653 device_ctrl_tube
3654 .send(&DeviceControlCommand::SleepDevices)
3655 .context("send command to devices control socket")?;
3656 match device_ctrl_tube
3657 .recv()
3658 .context("receive from devices control socket")?
3659 {
3660 VmResponse::Ok => (),
3661 resp => bail!("device sleep failed: {}", resp),
3662 }
3663 VmRunMode::Suspending
3664 } else {
3665 VmRunMode::Running
3666 };
3667 #[cfg(feature = "gdb")]
3668 if to_gdb_channel.is_some() {
3669 // Wait until a GDB client attaches
3670 run_mode = VmRunMode::Breakpoint;
3671 }
3672 // If we are restoring from a snapshot, then start suspended.
3673 let (run_mode, post_restore_run_mode) = if cfg.restore_path.is_some() {
3674 (VmRunMode::Suspending, run_mode)
3675 } else {
3676 (run_mode, run_mode)
3677 };
3678
3679 // Architecture-specific code must supply a vcpu_init element for each VCPU.
3680 assert_eq!(vcpus.len(), linux.vcpu_init.len());
3681
3682 let (vcpu_pid_tid_sender, vcpu_pid_tid_receiver) = mpsc::channel();
3683 for ((cpu_id, vcpu), vcpu_init) in vcpus.into_iter().enumerate().zip(linux.vcpu_init.drain(..))
3684 {
3685 let vcpu_cgroup_file: Option<File>;
3686 if let Some(cgroup_file) = &vcpu_cgroup_tasks_file {
3687 vcpu_cgroup_file = Some(cgroup_file.try_clone().unwrap())
3688 } else if !cfg.cpu_freq_domains.is_empty() {
3689 vcpu_cgroup_file = Some(
3690 (vcpu_cgroup_v2_files.remove(&cpu_id).unwrap())
3691 .try_clone()
3692 .unwrap(),
3693 )
3694 } else {
3695 vcpu_cgroup_file = None
3696 };
3697
3698 let (to_vcpu_channel, from_main_channel) = mpsc::channel();
3699 let vcpu_affinity = match linux.vcpu_affinity.clone() {
3700 Some(VcpuAffinity::Global(v)) => v,
3701 Some(VcpuAffinity::PerVcpu(mut m)) => m.remove(&cpu_id).unwrap_or_default(),
3702 None => Default::default(),
3703 };
3704
3705 #[cfg(target_arch = "x86_64")]
3706 let vcpu_hybrid_type = if !cfg.vcpu_hybrid_type.is_empty() {
3707 Some(*cfg.vcpu_hybrid_type.get(&cpu_id).unwrap())
3708 } else {
3709 None
3710 };
3711
3712 #[cfg(target_arch = "x86_64")]
3713 let cpu_config = Some(CpuConfigX86_64::new(
3714 cfg.force_calibrated_tsc_leaf,
3715 cfg.host_cpu_topology,
3716 cfg.enable_hwp,
3717 cfg.no_smt,
3718 cfg.itmt,
3719 vcpu_hybrid_type,
3720 ));
3721 #[cfg(target_arch = "x86_64")]
3722 let bus_lock_ratelimit_ctrl = Arc::clone(&bus_lock_ratelimit_ctrl);
3723
3724 #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
3725 let cpu_config = None;
3726
3727 #[cfg(target_arch = "riscv64")]
3728 let cpu_config = Some(CpuConfigRiscv64::new(vcpu_init.fdt_address));
3729
3730 let handle = vcpu::run_vcpu(
3731 cpu_id,
3732 vcpu_ids[cpu_id],
3733 vcpu,
3734 vcpu_init,
3735 linux.vm.try_clone().context("failed to clone vm")?,
3736 linux
3737 .irq_chip
3738 .try_box_clone()
3739 .context("failed to clone irqchip")?,
3740 linux.vcpu_count,
3741 linux.rt_cpus.contains(&cpu_id),
3742 vcpu_affinity,
3743 linux.delay_rt,
3744 vcpu_thread_barrier.clone(),
3745 (*linux.io_bus).clone(),
3746 (*linux.mmio_bus).clone(),
3747 vm_evt_wrtube
3748 .try_clone()
3749 .context("failed to clone vm event tube")?,
3750 from_main_channel,
3751 #[cfg(feature = "gdb")]
3752 to_gdb_channel.clone(),
3753 cfg.core_scheduling,
3754 cfg.per_vm_core_scheduling,
3755 cpu_config,
3756 match vcpu_cgroup_file {
3757 None => None,
3758 Some(ref f) => Some(
3759 f.try_clone()
3760 .context("failed to clone vcpu cgroup tasks file")?,
3761 ),
3762 },
3763 #[cfg(target_arch = "x86_64")]
3764 bus_lock_ratelimit_ctrl,
3765 run_mode,
3766 cfg.boost_uclamp,
3767 vcpu_pid_tid_sender.clone(),
3768 )?;
3769 vcpu_handles.push((handle, to_vcpu_channel));
3770 }
3771
3772 let mut vcpus_pid_tid = BTreeMap::new();
3773 for _ in 0..vcpu_handles.len() {
3774 let vcpu_pid_tid: VcpuPidTid = vcpu_pid_tid_receiver
3775 .recv()
3776 .context("failed receiving vcpu pid/tid")?;
3777 if vcpus_pid_tid
3778 .insert(
3779 vcpu_pid_tid.vcpu_id,
3780 (vcpu_pid_tid.process_id, vcpu_pid_tid.thread_id),
3781 )
3782 .is_some()
3783 {
3784 return Err(anyhow!(
3785 "Vcpu {} returned more than 1 PID and TID",
3786 vcpu_pid_tid.vcpu_id
3787 ));
3788 }
3789 }
3790
3791 #[cfg(feature = "gdb")]
3792 // Spawn GDB thread.
3793 if let Some((gdb_port_num, gdb_control_tube, from_vcpu_channel)) = gdb {
3794 let to_vcpu_channels = vcpu_handles
3795 .iter()
3796 .map(|(_handle, channel)| channel.clone())
3797 .collect();
3798 let target = GdbStub::new(gdb_control_tube, to_vcpu_channels, from_vcpu_channel);
3799 std::thread::Builder::new()
3800 .name("gdb".to_owned())
3801 .spawn(move || gdb_thread(target, gdb_port_num))
3802 .context("failed to spawn GDB thread")?;
3803 };
3804
3805 let (irq_handler_control, irq_handler_control_for_thread) = Tube::pair()?;
3806 let sys_allocator_for_thread = sys_allocator_mutex.clone();
3807 let irq_chip_for_thread = linux.irq_chip.try_box_clone()?;
3808 let irq_handler_thread = std::thread::Builder::new()
3809 .name("irq_handler_thread".into())
3810 .spawn(move || {
3811 irq_handler_thread(
3812 irq_control_tubes,
3813 irq_chip_for_thread,
3814 sys_allocator_for_thread,
3815 irq_handler_control_for_thread,
3816 )
3817 })
3818 .unwrap();
3819
3820 let (vm_memory_handler_control, vm_memory_handler_control_for_thread) = Tube::pair()?;
3821 let vm_memory_handler_thread = std::thread::Builder::new()
3822 .name("vm_memory_handler_thread".into())
3823 .spawn({
3824 let vm = linux.vm.try_clone().context("failed to clone Vm")?;
3825 let sys_allocator_mutex = sys_allocator_mutex.clone();
3826 let iommu_client = iommu_host_tube
3827 .as_ref()
3828 .map(|t| VmMemoryRequestIommuClient::new(t.clone()));
3829 move || {
3830 vm_memory_handler_thread(
3831 vm_memory_control_tubes,
3832 vm,
3833 sys_allocator_mutex,
3834 gralloc,
3835 iommu_client,
3836 vm_memory_handler_control_for_thread,
3837 )
3838 }
3839 })
3840 .unwrap();
3841
3842 vcpu_thread_barrier.wait();
3843
3844 // See comment on `VmRequest::execute`.
3845 let mut suspended_pvclock_state: Option<hypervisor::ClockState> = None;
3846
3847 // Restore VM (if applicable).
3848 // Must happen after the vCPU barrier to avoid deadlock.
3849 if let Some(path) = &cfg.restore_path {
3850 vm_control::do_restore(
3851 path,
3852 |msg| vcpu::kick_all_vcpus(&vcpu_handles, linux.irq_chip.as_irq_chip(), msg),
3853 |msg, index| {
3854 vcpu::kick_vcpu(&vcpu_handles.get(index), linux.irq_chip.as_irq_chip(), msg)
3855 },
3856 &irq_handler_control,
3857 &device_ctrl_tube,
3858 linux.vcpu_count,
3859 |image| {
3860 linux
3861 .irq_chip
3862 .try_box_clone()?
3863 .restore(image, linux.vcpu_count)
3864 },
3865 /* require_encrypted= */ false,
3866 &mut suspended_pvclock_state,
3867 )?;
3868 // Allow the vCPUs to start for real.
3869 vcpu::kick_all_vcpus(
3870 &vcpu_handles,
3871 linux.irq_chip.as_irq_chip(),
3872 VcpuControl::RunState(post_restore_run_mode),
3873 )
3874 }
3875
3876 #[cfg(feature = "swap")]
3877 if let Some(swap_controller) = &swap_controller {
3878 swap_controller
3879 .on_static_devices_setup_complete()
3880 .context("static device setup complete")?;
3881 }
3882
3883 let metrics_thread = if metrics::is_initialized() {
3884 Some(
3885 std::thread::Builder::new()
3886 .name("metrics_thread".into())
3887 .spawn(move || {
3888 if let Err(e) = MetricsController::new(vec![metrics_tube]).run() {
3889 error!("Metrics controller error: {:?}", e);
3890 }
3891 })
3892 .context("metrics thread failed")?,
3893 )
3894 } else {
3895 None
3896 };
3897
3898 let mut exit_state = ExitState::Stop;
3899 let mut pvpanic_code = PvPanicCode::Unknown;
3900 #[cfg(feature = "registered_events")]
3901 let mut registered_evt_tubes: HashMap<RegisteredEvent, HashSet<AddressedProtoTube>> =
3902 HashMap::new();
3903
3904 'wait: loop {
3905 let events = {
3906 match wait_ctx.wait() {
3907 Ok(v) => v,
3908 Err(e) => {
3909 error!("failed to poll: {}", e);
3910 break;
3911 }
3912 }
3913 };
3914
3915 let mut vm_control_ids_to_remove = Vec::new();
3916 for event in events.iter().filter(|e| e.is_readable) {
3917 match event.token {
3918 #[cfg(feature = "registered_events")]
3919 Token::RegisteredEvent => match reg_evt_rdtube.recv::<RegisteredEventWithData>() {
3920 Ok(reg_evt) => {
3921 let evt = reg_evt.into_event();
3922 let mut tubes_to_remove: Vec<String> = Vec::new();
3923 if let Some(tubes) = registered_evt_tubes.get_mut(&evt) {
3924 for tube in tubes.iter() {
3925 if let Err(e) = tube.send(®_evt.into_proto()) {
3926 warn!(
3927 "failed to send registered event {:?} to {}, removing from \
3928 registrations: {}",
3929 reg_evt, tube.socket_addr, e
3930 );
3931 tubes_to_remove.push(tube.socket_addr.clone());
3932 }
3933 }
3934 }
3935 for tube_addr in tubes_to_remove {
3936 for tubes in registered_evt_tubes.values_mut() {
3937 tubes.retain(|t| t.socket_addr != tube_addr);
3938 }
3939 }
3940 registered_evt_tubes.retain(|_, tubes| !tubes.is_empty());
3941 }
3942 Err(e) => {
3943 warn!("failed to recv RegisteredEvent: {}", e);
3944 }
3945 },
3946 Token::VmEvent => {
3947 let mut break_to_wait: bool = true;
3948 match vm_evt_rdtube.recv::<VmEventType>() {
3949 Ok(vm_event) => match vm_event {
3950 VmEventType::Exit => {
3951 info!("vcpu requested shutdown");
3952 exit_state = ExitState::Stop;
3953 }
3954 VmEventType::Reset => {
3955 info!("vcpu requested reset");
3956 exit_state = ExitState::Reset;
3957 }
3958 VmEventType::Crash => {
3959 info!("vcpu crashed");
3960 exit_state = ExitState::Crash;
3961 }
3962 VmEventType::Panic(panic_code) => {
3963 pvpanic_code = PvPanicCode::from_u8(panic_code);
3964 info!("Guest reported panic [Code: {}]", pvpanic_code);
3965 break_to_wait = false;
3966 }
3967 VmEventType::WatchdogReset => {
3968 info!("vcpu stall detected");
3969 exit_state = ExitState::WatchdogReset;
3970 }
3971 },
3972 Err(e) => {
3973 warn!("failed to recv VmEvent: {}", e);
3974 }
3975 }
3976 if break_to_wait {
3977 if pvpanic_code == PvPanicCode::Panicked {
3978 exit_state = ExitState::GuestPanic;
3979 }
3980 break 'wait;
3981 }
3982 }
3983 Token::Suspend => match linux.suspend_tube.1.recv::<bool>() {
3984 Ok(is_suspend_request) => {
3985 let mode = if is_suspend_request {
3986 VmRunMode::Suspending
3987 } else {
3988 for dev in &linux.resume_notify_devices {
3989 dev.lock().resume_imminent();
3990 }
3991 VmRunMode::Running
3992 };
3993 info!("VM requested {}", mode);
3994 vcpu::kick_all_vcpus(
3995 &vcpu_handles,
3996 linux.irq_chip.as_irq_chip(),
3997 VcpuControl::RunState(mode),
3998 );
3999 }
4000 Err(err) => {
4001 warn!("Failed to read suspend tube {:?}", err);
4002 }
4003 },
4004 Token::ChildSignal => {
4005 // Print all available siginfo structs, then exit the loop if child process has
4006 // been exited except CLD_STOPPED and CLD_CONTINUED. the two should be ignored
4007 // here since they are used by the vmm-swap feature.
4008 let mut do_exit = false;
4009 while let Some(siginfo) =
4010 sigchld_fd.read().context("failed to read signalfd")?
4011 {
4012 let pid = siginfo.ssi_pid;
4013 let pid_label = match linux.pid_debug_label_map.get(&pid) {
4014 Some(label) => format!("{} (pid {})", label, pid),
4015 None => format!("pid {}", pid),
4016 };
4017
4018 // TODO(kawasin): this is a temporary exception until device suspension.
4019 #[cfg(feature = "swap")]
4020 if siginfo.ssi_code == libc::CLD_STOPPED
4021 || siginfo.ssi_code == libc::CLD_CONTINUED
4022 {
4023 continue;
4024 }
4025
4026 // Ignore clean exits of non-tracked child processes when running without
4027 // sandboxing. The virtio gpu process launches a render server for
4028 // pass-through graphics. Host GPU drivers have been observed to fork
4029 // child processes that exit cleanly which should not be considered a
4030 // crash. When running with sandboxing, this should be handled by the
4031 // device's process handler.
4032 if cfg.jail_config.is_none()
4033 && !linux.pid_debug_label_map.contains_key(&pid)
4034 && siginfo.ssi_signo == libc::SIGCHLD as u32
4035 && siginfo.ssi_code == libc::CLD_EXITED
4036 && siginfo.ssi_status == 0
4037 {
4038 continue;
4039 }
4040
4041 // Allow clean exits of a child process in `worker_process_pids`.
4042 if siginfo.ssi_signo == libc::SIGCHLD as u32
4043 && siginfo.ssi_code == libc::CLD_EXITED
4044 && siginfo.ssi_status == 0
4045 && worker_process_pids.remove(&(pid as Pid))
4046 {
4047 info!("child {pid} exited successfully");
4048 continue;
4049 }
4050
4051 error!(
4052 "child {} exited: signo {}, status {}, code {}",
4053 pid_label, siginfo.ssi_signo, siginfo.ssi_status, siginfo.ssi_code
4054 );
4055 do_exit = true;
4056 }
4057 if do_exit {
4058 exit_state = ExitState::Crash;
4059 break 'wait;
4060 }
4061 }
4062 Token::VmControlServer => {
4063 if let Some(socket_server) = &control_server_socket {
4064 match socket_server.accept() {
4065 Ok(socket) => {
4066 let id = next_control_id;
4067 next_control_id += 1;
4068 wait_ctx
4069 .add(&socket, Token::VmControl { id })
4070 .context("failed to add descriptor to wait context")?;
4071 control_tubes.insert(
4072 id,
4073 TaggedControlTube::Vm(Tube::new_from_unix_seqpacket(socket)?),
4074 );
4075 }
4076 Err(e) => error!("failed to accept socket: {}", e),
4077 }
4078 }
4079 }
4080 Token::VmControl { id } => {
4081 if let Some(socket) = control_tubes.get(&id) {
4082 let mut state = ControlLoopState {
4083 linux: &mut linux,
4084 cfg: &cfg,
4085 sys_allocator: &sys_allocator_mutex,
4086 control_tubes: &control_tubes,
4087 disk_host_tubes: &disk_host_tubes[..],
4088 #[cfg(feature = "gpu")]
4089 gpu_control_tube: gpu_control_tube.as_ref(),
4090 #[cfg(feature = "usb")]
4091 usb_control_tube: &usb_control_tube,
4092 #[cfg(target_arch = "x86_64")]
4093 iommu_host_tube: &iommu_host_tube,
4094 #[cfg(target_arch = "x86_64")]
4095 hp_control_tube: &hp_control_tube,
4096 guest_suspended_cvar: &guest_suspended_cvar,
4097 #[cfg(feature = "pci-hotplug")]
4098 hotplug_manager: &mut hotplug_manager,
4099 #[cfg(feature = "swap")]
4100 swap_controller: &mut swap_controller,
4101 vcpu_handles: &vcpu_handles,
4102 #[cfg(feature = "balloon")]
4103 balloon_tube: balloon_tube.as_mut(),
4104 device_ctrl_tube: &device_ctrl_tube,
4105 irq_handler_control: &irq_handler_control,
4106 #[cfg(any(target_arch = "x86_64", feature = "pci-hotplug"))]
4107 vm_memory_handler_control: &vm_memory_handler_control,
4108 #[cfg(feature = "registered_events")]
4109 registered_evt_tubes: &mut registered_evt_tubes,
4110 #[cfg(feature = "pvclock")]
4111 pvclock_host_tube: pvclock_host_tube.clone(),
4112 vfio_container_manager: &mut vfio_container_manager,
4113 suspended_pvclock_state: &mut suspended_pvclock_state,
4114 vcpus_pid_tid: &vcpus_pid_tid,
4115 };
4116 let (exit_requested, mut ids_to_remove, add_tubes) =
4117 process_vm_control_event(&mut state, id, socket)?;
4118 if exit_requested {
4119 break 'wait;
4120 }
4121 vm_control_ids_to_remove.append(&mut ids_to_remove);
4122 for socket in add_tubes {
4123 let id = next_control_id;
4124 next_control_id += 1;
4125 wait_ctx
4126 .add(socket.as_ref(), Token::VmControl { id })
4127 .context(
4128 "failed to add hotplug vfio-pci descriptor to wait context",
4129 )?;
4130 control_tubes.insert(id, socket);
4131 }
4132 }
4133 }
4134 #[cfg(feature = "balloon")]
4135 Token::BalloonTube => {
4136 match balloon_tube.as_mut().expect("missing balloon tube").recv() {
4137 Ok(resp) => {
4138 for (resp, idx) in resp {
4139 if let Some(TaggedControlTube::Vm(tube)) = control_tubes.get(&idx) {
4140 if let Err(e) = tube.send(&resp) {
4141 error!("failed to send VmResponse: {}", e);
4142 }
4143 } else {
4144 error!("Bad tube index {}", idx);
4145 }
4146 }
4147 }
4148 Err(err) => {
4149 error!("Error processing balloon tube {:?}", err)
4150 }
4151 }
4152 }
4153 }
4154 }
4155
4156 remove_hungup_and_drained_tubes(
4157 &events,
4158 &wait_ctx,
4159 &mut control_tubes,
4160 vm_control_ids_to_remove,
4161 |token: &Token| {
4162 if let Token::VmControl { id } = token {
4163 return Some(*id);
4164 }
4165 None
4166 },
4167 )?;
4168 }
4169
4170 vcpu::kick_all_vcpus(
4171 &vcpu_handles,
4172 linux.irq_chip.as_irq_chip(),
4173 VcpuControl::RunState(VmRunMode::Exiting),
4174 );
4175 for (handle, _) in vcpu_handles {
4176 if let Err(e) = handle.join() {
4177 error!("failed to join vcpu thread: {:?}", e);
4178 }
4179 }
4180
4181 // After joining all vcpu threads, unregister the process-wide signal handler.
4182 if let Err(e) = vcpu::remove_vcpu_signal_handler() {
4183 error!("failed to remove vcpu thread signal handler: {:#}", e);
4184 }
4185
4186 // Stop the vmm-swap monitor process.
4187 #[cfg(feature = "swap")]
4188 drop(swap_controller);
4189
4190 // Stop pci root worker thread
4191 #[cfg(target_arch = "x86_64")]
4192 {
4193 let _ = hp_control_tube.send(PciRootCommand::Kill);
4194 if let Err(e) = hp_thread.join() {
4195 error!("failed to join hotplug thread: {:?}", e);
4196 }
4197 }
4198
4199 if linux.devices_thread.is_some() {
4200 if let Err(e) = device_ctrl_tube.send(&DeviceControlCommand::Exit) {
4201 error!("failed to stop device control loop: {}", e);
4202 };
4203 if let Some(thread) = linux.devices_thread.take() {
4204 if let Err(e) = thread.join() {
4205 error!("failed to exit devices thread: {:?}", e);
4206 }
4207 }
4208 }
4209
4210 // Shut down the VM Memory handler thread.
4211 if let Err(e) = vm_memory_handler_control.send(&VmMemoryHandlerRequest::Exit) {
4212 error!(
4213 "failed to request exit from VM Memory handler thread: {}",
4214 e
4215 );
4216 }
4217 if let Err(e) = vm_memory_handler_thread.join() {
4218 error!("failed to exit VM Memory handler thread: {:?}", e);
4219 }
4220
4221 // Shut down the IRQ handler thread.
4222 if let Err(e) = irq_handler_control.send(&IrqHandlerRequest::Exit) {
4223 error!("failed to request exit from IRQ handler thread: {}", e);
4224 }
4225 if let Err(e) = irq_handler_thread.join() {
4226 error!("failed to exit irq handler thread: {:?}", e);
4227 }
4228
4229 // At this point, the only remaining `Arc` references to the `Bus` objects should be the ones
4230 // inside `linux`. If the checks below fail, then some other thread is probably still running
4231 // and needs to be explicitly stopped before dropping `linux` to ensure devices actually get
4232 // cleaned up.
4233 match Arc::try_unwrap(std::mem::replace(
4234 &mut linux.mmio_bus,
4235 Arc::new(Bus::new(BusType::Mmio)),
4236 )) {
4237 Ok(_) => {}
4238 Err(_) => panic!("internal error: mmio_bus had more than one reference at shutdown"),
4239 }
4240 match Arc::try_unwrap(std::mem::replace(
4241 &mut linux.io_bus,
4242 Arc::new(Bus::new(BusType::Io)),
4243 )) {
4244 Ok(_) => {}
4245 Err(_) => panic!("internal error: io_bus had more than one reference at shutdown"),
4246 }
4247
4248 // Explicitly drop the VM structure here to allow the devices to clean up before the
4249 // control sockets are closed when this function exits.
4250 mem::drop(linux);
4251
4252 // Drop the hotplug manager to tell the warden process to exit before we try to join
4253 // the metrics thread.
4254 #[cfg(feature = "pci-hotplug")]
4255 mem::drop(hotplug_manager);
4256
4257 // All our children should have exited by now, so closing our fd should
4258 // terminate metrics. Then join so that everything gets flushed.
4259 metrics::get_destructor().cleanup();
4260 if let Some(metrics_thread) = metrics_thread {
4261 if let Err(e) = metrics_thread.join() {
4262 error!("failed to exit irq handler thread: {:?}", e);
4263 }
4264 }
4265
4266 stdin()
4267 .set_canon_mode()
4268 .expect("failed to restore canonical mode for terminal");
4269
4270 Ok(exit_state)
4271 }
4272
/// Tokens identifying the event sources polled by the IRQ handler thread's
/// `WaitContext`.
#[derive(EventToken)]
enum IrqHandlerToken {
    /// An irqchip IRQ eventfd became readable; `index` is the irqchip event
    /// index passed back to `service_irq_event`.
    IrqFd { index: IrqEventIndex },
    /// A device IRQ control tube (keyed by `id`) has a pending `VmIrqRequest`.
    VmIrq { id: usize },
    /// The irqchip's delayed-IRQ trigger fired (see `irq_delayed_event_token`).
    DelayedIrqFd,
    /// The main thread's control tube has a pending `IrqHandlerRequest`.
    HandlerControl,
}
4280
/// Handles IRQs and requests from devices to add additional IRQ lines.
///
/// Loops until an `IrqHandlerRequest::Exit` arrives on `handler_control` (or
/// polling fails / the control tube hangs up), servicing:
/// - irqchip IRQ eventfds (`IrqHandlerToken::IrqFd`),
/// - the irqchip's delayed-IRQ trigger, if the chip exposes one,
/// - per-device IRQ control tubes carrying `VmIrqRequest`s,
/// - control requests from the main thread (add tubes, refresh event tokens,
///   wake-and-notify for snapshotting).
fn irq_handler_thread(
    irq_control_tubes: Vec<Tube>,
    mut irq_chip: Box<dyn IrqChipArch + 'static>,
    sys_allocator_mutex: Arc<Mutex<SystemAllocator>>,
    handler_control: Tube,
) -> anyhow::Result<()> {
    let wait_ctx = WaitContext::build_with(&[(
        handler_control.get_read_notifier(),
        IrqHandlerToken::HandlerControl,
    )])
    .context("failed to build wait context")?;

    // Not every irqchip has a delayed-event trigger; only register it if present.
    if let Some(delayed_ioapic_irq_trigger) = irq_chip.irq_delayed_event_token()? {
        wait_ctx
            .add(&delayed_ioapic_irq_trigger, IrqHandlerToken::DelayedIrqFd)
            .context("failed to add descriptor to wait context")?;
    }

    // (index, gsi, event) triples; only the index is needed to service events.
    let mut irq_event_tokens = irq_chip
        .irq_event_tokens()
        .context("failed get event tokens from irqchip")?;

    for (index, _gsi, evt) in irq_event_tokens.iter() {
        wait_ctx
            .add(evt, IrqHandlerToken::IrqFd { index: *index })
            .context("failed to add irq chip event tokens to wait context")?;
    }

    // Tubes are keyed by a monotonically increasing id so hotplugged tubes
    // (added via AddIrqControlTubes below) never reuse an existing key.
    let mut irq_control_tubes = BTreeMap::from_iter(irq_control_tubes.into_iter().enumerate());
    let mut next_control_id = irq_control_tubes.len();
    for (id, socket) in irq_control_tubes.iter() {
        wait_ctx
            .add(
                socket.get_read_notifier(),
                IrqHandlerToken::VmIrq { id: *id },
            )
            .context("irq control tubes to wait context")?;
    }

    'wait: loop {
        let events = {
            match wait_ctx.wait() {
                Ok(v) => v,
                Err(e) => {
                    error!("failed to poll: {}", e);
                    break 'wait;
                }
            }
        };
        let token_count = events.len();
        let mut vm_irq_tubes_to_remove = Vec::new();
        let mut notify_control_on_iteration_end = false;

        for event in events.iter().filter(|e| e.is_readable) {
            match event.token {
                IrqHandlerToken::HandlerControl => {
                    match handler_control.recv::<IrqHandlerRequest>() {
                        Ok(request) => {
                            match request {
                                IrqHandlerRequest::Exit => break 'wait,
                                IrqHandlerRequest::AddIrqControlTubes(tubes) => {
                                    for socket in tubes {
                                        let id = next_control_id;
                                        next_control_id += 1;
                                        wait_ctx
                                            .add(
                                                socket.get_read_notifier(),
                                                IrqHandlerToken::VmIrq { id },
                                            )
                                            .context("failed to add new IRQ control Tube to wait context")?;
                                        irq_control_tubes.insert(id, socket);
                                    }
                                }
                                IrqHandlerRequest::RefreshIrqEventTokens => {
                                    // Drop all stale irqchip event registrations,
                                    // then re-query the chip and re-register,
                                    // acking the requester when done.
                                    for (_index, _gsi, evt) in irq_event_tokens.iter() {
                                        wait_ctx.delete(evt).context(
                                            "failed to remove irq chip event \
                                            token from wait context",
                                        )?;
                                    }

                                    irq_event_tokens = irq_chip
                                        .irq_event_tokens()
                                        .context("failed get event tokens from irqchip")?;
                                    for (index, _gsi, evt) in irq_event_tokens.iter() {
                                        wait_ctx
                                            .add(evt, IrqHandlerToken::IrqFd { index: *index })
                                            .context(
                                                "failed to add irq chip event \
                                                tokens to wait context",
                                            )?;
                                    }

                                    if let Err(e) = handler_control
                                        .send(&IrqHandlerResponse::IrqEventTokenRefreshComplete)
                                    {
                                        error!(
                                            "failed to notify IRQ event token refresh \
                                            was completed: {}",
                                            e
                                        );
                                    }
                                }
                                IrqHandlerRequest::WakeAndNotifyIteration => {
                                    // Defer the ack until after this whole batch
                                    // of events has been serviced.
                                    notify_control_on_iteration_end = true;
                                }
                            }
                        }
                        Err(e) => {
                            if let TubeError::Disconnected = e {
                                // Losing the control tube means the main thread is
                                // gone; there is no way to shut down cleanly.
                                panic!("irq handler control tube disconnected.");
                            } else {
                                error!("failed to recv IrqHandlerRequest: {}", e);
                            }
                        }
                    }
                }
                IrqHandlerToken::VmIrq { id } => {
                    if let Some(tube) = irq_control_tubes.get(&id) {
                        handle_irq_tube_request(
                            &sys_allocator_mutex,
                            &mut irq_chip,
                            &mut vm_irq_tubes_to_remove,
                            &wait_ctx,
                            tube,
                            id,
                        );
                    }
                }
                IrqHandlerToken::IrqFd { index } => {
                    if let Err(e) = irq_chip.service_irq_event(index) {
                        error!("failed to signal irq {}: {}", index, e);
                    }
                }
                IrqHandlerToken::DelayedIrqFd => {
                    if let Err(e) = irq_chip.process_delayed_irq_events() {
                        warn!("can't deliver delayed irqs: {}", e);
                    }
                }
            }
        }

        if notify_control_on_iteration_end {
            // NOTE(review): token_count includes the HandlerControl event that
            // carried the wake request itself; the -1 presumably reports only
            // the other serviced events — confirm against the snapshot caller.
            if let Err(e) = handler_control.send(&IrqHandlerResponse::HandlerIterationComplete(
                token_count - 1,
            )) {
                error!(
                    "failed to notify on iteration completion (snapshotting may fail): {}",
                    e
                );
            }
        }

        remove_hungup_and_drained_tubes(
            &events,
            &wait_ctx,
            &mut irq_control_tubes,
            vm_irq_tubes_to_remove,
            |token: &IrqHandlerToken| {
                if let IrqHandlerToken::VmIrq { id } = token {
                    return Some(*id);
                }
                None
            },
        )?;
        // A hangup on the control tube without a preceding Exit request is
        // abnormal; exit the loop rather than spin on a dead tube.
        if events.iter().any(|e| {
            e.is_hungup && !e.is_readable && matches!(e.token, IrqHandlerToken::HandlerControl)
        }) {
            error!("IRQ handler control hung up but did not request an exit.");
            break 'wait;
        }
    }
    Ok(())
}
4456
handle_irq_tube_request( sys_allocator_mutex: &Arc<Mutex<SystemAllocator>>, irq_chip: &mut Box<dyn IrqChipArch + 'static>, vm_irq_tubes_to_remove: &mut Vec<usize>, wait_ctx: &WaitContext<IrqHandlerToken>, tube: &Tube, tube_index: usize, )4457 fn handle_irq_tube_request(
4458 sys_allocator_mutex: &Arc<Mutex<SystemAllocator>>,
4459 irq_chip: &mut Box<dyn IrqChipArch + 'static>,
4460 vm_irq_tubes_to_remove: &mut Vec<usize>,
4461 wait_ctx: &WaitContext<IrqHandlerToken>,
4462 tube: &Tube,
4463 tube_index: usize,
4464 ) {
4465 match tube.recv::<VmIrqRequest>() {
4466 Ok(request) => {
4467 let response = {
4468 request.execute(
4469 |setup| match setup {
4470 IrqSetup::Event(irq, ev, device_id, queue_id, device_name) => {
4471 let irq_evt = devices::IrqEdgeEvent::from_event(ev.try_clone()?);
4472 let source = IrqEventSource {
4473 device_id: device_id.try_into().expect("Invalid device_id"),
4474 queue_id,
4475 device_name,
4476 };
4477 if let Some(event_index) =
4478 irq_chip.register_edge_irq_event(irq, &irq_evt, source)?
4479 {
4480 if let Err(e) =
4481 wait_ctx.add(ev, IrqHandlerToken::IrqFd { index: event_index })
4482 {
4483 warn!("failed to add IrqFd to poll context: {}", e);
4484 return Err(e);
4485 }
4486 }
4487 Ok(())
4488 }
4489 IrqSetup::Route(route) => irq_chip.route_irq(route),
4490 IrqSetup::UnRegister(irq, ev) => {
4491 let irq_evt = devices::IrqEdgeEvent::from_event(ev.try_clone()?);
4492 irq_chip.unregister_edge_irq_event(irq, &irq_evt)
4493 }
4494 },
4495 &mut sys_allocator_mutex.lock(),
4496 )
4497 };
4498 if let Err(e) = tube.send(&response) {
4499 error!("failed to send VmIrqResponse: {}", e);
4500 }
4501 }
4502 Err(e) => {
4503 if let TubeError::Disconnected = e {
4504 vm_irq_tubes_to_remove.push(tube_index);
4505 } else {
4506 error!("failed to recv VmIrqRequest: {}", e);
4507 }
4508 }
4509 }
4510 }
4511
/// Commands to control the VM Memory handler thread.
#[derive(serde::Serialize, serde::Deserialize)]
pub enum VmMemoryHandlerRequest {
    /// Register additional memory control tubes with the handler's wait loop.
    /// No response is sent for this command.
    AddControlTubes(Vec<VmMemoryTube>),
    /// Ask the handler thread to exit its loop and return.
    /// No response is sent for this command.
    Exit,
}
4520
vm_memory_handler_thread( control_tubes: Vec<VmMemoryTube>, mut vm: impl Vm, sys_allocator_mutex: Arc<Mutex<SystemAllocator>>, mut gralloc: RutabagaGralloc, mut iommu_client: Option<VmMemoryRequestIommuClient>, handler_control: Tube, ) -> anyhow::Result<()>4521 fn vm_memory_handler_thread(
4522 control_tubes: Vec<VmMemoryTube>,
4523 mut vm: impl Vm,
4524 sys_allocator_mutex: Arc<Mutex<SystemAllocator>>,
4525 mut gralloc: RutabagaGralloc,
4526 mut iommu_client: Option<VmMemoryRequestIommuClient>,
4527 handler_control: Tube,
4528 ) -> anyhow::Result<()> {
4529 #[derive(EventToken)]
4530 enum Token {
4531 VmControl { id: usize },
4532 HandlerControl,
4533 }
4534
4535 let wait_ctx =
4536 WaitContext::build_with(&[(handler_control.get_read_notifier(), Token::HandlerControl)])
4537 .context("failed to build wait context")?;
4538 let mut control_tubes = BTreeMap::from_iter(control_tubes.into_iter().enumerate());
4539 let mut next_control_id = control_tubes.len();
4540 for (id, socket) in control_tubes.iter() {
4541 wait_ctx
4542 .add(socket.as_ref(), Token::VmControl { id: *id })
4543 .context("failed to add descriptor to wait context")?;
4544 }
4545
4546 let mut region_state: VmMemoryRegionState = Default::default();
4547
4548 'wait: loop {
4549 let events = {
4550 match wait_ctx.wait() {
4551 Ok(v) => v,
4552 Err(e) => {
4553 error!("failed to poll: {}", e);
4554 break;
4555 }
4556 }
4557 };
4558
4559 let mut vm_control_ids_to_remove = Vec::new();
4560 for event in events.iter().filter(|e| e.is_readable) {
4561 match event.token {
4562 Token::HandlerControl => match handler_control.recv::<VmMemoryHandlerRequest>() {
4563 Ok(request) => match request {
4564 VmMemoryHandlerRequest::Exit => break 'wait,
4565 VmMemoryHandlerRequest::AddControlTubes(tubes) => {
4566 for socket in tubes {
4567 let id = next_control_id;
4568 next_control_id += 1;
4569 wait_ctx
4570 .add(socket.get_read_notifier(), Token::VmControl { id })
4571 .context(
4572 "failed to add new vm memory control Tube to wait context",
4573 )?;
4574 control_tubes.insert(id, socket);
4575 }
4576 }
4577 },
4578 Err(e) => {
4579 if let TubeError::Disconnected = e {
4580 panic!("vm memory control tube disconnected.");
4581 } else {
4582 error!("failed to recv VmMemoryHandlerRequest: {}", e);
4583 }
4584 }
4585 },
4586 Token::VmControl { id } => {
4587 if let Some(VmMemoryTube {
4588 tube,
4589 expose_with_viommu,
4590 }) = control_tubes.get(&id)
4591 {
4592 match tube.recv::<VmMemoryRequest>() {
4593 Ok(request) => {
4594 let response = request.execute(
4595 tube,
4596 &mut vm,
4597 &mut sys_allocator_mutex.lock(),
4598 &mut gralloc,
4599 if *expose_with_viommu {
4600 iommu_client.as_mut()
4601 } else {
4602 None
4603 },
4604 &mut region_state,
4605 );
4606 if let Err(e) = tube.send(&response) {
4607 error!("failed to send VmMemoryControlResponse: {}", e);
4608 }
4609 }
4610 Err(e) => {
4611 if let TubeError::Disconnected = e {
4612 vm_control_ids_to_remove.push(id);
4613 } else {
4614 error!("failed to recv VmMemoryControlRequest: {}", e);
4615 }
4616 }
4617 }
4618 }
4619 }
4620 }
4621 }
4622
4623 remove_hungup_and_drained_tubes(
4624 &events,
4625 &wait_ctx,
4626 &mut control_tubes,
4627 vm_control_ids_to_remove,
4628 |token: &Token| {
4629 if let Token::VmControl { id } = token {
4630 return Some(*id);
4631 }
4632 None
4633 },
4634 )?;
4635 if events
4636 .iter()
4637 .any(|e| e.is_hungup && !e.is_readable && matches!(e.token, Token::HandlerControl))
4638 {
4639 error!("vm memory handler control hung up but did not request an exit.");
4640 break 'wait;
4641 }
4642 }
4643 Ok(())
4644 }
4645
4646 /// When control tubes hang up, we want to make sure that we've fully drained
4647 /// the underlying socket before removing it. This function also handles
4648 /// removing closed sockets in such a way that avoids phantom events.
4649 ///
4650 /// `tube_ids_to_remove` is the set of ids that we already know should
4651 /// be removed (e.g. from getting a disconnect error on read).
remove_hungup_and_drained_tubes<T, U>( events: &SmallVec<[TriggeredEvent<T>; 16]>, wait_ctx: &WaitContext<T>, tubes: &mut BTreeMap<usize, U>, mut tube_ids_to_remove: Vec<usize>, get_tube_id: fn(token: &T) -> Option<usize>, ) -> anyhow::Result<()> where T: EventToken, U: ReadNotifier,4652 fn remove_hungup_and_drained_tubes<T, U>(
4653 events: &SmallVec<[TriggeredEvent<T>; 16]>,
4654 wait_ctx: &WaitContext<T>,
4655 tubes: &mut BTreeMap<usize, U>,
4656 mut tube_ids_to_remove: Vec<usize>,
4657 get_tube_id: fn(token: &T) -> Option<usize>,
4658 ) -> anyhow::Result<()>
4659 where
4660 T: EventToken,
4661 U: ReadNotifier,
4662 {
4663 // It's possible more data is readable and buffered while the socket is hungup,
4664 // so don't delete the tube from the poll context until we're sure all the
4665 // data is read.
4666 // Below case covers a condition where we have received a hungup event and the tube is not
4667 // readable.
4668 // In case of readable tube, once all data is read, any attempt to read more data on hungup
4669 // tube should fail. On such failure, we get Disconnected error and ids gets added to
4670 // tube_ids_to_remove by the time we reach here.
4671 for event in events.iter().filter(|e| e.is_hungup && !e.is_readable) {
4672 if let Some(id) = get_tube_id(&event.token) {
4673 tube_ids_to_remove.push(id);
4674 }
4675 }
4676
4677 tube_ids_to_remove.dedup();
4678 for id in tube_ids_to_remove {
4679 // Delete the socket from the `wait_ctx` synchronously. Otherwise, the kernel will do
4680 // this automatically when the FD inserted into the `wait_ctx` is closed after this
4681 // if-block, but this removal can be deferred unpredictably. In some instances where the
4682 // system is under heavy load, we can even get events returned by `wait_ctx` for an FD
4683 // that has already been closed. Because the token associated with that spurious event
4684 // now belongs to a different socket, the control loop will start to interact with
4685 // sockets that might not be ready to use. This can cause incorrect hangup detection or
4686 // blocking on a socket that will never be ready. See also: crbug.com/1019986
4687 if let Some(socket) = tubes.remove(&id) {
4688 wait_ctx
4689 .delete(socket.get_read_notifier())
4690 .context("failed to remove descriptor from wait context")?;
4691 }
4692 }
4693 Ok(())
4694 }
4695
4696 /// Start and jail a vhost-user device according to its configuration and a vhost listener string.
4697 ///
4698 /// The jailing business is nasty and potentially unsafe if done from the wrong context - do not
4699 /// call outside of `start_devices`!
4700 ///
4701 /// Returns the pid of the jailed device process.
jail_and_start_vu_device<T: VirtioDeviceBuilder>( jail_config: &Option<JailConfig>, params: T, vhost: &str, name: &str, ) -> anyhow::Result<(libc::pid_t, Option<Box<dyn std::any::Any>>)>4702 fn jail_and_start_vu_device<T: VirtioDeviceBuilder>(
4703 jail_config: &Option<JailConfig>,
4704 params: T,
4705 vhost: &str,
4706 name: &str,
4707 ) -> anyhow::Result<(libc::pid_t, Option<Box<dyn std::any::Any>>)> {
4708 let mut keep_rds = Vec::new();
4709
4710 base::syslog::push_descriptors(&mut keep_rds);
4711 cros_tracing::push_descriptors!(&mut keep_rds);
4712 metrics::push_descriptors(&mut keep_rds);
4713
4714 let jail_type = VirtioDeviceType::VhostUser;
4715
4716 // Create a jail from the configuration. If the configuration is `None`, `create_jail` will also
4717 // return `None` so fall back to an empty (i.e. non-constrained) Minijail.
4718 let jail = params
4719 .create_jail(jail_config, jail_type)
4720 .with_context(|| format!("failed to create jail for {}", name))?
4721 .ok_or(())
4722 .or_else(|_| Minijail::new())
4723 .with_context(|| format!("failed to create empty jail for {}", name))?;
4724
4725 // Create the device in the parent process, so the child does not need any privileges necessary
4726 // to do it (only runtime capabilities are required).
4727 let device = params
4728 .create_vhost_user_device(&mut keep_rds)
4729 .context("failed to create vhost-user device")?;
4730 let mut listener =
4731 VhostUserListener::new(vhost).context("failed to create the vhost listener")?;
4732 keep_rds.push(listener.as_raw_descriptor());
4733 let parent_resources = listener.take_parent_process_resources();
4734
4735 // Executor must be created before jail in order to prevent the jailed process from creating
4736 // unrestricted io_urings.
4737 let ex = Executor::new().context("Failed to create an Executor")?;
4738 keep_rds.extend(ex.as_raw_descriptors());
4739
4740 // Deduplicate the FDs since minijail expects them to be unique.
4741 keep_rds.sort_unstable();
4742 keep_rds.dedup();
4743
4744 // SAFETY:
4745 // Safe because we are keeping all the descriptors needed for the child to function.
4746 match unsafe { jail.fork(Some(&keep_rds)).context("error while forking")? } {
4747 0 => {
4748 // In the child process.
4749
4750 // Free memory for the resources managed by the parent, without running drop() on them.
4751 // The parent will do it as we exit.
4752 let _ = std::mem::ManuallyDrop::new(parent_resources);
4753
4754 // Make sure the child process does not survive its parent.
4755 // SAFETY: trivially safe
4756 if unsafe { libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) } < 0 {
4757 panic!("call to prctl(PR_SET_DEATHSIG, SIGKILL) failed. Aborting child process.");
4758 }
4759
4760 // Set the name for the thread.
4761 const MAX_LEN: usize = 15; // pthread_setname_np() limit on Linux
4762 let debug_label_trimmed = &name.as_bytes()[..std::cmp::min(MAX_LEN, name.len())];
4763 let thread_name = CString::new(debug_label_trimmed).unwrap();
4764 // SAFETY:
4765 // Safe because we trimmed the name to 15 characters (and pthread_setname_np will return
4766 // an error if we don't anyway).
4767 let _ = unsafe { libc::pthread_setname_np(libc::pthread_self(), thread_name.as_ptr()) };
4768
4769 // Run the device loop and terminate the child process once it exits.
4770 let res = match listener.run_device(ex, device) {
4771 Ok(()) => 0,
4772 Err(e) => {
4773 error!("error while running device {}: {:#}", name, e);
4774 1
4775 }
4776 };
4777 // SAFETY: trivially safe
4778 unsafe { libc::exit(res) };
4779 }
4780 pid => {
4781 // In the parent process. We will drop the device and listener when exiting this method.
4782 // This is fine as ownership for both has been transferred to the child process and they
4783 // will keep living there. We just retain `parent_resources` for things we are supposed
4784 // to clean up ourselves.
4785
4786 info!("process for device {} (PID {}) started", &name, pid);
4787 #[cfg(feature = "seccomp_trace")]
4788 debug!(
4789 "seccomp_trace {{\"event\": \"minijail_fork\", \"pid\": {}, \"name\": \"{}\", \"jail_addr\": \"0x{:x}\"}}",
4790 pid,
4791 &name,
4792 read_jail_addr(&jail)
4793 );
4794 Ok((pid, parent_resources))
4795 }
4796 }
4797 }
4798
process_vhost_user_control_request(tube: Tube, disk_host_tubes: &[Tube]) -> Result<()>4799 fn process_vhost_user_control_request(tube: Tube, disk_host_tubes: &[Tube]) -> Result<()> {
4800 let command = tube
4801 .recv::<VmRequest>()
4802 .context("failed to receive VmRequest")?;
4803 let resp = match command {
4804 VmRequest::DiskCommand {
4805 disk_index,
4806 ref command,
4807 } => match &disk_host_tubes.get(disk_index) {
4808 Some(tube) => handle_disk_command(command, tube),
4809 None => VmResponse::Err(base::Error::new(libc::ENODEV)),
4810 },
4811 request => {
4812 error!(
4813 "Request {:?} currently not supported in vhost user backend",
4814 request
4815 );
4816 VmResponse::Err(base::Error::new(libc::EPERM))
4817 }
4818 };
4819
4820 tube.send(&resp).context("failed to send VmResponse")?;
4821 Ok(())
4822 }
4823
start_vhost_user_control_server( control_server_socket: UnlinkUnixSeqpacketListener, disk_host_tubes: Vec<Tube>, )4824 fn start_vhost_user_control_server(
4825 control_server_socket: UnlinkUnixSeqpacketListener,
4826 disk_host_tubes: Vec<Tube>,
4827 ) {
4828 info!("Start vhost-user control server");
4829 loop {
4830 match control_server_socket.accept() {
4831 Ok(socket) => {
4832 let tube = match Tube::new_from_unix_seqpacket(socket) {
4833 Ok(tube) => tube,
4834 Err(e) => {
4835 error!("failed to open tube: {:#}", e);
4836 return;
4837 }
4838 };
4839 if let Err(e) = process_vhost_user_control_request(tube, &disk_host_tubes) {
4840 error!("failed to process control request: {:#}", e);
4841 }
4842 }
4843 Err(e) => {
4844 error!("failed to establish connection: {}", e);
4845 }
4846 }
4847 }
4848 }
4849
start_devices(opts: DevicesCommand) -> anyhow::Result<()>4850 pub fn start_devices(opts: DevicesCommand) -> anyhow::Result<()> {
4851 if let Some(async_executor) = opts.async_executor {
4852 Executor::set_default_executor_kind(async_executor)
4853 .context("Failed to set the default async executor")?;
4854 }
4855
4856 struct DeviceJailInfo {
4857 // Unique name for the device, in the form `foomatic-0`.
4858 name: String,
4859 _drop_resources: Option<Box<dyn std::any::Any>>,
4860 }
4861
4862 fn add_device<T: VirtioDeviceBuilder>(
4863 i: usize,
4864 device_params: T,
4865 vhost: &str,
4866 jail_config: &Option<JailConfig>,
4867 devices_jails: &mut BTreeMap<libc::pid_t, DeviceJailInfo>,
4868 ) -> anyhow::Result<()> {
4869 let name = format!("{}-{}", T::NAME, i);
4870
4871 let (pid, _drop_resources) =
4872 jail_and_start_vu_device::<T>(jail_config, device_params, vhost, &name)?;
4873
4874 devices_jails.insert(
4875 pid,
4876 DeviceJailInfo {
4877 name,
4878 _drop_resources,
4879 },
4880 );
4881
4882 Ok(())
4883 }
4884
4885 let mut devices_jails: BTreeMap<libc::pid_t, DeviceJailInfo> = BTreeMap::new();
4886
4887 let jail = if opts.disable_sandbox {
4888 None
4889 } else {
4890 Some(opts.jail)
4891 };
4892
4893 // Create control server socket
4894 let control_server_socket = opts.control_socket.map(|path| {
4895 UnlinkUnixSeqpacketListener(
4896 UnixSeqpacketListener::bind(path).expect("Could not bind socket"),
4897 )
4898 });
4899
4900 // Create serial devices.
4901 for (i, params) in opts.serial.iter().enumerate() {
4902 let serial_config = ¶ms.device;
4903 add_device(i, serial_config, ¶ms.vhost, &jail, &mut devices_jails)?;
4904 }
4905
4906 let mut disk_host_tubes = Vec::new();
4907 let control_socket_exists = control_server_socket.is_some();
4908 // Create block devices.
4909 for (i, params) in opts.block.iter().enumerate() {
4910 let tube = if control_socket_exists {
4911 let (host_tube, device_tube) = Tube::pair().context("failed to create tube")?;
4912 disk_host_tubes.push(host_tube);
4913 Some(device_tube)
4914 } else {
4915 None
4916 };
4917 let disk_config = DiskConfig::new(¶ms.device, tube);
4918 add_device(i, disk_config, ¶ms.vhost, &jail, &mut devices_jails)?;
4919 }
4920
4921 // Create vsock devices.
4922 for (i, params) in opts.vsock.iter().enumerate() {
4923 add_device(i, ¶ms.device, ¶ms.vhost, &jail, &mut devices_jails)?;
4924 }
4925
4926 // Create network devices.
4927 #[cfg(feature = "net")]
4928 for (i, params) in opts.net.iter().enumerate() {
4929 add_device(i, ¶ms.device, ¶ms.vhost, &jail, &mut devices_jails)?;
4930 }
4931
4932 // No device created, that's probably not intended - print the help in that case.
4933 if devices_jails.is_empty() {
4934 let err = DevicesCommand::from_args(
4935 &[&std::env::args().next().unwrap_or(String::from("crosvm"))],
4936 &["--help"],
4937 )
4938 .unwrap_err();
4939 println!("{}", err.output);
4940 return Ok(());
4941 }
4942
4943 let ex = Executor::new()?;
4944 if let Some(control_server_socket) = control_server_socket {
4945 // Start the control server in the parent process.
4946 ex.spawn_blocking(move || {
4947 start_vhost_user_control_server(control_server_socket, disk_host_tubes)
4948 })
4949 .detach();
4950 }
4951
4952 // Now wait for all device processes to return.
4953 while !devices_jails.is_empty() {
4954 match base::linux::wait_for_pid(-1, 0) {
4955 Err(e) => panic!("error waiting for child process to complete: {:#}", e),
4956 Ok((Some(pid), wait_status)) => match devices_jails.remove_entry(&pid) {
4957 Some((_, info)) => {
4958 if let Some(status) = wait_status.code() {
4959 info!(
4960 "process for device {} (PID {}) exited with code {}",
4961 &info.name, pid, status
4962 );
4963 } else if let Some(signal) = wait_status.signal() {
4964 warn!(
4965 "process for device {} (PID {}) has been killed by signal {:?}",
4966 &info.name, pid, signal,
4967 );
4968 }
4969 }
4970 None => error!("pid {} is not one of our device processes", pid),
4971 },
4972 // `wait_for_pid` will necessarily return a PID because we asked to it wait for one to
4973 // complete.
4974 Ok((None, _)) => unreachable!(),
4975 }
4976 }
4977
4978 info!("all device processes have exited");
4979
4980 Ok(())
4981 }
4982
/// Setup crash reporting for a process. Each process MUST provide a unique `product_type` to avoid
/// making crash reports incomprehensible.
#[cfg(feature = "crash-report")]
pub fn setup_emulator_crash_reporting(_cfg: &Config) -> anyhow::Result<String> {
    // Only the product type is set for the emulator process; all other attributes are
    // intentionally left empty.
    let attributes = crash_report::CrashReportAttributes {
        product_type: "emulator".to_owned(),
        pipe_name: None,
        report_uuid: None,
        product_name: None,
        product_version: None,
    };
    crash_report::setup_crash_reporting(attributes)
}
4995
//　Unit tests for `punch_holes_in_guest_mem_layout_for_mappings`, which removes file-backed
//　mapping ranges from the proposed guest memory layout.
#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use super::*;

    // Create a file-backed mapping parameters struct with the given `address` and `size` and other
    // parameters set to default values.
    fn test_file_backed_mapping(address: u64, size: u64) -> FileBackedMappingParameters {
        FileBackedMappingParameters {
            address,
            size,
            path: PathBuf::new(),
            offset: 0,
            writable: false,
            sync: false,
            align: false,
        }
    }

    #[test]
    fn guest_mem_file_backed_mappings_overlap() {
        // Each layout entry below is a (guest start address, region size, memory options) tuple.
        // The two-region fixture models a low region [0, 0xD000_0000) and a high region
        // [0x1_0000_0000, 0x1_0008_0000).

        // Base case: no file mappings; output layout should be identical.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[]
            ),
            vec![
                (GuestAddress(0), 0xD000_0000, Default::default()),
                (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
            ]
        );

        // File mapping that does not overlap guest memory.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0xD000_0000, 0x1000)]
            ),
            vec![
                (GuestAddress(0), 0xD000_0000, Default::default()),
                (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
            ]
        );

        // File mapping at the start of the low address space region.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0, 0x2000)]
            ),
            vec![
                (
                    GuestAddress(0x2000),
                    0xD000_0000 - 0x2000,
                    Default::default()
                ),
                (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
            ]
        );

        // File mapping at the end of the low address space region.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0xD000_0000 - 0x2000, 0x2000)]
            ),
            vec![
                (GuestAddress(0), 0xD000_0000 - 0x2000, Default::default()),
                (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
            ]
        );

        // File mapping fully contained within the middle of the low address space region.
        // The region is split into two pieces around the hole.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0x1000, 0x2000)]
            ),
            vec![
                (GuestAddress(0), 0x1000, Default::default()),
                (
                    GuestAddress(0x3000),
                    0xD000_0000 - 0x3000,
                    Default::default()
                ),
                (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
            ]
        );

        // File mapping at the start of the high address space region.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0x1_0000_0000, 0x2000)]
            ),
            vec![
                (GuestAddress(0), 0xD000_0000, Default::default()),
                (
                    GuestAddress(0x1_0000_2000),
                    0x8_0000 - 0x2000,
                    Default::default()
                ),
            ]
        );

        // File mapping at the end of the high address space region.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0x1_0008_0000 - 0x2000, 0x2000)]
            ),
            vec![
                (GuestAddress(0), 0xD000_0000, Default::default()),
                (
                    GuestAddress(0x1_0000_0000),
                    0x8_0000 - 0x2000,
                    Default::default()
                ),
            ]
        );

        // File mapping fully contained within the middle of the high address space region.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0x1_0000_1000, 0x2000)]
            ),
            vec![
                (GuestAddress(0), 0xD000_0000, Default::default()),
                (GuestAddress(0x1_0000_0000), 0x1000, Default::default()),
                (
                    GuestAddress(0x1_0000_3000),
                    0x8_0000 - 0x3000,
                    Default::default()
                ),
            ]
        );

        // File mapping overlapping two guest memory regions.
        // The hole truncates the end of the low region and the start of the high region.
        assert_eq!(
            punch_holes_in_guest_mem_layout_for_mappings(
                vec![
                    (GuestAddress(0), 0xD000_0000, Default::default()),
                    (GuestAddress(0x1_0000_0000), 0x8_0000, Default::default()),
                ],
                &[test_file_backed_mapping(0xA000_0000, 0x60002000)]
            ),
            vec![
                (GuestAddress(0), 0xA000_0000, Default::default()),
                (
                    GuestAddress(0x1_0000_2000),
                    0x8_0000 - 0x2000,
                    Default::default()
                ),
            ]
        );
    }
}
5180