xref: /aosp_15_r20/external/crosvm/src/crosvm/plugin/mod.rs (revision bb4ee6a4ae7042d18b07a98463b9c8b875e44b39)
// Copyright 2018 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

mod config;
mod process;
mod vcpu;

use std::fmt::Write as FmtWrite;
use std::fs::File;
use std::io;
use std::io::Read;
use std::io::Write;
use std::os::unix::net::UnixDatagram;
use std::path::Path;
use std::sync::atomic::AtomicBool;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::sync::Barrier;
use std::thread;
use std::time::Duration;
use std::time::Instant;

use anyhow::anyhow;
use anyhow::bail;
use anyhow::Context;
use anyhow::Result;
use base::add_fd_flags;
use base::block_signal;
use base::clear_signal;
use base::drop_capabilities;
use base::enable_core_scheduling;
use base::error;
use base::getegid;
use base::geteuid;
use base::info;
use base::pipe;
use base::register_rt_signal_handler;
use base::warn;
use base::AsRawDescriptor;
use base::Error as SysError;
use base::Event;
use base::EventToken;
use base::FromRawDescriptor;
use base::Killable;
use base::MmapError;
use base::RawDescriptor;
use base::Result as SysResult;
use base::SignalFd;
use base::WaitContext;
use base::SIGRTMIN;
use jail::create_sandbox_minijail;
use jail::mount_proc;
use jail::SandboxConfig;
use kvm::Cap;
use kvm::Datamatch;
use kvm::IoeventAddress;
use kvm::Kvm;
use kvm::Vcpu;
use kvm::VcpuExit;
use kvm::Vm;
use libc::c_int;
use libc::c_ulong;
use libc::fcntl;
use libc::ioctl;
use libc::socketpair;
use libc::AF_UNIX;
use libc::EAGAIN;
use libc::EBADF;
use libc::EDEADLK;
use libc::EEXIST;
use libc::EINTR;
use libc::EINVAL;
use libc::ENOENT;
use libc::EOVERFLOW;
use libc::EPERM;
use libc::FIOCLEX;
use libc::F_SETPIPE_SZ;
use libc::O_NONBLOCK;
use libc::SIGCHLD;
use libc::SOCK_SEQPACKET;
use net_util::sys::linux::Tap;
use remain::sorted;
use thiserror::Error;
use vm_memory::GuestMemory;
use vm_memory::MemoryPolicy;

use self::process::*;
use self::vcpu::*;
use crate::crosvm::config::Executable;
use crate::crosvm::config::HypervisorKind;
pub use crate::crosvm::plugin::config::parse_plugin_mount_option;
pub use crate::crosvm::plugin::config::BindMount;
pub use crate::crosvm::plugin::config::GidMap;
use crate::Config;

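// Upper bounds on the plugin request/response datagrams and the per-vcpu pipes, plus the
// open-file limit passed to the plugin sandbox when it is created.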
const MAX_DATAGRAM_SIZE: usize = 4096;
const MAX_VCPU_DATAGRAM_SIZE: usize = 0x40000;
const MAX_OPEN_FILES: u64 = 32768;

/// An error that occurs when communicating with the plugin process.
#[sorted]
#[derive(Error, Debug)]
pub enum CommError {
    #[error("failed to decode plugin request: {0}")]
    DecodeRequest(protobuf::Error),
    #[error("failed to encode plugin response: {0}")]
    EncodeResponse(protobuf::Error),
    #[error("plugin request socket has been hung up")]
    PluginSocketHup,
    #[error("failed to recv from plugin request socket: {0}")]
    PluginSocketRecv(SysError),
    #[error("failed to send to plugin request socket: {0}")]
    PluginSocketSend(SysError),
}

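/// Creates a pair of connected `SOCK_SEQPACKET` sockets, marking the first descriptor
/// close-on-exec.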
fn new_seqpacket_pair() -> SysResult<(UnixDatagram, UnixDatagram)> {
    let mut fds = [0, 0];
    // SAFETY: trivially safe as we check the return value
    unsafe {
        let ret = socketpair(AF_UNIX, SOCK_SEQPACKET, 0, fds.as_mut_ptr());
        if ret == 0 {
            ioctl(fds[0], FIOCLEX);
            Ok((
                UnixDatagram::from_raw_descriptor(fds[0]),
                UnixDatagram::from_raw_descriptor(fds[1]),
            ))
        } else {
            Err(SysError::last())
        }
    }
}

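// A minimal sketch of the pair's behavior (an illustrative test, assuming the standard
// `UnixDatagram` send/recv semantics): a datagram sent on one end arrives intact on the other,
// with packet boundaries preserved.
#[cfg(test)]
mod seqpacket_tests {
    use super::*;

    #[test]
    fn seqpacket_pair_round_trip() {
        let (crosvm_end, plugin_end) = new_seqpacket_pair().expect("failed to create pair");
        crosvm_end.send(b"hello").expect("send failed");
        let mut buf = [0u8; MAX_DATAGRAM_SIZE];
        let len = plugin_end.recv(&mut buf).expect("recv failed");
        assert_eq!(&buf[..len], b"hello");
    }
}

/// Both ends of the two pipes used for vcpu communication: crosvm reads from `crosvm_read`
/// whatever the plugin writes to `plugin_write`, and the plugin reads from `plugin_read`
/// whatever crosvm writes to `crosvm_write`.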
struct VcpuPipe {
    crosvm_read: File,
    plugin_write: File,
    plugin_read: File,
    crosvm_write: File,
}

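/// Creates the two vcpu pipes and attempts to grow each to `MAX_VCPU_DATAGRAM_SIZE`; a failure
/// to resize is logged but not fatal.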
fn new_pipe_pair() -> SysResult<VcpuPipe> {
    let to_crosvm = pipe()?;
    let to_plugin = pipe()?;
    // Increasing the pipe size is a nice-to-have that helps messages get across atomically
    // (and keeps writes from blocking), though it's not necessarily a hard requirement for
    // things to work.
    // SAFETY: safe because no memory is modified and we check return value.
    let flags = unsafe {
        fcntl(
            to_crosvm.0.as_raw_descriptor(),
            F_SETPIPE_SZ,
            MAX_VCPU_DATAGRAM_SIZE as c_int,
        )
    };
    if flags < 0 || flags != MAX_VCPU_DATAGRAM_SIZE as i32 {
        warn!(
            "Failed to adjust size of crosvm pipe (result {}): {}",
            flags,
            SysError::last()
        );
    }
    // SAFETY: safe because no memory is modified and we check return value.
    let flags = unsafe {
        fcntl(
            to_plugin.0.as_raw_descriptor(),
            F_SETPIPE_SZ,
            MAX_VCPU_DATAGRAM_SIZE as c_int,
        )
    };
    if flags < 0 || flags != MAX_VCPU_DATAGRAM_SIZE as i32 {
        warn!(
            "Failed to adjust size of plugin pipe (result {}): {}",
            flags,
            SysError::last()
        );
    }
    Ok(VcpuPipe {
        crosvm_read: to_crosvm.0,
        plugin_write: to_crosvm.1,
        plugin_read: to_plugin.0,
        crosvm_write: to_plugin.1,
    })
}

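// A minimal sketch of the pipe directions (an illustrative test, assuming `pipe()` returns a
// (read, write) pair as the construction above relies on): data written on the crosvm side is
// read on the plugin side.
#[cfg(test)]
mod pipe_tests {
    use std::io::Read;
    use std::io::Write;

    use super::*;

    #[test]
    fn pipe_pair_direction() {
        let mut pipes = new_pipe_pair().expect("failed to create pipe pair");
        pipes.crosvm_write.write_all(b"ping").expect("write failed");
        let mut buf = [0u8; 4];
        pipes.plugin_read.read_exact(&mut buf).expect("read failed");
        assert_eq!(&buf, b"ping");
    }
}

// Helpers that flatten protobuf, io, and mmap errors into errno-style `SysError` values.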
fn proto_to_sys_err(e: protobuf::Error) -> SysError {
    io_to_sys_err(io::Error::from(e))
}

fn io_to_sys_err(e: io::Error) -> SysError {
    SysError::new(e.raw_os_error().unwrap_or(EINVAL))
}

fn mmap_to_sys_err(e: MmapError) -> SysError {
    match e {
        MmapError::SystemCallFailed(e) => e,
        _ => SysError::new(EINVAL),
    }
}

/// Each `PluginObject` represents one object that was instantiated by the guest using the `Create`
/// request.
///
/// Each such object has an ID associated with it that exists in an ID space shared by every variant
/// of `PluginObject`. This allows all the objects to be indexed in a single map, and allows for a
/// common destroy method.
///
/// In addition to the destroy method, each object may have methods specific to its variant type.
/// These variant methods are invoked by matching the variant to the expected type for that method.
/// For example, getting the dirty log from a `Memory` object starting with an ID:
///
/// ```ignore
/// match objects.get(&request_id) {
///    Some(&PluginObject::Memory { slot, length }) => vm.get_dirty_log(slot, &mut dirty_log[..]),
///    _ => return Err(SysError::new(ENOENT)),
/// }
/// ```
enum PluginObject {
    IoEvent {
        evt: Event,
        addr: IoeventAddress,
        length: u32,
        datamatch: u64,
    },
    Memory {
        slot: u32,
        length: usize,
    },
    IrqEvent {
        irq_id: u32,
        evt: Event,
    },
}

impl PluginObject {
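    /// Destroys this object, unregistering it from `vm` according to its variant: ioevents are
    /// unregistered with a `Datamatch` reconstructed from their recorded length, memory regions
    /// are removed by slot, and irqfds are unregistered by irq id.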
    fn destroy(self, vm: &mut Vm) -> SysResult<()> {
        match self {
            PluginObject::IoEvent {
                evt,
                addr,
                length,
                datamatch,
            } => match length {
                0 => vm.unregister_ioevent(&evt, addr, Datamatch::AnyLength),
                1 => vm.unregister_ioevent(&evt, addr, Datamatch::U8(Some(datamatch as u8))),
                2 => vm.unregister_ioevent(&evt, addr, Datamatch::U16(Some(datamatch as u16))),
                4 => vm.unregister_ioevent(&evt, addr, Datamatch::U32(Some(datamatch as u32))),
                8 => vm.unregister_ioevent(&evt, addr, Datamatch::U64(Some(datamatch))),
                _ => Err(SysError::new(EINVAL)),
            },
            PluginObject::Memory { slot, .. } => vm.remove_memory_region(slot).and(Ok(())),
            PluginObject::IrqEvent { irq_id, evt } => vm.unregister_irqfd(&evt, irq_id),
        }
    }
}

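/// Spawns one thread per vcpu. Each thread creates its vcpu, runs the plugin's per-vcpu init,
/// waits on a shared barrier, then loops on `Vcpu::run`, forwarding io/mmio/hypercall exits to
/// the plugin until `kill_signaled` is set or an error occurs. `exit_evt` is signaled when a
/// vcpu thread terminates.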
pub fn run_vcpus(
    kvm: &Kvm,
    vm: &Vm,
    plugin: &Process,
    vcpu_count: u32,
    kill_signaled: &Arc<AtomicBool>,
    exit_evt: &Event,
    vcpu_handles: &mut Vec<thread::JoinHandle<()>>,
    vcpu_cgroup_tasks_file: Option<File>,
) -> Result<()> {
    let vcpu_thread_barrier = Arc::new(Barrier::new((vcpu_count) as usize));
    let use_kvm_signals = !kvm.check_extension(Cap::ImmediateExit);

    // If we need to force a vcpu to exit from a VM then a SIGRTMIN signal is sent
    // to that vcpu's thread.  If KVM is running the VM then it'll return -EINTR.
    // An issue is what to do when KVM isn't running the VM (where we could be
    // in the kernel or in the app).
    //
    // If KVM supports "immediate exit" then we set a signal handler that will
    // set the |immediate_exit| flag that tells KVM to return -EINTR before running
    // the VM.
    //
    // If KVM doesn't support immediate exit then we'll block SIGRTMIN in the app
    // and tell KVM to unblock SIGRTMIN before running the VM (at which point a blocked
    // signal might get asserted).  There's overhead to have KVM unblock and re-block
    // SIGRTMIN each time it runs the VM, so this mode should be avoided.

    if use_kvm_signals {
        // SAFETY:
        // Our signal handler does nothing and is trivially async signal safe.
        unsafe {
            extern "C" fn handle_signal(_: c_int) {}
            // We need to install this signal handler even though we do block
            // the signal below, to ensure that this signal will interrupt
            // execution of KVM_RUN (this is an implementation issue).
            register_rt_signal_handler(SIGRTMIN() + 0, handle_signal)
                .expect("failed to register vcpu signal handler");
        }
        // We do not really want the signal handler to run...
        block_signal(SIGRTMIN() + 0).expect("failed to block signal");
    } else {
        // SAFETY: trivially safe as we check return value.
        unsafe {
            extern "C" fn handle_signal(_: c_int) {
                Vcpu::set_local_immediate_exit(true);
            }
            register_rt_signal_handler(SIGRTMIN() + 0, handle_signal)
                .expect("failed to register vcpu signal handler");
        }
    }

    for cpu_id in 0..vcpu_count {
        let kill_signaled = kill_signaled.clone();
        let vcpu_thread_barrier = vcpu_thread_barrier.clone();
        let vcpu_exit_evt = exit_evt.try_clone().context("failed to clone event")?;
        let vcpu_plugin = plugin.create_vcpu(cpu_id)?;
        let vcpu = Vcpu::new(cpu_id as c_ulong, kvm, vm).context("error creating vcpu")?;
        let vcpu_cgroup_tasks_file = vcpu_cgroup_tasks_file
            .as_ref()
            .map(|f| f.try_clone().unwrap());

        vcpu_handles.push(
            thread::Builder::new()
                .name(format!("crosvm_vcpu{}", cpu_id))
                .spawn(move || {
                    if use_kvm_signals {
                        // Tell KVM to not block anything when entering kvm run
                        // because we will be using first RT signal to kick the VCPU.
                        vcpu.set_signal_mask(&[])
                            .expect("failed to set up KVM VCPU signal mask");
                    }
                    // Move vcpu thread to cgroup
                    if let Some(mut f) = vcpu_cgroup_tasks_file {
                        f.write_all(base::gettid().to_string().as_bytes()).unwrap();
                    }

                    if let Err(e) = enable_core_scheduling() {
                        error!("Failed to enable core scheduling: {}", e);
                    }

                    let vcpu = vcpu
                        .to_runnable(Some(SIGRTMIN() + 0))
                        .expect("Failed to set thread id");

                    let res = vcpu_plugin.init(&vcpu);
                    vcpu_thread_barrier.wait();
                    if let Err(e) = res {
                        error!("failed to initialize vcpu {}: {}", cpu_id, e);
                    } else {
                        loop {
                            let mut interrupted_by_signal = false;
                            let run_res = vcpu.run();
                            match run_res {
                                Ok(run) => match run {
                                    VcpuExit::IoIn { port, mut size } => {
                                        let mut data = [0; 256];
                                        if size > data.len() {
                                            error!(
                                                "unsupported IoIn size of {} bytes at port {:#x}",
                                                size, port
                                            );
                                            size = data.len();
                                        }
                                        vcpu_plugin.io_read(port as u64, &mut data[..size], &vcpu);
                                        if let Err(e) = vcpu.set_data(&data[..size]) {
                                            error!(
                                                "failed to set return data for IoIn at port {:#x}: {}",
                                                port, e
                                            );
                                        }
                                    }
                                    VcpuExit::IoOut {
                                        port,
                                        mut size,
                                        data,
                                    } => {
                                        if size > data.len() {
                                            error!("unsupported IoOut size of {} bytes at port {:#x}", size, port);
                                            size = data.len();
                                        }
                                        vcpu_plugin.io_write(port as u64, &data[..size], &vcpu);
                                    }
                                    VcpuExit::MmioRead { address, size } => {
                                        let mut data = [0; 8];
                                        vcpu_plugin.mmio_read(
                                            address,
                                            &mut data[..size],
                                            &vcpu,
                                        );
                                        // Setting data for mmio cannot fail.
                                        let _ = vcpu.set_data(&data[..size]);
                                    }
                                    VcpuExit::MmioWrite {
                                        address,
                                        size,
                                        data,
                                    } => {
                                        vcpu_plugin.mmio_write(
                                            address,
                                            &data[..size],
                                            &vcpu,
                                        );
                                    }
                                    VcpuExit::HypervHcall { input, params } => {
                                        let mut data = [0; 8];
                                        vcpu_plugin.hyperv_call(input, params, &mut data, &vcpu);
                                        // Setting data for hyperv call cannot fail.
                                        let _ = vcpu.set_data(&data);
                                    }
                                    VcpuExit::HypervSynic {
                                        msr,
                                        control,
                                        evt_page,
                                        msg_page,
                                    } => {
                                        vcpu_plugin
                                            .hyperv_synic(msr, control, evt_page, msg_page, &vcpu);
                                    }
                                    VcpuExit::Hlt => break,
                                    VcpuExit::Shutdown => break,
                                    VcpuExit::InternalError => {
                                        error!("vcpu {} has internal error", cpu_id);
                                        break;
                                    }
                                    r => warn!("unexpected vcpu exit: {:?}", r),
                                },
                                Err(e) => match e.errno() {
                                    EINTR => interrupted_by_signal = true,
                                    EAGAIN => {}
                                    _ => {
                                        error!("vcpu hit unknown error: {}", e);
                                        break;
                                    }
                                },
                            }
                            if kill_signaled.load(Ordering::SeqCst) {
                                break;
                            }

                            // Only handle the pause request if KVM reported that it was
                            // interrupted by a signal. This helps to ensure that KVM has had
                            // a chance to finish emulating any IO that may have immediately
                            // happened. If we eagerly checked pre_run(), any IO that we just
                            // reported to the plugin might not have been processed yet by
                            // KVM. Not eagerly calling pre_run() also reduces the overhead
                            // of checking whether a pause request is pending. The assumption
                            // is that pause requests aren't common or frequent, so it's
                            // better to optimize for the non-pause execution paths.
                            if interrupted_by_signal {
                                if use_kvm_signals {
                                    clear_signal(SIGRTMIN() + 0)
                                        .expect("failed to clear pending signal");
                                } else {
                                    vcpu.set_immediate_exit(false);
                                }

                                if let Err(e) = vcpu_plugin.pre_run(&vcpu) {
                                    error!("failed to process pause on vcpu {}: {}", cpu_id, e);
                                    break;
                                }
                            }
                        }
                    }
                    vcpu_exit_evt
                        .signal()
                        .expect("failed to signal vcpu exit event");
                })
                .context("error spawning vcpu thread")?,
        );
    }
    Ok(())
}

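/// Tokens identifying the event sources watched by `run_config`'s main wait loop.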
#[derive(EventToken)]
enum Token {
    Exit,
    ChildSignal,
    Stderr,
    Plugin { index: usize },
}

/// Run a VM with a plugin process specified by `cfg`.
///
/// Not every field of `cfg` will be used. In particular, most fields that pertain to a specific
/// device are ignored because the plugin is responsible for emulating hardware.
pub fn run_config(cfg: Config) -> Result<()> {
    info!("crosvm starting plugin process");

    // Masking signals is inherently dangerous, since this can persist across clones/execs. Do this
    // before any jailed devices have been spawned, so that we can catch any of them that fail very
    // quickly.
    let sigchld_fd = SignalFd::new(SIGCHLD).context("failed to create signalfd")?;

    // Create a pipe to capture error messages from plugin and minijail.
    let (mut stderr_rd, stderr_wr) = pipe().context("failed to create stderr pipe")?;
    add_fd_flags(stderr_rd.as_raw_descriptor(), O_NONBLOCK)
        .context("error marking stderr nonblocking")?;

    let jail = if let Some(jail_config) = &cfg.jail_config {
        if jail_config.seccomp_policy_dir.is_none() {
            bail!("plugin requires a seccomp policy file to be specified.");
        }

        let mut config = SandboxConfig::new(jail_config, "plugin");
        config.bind_mounts = true;
        let uid_map = format!("0 {} 1", geteuid());
        let gid_map = format!("0 {} 1", getegid());
        let gid_map = if !cfg.plugin_gid_maps.is_empty() {
            gid_map
                + &cfg
                    .plugin_gid_maps
                    .into_iter()
                    .fold(String::new(), |mut output, m| {
                        let _ = write!(output, ",{} {} {}", m.inner, m.outer, m.count);
                        output
                    })
        } else {
            gid_map
        };
        config.ugid_map = Some((&uid_map, &gid_map));

        let root_path = cfg.plugin_root.as_ref().unwrap_or(&jail_config.pivot_root);
        let mut jail = create_sandbox_minijail(root_path, MAX_OPEN_FILES, &config)
            .context("create plugin sandbox")?;

        // Because we requested to "run as init", minijail will not mount /proc for us even though
        // plugin will be running in its own PID namespace, so we have to mount it ourselves.
        mount_proc(&mut jail).context("mount proc")?;

        // Mount minimal set of devices (full, zero, urandom, etc). We cannot use
        // jail.mount_dev() here because crosvm may not be running with CAP_SYS_ADMIN.
        let device_names = ["full", "null", "urandom", "zero"];
        for name in &device_names {
            let device = Path::new("/dev").join(name);
            jail.mount_bind(&device, &device, true)
                .context("failed to mount dev")?;
        }

        for bind_mount in &cfg.plugin_mounts {
            jail.mount_bind(&bind_mount.src, &bind_mount.dst, bind_mount.writable)
                .with_context(|| {
                    format!(
                        "failed to bind mount {} -> {} as {} ",
                        bind_mount.src.display(),
                        bind_mount.dst.display(),
                        if bind_mount.writable {
                            "writable"
                        } else {
                            "read only"
                        }
                    )
                })?;
        }

        Some(jail)
    } else {
        None
    };

    #[allow(unused_mut)]
    let mut tap_interfaces: Vec<Tap> = Vec::new();
    #[cfg(feature = "net")]
    for net_params in cfg.net {
        use devices::virtio::NetParametersMode;
        use net_util::TapTCommon;

        if net_params.vhost_net.is_some() {
            bail!("vhost-net not supported with plugin");
        }

        match net_params.mode {
            NetParametersMode::RawConfig {
                host_ip,
                netmask,
                mac,
            } => {
                let tap = Tap::new(false, false).context("error opening tap device")?;
                tap.set_ip_addr(host_ip).context("error setting tap ip")?;
                tap.set_netmask(netmask)
                    .context("error setting tap netmask")?;
                tap.set_mac_address(mac)
                    .context("error setting tap mac address")?;

                tap.enable().context("error enabling tap device")?;
                tap_interfaces.push(tap);
            }
            NetParametersMode::TapName { tap_name, mac } => {
                let tap = Tap::new_with_name(tap_name.as_bytes(), true, false)
                    .context("failed to create tap device from name")?;
                if let Some(mac) = mac {
                    tap.set_mac_address(mac)
                        .context("error setting tap mac address")?;
                }
                tap_interfaces.push(tap);
            }
            NetParametersMode::TapFd { tap_fd, mac } => {
                // SAFETY:
                // Safe because we ensure that we get a unique handle to the fd.
                let tap = unsafe {
                    Tap::from_raw_descriptor(
                        base::validate_raw_descriptor(tap_fd)
                            .context("failed to validate raw tap fd")?,
                    )
                    .context("failed to create tap device from raw fd")?
                };
                if let Some(mac) = mac {
                    tap.set_mac_address(mac)
                        .context("error setting tap mac address")?;
                }
                tap_interfaces.push(tap);
            }
        }
    }

    let plugin_args: Vec<&str> = cfg.params.iter().map(|s| &s[..]).collect();

    let plugin_path = match cfg.executable_path {
        Some(Executable::Plugin(ref plugin_path)) => plugin_path.as_path(),
        _ => panic!("Executable was not a plugin"),
    };
    let vcpu_count = cfg.vcpu_count.unwrap_or(1) as u32;
    let mem = GuestMemory::new(&[]).unwrap();
    let mut mem_policy = MemoryPolicy::empty();
    if cfg.hugepages {
        mem_policy |= MemoryPolicy::USE_HUGEPAGES;
    }
    mem.set_memory_policy(mem_policy);

    let kvm_device_path = if let Some(HypervisorKind::Kvm { device }) = &cfg.hypervisor {
        device.as_deref()
    } else {
        None
    };

    let kvm_device_path = kvm_device_path.unwrap_or(Path::new("/dev/kvm"));
    let kvm = Kvm::new_with_path(kvm_device_path).context("error creating Kvm")?;
    let mut vm = Vm::new(&kvm, mem).context("error creating vm")?;
    vm.create_irq_chip()
        .context("failed to create kvm irqchip")?;
    vm.create_pit().context("failed to create kvm PIT")?;

    let mut plugin = Process::new(vcpu_count, plugin_path, &plugin_args, jail, stderr_wr)?;
    // Now that the jail for the plugin has been created and we had a chance to adjust gids there,
    // we can drop all our capabilities in case we had any.
    drop_capabilities().context("failed to drop process capabilities")?;

    let mut res = Ok(());
    // If Some, we will exit after enough time has passed to shut down cleanly.
    let mut dying_instant: Option<Instant> = None;
    let duration_to_die = Duration::from_millis(1000);

    let exit_evt = Event::new().context("failed to create event")?;
    let kill_signaled = Arc::new(AtomicBool::new(false));
    let mut vcpu_handles = Vec::with_capacity(vcpu_count as usize);

    let wait_ctx = WaitContext::build_with(&[
        (&exit_evt, Token::Exit),
        (&sigchld_fd, Token::ChildSignal),
        (&stderr_rd, Token::Stderr),
    ])
    .context("failed to add control descriptors to wait context")?;

    let mut sockets_to_drop = Vec::new();
    let mut redo_wait_ctx_sockets = true;
    // In this loop, make every attempt to not return early. If an error is encountered, set `res`
    // to the error, set `dying_instant` to now, and signal the plugin that it will be killed soon.
    // If the plugin cannot be signaled because it is dead or `signal_kill` failed, simply break
    // from the poll loop so that the VCPU threads can be cleaned up.
    'wait: loop {
        // After we have waited long enough, it's time to give up and exit.
        if dying_instant
            .map(|i| i.elapsed() >= duration_to_die)
            .unwrap_or(false)
        {
            break;
        }

        if redo_wait_ctx_sockets {
            for (index, socket) in plugin.sockets().iter().enumerate() {
                wait_ctx
                    .add(socket, Token::Plugin { index })
                    .context("failed to add plugin sockets to wait context")?;
            }
        }

        let plugin_socket_count = plugin.sockets().len();
        let events = {
            let poll_res = match dying_instant {
                Some(inst) => wait_ctx.wait_timeout(duration_to_die - inst.elapsed()),
                None => wait_ctx.wait(),
            };
            match poll_res {
                Ok(v) => v,
                Err(e) => {
                    // Polling no longer works; time to break and clean up.
                    if res.is_ok() {
                        res = Err(e).context("failed to poll all FDs");
                    }
                    break;
                }
            }
        };

        for event in events.iter().filter(|e| e.is_hungup) {
            if let Token::Stderr = event.token {
                let _ = wait_ctx.delete(&stderr_rd);
            }
        }

        for event in events.iter().filter(|e| e.is_readable) {
            match event.token {
                Token::Exit => {
                    // No need to check the exit event if we are already doing cleanup.
                    let _ = wait_ctx.delete(&exit_evt);
                    dying_instant.get_or_insert(Instant::now());
                    let sig_res = plugin.signal_kill();
                    if res.is_ok() && sig_res.is_err() {
                        res = sig_res.context("error sending kill signal to plugin on exit event");
                    }
                }
                Token::ChildSignal => {
                    // Print all available siginfo structs, then exit the loop.
                    loop {
                        match sigchld_fd.read() {
                            Ok(Some(siginfo)) => {
                                // If the plugin process has ended, there is no need to continue
                                // processing plugin connections, so we break early.
                                if siginfo.ssi_pid == plugin.pid() as u32 {
                                    break 'wait;
                                }
                                // Because SIGCHLD is not expected from anything other than the
                                // plugin process, report it as an error.
                                if res.is_ok() {
                                    res = Err(anyhow!(
                                        "process {} died with signal {}, status {}, and code {}",
                                        siginfo.ssi_pid,
                                        siginfo.ssi_signo,
                                        siginfo.ssi_status,
                                        siginfo.ssi_code,
                                    ));
                                }
                            }
                            Ok(None) => break, // No more signals to read.
                            Err(e) => {
                                // Something really must be messed up for this to happen, continue
                                // processing connections for a limited time.
                                if res.is_ok() {
                                    res = Err(e).context("failed to read signal fd");
                                }
                                break;
                            }
                        }
                    }
                    // As we only spawn the plugin process, getting a SIGCHLD can only mean
                    // something went wrong.
                    dying_instant.get_or_insert(Instant::now());
                    let sig_res = plugin.signal_kill();
                    if res.is_ok() && sig_res.is_err() {
                        res = sig_res.context("error sending kill signal to plugin on SIGCHLD");
                    }
                }
                Token::Stderr => loop {
                    let mut buf = [0u8; 4096];
                    match stderr_rd.read(&mut buf) {
                        Ok(len) => {
                            for l in String::from_utf8_lossy(&buf[0..len]).lines() {
                                error!("minijail/plugin: {}", l);
                            }
                        }
                        Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => {
                            break;
                        }
                        Err(e) => {
                            error!("failed reading from stderr: {}", e);
                            break;
                        }
                    }
                },
                Token::Plugin { index } => {
                    match plugin.handle_socket(index, &kvm, &mut vm, &vcpu_handles, &tap_interfaces)
                    {
                        Ok(_) => {}
                        // A HUP is an expected event for a socket, so don't bother warning about
                        // it.
                        Err(CommError::PluginSocketHup) => sockets_to_drop.push(index),
                        // Only one connection out of potentially many is broken. Drop it, but don't
                        // start cleaning up. Because the error isn't returned, we will warn about
                        // it here.
                        Err(e) => {
                            warn!("error handling plugin socket: {}", e);
                            sockets_to_drop.push(index);
                        }
                    }
                }
            }
        }

        if vcpu_handles.is_empty() && dying_instant.is_none() && plugin.is_started() {
            let vcpu_cgroup_tasks_file = match &cfg.vcpu_cgroup_path {
                None => None,
                Some(cgroup_path) => {
                    // Move main process to cgroup_path
                    let mut f = File::create(cgroup_path.join("tasks"))?;
                    f.write_all(std::process::id().to_string().as_bytes())?;
                    Some(f)
                }
            };

            let res = run_vcpus(
                &kvm,
                &vm,
                &plugin,
                vcpu_count,
                &kill_signaled,
                &exit_evt,
                &mut vcpu_handles,
                vcpu_cgroup_tasks_file,
            );
            if let Err(e) = res {
                dying_instant.get_or_insert(Instant::now());
                error!("failed to start vcpus: {}", e);
            }
        }

        redo_wait_ctx_sockets =
            !sockets_to_drop.is_empty() || plugin.sockets().len() != plugin_socket_count;

        // Clean up all of the sockets that we have determined were disconnected or suffered some
        // other error.
        plugin.drop_sockets(&mut sockets_to_drop);
        sockets_to_drop.clear();

        if redo_wait_ctx_sockets {
            for socket in plugin.sockets() {
                let _ = wait_ctx.delete(socket);
            }
        }
    }

    // vcpu threads MUST see the kill signaled flag, otherwise they may re-enter the VM.
    kill_signaled.store(true, Ordering::SeqCst);
    // Depending on how we ended up here, the plugin process or a VCPU thread waiting for requests
    // might be stuck. The `signal_kill` call will unstick all the VCPU threads by closing their
    // blocked connections.
    plugin
        .signal_kill()
        .context("error sending kill signal to plugin on cleanup")?;
    for handle in vcpu_handles {
        match handle.kill(SIGRTMIN() + 0) {
            Ok(_) => {
                if let Err(e) = handle.join() {
                    error!("failed to join vcpu thread: {:?}", e);
                }
            }
            Err(e) => error!("failed to kill vcpu thread: {}", e),
        }
    }

    match plugin.try_wait() {
        // The plugin has run out of time by now.
        Ok(ProcessStatus::Running) => Err(anyhow!("plugin did not exit within timeout")),
        // Return an error discovered earlier in this function.
        Ok(ProcessStatus::Success) => res.map_err(anyhow::Error::msg),
        Ok(ProcessStatus::Fail(code)) => Err(anyhow!("plugin exited with error: {}", code)),
        Ok(ProcessStatus::Signal(code)) => Err(anyhow!("plugin exited with signal {}", code)),
        Err(e) => Err(anyhow!("error waiting for plugin to exit: {}", e)),
    }
}