1 // Copyright 2018 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 mod config;
6 mod process;
7 mod vcpu;
8
9 use std::fmt::Write as FmtWrite;
10 use std::fs::File;
11 use std::io;
12 use std::io::Read;
13 use std::io::Write;
14 use std::os::unix::net::UnixDatagram;
15 use std::path::Path;
16 use std::sync::atomic::AtomicBool;
17 use std::sync::atomic::Ordering;
18 use std::sync::Arc;
19 use std::sync::Barrier;
20 use std::thread;
21 use std::time::Duration;
22 use std::time::Instant;
23
24 use anyhow::anyhow;
25 use anyhow::bail;
26 use anyhow::Context;
27 use anyhow::Result;
28 use base::add_fd_flags;
29 use base::block_signal;
30 use base::clear_signal;
31 use base::drop_capabilities;
32 use base::enable_core_scheduling;
33 use base::error;
34 use base::getegid;
35 use base::geteuid;
36 use base::info;
37 use base::pipe;
38 use base::register_rt_signal_handler;
39 use base::warn;
40 use base::AsRawDescriptor;
41 use base::Error as SysError;
42 use base::Event;
43 use base::EventToken;
44 use base::FromRawDescriptor;
45 use base::Killable;
46 use base::MmapError;
47 use base::RawDescriptor;
48 use base::Result as SysResult;
49 use base::SignalFd;
50 use base::WaitContext;
51 use base::SIGRTMIN;
52 use jail::create_sandbox_minijail;
53 use jail::mount_proc;
54 use jail::SandboxConfig;
55 use kvm::Cap;
56 use kvm::Datamatch;
57 use kvm::IoeventAddress;
58 use kvm::Kvm;
59 use kvm::Vcpu;
60 use kvm::VcpuExit;
61 use kvm::Vm;
62 use libc::c_int;
63 use libc::c_ulong;
64 use libc::fcntl;
65 use libc::ioctl;
66 use libc::socketpair;
67 use libc::AF_UNIX;
68 use libc::EAGAIN;
69 use libc::EBADF;
70 use libc::EDEADLK;
71 use libc::EEXIST;
72 use libc::EINTR;
73 use libc::EINVAL;
74 use libc::ENOENT;
75 use libc::EOVERFLOW;
76 use libc::EPERM;
77 use libc::FIOCLEX;
78 use libc::F_SETPIPE_SZ;
79 use libc::O_NONBLOCK;
80 use libc::SIGCHLD;
81 use libc::SOCK_SEQPACKET;
82 use net_util::sys::linux::Tap;
83 use remain::sorted;
84 use thiserror::Error;
85 use vm_memory::GuestMemory;
86 use vm_memory::MemoryPolicy;
87
88 use self::process::*;
89 use self::vcpu::*;
90 use crate::crosvm::config::Executable;
91 use crate::crosvm::config::HypervisorKind;
92 pub use crate::crosvm::plugin::config::parse_plugin_mount_option;
93 pub use crate::crosvm::plugin::config::BindMount;
94 pub use crate::crosvm::plugin::config::GidMap;
95 use crate::Config;
96
97 const MAX_DATAGRAM_SIZE: usize = 4096;
98 const MAX_VCPU_DATAGRAM_SIZE: usize = 0x40000;
99 const MAX_OPEN_FILES: u64 = 32768;
100
/// An error that occurs when communicating with the plugin process.
///
/// Each variant corresponds to one failure point in the request/response exchange over a plugin
/// request socket. `PluginSocketHup` is treated by `run_config` as an expected disconnect rather
/// than a fault.
#[sorted]
#[derive(Error, Debug)]
pub enum CommError {
    /// A request received from the plugin could not be decoded as a protobuf message.
    #[error("failed to decode plugin request: {0}")]
    DecodeRequest(protobuf::Error),
    /// The response destined for the plugin could not be encoded as a protobuf message.
    #[error("failed to encode plugin response: {0}")]
    EncodeResponse(protobuf::Error),
    /// The plugin request socket has been hung up.
    #[error("plugin request socket has been hung up")]
    PluginSocketHup,
    /// Receiving from the plugin request socket failed with the wrapped system error.
    #[error("failed to recv from plugin request socket: {0}")]
    PluginSocketRecv(SysError),
    /// Sending to the plugin request socket failed with the wrapped system error.
    #[error("failed to send to plugin request socket: {0}")]
    PluginSocketSend(SysError),
}
116
new_seqpacket_pair() -> SysResult<(UnixDatagram, UnixDatagram)>117 fn new_seqpacket_pair() -> SysResult<(UnixDatagram, UnixDatagram)> {
118 let mut fds = [0, 0];
119 // SAFETY: trivially safe as we check the return value
120 unsafe {
121 let ret = socketpair(AF_UNIX, SOCK_SEQPACKET, 0, fds.as_mut_ptr());
122 if ret == 0 {
123 ioctl(fds[0], FIOCLEX);
124 Ok((
125 UnixDatagram::from_raw_descriptor(fds[0]),
126 UnixDatagram::from_raw_descriptor(fds[1]),
127 ))
128 } else {
129 Err(SysError::last())
130 }
131 }
132 }
133
/// The four ends of the two pipes created by `new_pipe_pair`, one pipe per direction.
///
/// The field names indicate which side is meant to hold each end.
struct VcpuPipe {
    // First end of the plugin -> crosvm pipe (presumably the read end, going by the name).
    crosvm_read: File,
    // Second end of the plugin -> crosvm pipe.
    plugin_write: File,
    // First end of the crosvm -> plugin pipe (presumably the read end, going by the name).
    plugin_read: File,
    // Second end of the crosvm -> plugin pipe.
    crosvm_write: File,
}
140
new_pipe_pair() -> SysResult<VcpuPipe>141 fn new_pipe_pair() -> SysResult<VcpuPipe> {
142 let to_crosvm = pipe()?;
143 let to_plugin = pipe()?;
144 // Increasing the pipe size can be a nice-to-have to make sure that
145 // messages get across atomically (and made sure that writes don't block),
146 // though it's not necessary a hard requirement for things to work.
147 // SAFETY: safe because no memory is modified and we check return value.
148 let flags = unsafe {
149 fcntl(
150 to_crosvm.0.as_raw_descriptor(),
151 F_SETPIPE_SZ,
152 MAX_VCPU_DATAGRAM_SIZE as c_int,
153 )
154 };
155 if flags < 0 || flags != MAX_VCPU_DATAGRAM_SIZE as i32 {
156 warn!(
157 "Failed to adjust size of crosvm pipe (result {}): {}",
158 flags,
159 SysError::last()
160 );
161 }
162 // SAFETY: safe because no memory is modified and we check return value.
163 let flags = unsafe {
164 fcntl(
165 to_plugin.0.as_raw_descriptor(),
166 F_SETPIPE_SZ,
167 MAX_VCPU_DATAGRAM_SIZE as c_int,
168 )
169 };
170 if flags < 0 || flags != MAX_VCPU_DATAGRAM_SIZE as i32 {
171 warn!(
172 "Failed to adjust size of plugin pipe (result {}): {}",
173 flags,
174 SysError::last()
175 );
176 }
177 Ok(VcpuPipe {
178 crosvm_read: to_crosvm.0,
179 plugin_write: to_crosvm.1,
180 plugin_read: to_plugin.0,
181 crosvm_write: to_plugin.1,
182 })
183 }
184
proto_to_sys_err(e: protobuf::Error) -> SysError185 fn proto_to_sys_err(e: protobuf::Error) -> SysError {
186 io_to_sys_err(io::Error::from(e))
187 }
188
io_to_sys_err(e: io::Error) -> SysError189 fn io_to_sys_err(e: io::Error) -> SysError {
190 SysError::new(e.raw_os_error().unwrap_or(EINVAL))
191 }
192
mmap_to_sys_err(e: MmapError) -> SysError193 fn mmap_to_sys_err(e: MmapError) -> SysError {
194 match e {
195 MmapError::SystemCallFailed(e) => e,
196 _ => SysError::new(EINVAL),
197 }
198 }
199
/// Each `PluginObject` represents one object that was instantiated by the guest using the `Create`
/// request.
///
/// Each such object has an ID associated with it that exists in an ID space shared by every variant
/// of `PluginObject`. This allows all the objects to be indexed in a single map, and allows for a
/// common destroy method.
///
/// In addition to the destroy method, each object may have methods specific to its variant type.
/// These variant methods must be done by matching the variant to the expected type for that method.
/// For example, getting the dirty log from a `Memory` object starting with an ID:
///
/// ```ignore
/// match objects.get(&request_id) {
///     Some(&PluginObject::Memory { slot, length }) => vm.get_dirty_log(slot, &mut dirty_log[..]),
///     _ => return Err(SysError::new(ENOENT)),
/// }
/// ```
enum PluginObject {
    /// An ioevent registered with the VM.
    IoEvent {
        // Event passed back to the VM when the ioevent is unregistered.
        evt: Event,
        // Address the ioevent was registered at.
        addr: IoeventAddress,
        // Width of the datamatch in bytes; `destroy` accepts 0 (any length), 1, 2, 4 or 8.
        length: u32,
        // Value to match, truncated to `length` bytes when unregistering.
        datamatch: u64,
    },
    /// A guest memory region.
    Memory {
        // Slot passed to the VM when the region is removed.
        slot: u32,
        // Length of the region (units not visible here; presumably bytes).
        length: usize,
    },
    /// An irqfd registered with the VM.
    IrqEvent {
        // IRQ the event is registered for.
        irq_id: u32,
        // Event passed back to the VM when the irqfd is unregistered.
        evt: Event,
    },
}
233
234 impl PluginObject {
destroy(self, vm: &mut Vm) -> SysResult<()>235 fn destroy(self, vm: &mut Vm) -> SysResult<()> {
236 match self {
237 PluginObject::IoEvent {
238 evt,
239 addr,
240 length,
241 datamatch,
242 } => match length {
243 0 => vm.unregister_ioevent(&evt, addr, Datamatch::AnyLength),
244 1 => vm.unregister_ioevent(&evt, addr, Datamatch::U8(Some(datamatch as u8))),
245 2 => vm.unregister_ioevent(&evt, addr, Datamatch::U16(Some(datamatch as u16))),
246 4 => vm.unregister_ioevent(&evt, addr, Datamatch::U32(Some(datamatch as u32))),
247 8 => vm.unregister_ioevent(&evt, addr, Datamatch::U64(Some(datamatch))),
248 _ => Err(SysError::new(EINVAL)),
249 },
250 PluginObject::Memory { slot, .. } => vm.remove_memory_region(slot).and(Ok(())),
251 PluginObject::IrqEvent { irq_id, evt } => vm.unregister_irqfd(&evt, irq_id),
252 }
253 }
254 }
255
run_vcpus( kvm: &Kvm, vm: &Vm, plugin: &Process, vcpu_count: u32, kill_signaled: &Arc<AtomicBool>, exit_evt: &Event, vcpu_handles: &mut Vec<thread::JoinHandle<()>>, vcpu_cgroup_tasks_file: Option<File>, ) -> Result<()>256 pub fn run_vcpus(
257 kvm: &Kvm,
258 vm: &Vm,
259 plugin: &Process,
260 vcpu_count: u32,
261 kill_signaled: &Arc<AtomicBool>,
262 exit_evt: &Event,
263 vcpu_handles: &mut Vec<thread::JoinHandle<()>>,
264 vcpu_cgroup_tasks_file: Option<File>,
265 ) -> Result<()> {
266 let vcpu_thread_barrier = Arc::new(Barrier::new((vcpu_count) as usize));
267 let use_kvm_signals = !kvm.check_extension(Cap::ImmediateExit);
268
269 // If we need to force a vcpu to exit from a VM then a SIGRTMIN signal is sent
270 // to that vcpu's thread. If KVM is running the VM then it'll return -EINTR.
271 // An issue is what to do when KVM isn't running the VM (where we could be
272 // in the kernel or in the app).
273 //
274 // If KVM supports "immediate exit" then we set a signal handler that will
275 // set the |immediate_exit| flag that tells KVM to return -EINTR before running
276 // the VM.
277 //
278 // If KVM doesn't support immediate exit then we'll block SIGRTMIN in the app
279 // and tell KVM to unblock SIGRTMIN before running the VM (at which point a blocked
280 // signal might get asserted). There's overhead to have KVM unblock and re-block
281 // SIGRTMIN each time it runs the VM, so this mode should be avoided.
282
283 if use_kvm_signals {
284 // SAFETY:
285 // Our signal handler does nothing and is trivially async signal safe.
286 unsafe {
287 extern "C" fn handle_signal(_: c_int) {}
288 // We need to install this signal handler even though we do block
289 // the signal below, to ensure that this signal will interrupt
290 // execution of KVM_RUN (this is implementation issue).
291 register_rt_signal_handler(SIGRTMIN() + 0, handle_signal)
292 .expect("failed to register vcpu signal handler");
293 }
294 // We do not really want the signal handler to run...
295 block_signal(SIGRTMIN() + 0).expect("failed to block signal");
296 } else {
297 // SAFETY: trivially safe as we check return value.
298 unsafe {
299 extern "C" fn handle_signal(_: c_int) {
300 Vcpu::set_local_immediate_exit(true);
301 }
302 register_rt_signal_handler(SIGRTMIN() + 0, handle_signal)
303 .expect("failed to register vcpu signal handler");
304 }
305 }
306
307 for cpu_id in 0..vcpu_count {
308 let kill_signaled = kill_signaled.clone();
309 let vcpu_thread_barrier = vcpu_thread_barrier.clone();
310 let vcpu_exit_evt = exit_evt.try_clone().context("failed to clone event")?;
311 let vcpu_plugin = plugin.create_vcpu(cpu_id)?;
312 let vcpu = Vcpu::new(cpu_id as c_ulong, kvm, vm).context("error creating vcpu")?;
313 let vcpu_cgroup_tasks_file = vcpu_cgroup_tasks_file
314 .as_ref()
315 .map(|f| f.try_clone().unwrap());
316
317 vcpu_handles.push(
318 thread::Builder::new()
319 .name(format!("crosvm_vcpu{}", cpu_id))
320 .spawn(move || {
321 if use_kvm_signals {
322 // Tell KVM to not block anything when entering kvm run
323 // because we will be using first RT signal to kick the VCPU.
324 vcpu.set_signal_mask(&[])
325 .expect("failed to set up KVM VCPU signal mask");
326 }
327 // Move vcpu thread to cgroup
328 if let Some(mut f) = vcpu_cgroup_tasks_file {
329 f.write_all(base::gettid().to_string().as_bytes()).unwrap();
330 }
331
332 if let Err(e) = enable_core_scheduling() {
333 error!("Failed to enable core scheduling: {}", e);
334 }
335
336 let vcpu = vcpu
337 .to_runnable(Some(SIGRTMIN() + 0))
338 .expect("Failed to set thread id");
339
340 let res = vcpu_plugin.init(&vcpu);
341 vcpu_thread_barrier.wait();
342 if let Err(e) = res {
343 error!("failed to initialize vcpu {}: {}", cpu_id, e);
344 } else {
345 loop {
346 let mut interrupted_by_signal = false;
347 let run_res = vcpu.run();
348 match run_res {
349 Ok(run) => match run {
350 VcpuExit::IoIn { port, mut size } => {
351 let mut data = [0; 256];
352 if size > data.len() {
353 error!(
354 "unsupported IoIn size of {} bytes at port {:#x}",
355 size, port
356 );
357 size = data.len();
358 }
359 vcpu_plugin.io_read(port as u64, &mut data[..size], &vcpu);
360 if let Err(e) = vcpu.set_data(&data[..size]) {
361 error!(
362 "failed to set return data for IoIn at port {:#x}: {}",
363 port, e
364 );
365 }
366 }
367 VcpuExit::IoOut {
368 port,
369 mut size,
370 data,
371 } => {
372 if size > data.len() {
373 error!("unsupported IoOut size of {} bytes at port {:#x}", size, port);
374 size = data.len();
375 }
376 vcpu_plugin.io_write(port as u64, &data[..size], &vcpu);
377 }
378 VcpuExit::MmioRead { address, size } => {
379 let mut data = [0; 8];
380 vcpu_plugin.mmio_read(
381 address,
382 &mut data[..size],
383 &vcpu,
384 );
385 // Setting data for mmio can not fail.
386 let _ = vcpu.set_data(&data[..size]);
387 }
388 VcpuExit::MmioWrite {
389 address,
390 size,
391 data,
392 } => {
393 vcpu_plugin.mmio_write(
394 address,
395 &data[..size],
396 &vcpu,
397 );
398 }
399 VcpuExit::HypervHcall { input, params } => {
400 let mut data = [0; 8];
401 vcpu_plugin.hyperv_call(input, params, &mut data, &vcpu);
402 // Setting data for hyperv call can not fail.
403 let _ = vcpu.set_data(&data);
404 }
405 VcpuExit::HypervSynic {
406 msr,
407 control,
408 evt_page,
409 msg_page,
410 } => {
411 vcpu_plugin
412 .hyperv_synic(msr, control, evt_page, msg_page, &vcpu);
413 }
414 VcpuExit::Hlt => break,
415 VcpuExit::Shutdown => break,
416 VcpuExit::InternalError => {
417 error!("vcpu {} has internal error", cpu_id);
418 break;
419 }
420 r => warn!("unexpected vcpu exit: {:?}", r),
421 },
422 Err(e) => match e.errno() {
423 EINTR => interrupted_by_signal = true,
424 EAGAIN => {}
425 _ => {
426 error!("vcpu hit unknown error: {}", e);
427 break;
428 }
429 },
430 }
431 if kill_signaled.load(Ordering::SeqCst) {
432 break;
433 }
434
435 // Only handle the pause request if kvm reported that it was
436 // interrupted by a signal. This helps to entire that KVM has had a chance
437 // to finish emulating any IO that may have immediately happened.
438 // If we eagerly check pre_run() then any IO that we
439 // just reported to the plugin won't have been processed yet by KVM.
440 // Not eagerly calling pre_run() also helps to reduce
441 // any overhead from checking if a pause request is pending.
442 // The assumption is that pause requests aren't common
443 // or frequent so it's better to optimize for the non-pause execution paths.
444 if interrupted_by_signal {
445 if use_kvm_signals {
446 clear_signal(SIGRTMIN() + 0)
447 .expect("failed to clear pending signal");
448 } else {
449 vcpu.set_immediate_exit(false);
450 }
451
452 if let Err(e) = vcpu_plugin.pre_run(&vcpu) {
453 error!("failed to process pause on vcpu {}: {}", cpu_id, e);
454 break;
455 }
456 }
457 }
458 }
459 vcpu_exit_evt
460 .signal()
461 .expect("failed to signal vcpu exit event");
462 })
463 .context("error spawning vcpu thread")?,
464 );
465 }
466 Ok(())
467 }
468
/// Identifies which descriptor became ready in the wait loop of `run_config`.
#[derive(EventToken)]
enum Token {
    /// The exit event, which VCPU threads signal when they finish.
    Exit,
    /// The SIGCHLD signalfd.
    ChildSignal,
    /// The read end of the pipe carrying stderr output from the plugin and minijail.
    Stderr,
    /// The plugin request socket at position `index` in the plugin's socket list.
    Plugin { index: usize },
}
476
477 /// Run a VM with a plugin process specified by `cfg`.
478 ///
479 /// Not every field of `cfg` will be used. In particular, most field that pertain to a specific
480 /// device are ignored because the plugin is responsible for emulating hardware.
run_config(cfg: Config) -> Result<()>481 pub fn run_config(cfg: Config) -> Result<()> {
482 info!("crosvm starting plugin process");
483
484 // Masking signals is inherently dangerous, since this can persist across clones/execs. Do this
485 // before any jailed devices have been spawned, so that we can catch any of them that fail very
486 // quickly.
487 let sigchld_fd = SignalFd::new(SIGCHLD).context("failed to create signalfd")?;
488
489 // Create a pipe to capture error messages from plugin and minijail.
490 let (mut stderr_rd, stderr_wr) = pipe().context("failed to create stderr pipe")?;
491 add_fd_flags(stderr_rd.as_raw_descriptor(), O_NONBLOCK)
492 .context("error marking stderr nonblocking")?;
493
494 let jail = if let Some(jail_config) = &cfg.jail_config {
495 if jail_config.seccomp_policy_dir.is_none() {
496 bail!("plugin requires seccomp policy file specified.");
497 }
498
499 let mut config = SandboxConfig::new(jail_config, "plugin");
500 config.bind_mounts = true;
501 let uid_map = format!("0 {} 1", geteuid());
502 let gid_map = format!("0 {} 1", getegid());
503 let gid_map = if !cfg.plugin_gid_maps.is_empty() {
504 gid_map
505 + &cfg
506 .plugin_gid_maps
507 .into_iter()
508 .fold(String::new(), |mut output, m| {
509 let _ = write!(output, ",{} {} {}", m.inner, m.outer, m.count);
510 output
511 })
512 } else {
513 gid_map
514 };
515 config.ugid_map = Some((&uid_map, &gid_map));
516
517 let root_path = cfg.plugin_root.as_ref().unwrap_or(&jail_config.pivot_root);
518 let mut jail = create_sandbox_minijail(root_path, MAX_OPEN_FILES, &config)
519 .context("create plugin sandbox")?;
520
521 // Because we requested to "run as init", minijail will not mount /proc for us even though
522 // plugin will be running in its own PID namespace, so we have to mount it ourselves.
523 mount_proc(&mut jail).context("mount proc")?;
524
525 // Mount minimal set of devices (full, zero, urandom, etc). We can not use
526 // jail.mount_dev() here because crosvm may not be running with CAP_SYS_ADMIN.
527 let device_names = ["full", "null", "urandom", "zero"];
528 for name in &device_names {
529 let device = Path::new("/dev").join(name);
530 jail.mount_bind(&device, &device, true)
531 .context("failed to mount dev")?;
532 }
533
534 for bind_mount in &cfg.plugin_mounts {
535 jail.mount_bind(&bind_mount.src, &bind_mount.dst, bind_mount.writable)
536 .with_context(|| {
537 format!(
538 "failed to bind mount {} -> {} as {} ",
539 bind_mount.src.display(),
540 bind_mount.dst.display(),
541 if bind_mount.writable {
542 "writable"
543 } else {
544 "read only"
545 }
546 )
547 })?;
548 }
549
550 Some(jail)
551 } else {
552 None
553 };
554
555 #[allow(unused_mut)]
556 let mut tap_interfaces: Vec<Tap> = Vec::new();
557 #[cfg(feature = "net")]
558 for net_params in cfg.net {
559 use devices::virtio::NetParametersMode;
560 use net_util::TapTCommon;
561
562 if net_params.vhost_net.is_some() {
563 bail!("vhost-net not supported with plugin");
564 }
565
566 match net_params.mode {
567 NetParametersMode::RawConfig {
568 host_ip,
569 netmask,
570 mac,
571 } => {
572 let tap = Tap::new(false, false).context("error opening tap device")?;
573 tap.set_ip_addr(host_ip).context("error setting tap ip")?;
574 tap.set_netmask(netmask)
575 .context("error setting tap netmask")?;
576 tap.set_mac_address(mac)
577 .context("error setting tap mac address")?;
578
579 tap.enable().context("error enabling tap device")?;
580 tap_interfaces.push(tap);
581 }
582 NetParametersMode::TapName { tap_name, mac } => {
583 let tap = Tap::new_with_name(tap_name.as_bytes(), true, false)
584 .context("failed to create tap device from name")?;
585 if let Some(mac) = mac {
586 tap.set_mac_address(mac)
587 .context("error setting tap mac addres")?;
588 }
589 tap_interfaces.push(tap);
590 }
591 NetParametersMode::TapFd { tap_fd, mac } => {
592 // SAFETY:
593 // Safe because we ensure that we get a unique handle to the fd.
594 let tap = unsafe {
595 Tap::from_raw_descriptor(
596 base::validate_raw_descriptor(tap_fd)
597 .context("failed to validate raw tap fd")?,
598 )
599 .context("failed to create tap device from raw fd")?
600 };
601 if let Some(mac) = mac {
602 tap.set_mac_address(mac)
603 .context("error setting tap mac addres")?;
604 }
605 tap_interfaces.push(tap);
606 }
607 }
608 }
609
610 let plugin_args: Vec<&str> = cfg.params.iter().map(|s| &s[..]).collect();
611
612 let plugin_path = match cfg.executable_path {
613 Some(Executable::Plugin(ref plugin_path)) => plugin_path.as_path(),
614 _ => panic!("Executable was not a plugin"),
615 };
616 let vcpu_count = cfg.vcpu_count.unwrap_or(1) as u32;
617 let mem = GuestMemory::new(&[]).unwrap();
618 let mut mem_policy = MemoryPolicy::empty();
619 if cfg.hugepages {
620 mem_policy |= MemoryPolicy::USE_HUGEPAGES;
621 }
622 mem.set_memory_policy(mem_policy);
623
624 let kvm_device_path = if let Some(HypervisorKind::Kvm { device }) = &cfg.hypervisor {
625 device.as_deref()
626 } else {
627 None
628 };
629
630 let kvm_device_path = kvm_device_path.unwrap_or(Path::new("/dev/kvm"));
631 let kvm = Kvm::new_with_path(kvm_device_path).context("error creating Kvm")?;
632 let mut vm = Vm::new(&kvm, mem).context("error creating vm")?;
633 vm.create_irq_chip()
634 .context("failed to create kvm irqchip")?;
635 vm.create_pit().context("failed to create kvm PIT")?;
636
637 let mut plugin = Process::new(vcpu_count, plugin_path, &plugin_args, jail, stderr_wr)?;
638 // Now that the jail for the plugin has been created and we had a chance to adjust gids there,
639 // we can drop all our capabilities in case we had any.
640 drop_capabilities().context("failed to drop process capabilities")?;
641
642 let mut res = Ok(());
643 // If Some, we will exit after enough time is passed to shutdown cleanly.
644 let mut dying_instant: Option<Instant> = None;
645 let duration_to_die = Duration::from_millis(1000);
646
647 let exit_evt = Event::new().context("failed to create event")?;
648 let kill_signaled = Arc::new(AtomicBool::new(false));
649 let mut vcpu_handles = Vec::with_capacity(vcpu_count as usize);
650
651 let wait_ctx = WaitContext::build_with(&[
652 (&exit_evt, Token::Exit),
653 (&sigchld_fd, Token::ChildSignal),
654 (&stderr_rd, Token::Stderr),
655 ])
656 .context("failed to add control descriptors to wait context")?;
657
658 let mut sockets_to_drop = Vec::new();
659 let mut redo_wait_ctx_sockets = true;
660 // In this loop, make every attempt to not return early. If an error is encountered, set `res`
661 // to the error, set `dying_instant` to now, and signal the plugin that it will be killed soon.
662 // If the plugin cannot be signaled because it is dead of `signal_kill` failed, simply break
663 // from the poll loop so that the VCPU threads can be cleaned up.
664 'wait: loop {
665 // After we have waited long enough, it's time to give up and exit.
666 if dying_instant
667 .map(|i| i.elapsed() >= duration_to_die)
668 .unwrap_or(false)
669 {
670 break;
671 }
672
673 if redo_wait_ctx_sockets {
674 for (index, socket) in plugin.sockets().iter().enumerate() {
675 wait_ctx
676 .add(socket, Token::Plugin { index })
677 .context("failed to add plugin sockets to wait context")?;
678 }
679 }
680
681 let plugin_socket_count = plugin.sockets().len();
682 let events = {
683 let poll_res = match dying_instant {
684 Some(inst) => wait_ctx.wait_timeout(duration_to_die - inst.elapsed()),
685 None => wait_ctx.wait(),
686 };
687 match poll_res {
688 Ok(v) => v,
689 Err(e) => {
690 // Polling no longer works, time to break and cleanup,
691 if res.is_ok() {
692 res = Err(e).context("failed to poll all FDs");
693 }
694 break;
695 }
696 }
697 };
698
699 for event in events.iter().filter(|e| e.is_hungup) {
700 if let Token::Stderr = event.token {
701 let _ = wait_ctx.delete(&stderr_rd);
702 }
703 }
704
705 for event in events.iter().filter(|e| e.is_readable) {
706 match event.token {
707 Token::Exit => {
708 // No need to check the exit event if we are already doing cleanup.
709 let _ = wait_ctx.delete(&exit_evt);
710 dying_instant.get_or_insert(Instant::now());
711 let sig_res = plugin.signal_kill();
712 if res.is_ok() && sig_res.is_err() {
713 res = sig_res.context("error sending kill signal to plugin on exit event");
714 }
715 }
716 Token::ChildSignal => {
717 // Print all available siginfo structs, then exit the loop.
718 loop {
719 match sigchld_fd.read() {
720 Ok(Some(siginfo)) => {
721 // If the plugin process has ended, there is no need to continue
722 // processing plugin connections, so we break early.
723 if siginfo.ssi_pid == plugin.pid() as u32 {
724 break 'wait;
725 }
726 // Because SIGCHLD is not expected from anything other than the
727 // plugin process, report it as an error.
728 if res.is_ok() {
729 res = Err(anyhow!(
730 "process {} died with signal {}, status {}, and code {}",
731 siginfo.ssi_pid,
732 siginfo.ssi_signo,
733 siginfo.ssi_status,
734 siginfo.ssi_code,
735 ));
736 }
737 }
738 Ok(None) => break, // No more signals to read.
739 Err(e) => {
740 // Something really must be messed up for this to happen, continue
741 // processing connections for a limited time.
742 if res.is_ok() {
743 res = Err(e).context("failed to read signal fd");
744 }
745 break;
746 }
747 }
748 }
749 // As we only spawn the plugin process, getting a SIGCHLD can only mean
750 // something went wrong.
751 dying_instant.get_or_insert(Instant::now());
752 let sig_res = plugin.signal_kill();
753 if res.is_ok() && sig_res.is_err() {
754 res = sig_res.context("error sending kill signal to plugin on SIGCHLD");
755 }
756 }
757 Token::Stderr => loop {
758 let mut buf = [0u8; 4096];
759 match stderr_rd.read(&mut buf) {
760 Ok(len) => {
761 for l in String::from_utf8_lossy(&buf[0..len]).lines() {
762 error!("minijail/plugin: {}", l);
763 }
764 }
765 Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => {
766 break;
767 }
768 Err(e) => {
769 error!("failed reading from stderr: {}", e);
770 break;
771 }
772 }
773 },
774 Token::Plugin { index } => {
775 match plugin.handle_socket(index, &kvm, &mut vm, &vcpu_handles, &tap_interfaces)
776 {
777 Ok(_) => {}
778 // A HUP is an expected event for a socket, so don't bother warning about
779 // it.
780 Err(CommError::PluginSocketHup) => sockets_to_drop.push(index),
781 // Only one connection out of potentially many is broken. Drop it, but don't
782 // start cleaning up. Because the error isn't returned, we will warn about
783 // it here.
784 Err(e) => {
785 warn!("error handling plugin socket: {}", e);
786 sockets_to_drop.push(index);
787 }
788 }
789 }
790 }
791 }
792
793 if vcpu_handles.is_empty() && dying_instant.is_none() && plugin.is_started() {
794 let vcpu_cgroup_tasks_file = match &cfg.vcpu_cgroup_path {
795 None => None,
796 Some(cgroup_path) => {
797 // Move main process to cgroup_path
798 let mut f = File::create(cgroup_path.join("tasks"))?;
799 f.write_all(std::process::id().to_string().as_bytes())?;
800 Some(f)
801 }
802 };
803
804 let res = run_vcpus(
805 &kvm,
806 &vm,
807 &plugin,
808 vcpu_count,
809 &kill_signaled,
810 &exit_evt,
811 &mut vcpu_handles,
812 vcpu_cgroup_tasks_file,
813 );
814 if let Err(e) = res {
815 dying_instant.get_or_insert(Instant::now());
816 error!("failed to start vcpus: {}", e);
817 }
818 }
819
820 redo_wait_ctx_sockets =
821 !sockets_to_drop.is_empty() || plugin.sockets().len() != plugin_socket_count;
822
823 // Cleanup all of the sockets that we have determined were disconnected or suffered some
824 // other error.
825 plugin.drop_sockets(&mut sockets_to_drop);
826 sockets_to_drop.clear();
827
828 if redo_wait_ctx_sockets {
829 for socket in plugin.sockets() {
830 let _ = wait_ctx.delete(socket);
831 }
832 }
833 }
834
835 // vcpu threads MUST see the kill signaled flag, otherwise they may re-enter the VM.
836 kill_signaled.store(true, Ordering::SeqCst);
837 // Depending on how we ended up here, the plugin process, or a VCPU thread waiting for requests
838 // might be stuck. The `signal_kill` call will unstick all the VCPU threads by closing their
839 // blocked connections.
840 plugin
841 .signal_kill()
842 .context("error sending kill signal to plugin on cleanup")?;
843 for handle in vcpu_handles {
844 match handle.kill(SIGRTMIN() + 0) {
845 Ok(_) => {
846 if let Err(e) = handle.join() {
847 error!("failed to join vcpu thread: {:?}", e);
848 }
849 }
850 Err(e) => error!("failed to kill vcpu thread: {}", e),
851 }
852 }
853
854 match plugin.try_wait() {
855 // The plugin has run out of time by now
856 Ok(ProcessStatus::Running) => Err(anyhow!("plugin did not exit within timeout")),
857 // Return an error discovered earlier in this function.
858 Ok(ProcessStatus::Success) => res.map_err(anyhow::Error::msg),
859 Ok(ProcessStatus::Fail(code)) => Err(anyhow!("plugin exited with error: {}", code)),
860 Ok(ProcessStatus::Signal(code)) => Err(anyhow!("plugin exited with signal {}", code)),
861 Err(e) => Err(anyhow!("error waiting for plugin to exit: {}", e)),
862 }
863 }
864