// Copyright 2022 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

use std::arch::x86_64::__cpuid;
use std::arch::x86_64::__cpuid_count;
use std::convert::TryInto;
use std::fmt;
use std::fmt::Display;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;
use std::sync::mpsc;
use std::sync::Arc;
use std::sync::Barrier;
use std::thread;
use std::thread::JoinHandle;
use std::time::Duration;
use std::time::Instant;

#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
use aarch64::AArch64 as Arch;
use anyhow::anyhow;
use anyhow::Context;
use anyhow::Result;
use arch::CpuConfigArch;
use arch::CpuSet;
use arch::IrqChipArch;
use arch::LinuxArch;
use arch::RunnableLinuxVm;
use arch::VcpuAffinity;
use arch::VcpuArch;
use arch::VmArch;
use base::error;
use base::info;
use base::set_audio_thread_priority;
use base::set_cpu_affinity;
use base::warn;
use base::Event;
use base::Result as BaseResult;
use base::SafeMultimediaHandle;
use base::SendTube;
use base::Timer;
use base::Tube;
use base::VmEventType;
use cros_async::select2;
use cros_async::EventAsync;
use cros_async::Executor;
use cros_async::SelectResult;
use cros_async::TimerAsync;
use cros_tracing::trace_event;
use crosvm_cli::bail_exit_code;
use crosvm_cli::sys::windows::exit::Exit;
use crosvm_cli::sys::windows::exit::ExitContext;
use crosvm_cli::sys::windows::exit::ExitContextAnyhow;
use devices::tsc::TscSyncMitigations;
use devices::Bus;
use devices::VcpuRunState;
use futures::pin_mut;
#[cfg(feature = "whpx")]
use hypervisor::whpx::WhpxVcpu;
#[cfg(target_arch = "x86_64")]
use hypervisor::CpuConfigX86_64;
use hypervisor::HypervisorCap;
use hypervisor::IoEventAddress;
use hypervisor::IoOperation;
use hypervisor::IoParams;
use hypervisor::VcpuExit;
use hypervisor::VcpuInitX86_64;
use metrics_events::MetricEventType;
use sync::Condvar;
use sync::Mutex;
use vm_control::VcpuControl;
use vm_control::VmRunMode;
use winapi::shared::winerror::ERROR_RETRY;
#[cfg(target_arch = "x86_64")]
use x86_64::cpuid::adjust_cpuid;
#[cfg(target_arch = "x86_64")]
use x86_64::cpuid::CpuIdContext;
#[cfg(target_arch = "x86_64")]
use x86_64::X8664arch as Arch;

#[cfg(feature = "stats")]
use crate::crosvm::sys::windows::stats::StatisticsCollector;
#[cfg(feature = "stats")]
use crate::crosvm::sys::windows::stats::VmExitStatistics;
use crate::sys::windows::save_vcpu_tsc_offset;
use crate::sys::windows::ExitState;

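// winapi's ERROR_RETRY as an i32, so it can be compared against the errno
// values returned by failed vcpu.run() calls in the exit loop below.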
const ERROR_RETRY_I32: i32 = ERROR_RETRY as i32;

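/// Run mode shared between the VM control loop and every vcpu thread.
///
/// A minimal usage sketch (illustrative only, not code from this crate): a
/// waiter blocks on the condvar until another thread changes the mode via
/// `set_and_notify`.
///
/// ```ignore
/// let mode = Arc::new(VcpuRunMode::default());
/// let mut guard = mode.mtx.lock();
/// while *guard == VmRunMode::Suspending {
///     guard = mode.cvar.wait(guard);
/// }
/// drop(guard);
/// // Elsewhere: mode.set_and_notify(VmRunMode::Running);
/// ```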
#[derive(Default)]
pub struct VcpuRunMode {
    mtx: Mutex<VmRunMode>,
    cvar: Condvar,
}

impl VcpuRunMode {
    pub fn get_mode(&self) -> VmRunMode {
        *self.mtx.lock()
    }

    pub fn set_and_notify(&self, new_mode: VmRunMode) {
        *self.mtx.lock() = new_mode;
        self.cvar.notify_all();
    }
}

struct RunnableVcpuInfo<V> {
    vcpu: V,
    thread_priority_handle: Option<SafeMultimediaHandle>,
}

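/// Per-vcpu timing data shared with the stall monitor. `last_run_time` holds
/// `start_instant.elapsed().as_millis()` recorded just before each call to
/// `vcpu.run()`, so a value of 12_345, for example, means the vcpu last
/// entered the guest 12.345s after monitoring began.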
#[derive(Clone, Debug)]
struct VcpuMonitoringMetadata {
    pub start_instant: Instant,
    // Milliseconds since the baseline start_instant
    pub last_run_time: Arc<AtomicU64>,
    pub last_exit_snapshot: Arc<Mutex<Option<VcpuExitData>>>,
}

#[derive(Clone, Debug)]
struct VcpuRunThread {
    pub cpu_id: usize,
    pub monitoring_metadata: Option<VcpuMonitoringMetadata>,
}

impl VcpuRunThread {
    pub fn new(cpu_id: usize, enable_vcpu_monitoring: bool) -> VcpuRunThread {
        VcpuRunThread {
            cpu_id,
            monitoring_metadata: enable_vcpu_monitoring.then(|| VcpuMonitoringMetadata {
                start_instant: Instant::now(),
                last_run_time: Arc::new(AtomicU64::new(0)),
                last_exit_snapshot: Arc::new(Mutex::new(None)),
            }),
        }
    }

    /// Performs WHPX-specific vcpu configuration.
    #[cfg(feature = "whpx")]
    fn whpx_configure_vcpu(vcpu: &mut dyn VcpuArch, irq_chip: &mut dyn IrqChipArch) {
        // Only applies to actual WhpxVcpu instances.
        if let Some(whpx_vcpu) = vcpu.downcast_mut::<WhpxVcpu>() {
            // WhpxVcpu instances need to know the TSC and Lapic frequencies to handle Hyper-V MSR
            // reads and writes.
            let tsc_freq = devices::tsc::tsc_frequency()
                .map_err(|e| {
                    error!(
                        "Could not determine TSC frequency, WHPX vcpu will not be configured with \
                        a TSC Frequency: {e}"
                    );
                    e
                })
                .ok();
            whpx_vcpu.set_frequencies(tsc_freq, irq_chip.lapic_frequency());
        }
    }

    // Sets up a vcpu and converts it into a runnable vcpu.
    fn runnable_vcpu<V>(
        cpu_id: usize,
        vcpu: Option<V>,
        vcpu_init: VcpuInitX86_64,
        vm: &impl VmArch,
        irq_chip: &mut dyn IrqChipArch,
        vcpu_count: usize,
        run_rt: bool,
        vcpu_affinity: Option<CpuSet>,
        no_smt: bool,
        host_cpu_topology: bool,
        force_calibrated_tsc_leaf: bool,
    ) -> Result<RunnableVcpuInfo<V>>
    where
        V: VcpuArch,
    {
        let mut vcpu = match vcpu {
            Some(v) => v,
            None => {
                // If vcpu is None, it means this arch/hypervisor requires create_vcpu to be called
                // from the vcpu thread.
                match vm
                    .create_vcpu(cpu_id)
                    .exit_context(Exit::CreateVcpu, "failed to create vcpu")?
                    .downcast::<V>()
                {
                    Ok(v) => *v,
                    Err(_) => panic!("VM created wrong type of VCPU"),
                }
            }
        };

        irq_chip
            .add_vcpu(cpu_id, &vcpu)
            .exit_context(Exit::AddIrqChipVcpu, "failed to add vcpu to irq chip")?;

        if let Some(affinity) = vcpu_affinity {
            if let Err(e) = set_cpu_affinity(affinity) {
                error!("Failed to set CPU affinity: {}", e);
            }
        }

        #[cfg(target_arch = "x86_64")]
        let cpu_config = Some(CpuConfigX86_64::new(
            force_calibrated_tsc_leaf,
            host_cpu_topology,
            false, /* enable_hwp */
            no_smt,
            false, /* itmt */
            None,  /* hybrid_type */
        ));

        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        let cpu_config = None;

        Arch::configure_vcpu(
            vm,
            vm.get_hypervisor(),
            irq_chip,
            &mut vcpu,
            vcpu_init,
            cpu_id,
            vcpu_count,
            cpu_config,
        )
        .exit_context(Exit::ConfigureVcpu, "failed to configure vcpu")?;

        #[cfg(feature = "whpx")]
        Self::whpx_configure_vcpu(&mut vcpu, irq_chip);

        let mut thread_priority_handle = None;
        if run_rt {
            // Until we are multi process on Windows, we can't use the normal thread priority APIs;
            // instead, we use a trick from the audio device which is able to set a thread RT even
            // though the process itself is not RT.
            thread_priority_handle = match set_audio_thread_priority() {
                Ok(hndl) => Some(hndl),
                Err(e) => {
                    warn!("Failed to set vcpu thread to real time priority: {}", e);
                    None
                }
            };
        }

        Ok(RunnableVcpuInfo {
            vcpu,
            thread_priority_handle,
        })
    }

    pub fn run<V>(
        &self,
        vcpu: Option<V>,
        vcpu_init: VcpuInitX86_64,
        vcpus: Arc<Mutex<Vec<Box<dyn VcpuArch>>>>,
        vm: impl VmArch + 'static,
        mut irq_chip: Box<dyn IrqChipArch + 'static>,
        vcpu_count: usize,
        run_rt: bool,
        vcpu_affinity: Option<CpuSet>,
        delay_rt: bool,
        no_smt: bool,
        start_barrier: Arc<Barrier>,
        vcpu_create_barrier: Arc<Barrier>,
        mut io_bus: devices::Bus,
        mut mmio_bus: devices::Bus,
        vm_evt_wrtube: SendTube,
        run_mode_arc: Arc<VcpuRunMode>,
        #[cfg(feature = "stats")] stats: Option<Arc<Mutex<StatisticsCollector>>>,
        host_cpu_topology: bool,
        tsc_offset: Option<u64>,
        force_calibrated_tsc_leaf: bool,
        vcpu_control: mpsc::Receiver<VcpuControl>,
    ) -> Result<JoinHandle<Result<()>>>
    where
        V: VcpuArch + 'static,
    {
        let context = self.clone();
        thread::Builder::new()
            .name(format!("crosvm_vcpu{}", self.cpu_id))
            .spawn(move || {
                // Having a closure returning ExitState guarantees that we
                // send a VmEventType on all code paths after the closure
                // returns.
                let vcpu_fn = || -> Result<ExitState> {
                    let runnable_vcpu = Self::runnable_vcpu(
                        context.cpu_id,
                        vcpu,
                        vcpu_init,
                        &vm,
                        irq_chip.as_mut(),
                        vcpu_count,
                        run_rt && !delay_rt,
                        vcpu_affinity,
                        no_smt,
                        host_cpu_topology,
                        force_calibrated_tsc_leaf,
                    );

                    #[cfg(target_arch = "x86_64")]
                    let cpu_config = CpuConfigX86_64::new(
                        force_calibrated_tsc_leaf,
                        host_cpu_topology,
                        false, /* enable_hwp */
                        no_smt,
                        false, /* itmt */
                        None,  /* hybrid_type */
                    );

                    #[cfg(target_arch = "x86_64")]
                    let cpuid_context = CpuIdContext::new(
                        context.cpu_id,
                        vcpu_count,
                        Some(irq_chip.as_ref()),
                        cpu_config,
                        vm.get_hypervisor()
                            .check_capability(HypervisorCap::CalibratedTscLeafRequired),
                        __cpuid_count,
                        __cpuid,
                    );

                    // The vcpu_create_barrier is supplied from the main thread in order for it to
                    // wait until this thread is done creating its vcpu.
                    vcpu_create_barrier.wait();

                    // Wait for this barrier before continuing forward.
                    start_barrier.wait();

                    let RunnableVcpuInfo {
                        vcpu,
                        thread_priority_handle: _thread_priority_handle,
                    } = runnable_vcpu?;

                    if let Some(offset) = tsc_offset {
                        vcpu.set_tsc_offset(offset).unwrap_or_else(|e| {
                            error!(
                                "Failed to set tsc_offset of {} on vcpu {}: {}",
                                offset, context.cpu_id, e
                            )
                        });
                    }

                    // Clone vcpu so it can be used by the main thread to force a vcpu run to exit.
                    vcpus
                        .lock()
                        .push(Box::new(vcpu.try_clone().expect("Could not clone vcpu!")));

                    mmio_bus.set_access_id(context.cpu_id);
                    io_bus.set_access_id(context.cpu_id);

                    vcpu_loop(
                        &context,
                        vcpu,
                        vm,
                        irq_chip,
                        io_bus,
                        mmio_bus,
                        run_mode_arc,
                        #[cfg(feature = "stats")]
                        stats,
                        #[cfg(target_arch = "x86_64")]
                        cpuid_context,
                        vcpu_control,
                    )
                };

                let exit_state = vcpu_fn().unwrap_or_else(|e| {
                    error!(
                        "vcpu {} run loop exited with error: {:#}",
                        context.cpu_id, e
                    );
                    ExitState::Stop
                });

                let final_event_data = match exit_state {
                    ExitState::Stop => VmEventType::Exit,
                    _ => unreachable!(),
                };
                vm_evt_wrtube
                    .send::<VmEventType>(&final_event_data)
                    .unwrap_or_else(|e| {
                        error!(
                            "failed to send final event {:?} on vcpu {}: {}",
                            final_event_data, context.cpu_id, e
                        )
                    });
                Ok(())
            })
            .exit_context(Exit::SpawnVcpu, "failed to spawn VCPU thread")
    }
}

#[derive(Clone, Debug)]
struct VcpuExitData {
    // Represented by duration since baseline start_instant
    exit_time: Duration,
    exit_result: BaseResult<VcpuExit>,
}

impl Display for VcpuExitData {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "exit result: {:?}", self.exit_result)
    }
}

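/// Watches vcpu threads and reports when the host side takes too long to
/// handle a vcpu exit.
///
/// Usage sketch, mirroring what `run_all_vcpus` does below:
///
/// ```ignore
/// let mut monitor = VcpuStallMonitor::init(run_mode_arc.clone());
/// monitor.add_vcpu_thread(vcpu_run_thread);
/// let join_handle = monitor.run(&exit_evt)?; // spawns the monitor thread
/// ```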
struct VcpuStallMonitor {
    vcpu_run_threads: Vec<VcpuRunThread>,
    run_mode: Arc<VcpuRunMode>,
}

impl VcpuStallMonitor {
    const HOST_STALL_TIMEOUT: Duration = Duration::from_secs(2);
    const VCPU_CHECKUP_INTERVAL: Duration = Duration::from_secs(1);
    const STALL_REPORTING_LIMITER: Duration = Duration::from_secs(10);

    pub fn init(run_mode: Arc<VcpuRunMode>) -> VcpuStallMonitor {
        VcpuStallMonitor {
            vcpu_run_threads: vec![],
            run_mode,
        }
    }

    pub fn add_vcpu_thread(&mut self, thread: VcpuRunThread) {
        self.vcpu_run_threads.push(thread);
    }

    pub fn run(self, exit_event: &Event) -> Result<JoinHandle<Result<()>>> {
        let cloned_exit_event = exit_event
            .try_clone()
            .exit_context(Exit::CloneEvent, "failed to clone event")?;
        thread::Builder::new()
            .name("crosvm_vcpu_stall_monitor".to_string())
            .spawn(move || {
                let ex = Executor::new()?;

                let mut timer = TimerAsync::new(Timer::new()?, &ex)?;
                let mut reset_timer = true;

                let exit_evt_async = EventAsync::new(cloned_exit_event, &ex)?;
                let exit_future = exit_evt_async.next_val();
                pin_mut!(exit_future);
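                // Note: exit_future must outlive the loop below. When select2
                // leaves it unfinished, the still-pinned future is handed back
                // via SelectResult::Pending and reused on the next iteration.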
                'main: loop {
                    if reset_timer {
                        timer.reset_repeating(Self::VCPU_CHECKUP_INTERVAL)?;
                        reset_timer = false;
                    }
                    let timer_future = timer.wait();
                    pin_mut!(timer_future);
                    match ex.run_until(select2(timer_future, exit_future)) {
                        Ok((timer_result, exit_result)) => {
                            match exit_result {
                                SelectResult::Finished(_) => {
                                    info!("vcpu monitor got exit event");
                                    break 'main;
                                }
                                SelectResult::Pending(future) => exit_future = future,
                            }

                            match timer_result {
                                SelectResult::Finished(Err(e)) => {
                                    error!(
                                        "vcpu monitor aborting due to error awaiting future: {}",
                                        e
                                    );
                                    break 'main;
                                }
                                SelectResult::Finished(_) => self.report_any_stalls(),
                                _ => (),
                            }
                        }
                        Err(e) => {
                            error!("vcpu monitor failed to wait on future set: {:?}", e);
                            break 'main;
                        }
                    }

                    // Always ensure the vcpus aren't suspended before continuing to monitor.
                    let mut run_mode_lock = self.run_mode.mtx.lock();
                    loop {
                        match *run_mode_lock {
                            VmRunMode::Running => break,
                            VmRunMode::Suspending | VmRunMode::Breakpoint => {
                                info!("vcpu monitor pausing until end of suspension");
                                run_mode_lock = self.run_mode.cvar.wait(run_mode_lock);
                                reset_timer = true;
                            }
                            VmRunMode::Exiting => {
                                info!("vcpu monitor detected vm exit");
                                break 'main;
                            }
                        }
                    }
                }

                Ok(())
            })
            .exit_context(
                Exit::SpawnVcpuMonitor,
                "failed to spawn VCPU stall monitor thread",
            )
    }

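    // Worked example of the detection rule below (illustrative numbers): with
    // start_instant as t=0, a vcpu whose last exit snapshot is at t=5.0s and
    // whose last_run_time is 4_900ms has not re-entered the guest since that
    // exit, so it is "between runs". If `now` is t=7.5s, time_since_exit is
    // 2.5s, which exceeds HOST_STALL_TIMEOUT (2s), so the stall is reported
    // (unless it exceeds STALL_REPORTING_LIMITER, which suppresses reports
    // for stalls longer than 10s).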
    fn report_any_stalls(&self) {
        // TODO(b/208267651): Add and fire Clearcut events for stalls (and add tests)
        // TODO(b/208267651): Also test guest stalls (vcpu.run() goes too long without exiting)
        let now = Instant::now();
        for vcpu_thread in self.vcpu_run_threads.iter() {
            let monitoring_metadata = vcpu_thread.monitoring_metadata.as_ref().unwrap();
            if let Some(ref exit_snapshot) = monitoring_metadata.last_exit_snapshot.lock().clone() {
                let last_run =
                    Duration::from_millis(monitoring_metadata.last_run_time.load(Ordering::SeqCst));
                if last_run < exit_snapshot.exit_time {
                    // VCPU is between runs
                    let time_since_exit = now.saturating_duration_since(
                        monitoring_metadata.start_instant + exit_snapshot.exit_time,
                    );
                    if time_since_exit > Self::HOST_STALL_TIMEOUT {
                        self.report_stall(vcpu_thread.cpu_id, exit_snapshot, time_since_exit);
                    }
                }
            };
        }
    }

    fn report_stall(&self, cpu_id: usize, exit_data: &VcpuExitData, stall_time: Duration) {
        if stall_time > Self::STALL_REPORTING_LIMITER {
            return;
        }
        // Double check the VM is running; we don't care about stalls during suspension/exit.
        if *self.run_mode.mtx.lock() == VmRunMode::Running {
            let duration_string = format!("{:.1}sec", stall_time.as_secs_f32());
            error!(
                "Host stall of {} on VCPU {} while handling exit: {}",
                duration_string, cpu_id, exit_data,
            );
        }
    }
}

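// Vcpu threads on Windows are interrupted via the hypervisor rather than via
// POSIX signals, so unlike the unix implementation this is a no-op.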
fn setup_vcpu_signal_handler() -> Result<()> {
    Ok(())
}

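/// Spawns one thread per vcpu (plus the stall monitor thread, when vcpu
/// monitoring is enabled), releases them all at once via `start_barrier`, and
/// returns the join handles along with a control channel for each vcpu.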
pub fn run_all_vcpus<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
    vcpus: Vec<Option<Vcpu>>,
    vcpu_boxes: Arc<Mutex<Vec<Box<dyn VcpuArch>>>>,
    guest_os: &RunnableLinuxVm<V, Vcpu>,
    exit_evt: &Event,
    vm_evt_wrtube: &SendTube,
    #[cfg(feature = "stats")] stats: &Option<Arc<Mutex<StatisticsCollector>>>,
    host_cpu_topology: bool,
    run_mode_arc: Arc<VcpuRunMode>,
    tsc_sync_mitigations: TscSyncMitigations,
    force_calibrated_tsc_leaf: bool,
) -> Result<(Vec<JoinHandle<Result<()>>>, Vec<mpsc::Sender<VcpuControl>>)> {
    let mut vcpu_threads = Vec::with_capacity(guest_os.vcpu_count + 1);
    let mut vcpu_control_channels = Vec::with_capacity(guest_os.vcpu_count);
    let start_barrier = Arc::new(Barrier::new(guest_os.vcpu_count + 1));
    let enable_vcpu_monitoring = anti_tamper::enable_vcpu_monitoring();
    setup_vcpu_signal_handler()?;

    let mut stall_monitor =
        enable_vcpu_monitoring.then(|| VcpuStallMonitor::init(run_mode_arc.clone()));
    for (cpu_id, vcpu) in vcpus.into_iter().enumerate() {
        let vcpu_affinity = match guest_os.vcpu_affinity.clone() {
            Some(VcpuAffinity::Global(v)) => Some(v),
            Some(VcpuAffinity::PerVcpu(mut m)) => Some(m.remove(&cpu_id).unwrap_or_default()),
            None => None,
        };

        // TSC sync mitigations may set vcpu affinity and a TSC offset.
        let (vcpu_affinity, tsc_offset): (Option<CpuSet>, Option<u64>) =
            if let Some(mitigation_affinity) = tsc_sync_mitigations.get_vcpu_affinity(cpu_id) {
                if vcpu_affinity.is_none() {
                    (
                        Some(CpuSet::new(mitigation_affinity)),
                        tsc_sync_mitigations.get_vcpu_tsc_offset(cpu_id),
                    )
                } else {
                    error!(
                        "Core affinity {:?} specified via the command line conflicts with, and \
                        overrides, the affinity needed for TSC sync mitigation: {:?}.",
                        vcpu_affinity, mitigation_affinity
                    );
                    (vcpu_affinity, None)
                }
            } else {
                (vcpu_affinity, None)
            };

        let vcpu_init = &guest_os.vcpu_init[cpu_id];
        // The vcpu_create_barrier allows the main thread to delay the spawning of additional
        // vcpu threads until the vcpu thread just spawned has finished creating its vcpu.
        // We currently use this to create one vcpu at a time for all hypervisors, because
        // several hypervisors have problems with parallel vcpu creation:
        // - Windows 11 has a regression which causes a BSOD when multiple vcpus are created
        //   in parallel. See http://b/229635845 for more details.
        // - GHAXM/HAXM cannot create vcpu0 in parallel with other vcpus.
        let vcpu_create_barrier = Arc::new(Barrier::new(2));
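        // A minimal sketch of the two-barrier handshake (hypothetical,
        // standalone code, not part of this function):
        //
        //   let create = Arc::new(Barrier::new(2));
        //   let start = Arc::new(Barrier::new(2));
        //   let (c, s) = (create.clone(), start.clone());
        //   thread::spawn(move || {
        //       /* create the vcpu */
        //       c.wait(); // let the spawner proceed to the next vcpu
        //       s.wait(); // then block until all vcpus are released together
        //   });
        //   create.wait(); // returns only once this vcpu exists
        //   start.wait(); // fires the global start signal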
        let vcpu_run_thread = VcpuRunThread::new(cpu_id, enable_vcpu_monitoring);
        let (vcpu_control_send, vcpu_control_recv) = mpsc::channel();
        vcpu_control_channels.push(vcpu_control_send);
        let join_handle = vcpu_run_thread.run(
            vcpu,
            vcpu_init.clone(),
            vcpu_boxes.clone(),
            guest_os
                .vm
                .try_clone()
                .exit_context(Exit::CloneEvent, "failed to clone vm")?,
            guest_os
                .irq_chip
                .try_box_clone()
                .exit_context(Exit::CloneEvent, "failed to clone event")?,
            guest_os.vcpu_count,
            guest_os.rt_cpus.contains(&cpu_id),
            vcpu_affinity,
            guest_os.delay_rt,
            guest_os.no_smt,
            start_barrier.clone(),
            vcpu_create_barrier.clone(),
            (*guest_os.io_bus).clone(),
            (*guest_os.mmio_bus).clone(),
            vm_evt_wrtube
                .try_clone()
                .exit_context(Exit::CloneTube, "failed to clone tube")?,
            run_mode_arc.clone(),
            #[cfg(feature = "stats")]
            stats.clone(),
            host_cpu_topology,
            tsc_offset,
            force_calibrated_tsc_leaf,
            vcpu_control_recv,
        )?;
        if let Some(ref mut monitor) = stall_monitor {
            monitor.add_vcpu_thread(vcpu_run_thread);
        }

        // Wait until the vcpu is created before we start a new vcpu thread.
        vcpu_create_barrier.wait();

        vcpu_threads.push(join_handle);
    }
    if let Some(monitor) = stall_monitor {
        vcpu_threads.push(monitor.run(exit_evt)?);
    }
    // Now wait on the start barrier to start all threads at the same time.
    start_barrier.wait();
    Ok((vcpu_threads, vcpu_control_channels))
}

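/// Runs a single vcpu until the VM exits: services each vcpu exit (pio, mmio,
/// cpuid, and so on), records monitoring data for the stall monitor, and parks
/// on the run-mode condvar whenever the VM is not in `VmRunMode::Running`.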
fn vcpu_loop<V>(
    context: &VcpuRunThread,
    mut vcpu: V,
    vm: impl VmArch + 'static,
    irq_chip: Box<dyn IrqChipArch + 'static>,
    io_bus: Bus,
    mmio_bus: Bus,
    run_mode_arc: Arc<VcpuRunMode>,
    #[cfg(feature = "stats")] stats: Option<Arc<Mutex<StatisticsCollector>>>,
    #[cfg(target_arch = "x86_64")] cpuid_context: CpuIdContext,
    vcpu_control: mpsc::Receiver<VcpuControl>,
) -> Result<ExitState>
where
    V: VcpuArch + 'static,
{
    #[cfg(feature = "stats")]
    let mut exit_stats = VmExitStatistics::new();

    #[cfg(feature = "stats")]
    {
        mmio_bus.stats.lock().set_enabled(stats.is_some());
        io_bus.stats.lock().set_enabled(stats.is_some());
        exit_stats.set_enabled(stats.is_some());
    }

    let mut save_tsc_offset = true;

    loop {
        let _trace_event = trace_event!(crosvm, "vcpu loop");
        let mut check_vm_shutdown = run_mode_arc.get_mode() != VmRunMode::Running;

        match irq_chip.wait_until_runnable(&vcpu).with_exit_context(
            Exit::WaitUntilRunnable,
            || {
                format!(
                    "error waiting for vcpu {} to become runnable",
                    context.cpu_id
                )
            },
        )? {
            VcpuRunState::Runnable => {}
            VcpuRunState::Interrupted => check_vm_shutdown = true,
        }

        if !check_vm_shutdown {
            let exit = {
                let _trace_event = trace_event!(crosvm, "vcpu::run");
                if let Some(ref monitoring_metadata) = context.monitoring_metadata {
                    monitoring_metadata.last_run_time.store(
                        // Safe conversion because millis will always be < u64::MAX
                        monitoring_metadata
                            .start_instant
                            .elapsed()
                            .as_millis()
                            .try_into()
                            .unwrap(),
                        Ordering::SeqCst,
                    );
                }
                vcpu.run()
            };
            if let Some(ref monitoring_metadata) = context.monitoring_metadata {
                *monitoring_metadata.last_exit_snapshot.lock() = Some(VcpuExitData {
                    exit_time: monitoring_metadata.start_instant.elapsed(),
                    exit_result: exit,
                });
            }

            // Save the TSC offset if we need to (only once, on the first exit).
            if save_tsc_offset {
                if let Ok(offset) = vcpu.get_tsc_offset() {
                    save_vcpu_tsc_offset(offset, context.cpu_id);
                } else {
                    error!("Unable to determine TSC offset");
                }
                save_tsc_offset = false;
            }

            #[cfg(feature = "stats")]
            let start = exit_stats.start_stat();

            match exit {
                Ok(VcpuExit::Io) => {
                    let _trace_event = trace_event!(crosvm, "VcpuExit::Io");
                    vcpu.handle_io(&mut |IoParams { address, operation }| {
                        match operation {
                            IoOperation::Read(data) => {
                                io_bus.read(address, data);
                            }
                            IoOperation::Write(data) => {
                                vm.handle_io_events(IoEventAddress::Pio(address), data)
                                    .unwrap_or_else(|e| error!(
                                        "failed to handle ioevent for pio write to {} on vcpu {}: {}",
                                        address, context.cpu_id, e
                                    ));
                                io_bus.write(address, data);
                            }
                        }
                    }).unwrap_or_else(|e| error!("failed to handle io: {}", e));
                }
                Ok(VcpuExit::Mmio) => {
                    let _trace_event = trace_event!(crosvm, "VcpuExit::Mmio");
                    vcpu.handle_mmio(&mut |IoParams { address, operation }| {
                        match operation {
                            IoOperation::Read(data) => {
                                if !mmio_bus.read(address, data) {
                                    info!(
                                        "mmio read failed: {:x}; trying memory read..",
                                        address
                                    );
                                    vm.get_memory()
                                        .read_exact_at_addr(
                                            data,
                                            vm_memory::GuestAddress(address),
                                        )
                                        .unwrap_or_else(|e| {
                                            error!(
                                                "guest memory read failed at {:x}: {}",
                                                address, e
                                            )
                                        });
                                }
                                Ok(())
                            }
                            IoOperation::Write(data) => {
                                vm.handle_io_events(IoEventAddress::Mmio(address), data)
                                    .unwrap_or_else(|e| error!(
                                        "failed to handle ioevent for mmio write to {} on vcpu {}: {}",
                                        address, context.cpu_id, e
                                    ));
                                if !mmio_bus.write(address, data) {
                                    info!(
                                        "mmio write failed: {:x}; trying memory write..",
                                        address
                                    );
                                    vm.get_memory()
                                        .write_all_at_addr(data, vm_memory::GuestAddress(address))
                                        .unwrap_or_else(|e| error!(
                                            "guest memory write failed at {:x}: {}",
                                            address, e
                                        ));
                                }
                                Ok(())
                            }
                        }
                    }).unwrap_or_else(|e| error!("failed to handle mmio: {}", e));
                }
                Ok(VcpuExit::IoapicEoi { vector }) => {
                    let _trace_event = trace_event!(crosvm, "VcpuExit::IoapicEoi");
                    irq_chip.broadcast_eoi(vector).unwrap_or_else(|e| {
                        error!(
                            "failed to broadcast eoi {} on vcpu {}: {}",
                            vector, context.cpu_id, e
                        )
                    });
                }
                Ok(VcpuExit::IrqWindowOpen) => {
                    let _trace_event = trace_event!(crosvm, "VcpuExit::IrqWindowOpen");
                }
                Ok(VcpuExit::Hlt) => irq_chip.halted(context.cpu_id),

                // VcpuExit::Shutdown is always an error on Windows. HAXM exits with
                // Shutdown only for triple faults and other vcpu panics. WHPX never exits
                // with Shutdown. Normal reboots and shutdowns, like window close, use
                // the vm event tube and VmRunMode::Exiting instead of VcpuExit::Shutdown.
                Ok(VcpuExit::Shutdown(reason)) => {
                    if let Err(e) = reason {
                        metrics::log_descriptor(
                            MetricEventType::VcpuShutdownError,
                            e.get_raw_error_code() as i64,
                        );
                    }
                    bail_exit_code!(Exit::VcpuShutdown, "vcpu shutdown (reason: {:?})", reason)
                }
                Ok(VcpuExit::FailEntry {
                    hardware_entry_failure_reason,
                }) => bail_exit_code!(
                    Exit::VcpuFailEntry,
                    "vcpu hw run failure: {:#x}",
                    hardware_entry_failure_reason,
                ),
                Ok(VcpuExit::SystemEventShutdown) => {
                    bail_exit_code!(Exit::VcpuSystemEvent, "vcpu SystemEventShutdown")
                }
                Ok(VcpuExit::SystemEventReset) => {
                    bail_exit_code!(Exit::VcpuSystemEvent, "vcpu SystemEventReset")
                }
                Ok(VcpuExit::SystemEventCrash) => {
                    bail_exit_code!(Exit::VcpuSystemEvent, "vcpu SystemEventCrash")
                }

                // When we're shutting down (e.g., emulator window gets closed), GVM vmexits
                // with KVM_EXIT_INTR, which vcpu.run maps to VcpuExit::Intr. But KVM_EXIT_INTR
                // can happen during normal operation too, when GVM's timer finds requests
                // pending from the host. So we set check_vm_shutdown, then below check the
                // VmRunMode state to see if we should exit the run loop.
                Ok(VcpuExit::Intr) => {
                    let _trace_event = trace_event!(crosvm, "VcpuExit::Intr");
                    check_vm_shutdown = true
                }
                Ok(VcpuExit::Canceled) => {
                    let _trace_event = trace_event!(crosvm, "VcpuExit::Canceled");
                    check_vm_shutdown = true
                }
                #[cfg(target_arch = "x86_64")]
                Ok(VcpuExit::Cpuid { mut entry }) => {
                    let _trace_event = trace_event!(crosvm, "VcpuExit::Cpuid");
                    // Adjust the results based on crosvm logic.
                    adjust_cpuid(&mut entry, &cpuid_context);

                    // Let the vcpu finish handling the exit.
                    vcpu.handle_cpuid(&entry).unwrap_or_else(|e| {
                        error!(
                            "failed to handle setting cpuid results on cpu {}: {}",
                            context.cpu_id, e
                        )
                    });
                }
                #[cfg(target_arch = "x86_64")]
                Ok(VcpuExit::MsrAccess) => {
                    let _trace_event = trace_event!(crosvm, "VcpuExit::MsrAccess");
                } // MsrAccess handled by hypervisor impl
                Ok(r) => {
                    let _trace_event = trace_event!(crosvm, "VcpuExit::Unexpected");
                    error!("unexpected vcpu.run return value: {:?}", r);
                    check_vm_shutdown = true;
                }
                Err(e) => match e.errno() {
                    ERROR_RETRY_I32 => {}
                    _ => {
                        run_mode_arc.set_and_notify(VmRunMode::Exiting);
                        Err(e).exit_context(Exit::VcpuRunError, "vcpu run error")?;
                    }
                },
            }

            #[cfg(feature = "stats")]
            exit_stats.end_stat(&exit, start);
        }

        if check_vm_shutdown {
            let mut run_mode_lock = run_mode_arc.mtx.lock();
            loop {
                match *run_mode_lock {
                    VmRunMode::Running => {
                        process_vcpu_control_messages(&mut vcpu, *run_mode_lock, &vcpu_control);
                        break;
                    }
                    VmRunMode::Suspending => {
                        if let Err(e) = vcpu.on_suspend() {
                            error!(
                                "failed to signal to hypervisor that vcpu {} is being suspended: {}",
                                context.cpu_id, e
                            );
                        }
                    }
                    VmRunMode::Breakpoint => {}
                    VmRunMode::Exiting => {
                        #[cfg(feature = "stats")]
                        if let Some(stats) = stats {
                            let mut collector = stats.lock();
                            collector.pio_bus_stats.push(io_bus.stats);
                            collector.mmio_bus_stats.push(mmio_bus.stats);
                            collector.vm_exit_stats.push(exit_stats);
                        }
                        return Ok(ExitState::Stop);
                    }
                }

                // For non-running modes, we don't want to process messages until we've completed
                // *all* work for any VmRunMode transition. This is because one control message
                // asks us to inform the requestor of our current state. We want to make sure our
                // state has completely transitioned before we respond to the requestor. If we
                // did this elsewhere, we might respond while in a partial state, which could
                // break features like snapshotting (e.g. by introducing a race condition).
                process_vcpu_control_messages(&mut vcpu, *run_mode_lock, &vcpu_control);

                // Give ownership of our exclusive lock to the condition variable that
                // will block. When the condition variable is notified, `wait` will
                // unblock and return a new exclusive lock.
                run_mode_lock = run_mode_arc.cvar.wait(run_mode_lock);
            }
        }

        irq_chip.inject_interrupts(&vcpu).unwrap_or_else(|e| {
            error!(
                "failed to inject interrupts for vcpu {}: {}",
                context.cpu_id, e
            )
        });
    }
}

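/// Drains and handles all pending control messages for this vcpu without
/// blocking.
///
/// Sender-side sketch (illustrative; assumes `GetStates` carries an
/// `mpsc::Sender<VmRunMode>`, as the handler below implies):
///
/// ```ignore
/// let (state_tx, state_rx) = mpsc::channel();
/// vcpu_control_send.send(VcpuControl::GetStates(state_tx))?;
/// let acked_mode = state_rx.recv()?; // the vcpu has fully entered this mode
/// ```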
fn process_vcpu_control_messages<V>(
    vcpu: &mut V,
    run_mode: VmRunMode,
    vcpu_control: &mpsc::Receiver<VcpuControl>,
) where
    V: VcpuArch + 'static,
{
    let control_messages: Vec<VcpuControl> = vcpu_control.try_iter().collect();

    for msg in control_messages {
        match msg {
            VcpuControl::RunState(_new_mode) => {
                panic!("VCPUs do not handle RunState messages on Windows")
            }
            #[cfg(feature = "gdb")]
            VcpuControl::Debug(_d) => {
                unimplemented!("Windows VCPUs do not support debug yet.");
            }
            VcpuControl::MakeRT => {
                unimplemented!("Windows VCPUs do not support on demand RT.");
            }
            VcpuControl::GetStates(response_chan) => {
                // Wondering why we need this given that the state value is already in an Arc?
                //
                // The control loop generally sets the run mode directly via the Arc; however,
                // it has no way of knowing *when* the VCPU threads have actually acknowledged
                // the new value. By returning the value here, we prove to the control loop
                // that we have accepted the new value and are done with our state change.
                if let Err(e) = response_chan.send(run_mode) {
                    error!("Failed to send GetState: {}", e);
                };
            }
            VcpuControl::Snapshot(snapshot_writer, response_chan) => {
                let resp = vcpu
                    .snapshot()
                    .and_then(|s| snapshot_writer.write_fragment(&format!("vcpu{}", vcpu.id()), &s))
                    .with_context(|| format!("Failed to snapshot Vcpu #{}", vcpu.id()));
                if let Err(e) = response_chan.send(resp) {
                    error!("Failed to send snapshot response: {}", e);
                }
            }
            VcpuControl::Restore(req) => {
                let resp = req
                    .snapshot_reader
                    .read_fragment(&format!("vcpu{}", vcpu.id()))
                    .and_then(|s| vcpu.restore(&s, req.host_tsc_reference_moment))
                    .with_context(|| format!("Failed to restore Vcpu #{}", vcpu.id()));
                if let Err(e) = req.result_sender.send(resp) {
                    error!("Failed to send restore response: {}", e);
                }
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    struct SetupData {
        pub monitor: VcpuStallMonitor,
        pub exit_evt: Event,
    }

    fn set_up_stall_monitor(vcpu_count: usize) -> Result<SetupData> {
        let run_mode = Arc::new(VcpuRunMode::default());
        let mut monitor = VcpuStallMonitor::init(run_mode);

        for id in 0..vcpu_count {
            let new_vcpu = VcpuRunThread::new(id, true /* enable_vcpu_monitoring */);
            monitor.add_vcpu_thread(new_vcpu);
        }

        Ok(SetupData {
            monitor,
            exit_evt: Event::new().expect("Failed to create event"),
        })
    }

    #[test]
    fn stall_monitor_closes_on_exit_evt() -> Result<()> {
        let SetupData { monitor, exit_evt } = set_up_stall_monitor(1)?;

        exit_evt.signal()?;
        let _ = monitor
            .run(&exit_evt)?
            .join()
            .unwrap_or_else(|e| panic!("Thread join failed: {:?}", e));
        Ok(())
    }
}