// Copyright 2022 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

use std::arch::x86_64::__cpuid;
use std::arch::x86_64::__cpuid_count;
use std::convert::TryInto;
use std::fmt;
use std::fmt::Display;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;
use std::sync::mpsc;
use std::sync::Arc;
use std::sync::Barrier;
use std::thread;
use std::thread::JoinHandle;
use std::time::Duration;
use std::time::Instant;

#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
use aarch64::AArch64 as Arch;
use anyhow::anyhow;
use anyhow::Context;
use anyhow::Result;
use arch::CpuConfigArch;
use arch::CpuSet;
use arch::IrqChipArch;
use arch::LinuxArch;
use arch::RunnableLinuxVm;
use arch::VcpuAffinity;
use arch::VcpuArch;
use arch::VmArch;
use base::error;
use base::info;
use base::set_audio_thread_priority;
use base::set_cpu_affinity;
use base::warn;
use base::Event;
use base::Result as BaseResult;
use base::SafeMultimediaHandle;
use base::SendTube;
use base::Timer;
use base::Tube;
use base::VmEventType;
use cros_async::select2;
use cros_async::EventAsync;
use cros_async::Executor;
use cros_async::SelectResult;
use cros_async::TimerAsync;
use cros_tracing::trace_event;
use crosvm_cli::bail_exit_code;
use crosvm_cli::sys::windows::exit::Exit;
use crosvm_cli::sys::windows::exit::ExitContext;
use crosvm_cli::sys::windows::exit::ExitContextAnyhow;
use devices::tsc::TscSyncMitigations;
use devices::Bus;
use devices::VcpuRunState;
use futures::pin_mut;
#[cfg(feature = "whpx")]
use hypervisor::whpx::WhpxVcpu;
#[cfg(target_arch = "x86_64")]
use hypervisor::CpuConfigX86_64;
use hypervisor::HypervisorCap;
use hypervisor::IoEventAddress;
use hypervisor::IoOperation;
use hypervisor::IoParams;
use hypervisor::VcpuExit;
use hypervisor::VcpuInitX86_64;
use metrics_events::MetricEventType;
use sync::Condvar;
use sync::Mutex;
use vm_control::VcpuControl;
use vm_control::VmRunMode;
use winapi::shared::winerror::ERROR_RETRY;
#[cfg(target_arch = "x86_64")]
use x86_64::cpuid::adjust_cpuid;
#[cfg(target_arch = "x86_64")]
use x86_64::cpuid::CpuIdContext;
#[cfg(target_arch = "x86_64")]
use x86_64::X8664arch as Arch;

#[cfg(feature = "stats")]
use crate::crosvm::sys::windows::stats::StatisticsCollector;
#[cfg(feature = "stats")]
use crate::crosvm::sys::windows::stats::VmExitStatistics;
use crate::sys::windows::save_vcpu_tsc_offset;
use crate::sys::windows::ExitState;

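// `Error::errno` yields an i32, so pre-cast winapi's u32 ERROR_RETRY once for
// the match in the vcpu run loop below.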
const ERROR_RETRY_I32: i32 = ERROR_RETRY as i32;

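/// Holds the VM-wide run mode behind a mutex, with a condition variable so
/// vcpu threads can block until the mode changes.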
#[derive(Default)]
pub struct VcpuRunMode {
    mtx: Mutex<VmRunMode>,
    cvar: Condvar,
}

impl VcpuRunMode {
    pub fn get_mode(&self) -> VmRunMode {
        *self.mtx.lock()
    }

    pub fn set_and_notify(&self, new_mode: VmRunMode) {
        *self.mtx.lock() = new_mode;
        self.cvar.notify_all();
    }
}

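/// A vcpu that has been fully set up by `runnable_vcpu`, along with the
/// multimedia handle (if any) acquired to give its thread real-time priority.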
struct RunnableVcpuInfo<V> {
    vcpu: V,
    thread_priority_handle: Option<SafeMultimediaHandle>,
}

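/// Timing data for one vcpu, shared with `VcpuStallMonitor` so it can detect
/// vcpus that are stuck handling an exit on the host.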
#[derive(Clone, Debug)]
struct VcpuMonitoringMetadata {
    pub start_instant: Instant,
    // Milliseconds since the baseline start_instant
    pub last_run_time: Arc<AtomicU64>,
    pub last_exit_snapshot: Arc<Mutex<Option<VcpuExitData>>>,
}

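/// Identity and optional monitoring state for a single vcpu thread; cloned
/// into the spawned thread as its run context.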
#[derive(Clone, Debug)]
struct VcpuRunThread {
    pub cpu_id: usize,
    pub monitoring_metadata: Option<VcpuMonitoringMetadata>,
}

impl VcpuRunThread {
    pub fn new(cpu_id: usize, enable_vcpu_monitoring: bool) -> VcpuRunThread {
        VcpuRunThread {
            cpu_id,
            monitoring_metadata: enable_vcpu_monitoring.then(|| VcpuMonitoringMetadata {
                start_instant: Instant::now(),
                last_run_time: Arc::new(AtomicU64::new(0)),
                last_exit_snapshot: Arc::new(Mutex::new(None)),
            }),
        }
    }

    /// Perform WHPX-specific vcpu configurations
    #[cfg(feature = "whpx")]
    fn whpx_configure_vcpu(vcpu: &mut dyn VcpuArch, irq_chip: &mut dyn IrqChipArch) {
        // only apply to actual WhpxVcpu instances
        if let Some(whpx_vcpu) = vcpu.downcast_mut::<WhpxVcpu>() {
            // WhpxVcpu instances need to know the TSC and Lapic frequencies to handle Hyper-V MSR
            // reads and writes.
            let tsc_freq = devices::tsc::tsc_frequency()
                .map_err(|e| {
                    error!(
                        "Could not determine TSC frequency, WHPX vcpu will not be configured with \
                        a TSC Frequency: {e}"
                    );
                    e
                })
                .ok();
            whpx_vcpu.set_frequencies(tsc_freq, irq_chip.lapic_frequency());
        }
    }

    // Sets up a vcpu and converts it into a runnable vcpu.
    fn runnable_vcpu<V>(
        cpu_id: usize,
        vcpu: Option<V>,
        vcpu_init: VcpuInitX86_64,
        vm: &impl VmArch,
        irq_chip: &mut dyn IrqChipArch,
        vcpu_count: usize,
        run_rt: bool,
        vcpu_affinity: Option<CpuSet>,
        no_smt: bool,
        host_cpu_topology: bool,
        force_calibrated_tsc_leaf: bool,
    ) -> Result<RunnableVcpuInfo<V>>
    where
        V: VcpuArch,
    {
        let mut vcpu = match vcpu {
            Some(v) => v,
            None => {
                // If vcpu is None, it means this arch/hypervisor requires create_vcpu to be called
                // from the vcpu thread.
                match vm
                    .create_vcpu(cpu_id)
                    .exit_context(Exit::CreateVcpu, "failed to create vcpu")?
                    .downcast::<V>()
                {
                    Ok(v) => *v,
                    Err(_) => panic!("VM created wrong type of VCPU"),
                }
            }
        };

        irq_chip
            .add_vcpu(cpu_id, &vcpu)
            .exit_context(Exit::AddIrqChipVcpu, "failed to add vcpu to irq chip")?;

        if let Some(affinity) = vcpu_affinity {
            if let Err(e) = set_cpu_affinity(affinity) {
                error!("Failed to set CPU affinity: {}", e);
            }
        }

        #[cfg(target_arch = "x86_64")]
        let cpu_config = Some(CpuConfigX86_64::new(
            force_calibrated_tsc_leaf,
            host_cpu_topology,
            false, /* enable_hwp */
            no_smt,
            false, /* itmt */
            None,  /* hybrid_type */
        ));

        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        let cpu_config = None;

        Arch::configure_vcpu(
            vm,
            vm.get_hypervisor(),
            irq_chip,
            &mut vcpu,
            vcpu_init,
            cpu_id,
            vcpu_count,
            cpu_config,
        )
        .exit_context(Exit::ConfigureVcpu, "failed to configure vcpu")?;

        #[cfg(feature = "whpx")]
        Self::whpx_configure_vcpu(&mut vcpu, irq_chip);

        let mut thread_priority_handle = None;
        if run_rt {
            // Until we are multi process on Windows, we can't use the normal thread priority APIs;
            // instead, we use a trick from the audio device which is able to set a thread RT even
            // though the process itself is not RT.
            thread_priority_handle = match set_audio_thread_priority() {
                Ok(hndl) => Some(hndl),
                Err(e) => {
                    warn!("Failed to set vcpu thread to real time priority: {}", e);
                    None
                }
            };
        }

        Ok(RunnableVcpuInfo {
            vcpu,
            thread_priority_handle,
        })
    }

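    /// Spawns a thread that creates and configures the vcpu for `self.cpu_id`,
    /// then runs it until the VM exits. Vcpu creation is serialized with the
    /// main thread via `vcpu_create_barrier`, and the run loop does not start
    /// until `start_barrier` is released.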
    pub fn run<V>(
        &self,
        vcpu: Option<V>,
        vcpu_init: VcpuInitX86_64,
        vcpus: Arc<Mutex<Vec<Box<dyn VcpuArch>>>>,
        vm: impl VmArch + 'static,
        mut irq_chip: Box<dyn IrqChipArch + 'static>,
        vcpu_count: usize,
        run_rt: bool,
        vcpu_affinity: Option<CpuSet>,
        delay_rt: bool,
        no_smt: bool,
        start_barrier: Arc<Barrier>,
        vcpu_create_barrier: Arc<Barrier>,
        mut io_bus: devices::Bus,
        mut mmio_bus: devices::Bus,
        vm_evt_wrtube: SendTube,
        run_mode_arc: Arc<VcpuRunMode>,
        #[cfg(feature = "stats")] stats: Option<Arc<Mutex<StatisticsCollector>>>,
        host_cpu_topology: bool,
        tsc_offset: Option<u64>,
        force_calibrated_tsc_leaf: bool,
        vcpu_control: mpsc::Receiver<VcpuControl>,
    ) -> Result<JoinHandle<Result<()>>>
    where
        V: VcpuArch + 'static,
    {
        let context = self.clone();
        thread::Builder::new()
            .name(format!("crosvm_vcpu{}", self.cpu_id))
            .spawn(move || {
                // Having a closure returning ExitState guarantees that we
                // send a VmEventType on all code paths after the closure
                // returns.
                let vcpu_fn = || -> Result<ExitState> {
                    let runnable_vcpu = Self::runnable_vcpu(
                        context.cpu_id,
                        vcpu,
                        vcpu_init,
                        &vm,
                        irq_chip.as_mut(),
                        vcpu_count,
                        run_rt && !delay_rt,
                        vcpu_affinity,
                        no_smt,
                        host_cpu_topology,
                        force_calibrated_tsc_leaf,
                    );

                    #[cfg(target_arch = "x86_64")]
                    let cpu_config = CpuConfigX86_64::new(
                        force_calibrated_tsc_leaf,
                        host_cpu_topology,
                        false, /* enable_hwp */
                        no_smt,
                        false, /* itmt */
                        None,  /* hybrid_type */
                    );

                    #[cfg(target_arch = "x86_64")]
                    let cpuid_context = CpuIdContext::new(
                        context.cpu_id,
                        vcpu_count,
                        Some(irq_chip.as_ref()),
                        cpu_config,
                        vm.get_hypervisor()
                            .check_capability(HypervisorCap::CalibratedTscLeafRequired),
                        __cpuid_count,
                        __cpuid,
                    );

                    // The vcpu_create_barrier is supplied from the main thread in order for it to
                    // wait until this thread is done creating its vcpu.
                    vcpu_create_barrier.wait();

                    // Wait for this barrier before continuing forward.
                    start_barrier.wait();

                    let RunnableVcpuInfo {
                        vcpu,
                        thread_priority_handle: _thread_priority_handle,
                    } = runnable_vcpu?;

                    if let Some(offset) = tsc_offset {
                        vcpu.set_tsc_offset(offset).unwrap_or_else(|e| {
                            error!(
                                "Failed to set tsc_offset of {} on vcpu {}: {}",
                                offset, context.cpu_id, e
                            )
                        });
                    }

                    // Clone vcpu so it can be used by the main thread to force a vcpu run to exit
                    vcpus
                        .lock()
                        .push(Box::new(vcpu.try_clone().expect("Could not clone vcpu!")));

                    mmio_bus.set_access_id(context.cpu_id);
                    io_bus.set_access_id(context.cpu_id);

                    vcpu_loop(
                        &context,
                        vcpu,
                        vm,
                        irq_chip,
                        io_bus,
                        mmio_bus,
                        run_mode_arc,
                        #[cfg(feature = "stats")]
                        stats,
                        #[cfg(target_arch = "x86_64")]
                        cpuid_context,
                        vcpu_control,
                    )
                };

                let exit_state = vcpu_fn().unwrap_or_else(|e| {
                    error!(
                        "vcpu {} run loop exited with error: {:#}",
                        context.cpu_id, e
                    );
                    ExitState::Stop
                });

                let final_event_data = match exit_state {
                    ExitState::Stop => VmEventType::Exit,
                    _ => unreachable!(),
                };
                vm_evt_wrtube
                    .send::<VmEventType>(&final_event_data)
                    .unwrap_or_else(|e| {
                        error!(
                            "failed to send final event {:?} on vcpu {}: {}",
                            final_event_data, context.cpu_id, e
                        )
                    });
                Ok(())
            })
            .exit_context(Exit::SpawnVcpu, "failed to spawn VCPU thread")
    }
}

#[derive(Clone, Debug)]
struct VcpuExitData {
    // Represented by duration since baseline start_instant
    exit_time: Duration,
    exit_result: BaseResult<VcpuExit>,
}

impl Display for VcpuExitData {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "exit result: {:?}", self.exit_result)
    }
}

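/// Watches all vcpu threads and logs an error when a vcpu appears stalled,
/// i.e. the host has been handling a vcpu exit for longer than
/// `HOST_STALL_TIMEOUT`.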
struct VcpuStallMonitor {
    vcpu_run_threads: Vec<VcpuRunThread>,
    run_mode: Arc<VcpuRunMode>,
}

impl VcpuStallMonitor {
    const HOST_STALL_TIMEOUT: Duration = Duration::from_secs(2);
    const VCPU_CHECKUP_INTERVAL: Duration = Duration::from_secs(1);
    const STALL_REPORTING_LIMITER: Duration = Duration::from_secs(10);

    pub fn init(run_mode: Arc<VcpuRunMode>) -> VcpuStallMonitor {
        VcpuStallMonitor {
            vcpu_run_threads: vec![],
            run_mode,
        }
    }

    pub fn add_vcpu_thread(&mut self, thread: VcpuRunThread) {
        self.vcpu_run_threads.push(thread);
    }

    pub fn run(self, exit_event: &Event) -> Result<JoinHandle<Result<()>>> {
        let cloned_exit_event = exit_event
            .try_clone()
            .exit_context(Exit::CloneEvent, "failed to clone event")?;
        thread::Builder::new()
            .name("crosvm_vcpu_stall_monitor".to_string())
            .spawn(move || {
                let ex = Executor::new()?;

                let mut timer = TimerAsync::new(Timer::new()?, &ex)?;
                let mut reset_timer = true;

                let exit_evt_async = EventAsync::new(cloned_exit_event, &ex)?;
                let exit_future = exit_evt_async.next_val();
                pin_mut!(exit_future);
                'main: loop {
                    if reset_timer {
                        timer.reset_repeating(Self::VCPU_CHECKUP_INTERVAL)?;
                        reset_timer = false;
                    }
                    let timer_future = timer.wait();
                    pin_mut!(timer_future);
                    match ex.run_until(select2(timer_future, exit_future)) {
                        Ok((timer_result, exit_result)) => {
                            match exit_result {
                                SelectResult::Finished(_) => {
                                    info!("vcpu monitor got exit event");
                                    break 'main;
                                }
                                SelectResult::Pending(future) => exit_future = future,
                            }

                            match timer_result {
                                SelectResult::Finished(Err(e)) => {
                                    error!(
                                        "vcpu monitor aborting due to error awaiting future: {}",
                                        e
                                    );
                                    break 'main;
                                }
                                SelectResult::Finished(_) => self.report_any_stalls(),
                                _ => (),
                            }
                        }
                        Err(e) => {
                            error!("vcpu monitor failed to wait on future set: {:?}", e);
                            break 'main;
                        }
                    }

                    // Always ensure the vcpus aren't suspended before continuing to monitor.
                    let mut run_mode_lock = self.run_mode.mtx.lock();
                    loop {
                        match *run_mode_lock {
                            VmRunMode::Running => break,
                            VmRunMode::Suspending | VmRunMode::Breakpoint => {
                                info!("vcpu monitor pausing until end of suspension");
                                run_mode_lock = self.run_mode.cvar.wait(run_mode_lock);
                                reset_timer = true;
                            }
                            VmRunMode::Exiting => {
                                info!("vcpu monitor detected vm exit");
                                break 'main;
                            }
                        }
                    }
                }

                Ok(())
            })
            .exit_context(
                Exit::SpawnVcpuMonitor,
                "failed to spawn VCPU stall monitor thread",
            )
    }

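    /// Checks every monitored vcpu and reports those that have been sitting in
    /// an exit longer than `HOST_STALL_TIMEOUT`.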
    fn report_any_stalls(&self) {
        // TODO(b/208267651): Add and fire Clearcut events for stalls (and add tests)
        // TODO(b/208267651): Also test guest stalls (vcpu.run() goes too long without exiting)
        let now = Instant::now();
        for vcpu_thread in self.vcpu_run_threads.iter() {
            let monitoring_metadata = vcpu_thread.monitoring_metadata.as_ref().unwrap();
            if let Some(ref exit_snapshot) = monitoring_metadata.last_exit_snapshot.lock().clone() {
                let last_run =
                    Duration::from_millis(monitoring_metadata.last_run_time.load(Ordering::SeqCst));
                if last_run < exit_snapshot.exit_time {
                    // VCPU is between runs
                    let time_since_exit = now.saturating_duration_since(
                        monitoring_metadata.start_instant + exit_snapshot.exit_time,
                    );
                    if time_since_exit > Self::HOST_STALL_TIMEOUT {
                        self.report_stall(vcpu_thread.cpu_id, exit_snapshot, time_since_exit);
                    }
                }
            };
        }
    }

    fn report_stall(&self, cpu_id: usize, exit_data: &VcpuExitData, stall_time: Duration) {
        if stall_time > Self::STALL_REPORTING_LIMITER {
            return;
        }
        // Double check the VM is running. We don't care about stalls during suspension/exit.
        if *self.run_mode.mtx.lock() == VmRunMode::Running {
            let duration_string = format!("{:.1}sec", stall_time.as_secs_f32());
            error!(
                "Host stall for {} on VCPU {} exit while handling: {}",
                duration_string, cpu_id, exit_data,
            );
        }
    }
}

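// Signal handling is a Unix concept; on Windows there is nothing to set up for
// vcpu threads, so this is a no-op.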
fn setup_vcpu_signal_handler() -> Result<()> {
    Ok(())
}

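/// Spawns a thread for each vcpu (plus the stall monitor thread when vcpu
/// monitoring is enabled). Vcpus are created one at a time, but their run
/// loops are all released at once via `start_barrier` just before this
/// function returns.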
pub fn run_all_vcpus<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
    vcpus: Vec<Option<Vcpu>>,
    vcpu_boxes: Arc<Mutex<Vec<Box<dyn VcpuArch>>>>,
    guest_os: &RunnableLinuxVm<V, Vcpu>,
    exit_evt: &Event,
    vm_evt_wrtube: &SendTube,
    #[cfg(feature = "stats")] stats: &Option<Arc<Mutex<StatisticsCollector>>>,
    host_cpu_topology: bool,
    run_mode_arc: Arc<VcpuRunMode>,
    tsc_sync_mitigations: TscSyncMitigations,
    force_calibrated_tsc_leaf: bool,
) -> Result<(Vec<JoinHandle<Result<()>>>, Vec<mpsc::Sender<VcpuControl>>)> {
    let mut vcpu_threads = Vec::with_capacity(guest_os.vcpu_count + 1);
    let mut vcpu_control_channels = Vec::with_capacity(guest_os.vcpu_count);
    let start_barrier = Arc::new(Barrier::new(guest_os.vcpu_count + 1));
    let enable_vcpu_monitoring = anti_tamper::enable_vcpu_monitoring();
    setup_vcpu_signal_handler()?;

    let mut stall_monitor =
        enable_vcpu_monitoring.then(|| VcpuStallMonitor::init(run_mode_arc.clone()));
    for (cpu_id, vcpu) in vcpus.into_iter().enumerate() {
        let vcpu_affinity = match guest_os.vcpu_affinity.clone() {
            Some(VcpuAffinity::Global(v)) => Some(v),
            Some(VcpuAffinity::PerVcpu(mut m)) => Some(m.remove(&cpu_id).unwrap_or_default()),
            None => None,
        };

        // TSC sync mitigations may set vcpu affinity and set a TSC offset
        let (vcpu_affinity, tsc_offset): (Option<CpuSet>, Option<u64>) =
            if let Some(mitigation_affinity) = tsc_sync_mitigations.get_vcpu_affinity(cpu_id) {
                if vcpu_affinity.is_none() {
                    (
                        Some(CpuSet::new(mitigation_affinity)),
                        tsc_sync_mitigations.get_vcpu_tsc_offset(cpu_id),
                    )
                } else {
                    error!(
                        "Core affinity {:?} specified via commandline conflicts and overrides \
                        affinity needed for TSC sync mitigation: {:?}.",
                        vcpu_affinity, mitigation_affinity
                    );
                    (vcpu_affinity, None)
                }
            } else {
                (vcpu_affinity, None)
            };

        let vcpu_init = &guest_os.vcpu_init[cpu_id];
        // The vcpu_create_barrier allows the main thread to delay the spawning of additional
        // vcpu threads until a single spawned vcpu thread has finished creating its vcpu.
        // We currently use this to allow creation of 1 vcpu at a time for all hypervisors.
        // There are issues with parallel creation on multiple hypervisors:
        // - Windows 11 has a regression which causes a BSOD when multiple vcpus are created
        //   in parallel. See http://b/229635845 for more details.
        // - GHAXM/HAXM cannot create vcpu0 in parallel with other vcpus.
        let vcpu_create_barrier = Arc::new(Barrier::new(2));
        let vcpu_run_thread = VcpuRunThread::new(cpu_id, enable_vcpu_monitoring);
        let (vcpu_control_send, vcpu_control_recv) = mpsc::channel();
        vcpu_control_channels.push(vcpu_control_send);
        let join_handle = vcpu_run_thread.run(
            vcpu,
            vcpu_init.clone(),
            vcpu_boxes.clone(),
            guest_os
                .vm
                .try_clone()
                .exit_context(Exit::CloneEvent, "failed to clone vm")?,
            guest_os
                .irq_chip
                .try_box_clone()
                .exit_context(Exit::CloneEvent, "failed to clone event")?,
            guest_os.vcpu_count,
            guest_os.rt_cpus.contains(&cpu_id),
            vcpu_affinity,
            guest_os.delay_rt,
            guest_os.no_smt,
            start_barrier.clone(),
            vcpu_create_barrier.clone(),
            (*guest_os.io_bus).clone(),
            (*guest_os.mmio_bus).clone(),
            vm_evt_wrtube
                .try_clone()
                .exit_context(Exit::CloneTube, "failed to clone tube")?,
            run_mode_arc.clone(),
            #[cfg(feature = "stats")]
            stats.clone(),
            host_cpu_topology,
            tsc_offset,
            force_calibrated_tsc_leaf,
            vcpu_control_recv,
        )?;
        if let Some(ref mut monitor) = stall_monitor {
            monitor.add_vcpu_thread(vcpu_run_thread);
        }

        // Wait until the vcpu is created before we start a new vcpu thread
        vcpu_create_barrier.wait();

        vcpu_threads.push(join_handle);
    }
    if let Some(monitor) = stall_monitor {
        vcpu_threads.push(monitor.run(exit_evt)?);
    }
    // Now wait on the start barrier to start all threads at the same time.
    start_barrier.wait();
    Ok((vcpu_threads, vcpu_control_channels))
}

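/// The run loop of a single vcpu thread: runs the vcpu, dispatches its exits
/// to the I/O and MMIO buses, and services run-mode changes and control
/// messages until the VM enters `VmRunMode::Exiting`.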
fn vcpu_loop<V>(
    context: &VcpuRunThread,
    mut vcpu: V,
    vm: impl VmArch + 'static,
    irq_chip: Box<dyn IrqChipArch + 'static>,
    io_bus: Bus,
    mmio_bus: Bus,
    run_mode_arc: Arc<VcpuRunMode>,
    #[cfg(feature = "stats")] stats: Option<Arc<Mutex<StatisticsCollector>>>,
    #[cfg(target_arch = "x86_64")] cpuid_context: CpuIdContext,
    vcpu_control: mpsc::Receiver<VcpuControl>,
) -> Result<ExitState>
where
    V: VcpuArch + 'static,
{
    #[cfg(feature = "stats")]
    let mut exit_stats = VmExitStatistics::new();

    #[cfg(feature = "stats")]
    {
        mmio_bus.stats.lock().set_enabled(stats.is_some());
        io_bus.stats.lock().set_enabled(stats.is_some());
        exit_stats.set_enabled(stats.is_some());
    }

    let mut save_tsc_offset = true;

    loop {
        let _trace_event = trace_event!(crosvm, "vcpu loop");
        let mut check_vm_shutdown = run_mode_arc.get_mode() != VmRunMode::Running;

        match irq_chip.wait_until_runnable(&vcpu).with_exit_context(
            Exit::WaitUntilRunnable,
            || {
                format!(
                    "error waiting for vcpu {} to become runnable",
                    context.cpu_id
                )
            },
        )? {
            VcpuRunState::Runnable => {}
            VcpuRunState::Interrupted => check_vm_shutdown = true,
        }

        if !check_vm_shutdown {
            let exit = {
                let _trace_event = trace_event!(crosvm, "vcpu::run");
                if let Some(ref monitoring_metadata) = context.monitoring_metadata {
                    monitoring_metadata.last_run_time.store(
                        // Safe conversion: the elapsed millisecond count will always fit in a u64.
                        monitoring_metadata
                            .start_instant
                            .elapsed()
                            .as_millis()
                            .try_into()
                            .unwrap(),
                        Ordering::SeqCst,
                    );
                }
                vcpu.run()
            };
            if let Some(ref monitoring_metadata) = context.monitoring_metadata {
                *monitoring_metadata.last_exit_snapshot.lock() = Some(VcpuExitData {
                    exit_time: monitoring_metadata.start_instant.elapsed(),
                    exit_result: exit,
                });
            }

            // save the tsc offset if we need to
            if save_tsc_offset {
                if let Ok(offset) = vcpu.get_tsc_offset() {
                    save_vcpu_tsc_offset(offset, context.cpu_id);
                } else {
                    error!("Unable to determine TSC offset");
                }
                save_tsc_offset = false;
            }

            #[cfg(feature = "stats")]
            let start = exit_stats.start_stat();

            match exit {
                Ok(VcpuExit::Io) => {
                    let _trace_event = trace_event!(crosvm, "VcpuExit::Io");
                    vcpu.handle_io(&mut |IoParams { address, operation }| {
                        match operation {
                            IoOperation::Read(data) => {
                                io_bus.read(address, data);
                            }
                            IoOperation::Write(data) => {
                                vm.handle_io_events(IoEventAddress::Pio(address), data)
                                    .unwrap_or_else(|e| error!(
                                        "failed to handle ioevent for pio write to {} on vcpu {}: {}",
                                        address, context.cpu_id, e
                                    ));
                                io_bus.write(address, data);
                            }
                        }
                    }).unwrap_or_else(|e| error!("failed to handle io: {}", e));
                }
                Ok(VcpuExit::Mmio) => {
                    let _trace_event = trace_event!(crosvm, "VcpuExit::Mmio");
                    vcpu.handle_mmio(&mut |IoParams { address, operation }| {
                        match operation {
                            IoOperation::Read(data) => {
                                if !mmio_bus.read(address, data) {
                                    info!(
                                        "mmio read failed: {:x}; trying memory read..",
                                        address
                                    );
                                    vm.get_memory()
                                        .read_exact_at_addr(
                                            data,
                                            vm_memory::GuestAddress(address),
                                        )
                                        .unwrap_or_else(|e| {
                                            error!(
                                                "guest memory read failed at {:x}: {}",
                                                address, e
                                            )
                                        });
                                }
                                Ok(())
                            }
                            IoOperation::Write(data) => {
                                vm.handle_io_events(IoEventAddress::Mmio(address), data)
                                    .unwrap_or_else(|e| error!(
                                        "failed to handle ioevent for mmio write to {} on vcpu {}: {}",
                                        address, context.cpu_id, e
                                    ));
                                if !mmio_bus.write(address, data) {
                                    info!(
                                        "mmio write failed: {:x}; trying memory write..",
                                        address
                                    );
                                    vm.get_memory()
                                        .write_all_at_addr(data, vm_memory::GuestAddress(address))
                                        .unwrap_or_else(|e| error!(
                                            "guest memory write failed at {:x}: {}",
                                            address, e
                                        ));
                                }
                                Ok(())
                            }
                        }
                    }).unwrap_or_else(|e| error!("failed to handle mmio: {}", e));
                }
                Ok(VcpuExit::IoapicEoi { vector }) => {
                    let _trace_event = trace_event!(crosvm, "VcpuExit::IoapicEoi");
                    irq_chip.broadcast_eoi(vector).unwrap_or_else(|e| {
                        error!(
                            "failed to broadcast eoi {} on vcpu {}: {}",
                            vector, context.cpu_id, e
                        )
                    });
                }
                Ok(VcpuExit::IrqWindowOpen) => {
                    let _trace_event = trace_event!(crosvm, "VcpuExit::IrqWindowOpen");
                }
                Ok(VcpuExit::Hlt) => irq_chip.halted(context.cpu_id),

                // VcpuExit::Shutdown is always an error on Windows.  HAXM exits with
                // Shutdown only for triple faults and other vcpu panics.  WHPX never exits
                // with Shutdown.  Normal reboots and shutdowns, like window close, use
                // the vm event tube and VmRunMode::Exiting instead of VcpuExit::Shutdown.
                Ok(VcpuExit::Shutdown(reason)) => {
                    if let Err(e) = reason {
                        metrics::log_descriptor(
                            MetricEventType::VcpuShutdownError,
                            e.get_raw_error_code() as i64,
                        );
                    }
                    bail_exit_code!(Exit::VcpuShutdown, "vcpu shutdown (reason: {:?})", reason)
                }
                Ok(VcpuExit::FailEntry {
                    hardware_entry_failure_reason,
                }) => bail_exit_code!(
                    Exit::VcpuFailEntry,
                    "vcpu hw run failure: {:#x}",
                    hardware_entry_failure_reason,
                ),
                Ok(VcpuExit::SystemEventShutdown) => {
                    bail_exit_code!(Exit::VcpuSystemEvent, "vcpu SystemEventShutdown")
                }
                Ok(VcpuExit::SystemEventReset) => {
                    bail_exit_code!(Exit::VcpuSystemEvent, "vcpu SystemEventReset")
                }
                Ok(VcpuExit::SystemEventCrash) => {
                    bail_exit_code!(Exit::VcpuSystemEvent, "vcpu SystemEventCrash")
                }

                // When we're shutting down (e.g., emulator window gets closed), GVM vmexits
                // with KVM_EXIT_INTR, which vcpu.run maps to VcpuExit::Intr.  But KVM_EXIT_INTR
                // can happen during normal operation too, when GVM's timer finds requests
                // pending from the host.  So we set check_vm_shutdown, then below check the
                // VmRunMode state to see if we should exit the run loop.
                Ok(VcpuExit::Intr) => {
                    let _trace_event = trace_event!(crosvm, "VcpuExit::Intr");
                    check_vm_shutdown = true
                }
                Ok(VcpuExit::Canceled) => {
                    let _trace_event = trace_event!(crosvm, "VcpuExit::Canceled");
                    check_vm_shutdown = true
                }
                #[cfg(target_arch = "x86_64")]
                Ok(VcpuExit::Cpuid { mut entry }) => {
                    let _trace_event = trace_event!(crosvm, "VcpuExit::Cpuid");
                    // adjust the results based on crosvm logic
                    adjust_cpuid(&mut entry, &cpuid_context);

                    // let the vcpu finish handling the exit
                    vcpu.handle_cpuid(&entry).unwrap_or_else(|e| {
                        error!(
                            "failed to handle setting cpuid results on cpu {}: {}",
                            context.cpu_id, e
                        )
                    });
                }
                #[cfg(target_arch = "x86_64")]
                Ok(VcpuExit::MsrAccess) => {
                    let _trace_event = trace_event!(crosvm, "VcpuExit::MsrAccess");
                } // MsrAccess handled by hypervisor impl
                Ok(r) => {
                    let _trace_event = trace_event!(crosvm, "VcpuExit::Unexpected");
                    error!("unexpected vcpu.run return value: {:?}", r);
                    check_vm_shutdown = true;
                }
                Err(e) => match e.errno() {
                    ERROR_RETRY_I32 => {}
                    _ => {
                        run_mode_arc.set_and_notify(VmRunMode::Exiting);
                        Err(e).exit_context(Exit::VcpuRunError, "vcpu run error")?;
                    }
                },
            }

            #[cfg(feature = "stats")]
            exit_stats.end_stat(&exit, start);
        }

        if check_vm_shutdown {
            let mut run_mode_lock = run_mode_arc.mtx.lock();
            loop {
                match *run_mode_lock {
                    VmRunMode::Running => {
                        process_vcpu_control_messages(&mut vcpu, *run_mode_lock, &vcpu_control);
                        break;
                    }
                    VmRunMode::Suspending => {
                        if let Err(e) = vcpu.on_suspend() {
                            error!(
                                "failed to signal to hypervisor that vcpu {} is being suspended: {}",
                                context.cpu_id, e
                            );
                        }
                    }
                    VmRunMode::Breakpoint => {}
                    VmRunMode::Exiting => {
                        #[cfg(feature = "stats")]
                        if let Some(stats) = stats {
                            let mut collector = stats.lock();
                            collector.pio_bus_stats.push(io_bus.stats);
                            collector.mmio_bus_stats.push(mmio_bus.stats);
                            collector.vm_exit_stats.push(exit_stats);
                        }
                        return Ok(ExitState::Stop);
                    }
                }

                // For non-running modes, we don't want to process messages until we've completed
                // *all* work for any VmRunMode transition. This is because one control message
                // asks us to inform the requestor of our current state. We want to make sure
                // our state has completely transitioned before we respond to the requestor. If
                // we do this elsewhere, we might respond while in a partial state, which could
                // break features like snapshotting (e.g. by introducing a race condition).
                process_vcpu_control_messages(&mut vcpu, *run_mode_lock, &vcpu_control);

                // Give ownership of our exclusive lock to the condition variable that
                // will block. When the condition variable is notified, `wait` will
                // unblock and return a new exclusive lock.
                run_mode_lock = run_mode_arc.cvar.wait(run_mode_lock);
            }
        }

        irq_chip.inject_interrupts(&vcpu).unwrap_or_else(|e| {
            error!(
                "failed to inject interrupts for vcpu {}: {}",
                context.cpu_id, e
            )
        });
    }
}

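/// Drains and handles all control messages pending for this vcpu. `run_mode`
/// is the mode the vcpu has already transitioned to; it is echoed back for
/// `VcpuControl::GetStates` so the requestor knows the transition is complete.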
fn process_vcpu_control_messages<V>(
    vcpu: &mut V,
    run_mode: VmRunMode,
    vcpu_control: &mpsc::Receiver<VcpuControl>,
) where
    V: VcpuArch + 'static,
{
    let control_messages: Vec<VcpuControl> = vcpu_control.try_iter().collect();

    for msg in control_messages {
        match msg {
            VcpuControl::RunState(_) => {
                panic!("VCPUs do not handle RunState messages on Windows")
            }
            #[cfg(feature = "gdb")]
            VcpuControl::Debug(_) => {
                unimplemented!("Windows VCPUs do not support debug yet.");
            }
            VcpuControl::MakeRT => {
                unimplemented!("Windows VCPUs do not support on demand RT.");
            }
            VcpuControl::GetStates(response_chan) => {
                // Wondering why we need this given that the state value is already in an Arc?
                //
                // The control loop generally sets the run mode directly via the Arc; however,
                // it has no way of knowing *when* the VCPU threads have actually acknowledged
                // the new value. By returning the value here, we prove to the control loop
                // that we have accepted the new value and are done with our state change.
                if let Err(e) = response_chan.send(run_mode) {
                    error!("Failed to send GetState: {}", e);
                };
            }
            VcpuControl::Snapshot(snapshot_writer, response_chan) => {
                let resp = vcpu
                    .snapshot()
                    .and_then(|s| snapshot_writer.write_fragment(&format!("vcpu{}", vcpu.id()), &s))
                    .with_context(|| format!("Failed to snapshot Vcpu #{}", vcpu.id()));
                if let Err(e) = response_chan.send(resp) {
                    error!("Failed to send snapshot response: {}", e);
                }
            }
            VcpuControl::Restore(req) => {
                let resp = req
                    .snapshot_reader
                    .read_fragment(&format!("vcpu{}", vcpu.id()))
                    .and_then(|s| vcpu.restore(&s, req.host_tsc_reference_moment))
                    .with_context(|| format!("Failed to restore Vcpu #{}", vcpu.id()));
                if let Err(e) = req.result_sender.send(resp) {
                    error!("Failed to send restore response: {}", e);
                }
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    struct SetupData {
        pub monitor: VcpuStallMonitor,
        pub exit_evt: Event,
    }

    fn set_up_stall_monitor(vcpu_count: usize) -> Result<SetupData> {
        let run_mode = Arc::new(VcpuRunMode::default());
        let mut monitor = VcpuStallMonitor::init(run_mode);

        for id in 0..vcpu_count {
            let new_vcpu = VcpuRunThread::new(id, true /* enable_vcpu_monitoring */);
            monitor.add_vcpu_thread(new_vcpu);
        }

        Ok(SetupData {
            monitor,
            exit_evt: Event::new().expect("Failed to create event"),
        })
    }

    #[test]
    fn stall_monitor_closes_on_exit_evt() -> Result<()> {
        let SetupData { monitor, exit_evt } = set_up_stall_monitor(1)?;

        exit_evt.signal()?;
        let _ = monitor
            .run(&exit_evt)?
            .join()
            .unwrap_or_else(|e| panic!("Thread join failed: {:?}", e));
        Ok(())
    }
}
1035