xref: /aosp_15_r20/external/crosvm/devices/src/virtio/pvclock.rs (revision bb4ee6a4ae7042d18b07a98463b9c8b875e44b39)
1 // Copyright 2022 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 //! Virtio version of a Linux pvclock clocksource.
6 //!
7 //! Driver source is here:
8 //! <https://android.googlesource.com/kernel/common/+/ebaa2c516811825b141de844cee7a38653058ef5/drivers/virtio/virtio_pvclock.c>
9 //!
10 //! # Background
11 //!
12 //! Userland applications often rely on CLOCK_MONOTONIC to be relatively continuous.
13 //! Large jumps can signal problems (e.g., triggering Android watchdogs).
14 //! This assumption breaks down in virtualized environments, where a VM's suspension isn't
15 //! inherently linked to the guest kernel's concept of "suspend".
16 //! Since fixing all userland code is impractical, virtio-pvclock allows the VMM and guest kernel
17 //! to collaborate on emulating the expected clock behavior around suspend/resume.
18 //!
19 //! # How it works
20 //!
21 //! ## Core functions of virtio-pvclock device:
22 //!
23 //! 1. Adjusts hardware clocksource offsets to make the guest clocks appear suspended when the VM is
24 //!    suspended.
25 //!   - This is achieved through the pvclock mechanism implemented in x86 KVM used by kvm-clock.
26 //! 2. Provides the guest kernel with the duration of VM suspension, allowing the guest to adjust
27 //!    its clocks accordingly.
28 //!   - Since the offset between the CLOCK_MONOTONIC and CLOCK_BOOTTIME is maintained by the guest
29 //!     kernel, applying the adjustment is the guest driver's responsibility.
30 //!
31 //! ## Expected guest clock behaviors when virtio-pvclock is enabled
32 //!
33 //! - Monotonicity of CLOCK_MONOTONIC and CLOCK_BOOTTIME is maintained.
34 //! - CLOCK_MONOTONIC will not include the time that passes while crosvm is suspended (from its
35 //!   run mode perspective).
36 //! - CLOCK_BOOTTIME will be adjusted to include the time that passes while crosvm is suspended.
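//!
//! As a concrete example of the above: if crosvm is suspended for 3 seconds while the guest
//! kernel itself never enters suspend, the guest's CLOCK_MONOTONIC does not advance during
//! those 3 seconds, while CLOCK_BOOTTIME is advanced by the guest driver by the device-reported
//! 3_000_000_000 ns.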
37 //!
38 //! # Why it is needed
39 //!
40 //! Because the existing solution (kvm-clock) does not cover some of the expectations we need.
41 //!
42 //! kvm-clock lets the host manage the offsets of CLOCK_MONOTONIC.
43 //! However, it doesn't address the difference between CLOCK_BOOTTIME and CLOCK_MONOTONIC across
44 //! the host's suspend/resume, as it is mainly designed to keep CLOCK_REALTIME in sync.
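//!
//! # Control interface
//!
//! From the VMM side, suspend/resume is driven by sending [vm_control::PvClockCommand] over a
//! `Tube`, mirroring what the tests at the bottom of this file do (a minimal sketch, error
//! handling elided; when the device is active, a `Suspend` is answered with `Ok`):
//!
//! ```ignore
//! host_tube.send(&PvClockCommand::Suspend)?;
//! let resp = host_tube.recv::<PvClockCommandResponse>()?;
//! assert!(matches!(resp, PvClockCommandResponse::Ok));
//! ```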
45 
46 #[cfg(target_arch = "aarch64")]
47 use std::arch::asm;
48 use std::collections::BTreeMap;
49 use std::mem::replace;
50 use std::mem::size_of;
51 use std::sync::atomic::AtomicU64;
52 use std::sync::atomic::Ordering;
53 use std::sync::Arc;
54 use std::time::Duration;
55 
56 use anyhow::anyhow;
57 use anyhow::bail;
58 use anyhow::Context;
59 use anyhow::Result;
60 use base::error;
61 use base::info;
62 use base::warn;
63 use base::AsRawDescriptor;
64 #[cfg(windows)]
65 use base::CloseNotifier;
66 use base::Error;
67 use base::Event;
68 use base::EventToken;
69 use base::RawDescriptor;
70 use base::ReadNotifier;
71 use base::Tube;
72 use base::WaitContext;
73 use base::WorkerThread;
74 use chrono::DateTime;
75 use chrono::Utc;
76 use data_model::Le32;
77 use data_model::Le64;
78 use serde::Deserialize;
79 use serde::Serialize;
80 use vm_control::PvClockCommand;
81 use vm_control::PvClockCommandResponse;
82 use vm_memory::GuestAddress;
83 use vm_memory::GuestMemory;
84 use vm_memory::GuestMemoryError;
85 use zerocopy::AsBytes;
86 use zerocopy::FromBytes;
87 use zerocopy::FromZeroes;
88 
89 use super::copy_config;
90 use super::DeviceType;
91 use super::Interrupt;
92 use super::Queue;
93 use super::VirtioDevice;
94 
95 // Pvclock has one virtio queue: set_pvclock_page
96 const QUEUE_SIZE: u16 = 1;
97 const QUEUE_SIZES: &[u16] = &[QUEUE_SIZE];
98 
99 // pvclock flag bits
100 const PVCLOCK_TSC_STABLE_BIT: u8 = 1;
101 const PVCLOCK_GUEST_STOPPED: u8 = 2;
102 
103 // The feature bitmap for virtio pvclock
104 const VIRTIO_PVCLOCK_F_TSC_STABLE: u64 = 0; // TSC is stable
105 const VIRTIO_PVCLOCK_F_INJECT_SLEEP: u64 = 1; // Inject sleep for suspend
106 const VIRTIO_PVCLOCK_F_CLOCKSOURCE_RATING: u64 = 2; // Use device clocksource rating
107 
108 // Status values for a virtio_pvclock request.
109 const VIRTIO_PVCLOCK_S_OK: u8 = 0;
110 const VIRTIO_PVCLOCK_S_IOERR: u8 = 1;
111 
112 const VIRTIO_PVCLOCK_CLOCKSOURCE_RATING: u32 = 450;
113 
114 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
115 fn read_clock_counter() -> u64 {
116     // SAFETY: rdtsc is unprivileged and has no side effects.
117     unsafe { std::arch::x86_64::_rdtsc() }
118 }
119 
120 #[cfg(target_arch = "aarch64")]
121 fn read_clock_counter() -> u64 {
122     let mut x: u64;
123     // SAFETY: This instruction has no side effect apart from storing the current timestamp
124     //         counter into the specified register.
125     unsafe {
126         asm!("mrs {x}, cntvct_el0",
127             x = out(reg) x,
128         );
129     }
130     x
131 }
132 
133 /// Calculate a (multiplier, shift) pair for scaled math of clocks.
134 /// The values are passed on to `pvclock_scale_delta` in the guest kernel and satisfy the following
135 /// (approximate) equality:
136 /// `n * scaled_hz / base_hz ~= ((n << shift) * multiplier) >> 32`
137 /// The logic here is roughly based on `kvm_get_time_scale` (but simplified as we can use u128).
138 /// # Arguments
139 /// * `scaled_hz` - Frequency to convert to. When dealing with clocksources, this is NSEC_PER_SEC.
140 /// * `base_hz` - Frequency to convert from. When dealing with clocksources, this is the counter
141 ///   frequency.
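/// # Example
/// A worked example of the equality above: for a 3 GHz counter scaled to nanoseconds,
/// `freq_scale_shift(1_000_000_000, 3_000_000_000)` returns `shift = -1` and
/// `mult = 0xAAAA_AAAA` (roughly 2/3 as a 0.32 fixed-point value), so the guest computes
/// `((n >> 1) * 0xAAAA_AAAA) >> 32`, which approximates `n / 3` as desired.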
142 fn freq_scale_shift(scaled_hz: u64, base_hz: u64) -> (u32, i8) {
143     assert!(scaled_hz > 0 && base_hz > 0);
144     // We treat `multiplier` as a 0.32 fixed-point number by folding the >> 32 into its definition.
145     // With this definition, `multiplier` can be calculated as `(scaled_hz / base_hz) >> shift`
146     // with a corresponding `shift`.
147     //
148     // The value of `shift` should satisfy a few constraints:
149     // 1. `multiplier` needs to be < 1.0 due to the representable range of 0.32 fixed-point (maximum
150     //    (2^32-1)/2^32).
151     // 2. `shift` should be minimized because `pvclock_scale_delta` applies `shift` on the 64-bit
152     //    TSC value before extending to 128-bit and large positive shifts reduce the TSC rollover
153     //    time.
154     //
155     // Minimizing `shift` means maximizing `multiplier`. From the < 1.0 constraint, this is
156     // equivalent to having a multiplier within [0.5, 1.0). The logic below picks a multiplier
157     // satisfying that, while updating `shift` accordingly when we double or halve the multiplier.
158     let mut shift = 0;
159     // Convert to u128 so that overflow handling becomes much easier.
160     let mut scaled_hz = scaled_hz as u128;
161     let mut base_hz = base_hz as u128;
162     if scaled_hz >= base_hz {
163         while scaled_hz >= base_hz {
164             // `multiplier` >= 1.0; iteratively scale it down
165             // scaled_hz is at most 64 bits, so after this loop base_hz is at most 65 bits.
166             base_hz <<= 1;
167             shift += 1;
168         }
169     } else {
170         while base_hz > 2 * scaled_hz {
171             // `multiplier` < 0.5; iteratively scale it up
172             // base_hz is at most 64 bits. If the loop condition passes then scaled_hz is at most 63
173             // bits, otherwise at most 64 bits. Post-loop scaled_hz is at most 64 bits.
174             scaled_hz <<= 1;
175             shift -= 1;
176         }
177     }
178     // From above, we know that the values are at most 65 bits. This provides sufficient headroom
179     // for scaled_hz << 32 below.
180     assert!(base_hz < (1u128 << 65) && scaled_hz < (1u128 << 65));
181     let mult: u32 = ((scaled_hz << 32) / base_hz)
182         .try_into()
183         .expect("should not overflow");
184     (mult, shift)
185 }
186 
187 // The config structure exposed to the guest to tell it how much suspend time should be
188 // injected into the guest's CLOCK_BOOTTIME.
189 #[derive(Debug, Clone, Copy, Default, AsBytes, FromZeroes, FromBytes)]
190 #[allow(non_camel_case_types)]
191 #[repr(C)]
192 struct virtio_pvclock_config {
193     // Total duration the VM has been paused while the guest kernel is not in the suspended state
194     // (from the power management and timekeeping perspective).
195     suspend_time_ns: Le64,
196     // Device-suggested rating of the pvclock clocksource.
197     clocksource_rating: Le32,
198     padding: u32,
199 }
200 
201 #[derive(Debug, Clone, Copy, Default, FromZeroes, FromBytes, AsBytes)]
202 #[allow(non_camel_case_types)]
203 #[repr(C)]
204 struct virtio_pvclock_set_pvclock_page_req {
205     // Physical address of pvclock page.
206     pvclock_page_pa: Le64,
207     // Current system time.
208     system_time: Le64,
209     // Current tsc value.
210     tsc_timestamp: Le64,
211     // Status of this request, one of VIRTIO_PVCLOCK_S_*.
212     status: u8,
213     padding: [u8; 7],
214 }
215 
216 // Data structure for interacting with pvclock shared memory.
217 struct PvclockSharedData {
218     mem: GuestMemory,
219     seqlock_addr: GuestAddress,
220     tsc_suspended_delta_addr: GuestAddress,
221     tsc_frequency_multiplier_addr: GuestAddress,
222     tsc_frequency_shift_addr: GuestAddress,
223     flags_addr: GuestAddress,
224 }
225 
226 impl PvclockSharedData {
227     pub fn new(mem: GuestMemory, addr: GuestAddress) -> Self {
228         PvclockSharedData {
229             mem,
230             // The addresses of the various fields that we need to modify are relative to the
231             // base of the pvclock page. For reference, see the pvclock_vcpu_time_info struct.
232             seqlock_addr: addr,
233             tsc_suspended_delta_addr: addr.unchecked_add(8),
234             tsc_frequency_multiplier_addr: addr.unchecked_add(24),
235             tsc_frequency_shift_addr: addr.unchecked_add(28),
236             flags_addr: addr.unchecked_add(29),
237         }
238     }
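
    // For reference, the layout mirrored above is the guest's `pvclock_vcpu_time_info`
    // (32 bytes total; field names follow the kernel definition):
    //
    //     version: u32,            // offset 0, used as the seqlock
    //     pad0: u32,               // offset 4
    //     tsc_timestamp: u64,      // offset 8, reused for the suspended TSC delta
    //     system_time: u64,        // offset 16
    //     tsc_to_system_mul: u32,  // offset 24
    //     tsc_shift: i8,           // offset 28
    //     flags: u8,               // offset 29
    //     pad: [u8; 2],            // offset 30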
239 
240     /// Only the seqlock_addr is needed to re-create this struct at restore
241     /// time, so that is all our snapshot contains.
242     fn snapshot(&self) -> GuestAddress {
243         self.seqlock_addr
244     }
245 
246     /// Set all fields to zero.
247     pub fn zero_fill(&mut self) -> Result<()> {
248         // The pvclock data structure is 32 bytes long, so we write 32 bytes of 0s
249         self.mem
250             .write_all_at_addr(&[0u8; 32], self.seqlock_addr)
251             .context("failed to zero fill the pvclock shared data")
252     }
253 
254     pub fn increment_seqlock(&mut self) -> Result<()> {
255         // TODO (b/264931437): reads and writes using read/write_obj_from/at_addr are not
256         //  guaranteed to be atomic. Although this should not be a problem for the seqlock
257         //  or the other fields in the pvclock shared data (which are protected via the seqlock)
258         //  we might want to update these calls to be as atomic as possible if/when we have
259         //  the ability to do so, just as a general cleanup and to be consistent.
260         let value = self
261             .mem
262             .read_obj_from_addr::<u32>(self.seqlock_addr)
263             .context("failed to read seqlock value")?;
264         self.mem
265             .write_obj_at_addr(value.wrapping_add(1), self.seqlock_addr)
266             .context("failed to write seqlock value")
267     }
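
    // For context, guest readers pair the seqlock with a retry loop roughly like the following
    // (a sketch of the pvclock read protocol; an odd version value means an update is in
    // flight, which is why resume() brackets its writes with two increments):
    //
    //     loop {
    //         let v1 = read_version();
    //         // ... read tsc_timestamp, mul, shift, flags ...
    //         let v2 = read_version();
    //         if v1 == v2 && v1 & 1 == 0 { break; }
    //     }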
268 
269     pub fn set_tsc_suspended_delta(&mut self, delta: u64) -> Result<()> {
270         self.mem
271             .write_obj_at_addr(delta, self.tsc_suspended_delta_addr)
272             .context("failed to write tsc suspended delta")
273     }
274 
275     pub fn set_tsc_frequency(&mut self, frequency: u64) -> Result<()> {
276         let (multiplier, shift): (u32, i8) = freq_scale_shift(1_000_000_000, frequency);
277 
278         self.mem
279             .write_obj_at_addr(multiplier, self.tsc_frequency_multiplier_addr)
280             .context("failed to write tsc frequency multiplier")?;
281         self.mem
282             .write_obj_at_addr(shift, self.tsc_frequency_shift_addr)
283             .context("failed to write tsc frequency shift")
284     }
285 
286     pub fn enable_pvclock_flags(&mut self, flags: u8) -> Result<()> {
287         let value = self
288             .mem
289             .read_obj_from_addr::<u8>(self.flags_addr)
290             .context("failed to read flags")?;
291         self.mem
292             .write_obj_at_addr(value | flags, self.flags_addr)
293             .context("failed to write flags")
294     }
295 }
296 
297 /// Serializable part of the [PvClock] struct which will be used by the virtio_snapshot / restore.
298 #[derive(Serialize, Deserialize)]
299 struct PvClockState {
300     tsc_frequency: u64,
301     /// If the device is sleeping, a [PvClockWorkerSnapshot] that can re-create the worker
302     /// will be stored here. (We can't just store the worker itself as it contains an object
303     /// tree with references to [GuestMemory].)
304     paused_main_worker: Option<PvClockWorkerSnapshot>,
305     /// The total time the vm has been suspended. This is in an `Arc<AtomicU64>` because it's set
306     /// by the PvClockWorker thread but read by PvClock from the mmio bus in the main thread.
307     total_suspend_ns: Arc<AtomicU64>,
308     features: u64,
309     acked_features: u64,
310 }
311 
312 /// An enum to keep the dynamic state of pvclock workers in a type-safe manner.
313 enum PvClockWorkerState {
314     /// Idle means no worker is running.
315     /// This tube is for communicating with this device from the crosvm threads.
316     Idle(Tube),
317     /// A stub worker to respond to pvclock commands when the device is not activated yet.
318     Stub(WorkerThread<StubWorkerReturn>),
319     /// A main worker to respond to pvclock commands while the device is active.
320     Main(WorkerThread<MainWorkerReturn>),
321     /// None is used only for handling transitional state between the states above.
322     None,
323 }
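
// A summary of the legal transitions (the helpers below panic on any other transition):
//
//     Idle -> Stub (start_stub_worker)      Stub -> Idle (stop_stub_worker)
//     Idle -> Main (start_main_worker)      Main -> Idle (stop_main_worker)
//
// switch_to_main_worker is stop_stub_worker + start_main_worker; switch_to_stub_worker is
// stop_main_worker + start_stub_worker. None only exists transiently inside these helpers.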
324 
325 /// A struct that represents the virtio-pvclock device.
326 pub struct PvClock {
327     state: PvClockState,
328     worker_state: PvClockWorkerState,
329 }
330 
331 impl PvClock {
332     pub fn new(base_features: u64, tsc_frequency: u64, suspend_tube: Tube) -> Self {
333         let state = PvClockState {
334             tsc_frequency,
335             paused_main_worker: None,
336             total_suspend_ns: Arc::new(AtomicU64::new(0)),
337             features: base_features
338                 | 1 << VIRTIO_PVCLOCK_F_TSC_STABLE
339                 | 1 << VIRTIO_PVCLOCK_F_INJECT_SLEEP
340                 | 1 << VIRTIO_PVCLOCK_F_CLOCKSOURCE_RATING,
341             acked_features: 0,
342         };
343         PvClock {
344             state,
345             worker_state: PvClockWorkerState::Idle(suspend_tube),
346         }
347     }
348 
349     fn get_config(&self) -> virtio_pvclock_config {
350         virtio_pvclock_config {
351             suspend_time_ns: self.state.total_suspend_ns.load(Ordering::SeqCst).into(),
352             clocksource_rating: VIRTIO_PVCLOCK_CLOCKSOURCE_RATING.into(),
353             padding: 0,
354         }
355     }
356 
357     /// Use switch_to_*_worker instead unless direct use is needed, to keep state transitions consistent.
358     fn start_main_worker(
359         &mut self,
360         interrupt: Interrupt,
361         pvclock_worker: PvClockWorker,
362         mut queues: BTreeMap<usize, Queue>,
363     ) -> anyhow::Result<()> {
364         let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
365         if let PvClockWorkerState::Idle(suspend_tube) = last_state {
366             if queues.len() != QUEUE_SIZES.len() {
367                 self.worker_state = PvClockWorkerState::Idle(suspend_tube);
368                 return Err(anyhow!(
369                     "expected {} queues, got {}",
370                     QUEUE_SIZES.len(),
371                     queues.len()
372                 ));
373             }
374             let set_pvclock_page_queue = queues.remove(&0).unwrap();
375             self.worker_state = PvClockWorkerState::Main(WorkerThread::start(
376                 "virtio_pvclock".to_string(),
377                 move |kill_evt| {
378                     run_main_worker(
379                         pvclock_worker,
380                         set_pvclock_page_queue,
381                         suspend_tube,
382                         interrupt,
383                         kill_evt,
384                     )
385                 },
386             ));
387         } else {
388             panic!("Invalid state transition");
389         }
390         Ok(())
391     }
392 
393     /// Use switch_to_*_worker instead unless direct use is needed, to keep state transitions consistent.
394     fn start_stub_worker(&mut self) {
395         let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
396         self.worker_state = if let PvClockWorkerState::Idle(suspend_tube) = last_state {
397             PvClockWorkerState::Stub(WorkerThread::start(
398                 "virtio_pvclock_stub".to_string(),
399                 move |kill_evt| run_stub_worker(suspend_tube, kill_evt),
400             ))
401         } else {
402             panic!("Invalid state transition");
403         };
404     }
405 
406     /// Use switch_to_*_worker instead unless direct use is needed, to keep state transitions consistent.
407     fn stop_stub_worker(&mut self) {
408         let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
409         self.worker_state = if let PvClockWorkerState::Stub(stub_worker_thread) = last_state {
410             let stub_worker_ret = stub_worker_thread.stop();
411             PvClockWorkerState::Idle(stub_worker_ret.suspend_tube)
412         } else {
413             panic!("Invalid state transition");
414         }
415     }
416 
417     /// Use switch_to_*_worker instead unless direct use is needed, to keep state transitions consistent.
418     fn stop_main_worker(&mut self) {
419         let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
420         if let PvClockWorkerState::Main(main_worker_thread) = last_state {
421             let main_worker_ret = main_worker_thread.stop();
422             self.worker_state = PvClockWorkerState::Idle(main_worker_ret.suspend_tube);
423             let mut queues = BTreeMap::new();
424             queues.insert(0, main_worker_ret.set_pvclock_page_queue);
425             self.state.paused_main_worker = Some(main_worker_ret.worker.into());
426         } else {
427             panic!("Invalid state transition");
428         }
429     }
430 
431     fn switch_to_stub_worker(&mut self) {
432         self.stop_main_worker();
433         self.start_stub_worker();
434     }
435 
436     fn switch_to_main_worker(
437         &mut self,
438         interrupt: Interrupt,
439         pvclock_worker: PvClockWorker,
440         queues: BTreeMap<usize, Queue>,
441     ) -> anyhow::Result<()> {
442         self.stop_stub_worker();
443         self.start_main_worker(interrupt, pvclock_worker, queues)
444     }
445 }
446 
447 /// Represents a moment in time including the TSC counter value at that time.
448 #[derive(Serialize, Deserialize, Clone)]
449 struct PvclockInstant {
450     time: DateTime<Utc>,
451     tsc_value: u64,
452 }
453 
454 /// The unique data retained by [PvClockWorker] which can be used to re-create
455 /// an identical worker.
456 #[derive(Serialize, Deserialize, Clone)]
457 struct PvClockWorkerSnapshot {
458     suspend_time: Option<PvclockInstant>,
459     total_suspend_tsc_delta: u64,
460     pvclock_shared_data_base_address: Option<GuestAddress>,
461 }
462 
463 impl From<PvClockWorker> for PvClockWorkerSnapshot {
464     fn from(worker: PvClockWorker) -> Self {
465         PvClockWorkerSnapshot {
466             suspend_time: worker.suspend_time,
467             total_suspend_tsc_delta: worker.total_suspend_tsc_delta,
468             pvclock_shared_data_base_address: worker
469                 .pvclock_shared_data
470                 .map(|pvclock| pvclock.snapshot()),
471         }
472     }
473 }
474 
475 /// Worker struct for the virtio-pvclock device.
476 ///
477 /// Handles virtio requests, storing information about suspend/resume, adjusting the
478 /// pvclock data in shared memory, and injecting suspend durations via config
479 /// changes.
480 struct PvClockWorker {
481     tsc_frequency: u64,
482     // The moment the last suspend occurred.
483     suspend_time: Option<PvclockInstant>,
484     // The total time the vm has been suspended. This is in an Arc<AtomicU64> because it's set
485     // by the PvClockWorker thread but read by PvClock from the mmio bus in the main thread.
486     total_injected_ns: Arc<AtomicU64>,
487     // The total change in the TSC value over suspensions.
488     total_suspend_tsc_delta: u64,
489     // Pvclock shared data.
490     pvclock_shared_data: Option<PvclockSharedData>,
491     mem: GuestMemory,
492 }
493 
494 impl PvClockWorker {
495     pub fn new(tsc_frequency: u64, total_injected_ns: Arc<AtomicU64>, mem: GuestMemory) -> Self {
496         PvClockWorker {
497             tsc_frequency,
498             suspend_time: None,
499             total_injected_ns,
500             total_suspend_tsc_delta: 0,
501             pvclock_shared_data: None,
502             mem,
503         }
504     }
505 
506     fn from_snapshot(
507         tsc_frequency: u64,
508         total_injected_ns: Arc<AtomicU64>,
509         snap: PvClockWorkerSnapshot,
510         mem: GuestMemory,
511     ) -> Self {
512         PvClockWorker {
513             tsc_frequency,
514             suspend_time: snap.suspend_time,
515             total_injected_ns,
516             total_suspend_tsc_delta: snap.total_suspend_tsc_delta,
517             pvclock_shared_data: snap
518                 .pvclock_shared_data_base_address
519                 .map(|addr| PvclockSharedData::new(mem.clone(), addr)),
520             mem,
521         }
522     }
523 
524     /// Initialize the pvclock for initial boot. We assume that the system time of 0 corresponds
525     /// to the tsc time of 0, so we do not set these. We set the tsc frequency based on the vcpu
526     /// tsc frequency and we set PVCLOCK_TSC_STABLE_BIT in flags to tell the guest that it's
527     /// safe to use vcpu0's pvclock page for use by the vdso. The order of writing the different
528     /// fields doesn't matter at this point, but does matter when updating.
529     fn set_pvclock_page(&mut self, addr: u64) -> Result<()> {
530         if self.pvclock_shared_data.is_some() {
531             return Err(Error::new(libc::EALREADY)).context("pvclock page already set");
532         }
533 
534         let mut shared_data = PvclockSharedData::new(self.mem.clone(), GuestAddress(addr));
535 
536         // set all fields to 0 first
537         shared_data.zero_fill()?;
538 
539         shared_data.set_tsc_frequency(self.tsc_frequency)?;
540         shared_data.enable_pvclock_flags(PVCLOCK_TSC_STABLE_BIT)?;
541 
542         self.pvclock_shared_data = Some(shared_data);
543         Ok(())
544     }
545 
546     pub fn suspend(&mut self) {
547         if self.suspend_time.is_some() {
548             warn!("Suspend time already set, ignoring new suspend time");
549             return;
550         }
551         self.suspend_time = Some(PvclockInstant {
552             time: Utc::now(),
553             tsc_value: read_clock_counter(),
554         });
555     }
556 
557     pub fn resume(&mut self) -> Result<u64> {
558         // First, increment the sequence lock by 1 before writing to the pvclock page.
559         self.increment_pvclock_seqlock()?;
560 
561         // The guest makes sure there are memory barriers in between reads of the seqlock and
562         // other fields; we should likewise place memory barriers in between writes of the seqlock
563         // and writes to other fields.
564         std::sync::atomic::fence(Ordering::SeqCst);
565 
566         // Set the guest_stopped_bit and the tsc suspended delta in the pvclock struct. We only
567         // need to set the bit; the guest will unset it once it has handled the stoppage.
568         // We capture the result here because we want to call increment_pvclock_seqlock
569         // regardless of the result of these calls.
570         let result = self
571             .set_guest_stopped_bit()
572             .and_then(|_| self.set_suspended_time());
573 
574         // The guest makes sure there are memory barriers in between reads of the seqlock and
575         // other fields; we should likewise place memory barriers in between writes of the seqlock
576         // and writes to other fields.
577         std::sync::atomic::fence(Ordering::SeqCst);
578 
579         // Do a final increment once changes are done.
580         self.increment_pvclock_seqlock()?;
581 
582         result
583     }
584 
585     fn get_suspended_duration(suspend_time: &PvclockInstant) -> Duration {
586         match Utc::now().signed_duration_since(suspend_time.time).to_std() {
587             Ok(duration) => duration,
588             Err(e) => {
589                 error!(
590                     "pvclock found suspend time in the future (was the host \
591                     clock adjusted?). Guest boot/realtime clock may now be \
592                     incorrect. Details: {}",
593                     e
594                 );
595                 Duration::ZERO
596             }
597         }
598     }
599 
600     fn set_suspended_time(&mut self) -> Result<u64> {
601         let (this_suspend_duration, this_suspend_tsc_delta) =
602             if let Some(suspend_time) = self.suspend_time.take() {
603                 (
604                     Self::get_suspended_duration(&suspend_time),
605                     // NB: This calculation may wrap around, as TSC can be reset to zero when
606                     // the device has resumed from the "deep" suspend state (it may not happen for
607                     // s2idle cases). It also happens when the tsc value itself wraps.
608                     read_clock_counter().wrapping_sub(suspend_time.tsc_value),
609                 )
610             } else {
611                 return Err(Error::new(libc::ENOTSUP))
612                     .context("Cannot set suspend time because suspend was never called");
613             };
614 
615         // update the total tsc delta during all suspends
616         // NB: This calculation may wrap around, as the total suspend time in TSC ticks can exceed the u64 range.
617         self.total_suspend_tsc_delta = self
618             .total_suspend_tsc_delta
619             .wrapping_add(this_suspend_tsc_delta);
620 
621         // save tsc_suspended_delta to shared memory
622         self.pvclock_shared_data
623             .as_mut()
624             .ok_or(
625                 anyhow::Error::new(Error::new(libc::ENODATA)).context("pvclock page is not set"),
626             )?
627             .set_tsc_suspended_delta(self.total_suspend_tsc_delta)?;
628 
629         info!(
630             "set total suspend tsc delta to {}",
631             self.total_suspend_tsc_delta
632         );
633 
634         // update total suspend ns
635         self.total_injected_ns
636             .fetch_add(this_suspend_duration.as_nanos() as u64, Ordering::SeqCst);
637 
638         Ok(self.total_suspend_tsc_delta)
639     }
640 
641     fn increment_pvclock_seqlock(&mut self) -> Result<()> {
642         self.pvclock_shared_data
643             .as_mut()
644             .ok_or(
645                 anyhow::Error::new(Error::new(libc::ENODATA)).context("pvclock page is not set"),
646             )?
647             .increment_seqlock()
648     }
649 
650     fn set_guest_stopped_bit(&mut self) -> Result<()> {
651         self.pvclock_shared_data
652             .as_mut()
653             .ok_or(
654                 anyhow::Error::new(Error::new(libc::ENODATA)).context("pvclock page is not set"),
655             )?
656             .enable_pvclock_flags(PVCLOCK_GUEST_STOPPED)
657     }
658 }
659 
660 fn pvclock_response_error_from_anyhow(error: anyhow::Error) -> base::Error {
661     for cause in error.chain() {
662         if let Some(e) = cause.downcast_ref::<base::Error>() {
663             return *e;
664         }
665 
666         if let Some(e) = cause.downcast_ref::<GuestMemoryError>() {
667             return match e {
668                 // Two kinds of GuestMemoryError contain base::Error
669                 GuestMemoryError::MemoryAddSealsFailed(e) => *e,
670                 GuestMemoryError::MemoryCreationFailed(e) => *e,
671                 // Otherwise return EINVAL
672                 _ => Error::new(libc::EINVAL),
673             };
674         }
675     }
676     // Unknown base error
677     Error::new(libc::EFAULT)
678 }
679 
680 struct StubWorkerReturn {
681     suspend_tube: Tube,
682 }
683 
684 /// A stub worker to respond to any requests when the device is inactive.
685 fn run_stub_worker(suspend_tube: Tube, kill_evt: Event) -> StubWorkerReturn {
686     #[derive(EventToken, Debug)]
687     enum Token {
688         SomePvClockRequest,
689         Kill,
690     }
691     let wait_ctx: WaitContext<Token> = match WaitContext::build_with(&[
692         (suspend_tube.get_read_notifier(), Token::SomePvClockRequest),
693         // TODO(b/242743502): Can also close on Tube closure for Unix once CloseNotifier is
694         // implemented for Tube.
695         #[cfg(windows)]
696         (suspend_tube.get_close_notifier(), Token::Kill),
697         (&kill_evt, Token::Kill),
698     ]) {
699         Ok(wait_ctx) => wait_ctx,
700         Err(e) => {
701             error!("failed creating WaitContext: {}", e);
702             return StubWorkerReturn { suspend_tube };
703         }
704     };
705     'wait: loop {
706         let events = match wait_ctx.wait() {
707             Ok(v) => v,
708             Err(e) => {
709                 error!("failed polling for events: {}", e);
710                 break;
711             }
712         };
713         for event in events.iter().filter(|e| e.is_readable) {
714             match event.token {
715                 Token::SomePvClockRequest => {
716                     match suspend_tube.recv::<PvClockCommand>() {
717                         Ok(req) => req,
718                         Err(e) => {
719                             error!("failed to receive request: {}", e);
720                             continue;
721                         }
722                     };
723                     if let Err(e) = suspend_tube.send(&PvClockCommandResponse::DeviceInactive) {
724                         error!("error sending PvClockCommandResponse: {}", e);
725                     }
726                 }
727                 Token::Kill => {
728                     break 'wait;
729                 }
730             }
731         }
732     }
733     StubWorkerReturn { suspend_tube }
734 }
735 
736 struct MainWorkerReturn {
737     worker: PvClockWorker,
738     set_pvclock_page_queue: Queue,
739     suspend_tube: Tube,
740 }
741 
742 // TODO(b/237300012): asyncify this device.
743 /// A worker to process PvClockCommand requests
744 fn run_main_worker(
745     mut worker: PvClockWorker,
746     mut set_pvclock_page_queue: Queue,
747     suspend_tube: Tube,
748     interrupt: Interrupt,
749     kill_evt: Event,
750 ) -> MainWorkerReturn {
751     #[derive(EventToken)]
752     enum Token {
753         SetPvClockPageQueue,
754         SuspendResume,
755         InterruptResample,
756         Kill,
757     }
758 
759     let wait_ctx: WaitContext<Token> = match WaitContext::build_with(&[
760         (set_pvclock_page_queue.event(), Token::SetPvClockPageQueue),
761         (suspend_tube.get_read_notifier(), Token::SuspendResume),
762         // TODO(b/242743502): Can also close on Tube closure for Unix once CloseNotifier is
763         // implemented for Tube.
764         #[cfg(windows)]
765         (suspend_tube.get_close_notifier(), Token::Kill),
766         (&kill_evt, Token::Kill),
767     ]) {
768         Ok(pc) => pc,
769         Err(e) => {
770             error!("failed creating WaitContext: {}", e);
771             return MainWorkerReturn {
772                 suspend_tube,
773                 set_pvclock_page_queue,
774                 worker,
775             };
776         }
777     };
778     if let Some(resample_evt) = interrupt.get_resample_evt() {
779         if wait_ctx
780             .add(resample_evt, Token::InterruptResample)
781             .is_err()
782         {
783             error!("failed creating WaitContext");
784             return MainWorkerReturn {
785                 suspend_tube,
786                 set_pvclock_page_queue,
787                 worker,
788             };
789         }
790     }
791 
792     'wait: loop {
793         let events = match wait_ctx.wait() {
794             Ok(v) => v,
795             Err(e) => {
796                 error!("failed polling for events: {}", e);
797                 break;
798             }
799         };
800 
801         for event in events.iter().filter(|e| e.is_readable) {
802             match event.token {
803                 Token::SetPvClockPageQueue => {
804                     let _ = set_pvclock_page_queue.event().wait();
805                     let desc_chain = match set_pvclock_page_queue.pop() {
806                         Some(desc_chain) => desc_chain,
807                         None => {
808                             error!("set_pvclock_page queue was empty");
809                             continue;
810                         }
811                     };
812 
813                     // This device does not follow the virtio spec requirements for device-readable
814                     // vs. device-writable descriptors, so we can't use `Reader`/`Writer`. Pick the
815                     // first descriptor from the chain and assume the whole req structure is
816                     // contained within it.
817                     let desc = desc_chain
818                         .reader
819                         .get_remaining_regions()
820                         .chain(desc_chain.writer.get_remaining_regions())
821                         .next()
822                         .unwrap();
823 
824                     let len = if desc.len < size_of::<virtio_pvclock_set_pvclock_page_req>() {
825                         error!("pvclock descriptor too short");
826                         0
827                     } else {
828                         let addr = GuestAddress(desc.offset);
829                         let mut req: virtio_pvclock_set_pvclock_page_req = match worker
830                             .mem
831                             .read_obj_from_addr(addr)
832                         {
833                             Ok(req) => req,
834                             Err(e) => {
835                                 error!("failed to read request from set_pvclock_page queue: {}", e);
836                                 continue;
837                             }
838                         };
839 
840                         req.status = match worker.set_pvclock_page(req.pvclock_page_pa.into()) {
841                             Err(e) => {
842                                 error!("failed to set pvclock page: {:#}", e);
843                                 VIRTIO_PVCLOCK_S_IOERR
844                             }
845                             Ok(_) => VIRTIO_PVCLOCK_S_OK,
846                         };
847 
848                         if let Err(e) = worker.mem.write_obj_at_addr(req, addr) {
849                             error!("failed to write set_pvclock_page status: {}", e);
850                             continue;
851                         }
852 
853                         desc.len as u32
854                     };
855 
856                     set_pvclock_page_queue.add_used(desc_chain, len);
857                     set_pvclock_page_queue.trigger_interrupt();
858                 }
859                 Token::SuspendResume => {
860                     let req = match suspend_tube.recv::<PvClockCommand>() {
861                         Ok(req) => req,
862                         Err(e) => {
863                             error!("failed to receive request: {}", e);
864                             continue;
865                         }
866                     };
867 
868                     let resp = match req {
869                         PvClockCommand::Suspend => {
870                             worker.suspend();
871                             PvClockCommandResponse::Ok
872                         }
873                         PvClockCommand::Resume => {
874                             match worker.resume() {
875                                 Ok(total_suspended_ticks) => {
876                                     // signal to the driver that the total_suspend_ns has changed
877                                     interrupt.signal_config_changed();
878                                     PvClockCommandResponse::Resumed {
879                                         total_suspended_ticks,
880                                     }
881                                 }
882                                 Err(e) => {
883                                     error!("Failed to resume pvclock: {:#}", e);
884                                     PvClockCommandResponse::Err(pvclock_response_error_from_anyhow(
885                                         e,
886                                     ))
887                                 }
888                             }
889                         }
890                     };
891 
892                     if let Err(e) = suspend_tube.send(&resp) {
893                         error!("error sending PvClockCommandResponse: {}", e);
894                     }
895                 }
896 
897                 Token::InterruptResample => {
898                     interrupt.interrupt_resample();
899                 }
900                 Token::Kill => {
901                     break 'wait;
902                 }
903             }
904         }
905     }
906 
907     MainWorkerReturn {
908         suspend_tube,
909         set_pvclock_page_queue,
910         worker,
911     }
912 }
913 
914 impl VirtioDevice for PvClock {
915     fn keep_rds(&self) -> Vec<RawDescriptor> {
916         if let PvClockWorkerState::Idle(suspend_tube) = &self.worker_state {
917             vec![suspend_tube.as_raw_descriptor()]
918         } else {
919             Vec::new()
920         }
921     }
922 
923     fn device_type(&self) -> DeviceType {
924         DeviceType::Pvclock
925     }
926 
927     fn queue_max_sizes(&self) -> &[u16] {
928         QUEUE_SIZES
929     }
930 
931     fn features(&self) -> u64 {
932         self.state.features
933     }
934 
935     fn ack_features(&mut self, mut value: u64) {
936         if value & !self.features() != 0 {
937             warn!("virtio-pvclock got unknown feature ack {:x}", value);
938             value &= self.features();
939         }
940         self.state.acked_features |= value;
941     }
942 
943     fn read_config(&self, offset: u64, data: &mut [u8]) {
944         copy_config(data, 0, self.get_config().as_bytes(), offset);
945     }
946 
947     fn write_config(&mut self, offset: u64, data: &[u8]) {
948         // Pvclock device doesn't expect a guest write to config
949         warn!(
950             "Unexpected write to virtio-pvclock config at offset {}: {:?}",
951             offset, data
952         );
953     }
954 
955     fn activate(
956         &mut self,
957         mem: GuestMemory,
958         interrupt: Interrupt,
959         queues: BTreeMap<usize, Queue>,
960     ) -> anyhow::Result<()> {
961         let tsc_frequency = self.state.tsc_frequency;
962         let total_suspend_ns = self.state.total_suspend_ns.clone();
963         let worker = PvClockWorker::new(tsc_frequency, total_suspend_ns, mem);
964         self.switch_to_main_worker(interrupt, worker, queues)
965     }
966 
967     fn reset(&mut self) -> Result<()> {
968         self.switch_to_stub_worker();
969         Ok(())
970     }
971 
972     fn virtio_sleep(&mut self) -> anyhow::Result<Option<BTreeMap<usize, Queue>>> {
973         let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
974         match last_state {
975             PvClockWorkerState::Main(main_worker_thread) => {
976                 let main_worker_ret = main_worker_thread.stop();
977                 let mut queues = BTreeMap::new();
978                 queues.insert(0, main_worker_ret.set_pvclock_page_queue);
979                 self.worker_state = PvClockWorkerState::Idle(main_worker_ret.suspend_tube);
980                 self.state.paused_main_worker = Some(main_worker_ret.worker.into());
981                 Ok(Some(queues))
982             }
983             PvClockWorkerState::Stub(stub_worker_thread) => {
984                 let stub_ret = stub_worker_thread.stop();
985                 self.worker_state = PvClockWorkerState::Idle(stub_ret.suspend_tube);
986                 Ok(None)
987             }
988             PvClockWorkerState::Idle(suspend_tube) => {
989                 self.worker_state = PvClockWorkerState::Idle(suspend_tube);
990                 Ok(None)
991             }
992             PvClockWorkerState::None => panic!("invalid state transition"),
993         }
994     }
995 
996     fn virtio_wake(
997         &mut self,
998         queues_state: Option<(GuestMemory, Interrupt, BTreeMap<usize, Queue>)>,
999     ) -> anyhow::Result<()> {
1000         if let Some((mem, interrupt, queues)) = queues_state {
1001             let worker_snap = self
1002                 .state
1003                 .paused_main_worker
1004                 .take()
1005                 .ok_or(anyhow!("a sleeping pvclock must have a paused worker"))?;
1006             let worker = PvClockWorker::from_snapshot(
1007                 self.state.tsc_frequency,
1008                 self.state.total_suspend_ns.clone(),
1009                 worker_snap,
1010                 mem,
1011             );
1012             // Call start_main_worker directly (not switch_to_*_worker) as no worker is running at this point.
1013             self.start_main_worker(interrupt, worker, queues)?;
1014         } else {
1015             // If the device wasn't activated, we should bring up the stub worker since that's
1016             // what is supposed to be running for an un-activated device.
1017             self.start_stub_worker();
1018         }
1019         Ok(())
1020     }
1021 
1022     fn virtio_snapshot(&mut self) -> anyhow::Result<serde_json::Value> {
1023         serde_json::to_value(&self.state).context("failed to serialize PvClockState")
1024     }
1025 
1026     fn virtio_restore(&mut self, data: serde_json::Value) -> anyhow::Result<()> {
1027         let state: PvClockState = serde_json::from_value(data).context("error deserializing")?;
1028         if state.features != self.features() {
1029             bail!(
1030                 "expected virtio_features to match, but they did not. Live: {:?}, snapshot {:?}",
1031                 self.features(),
1032                 state.features,
1033             );
1034         }
1035         // TODO(b/291346907): we assume that the TSC frequency has NOT changed
1036         // since the snapshot was made. Assuming we have not moved machines,
1037         // this is a reasonable assumption. We don't verify the frequency
1038         // because TSC calibration is noisy.
1039         self.state = state;
1040         Ok(())
1041     }
1042 
1043     fn on_device_sandboxed(&mut self) {
1044         self.start_stub_worker();
1045     }
1046 }
1047 
1048 #[cfg(test)]
1049 mod tests {
1050     use super::*;
1051     use crate::virtio::QueueConfig;
1052 
1053     const TEST_QUEUE_SIZE: u16 = 2048;
1054 
1055     fn make_interrupt() -> Interrupt {
1056         Interrupt::new_for_test()
1057     }
1058 
1059     fn create_pvclock_device() -> (Tube, PvClock) {
1060         let (host_tube, device_tube) = Tube::pair().unwrap();
1061         let mut pvclock_device = PvClock::new(0, 1e9 as u64, device_tube);
1062 
1063         // Simulate the device initialization to start the stub thread.
1064         // In the real case, on_device_sandboxed will be called after the device is sandboxed
1065         // (or at some point during the device initialization when the sandbox is disabled) to
1066         // allow devices to use multi-threads (as spawning new threads before sandboxing is
1067         // prohibited because of the minijail's restriction).
1068         pvclock_device.on_device_sandboxed();
1069 
1070         (host_tube, pvclock_device)
1071     }
1072 
1073     fn create_sleeping_device() -> (PvClock, GuestMemory, Tube) {
1074         let (_host_tube, mut pvclock_device) = create_pvclock_device();
1075 
1076         // The queue won't actually be used, so passing one that isn't
1077         // fully configured is fine.
1078         let mut fake_queue = QueueConfig::new(TEST_QUEUE_SIZE, 0);
1079         fake_queue.set_ready(true);
1080         let mem = GuestMemory::new(&[(GuestAddress(0), 0x10000)]).unwrap();
1081         let interrupt = make_interrupt();
1082         pvclock_device
1083             .activate(
1084                 mem.clone(),
1085                 interrupt.clone(),
1086                 BTreeMap::from([(
1087                     0,
1088                     fake_queue
1089                         .activate(&mem, Event::new().unwrap(), interrupt)
1090                         .unwrap(),
1091                 )]),
1092             )
1093             .expect("activate should succeed");
1094         let queues = pvclock_device
1095             .virtio_sleep()
1096             .expect("sleep should succeed")
1097             .expect("sleep should yield queues");
1098         assert_eq!(queues.len(), 1);
1099         assert_eq!(
1100             queues.get(&0).expect("queue must be present").size(),
1101             TEST_QUEUE_SIZE
1102         );
1103         assert!(pvclock_device.state.paused_main_worker.is_some());
1104         (pvclock_device, mem, _host_tube)
1105     }
1106 
1107     fn assert_wake_successful(pvclock_device: &mut PvClock, mem: &GuestMemory) {
1108         // We just create a new queue here, because it isn't actually accessed
1109         // by the device in these tests.
1110         let mut wake_queues = BTreeMap::new();
1111         let mut fake_queue = QueueConfig::new(TEST_QUEUE_SIZE, 0);
1112         let interrupt = make_interrupt();
1113         fake_queue.set_ready(true);
1114         wake_queues.insert(
1115             0,
1116             fake_queue
1117                 .activate(mem, Event::new().unwrap(), interrupt.clone())
1118                 .unwrap(),
1119         );
1120         let queues_state = (mem.clone(), interrupt, wake_queues);
1121         pvclock_device
1122             .virtio_wake(Some(queues_state))
1123             .expect("wake should succeed");
1124         assert!(pvclock_device.state.paused_main_worker.is_none());
1125     }
1126 
1127     #[test]
1128     fn test_command_response_when_inactive() {
1129         let (host_tube, _pvclock_device) = create_pvclock_device();
1130         assert!(host_tube.send(&PvClockCommand::Suspend).is_ok());
1131         let res = host_tube.recv::<PvClockCommandResponse>();
1132         assert!(matches!(res, Ok(PvClockCommandResponse::DeviceInactive)));
1133     }
1134 
1135     #[test]
1136     fn test_sleep_wake_smoke() {
1137         let (mut pvclock_device, mem, _tube) = create_sleeping_device();
1138         assert_wake_successful(&mut pvclock_device, &mem);
1139     }
1140 
1141     #[test]
1142     fn test_save_restore() {
1143         let (mut pvclock_device, mem, _tube) = create_sleeping_device();
1144         let test_suspend_ns = 9999;
1145 
1146         // Store a test value we can look for later in the test to verify
1147         // we're restoring properties.
1148         pvclock_device
1149             .state
1150             .total_suspend_ns
1151             .store(test_suspend_ns, Ordering::SeqCst);
1152 
1153         let snap = pvclock_device.virtio_snapshot().unwrap();
1154         pvclock_device
1155             .state
1156             .total_suspend_ns
1157             .store(0, Ordering::SeqCst);
1158         pvclock_device.virtio_restore(snap).unwrap();
1159         assert_eq!(
1160             pvclock_device.state.total_suspend_ns.load(Ordering::SeqCst),
1161             test_suspend_ns
1162         );
1163 
1164         assert_wake_successful(&mut pvclock_device, &mem);
1165     }
1166 
1167     /// A simplified clone of `pvclock_scale_delta` from the Linux kernel to emulate
1168     /// what the kernel does when converting TSC to ktime.
1169     fn pvclock_scale_tsc(mult: u32, shift: i8, tsc: u64) -> u64 {
1170         let shifted = if shift < 0 {
1171             tsc >> -shift
1172         } else {
1173             tsc << shift
1174         };
1175         let product = shifted as u128 * mult as u128;
1176         (product >> 32).try_into().expect("should not overflow")
1177     }
1178 
1179     /// Helper function for checking the behavior of `freq_scale_shift`.
1180     fn check_freq_scale(f: u64, input: u64) {
1181         // We only test `scaled_hz` = 1GHz because that is the only value used in the code base.
1182         let (mult, shift) = freq_scale_shift(1_000_000_000, f);
1183 
1184         let scaled = pvclock_scale_tsc(mult, shift, input);
1185 
1186         // Use relative error <= 1e-8 as the target. TSC can be huge so this isn't really a super
1187         // accurate target, and our goal is to simply sanity check the math without adding too many
1188         // requirements about rounding errors.
1189         let expected: u64 = (input as u128 * 1_000_000_000u128 / f as u128) as u64;
1190         let expected_lo: u64 = (input as u128 * 999_999_990u128 / f as u128) as u64;
1191         let expected_hi: u64 = (input as u128 * 1_000_000_010u128 / f as u128) as u64;
1192         assert!(
1193             (expected_lo..=expected_hi).contains(&scaled),
1194             "{scaled} should be close to {expected} (base_hz={f}, mult={mult}, shift={shift})"
1195         );
1196     }
1197 
1198     #[test]
1199     fn test_freq_scale_shift_accuracy() {
1200         // Basic check for formula correctness: scaling `scaled_hz` to `base_hz` should yield
1201         // `base_hz`.
1202         for f in (1..=50).map(|n| n * 100_000_000) {
1203             check_freq_scale(f, f);
1204         }
1205     }
1206 
1207     #[test]
1208     fn test_freq_scale_shift_overflow_high_freq() {
1209         // For scale factors < 1.0, test that we can correctly convert the maximum TSC value without
1210         // overflow. We must be able to handle values as large as they realistically can be, as the
1211         // kernel clock breaks if the calculated ktime goes backwards (b/342168920).
1212         for f in (11..=50).map(|n| n * 100_000_000) {
1213             check_freq_scale(f, u64::MAX);
1214         }
1215     }
1216 
1217     #[test]
1218     fn test_freq_scale_shift_overflow_low_freq() {
1219         fn prev_power_of_two(n: u64) -> u64 {
1220             assert_ne!(n, 0);
1221             let highest_bit_set = 63 - n.leading_zeros();
1222             1 << highest_bit_set
1223         }
1224         // Same test as above, but for scale factors >= 1.0. The difference is that for scale
1225         // factors >= 1.0 we first round up the factor, then apply a multiplier (< 1.0). We reflect
1226         // this limitation in our tested maximum value.
1227         for f in (1..=10).map(|n| n * 100_000_000) {
1228             // Truncate the remainder since prev_power_of_two rounds down anyway.
1229             let factor = 1_000_000_000 / f;
1230             // This is like (exp2(floor(log2(factor)) + 1)).
1231             let target = u64::MAX / (prev_power_of_two(factor) << 1);
1232             check_freq_scale(f, target);
1233         }
1234     }
1235 }
1236