// Copyright 2022 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

//! Virtio version of a linux pvclock clocksource.
//!
//! Driver source is here:
//! <https://android.googlesource.com/kernel/common/+/ebaa2c516811825b141de844cee7a38653058ef5/drivers/virtio/virtio_pvclock.c>
//!
//! # Background
//!
//! Userland applications often rely on CLOCK_MONOTONIC to be relatively continuous.
//! Large jumps can signal problems (e.g., triggering Android watchdogs).
//! This assumption breaks down in virtualized environments, where a VM's suspension isn't
//! inherently linked to the guest kernel's concept of "suspend".
//! Since fixing all userland code is impractical, virtio-pvclock allows the VMM and guest kernel
//! to collaborate on emulating the expected clock behavior around suspend/resume.
//!
//! # How it works
//!
//! ## Core functions of the virtio-pvclock device:
//!
//! 1. Adjusts hardware clocksource offsets to make the guest clocks appear suspended when the VM
//!    is suspended.
//!    - This is achieved through the pvclock mechanism implemented in x86 KVM used by kvm-clock.
//! 2. Provides the guest kernel with the duration of VM suspension, allowing the guest to adjust
//!    its clocks accordingly.
//!    - Since the offset between CLOCK_MONOTONIC and CLOCK_BOOTTIME is maintained by the guest
//!      kernel, applying the adjustment is the guest driver's responsibility.
//!
//! ## Expected guest clock behaviors when virtio-pvclock is enabled
//!
//! - Monotonicity of CLOCK_MONOTONIC and CLOCK_BOOTTIME is maintained.
//! - CLOCK_MONOTONIC will not include the time that passes while crosvm is suspended, from its
//!   run mode perspective.
//! - CLOCK_BOOTTIME will be adjusted to include the time that passes while crosvm is suspended.
//!
//! # Why it is needed
//!
//! Because the existing solutions do not cover some of the behaviors we need.
//!
//! kvm-clock lets the host manage the offsets of CLOCK_MONOTONIC.
//! However, it doesn't address the difference between CLOCK_BOOTTIME and CLOCK_MONOTONIC across
//! the host's suspend/resume, as it is mainly designed to keep CLOCK_REALTIME in sync.
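//!
//! # Example (VMM side)
//!
//! An illustrative sketch of how the rest of crosvm drives this device over its suspend `Tube`;
//! the exact call sites live in vm_control and are not shown here, and the snippet assumes a
//! hypothetical `host_tube` holding the other end of the tube passed to [`PvClock::new`].
//!
//! ```ignore
//! // Hypothetical host-side usage (see the tests in this file for a similar flow).
//! host_tube.send(&PvClockCommand::Suspend)?;
//! let _ = host_tube.recv::<PvClockCommandResponse>()?; // expect `Ok` (or `DeviceInactive`)
//! // ... the VM stays suspended for a while ...
//! host_tube.send(&PvClockCommand::Resume)?;
//! if let PvClockCommandResponse::Resumed { total_suspended_ticks } =
//!     host_tube.recv::<PvClockCommandResponse>()?
//! {
//!     // The device has updated the pvclock page and signals a config change so the guest
//!     // driver can inject the suspended time into CLOCK_BOOTTIME.
//! }
//! ```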

#[cfg(target_arch = "aarch64")]
use std::arch::asm;
use std::collections::BTreeMap;
use std::mem::replace;
use std::mem::size_of;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::time::Duration;

use anyhow::anyhow;
use anyhow::bail;
use anyhow::Context;
use anyhow::Result;
use base::error;
use base::info;
use base::warn;
use base::AsRawDescriptor;
#[cfg(windows)]
use base::CloseNotifier;
use base::Error;
use base::Event;
use base::EventToken;
use base::RawDescriptor;
use base::ReadNotifier;
use base::Tube;
use base::WaitContext;
use base::WorkerThread;
use chrono::DateTime;
use chrono::Utc;
use data_model::Le32;
use data_model::Le64;
use serde::Deserialize;
use serde::Serialize;
use vm_control::PvClockCommand;
use vm_control::PvClockCommandResponse;
use vm_memory::GuestAddress;
use vm_memory::GuestMemory;
use vm_memory::GuestMemoryError;
use zerocopy::AsBytes;
use zerocopy::FromBytes;
use zerocopy::FromZeroes;

use super::copy_config;
use super::DeviceType;
use super::Interrupt;
use super::Queue;
use super::VirtioDevice;

// Pvclock has one virtio queue: set_pvclock_page
const QUEUE_SIZE: u16 = 1;
const QUEUE_SIZES: &[u16] = &[QUEUE_SIZE];

// pvclock flag bits
const PVCLOCK_TSC_STABLE_BIT: u8 = 1;
const PVCLOCK_GUEST_STOPPED: u8 = 2;

// The feature bitmap for virtio pvclock
const VIRTIO_PVCLOCK_F_TSC_STABLE: u64 = 0; // TSC is stable
const VIRTIO_PVCLOCK_F_INJECT_SLEEP: u64 = 1; // Inject sleep for suspend
const VIRTIO_PVCLOCK_F_CLOCKSOURCE_RATING: u64 = 2; // Use device clocksource rating

// Status values for a virtio_pvclock request.
const VIRTIO_PVCLOCK_S_OK: u8 = 0;
const VIRTIO_PVCLOCK_S_IOERR: u8 = 1;

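// Suggested rating for the pvclock clocksource, reported to the guest via the config space when
// VIRTIO_PVCLOCK_F_CLOCKSOURCE_RATING is negotiated. The value is presumably chosen to be higher
// than the guest's default clocksource ratings (e.g. kvm-clock) so the pvclock clocksource is
// preferred; how it is applied is up to the guest driver.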
const VIRTIO_PVCLOCK_CLOCKSOURCE_RATING: u32 = 450;

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn read_clock_counter() -> u64 {
    // SAFETY: rdtsc is unprivileged and has no side effects.
    unsafe { std::arch::x86_64::_rdtsc() }
}

#[cfg(target_arch = "aarch64")]
fn read_clock_counter() -> u64 {
    let mut x: u64;
    // SAFETY: This instruction has no side effects apart from storing the current timestamp
    // counter into the specified register.
    unsafe {
        asm!("mrs {x}, cntvct_el0",
            x = out(reg) x,
        );
    }
    x
}

/// Calculate a (multiplier, shift) pair for scaled math of clocks.
/// The values are passed on to `pvclock_scale_delta` in the guest kernel and satisfy the following
/// (approximate) equality:
/// `n * scaled_hz / base_hz ~= ((n << shift) * multiplier) >> 32`
/// The logic here is roughly based on `kvm_get_time_scale` (but simplified as we can use u128).
/// # Arguments
/// * `scaled_hz` - Frequency to convert to. When dealing with clocksources, this is NSEC_PER_SEC.
/// * `base_hz` - Frequency to convert from. When dealing with clocksources, this is the counter
///   frequency.
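/// # Example
/// An illustrative data point (values rounded): for a 3 GHz counter scaled to nanoseconds,
/// `freq_scale_shift(1_000_000_000, 3_000_000_000)` yields `multiplier ~= 0.6667 * 2^32` and
/// `shift == -1`, so a delta of 3_000_000_000 ticks scales back to roughly one second:
/// `(((3_000_000_000u64 >> 1) as u128 * multiplier as u128) >> 32) ~= 1_000_000_000`.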
fn freq_scale_shift(scaled_hz: u64, base_hz: u64) -> (u32, i8) {
    assert!(scaled_hz > 0 && base_hz > 0);
    // We treat `multiplier` as a 0.32 fixed-point number by folding the >> 32 into its definition.
    // With this definition, `multiplier` can be calculated as `(scaled_hz / base_hz) >> shift`
    // with a corresponding `shift`.
    //
    // The value of `shift` should satisfy a few constraints:
    // 1. `multiplier` needs to be < 1.0 due to the representable range of 0.32 fixed-point
    //    (maximum (2^32-1)/2^32).
    // 2. `shift` should be minimized because `pvclock_scale_delta` applies `shift` on the 64-bit
    //    TSC value before extending to 128-bit and large positive shifts reduce the TSC rollover
    //    time.
    //
    // Minimizing `shift` means maximizing `multiplier`. From the < 1.0 constraint, this is
    // equivalent to having a multiplier within [0.5, 1.0). The logic below picks a multiplier
    // satisfying that, while updating `shift` accordingly when we double or halve the multiplier.
    let mut shift = 0;
    // Convert to u128 so that overflow handling becomes much easier.
    let mut scaled_hz = scaled_hz as u128;
    let mut base_hz = base_hz as u128;
    if scaled_hz >= base_hz {
        while scaled_hz >= base_hz {
            // `multiplier` >= 1.0; iteratively scale it down
            // scaled_hz is at most 64 bits, so after this loop base_hz is at most 65 bits.
            base_hz <<= 1;
            shift += 1;
        }
    } else {
        while base_hz > 2 * scaled_hz {
            // `multiplier` < 0.5; iteratively scale it up
            // base_hz is at most 64 bits. If the loop condition passes then scaled_hz is at most
            // 63 bits, otherwise at most 64 bits. Post-loop scaled_hz is at most 64 bits.
            scaled_hz <<= 1;
            shift -= 1;
        }
    }
    // From above, we know that the values are at most 65 bits. This provides sufficient headroom
    // for scaled_hz << 32 below.
    assert!(base_hz < (1u128 << 65) && scaled_hz < (1u128 << 65));
    let mult: u32 = ((scaled_hz << 32) / base_hz)
        .try_into()
        .expect("should not overflow");
    (mult, shift)
}

// The config structure being exposed to the guest to tell them how much suspend time should be
// injected to the guest's CLOCK_BOOTTIME.
#[derive(Debug, Clone, Copy, Default, AsBytes, FromZeroes, FromBytes)]
#[allow(non_camel_case_types)]
#[repr(C)]
struct virtio_pvclock_config {
    // Total duration the VM has been paused while the guest kernel is not in the suspended state
    // (from the power management and timekeeping perspective).
    suspend_time_ns: Le64,
    // Device-suggested rating of the pvclock clocksource.
    clocksource_rating: Le32,
    padding: u32,
}

#[derive(Debug, Clone, Copy, Default, FromZeroes, FromBytes, AsBytes)]
#[allow(non_camel_case_types)]
#[repr(C)]
struct virtio_pvclock_set_pvclock_page_req {
    // Physical address of pvclock page.
    pvclock_page_pa: Le64,
    // Current system time.
    system_time: Le64,
    // Current tsc value.
    tsc_timestamp: Le64,
    // Status of this request, one of VIRTIO_PVCLOCK_S_*.
    status: u8,
    padding: [u8; 7],
}
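// Note: the driver submits one such request in a single descriptor on the set_pvclock_page queue;
// the device fills in `status` in place and returns the descriptor (see `run_main_worker`).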

// Data structure for interacting with pvclock shared memory.
struct PvclockSharedData {
    mem: GuestMemory,
    seqlock_addr: GuestAddress,
    tsc_suspended_delta_addr: GuestAddress,
    tsc_frequency_multiplier_addr: GuestAddress,
    tsc_frequency_shift_addr: GuestAddress,
    flags_addr: GuestAddress,
}
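// The *_addr fields above point into the guest's pvclock_vcpu_time_info structure; the offsets
// used in `new()` below correspond to its packed layout. Roughly (the guest kernel headers are
// the authority here):
//
//   struct pvclock_vcpu_time_info {
//       u32 version;            // +0, the seqlock
//       u32 pad0;               // +4
//       u64 tsc_timestamp;      // +8, reused here for the suspended TSC delta
//       u64 system_time;        // +16
//       u32 tsc_to_system_mul;  // +24
//       s8  tsc_shift;          // +28
//       u8  flags;              // +29
//       u8  pad[2];             // +30
//   };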

impl PvclockSharedData {
    pub fn new(mem: GuestMemory, addr: GuestAddress) -> Self {
        PvclockSharedData {
            mem,
            // The addresses of the various fields that we need to modify are relative to the
            // base of the pvclock page. For reference, see the pvclock_vcpu_time_info struct.
            seqlock_addr: addr,
            tsc_suspended_delta_addr: addr.unchecked_add(8),
            tsc_frequency_multiplier_addr: addr.unchecked_add(24),
            tsc_frequency_shift_addr: addr.unchecked_add(28),
            flags_addr: addr.unchecked_add(29),
        }
    }

    /// Only the seqlock_addr is needed to re-create this struct at restore
    /// time, so that is all our snapshot contains.
    fn snapshot(&self) -> GuestAddress {
        self.seqlock_addr
    }

    /// Set all fields to zero.
    pub fn zero_fill(&mut self) -> Result<()> {
        // The pvclock data structure is 32 bytes long, so we write 32 bytes of 0s
        self.mem
            .write_all_at_addr(&[0u8; 32], self.seqlock_addr)
            .context("failed to zero fill the pvclock shared data")
    }

    pub fn increment_seqlock(&mut self) -> Result<()> {
        // TODO (b/264931437): reads and writes using read/write_obj_from/at_addr are not
        // guaranteed to be atomic. Although this should not be a problem for the seqlock
        // or the other fields in the pvclock shared data (which are protected via the seqlock),
        // we might want to update these calls to be as atomic as possible if/when we have
        // the ability to do so, just as a general cleanup and to be consistent.
        let value = self
            .mem
            .read_obj_from_addr::<u32>(self.seqlock_addr)
            .context("failed to read seqlock value")?;
        self.mem
            .write_obj_at_addr(value.wrapping_add(1), self.seqlock_addr)
            .context("failed to write seqlock value")
    }

    pub fn set_tsc_suspended_delta(&mut self, delta: u64) -> Result<()> {
        self.mem
            .write_obj_at_addr(delta, self.tsc_suspended_delta_addr)
            .context("failed to write tsc suspended delta")
    }

    pub fn set_tsc_frequency(&mut self, frequency: u64) -> Result<()> {
        let (multiplier, shift): (u32, i8) = freq_scale_shift(1_000_000_000, frequency);

        self.mem
            .write_obj_at_addr(multiplier, self.tsc_frequency_multiplier_addr)
            .context("failed to write tsc frequency multiplier")?;
        self.mem
            .write_obj_at_addr(shift, self.tsc_frequency_shift_addr)
            .context("failed to write tsc frequency shift")
    }

    pub fn enable_pvclock_flags(&mut self, flags: u8) -> Result<()> {
        let value = self
            .mem
            .read_obj_from_addr::<u8>(self.flags_addr)
            .context("failed to read flags")?;
        self.mem
            .write_obj_at_addr(value | flags, self.flags_addr)
            .context("failed to write flags")
    }
}

/// Serializable part of the [PvClock] struct which will be used by the virtio_snapshot / restore.
#[derive(Serialize, Deserialize)]
struct PvClockState {
    tsc_frequency: u64,
    /// If the device is sleeping, a [PvClockWorkerSnapshot] that can re-create the worker
    /// will be stored here. (We can't just store the worker itself as it contains an object
    /// tree with references to [GuestMemory].)
    paused_main_worker: Option<PvClockWorkerSnapshot>,
    /// The total time the vm has been suspended. This is in an `Arc<AtomicU64>` because it's set
    /// by the PvClockWorker thread but read by PvClock from the mmio bus in the main thread.
    total_suspend_ns: Arc<AtomicU64>,
    features: u64,
    acked_features: u64,
}

/// An enum to keep dynamic state of pvclock workers in a type safe manner.
enum PvClockWorkerState {
    /// Idle means no worker is running.
    /// This tube is for communicating with this device from the crosvm threads.
    Idle(Tube),
    /// A stub worker to respond to pvclock commands when the device is not activated yet.
    Stub(WorkerThread<StubWorkerReturn>),
    /// A main worker to respond to pvclock commands while the device is active.
    Main(WorkerThread<MainWorkerReturn>),
    /// None is used only for handling transitional state between the states above.
    None,
}

/// A struct that represents the virtio-pvclock device.
pub struct PvClock {
    state: PvClockState,
    worker_state: PvClockWorkerState,
}

impl PvClock {
    pub fn new(base_features: u64, tsc_frequency: u64, suspend_tube: Tube) -> Self {
        let state = PvClockState {
            tsc_frequency,
            paused_main_worker: None,
            total_suspend_ns: Arc::new(AtomicU64::new(0)),
            features: base_features
                | 1 << VIRTIO_PVCLOCK_F_TSC_STABLE
                | 1 << VIRTIO_PVCLOCK_F_INJECT_SLEEP
                | 1 << VIRTIO_PVCLOCK_F_CLOCKSOURCE_RATING,
            acked_features: 0,
        };
        PvClock {
            state,
            worker_state: PvClockWorkerState::Idle(suspend_tube),
        }
    }

    fn get_config(&self) -> virtio_pvclock_config {
        virtio_pvclock_config {
            suspend_time_ns: self.state.total_suspend_ns.load(Ordering::SeqCst).into(),
            clocksource_rating: VIRTIO_PVCLOCK_CLOCKSOURCE_RATING.into(),
            padding: 0,
        }
    }

    /// Use switch_to_*_worker instead, unless needed, to keep the state transitions consistent.
    fn start_main_worker(
        &mut self,
        interrupt: Interrupt,
        pvclock_worker: PvClockWorker,
        mut queues: BTreeMap<usize, Queue>,
    ) -> anyhow::Result<()> {
        let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
        if let PvClockWorkerState::Idle(suspend_tube) = last_state {
            if queues.len() != QUEUE_SIZES.len() {
                self.worker_state = PvClockWorkerState::Idle(suspend_tube);
                return Err(anyhow!(
                    "expected {} queues, got {}",
                    QUEUE_SIZES.len(),
                    queues.len()
                ));
            }
            let set_pvclock_page_queue = queues.remove(&0).unwrap();
            self.worker_state = PvClockWorkerState::Main(WorkerThread::start(
                "virtio_pvclock".to_string(),
                move |kill_evt| {
                    run_main_worker(
                        pvclock_worker,
                        set_pvclock_page_queue,
                        suspend_tube,
                        interrupt,
                        kill_evt,
                    )
                },
            ));
        } else {
            panic!("Invalid state transition");
        }
        Ok(())
    }

    /// Use switch_to_*_worker instead, unless needed, to keep the state transitions consistent.
    fn start_stub_worker(&mut self) {
        let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
        self.worker_state = if let PvClockWorkerState::Idle(suspend_tube) = last_state {
            PvClockWorkerState::Stub(WorkerThread::start(
                "virtio_pvclock_stub".to_string(),
                move |kill_evt| run_stub_worker(suspend_tube, kill_evt),
            ))
        } else {
            panic!("Invalid state transition");
        };
    }

    /// Use switch_to_*_worker instead, unless needed, to keep the state transitions consistent.
    fn stop_stub_worker(&mut self) {
        let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
        self.worker_state = if let PvClockWorkerState::Stub(stub_worker_thread) = last_state {
            let stub_worker_ret = stub_worker_thread.stop();
            PvClockWorkerState::Idle(stub_worker_ret.suspend_tube)
        } else {
            panic!("Invalid state transition");
        }
    }

    /// Use switch_to_*_worker instead, unless needed, to keep the state transitions consistent.
    fn stop_main_worker(&mut self) {
        let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
        if let PvClockWorkerState::Main(main_worker_thread) = last_state {
            let main_worker_ret = main_worker_thread.stop();
            self.worker_state = PvClockWorkerState::Idle(main_worker_ret.suspend_tube);
            let mut queues = BTreeMap::new();
            queues.insert(0, main_worker_ret.set_pvclock_page_queue);
            self.state.paused_main_worker = Some(main_worker_ret.worker.into());
        } else {
            panic!("Invalid state transition");
        }
    }

    fn switch_to_stub_worker(&mut self) {
        self.stop_main_worker();
        self.start_stub_worker();
    }

    fn switch_to_main_worker(
        &mut self,
        interrupt: Interrupt,
        pvclock_worker: PvClockWorker,
        queues: BTreeMap<usize, Queue>,
    ) -> anyhow::Result<()> {
        self.stop_stub_worker();
        self.start_main_worker(interrupt, pvclock_worker, queues)
    }
}

/// Represents a moment in time including the TSC counter value at that time.
#[derive(Serialize, Deserialize, Clone)]
struct PvclockInstant {
    time: DateTime<Utc>,
    tsc_value: u64,
}

/// The unique data retained by [PvClockWorker] which can be used to re-create
/// an identical worker.
#[derive(Serialize, Deserialize, Clone)]
struct PvClockWorkerSnapshot {
    suspend_time: Option<PvclockInstant>,
    total_suspend_tsc_delta: u64,
    pvclock_shared_data_base_address: Option<GuestAddress>,
}

impl From<PvClockWorker> for PvClockWorkerSnapshot {
    fn from(worker: PvClockWorker) -> Self {
        PvClockWorkerSnapshot {
            suspend_time: worker.suspend_time,
            total_suspend_tsc_delta: worker.total_suspend_tsc_delta,
            pvclock_shared_data_base_address: worker
                .pvclock_shared_data
                .map(|pvclock| pvclock.snapshot()),
        }
    }
}

/// Worker struct for the virtio-pvclock device.
///
/// Handles virtio requests, storing information about suspend/resume, adjusting the
/// pvclock data in shared memory, and injecting suspend durations via config
/// changes.
struct PvClockWorker {
    tsc_frequency: u64,
    // The moment the last suspend occurred.
    suspend_time: Option<PvclockInstant>,
    // The total time the vm has been suspended. This is in an Arc<AtomicU64> because it's set
    // by the PvClockWorker thread but read by PvClock from the mmio bus in the main thread.
    total_injected_ns: Arc<AtomicU64>,
    // The total change in the TSC value over suspensions.
    total_suspend_tsc_delta: u64,
    // Pvclock shared data.
    pvclock_shared_data: Option<PvclockSharedData>,
    mem: GuestMemory,
}

impl PvClockWorker {
    pub fn new(tsc_frequency: u64, total_injected_ns: Arc<AtomicU64>, mem: GuestMemory) -> Self {
        PvClockWorker {
            tsc_frequency,
            suspend_time: None,
            total_injected_ns,
            total_suspend_tsc_delta: 0,
            pvclock_shared_data: None,
            mem,
        }
    }

    fn from_snapshot(
        tsc_frequency: u64,
        total_injected_ns: Arc<AtomicU64>,
        snap: PvClockWorkerSnapshot,
        mem: GuestMemory,
    ) -> Self {
        PvClockWorker {
            tsc_frequency,
            suspend_time: snap.suspend_time,
            total_injected_ns,
            total_suspend_tsc_delta: snap.total_suspend_tsc_delta,
            pvclock_shared_data: snap
                .pvclock_shared_data_base_address
                .map(|addr| PvclockSharedData::new(mem.clone(), addr)),
            mem,
        }
    }

    /// Initialize the pvclock for initial boot. We assume that the systemtime of 0 corresponds
    /// to the tsc time of 0, so we do not set these. We set the tsc frequency based on the vcpu
    /// tsc frequency and we set PVCLOCK_TSC_STABLE_BIT in flags to tell the guest that it's
    /// safe to use vcpu0's pvclock page for use by the vdso. The order of writing the different
    /// fields doesn't matter at this point, but does matter when updating.
    fn set_pvclock_page(&mut self, addr: u64) -> Result<()> {
        if self.pvclock_shared_data.is_some() {
            return Err(Error::new(libc::EALREADY)).context("pvclock page already set");
        }

        let mut shared_data = PvclockSharedData::new(self.mem.clone(), GuestAddress(addr));

        // set all fields to 0 first
        shared_data.zero_fill()?;

        shared_data.set_tsc_frequency(self.tsc_frequency)?;
        shared_data.enable_pvclock_flags(PVCLOCK_TSC_STABLE_BIT)?;

        self.pvclock_shared_data = Some(shared_data);
        Ok(())
    }

    pub fn suspend(&mut self) {
        if self.suspend_time.is_some() {
            warn!("Suspend time already set, ignoring new suspend time");
            return;
        }
        self.suspend_time = Some(PvclockInstant {
            time: Utc::now(),
            tsc_value: read_clock_counter(),
        });
    }

    pub fn resume(&mut self) -> Result<u64> {
        // First, increment the sequence lock by 1 before writing to the pvclock page.
        self.increment_pvclock_seqlock()?;
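        // Between this increment and the final one below, the seqlock value is odd. A guest
        // following the usual pvclock/seqlock read protocol treats an odd (or changed) version as
        // "update in flight" and retries its reads, so the fields updated here are never observed
        // half-written.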

        // The guest makes sure there are memory barriers in between reads of the seqlock and
        // other fields; we should make sure there are memory barriers in between writes of the
        // seqlock and writes to other fields.
        std::sync::atomic::fence(Ordering::SeqCst);

        // Set the guest_stopped_bit and tsc suspended delta in pvclock struct. We only need to set
        // the bit, the guest will unset it once the guest has handled the stoppage.
        // We get the result here because we want to call increment_pvclock_seqlock regardless of
        // the result of these calls.
        let result = self
            .set_guest_stopped_bit()
            .and_then(|_| self.set_suspended_time());

        // The guest makes sure there are memory barriers in between reads of the seqlock and
        // other fields; we should make sure there are memory barriers in between writes of the
        // seqlock and writes to other fields.
        std::sync::atomic::fence(Ordering::SeqCst);

        // Do a final increment once changes are done.
        self.increment_pvclock_seqlock()?;

        result
    }

    fn get_suspended_duration(suspend_time: &PvclockInstant) -> Duration {
        match Utc::now().signed_duration_since(suspend_time.time).to_std() {
            Ok(duration) => duration,
            Err(e) => {
                error!(
                    "pvclock found suspend time in the future (was the host \
                    clock adjusted?). Guest boot/realtime clock may now be \
                    incorrect. Details: {}",
                    e
                );
                Duration::ZERO
            }
        }
    }

    fn set_suspended_time(&mut self) -> Result<u64> {
        let (this_suspend_duration, this_suspend_tsc_delta) =
            if let Some(suspend_time) = self.suspend_time.take() {
                (
                    Self::get_suspended_duration(&suspend_time),
                    // NB: This calculation may wrap around, as TSC can be reset to zero when
                    // the device has resumed from the "deep" suspend state (it may not happen for
                    // s2idle cases). It also happens when the tsc value itself wraps.
                    read_clock_counter().wrapping_sub(suspend_time.tsc_value),
                )
            } else {
                return Err(Error::new(libc::ENOTSUP))
                    .context("Cannot set suspend time because suspend was never called");
            };

        // update the total tsc delta during all suspends
        // NB: This calculation may wrap around, as the accumulated delta can exceed the u64 range.
        self.total_suspend_tsc_delta = self
            .total_suspend_tsc_delta
            .wrapping_add(this_suspend_tsc_delta);

        // save tsc_suspended_delta to shared memory
        self.pvclock_shared_data
            .as_mut()
            .ok_or(
                anyhow::Error::new(Error::new(libc::ENODATA)).context("pvclock page is not set"),
            )?
            .set_tsc_suspended_delta(self.total_suspend_tsc_delta)?;

        info!(
            "set total suspend tsc delta to {}",
            self.total_suspend_tsc_delta
        );

        // update total suspend ns
        self.total_injected_ns
            .fetch_add(this_suspend_duration.as_nanos() as u64, Ordering::SeqCst);

        Ok(self.total_suspend_tsc_delta)
    }

    fn increment_pvclock_seqlock(&mut self) -> Result<()> {
        self.pvclock_shared_data
            .as_mut()
            .ok_or(
                anyhow::Error::new(Error::new(libc::ENODATA)).context("pvclock page is not set"),
            )?
            .increment_seqlock()
    }

    fn set_guest_stopped_bit(&mut self) -> Result<()> {
        self.pvclock_shared_data
            .as_mut()
            .ok_or(
                anyhow::Error::new(Error::new(libc::ENODATA)).context("pvclock page is not set"),
            )?
            .enable_pvclock_flags(PVCLOCK_GUEST_STOPPED)
    }
}

fn pvclock_response_error_from_anyhow(error: anyhow::Error) -> base::Error {
    for cause in error.chain() {
        if let Some(e) = cause.downcast_ref::<base::Error>() {
            return *e;
        }

        if let Some(e) = cause.downcast_ref::<GuestMemoryError>() {
            return match e {
                // Two kinds of GuestMemoryError contain base::Error
                GuestMemoryError::MemoryAddSealsFailed(e) => *e,
                GuestMemoryError::MemoryCreationFailed(e) => *e,
                // Otherwise return EINVAL
                _ => Error::new(libc::EINVAL),
            };
        }
    }
    // Unknown base error
    Error::new(libc::EFAULT)
}

struct StubWorkerReturn {
    suspend_tube: Tube,
}

/// A stub worker to respond to any requests when the device is inactive.
fn run_stub_worker(suspend_tube: Tube, kill_evt: Event) -> StubWorkerReturn {
    #[derive(EventToken, Debug)]
    enum Token {
        SomePvClockRequest,
        Kill,
    }
    let wait_ctx: WaitContext<Token> = match WaitContext::build_with(&[
        (suspend_tube.get_read_notifier(), Token::SomePvClockRequest),
        // TODO(b/242743502): Can also close on Tube closure for Unix once CloseNotifier is
        // implemented for Tube.
        #[cfg(windows)]
        (suspend_tube.get_close_notifier(), Token::Kill),
        (&kill_evt, Token::Kill),
    ]) {
        Ok(wait_ctx) => wait_ctx,
        Err(e) => {
            error!("failed creating WaitContext: {}", e);
            return StubWorkerReturn { suspend_tube };
        }
    };
    'wait: loop {
        let events = match wait_ctx.wait() {
            Ok(v) => v,
            Err(e) => {
                error!("failed polling for events: {}", e);
                break;
            }
        };
        for event in events.iter().filter(|e| e.is_readable) {
            match event.token {
                Token::SomePvClockRequest => {
                    match suspend_tube.recv::<PvClockCommand>() {
                        Ok(req) => req,
                        Err(e) => {
                            error!("failed to receive request: {}", e);
                            continue;
                        }
                    };
                    if let Err(e) = suspend_tube.send(&PvClockCommandResponse::DeviceInactive) {
                        error!("error sending PvClockCommandResponse: {}", e);
                    }
                }
                Token::Kill => {
                    break 'wait;
                }
            }
        }
    }
    StubWorkerReturn { suspend_tube }
}

struct MainWorkerReturn {
    worker: PvClockWorker,
    set_pvclock_page_queue: Queue,
    suspend_tube: Tube,
}

// TODO(b/237300012): asyncify this device.
/// A worker to process PvClockCommand requests
fn run_main_worker(
    mut worker: PvClockWorker,
    mut set_pvclock_page_queue: Queue,
    suspend_tube: Tube,
    interrupt: Interrupt,
    kill_evt: Event,
) -> MainWorkerReturn {
    #[derive(EventToken)]
    enum Token {
        SetPvClockPageQueue,
        SuspendResume,
        InterruptResample,
        Kill,
    }

    let wait_ctx: WaitContext<Token> = match WaitContext::build_with(&[
        (set_pvclock_page_queue.event(), Token::SetPvClockPageQueue),
        (suspend_tube.get_read_notifier(), Token::SuspendResume),
        // TODO(b/242743502): Can also close on Tube closure for Unix once CloseNotifier is
        // implemented for Tube.
        #[cfg(windows)]
        (suspend_tube.get_close_notifier(), Token::Kill),
        (&kill_evt, Token::Kill),
    ]) {
        Ok(pc) => pc,
        Err(e) => {
            error!("failed creating WaitContext: {}", e);
            return MainWorkerReturn {
                suspend_tube,
                set_pvclock_page_queue,
                worker,
            };
        }
    };
    if let Some(resample_evt) = interrupt.get_resample_evt() {
        if wait_ctx
            .add(resample_evt, Token::InterruptResample)
            .is_err()
        {
            error!("failed creating WaitContext");
            return MainWorkerReturn {
                suspend_tube,
                set_pvclock_page_queue,
                worker,
            };
        }
    }

    'wait: loop {
        let events = match wait_ctx.wait() {
            Ok(v) => v,
            Err(e) => {
                error!("failed polling for events: {}", e);
                break;
            }
        };

        for event in events.iter().filter(|e| e.is_readable) {
            match event.token {
                Token::SetPvClockPageQueue => {
                    let _ = set_pvclock_page_queue.event().wait();
                    let desc_chain = match set_pvclock_page_queue.pop() {
                        Some(desc_chain) => desc_chain,
                        None => {
                            error!("set_pvclock_page queue was empty");
                            continue;
                        }
                    };

                    // This device does not follow the virtio spec requirements for device-readable
                    // vs. device-writable descriptors, so we can't use `Reader`/`Writer`. Pick the
                    // first descriptor from the chain and assume the whole req structure is
                    // contained within it.
                    let desc = desc_chain
                        .reader
                        .get_remaining_regions()
                        .chain(desc_chain.writer.get_remaining_regions())
                        .next()
                        .unwrap();

                    let len = if desc.len < size_of::<virtio_pvclock_set_pvclock_page_req>() {
                        error!("pvclock descriptor too short");
                        0
                    } else {
                        let addr = GuestAddress(desc.offset);
                        let mut req: virtio_pvclock_set_pvclock_page_req = match worker
                            .mem
                            .read_obj_from_addr(addr)
                        {
                            Ok(req) => req,
                            Err(e) => {
                                error!("failed to read request from set_pvclock_page queue: {}", e);
                                continue;
                            }
                        };

                        req.status = match worker.set_pvclock_page(req.pvclock_page_pa.into()) {
                            Err(e) => {
                                error!("failed to set pvclock page: {:#}", e);
                                VIRTIO_PVCLOCK_S_IOERR
                            }
                            Ok(_) => VIRTIO_PVCLOCK_S_OK,
                        };

                        if let Err(e) = worker.mem.write_obj_at_addr(req, addr) {
                            error!("failed to write set_pvclock_page status: {}", e);
                            continue;
                        }

                        desc.len as u32
                    };

                    set_pvclock_page_queue.add_used(desc_chain, len);
                    set_pvclock_page_queue.trigger_interrupt();
                }
                Token::SuspendResume => {
                    let req = match suspend_tube.recv::<PvClockCommand>() {
                        Ok(req) => req,
                        Err(e) => {
                            error!("failed to receive request: {}", e);
                            continue;
                        }
                    };

                    let resp = match req {
                        PvClockCommand::Suspend => {
                            worker.suspend();
                            PvClockCommandResponse::Ok
                        }
                        PvClockCommand::Resume => {
                            match worker.resume() {
                                Ok(total_suspended_ticks) => {
                                    // signal to the driver that the total_suspend_ns has changed
                                    interrupt.signal_config_changed();
                                    PvClockCommandResponse::Resumed {
                                        total_suspended_ticks,
                                    }
                                }
                                Err(e) => {
                                    error!("Failed to resume pvclock: {:#}", e);
                                    PvClockCommandResponse::Err(pvclock_response_error_from_anyhow(
                                        e,
                                    ))
                                }
                            }
                        }
                    };

                    if let Err(e) = suspend_tube.send(&resp) {
                        error!("error sending PvClockCommandResponse: {}", e);
                    }
                }

                Token::InterruptResample => {
                    interrupt.interrupt_resample();
                }
                Token::Kill => {
                    break 'wait;
                }
            }
        }
    }

    MainWorkerReturn {
        suspend_tube,
        set_pvclock_page_queue,
        worker,
    }
}

impl VirtioDevice for PvClock {
    fn keep_rds(&self) -> Vec<RawDescriptor> {
        if let PvClockWorkerState::Idle(suspend_tube) = &self.worker_state {
            vec![suspend_tube.as_raw_descriptor()]
        } else {
            Vec::new()
        }
    }

    fn device_type(&self) -> DeviceType {
        DeviceType::Pvclock
    }

    fn queue_max_sizes(&self) -> &[u16] {
        QUEUE_SIZES
    }

    fn features(&self) -> u64 {
        self.state.features
    }

    fn ack_features(&mut self, mut value: u64) {
        if value & !self.features() != 0 {
            warn!("virtio-pvclock got unknown feature ack {:x}", value);
            value &= self.features();
        }
        self.state.acked_features |= value;
    }

    fn read_config(&self, offset: u64, data: &mut [u8]) {
        copy_config(data, 0, self.get_config().as_bytes(), offset);
    }

    fn write_config(&mut self, offset: u64, data: &[u8]) {
        // Pvclock device doesn't expect a guest write to config
        warn!(
            "Unexpected write to virtio-pvclock config at offset {}: {:?}",
            offset, data
        );
    }

    fn activate(
        &mut self,
        mem: GuestMemory,
        interrupt: Interrupt,
        queues: BTreeMap<usize, Queue>,
    ) -> anyhow::Result<()> {
        let tsc_frequency = self.state.tsc_frequency;
        let total_suspend_ns = self.state.total_suspend_ns.clone();
        let worker = PvClockWorker::new(tsc_frequency, total_suspend_ns, mem);
        self.switch_to_main_worker(interrupt, worker, queues)
    }

    fn reset(&mut self) -> Result<()> {
        self.switch_to_stub_worker();
        Ok(())
    }

    fn virtio_sleep(&mut self) -> anyhow::Result<Option<BTreeMap<usize, Queue>>> {
        let last_state = replace(&mut self.worker_state, PvClockWorkerState::None);
        match last_state {
            PvClockWorkerState::Main(main_worker_thread) => {
                let main_worker_ret = main_worker_thread.stop();
                let mut queues = BTreeMap::new();
                queues.insert(0, main_worker_ret.set_pvclock_page_queue);
                self.worker_state = PvClockWorkerState::Idle(main_worker_ret.suspend_tube);
                self.state.paused_main_worker = Some(main_worker_ret.worker.into());
                Ok(Some(queues))
            }
            PvClockWorkerState::Stub(stub_worker_thread) => {
                let stub_ret = stub_worker_thread.stop();
                self.worker_state = PvClockWorkerState::Idle(stub_ret.suspend_tube);
                Ok(None)
            }
            PvClockWorkerState::Idle(suspend_tube) => {
                self.worker_state = PvClockWorkerState::Idle(suspend_tube);
                Ok(None)
            }
            PvClockWorkerState::None => panic!("invalid state transition"),
        }
    }

    fn virtio_wake(
        &mut self,
        queues_state: Option<(GuestMemory, Interrupt, BTreeMap<usize, Queue>)>,
    ) -> anyhow::Result<()> {
        if let Some((mem, interrupt, queues)) = queues_state {
            let worker_snap = self
                .state
                .paused_main_worker
                .take()
                .ok_or(anyhow!("a sleeping pvclock must have a paused worker"))?;
            let worker = PvClockWorker::from_snapshot(
                self.state.tsc_frequency,
                self.state.total_suspend_ns.clone(),
                worker_snap,
                mem,
            );
            // Use start_main_worker directly (rather than switch_to_main_worker) as no worker is
            // running at this point.
            self.start_main_worker(interrupt, worker, queues)?;
        } else {
            // If the device wasn't activated, we should bring up the stub worker since that's
            // what is supposed to be running for an un-activated device.
            self.start_stub_worker();
        }
        Ok(())
    }

    fn virtio_snapshot(&mut self) -> anyhow::Result<serde_json::Value> {
        serde_json::to_value(&self.state).context("failed to serialize PvClockState")
    }

    fn virtio_restore(&mut self, data: serde_json::Value) -> anyhow::Result<()> {
        let state: PvClockState = serde_json::from_value(data).context("error deserializing")?;
        if state.features != self.features() {
            bail!(
                "expected virtio_features to match, but they did not. Live: {:?}, snapshot {:?}",
                self.features(),
                state.features,
            );
        }
        // TODO(b/291346907): we assume that the TSC frequency has NOT changed
        // since the snapshot was made. Assuming we have not moved machines,
        // this is a reasonable assumption. We don't verify the frequency
        // because TSC calibration is noisy.
        self.state = state;
        Ok(())
    }

    fn on_device_sandboxed(&mut self) {
        self.start_stub_worker();
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::virtio::QueueConfig;

    const TEST_QUEUE_SIZE: u16 = 2048;

    fn make_interrupt() -> Interrupt {
        Interrupt::new_for_test()
    }

    fn create_pvclock_device() -> (Tube, PvClock) {
        let (host_tube, device_tube) = Tube::pair().unwrap();
        let mut pvclock_device = PvClock::new(0, 1e9 as u64, device_tube);

        // Simulate the device initialization to start the stub thread.
        // In the real case, on_device_sandboxed will be called after the device is sandboxed
        // (or at some point during the device initialization when the sandbox is disabled) to
        // allow devices to use multiple threads (as spawning new threads before sandboxing is
        // prohibited because of minijail's restrictions).
        pvclock_device.on_device_sandboxed();

        (host_tube, pvclock_device)
    }

    fn create_sleeping_device() -> (PvClock, GuestMemory, Tube) {
        let (_host_tube, mut pvclock_device) = create_pvclock_device();

        // The queue won't actually be used, so passing one that isn't
        // fully configured is fine.
        let mut fake_queue = QueueConfig::new(TEST_QUEUE_SIZE, 0);
        fake_queue.set_ready(true);
        let mem = GuestMemory::new(&[(GuestAddress(0), 0x10000)]).unwrap();
        let interrupt = make_interrupt();
        pvclock_device
            .activate(
                mem.clone(),
                interrupt.clone(),
                BTreeMap::from([(
                    0,
                    fake_queue
                        .activate(&mem, Event::new().unwrap(), interrupt)
                        .unwrap(),
                )]),
            )
            .expect("activate should succeed");
        let queues = pvclock_device
            .virtio_sleep()
            .expect("sleep should succeed")
            .expect("sleep should yield queues");
        assert_eq!(queues.len(), 1);
        assert_eq!(
            queues.get(&0).expect("queue must be present").size(),
            TEST_QUEUE_SIZE
        );
        assert!(pvclock_device.state.paused_main_worker.is_some());
        (pvclock_device, mem, _host_tube)
    }

    fn assert_wake_successful(pvclock_device: &mut PvClock, mem: &GuestMemory) {
        // We just create a new queue here, because it isn't actually accessed
        // by the device in these tests.
        let mut wake_queues = BTreeMap::new();
        let mut fake_queue = QueueConfig::new(TEST_QUEUE_SIZE, 0);
        let interrupt = make_interrupt();
        fake_queue.set_ready(true);
        wake_queues.insert(
            0,
            fake_queue
                .activate(mem, Event::new().unwrap(), interrupt.clone())
                .unwrap(),
        );
        let queues_state = (mem.clone(), interrupt, wake_queues);
        pvclock_device
            .virtio_wake(Some(queues_state))
            .expect("wake should succeed");
        assert!(pvclock_device.state.paused_main_worker.is_none());
    }

    #[test]
    fn test_command_response_when_inactive() {
        let (host_tube, _pvclock_device) = create_pvclock_device();
        assert!(host_tube.send(&PvClockCommand::Suspend).is_ok());
        let res = host_tube.recv::<PvClockCommandResponse>();
        assert!(matches!(res, Ok(PvClockCommandResponse::DeviceInactive)));
    }

    #[test]
    fn test_sleep_wake_smoke() {
        let (mut pvclock_device, mem, _tube) = create_sleeping_device();
        assert_wake_successful(&mut pvclock_device, &mem);
    }

    #[test]
    fn test_save_restore() {
        let (mut pvclock_device, mem, _tube) = create_sleeping_device();
        let test_suspend_ns = 9999;

        // Store a test value we can look for later in the test to verify
        // we're restoring properties.
        pvclock_device
            .state
            .total_suspend_ns
            .store(test_suspend_ns, Ordering::SeqCst);

        let snap = pvclock_device.virtio_snapshot().unwrap();
        pvclock_device
            .state
            .total_suspend_ns
            .store(0, Ordering::SeqCst);
        pvclock_device.virtio_restore(snap).unwrap();
        assert_eq!(
            pvclock_device.state.total_suspend_ns.load(Ordering::SeqCst),
            test_suspend_ns
        );

        assert_wake_successful(&mut pvclock_device, &mem);
    }

    /// A simplified clone of `pvclock_scale_delta` from the Linux kernel to emulate
    /// what the kernel does when converting TSC to ktime.
    fn pvclock_scale_tsc(mult: u32, shift: i8, tsc: u64) -> u64 {
        let shifted = if shift < 0 {
            tsc >> -shift
        } else {
            tsc << shift
        };
        let product = shifted as u128 * mult as u128;
        (product >> 32).try_into().expect("should not overflow")
    }

    /// Helper function for checking the behavior of `freq_scale_shift`.
    fn check_freq_scale(f: u64, input: u64) {
        // We only test `scaled_hz` = 1GHz because that is the only value used in the code base.
        let (mult, shift) = freq_scale_shift(1_000_000_000, f);

        let scaled = pvclock_scale_tsc(mult, shift, input);

        // Use relative error <= 1e-8 as the target. TSC can be huge so this isn't really a super
        // accurate target, and our goal is to simply sanity check the math without adding too many
        // requirements about rounding errors.
        let expected: u64 = (input as u128 * 1_000_000_000u128 / f as u128) as u64;
        let expected_lo: u64 = (input as u128 * 999_999_990u128 / f as u128) as u64;
        let expected_hi: u64 = (input as u128 * 1_000_000_010u128 / f as u128) as u64;
        assert!(
            (expected_lo..=expected_hi).contains(&scaled),
            "{scaled} should be close to {expected} (base_hz={f}, mult={mult}, shift={shift})"
        );
    }

    #[test]
    fn test_freq_scale_shift_accuracy() {
        // Basic check for formula correctness: scaling `base_hz` ticks should yield `scaled_hz`
        // (i.e. one second's worth of nanoseconds).
        for f in (1..=50).map(|n| n * 100_000_000) {
            check_freq_scale(f, f);
        }
    }

    #[test]
    fn test_freq_scale_shift_overflow_high_freq() {
        // For scale factors < 1.0, test that we can correctly convert the maximum TSC value
        // without overflow. We must be able to handle values as large as they realistically can
        // be, as the kernel clock breaks if the calculated ktime goes backwards (b/342168920).
        for f in (11..=50).map(|n| n * 100_000_000) {
            check_freq_scale(f, u64::MAX);
        }
    }

    #[test]
    fn test_freq_scale_shift_overflow_low_freq() {
        fn prev_power_of_two(n: u64) -> u64 {
            assert_ne!(n, 0);
            let highest_bit_set = 63 - n.leading_zeros();
            1 << highest_bit_set
        }
        // Same test as above, but for scale factors >= 1.0. The difference is that for scale
        // factors >= 1.0 we first round up the factor, then apply a multiplier (< 1.0). We reflect
        // this limitation in our tested maximum value.
        for f in (1..=10).map(|n| n * 100_000_000) {
            // Truncate the remainder since prev_power_of_two rounds down anyway.
            let factor = 1_000_000_000 / f;
            // This is like (exp2(floor(log2(factor)) + 1)).
            let target = u64::MAX / (prev_power_of_two(factor) << 1);
            check_freq_scale(f, target);
        }
    }
}