// Copyright 2022 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

//! PageHandler manages the page states of multiple regions.

#![deny(missing_docs)]

use std::fs::File;
use std::mem;
use std::ops::Range;
use std::sync::Arc;

use anyhow::Context;
use base::error;
use base::linux::FileDataIterator;
use base::AsRawDescriptor;
use base::SharedMemory;
use base::VolatileSlice;
use sync::Mutex;
use thiserror::Error as ThisError;

use crate::file::Error as FileError;
use crate::file::SwapFile;
use crate::pagesize::addr_to_page_idx;
use crate::pagesize::bytes_to_pages;
use crate::pagesize::is_hugepage_aligned;
use crate::pagesize::is_page_aligned;
use crate::pagesize::page_base_addr;
use crate::pagesize::page_idx_to_addr;
use crate::pagesize::pages_to_bytes;
use crate::pagesize::round_up_hugepage_size;
use crate::pagesize::THP_SIZE;
use crate::staging::CopyOp;
use crate::staging::Error as StagingError;
use crate::staging::StagingMemory;
use crate::userfaultfd::Error as UffdError;
use crate::userfaultfd::Userfaultfd;
use crate::worker::Channel;
use crate::worker::Task;
use crate::SwapMetrics;

pub(crate) const MLOCK_BUDGET: usize = 16 * 1024 * 1024; // = 16MB
const PREFETCH_THRESHOLD: usize = 4 * 1024 * 1024; // = 4MB

/// Result for PageHandler
pub type Result<T> = std::result::Result<T, Error>;

/// Errors for PageHandler
#[derive(ThisError, Debug)]
pub enum Error {
    #[error("the address is invalid {0:#018X}")]
    /// the address is invalid
    InvalidAddress(usize),
    #[error("the regions {0:?} and {1:?} overlap")]
    /// the regions overlap when registering
    RegionOverlap(Range<usize>, Range<usize>),
    #[error("failed to create page handler {0:?}")]
    /// failed to create page handler
    CreateFailed(anyhow::Error),
    #[error("file operation failed: {0:?}")]
    /// file operation failed
    File(#[from] FileError),
    #[error("staging operation failed: {0:?}")]
    /// staging operation failed
    Staging(#[from] StagingError),
    #[error("userfaultfd failed: {0:?}")]
    /// userfaultfd operation failed
    Userfaultfd(#[from] UffdError),
    #[error("failed to iterate data ranges: {0:?}")]
    /// FileDataIterator failed
    FileDataIterator(#[from] base::Error),
}

/// Remove the memory range on the guest memory.
///
/// This is an alternative to [vm_memory::GuestMemory::remove_range()] when working with host
/// addresses instead of guest addresses.
///
/// # Safety
///
/// The memory range must be on the guest memory.
#[deny(unsafe_op_in_unsafe_fn)]
unsafe fn remove_memory(addr: usize, len: usize) -> std::result::Result<(), base::Error> {
    // SAFETY:
    // Safe because the caller guarantees addr is in guest memory, so this does not affect any rust
    // managed memory.
    let ret = unsafe { libc::madvise(addr as *mut libc::c_void, len, libc::MADV_REMOVE) };
    if ret < 0 {
        base::errno_result()
    } else {
        Ok(())
    }
}

fn uffd_copy_all(
    uffd: &Userfaultfd,
    mut page_addr: usize,
    mut data_slice: VolatileSlice,
    wake: bool,
) -> std::result::Result<(), UffdError> {
    loop {
        let result = uffd.copy(page_addr, data_slice.size(), data_slice.as_ptr(), wake);
        match result {
            Err(UffdError::PartiallyCopied(copied)) => {
                page_addr += copied;
                data_slice.advance(copied);
            }
            other => {
                // Even EEXIST for the copy operation should be an error for page fault handling.
                // If the page was swapped in before, the page should be cleared from the swap file
                // and `Userfaultfd::zero()` should be used instead.
                return other.map(|_| ());
            }
        }
    }
}

/// [Region] represents a memory region and corresponding [SwapFile].
struct Region {
    /// the head page index of the region.
    head_page_idx: usize,
    base_page_idx_in_file: usize,
    num_pages: usize,
    staging_memory: StagingMemory,
    copied_from_file_pages: usize,
    copied_from_staging_pages: usize,
    zeroed_pages: usize,
    swap_in_pages: usize,
    /// the number of pages which were already initialized on page faults.
    redundant_pages: usize,
}

/// MoveToStaging copies chunks of consecutive pages next to each other on the guest memory to the
/// staging memory and removes the chunks on the guest memory.
pub struct MoveToStaging {
    remove_area: Range<usize>,
    copies: Vec<CopyOp>,
}

impl Task for MoveToStaging {
    fn execute(self) {
        for copy_op in self.copies {
            copy_op.execute();
        }
        // Remove chunks of pages at once to reduce madvise(2) syscalls.
        // SAFETY:
        // Safe because the region is already backed by the file and the content will be
        // swapped in on a page fault.
        let result = unsafe {
            remove_memory(
                self.remove_area.start,
                self.remove_area.end - self.remove_area.start,
            )
        };
        if let Err(e) = result {
            panic!("failed to remove memory: {:?}", e);
        }
    }
}

struct PageHandleContext<'a> {
    file: SwapFile<'a>,
    regions: Vec<Region>,
    mlock_budget_pages: usize,
}

/// PageHandler manages the page states of multiple regions.
///
/// Handles multiple events derived from userfaultfd and swap out requests.
/// All the addresses and sizes in bytes are converted to page indices internally.
pub struct PageHandler<'a> {
    ctx: Mutex<PageHandleContext<'a>>,
    channel: Arc<Channel<MoveToStaging>>,
}

impl<'a> PageHandler<'a> {
    /// Creates [PageHandler] for the given regions.
    ///
    /// If any of the regions overlap, this returns [Error::RegionOverlap].
    ///
    /// # Arguments
    ///
    /// * `swap_file` - The swap file.
    /// * `staging_shmem` - The staging memory. It must have enough size to hold guest memory.
    ///   Otherwise the monitor process crashes on creating a mmap.
    /// * `address_ranges` - The list of address ranges of the regions. The start address must be
    ///   page aligned and the size must be a multiple of the page size.
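    ///
    /// # Example
    ///
    /// A minimal usage sketch (not compiled as a doctest); the swap file, staging shmem, address
    /// ranges and worker channel are assumed to already be prepared by the monitor setup code:
    ///
    /// ```ignore
    /// let page_handler = PageHandler::create(
    ///     &swap_file,     // temporary file backing swapped-out pages
    ///     &staging_shmem, // shmem large enough to hold all guest memory
    ///     &address_ranges, // page-aligned host address ranges of guest memory
    ///     channel.clone(), // worker channel that executes `MoveToStaging` tasks
    /// )?;
    /// ```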
    pub fn create(
        swap_file: &'a File,
        staging_shmem: &'a SharedMemory,
        address_ranges: &[Range<usize>],
        stating_move_context: Arc<Channel<MoveToStaging>>,
    ) -> Result<Self> {
        // Truncate the file into the size to hold all regions, otherwise access beyond the end of
        // file may cause SIGBUS.
        swap_file
            .set_len(
                address_ranges
                    .iter()
                    .map(|r| (r.end.saturating_sub(r.start)) as u64)
                    .sum(),
            )
            .context("truncate swap file")
            .map_err(Error::CreateFailed)?;

        let mut regions: Vec<Region> = Vec::new();
        let mut offset_pages = 0;
        for address_range in address_ranges {
            let head_page_idx = addr_to_page_idx(address_range.start);
            if address_range.end < address_range.start {
                return Err(Error::CreateFailed(anyhow::anyhow!(
                    "invalid region end < start"
                )));
            }
            let region_size = address_range.end - address_range.start;
            let num_pages = bytes_to_pages(region_size);

            // Find an overlapping region
            match regions.iter().position(|region| {
                if region.head_page_idx < head_page_idx {
                    region.head_page_idx + region.num_pages > head_page_idx
                } else {
                    region.head_page_idx < head_page_idx + num_pages
                }
            }) {
                Some(i) => {
                    let region = &regions[i];

                    return Err(Error::RegionOverlap(
                        address_range.clone(),
                        page_idx_to_addr(region.head_page_idx)
                            ..(page_idx_to_addr(region.head_page_idx + region.num_pages)),
                    ));
                }
                None => {
                    let base_addr = address_range.start;
                    assert!(is_page_aligned(base_addr));
                    assert!(is_page_aligned(region_size));

                    let staging_memory = StagingMemory::new(
                        staging_shmem,
                        pages_to_bytes(offset_pages) as u64,
                        num_pages,
                    )?;
                    regions.push(Region {
                        head_page_idx,
                        base_page_idx_in_file: offset_pages,
                        num_pages,
                        staging_memory,
                        copied_from_file_pages: 0,
                        copied_from_staging_pages: 0,
                        zeroed_pages: 0,
                        swap_in_pages: 0,
                        redundant_pages: 0,
                    });
                    offset_pages += num_pages;
                }
            }
        }

        let file = SwapFile::new(swap_file, offset_pages)?;

        Ok(Self {
            ctx: Mutex::new(PageHandleContext {
                file,
                regions,
                mlock_budget_pages: bytes_to_pages(MLOCK_BUDGET),
            }),
            channel: stating_move_context,
        })
    }

    fn find_region(regions: &mut [Region], page_idx: usize) -> Option<&mut Region> {
        // Sequentially search the regions for the one containing the page. It should be fast
        // enough because there are only a few regions (usually only 1).
        regions.iter_mut().find(|region| {
            region.head_page_idx <= page_idx && page_idx < region.head_page_idx + region.num_pages
        })
    }

    /// Fills the faulted page with zero if the page is not initialized, or with the content in the
    /// swap file if the page is swapped out.
    ///
    /// # Arguments
    ///
    /// * `uffd` - the reference to the [Userfaultfd] for the faulting process.
    /// * `address` - the address that triggered the page fault.
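    ///
    /// # Example
    ///
    /// A hedged sketch of how the monitor's fault loop might call this method (not compiled as a
    /// doctest; `page_handler`, `uffd` and `fault_addr` are assumed to come from the monitor setup
    /// and the userfaultfd event loop):
    ///
    /// ```ignore
    /// // `fault_addr` is the address reported by a UFFD_EVENT_PAGEFAULT event.
    /// if let Err(e) = page_handler.handle_page_fault(&uffd, fault_addr) {
    ///     error!("failed to handle page fault: {:?}", e);
    /// }
    /// ```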
    pub fn handle_page_fault(&self, uffd: &Userfaultfd, address: usize) -> Result<()> {
        let page_idx = addr_to_page_idx(address);
        // the head address of the page.
        let page_addr = page_base_addr(address);
        let page_size = pages_to_bytes(1);
        let mut ctx = self.ctx.lock();
        let PageHandleContext { regions, file, .. } = &mut *ctx;
        let region = Self::find_region(regions, page_idx).ok_or(Error::InvalidAddress(address))?;

        let idx_in_region = page_idx - region.head_page_idx;
        let idx_in_file = idx_in_region + region.base_page_idx_in_file;
        if let Some(page_slice) = region.staging_memory.page_content(idx_in_region)? {
            uffd_copy_all(uffd, page_addr, page_slice, true)?;
            // TODO(b/265758094): optimize clear operation.
            region
                .staging_memory
                .clear_range(idx_in_region..idx_in_region + 1)?;
            region.copied_from_staging_pages += 1;
            Ok(())
        } else if let Some(page_slice) = file.page_content(idx_in_file, false)? {
            // TODO(kawasin): Unlock regions so that the swap-in operation can proceed in the
            // background.
            uffd_copy_all(uffd, page_addr, page_slice, true)?;
            // TODO(b/265758094): optimize clear operation.
            // Do not erase the page from the disk for trimming optimization on next swap out.
            let munlocked_pages = file.clear_range(idx_in_file..idx_in_file + 1)?;
            region.copied_from_file_pages += 1;
            ctx.mlock_budget_pages += munlocked_pages;
            Ok(())
        } else {
            // Map a zero page since no swap file has been created yet but the fault
            // happened.
            // safe because the fault page is notified by uffd.
            let result = uffd.zero(page_addr, page_size, true);
            match result {
                Ok(_) => {
                    region.zeroed_pages += 1;
                    Ok(())
                }
                Err(UffdError::PageExist) => {
                    // This case can happen if page faults on the same page happen on different
                    // processes.
                    uffd.wake(page_addr, page_size)?;
                    region.redundant_pages += 1;
                    Ok(())
                }
                Err(e) => Err(e.into()),
            }
        }
    }

    /// Clear the internal state for the pages.
    ///
    /// When pages are removed by madvise with `MADV_DONTNEED` or `MADV_REMOVE`, userfaultfd
    /// notifies the event as `UFFD_EVENT_REMOVE`. This handles the remove event.
    ///
    /// In crosvm, the balloon frees the guest memory and causes `UFFD_EVENT_REMOVE`.
    ///
    /// # Arguments
    ///
    /// * `start_addr` - the head address of the memory area to be freed.
    /// * `end_addr` - the end address of the memory area to be freed. `UFFD_EVENT_REMOVE` tells
    ///   the head address of the next memory area of the freed area. (i.e. the exact tail address
    ///   of the memory area is `end_addr - 1`.)
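    ///
    /// # Example
    ///
    /// A hedged sketch of forwarding a remove event (not compiled as a doctest; `start` and `end`
    /// are assumed to be the address range carried by a `UFFD_EVENT_REMOVE` event):
    ///
    /// ```ignore
    /// if let Err(e) = page_handler.handle_page_remove(start, end) {
    ///     error!("failed to handle remove event: {:?}", e);
    /// }
    /// ```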
    pub fn handle_page_remove(&self, start_addr: usize, end_addr: usize) -> Result<()> {
        if !is_page_aligned(start_addr) {
            return Err(Error::InvalidAddress(start_addr));
        } else if !is_page_aligned(end_addr) {
            return Err(Error::InvalidAddress(end_addr));
        }
        let start_page_idx = addr_to_page_idx(start_addr);
        let last_page_idx = addr_to_page_idx(end_addr);
        let mut ctx = self.ctx.lock();
        // TODO(b/269983521): Clear multiple pages in the same region at once.
        for page_idx in start_page_idx..(last_page_idx) {
            let page_addr = page_idx_to_addr(page_idx);
            // TODO(kawasin): Cache the position if the range does not span multiple regions.
            let region = Self::find_region(&mut ctx.regions, page_idx)
                .ok_or(Error::InvalidAddress(page_addr))?;
            let idx_in_region = page_idx - region.head_page_idx;
            let idx_range = idx_in_region..idx_in_region + 1;
            if let Err(e) = region.staging_memory.clear_range(idx_range) {
                error!("failed to clear removed page from staging: {:?}", e);
            }
            let idx_in_file = idx_in_region + region.base_page_idx_in_file;
            let idx_range = idx_in_file..idx_in_file + 1;
            // Erase the pages from the disk because the pages are removed from the guest memory.
            let munlocked_pages = ctx.file.free_range(idx_range)?;
            ctx.mlock_budget_pages += munlocked_pages;
        }
        Ok(())
    }

    /// Move active pages in the memory region to the staging memory.
    ///
    /// It only moves active contents in the guest memory to the staging memory and skips empty
    /// pages (e.g. pages not touched, freed by the balloon) using `lseek(2)` + `SEEK_HOLE/DATA`.
    ///
    /// Returns the count of moved out pages.
    ///
    /// # Arguments
    ///
    /// * `base_addr` - the head address of the memory region.
    /// * `memfd` - the file descriptor of the memfd backing the guest memory region.
    /// * `base_offset` - the offset of the memory region in the memfd.
    ///
    /// # Safety
    ///
    /// The region must have been registered to all userfaultfds of processes which may touch the
    /// region.
    ///
    /// The memory must be protected not to be updated while moving.
    ///
    /// The page fault events for the region from the userfaultfd must be handled by
    /// [Self::handle_page_fault()].
    ///
    /// Must call [Channel::wait_complete()] to wait for all the copy operations to complete within
    /// the memory protection period.
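    ///
    /// # Example
    ///
    /// A hedged sketch of the swap-out sequence for one region (not compiled as a doctest;
    /// `base_addr`, `memfd` and `base_offset` are assumed to describe a registered, protected
    /// guest memory region, and `channel` is the worker channel passed to [Self::create()]):
    ///
    /// ```ignore
    /// // SAFETY: the region is registered, protected, and its page faults are handled.
    /// let moved_pages = unsafe { page_handler.move_to_staging(base_addr, &memfd, base_offset) }?;
    /// // Wait for the queued MoveToStaging tasks before lifting the memory protection.
    /// channel.wait_complete();
    /// ```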
    #[deny(unsafe_op_in_unsafe_fn)]
    pub unsafe fn move_to_staging<T>(
        &self,
        base_addr: usize,
        memfd: &T,
        base_offset: u64,
    ) -> Result<usize>
    where
        T: AsRawDescriptor,
    {
        let hugepage_size = *THP_SIZE;
        let mut ctx = self.ctx.lock();
        let region = Self::find_region(&mut ctx.regions, addr_to_page_idx(base_addr))
            .ok_or(Error::InvalidAddress(base_addr))?;

        if page_idx_to_addr(region.head_page_idx) != base_addr {
            return Err(Error::InvalidAddress(base_addr));
        }
        let region_size = pages_to_bytes(region.num_pages);
        let mut file_data = FileDataIterator::new(memfd, base_offset, region_size as u64);
        let mut moved_size = 0;
        let mut copies = Vec::new();
        let mut remaining_batch_size = hugepage_size;
        let mut batch_head_offset = 0;
        let mut cur_data = None;
        while let Some(data_range) = cur_data
            .take()
            .map(Ok)
            .or_else(|| file_data.next())
            .transpose()
            .map_err(Error::FileDataIterator)?
        {
            // Assert offset is page aligned
            let offset = (data_range.start - base_offset) as usize;
            assert!(is_page_aligned(offset));

            // The chunk size must be within usize since the chunk is within the guest memory.
            let chunk_size = (data_range.end - data_range.start) as usize;
            let data_range = if chunk_size > remaining_batch_size {
                // Split the chunk if it is bigger than remaining_batch_size.

                let split_size = if chunk_size >= hugepage_size {
                    // If the chunk size is bigger than or equal to the huge page size, the chunk
                    // may contain a huge page. If we MADV_REMOVE a huge page partially, it can
                    // cause inconsistency between the actual page table and the vmm-swap
                    // internal state.
                    let chunk_addr = base_addr + offset;
                    if !is_hugepage_aligned(chunk_addr) {
                        // Split the chunk before where a huge page could start.
                        std::cmp::min(
                            round_up_hugepage_size(chunk_addr) - chunk_addr,
                            remaining_batch_size,
                        )
                    } else {
                        if remaining_batch_size < hugepage_size {
                            // Remove the batch since it does not have enough room for a huge page.
                            self.channel.push(MoveToStaging {
                                remove_area: base_addr + batch_head_offset..base_addr + offset,
                                copies: mem::take(&mut copies),
                            });
                            remaining_batch_size = hugepage_size;
                            batch_head_offset = offset;
                        }
                        hugepage_size
                    }
                } else {
                    remaining_batch_size
                };
                // Cache the rest of the split chunk to avoid a useless lseek(2) syscall.
                cur_data = Some(data_range.start + split_size as u64..data_range.end);
                data_range.start..data_range.start + split_size as u64
            } else {
                data_range
            };

            let size = (data_range.end - data_range.start) as usize;
            assert!(is_page_aligned(size));

            // SAFETY:
            // Safe because:
            // * src_addr is aligned with page size
            // * the data_range starting from src_addr is on the guest memory.
            let copy_op = unsafe {
                region.staging_memory.copy(
                    (base_addr + offset) as *const u8,
                    bytes_to_pages(offset),
                    bytes_to_pages(size),
                )?
            };
            copies.push(copy_op);

            moved_size += size;
            // The size must be smaller than or equal to remaining_batch_size.
            remaining_batch_size -= size;

            if remaining_batch_size == 0 {
                // Remove the batch of pages at once to reduce madvise(2) syscalls.
                self.channel.push(MoveToStaging {
                    remove_area: base_addr + batch_head_offset..base_addr + offset + size,
                    copies: mem::take(&mut copies),
                });
                remaining_batch_size = hugepage_size;
                batch_head_offset = offset + size;
            }
        }
        // Remove the final batch of pages.
        self.channel.push(MoveToStaging {
            remove_area: base_addr + batch_head_offset..base_addr + region_size,
            copies,
        });

        region.copied_from_file_pages = 0;
        region.copied_from_staging_pages = 0;
        region.zeroed_pages = 0;
        region.swap_in_pages = 0;
        region.redundant_pages = 0;

        Ok(bytes_to_pages(moved_size))
    }

    /// Write a chunk of consecutive pages in the staging memory to the swap file.
    ///
    /// If there are no active pages in the staging memory, this returns `Ok(0)`.
    ///
    /// The pages in guest memory have been moved to staging memory by [Self::move_to_staging()].
    ///
    /// Returns the count of swapped out pages.
    ///
    /// Even if swap_out fails on any internal step, it does not break the page state management
    /// and `PageHandler` can continue working with a few pages leaking in staging memory or the
    /// swap file. The leaked pages are removed when vmm-swap is disabled and `PageHandler` is
    /// dropped.
    ///
    /// # Arguments
    ///
    /// * `max_size` - the upper limit of the chunk size to write into the swap file at once. The
    ///   chunk is split if it is bigger than `max_size`.
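    ///
    /// # Example
    ///
    /// A hedged sketch of draining the staging memory into the swap file (not compiled as a
    /// doctest; the 2 MiB chunk size is an arbitrary value chosen for illustration):
    ///
    /// ```ignore
    /// const CHUNK_SIZE: usize = 2 * 1024 * 1024;
    /// loop {
    ///     let pages = page_handler.swap_out(CHUNK_SIZE)?;
    ///     if pages == 0 {
    ///         // No more active pages in the staging memory.
    ///         break;
    ///     }
    /// }
    /// ```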
    pub fn swap_out(&self, max_size: usize) -> Result<usize> {
        let max_pages = bytes_to_pages(max_size);
        let mut ctx = self.ctx.lock();
        let PageHandleContext { regions, file, .. } = &mut *ctx;
        for region in regions.iter_mut() {
            if let Some(idx_range) = region.staging_memory.first_data_range(max_pages) {
                let idx_range_in_file = idx_range.start + region.base_page_idx_in_file
                    ..idx_range.end + region.base_page_idx_in_file;
                let pages = idx_range.end - idx_range.start;
                let slice = region.staging_memory.get_slice(idx_range.clone())?;
                // Convert VolatileSlice to &[u8]
                // SAFETY:
                // Safe because the range of volatile slice is already validated.
                let slice = unsafe { std::slice::from_raw_parts(slice.as_ptr(), slice.size()) };
                file.write_to_file(idx_range_in_file.start, slice)?;
                // TODO(kawasin): clear state_list on each write and MADV_REMOVE several chunks at
                // once.
                region.staging_memory.clear_range(idx_range)?;
                // TODO(kawasin): free the page cache of the swap file.
                // TODO(kawasin): use writev() to swap_out several small chunks at once.
                return Ok(pages);
            }
        }
        Ok(0)
    }

    /// Create a new [SwapInContext].
    pub fn start_swap_in(&'a self) -> SwapInContext<'a> {
        SwapInContext {
            ctx: &self.ctx,
            cur_staging: 0,
        }
    }

    /// Create a new [TrimContext].
    pub fn start_trim(&'a self) -> TrimContext<'a> {
        TrimContext {
            ctx: &self.ctx,
            cur_page: 0,
            cur_region: 0,
            next_data_in_file: 0..0,
            clean_pages: 0,
            zero_pages: 0,
        }
    }

    /// Returns count of pages copied from vmm-swap file to the guest memory.
    fn compute_copied_from_file_pages(&self) -> usize {
        self.ctx
            .lock()
            .regions
            .iter()
            .map(|r| r.copied_from_file_pages)
            .sum()
    }

    /// Returns count of pages copied from staging memory to the guest memory.
    fn compute_copied_from_staging_pages(&self) -> usize {
        self.ctx
            .lock()
            .regions
            .iter()
            .map(|r| r.copied_from_staging_pages)
            .sum()
    }

    /// Returns count of pages initialized with zero.
    fn compute_zeroed_pages(&self) -> usize {
        self.ctx.lock().regions.iter().map(|r| r.zeroed_pages).sum()
    }

    /// Returns count of pages which were already initialized on page faults.
    fn compute_redundant_pages(&self) -> usize {
        self.ctx
            .lock()
            .regions
            .iter()
            .map(|r| r.redundant_pages)
            .sum()
    }

    /// Returns count of pages present in the staging memory.
    fn compute_staging_pages(&self) -> usize {
        self.ctx
            .lock()
            .regions
            .iter()
            .map(|r| r.staging_memory.present_pages())
            .sum()
    }

    /// Returns count of pages present in the swap files.
    fn compute_swap_pages(&self) -> usize {
        self.ctx.lock().file.present_pages()
    }

    /// Fill [SwapMetrics] with page handler metrics.
    pub fn load_metrics(&self, metrics: &mut SwapMetrics) {
        metrics.copied_from_file_pages = self.compute_copied_from_file_pages() as u64;
        metrics.copied_from_staging_pages = self.compute_copied_from_staging_pages() as u64;
        metrics.zeroed_pages = self.compute_zeroed_pages() as u64;
        metrics.redundant_pages = self.compute_redundant_pages() as u64;
        metrics.staging_pages = self.compute_staging_pages() as u64;
        metrics.swap_pages = self.compute_swap_pages() as u64;
    }
}

/// Context for swap-in operation.
///
/// This holds cursors of indices in the regions for each step for optimization.
pub struct SwapInContext<'a> {
    ctx: &'a Mutex<PageHandleContext<'a>>,
    cur_staging: usize,
}

impl SwapInContext<'_> {
    /// Swap in a chunk of consecutive pages from the staging memory and the swap file.
    ///
    /// If there are no more pages present outside of the guest memory, this returns `Ok(0)`.
    ///
    /// Returns the count of swapped in pages.
    ///
    /// # Arguments
    ///
    /// * `uffd` - the main [Userfaultfd].
    /// * `max_size` - the upper limit of the chunk size to swap into the guest memory at once. The
    ///   chunk is split if it is bigger than `max_size`.
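    ///
    /// # Example
    ///
    /// A hedged sketch of swapping everything back into guest memory (not compiled as a doctest;
    /// `uffd` is assumed to be the main [Userfaultfd] and the 2 MiB chunk size is an arbitrary
    /// value chosen for illustration):
    ///
    /// ```ignore
    /// const CHUNK_SIZE: usize = 2 * 1024 * 1024;
    /// let mut swap_in_ctx = page_handler.start_swap_in();
    /// while swap_in_ctx.swap_in(&uffd, CHUNK_SIZE)? > 0 {
    ///     // Keep going until no pages remain in the staging memory or the swap file.
    /// }
    /// ```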
    pub fn swap_in(&mut self, uffd: &Userfaultfd, max_size: usize) -> Result<usize> {
        let mut ctx = self.ctx.lock();
        // Request the kernel to pre-populate the present pages in the swap file to the page cache
        // in the background. At most 16MB of pages will be populated.
        // The threshold is to apply MADV_WILLNEED to a bigger chunk of pages. The kernel populates
        // consecutive pages at once on MADV_WILLNEED.
        if ctx.mlock_budget_pages > bytes_to_pages(PREFETCH_THRESHOLD) {
            let mlock_budget_pages = ctx.mlock_budget_pages;
            let locked_pages = ctx.file.lock_and_async_prefetch(mlock_budget_pages)?;
            ctx.mlock_budget_pages -= locked_pages;
        }

        let max_pages = bytes_to_pages(max_size);
        for region in ctx.regions[self.cur_staging..].iter_mut() {
            // TODO(kawasin): swap_in multiple chunks less than max_size at once.
            if let Some(idx_range) = region.staging_memory.first_data_range(max_pages) {
                let pages = idx_range.end - idx_range.start;
                let page_addr = page_idx_to_addr(region.head_page_idx + idx_range.start);
                let slice = region.staging_memory.get_slice(idx_range.clone())?;
                uffd_copy_all(uffd, page_addr, slice, false)?;
                // Clear the staging memory to avoid a memory spike.
                // TODO(kawasin): reduce the call count of MADV_REMOVE by removing several data
                // at once.
                region.staging_memory.clear_range(idx_range)?;
                region.swap_in_pages += pages;
                return Ok(pages);
            }
            self.cur_staging += 1;
        }

        if let Some(mut idx_range_in_file) = ctx.file.first_data_range(max_pages) {
            let PageHandleContext { regions, file, .. } = &mut *ctx;
            for region in regions.iter_mut() {
                let region_tail_idx_in_file = region.base_page_idx_in_file + region.num_pages;
                if idx_range_in_file.start >= region_tail_idx_in_file {
                    continue;
                } else if idx_range_in_file.start < region.base_page_idx_in_file {
                    return Err(Error::File(FileError::OutOfRange));
                } else if idx_range_in_file.end > region_tail_idx_in_file {
                    // The consecutive pages can be across regions. Swap-in pages in a region at
                    // once.
                    idx_range_in_file.end = region_tail_idx_in_file;
                }
                let pages = idx_range_in_file.end - idx_range_in_file.start;
                let page_addr = page_idx_to_addr(
                    idx_range_in_file.start - region.base_page_idx_in_file + region.head_page_idx,
                );
                let slice = file.get_slice(idx_range_in_file.clone())?;
                // TODO(kawasin): Unlock regions so that page fault handling can proceed on the
                // main thread. We also need to handle the EEXIST error from UFFD_COPY.
                uffd_copy_all(uffd, page_addr, slice, false)?;
                // Do not erase each chunk of pages from disk on swap_in. The whole file will be
                // truncated when swap_in is completed. Even if swap_in is aborted, the remaining
                // disk contents help the trimming optimization on swap_out.
                let munlocked_pages = file.clear_range(idx_range_in_file)?;
                region.swap_in_pages += pages;
                ctx.mlock_budget_pages += munlocked_pages;
                return Ok(pages);
            }
            // The file has remaining pages, but the regions have been consumed.
            return Err(Error::File(FileError::OutOfRange));
        }

        Ok(0)
    }
}

impl Drop for SwapInContext<'_> {
    fn drop(&mut self) {
        let mut ctx = self.ctx.lock();
        if let Err(e) = ctx.file.clear_mlock() {
            panic!("failed to clear mlock: {:?}", e);
        }
        ctx.mlock_budget_pages = bytes_to_pages(MLOCK_BUDGET);
    }
}

/// Context for trim operation.
///
/// This drops 2 types of pages in the staging memory to reduce disk write.
///
/// * Clean pages
///   * The pages which have been swapped out to the disk and have not been changed.
///   * Drop the pages in the staging memory and mark it as present on the swap file.
/// * Zero pages
///   * Drop the pages in the staging memory. The pages will be UFFD_ZEROed on page fault.
pub struct TrimContext<'a> {
    ctx: &'a Mutex<PageHandleContext<'a>>,
    cur_region: usize,
    cur_page: usize,
    /// The page idx range of pages which have been stored in the swap file.
    next_data_in_file: Range<usize>,
    clean_pages: usize,
    zero_pages: usize,
}

impl TrimContext<'_> {
    /// Trim pages in the staging memory.
    ///
    /// This returns the number of pages trimmed. This returns `None` if it has traversed all pages
    /// in the staging memory.
    ///
    /// # Arguments
    ///
    /// `max_pages` - The maximum number of pages to be compared.
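    ///
    /// # Example
    ///
    /// A hedged sketch of a trim pass before swapping out (not compiled as a doctest; the batch of
    /// 1024 pages per call is an arbitrary value chosen for illustration):
    ///
    /// ```ignore
    /// let mut trim_ctx = page_handler.start_trim();
    /// while let Some(trimmed) = trim_ctx.trim_pages(1024)? {
    ///     // `trimmed` pages were dropped from the staging memory in this batch.
    /// }
    /// ```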
    pub fn trim_pages(&mut self, max_pages: usize) -> anyhow::Result<Option<usize>> {
        let mut ctx = self.ctx.lock();
        if self.cur_region >= ctx.regions.len() {
            return Ok(None);
        }
        let PageHandleContext { regions, file, .. } = &mut *ctx;
        let region = &mut regions[self.cur_region];
        let mut n_trimmed = 0;

        for _ in 0..max_pages {
            if let Some(slice_in_staging) = region
                .staging_memory
                .page_content(self.cur_page)
                .context("get page of staging memory")?
            {
                let idx_range = self.cur_page..self.cur_page + 1;
                let idx_in_file = idx_range.start + region.base_page_idx_in_file;

                // Check for a zero page on the staging memory first. If the page is non-zero and
                // has not been changed, zero checking is useless, but it costs less than the file
                // I/O for the pages which were in the swap file and are now zero.
                // Check the 2 types of page in the same loop to utilize the CPU cache for the
                // staging memory.
                if slice_in_staging.is_all_zero() {
                    region
                        .staging_memory
                        .clear_range(idx_range.clone())
                        .context("clear a page in staging memory")?;
                    // The page is on the swap file as well.
                    let munlocked_pages = file
                        .free_range(idx_in_file..idx_in_file + 1)
                        .context("clear a page in swap file")?;
                    if munlocked_pages != 0 {
                        // Only either of swap-in or trimming runs at the same time. This is not
                        // an expected path. Just log an error because leaking mlock_budget_pages
                        // is not fatal.
                        error!("pages are mlock(2)ed while trimming");
                    }
                    n_trimmed += 1;
                    self.zero_pages += 1;
                } else if let Some(slice_in_file) = file.page_content(idx_in_file, true)? {
                    // Compare the page with the previous content of the page on the disk.
                    if slice_in_staging == slice_in_file {
                        region
                            .staging_memory
                            .clear_range(idx_range.clone())
                            .context("clear a page in staging memory")?;
                        file.mark_as_present(idx_in_file)?;
                        n_trimmed += 1;
                        self.clean_pages += 1;
                    }
                }
            }

            self.cur_page += 1;
            if self.cur_page >= region.num_pages {
                self.cur_region += 1;
                self.cur_page = 0;
                self.next_data_in_file = 0..0;
                break;
            }
        }

        Ok(Some(n_trimmed))
    }

    /// Total trimmed clean pages.
    pub fn trimmed_clean_pages(&self) -> usize {
        self.clean_pages
    }

    /// Total trimmed zero pages.
    pub fn trimmed_zero_pages(&self) -> usize {
        self.zero_pages
    }
}