xref: /aosp_15_r20/external/crosvm/swap/src/page_handler.rs (revision bb4ee6a4ae7042d18b07a98463b9c8b875e44b39)
1 // Copyright 2022 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 //! PageHandler manages the page states of multiple regions.
6 
7 #![deny(missing_docs)]
8 
9 use std::fs::File;
10 use std::mem;
11 use std::ops::Range;
12 use std::sync::Arc;
13 
14 use anyhow::Context;
15 use base::error;
16 use base::linux::FileDataIterator;
17 use base::AsRawDescriptor;
18 use base::SharedMemory;
19 use base::VolatileSlice;
20 use sync::Mutex;
21 use thiserror::Error as ThisError;
22 
23 use crate::file::Error as FileError;
24 use crate::file::SwapFile;
25 use crate::pagesize::addr_to_page_idx;
26 use crate::pagesize::bytes_to_pages;
27 use crate::pagesize::is_hugepage_aligned;
28 use crate::pagesize::is_page_aligned;
29 use crate::pagesize::page_base_addr;
30 use crate::pagesize::page_idx_to_addr;
31 use crate::pagesize::pages_to_bytes;
32 use crate::pagesize::round_up_hugepage_size;
33 use crate::pagesize::THP_SIZE;
34 use crate::staging::CopyOp;
35 use crate::staging::Error as StagingError;
36 use crate::staging::StagingMemory;
37 use crate::userfaultfd::Error as UffdError;
38 use crate::userfaultfd::Userfaultfd;
39 use crate::worker::Channel;
40 use crate::worker::Task;
41 use crate::SwapMetrics;
42 
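// MLOCK_BUDGET caps how much of the swap file may be mlock(2)ed at once for async prefetching
// during swap-in; the remaining budget is tracked in pages as `mlock_budget_pages`, refilled as
// pages are munlocked, and reset to the full budget when a SwapInContext is dropped.
// PREFETCH_THRESHOLD keeps prefetching from being triggered for tiny leftovers: MADV_WILLNEED is
// only issued while more than 4MB of the budget remains, so the kernel gets reasonably large
// consecutive ranges to populate.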
43 pub(crate) const MLOCK_BUDGET: usize = 16 * 1024 * 1024; // = 16MB
44 const PREFETCH_THRESHOLD: usize = 4 * 1024 * 1024; // = 4MB
45 
46 /// Result for PageHandler
47 pub type Result<T> = std::result::Result<T, Error>;
48 
49 /// Errors for PageHandler
50 #[derive(ThisError, Debug)]
51 pub enum Error {
52     #[error("the address is invalid {0:#018X}")]
53     /// the address is invalid
54     InvalidAddress(usize),
55     #[error("the regions {0:?} and {1:?} overlap")]
56     /// regions overlap when registering
57     RegionOverlap(Range<usize>, Range<usize>),
58     #[error("failed to create page handler {0:?}")]
59     /// failed to create page handler
60     CreateFailed(anyhow::Error),
61     #[error("file operation failed : {0:?}")]
62     /// file operation failed
63     File(#[from] FileError),
64     #[error("staging operation failed : {0:?}")]
65     /// staging operation failed
66     Staging(#[from] StagingError),
67     #[error("userfaultfd failed : {0:?}")]
68     /// userfaultfd operation failed
69     Userfaultfd(#[from] UffdError),
70     #[error("failed to iterate data ranges: {0:?}")]
71     /// FileDataIterator failed
72     FileDataIterator(#[from] base::Error),
73 }
74 
75 /// Remove the memory range on the guest memory.
76 ///
77 /// This is an alternative to [vm_memory::GuestMemory::remove_range()] when working with host
78 /// addresses instead of guest addresses.
79 ///
80 /// # Safety
81 ///
82 /// The memory range must be on the guest memory.
83 #[deny(unsafe_op_in_unsafe_fn)]
84 unsafe fn remove_memory(addr: usize, len: usize) -> std::result::Result<(), base::Error> {
85     // SAFETY:
86     // Safe because the caller guarantees addr is in guest memory, so this does not affect any rust
87     // managed memory.
88     let ret = unsafe { libc::madvise(addr as *mut libc::c_void, len, libc::MADV_REMOVE) };
89     if ret < 0 {
90         base::errno_result()
91     } else {
92         Ok(())
93     }
94 }
95 
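// Copy the whole `data_slice` into the faulting process's address space, retrying partial copies:
// `Userfaultfd::copy()` may install only a prefix of the requested range, in which case the loop
// advances past the copied bytes and retries from the first missing page.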
96 fn uffd_copy_all(
97     uffd: &Userfaultfd,
98     mut page_addr: usize,
99     mut data_slice: VolatileSlice,
100     wake: bool,
101 ) -> std::result::Result<(), UffdError> {
102     loop {
103         let result = uffd.copy(page_addr, data_slice.size(), data_slice.as_ptr(), wake);
104         match result {
105             Err(UffdError::PartiallyCopied(copied)) => {
106                 page_addr += copied;
107                 data_slice.advance(copied);
108             }
109             other => {
110                 // Even EEXIST from the copy operation should be treated as an error for page
111                 // fault handling. If the page had been swapped in before, it should have been
112                 // cleared from the swap file and `Userfaultfd::zero()` used instead.
113                 return other.map(|_| ());
114             }
115         }
116     }
117 }
118 
119 /// [Region] represents a memory region and corresponding [SwapFile].
120 struct Region {
121     /// the head page index of the region.
122     head_page_idx: usize,
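    /// the index of the first page of this region within the swap file (regions are laid out
    /// back-to-back in the file in registration order).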
123     base_page_idx_in_file: usize,
124     num_pages: usize,
125     staging_memory: StagingMemory,
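    // Per-region counters reported through `PageHandler::load_metrics()`; `move_to_staging()`
    // resets them each time the region is moved to the staging memory.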
126     copied_from_file_pages: usize,
127     copied_from_staging_pages: usize,
128     zeroed_pages: usize,
129     swap_in_pages: usize,
130     /// the number of pages which were already initialized on page faults.
131     redundant_pages: usize,
132 }
133 
134 /// MoveToStaging copies chunks of consecutive pages next to each other on the guest memory to the
135 /// staging memory and removes the chunks from the guest memory.
136 pub struct MoveToStaging {
137     remove_area: Range<usize>,
138     copies: Vec<CopyOp>,
139 }
140 
141 impl Task for MoveToStaging {
142     fn execute(self) {
143         for copy_op in self.copies {
144             copy_op.execute();
145         }
146         // Remove chunks of pages at once to reduce madvise(2) syscalls.
147         // SAFETY:
148         // Safe because the region is already backed by the file and the content will be
149         // swapped in on a page fault.
150         let result = unsafe {
151             remove_memory(
152                 self.remove_area.start,
153                 self.remove_area.end - self.remove_area.start,
154             )
155         };
156         if let Err(e) = result {
157             panic!("failed to remove memory: {:?}", e);
158         }
159     }
160 }
161 
162 struct PageHandleContext<'a> {
163     file: SwapFile<'a>,
164     regions: Vec<Region>,
165     mlock_budget_pages: usize,
166 }
167 
168 /// PageHandler manages the page states of multiple regions.
169 ///
170 /// Handles multiple events derived from userfaultfd and swap out requests.
171 /// All the addresses and sizes in bytes are converted to page indices internally.
172 pub struct PageHandler<'a> {
173     ctx: Mutex<PageHandleContext<'a>>,
174     channel: Arc<Channel<MoveToStaging>>,
175 }
176 
177 impl<'a> PageHandler<'a> {
178     /// Creates [PageHandler] for the given regions.
179     ///
180     /// If any of the regions overlap, this returns [Error::RegionOverlap].
181     ///
182     /// # Arguments
183     ///
184     /// * `swap_file` - The swap file.
185     /// * `staging_shmem` - The staging memory. It must be large enough to hold the guest memory;
186     ///   otherwise the monitor process crashes when creating the mmap.
187     /// * `address_ranges` - The list of address ranges of the regions. The start address must be
188     ///   page-aligned and the size must be a multiple of the page size.
189     pub fn create(
190         swap_file: &'a File,
191         staging_shmem: &'a SharedMemory,
192         address_ranges: &[Range<usize>],
193         staging_move_context: Arc<Channel<MoveToStaging>>,
194     ) -> Result<Self> {
195         // Truncate the file to the size needed to hold all regions, otherwise access beyond the
196         // end of the file may cause SIGBUS.
197         swap_file
198             .set_len(
199                 address_ranges
200                     .iter()
201                     .map(|r| (r.end.saturating_sub(r.start)) as u64)
202                     .sum(),
203             )
204             .context("truncate swap file")
205             .map_err(Error::CreateFailed)?;
206 
207         let mut regions: Vec<Region> = Vec::new();
208         let mut offset_pages = 0;
209         for address_range in address_ranges {
210             let head_page_idx = addr_to_page_idx(address_range.start);
211             if address_range.end < address_range.start {
212                 return Err(Error::CreateFailed(anyhow::anyhow!(
213                     "invalid region end < start"
214                 )));
215             }
216             let region_size = address_range.end - address_range.start;
217             let num_pages = bytes_to_pages(region_size);
218 
219             // Find an overlapping region
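            // Two half-open page ranges [a, a+n) and [b, b+m) overlap iff a < b+m && b < a+n. The
            // branch below encodes this: when the existing region starts first (a < b) it overlaps
            // only if it extends past b; otherwise it overlaps only if it starts before b+m.
            // E.g. an existing region of pages [100, 200) and a new range [150, 250) overlap
            // because 100 < 250 and 150 < 200.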
220             match regions.iter().position(|region| {
221                 if region.head_page_idx < head_page_idx {
222                     region.head_page_idx + region.num_pages > head_page_idx
223                 } else {
224                     region.head_page_idx < head_page_idx + num_pages
225                 }
226             }) {
227                 Some(i) => {
228                     let region = &regions[i];
229 
230                     return Err(Error::RegionOverlap(
231                         address_range.clone(),
232                         page_idx_to_addr(region.head_page_idx)
233                             ..(page_idx_to_addr(region.head_page_idx + region.num_pages)),
234                     ));
235                 }
236                 None => {
237                     let base_addr = address_range.start;
238                     assert!(is_page_aligned(base_addr));
239                     assert!(is_page_aligned(region_size));
240 
241                     let staging_memory = StagingMemory::new(
242                         staging_shmem,
243                         pages_to_bytes(offset_pages) as u64,
244                         num_pages,
245                     )?;
246                     regions.push(Region {
247                         head_page_idx,
248                         base_page_idx_in_file: offset_pages,
249                         num_pages,
250                         staging_memory,
251                         copied_from_file_pages: 0,
252                         copied_from_staging_pages: 0,
253                         zeroed_pages: 0,
254                         swap_in_pages: 0,
255                         redundant_pages: 0,
256                     });
257                     offset_pages += num_pages;
258                 }
259             }
260         }
261 
262         let file = SwapFile::new(swap_file, offset_pages)?;
263 
264         Ok(Self {
265             ctx: Mutex::new(PageHandleContext {
266                 file,
267                 regions,
268                 mlock_budget_pages: bytes_to_pages(MLOCK_BUDGET),
269             }),
270             channel: staging_move_context,
271         })
272     }
273 
274     fn find_region(regions: &mut [Region], page_idx: usize) -> Option<&mut Region> {
275         // Sequentially search for the corresponding region in the list. This should be fast
276         // enough because there are only a few regions (usually just 1).
277         regions.iter_mut().find(|region| {
278             region.head_page_idx <= page_idx && page_idx < region.head_page_idx + region.num_pages
279         })
280     }
281 
282     /// Fills the faulted page with zeros if the page is not initialized, or with its content from
283     /// the staging memory or the swap file if the page has been swapped out.
284     ///
285     /// # Arguments
286     ///
287     /// * `uffd` - the reference to the [Userfaultfd] for the faulting process.
288     /// * `address` - the address that triggered the page fault.
289     pub fn handle_page_fault(&self, uffd: &Userfaultfd, address: usize) -> Result<()> {
290         let page_idx = addr_to_page_idx(address);
291         // the head address of the page.
292         let page_addr = page_base_addr(address);
293         let page_size = pages_to_bytes(1);
294         let mut ctx = self.ctx.lock();
295         let PageHandleContext { regions, file, .. } = &mut *ctx;
296         let region = Self::find_region(regions, page_idx).ok_or(Error::InvalidAddress(address))?;
297 
298         let idx_in_region = page_idx - region.head_page_idx;
299         let idx_in_file = idx_in_region + region.base_page_idx_in_file;
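        // Resolve the fault in priority order: the staging memory (which holds the most recent
        // content when the page was moved out but not yet written to the file), then the swap
        // file, and finally a zero page when neither holds content for this page.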
300         if let Some(page_slice) = region.staging_memory.page_content(idx_in_region)? {
301             uffd_copy_all(uffd, page_addr, page_slice, true)?;
302             // TODO(b/265758094): optimize clear operation.
303             region
304                 .staging_memory
305                 .clear_range(idx_in_region..idx_in_region + 1)?;
306             region.copied_from_staging_pages += 1;
307             Ok(())
308         } else if let Some(page_slice) = file.page_content(idx_in_file, false)? {
309             // TODO(kawasin): Unlock regions so the swap-in operation can proceed in the background.
310             uffd_copy_all(uffd, page_addr, page_slice, true)?;
311             // TODO(b/265758094): optimize clear operation.
312             // Do not erase the page from the disk for trimming optimization on next swap out.
313             let munlocked_pages = file.clear_range(idx_in_file..idx_in_file + 1)?;
314             region.copied_from_file_pages += 1;
315             ctx.mlock_budget_pages += munlocked_pages;
316             Ok(())
317         } else {
318             // Map a zero page since no swap file has been created yet but the fault
319             // happened.
320             // Safe because the faulting page was notified by uffd.
321             let result = uffd.zero(page_addr, page_size, true);
322             match result {
323                 Ok(_) => {
324                     region.zeroed_pages += 1;
325                     Ok(())
326                 }
327                 Err(UffdError::PageExist) => {
328                     // This case can happen if page faults on the same page occur in different
329                     // processes.
330                     uffd.wake(page_addr, page_size)?;
331                     region.redundant_pages += 1;
332                     Ok(())
333                 }
334                 Err(e) => Err(e.into()),
335             }
336         }
337     }
338 
339     /// Clear the internal state for the pages.
340     ///
341     /// When pages are removed by madvise with `MADV_DONTNEED` or `MADV_REMOVE`, userfaultfd
342     /// notifies the event as `UFFD_EVENT_REMOVE`. This handles the remove event.
343     ///
344     /// In crosvm, the balloon frees guest memory and causes `UFFD_EVENT_REMOVE`.
345     ///
346     /// # Arguments
347     ///
348     /// * `start_addr` - the head address of the memory area to be freed.
349     /// * `end_addr` - the end address of the memory area to be freed. `UFFD_EVENT_REMOVE` reports
350     ///   the head address of the memory area following the freed area (i.e. the last byte of the
351     ///   freed area is at `end_addr - 1`).
352     pub fn handle_page_remove(&self, start_addr: usize, end_addr: usize) -> Result<()> {
353         if !is_page_aligned(start_addr) {
354             return Err(Error::InvalidAddress(start_addr));
355         } else if !is_page_aligned(end_addr) {
356             return Err(Error::InvalidAddress(end_addr));
357         }
358         let start_page_idx = addr_to_page_idx(start_addr);
359         let last_page_idx = addr_to_page_idx(end_addr);
360         let mut ctx = self.ctx.lock();
361         // TODO(b/269983521): Clear multiple pages in the same region at once.
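        // Forget each removed page in both the staging memory and the swap file so that a later
        // fault on the same address is resolved with a fresh zero page.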
362         for page_idx in start_page_idx..(last_page_idx) {
363             let page_addr = page_idx_to_addr(page_idx);
364             // TODO(kawasin): Cache the position if the range does not span multiple regions.
365             let region = Self::find_region(&mut ctx.regions, page_idx)
366                 .ok_or(Error::InvalidAddress(page_addr))?;
367             let idx_in_region = page_idx - region.head_page_idx;
368             let idx_range = idx_in_region..idx_in_region + 1;
369             if let Err(e) = region.staging_memory.clear_range(idx_range) {
370                 error!("failed to clear removed page from staging: {:?}", e);
371             }
372             let idx_in_file = idx_in_region + region.base_page_idx_in_file;
373             let idx_range = idx_in_file..idx_in_file + 1;
374             // Erase the pages from the disk because the pages are removed from the guest memory.
375             let munlocked_pages = ctx.file.free_range(idx_range)?;
376             ctx.mlock_budget_pages += munlocked_pages;
377         }
378         Ok(())
379     }
380 
381     /// Move active pages in the memory region to the staging memory.
382     ///
383     /// It only moves active contents in the guest memory to the staging memory and skips empty
384     /// pages (e.g. pages never touched or freed by the balloon) using `lseek(2)` + `SEEK_HOLE/DATA`.
385     ///
386     /// Returns the count of moved out pages.
387     ///
388     /// # Arguments
389     ///
390     /// * `base_addr` - the head address of the memory region.
391     /// * `memfd` - the file descriptor of the memfd backing the guest memory region.
392     /// * `base_offset` - the offset of the memory region in the memfd.
393     ///
394     /// # Safety
395     ///
396     /// The region must have been registered to all userfaultfd of processes which may touch the
397     /// region.
398     ///
399     /// The memory must be protected not to be updated while moving.
400     ///
401     /// The page fault events for the region from the userfaultfd must be handled by
402     /// [Self::handle_page_fault()].
403     ///
404     /// Must call [Channel::wait_complete()] to wait for all the copy operations to complete within
405     /// the memory protection period.
406     #[deny(unsafe_op_in_unsafe_fn)]
407     pub unsafe fn move_to_staging<T>(
408         &self,
409         base_addr: usize,
410         memfd: &T,
411         base_offset: u64,
412     ) -> Result<usize>
413     where
414         T: AsRawDescriptor,
415     {
416         let hugepage_size = *THP_SIZE;
417         let mut ctx = self.ctx.lock();
418         let region = Self::find_region(&mut ctx.regions, addr_to_page_idx(base_addr))
419             .ok_or(Error::InvalidAddress(base_addr))?;
420 
421         if page_idx_to_addr(region.head_page_idx) != base_addr {
422             return Err(Error::InvalidAddress(base_addr));
423         }
424         let region_size = pages_to_bytes(region.num_pages);
425         let mut file_data = FileDataIterator::new(memfd, base_offset, region_size as u64);
426         let mut moved_size = 0;
427         let mut copies = Vec::new();
428         let mut remaining_batch_size = hugepage_size;
429         let mut batch_head_offset = 0;
430         let mut cur_data = None;
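        // Walk the present (non-hole) ranges of the memfd reported by lseek(2) SEEK_DATA/SEEK_HOLE
        // and copy them into the staging memory. Each batch accumulates at most a huge page worth
        // of data before its pages are MADV_REMOVEd in a single MoveToStaging task, and chunks that
        // could contain a whole huge page are split at huge-page boundaries; `cur_data` carries the
        // unprocessed tail of a split chunk into the next iteration to avoid an extra lseek(2).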
431         while let Some(data_range) = cur_data
432             .take()
433             .map(Ok)
434             .or_else(|| file_data.next())
435             .transpose()
436             .map_err(Error::FileDataIterator)?
437         {
438             // Assert offset is page aligned
439             let offset = (data_range.start - base_offset) as usize;
440             assert!(is_page_aligned(offset));
441 
442             // The chunk size must be within usize since the chunk is within the guest memory.
443             let chunk_size = (data_range.end - data_range.start) as usize;
444             let data_range = if chunk_size > remaining_batch_size {
445                 // Split the chunk if it is bigger than remaining_batch_size.
446 
447                 let split_size = if chunk_size >= hugepage_size {
448                     // If the chunk size is bigger than or equal to the huge page size, the chunk
449                     // may contain a huge page. If we MADV_REMOVE a huge page partially, it can
450                     // cause inconsistency between the actual page table and vmm-swap internal state.
451                     let chunk_addr = base_addr + offset;
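                    // e.g. with 2MiB THP, a chunk starting 1MiB below a huge page boundary is split
                    // no later than that boundary (split_size is at most 1MiB here), so any huge
                    // page that follows is reached with a huge-page-aligned chunk address.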
452                     if !is_hugepage_aligned(chunk_addr) {
453                         // Split the chunk right before where a huge page could start.
454                         std::cmp::min(
455                             round_up_hugepage_size(chunk_addr) - chunk_addr,
456                             remaining_batch_size,
457                         )
458                     } else {
459                         if remaining_batch_size < hugepage_size {
460                             // Remove the batch since it does not have enough room for a huge page.
461                             self.channel.push(MoveToStaging {
462                                 remove_area: base_addr + batch_head_offset..base_addr + offset,
463                                 copies: mem::take(&mut copies),
464                             });
465                             remaining_batch_size = hugepage_size;
466                             batch_head_offset = offset;
467                         }
468                         hugepage_size
469                     }
470                 } else {
471                     remaining_batch_size
472                 };
473                 // Cache the rest of the split chunk to avoid a useless lseek(2) syscall.
474                 cur_data = Some(data_range.start + split_size as u64..data_range.end);
475                 data_range.start..data_range.start + split_size as u64
476             } else {
477                 data_range
478             };
479 
480             let size = (data_range.end - data_range.start) as usize;
481             assert!(is_page_aligned(size));
482 
483             // SAFETY:
484             // Safe because:
485             // * src_addr is aligned with page size
486             // * the data_range starting from src_addr is on the guest memory.
487             let copy_op = unsafe {
488                 region.staging_memory.copy(
489                     (base_addr + offset) as *const u8,
490                     bytes_to_pages(offset),
491                     bytes_to_pages(size),
492                 )?
493             };
494             copies.push(copy_op);
495 
496             moved_size += size;
497             // The size must be smaller than or equal to remaining_batch_size.
498             remaining_batch_size -= size;
499 
500             if remaining_batch_size == 0 {
501                 // Remove the batch of pages at once to reduce madvise(2) syscalls.
502                 self.channel.push(MoveToStaging {
503                     remove_area: base_addr + batch_head_offset..base_addr + offset + size,
504                     copies: mem::take(&mut copies),
505                 });
506                 remaining_batch_size = hugepage_size;
507                 batch_head_offset = offset + size;
508             }
509         }
510         // Remove the final batch of pages.
511         self.channel.push(MoveToStaging {
512             remove_area: base_addr + batch_head_offset..base_addr + region_size,
513             copies,
514         });
515 
516         region.copied_from_file_pages = 0;
517         region.copied_from_staging_pages = 0;
518         region.zeroed_pages = 0;
519         region.swap_in_pages = 0;
520         region.redundant_pages = 0;
521 
522         Ok(bytes_to_pages(moved_size))
523     }
524 
525     /// Write a chunk of consecutive pages in the staging memory to the swap file.
526     ///
527     /// If there are no active pages in the staging memory, this returns `Ok(0)`.
528     ///
529     /// The pages in guest memory have been moved to staging memory by [Self::move_to_staging()].
530     ///
531     /// Returns the count of swapped out pages.
532     ///
533     /// Even if swap_out fails on any internal step, it does not break the page state management
534     /// and `PageHandler` can continue working with a few pages leaked in the staging memory or the
535     /// swap file. The leaked pages are removed when vmm-swap is disabled and `PageHandler` is dropped.
536     ///
537     /// # Arguments
538     ///
539     /// * `max_size` - the upper limit of the chunk size to write into the swap file at once. The
540     ///   chunk is split if it is bigger than `max_size`.
541     pub fn swap_out(&self, max_size: usize) -> Result<usize> {
542         let max_pages = bytes_to_pages(max_size);
543         let mut ctx = self.ctx.lock();
544         let PageHandleContext { regions, file, .. } = &mut *ctx;
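        // Write at most one chunk per call: find the first present range (up to `max_pages`) in any
        // region's staging memory, persist it to the corresponding offset in the swap file, then
        // drop it from the staging memory. Callers invoke this repeatedly; it returns Ok(0) once
        // nothing is left in the staging memory.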
545         for region in regions.iter_mut() {
546             if let Some(idx_range) = region.staging_memory.first_data_range(max_pages) {
547                 let idx_range_in_file = idx_range.start + region.base_page_idx_in_file
548                     ..idx_range.end + region.base_page_idx_in_file;
549                 let pages = idx_range.end - idx_range.start;
550                 let slice = region.staging_memory.get_slice(idx_range.clone())?;
551                 // Convert VolatileSlice to &[u8]
552                 // SAFETY:
553                 // Safe because the range of volatile slice is already validated.
554                 let slice = unsafe { std::slice::from_raw_parts(slice.as_ptr(), slice.size()) };
555                 file.write_to_file(idx_range_in_file.start, slice)?;
556                 // TODO(kawasin): clear state_list on each write and MADV_REMOVE several chunks at
557                 // once.
558                 region.staging_memory.clear_range(idx_range)?;
559                 // TODO(kawasin): free the page cache of the swap file.
560                 // TODO(kawasin): use writev() to swap_out several small chunks at once.
561                 return Ok(pages);
562             }
563         }
564         Ok(0)
565     }
566 
567     /// Create a new [SwapInContext].
568     pub fn start_swap_in(&'a self) -> SwapInContext<'a> {
569         SwapInContext {
570             ctx: &self.ctx,
571             cur_staging: 0,
572         }
573     }
574 
575     /// Create a new [TrimContext].
576     pub fn start_trim(&'a self) -> TrimContext<'a> {
577         TrimContext {
578             ctx: &self.ctx,
579             cur_page: 0,
580             cur_region: 0,
581             next_data_in_file: 0..0,
582             clean_pages: 0,
583             zero_pages: 0,
584         }
585     }
586 
587     /// Returns count of pages copied from vmm-swap file to the guest memory.
588     fn compute_copied_from_file_pages(&self) -> usize {
589         self.ctx
590             .lock()
591             .regions
592             .iter()
593             .map(|r| r.copied_from_file_pages)
594             .sum()
595     }
596 
597     /// Returns count of pages copied from staging memory to the guest memory.
598     fn compute_copied_from_staging_pages(&self) -> usize {
599         self.ctx
600             .lock()
601             .regions
602             .iter()
603             .map(|r| r.copied_from_staging_pages)
604             .sum()
605     }
606 
607     /// Returns count of pages initialized with zero.
608     fn compute_zeroed_pages(&self) -> usize {
609         self.ctx.lock().regions.iter().map(|r| r.zeroed_pages).sum()
610     }
611 
612     /// Returns count of pages which were already initialized on page faults.
613     fn compute_redundant_pages(&self) -> usize {
614         self.ctx
615             .lock()
616             .regions
617             .iter()
618             .map(|r| r.redundant_pages)
619             .sum()
620     }
621 
622     /// Returns count of pages present in the staging memory.
623     fn compute_staging_pages(&self) -> usize {
624         self.ctx
625             .lock()
626             .regions
627             .iter()
628             .map(|r| r.staging_memory.present_pages())
629             .sum()
630     }
631 
632     /// Returns count of pages present in the swap files.
633     fn compute_swap_pages(&self) -> usize {
634         self.ctx.lock().file.present_pages()
635     }
636 
637     /// Fill [SwapMetrics] with page handler metrics.
638     pub fn load_metrics(&self, metrics: &mut SwapMetrics) {
639         metrics.copied_from_file_pages = self.compute_copied_from_file_pages() as u64;
640         metrics.copied_from_staging_pages = self.compute_copied_from_staging_pages() as u64;
641         metrics.zeroed_pages = self.compute_zeroed_pages() as u64;
642         metrics.redundant_pages = self.compute_redundant_pages() as u64;
643         metrics.staging_pages = self.compute_staging_pages() as u64;
644         metrics.swap_pages = self.compute_swap_pages() as u64;
645     }
646 }
647 
648 /// Context for swap-in operation.
649 /// This holds cursors of indices into the regions for each step, as an optimization.
650 /// This holds cursor of indices in the regions for each step for optimization.
651 pub struct SwapInContext<'a> {
652     ctx: &'a Mutex<PageHandleContext<'a>>,
653     cur_staging: usize,
654 }
655 
656 impl SwapInContext<'_> {
657     /// Swap in a chunk of consecutive pages from the staging memory and the swap file.
658     ///
659     /// If there are no more pages present outside of the guest memory, this returns `Ok(0)`.
660     ///
661     /// Returns the count of swapped in pages.
662     ///
663     /// # Arguments
664     ///
665     /// * `uffd` - the main [Userfaultfd].
666     /// * `max_size` - the upper limit of the chunk size to swap into the guest memory at once. The
667     ///   chunk is split if it is bigger than `max_size`.
668     pub fn swap_in(&mut self, uffd: &Userfaultfd, max_size: usize) -> Result<usize> {
669         let mut ctx = self.ctx.lock();
670         // Request the kernel to pre-populate the present pages in the swap file into the page
671         // cache in the background. At most 16MB of pages will be populated.
672         // The threshold ensures MADV_WILLNEED is applied to a bigger chunk of pages. The kernel
673         // populates consecutive pages at once on MADV_WILLNEED.
674         if ctx.mlock_budget_pages > bytes_to_pages(PREFETCH_THRESHOLD) {
675             let mlock_budget_pages = ctx.mlock_budget_pages;
676             let locked_pages = ctx.file.lock_and_async_prefetch(mlock_budget_pages)?;
677             ctx.mlock_budget_pages -= locked_pages;
678         }
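        // The prefetch above mlock(2)s up to `mlock_budget_pages` of the swap file and asks the
        // kernel to read them in asynchronously; the budget is paid back whenever pages are cleared
        // from the file below (or in the page fault handler) and fully restored when this
        // SwapInContext is dropped.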
679 
680         let max_pages = bytes_to_pages(max_size);
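        // Drain the staging memory first: pages that were moved out of the guest but never written
        // to the swap file live only there, and `cur_staging` remembers which regions are already
        // empty so they are not rescanned on the next call.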
681         for region in ctx.regions[self.cur_staging..].iter_mut() {
682             // TODO(kawasin): swap_in multiple chunks less than max_size at once.
683             if let Some(idx_range) = region.staging_memory.first_data_range(max_pages) {
684                 let pages = idx_range.end - idx_range.start;
685                 let page_addr = page_idx_to_addr(region.head_page_idx + idx_range.start);
686                 let slice = region.staging_memory.get_slice(idx_range.clone())?;
687                 uffd_copy_all(uffd, page_addr, slice, false)?;
688                 // Clear the staging memory to avoid a memory spike.
689                 // TODO(kawasin): reduce the call count of MADV_REMOVE by removing several data
690                 // ranges at once.
691                 region.staging_memory.clear_range(idx_range)?;
692                 region.swap_in_pages += pages;
693                 return Ok(pages);
694             }
695             self.cur_staging += 1;
696         }
697 
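        // Once the staging memory is empty, swap in pages that live in the swap file. A consecutive
        // range in the file may span two regions, so it is clamped to the region that contains its
        // start, and the guest address is recovered as
        // head_page_idx + (idx_in_file - base_page_idx_in_file).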
698         if let Some(mut idx_range_in_file) = ctx.file.first_data_range(max_pages) {
699             let PageHandleContext { regions, file, .. } = &mut *ctx;
700             for region in regions.iter_mut() {
701                 let region_tail_idx_in_file = region.base_page_idx_in_file + region.num_pages;
702                 if idx_range_in_file.start >= region_tail_idx_in_file {
703                     continue;
704                 } else if idx_range_in_file.start < region.base_page_idx_in_file {
705                     return Err(Error::File(FileError::OutOfRange));
706                 } else if idx_range_in_file.end > region_tail_idx_in_file {
707                     // The consecutive pages can be across regions. Swap-in pages in a region at
708                     // once.
709                     idx_range_in_file.end = region_tail_idx_in_file;
710                 }
711                 let pages = idx_range_in_file.end - idx_range_in_file.start;
712                 let page_addr = page_idx_to_addr(
713                     idx_range_in_file.start - region.base_page_idx_in_file + region.head_page_idx,
714                 );
715                 let slice = file.get_slice(idx_range_in_file.clone())?;
716                 // TODO(kawasin): Unlock regions so that page fault handling can proceed on the
717                 //                main thread. We also need to handle the EEXIST error from UFFD_COPY.
718                 uffd_copy_all(uffd, page_addr, slice, false)?;
719                 // Do not erase each chunk of pages from disk on swap_in. The whole file will be
720                 // truncated when swap_in is completed. Even if swap_in is aborted, the remaining
721                 // disk contents help the trimming optimization on swap_out.
722                 let munlocked_pages = file.clear_range(idx_range_in_file)?;
723                 region.swap_in_pages += pages;
724                 ctx.mlock_budget_pages += munlocked_pages;
725                 return Ok(pages);
726             }
727             // The file has remaining pages, but all regions have been consumed.
728             return Err(Error::File(FileError::OutOfRange));
729         }
730 
731         Ok(0)
732     }
733 }
734 
735 impl Drop for SwapInContext<'_> {
736     fn drop(&mut self) {
737         let mut ctx = self.ctx.lock();
738         if let Err(e) = ctx.file.clear_mlock() {
739             panic!("failed to clear mlock: {:?}", e);
740         }
741         ctx.mlock_budget_pages = bytes_to_pages(MLOCK_BUDGET);
742     }
743 }
744 
745 /// Context for trim operation.
746 ///
747 /// This drops 2 types of pages in the staging memory to reduce disk writes.
748 ///
749 /// * Clean pages
750 ///   * The pages which have been swapped out to the disk and have not been changed.
751 ///   * Drop the pages in the staging memory and mark them as present in the swap file.
752 /// * Zero pages
753 ///   * Drop the pages in the staging memory. The pages will be UFFD_ZEROed on page fault.
754 pub struct TrimContext<'a> {
755     ctx: &'a Mutex<PageHandleContext<'a>>,
756     cur_region: usize,
757     cur_page: usize,
758     /// The page idx range of pages which have been stored in the swap file.
759     next_data_in_file: Range<usize>,
760     clean_pages: usize,
761     zero_pages: usize,
762 }
763 
764 impl TrimContext<'_> {
765     /// Trim pages in the staging memory.
766     ///
767     /// This returns the number of pages trimmed. This returns `None` once it has traversed all
768     /// pages in the staging memory.
769     ///
770     /// # Arguments
771     ///
772     /// `max_pages` - The maximum number of pages to compare.
773     pub fn trim_pages(&mut self, max_pages: usize) -> anyhow::Result<Option<usize>> {
774         let mut ctx = self.ctx.lock();
775         if self.cur_region >= ctx.regions.len() {
776             return Ok(None);
777         }
778         let PageHandleContext { regions, file, .. } = &mut *ctx;
779         let region = &mut regions[self.cur_region];
780         let mut n_trimmed = 0;
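        // Two kinds of staging pages can be dropped without losing data: pages that are now all
        // zero (freed from both the staging memory and the swap file, so a later fault is served by
        // a zero page) and pages identical to their last swapped-out copy (dropped from the staging
        // memory and marked present in the file, so swap_out does not rewrite them).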
781 
782         for _ in 0..max_pages {
783             if let Some(slice_in_staging) = region
784                 .staging_memory
785                 .page_content(self.cur_page)
786                 .context("get page of staging memory")?
787             {
788                 let idx_range = self.cur_page..self.cur_page + 1;
789                 let idx_in_file = idx_range.start + region.base_page_idx_in_file;
790 
791                 // Check for a zero page in the staging memory first. If the page is non-zero and
792                 // has not changed, the zero check is useless, but it costs less than the file I/O
793                 // for pages which were in the swap file and are now zero.
794                 // Check both types of page in the same loop to utilize the CPU cache for the staging memory.
795                 if slice_in_staging.is_all_zero() {
796                     region
797                         .staging_memory
798                         .clear_range(idx_range.clone())
799                         .context("clear a page in staging memory")?;
800                     // The page is on the swap file as well.
801                     let munlocked_pages = file
802                         .free_range(idx_in_file..idx_in_file + 1)
803                         .context("clear a page in swap file")?;
804                     if munlocked_pages != 0 {
805                         // Only one of swap-in or trimming runs at a time, so this is not an
806                         // expected path. Just log an error because leaking
807                         // mlock_budget_pages is not fatal.
808                         error!("pages are mlock(2)ed while trimming");
809                     }
810                     n_trimmed += 1;
811                     self.zero_pages += 1;
812                 } else if let Some(slice_in_file) = file.page_content(idx_in_file, true)? {
813                     // Compare the page with the previous content of the page on the disk.
814                     if slice_in_staging == slice_in_file {
815                         region
816                             .staging_memory
817                             .clear_range(idx_range.clone())
818                             .context("clear a page in staging memory")?;
819                         file.mark_as_present(idx_in_file)?;
820                         n_trimmed += 1;
821                         self.clean_pages += 1;
822                     }
823                 }
824             }
825 
826             self.cur_page += 1;
827             if self.cur_page >= region.num_pages {
828                 self.cur_region += 1;
829                 self.cur_page = 0;
830                 self.next_data_in_file = 0..0;
831                 break;
832             }
833         }
834 
835         Ok(Some(n_trimmed))
836     }
837 
838     /// Total trimmed clean pages.
trimmed_clean_pages(&self) -> usize839     pub fn trimmed_clean_pages(&self) -> usize {
840         self.clean_pages
841     }
842 
843     /// Total trimmed zero pages.
trimmed_zero_pages(&self) -> usize844     pub fn trimmed_zero_pages(&self) -> usize {
845         self.zero_pages
846     }
847 }
848