//! A Linux mechanism for handling page faults in user space.
//!
//! The main way to interact with this library is to create a `Uffd` object with a `UffdBuilder`,
//! then use the methods of `Uffd` from a worker thread.
//!
//! See [`userfaultfd(2)`](http://man7.org/linux/man-pages/man2/userfaultfd.2.html) and
//! [`ioctl_userfaultfd(2)`](http://man7.org/linux/man-pages/man2/ioctl_userfaultfd.2.html) for more
//! details.

mod builder;
mod error;
mod event;
mod raw;

pub use crate::builder::{FeatureFlags, UffdBuilder};
pub use crate::error::{Error, Result};
pub use crate::event::{Event, FaultKind, ReadWrite};

use bitflags::bitflags;
use libc::{self, c_void};
use nix::errno::Errno;
use nix::unistd::read;
use std::mem;
use std::os::fd::{AsFd, BorrowedFd};
use std::os::unix::io::{AsRawFd, FromRawFd, IntoRawFd, RawFd};

/// Represents an opaque buffer where userfaultfd events are stored.
///
/// This is used in conjunction with [`Uffd::read_events`].
pub struct EventBuffer(Vec<raw::uffd_msg>);

impl EventBuffer {
    /// Creates a new buffer for `size` number of events.
    ///
    /// [`Uffd::read_events`] will read up to this many events at a time.
    pub fn new(size: usize) -> Self {
        Self(vec![unsafe { mem::zeroed() }; size])
    }
}

/// The userfaultfd object.
///
/// The userspace representation of the object is a file descriptor, so this type implements
/// `AsRawFd`, `FromRawFd`, and `IntoRawFd`. These methods should be used with caution, but can be
/// essential for using functions like `poll` on a worker thread.
#[derive(Debug)]
pub struct Uffd {
    fd: RawFd,
}

impl Drop for Uffd {
    fn drop(&mut self) {
        unsafe { libc::close(self.fd) };
    }
}

impl AsFd for Uffd {
    fn as_fd(&self) -> BorrowedFd<'_> {
        unsafe { BorrowedFd::borrow_raw(self.as_raw_fd()) }
    }
}

impl AsRawFd for Uffd {
    fn as_raw_fd(&self) -> RawFd {
        self.fd
    }
}

impl IntoRawFd for Uffd {
    fn into_raw_fd(self) -> RawFd {
        let fd = self.fd;
        // Skip `Drop`, which would close the descriptor we are handing to the caller.
        mem::forget(self);
        fd
    }
}

impl FromRawFd for Uffd {
    unsafe fn from_raw_fd(fd: RawFd) -> Self {
        Uffd { fd }
    }
}

bitflags! {
    /// The registration mode used when registering an address range with `Uffd`.
    #[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
    pub struct RegisterMode: u64 {
        /// Registers the range for missing page faults.
        const MISSING = raw::UFFDIO_REGISTER_MODE_MISSING;
        /// Registers the range for write faults.
        #[cfg(feature = "linux5_7")]
        const WRITE_PROTECT = raw::UFFDIO_REGISTER_MODE_WP;
    }
}

impl Uffd {
    /// Register a memory address range with the userfaultfd object, and return the `IoctlFlags`
    /// that are available for the selected range.
    ///
    /// This method only registers the given range for missing page faults.
    pub fn register(&self, start: *mut c_void, len: usize) -> Result<IoctlFlags> {
        self.register_with_mode(start, len, RegisterMode::MISSING)
    }

    /// Register a memory address range with the userfaultfd object for the given mode, and
    /// return the `IoctlFlags` that are available for the selected range.
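    ///
    /// # Examples
    ///
    /// A minimal sketch of registering a page-aligned region for missing-page faults. The
    /// `register_region` helper, the `region` pointer, and `len` are illustrative only and are
    /// assumed to describe memory obtained from something like `mmap`.
    ///
    /// ```rust
    /// # use userfaultfd::{IoctlFlags, RegisterMode, Result, Uffd};
    /// use std::ffi::c_void;
    ///
    /// fn register_region(uffd: &Uffd, region: *mut c_void, len: usize) -> Result<IoctlFlags> {
    ///     // Ask for missing-page faults only; the returned flags describe which ioctls
    ///     // (e.g. `IoctlFlags::COPY`) are available on this range.
    ///     uffd.register_with_mode(region, len, RegisterMode::MISSING)
    /// }
    /// ```
    ///
    /// [`Uffd::register`] is shorthand for this call with `RegisterMode::MISSING`.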
    pub fn register_with_mode(
        &self,
        start: *mut c_void,
        len: usize,
        mode: RegisterMode,
    ) -> Result<IoctlFlags> {
        let mut register = raw::uffdio_register {
            range: raw::uffdio_range {
                start: start as u64,
                len: len as u64,
            },
            mode: mode.bits(),
            ioctls: 0,
        };
        unsafe {
            raw::register(self.as_raw_fd(), &mut register as *mut raw::uffdio_register)?;
        }
        IoctlFlags::from_bits(register.ioctls).ok_or(Error::UnrecognizedIoctls(register.ioctls))
    }

    /// Unregister a memory address range from the userfaultfd object.
    pub fn unregister(&self, start: *mut c_void, len: usize) -> Result<()> {
        let mut range = raw::uffdio_range {
            start: start as u64,
            len: len as u64,
        };
        unsafe {
            raw::unregister(self.as_raw_fd(), &mut range as *mut raw::uffdio_range)?;
        }
        Ok(())
    }

    /// Atomically copy a contiguous memory chunk into the userfaultfd-registered range, and
    /// return the number of bytes that were successfully copied.
    ///
    /// If `wake` is `true`, wake up the thread waiting for page fault resolution on the memory
    /// range.
    pub unsafe fn copy(
        &self,
        src: *const c_void,
        dst: *mut c_void,
        len: usize,
        wake: bool,
    ) -> Result<usize> {
        let mut copy = raw::uffdio_copy {
            src: src as u64,
            dst: dst as u64,
            len: len as u64,
            mode: if wake {
                0
            } else {
                raw::UFFDIO_COPY_MODE_DONTWAKE
            },
            copy: 0,
        };

        let _ =
            raw::copy(self.as_raw_fd(), &mut copy as *mut raw::uffdio_copy).map_err(|errno| {
                match errno {
                    Errno::EAGAIN => Error::PartiallyCopied(copy.copy as usize),
                    _ => Error::CopyFailed(errno),
                }
            })?;
        if copy.copy < 0 {
            // shouldn't ever get here, as errno should be caught above
            Err(Error::CopyFailed(Errno::from_i32(-copy.copy as i32)))
        } else {
            Ok(copy.copy as usize)
        }
    }

    /// Zero out a memory address range registered with userfaultfd, and return the number of bytes
    /// that were successfully zeroed.
    ///
    /// If `wake` is `true`, wake up the thread waiting for page fault resolution on the memory
    /// address range.
    pub unsafe fn zeropage(&self, start: *mut c_void, len: usize, wake: bool) -> Result<usize> {
        let mut zeropage = raw::uffdio_zeropage {
            range: raw::uffdio_range {
                start: start as u64,
                len: len as u64,
            },
            mode: if wake {
                0
            } else {
                raw::UFFDIO_ZEROPAGE_MODE_DONTWAKE
            },
            zeropage: 0,
        };

        let _ = raw::zeropage(self.as_raw_fd(), &mut zeropage as *mut raw::uffdio_zeropage)
            .map_err(Error::ZeropageFailed)?;
        if zeropage.zeropage < 0 {
            // shouldn't ever get here, as errno should be caught above
            Err(Error::ZeropageFailed(Errno::from_i32(
                -zeropage.zeropage as i32,
            )))
        } else {
            Ok(zeropage.zeropage as usize)
        }
    }

    /// Wake up the thread waiting for page fault resolution on the specified memory address range.
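    ///
    /// # Examples
    ///
    /// A minimal sketch of resolving a fault with a "don't wake" copy followed by an explicit
    /// wake. The `resolve_then_wake` helper and its `src`, `page`, and `len` arguments are
    /// illustrative only, and are assumed to describe a registered, page-aligned range.
    ///
    /// ```rust
    /// # use userfaultfd::{Result, Uffd};
    /// use std::ffi::c_void;
    ///
    /// fn resolve_then_wake(
    ///     uffd: &Uffd,
    ///     src: *const c_void,
    ///     page: *mut c_void,
    ///     len: usize,
    /// ) -> Result<()> {
    ///     // Fill the page without waking the faulting thread (`wake` is `false`)...
    ///     unsafe { uffd.copy(src, page, len, false)? };
    ///     // ...then wake it explicitly once the range is ready.
    ///     uffd.wake(page, len)
    /// }
    /// ```
    ///
    /// Passing `wake = true` to [`Uffd::copy`] or [`Uffd::zeropage`] is often sufficient; an
    /// explicit `wake` is mainly useful when several operations are performed before the
    /// faulting thread should be released.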
    pub fn wake(&self, start: *mut c_void, len: usize) -> Result<()> {
        let mut range = raw::uffdio_range {
            start: start as u64,
            len: len as u64,
        };
        unsafe {
            raw::wake(self.as_raw_fd(), &mut range as *mut raw::uffdio_range)?;
        }
        Ok(())
    }

    /// Makes a range write-protected.
    #[cfg(feature = "linux5_7")]
    pub fn write_protect(&self, start: *mut c_void, len: usize) -> Result<()> {
        let mut ioctl = raw::uffdio_writeprotect {
            range: raw::uffdio_range {
                start: start as u64,
                len: len as u64,
            },
            mode: raw::UFFDIO_WRITEPROTECT_MODE_WP,
        };

        unsafe {
            raw::write_protect(
                self.as_raw_fd(),
                &mut ioctl as *mut raw::uffdio_writeprotect,
            )?;
        }

        Ok(())
    }

    /// Removes the write-protection for a range.
    ///
    /// If `wake` is `true`, wake up the thread waiting for page fault resolution on the memory
    /// address range.
    #[cfg(feature = "linux5_7")]
    pub fn remove_write_protection(
        &self,
        start: *mut c_void,
        len: usize,
        wake: bool,
    ) -> Result<()> {
        let mut ioctl = raw::uffdio_writeprotect {
            range: raw::uffdio_range {
                start: start as u64,
                len: len as u64,
            },
            mode: if wake {
                0
            } else {
                raw::UFFDIO_WRITEPROTECT_MODE_DONTWAKE
            },
        };

        unsafe {
            raw::write_protect(
                self.as_raw_fd(),
                &mut ioctl as *mut raw::uffdio_writeprotect,
            )?;
        }

        Ok(())
    }

    /// Read an `Event` from the userfaultfd object.
    ///
    /// If the `Uffd` object was created with `non_blocking` set to `false`, this will block until
    /// an event is successfully read (returning `Some(event)`) or an error is returned.
    ///
    /// If `non_blocking` was `true`, this will immediately return `None` if no event is ready to
    /// read.
    ///
    /// Note that while this method doesn't require a mutable reference to the `Uffd` object, it
    /// does consume bytes (thread-safely) from the underlying file descriptor.
    ///
    /// # Examples
    ///
    /// ```rust
    /// # use userfaultfd::{Uffd, Result};
    /// fn read_event(uffd: &Uffd) -> Result<()> {
    ///     // Read a single event
    ///     match uffd.read_event()? {
    ///         Some(e) => {
    ///             // Do something with the event
    ///         },
    ///         None => {
    ///             // This was a non-blocking read and the descriptor was not ready for read
    ///         },
    ///     }
    ///     Ok(())
    /// }
    /// ```
    pub fn read_event(&self) -> Result<Option<Event>> {
        let mut buf = [unsafe { std::mem::zeroed() }; 1];
        let mut iter = self.read(&mut buf)?;
        let event = iter.next().transpose()?;
        assert!(iter.next().is_none());
        Ok(event)
    }

    /// Read multiple events from the userfaultfd object using the given event buffer.
    ///
    /// If the `Uffd` object was created with `non_blocking` set to `false`, this will block until
    /// an event is successfully read or an error is returned.
    ///
    /// If `non_blocking` was `true`, this will immediately return an empty iterator if the file
    /// descriptor is not ready for reading.
    ///
    /// # Examples
    ///
    /// ```rust
    /// # use userfaultfd::{Uffd, EventBuffer};
    /// fn read_events(uffd: &Uffd) -> userfaultfd::Result<()> {
    ///     // Read up to 100 events at a time
    ///     let mut buf = EventBuffer::new(100);
    ///     for event in uffd.read_events(&mut buf)? {
    ///         let event = event?;
    ///         // Do something with the event...
    ///     }
    ///     Ok(())
    /// }
    /// ```
    pub fn read_events<'a>(
        &self,
        buf: &'a mut EventBuffer,
    ) -> Result<impl Iterator<Item = Result<Event>> + 'a> {
        self.read(&mut buf.0)
    }

    fn read<'a>(
        &self,
        msgs: &'a mut [raw::uffd_msg],
    ) -> Result<impl Iterator<Item = Result<Event>> + 'a> {
        const MSG_SIZE: usize = std::mem::size_of::<raw::uffd_msg>();

        let buf = unsafe {
            std::slice::from_raw_parts_mut(msgs.as_mut_ptr() as _, msgs.len() * MSG_SIZE)
        };

        let count = match read(self.as_raw_fd(), buf) {
            Err(e) if e == Errno::EAGAIN => 0,
            Err(e) => return Err(Error::SystemError(e)),
            Ok(0) => return Err(Error::ReadEof),
            Ok(bytes_read) => {
                let remainder = bytes_read % MSG_SIZE;
                if remainder != 0 {
                    return Err(Error::IncompleteMsg {
                        read: remainder,
                        expected: MSG_SIZE,
                    });
                }

                bytes_read / MSG_SIZE
            }
        };

        Ok(msgs.iter().take(count).map(|msg| Event::from_uffd_msg(msg)))
    }
}

bitflags! {
    /// Used with `UffdBuilder` and `Uffd::register()` to determine which operations are available.
    #[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
    pub struct IoctlFlags: u64 {
        const REGISTER = 1 << raw::_UFFDIO_REGISTER;
        const UNREGISTER = 1 << raw::_UFFDIO_UNREGISTER;
        const WAKE = 1 << raw::_UFFDIO_WAKE;
        const COPY = 1 << raw::_UFFDIO_COPY;
        const ZEROPAGE = 1 << raw::_UFFDIO_ZEROPAGE;
        #[cfg(feature = "linux5_7")]
        const WRITE_PROTECT = 1 << raw::_UFFDIO_WRITEPROTECT;
        const API = 1 << raw::_UFFDIO_API;
    }
}

#[cfg(test)]
mod test {
    use super::*;
    use std::ptr;
    use std::thread;

    #[test]
    fn test_read_event() -> Result<()> {
        const PAGE_SIZE: usize = 4096;

        unsafe {
            let uffd = UffdBuilder::new().close_on_exec(true).create()?;

            let mapping = libc::mmap(
                ptr::null_mut(),
                PAGE_SIZE,
                libc::PROT_READ | libc::PROT_WRITE,
                libc::MAP_PRIVATE | libc::MAP_ANON,
                -1,
                0,
            );

            // `mmap` reports failure with `MAP_FAILED`, not a null pointer.
            assert_ne!(mapping, libc::MAP_FAILED);

            uffd.register(mapping, PAGE_SIZE)?;

            let ptr = mapping as usize;
            let thread = thread::spawn(move || {
                let ptr = ptr as *mut u8;
                *ptr = 1;
            });

            match uffd.read_event()? {
                Some(Event::Pagefault {
                    rw: ReadWrite::Write,
                    addr,
                    ..
                }) => {
                    assert_eq!(addr, mapping);
                    uffd.zeropage(addr, PAGE_SIZE, true)?;
                }
                _ => panic!("unexpected event"),
            }

            thread.join().expect("failed to join thread");

            uffd.unregister(mapping, PAGE_SIZE)?;

            assert_eq!(libc::munmap(mapping, PAGE_SIZE), 0);
        }

        Ok(())
    }

    #[test]
    fn test_nonblocking_read_event() -> Result<()> {
        const PAGE_SIZE: usize = 4096;

        unsafe {
            let uffd = UffdBuilder::new()
                .close_on_exec(true)
                .non_blocking(true)
                .create()?;

            let mapping = libc::mmap(
                ptr::null_mut(),
                PAGE_SIZE,
                libc::PROT_READ | libc::PROT_WRITE,
                libc::MAP_PRIVATE | libc::MAP_ANON,
                -1,
                0,
            );

            assert_ne!(mapping, libc::MAP_FAILED);

            uffd.register(mapping, PAGE_SIZE)?;

            assert!(uffd.read_event()?.is_none());

            let ptr = mapping as usize;
            let thread = thread::spawn(move || {
                let ptr = ptr as *mut u8;
                *ptr = 1;
            });

            loop {
                match uffd.read_event()? {
                    Some(Event::Pagefault {
                        rw: ReadWrite::Write,
                        addr,
                        ..
                    }) => {
                        assert_eq!(addr, mapping);
                        uffd.zeropage(addr, PAGE_SIZE, true)?;
                        break;
                    }
                    Some(_) => panic!("unexpected event"),
                    None => thread::sleep(std::time::Duration::from_millis(50)),
                }
            }

            thread.join().expect("failed to join thread");

            uffd.unregister(mapping, PAGE_SIZE)?;

            assert_eq!(libc::munmap(mapping, PAGE_SIZE), 0);
        }

        Ok(())
    }

    #[test]
    fn test_read_events() -> Result<()> {
        unsafe {
            const MAX_THREADS: usize = 5;
            const PAGE_SIZE: usize = 4096;
            const MEM_SIZE: usize = PAGE_SIZE * MAX_THREADS;

            let uffd = UffdBuilder::new().close_on_exec(true).create()?;

            let mapping = libc::mmap(
                ptr::null_mut(),
                MEM_SIZE,
                libc::PROT_READ | libc::PROT_WRITE,
                libc::MAP_PRIVATE | libc::MAP_ANON,
                -1,
                0,
            );

            assert_ne!(mapping, libc::MAP_FAILED);

            uffd.register(mapping, MEM_SIZE)?;

            // As accessing the memory will suspend each thread with a page fault event,
            // there is no way to signal that the operations the test thread is waiting on to
            // complete have been performed.
            //
            // Therefore, this is inherently racy. The best we can do is simply sleep-wait for
            // all threads to have signaled that the operation is *about to be performed*.
            let mut seen = [false; MAX_THREADS];
            let mut threads = Vec::new();
            for i in 0..MAX_THREADS {
                let seen = &mut seen[i] as *mut _ as usize;
                let ptr = (mapping as *mut u8).add(PAGE_SIZE * i) as usize;
                threads.push(thread::spawn(move || {
                    let seen = seen as *mut bool;
                    let ptr = ptr as *mut u8;
                    *seen = true;
                    *ptr = 1;
                }));
            }

            loop {
                // Sleep even if all threads have "signaled", just in case any
                // thread is preempted prior to faulting the memory access.
                // Still, there's no guarantee that the call to `read_events` below will
                // read all the events at once, but this should be "good enough".
                let done = seen.iter().all(|b| *b);
                thread::sleep(std::time::Duration::from_millis(50));
                if done {
                    break;
                }
            }

            // Read all the events at once
            let mut buf = EventBuffer::new(MAX_THREADS);
            let mut iter = uffd.read_events(&mut buf)?;

            let mut seen = [false; MAX_THREADS];
            for _ in 0..MAX_THREADS {
                match iter
                    .next()
                    .transpose()?
                    .expect("failed to read all events; potential race condition was hit")
                {
                    Event::Pagefault {
                        rw: ReadWrite::Write,
                        addr,
                        ..
                    } => {
                        let index = (addr as usize - mapping as usize) / PAGE_SIZE;
                        assert_eq!(seen[index], false);
                        seen[index] = true;
                        uffd.zeropage(addr, PAGE_SIZE, true)?;
                    }
                    _ => panic!("unexpected event"),
                }
            }

            assert!(seen.iter().all(|b| *b));

            for thread in threads {
                thread.join().expect("failed to join thread");
            }

            uffd.unregister(mapping, MEM_SIZE)?;

            assert_eq!(libc::munmap(mapping, MEM_SIZE), 0);
        }

        Ok(())
    }

    #[cfg(feature = "linux5_7")]
    #[test]
    fn test_write_protect() -> Result<()> {
        const PAGE_SIZE: usize = 4096;

        unsafe {
            let uffd = UffdBuilder::new()
                .require_features(FeatureFlags::PAGEFAULT_FLAG_WP)
                .close_on_exec(true)
                .create()?;

            let mapping = libc::mmap(
                ptr::null_mut(),
                PAGE_SIZE,
                libc::PROT_READ | libc::PROT_WRITE,
                libc::MAP_PRIVATE | libc::MAP_ANON,
                -1,
                0,
            );

            assert_ne!(mapping, libc::MAP_FAILED);

            // This test uses both missing and write-protect modes for a reason.
            // The `uffdio_writeprotect` ioctl can only be used on a range *after*
            // the missing fault is handled, it seems. This means we either need to
            // read/write the page *before* we protect it or handle the missing
            // page fault by changing the protection level *after* we zero the page.
            assert!(uffd
                .register_with_mode(
                    mapping,
                    PAGE_SIZE,
                    RegisterMode::MISSING | RegisterMode::WRITE_PROTECT
                )?
                .contains(IoctlFlags::WRITE_PROTECT));

            let ptr = mapping as usize;
            let thread = thread::spawn(move || {
                let ptr = ptr as *mut u8;
                *ptr = 1;
                *ptr = 2;
            });

            loop {
                match uffd.read_event()? {
                    Some(Event::Pagefault {
                        kind,
                        rw: ReadWrite::Write,
                        addr,
                        ..
                    }) => match kind {
                        FaultKind::WriteProtected => {
                            assert_eq!(addr, mapping);
                            assert_eq!(*(addr as *const u8), 0);
                            // Remove the protection and wake the page
                            uffd.remove_write_protection(mapping, PAGE_SIZE, true)?;
                            break;
                        }
                        FaultKind::Missing => {
                            assert_eq!(addr, mapping);
                            uffd.zeropage(mapping, PAGE_SIZE, false)?;

                            // Technically, we already know it was a write that triggered
                            // the missing page fault, so there's little point in immediately
                            // write-protecting the page to cause another fault; in the real
                            // world, a missing fault with `rw` being `ReadWrite::Write` would
                            // be enough to mark the page as "dirty". For this test, however,
                            // we do it this way to ensure a write-protected fault is read.
                            assert_eq!(*(addr as *const u8), 0);
                            uffd.write_protect(mapping, PAGE_SIZE)?;
                            uffd.wake(mapping, PAGE_SIZE)?;
                        }
                    },
                    _ => panic!("unexpected event"),
                }
            }

            thread.join().expect("failed to join thread");

            assert_eq!(*(mapping as *const u8), 2);

            uffd.unregister(mapping, PAGE_SIZE)?;

            assert_eq!(libc::munmap(mapping, PAGE_SIZE), 0);
        }

        Ok(())
    }
}