xref: /aosp_15_r20/external/crosvm/base/src/sys/linux/mod.rs (revision bb4ee6a4ae7042d18b07a98463b9c8b875e44b39)
1 // Copyright 2017 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 //! Small system utility modules for usage by other modules.
6 
7 #[cfg(target_os = "android")]
8 mod android;
9 #[cfg(target_os = "android")]
10 use android as target_os;
11 #[cfg(target_os = "linux")]
12 #[allow(clippy::module_inception)]
13 mod linux;
14 #[cfg(target_os = "linux")]
15 use linux as target_os;
16 use log::warn;
17 #[macro_use]
18 pub mod ioctl;
19 #[macro_use]
20 pub mod syslog;
21 mod acpi_event;
22 mod capabilities;
23 mod descriptor;
24 mod event;
25 mod file;
26 mod file_traits;
27 mod mmap;
28 mod net;
29 mod netlink;
30 mod notifiers;
31 pub mod platform_timer_resolution;
32 mod poll;
33 mod priority;
34 mod sched;
35 mod shm;
36 pub mod signal;
37 mod signalfd;
38 mod terminal;
39 mod timer;
40 pub mod vsock;
41 mod write_zeroes;
42 
43 use std::ffi::CString;
44 use std::fs::remove_file;
45 use std::fs::File;
46 use std::fs::OpenOptions;
47 use std::mem;
48 use std::mem::MaybeUninit;
49 use std::ops::Deref;
50 use std::os::unix::io::FromRawFd;
51 use std::os::unix::io::RawFd;
52 use std::os::unix::net::UnixDatagram;
53 use std::os::unix::net::UnixListener;
54 use std::os::unix::process::ExitStatusExt;
55 use std::path::Path;
56 use std::path::PathBuf;
57 use std::process::ExitStatus;
58 use std::ptr;
59 use std::time::Duration;
60 
61 pub use acpi_event::*;
62 pub use capabilities::drop_capabilities;
63 pub use event::EventExt;
64 pub(crate) use event::PlatformEvent;
65 pub use file::find_next_data;
66 pub use file::FileDataIterator;
67 pub(crate) use file_traits::lib::*;
68 pub use ioctl::*;
69 use libc::c_int;
70 use libc::c_long;
71 use libc::fcntl;
72 use libc::pipe2;
73 use libc::prctl;
74 use libc::syscall;
75 use libc::waitpid;
76 use libc::SYS_getpid;
77 use libc::SYS_getppid;
78 use libc::SYS_gettid;
79 use libc::EINVAL;
80 use libc::O_CLOEXEC;
81 use libc::PR_SET_NAME;
82 use libc::SIGKILL;
83 use libc::WNOHANG;
84 pub use mmap::*;
85 pub(in crate::sys) use net::sendmsg_nosignal as sendmsg;
86 pub(in crate::sys) use net::sockaddr_un;
87 pub(in crate::sys) use net::sockaddrv4_to_lib_c;
88 pub(in crate::sys) use net::sockaddrv6_to_lib_c;
89 pub use netlink::*;
90 use once_cell::sync::OnceCell;
91 pub use poll::EventContext;
92 pub use priority::*;
93 pub use sched::*;
94 pub use shm::MemfdSeals;
95 pub use shm::SharedMemoryLinux;
96 pub use signal::*;
97 pub use signalfd::Error as SignalFdError;
98 pub use signalfd::*;
99 pub use terminal::*;
100 pub(crate) use write_zeroes::file_punch_hole;
101 pub(crate) use write_zeroes::file_write_zeroes_at;
102 
103 use crate::descriptor::FromRawDescriptor;
104 use crate::descriptor::SafeDescriptor;
105 pub use crate::errno::Error;
106 pub use crate::errno::Result;
107 pub use crate::errno::*;
108 use crate::number_of_logical_cores;
109 use crate::round_up_to_page_size;
110 pub use crate::sys::unix::descriptor::*;
111 use crate::syscall;
112 use crate::AsRawDescriptor;
113 use crate::Pid;
114 
115 /// Re-export libc types that are part of the API.
116 pub type Uid = libc::uid_t;
117 pub type Gid = libc::gid_t;
118 pub type Mode = libc::mode_t;
119 
120 /// Safe wrapper for PR_SET_NAME(2const)
121 #[inline(always)]
set_thread_name(name: &str) -> Result<()>122 pub fn set_thread_name(name: &str) -> Result<()> {
123     let name = CString::new(name).or(Err(Error::new(EINVAL)))?;
124     // SAFETY: prctl copies name and doesn't expect it to outlive this function.
125     let ret = unsafe { prctl(PR_SET_NAME, name.as_c_str()) };
126     if ret == 0 {
127         Ok(())
128     } else {
129         errno_result()
130     }
131 }
132 
133 /// This bypasses `libc`'s caching `getpid(2)` wrapper which can be invalid if a raw clone was used
134 /// elsewhere.
135 #[inline(always)]
getpid() -> Pid136 pub fn getpid() -> Pid {
137     // SAFETY:
138     // Safe because this syscall can never fail and we give it a valid syscall number.
139     unsafe { syscall(SYS_getpid as c_long) as Pid }
140 }
141 
142 /// Safe wrapper for the geppid Linux systemcall.
143 #[inline(always)]
getppid() -> Pid144 pub fn getppid() -> Pid {
145     // SAFETY:
146     // Safe because this syscall can never fail and we give it a valid syscall number.
147     unsafe { syscall(SYS_getppid as c_long) as Pid }
148 }
149 
150 /// Safe wrapper for the gettid Linux systemcall.
gettid() -> Pid151 pub fn gettid() -> Pid {
152     // SAFETY:
153     // Calling the gettid() sycall is always safe.
154     unsafe { syscall(SYS_gettid as c_long) as Pid }
155 }
156 
157 /// Safe wrapper for `geteuid(2)`.
158 #[inline(always)]
geteuid() -> Uid159 pub fn geteuid() -> Uid {
160     // SAFETY:
161     // trivially safe
162     unsafe { libc::geteuid() }
163 }
164 
165 /// Safe wrapper for `getegid(2)`.
166 #[inline(always)]
getegid() -> Gid167 pub fn getegid() -> Gid {
168     // SAFETY:
169     // trivially safe
170     unsafe { libc::getegid() }
171 }
172 
/// The operation to perform with `flock`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum FlockOperation {
    /// Take a shared lock (`LOCK_SH`).
    LockShared,
    /// Take an exclusive lock (`LOCK_EX`).
    LockExclusive,
    /// Release the lock (`LOCK_UN`).
    Unlock,
}
179 
180 /// Safe wrapper for flock(2) with the operation `op` and optionally `nonblocking`. The lock will be
181 /// dropped automatically when `file` is dropped.
182 #[inline(always)]
flock<F: AsRawDescriptor>(file: &F, op: FlockOperation, nonblocking: bool) -> Result<()>183 pub fn flock<F: AsRawDescriptor>(file: &F, op: FlockOperation, nonblocking: bool) -> Result<()> {
184     let mut operation = match op {
185         FlockOperation::LockShared => libc::LOCK_SH,
186         FlockOperation::LockExclusive => libc::LOCK_EX,
187         FlockOperation::Unlock => libc::LOCK_UN,
188     };
189 
190     if nonblocking {
191         operation |= libc::LOCK_NB;
192     }
193 
194     // SAFETY:
195     // Safe since we pass in a valid fd and flock operation, and check the return value.
196     syscall!(unsafe { libc::flock(file.as_raw_descriptor(), operation) }).map(|_| ())
197 }
198 
/// The operation to perform with `fallocate`.
///
/// Every variant is converted to a flag set that includes `FALLOC_FL_KEEP_SIZE`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum FallocateMode {
    /// Converted to `FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE`.
    PunchHole,
    /// Converted to `FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE`.
    ZeroRange,
    /// Converted to `FALLOC_FL_KEEP_SIZE` alone.
    Allocate,
}
205 
206 impl From<FallocateMode> for i32 {
from(value: FallocateMode) -> Self207     fn from(value: FallocateMode) -> Self {
208         match value {
209             FallocateMode::Allocate => libc::FALLOC_FL_KEEP_SIZE,
210             FallocateMode::PunchHole => libc::FALLOC_FL_PUNCH_HOLE | libc::FALLOC_FL_KEEP_SIZE,
211             FallocateMode::ZeroRange => libc::FALLOC_FL_ZERO_RANGE | libc::FALLOC_FL_KEEP_SIZE,
212         }
213     }
214 }
215 
216 impl From<FallocateMode> for u32 {
from(value: FallocateMode) -> Self217     fn from(value: FallocateMode) -> Self {
218         Into::<i32>::into(value) as u32
219     }
220 }
221 
222 /// Safe wrapper for `fallocate()`.
fallocate<F: AsRawDescriptor>( file: &F, mode: FallocateMode, offset: u64, len: u64, ) -> Result<()>223 pub fn fallocate<F: AsRawDescriptor>(
224     file: &F,
225     mode: FallocateMode,
226     offset: u64,
227     len: u64,
228 ) -> Result<()> {
229     let offset = if offset > libc::off64_t::MAX as u64 {
230         return Err(Error::new(libc::EINVAL));
231     } else {
232         offset as libc::off64_t
233     };
234 
235     let len = if len > libc::off64_t::MAX as u64 {
236         return Err(Error::new(libc::EINVAL));
237     } else {
238         len as libc::off64_t
239     };
240 
241     // SAFETY:
242     // Safe since we pass in a valid fd and fallocate mode, validate offset and len,
243     // and check the return value.
244     syscall!(unsafe { libc::fallocate64(file.as_raw_descriptor(), mode.into(), offset, len) })
245         .map(|_| ())
246 }
247 
248 /// Safe wrapper for `fstat()`.
fstat<F: AsRawDescriptor>(f: &F) -> Result<libc::stat64>249 pub fn fstat<F: AsRawDescriptor>(f: &F) -> Result<libc::stat64> {
250     let mut st = MaybeUninit::<libc::stat64>::zeroed();
251 
252     // SAFETY:
253     // Safe because the kernel will only write data in `st` and we check the return
254     // value.
255     syscall!(unsafe { libc::fstat64(f.as_raw_descriptor(), st.as_mut_ptr()) })?;
256 
257     // SAFETY:
258     // Safe because the kernel guarantees that the struct is now fully initialized.
259     Ok(unsafe { st.assume_init() })
260 }
261 
262 /// Checks whether a file is a block device fie or not.
is_block_file<F: AsRawDescriptor>(file: &F) -> Result<bool>263 pub fn is_block_file<F: AsRawDescriptor>(file: &F) -> Result<bool> {
264     let stat = fstat(file)?;
265     Ok((stat.st_mode & libc::S_IFMT) == libc::S_IFBLK)
266 }
267 
268 const BLOCK_IO_TYPE: u32 = 0x12;
269 ioctl_io_nr!(BLKDISCARD, BLOCK_IO_TYPE, 119);
270 
271 /// Discards the given range of a block file.
discard_block<F: AsRawDescriptor>(file: &F, offset: u64, len: u64) -> Result<()>272 pub fn discard_block<F: AsRawDescriptor>(file: &F, offset: u64, len: u64) -> Result<()> {
273     let range: [u64; 2] = [offset, len];
274     // SAFETY:
275     // Safe because
276     // - we check the return value.
277     // - ioctl(BLKDISCARD) does not hold the descriptor after the call.
278     // - ioctl(BLKDISCARD) does not break the file descriptor.
279     // - ioctl(BLKDISCARD) does not modify the given range.
280     syscall!(unsafe { libc::ioctl(file.as_raw_descriptor(), BLKDISCARD, &range) }).map(|_| ())
281 }
282 
283 /// A trait used to abstract types that provide a process id that can be operated on.
284 pub trait AsRawPid {
as_raw_pid(&self) -> Pid285     fn as_raw_pid(&self) -> Pid;
286 }
287 
288 impl AsRawPid for Pid {
as_raw_pid(&self) -> Pid289     fn as_raw_pid(&self) -> Pid {
290         *self
291     }
292 }
293 
294 impl AsRawPid for std::process::Child {
as_raw_pid(&self) -> Pid295     fn as_raw_pid(&self) -> Pid {
296         self.id() as Pid
297     }
298 }
299 
300 /// A safe wrapper around waitpid.
301 ///
302 /// On success if a process was reaped, it will be returned as the first value.
303 /// The second returned value is the ExitStatus from the libc::waitpid() call.
304 ///
305 /// Note: this can block if libc::WNOHANG is not set and EINTR is not handled internally.
wait_for_pid<A: AsRawPid>(pid: A, options: c_int) -> Result<(Option<Pid>, ExitStatus)>306 pub fn wait_for_pid<A: AsRawPid>(pid: A, options: c_int) -> Result<(Option<Pid>, ExitStatus)> {
307     let pid = pid.as_raw_pid();
308     let mut status: c_int = 1;
309     // SAFETY:
310     // Safe because status is owned and the error is checked.
311     let ret = unsafe { libc::waitpid(pid, &mut status, options) };
312     if ret < 0 {
313         return errno_result();
314     }
315     Ok((
316         if ret == 0 { None } else { Some(ret) },
317         ExitStatus::from_raw(status),
318     ))
319 }
320 
321 /// Reaps a child process that has terminated.
322 ///
323 /// Returns `Ok(pid)` where `pid` is the process that was reaped or `Ok(0)` if none of the children
324 /// have terminated. An `Error` is with `errno == ECHILD` if there are no children left to reap.
325 ///
326 /// # Examples
327 ///
328 /// Reaps all child processes until there are no terminated children to reap.
329 ///
330 /// ```
331 /// fn reap_children() {
332 ///     loop {
333 ///         match base::linux::reap_child() {
334 ///             Ok(0) => println!("no children ready to reap"),
335 ///             Ok(pid) => {
336 ///                 println!("reaped {}", pid);
337 ///                 continue
338 ///             },
339 ///             Err(e) if e.errno() == libc::ECHILD => println!("no children left"),
340 ///             Err(e) => println!("error reaping children: {}", e),
341 ///         }
342 ///         break
343 ///     }
344 /// }
345 /// ```
reap_child() -> Result<Pid>346 pub fn reap_child() -> Result<Pid> {
347     // SAFETY:
348     // Safe because we pass in no memory, prevent blocking with WNOHANG, and check for error.
349     let ret = unsafe { waitpid(-1, ptr::null_mut(), WNOHANG) };
350     if ret == -1 {
351         errno_result()
352     } else {
353         Ok(ret)
354     }
355 }
356 
357 /// Kill all processes in the current process group.
358 ///
359 /// On success, this kills all processes in the current process group, including the current
360 /// process, meaning this will not return. This is equivalent to a call to `kill(0, SIGKILL)`.
kill_process_group() -> Result<()>361 pub fn kill_process_group() -> Result<()> {
362     // SAFETY: Safe because pid is 'self group' and return value doesn't matter.
363     unsafe { kill(0, SIGKILL) }?;
364     // Kill succeeded, so this process never reaches here.
365     unreachable!();
366 }
367 
368 /// Spawns a pipe pair where the first pipe is the read end and the second pipe is the write end.
369 ///
370 /// The `O_CLOEXEC` flag will be set during pipe creation.
pipe() -> Result<(File, File)>371 pub fn pipe() -> Result<(File, File)> {
372     let mut pipe_fds = [-1; 2];
373     // SAFETY:
374     // Safe because pipe2 will only write 2 element array of i32 to the given pointer, and we check
375     // for error.
376     let ret = unsafe { pipe2(&mut pipe_fds[0], O_CLOEXEC) };
377     if ret == -1 {
378         errno_result()
379     } else {
380         // SAFETY:
381         // Safe because both fds must be valid for pipe2 to have returned sucessfully and we have
382         // exclusive ownership of them.
383         Ok(unsafe {
384             (
385                 File::from_raw_fd(pipe_fds[0]),
386                 File::from_raw_fd(pipe_fds[1]),
387             )
388         })
389     }
390 }
391 
392 /// Sets the pipe signified with fd to `size`.
393 ///
394 /// Returns the new size of the pipe or an error if the OS fails to set the pipe size.
set_pipe_size(fd: RawFd, size: usize) -> Result<usize>395 pub fn set_pipe_size(fd: RawFd, size: usize) -> Result<usize> {
396     // SAFETY:
397     // Safe because fcntl with the `F_SETPIPE_SZ` arg doesn't touch memory.
398     syscall!(unsafe { fcntl(fd, libc::F_SETPIPE_SZ, size as c_int) }).map(|ret| ret as usize)
399 }
400 
401 /// Test-only function used to create a pipe that is full. The pipe is created, has its size set to
402 /// the minimum and then has that much data written to it. Use `new_pipe_full` to test handling of
403 /// blocking `write` calls in unit tests.
new_pipe_full() -> Result<(File, File)>404 pub fn new_pipe_full() -> Result<(File, File)> {
405     use std::io::Write;
406 
407     let (rx, mut tx) = pipe()?;
408     // The smallest allowed size of a pipe is the system page size on linux.
409     let page_size = set_pipe_size(tx.as_raw_descriptor(), round_up_to_page_size(1))?;
410 
411     // Fill the pipe with page_size zeros so the next write call will block.
412     let buf = vec![0u8; page_size];
413     tx.write_all(&buf)?;
414 
415     Ok((rx, tx))
416 }
417 
418 /// Used to attempt to clean up a named pipe after it is no longer used.
419 pub struct UnlinkUnixDatagram(pub UnixDatagram);
420 impl AsRef<UnixDatagram> for UnlinkUnixDatagram {
as_ref(&self) -> &UnixDatagram421     fn as_ref(&self) -> &UnixDatagram {
422         &self.0
423     }
424 }
425 impl Drop for UnlinkUnixDatagram {
drop(&mut self)426     fn drop(&mut self) {
427         if let Ok(addr) = self.0.local_addr() {
428             if let Some(path) = addr.as_pathname() {
429                 if let Err(e) = remove_file(path) {
430                     warn!("failed to remove control socket file: {}", e);
431                 }
432             }
433         }
434     }
435 }
436 
437 /// Used to attempt to clean up a named pipe after it is no longer used.
438 pub struct UnlinkUnixListener(pub UnixListener);
439 
440 impl AsRef<UnixListener> for UnlinkUnixListener {
as_ref(&self) -> &UnixListener441     fn as_ref(&self) -> &UnixListener {
442         &self.0
443     }
444 }
445 
446 impl Deref for UnlinkUnixListener {
447     type Target = UnixListener;
448 
deref(&self) -> &UnixListener449     fn deref(&self) -> &UnixListener {
450         &self.0
451     }
452 }
453 
454 impl Drop for UnlinkUnixListener {
drop(&mut self)455     fn drop(&mut self) {
456         if let Ok(addr) = self.0.local_addr() {
457             if let Some(path) = addr.as_pathname() {
458                 if let Err(e) = remove_file(path) {
459                     warn!("failed to remove control socket file: {}", e);
460                 }
461             }
462         }
463     }
464 }
465 
466 /// Verifies that |raw_descriptor| is actually owned by this process and duplicates it
467 /// to ensure that we have a unique handle to it.
validate_raw_descriptor(raw_descriptor: RawDescriptor) -> Result<RawDescriptor>468 pub fn validate_raw_descriptor(raw_descriptor: RawDescriptor) -> Result<RawDescriptor> {
469     validate_raw_fd(&raw_descriptor)
470 }
471 
472 /// Verifies that |raw_fd| is actually owned by this process and duplicates it to ensure that
473 /// we have a unique handle to it.
validate_raw_fd(raw_fd: &RawFd) -> Result<RawFd>474 pub fn validate_raw_fd(raw_fd: &RawFd) -> Result<RawFd> {
475     // Checking that close-on-exec isn't set helps filter out FDs that were opened by
476     // crosvm as all crosvm FDs are close on exec.
477     // SAFETY:
478     // Safe because this doesn't modify any memory and we check the return value.
479     let flags = unsafe { libc::fcntl(*raw_fd, libc::F_GETFD) };
480     if flags < 0 || (flags & libc::FD_CLOEXEC) != 0 {
481         return Err(Error::new(libc::EBADF));
482     }
483 
484     // SAFETY:
485     // Duplicate the fd to ensure that we don't accidentally close an fd previously
486     // opened by another subsystem.  Safe because this doesn't modify any memory and
487     // we check the return value.
488     let dup_fd = unsafe { libc::fcntl(*raw_fd, libc::F_DUPFD_CLOEXEC, 0) };
489     if dup_fd < 0 {
490         return Err(Error::last());
491     }
492     Ok(dup_fd as RawFd)
493 }
494 
495 /// Utility function that returns true if the given FD is readable without blocking.
496 ///
497 /// On an error, such as an invalid or incompatible FD, this will return false, which can not be
498 /// distinguished from a non-ready to read FD.
poll_in<F: AsRawDescriptor>(fd: &F) -> bool499 pub fn poll_in<F: AsRawDescriptor>(fd: &F) -> bool {
500     let mut fds = libc::pollfd {
501         fd: fd.as_raw_descriptor(),
502         events: libc::POLLIN,
503         revents: 0,
504     };
505     // SAFETY:
506     // Safe because we give a valid pointer to a list (of 1) FD and check the return value.
507     let ret = unsafe { libc::poll(&mut fds, 1, 0) };
508     // An error probably indicates an invalid FD, or an FD that can't be polled. Returning false in
509     // that case is probably correct as such an FD is unlikely to be readable, although there are
510     // probably corner cases in which that is wrong.
511     if ret == -1 {
512         return false;
513     }
514     fds.revents & libc::POLLIN != 0
515 }
516 
517 /// Return the maximum Duration that can be used with libc::timespec.
max_timeout() -> Duration518 pub fn max_timeout() -> Duration {
519     Duration::new(libc::time_t::MAX as u64, 999999999)
520 }
521 
522 /// If the given path is of the form /proc/self/fd/N for some N, returns `Ok(Some(N))`. Otherwise
523 /// returns `Ok(None)`.
safe_descriptor_from_path<P: AsRef<Path>>(path: P) -> Result<Option<SafeDescriptor>>524 pub fn safe_descriptor_from_path<P: AsRef<Path>>(path: P) -> Result<Option<SafeDescriptor>> {
525     let path = path.as_ref();
526     if path.parent() == Some(Path::new("/proc/self/fd")) {
527         let raw_descriptor = path
528             .file_name()
529             .and_then(|fd_osstr| fd_osstr.to_str())
530             .and_then(|fd_str| fd_str.parse::<RawFd>().ok())
531             .ok_or_else(|| Error::new(EINVAL))?;
532         let validated_fd = validate_raw_fd(&raw_descriptor)?;
533         Ok(Some(
534             // SAFETY:
535             // Safe because nothing else has access to validated_fd after this call.
536             unsafe { SafeDescriptor::from_raw_descriptor(validated_fd) },
537         ))
538     } else {
539         Ok(None)
540     }
541 }
542 
543 /// Check FD is not opened by crosvm and returns a FD that is freshly DUPFD_CLOEXEC's.
544 /// A SafeDescriptor is created from the duplicated fd. It does not take ownership of
545 /// fd passed by argument.
safe_descriptor_from_cmdline_fd(fd: &RawFd) -> Result<SafeDescriptor>546 pub fn safe_descriptor_from_cmdline_fd(fd: &RawFd) -> Result<SafeDescriptor> {
547     let validated_fd = validate_raw_fd(fd)?;
548     Ok(
549         // SAFETY:
550         // Safe because nothing else has access to validated_fd after this call.
551         unsafe { SafeDescriptor::from_raw_descriptor(validated_fd) },
552     )
553 }
554 
555 /// Open the file with the given path, or if it is of the form `/proc/self/fd/N` then just use the
556 /// file descriptor.
557 ///
558 /// Note that this will not work properly if the same `/proc/self/fd/N` path is used twice in
559 /// different places, as the metadata (including the offset) will be shared between both file
560 /// descriptors.
open_file_or_duplicate<P: AsRef<Path>>(path: P, options: &OpenOptions) -> Result<File>561 pub fn open_file_or_duplicate<P: AsRef<Path>>(path: P, options: &OpenOptions) -> Result<File> {
562     let path = path.as_ref();
563     // Special case '/proc/self/fd/*' paths. The FD is already open, just use it.
564     Ok(if let Some(fd) = safe_descriptor_from_path(path)? {
565         fd.into()
566     } else {
567         options.open(path)?
568     })
569 }
570 
571 /// Get the soft and hard limits of max number of open files allowed by the environment.
max_open_files() -> Result<libc::rlimit64>572 pub fn max_open_files() -> Result<libc::rlimit64> {
573     let mut buf = mem::MaybeUninit::<libc::rlimit64>::zeroed();
574 
575     // SAFETY:
576     // Safe because this will only modify `buf` and we check the return value.
577     let res = unsafe { libc::prlimit64(0, libc::RLIMIT_NOFILE, ptr::null(), buf.as_mut_ptr()) };
578     if res == 0 {
579         // SAFETY:
580         // Safe because the kernel guarantees that the struct is fully initialized.
581         let limit = unsafe { buf.assume_init() };
582         Ok(limit)
583     } else {
584         errno_result()
585     }
586 }
587 
588 /// Executes the given callback with extended soft limit of max number of open files. After the
589 /// callback executed, restore the limit.
call_with_extended_max_files<T, E>( callback: impl FnOnce() -> std::result::Result<T, E>, ) -> Result<std::result::Result<T, E>>590 pub fn call_with_extended_max_files<T, E>(
591     callback: impl FnOnce() -> std::result::Result<T, E>,
592 ) -> Result<std::result::Result<T, E>> {
593     let cur_limit = max_open_files()?;
594     let new_limit = libc::rlimit64 {
595         rlim_cur: cur_limit.rlim_max,
596         ..cur_limit
597     };
598     let needs_extension = cur_limit.rlim_cur < new_limit.rlim_cur;
599     if needs_extension {
600         set_max_open_files(new_limit)?;
601     }
602 
603     let r = callback();
604 
605     // Restore the soft limit.
606     if needs_extension {
607         set_max_open_files(cur_limit)?;
608     }
609 
610     Ok(r)
611 }
612 
613 /// Set the soft and hard limits of max number of open files to the given value.
set_max_open_files(limit: libc::rlimit64) -> Result<()>614 fn set_max_open_files(limit: libc::rlimit64) -> Result<()> {
615     // SAFETY: RLIMIT_NOFILE is known only to read a buffer of size rlimit64, and we have always
616     // rlimit64 allocated.
617     let res = unsafe { libc::setrlimit64(libc::RLIMIT_NOFILE, &limit) };
618     if res == 0 {
619         Ok(())
620     } else {
621         errno_result()
622     }
623 }
624 
625 /// Moves the requested PID/TID to a particular cgroup
move_to_cgroup(cgroup_path: PathBuf, id_to_write: Pid, cgroup_file: &str) -> Result<()>626 pub fn move_to_cgroup(cgroup_path: PathBuf, id_to_write: Pid, cgroup_file: &str) -> Result<()> {
627     use std::io::Write;
628 
629     let gpu_cgroup_file = cgroup_path.join(cgroup_file);
630     let mut f = File::create(gpu_cgroup_file)?;
631     f.write_all(id_to_write.to_string().as_bytes())?;
632     Ok(())
633 }
634 
move_task_to_cgroup(cgroup_path: PathBuf, thread_id: Pid) -> Result<()>635 pub fn move_task_to_cgroup(cgroup_path: PathBuf, thread_id: Pid) -> Result<()> {
636     move_to_cgroup(cgroup_path, thread_id, "tasks")
637 }
638 
move_proc_to_cgroup(cgroup_path: PathBuf, process_id: Pid) -> Result<()>639 pub fn move_proc_to_cgroup(cgroup_path: PathBuf, process_id: Pid) -> Result<()> {
640     move_to_cgroup(cgroup_path, process_id, "cgroup.procs")
641 }
642 
643 /// Queries the property of a specified CPU sysfs node.
parse_sysfs_cpu_info_vec(cpu_id: usize, property: &str) -> Result<Vec<u32>>644 fn parse_sysfs_cpu_info_vec(cpu_id: usize, property: &str) -> Result<Vec<u32>> {
645     let path = format!("/sys/devices/system/cpu/cpu{cpu_id}/{property}");
646     let res: Result<Vec<_>> = std::fs::read_to_string(path)?
647         .split_whitespace()
648         .map(|x| x.parse().map_err(|_| Error::new(libc::EINVAL)))
649         .collect();
650     res
651 }
652 
653 /// Returns a list of supported frequencies in kHz for a given logical core.
logical_core_frequencies_khz(cpu_id: usize) -> Result<Vec<u32>>654 pub fn logical_core_frequencies_khz(cpu_id: usize) -> Result<Vec<u32>> {
655     parse_sysfs_cpu_info_vec(cpu_id, "cpufreq/scaling_available_frequencies")
656 }
657 
parse_sysfs_cpu_info(cpu_id: usize, property: &str) -> Result<u32>658 fn parse_sysfs_cpu_info(cpu_id: usize, property: &str) -> Result<u32> {
659     let path = format!("/sys/devices/system/cpu/cpu{cpu_id}/{property}");
660     std::fs::read_to_string(path)?
661         .trim()
662         .parse()
663         .map_err(|_| Error::new(libc::EINVAL))
664 }
665 
666 /// Returns the capacity (measure of performance) of a given logical core.
logical_core_capacity(cpu_id: usize) -> Result<u32>667 pub fn logical_core_capacity(cpu_id: usize) -> Result<u32> {
668     static CPU_MAX_FREQS: OnceCell<Vec<u32>> = OnceCell::new();
669 
670     let cpu_capacity = parse_sysfs_cpu_info(cpu_id, "cpu_capacity")?;
671 
672     // Collect and cache the maximum frequencies of all cores. We need to know
673     // the largest maximum frequency between all cores to reverse normalization,
674     // so collect all the values once on the first call to this function.
675     let cpu_max_freqs = CPU_MAX_FREQS.get_or_try_init(|| {
676         (0..number_of_logical_cores()?)
677             .map(logical_core_max_freq_khz)
678             .collect()
679     });
680 
681     if let Ok(cpu_max_freqs) = cpu_max_freqs {
682         let largest_max_freq = *cpu_max_freqs.iter().max().ok_or(Error::new(EINVAL))?;
683         let cpu_max_freq = *cpu_max_freqs.get(cpu_id).ok_or(Error::new(EINVAL))?;
684         let normalized_cpu_capacity = (u64::from(cpu_capacity) * u64::from(largest_max_freq))
685             .checked_div(u64::from(cpu_max_freq))
686             .ok_or(Error::new(EINVAL))?;
687         normalized_cpu_capacity
688             .try_into()
689             .map_err(|_| Error::new(EINVAL))
690     } else {
691         // cpu-freq is not enabled. Fall back to using the normalized capacity.
692         Ok(cpu_capacity)
693     }
694 }
695 
696 /// Returns the cluster ID of a given logical core.
logical_core_cluster_id(cpu_id: usize) -> Result<u32>697 pub fn logical_core_cluster_id(cpu_id: usize) -> Result<u32> {
698     parse_sysfs_cpu_info(cpu_id, "topology/physical_package_id")
699 }
700 
701 /// Returns the maximum frequency (in kHz) of a given logical core.
logical_core_max_freq_khz(cpu_id: usize) -> Result<u32>702 pub fn logical_core_max_freq_khz(cpu_id: usize) -> Result<u32> {
703     parse_sysfs_cpu_info(cpu_id, "cpufreq/cpuinfo_max_freq")
704 }
705 
/// Scheduling attributes passed to the `sched_setattr` syscall.
///
/// `#[repr(C)]` fixes the field order and layout because a pointer to this struct is handed
/// directly to the kernel.
#[repr(C)]
#[derive(Clone, Copy, Debug)]
pub struct sched_attr {
    /// Size of this structure in bytes; filled in by `default()`.
    pub size: u32,

    pub sched_policy: u32,
    pub sched_flags: u64,
    pub sched_nice: i32,

    pub sched_priority: u32,

    pub sched_runtime: u64,
    pub sched_deadline: u64,
    pub sched_period: u64,

    pub sched_util_min: u32,
    pub sched_util_max: u32,
}

impl sched_attr {
    /// Returns a `sched_attr` with `size` set to the size of the struct and every other field
    /// zero.
    ///
    /// Kept as an inherent function for existing callers; it is identical to the `Default`
    /// trait implementation.
    pub fn default() -> Self {
        <Self as Default>::default()
    }
}

impl Default for sched_attr {
    /// Returns a `sched_attr` with `size` set to the size of the struct and every other field
    /// zero.
    fn default() -> Self {
        Self {
            size: std::mem::size_of::<sched_attr>() as u32,
            sched_policy: 0,
            sched_flags: 0,
            sched_nice: 0,
            sched_priority: 0,
            sched_runtime: 0,
            sched_deadline: 0,
            sched_period: 0,
            sched_util_min: 0,
            sched_util_max: 0,
        }
    }
}
740 
sched_setattr(pid: Pid, attr: &mut sched_attr, flags: u32) -> Result<()>741 pub fn sched_setattr(pid: Pid, attr: &mut sched_attr, flags: u32) -> Result<()> {
742     // SAFETY: Safe becuase all the args are valid and the return valud is checked.
743     let ret = unsafe {
744         libc::syscall(
745             libc::SYS_sched_setattr,
746             pid as usize,
747             attr as *mut sched_attr as usize,
748             flags as usize,
749         )
750     };
751 
752     if ret < 0 {
753         return Err(Error::last());
754     }
755     Ok(())
756 }
757 
#[cfg(test)]
mod tests {
    use std::io::Write;
    use std::os::fd::AsRawFd;

    use super::*;
    use crate::unix::add_fd_flags;

    // Checks that a pipe returned by `new_pipe_full` rejects further writes once non-blocking.
    #[test]
    fn pipe_size_and_fill() {
        let (_reader, mut writer) = new_pipe_full().expect("Failed to pipe");

        // To check that setting the size worked, set the descriptor to non blocking and check
        // that write returns an error.
        let writer_fd = writer.as_raw_fd();
        add_fd_flags(writer_fd, libc::O_NONBLOCK).expect("Failed to set tx non blocking");

        let extra_bytes = [0u8; 8];
        writer
            .write(&extra_bytes)
            .expect_err("Write after fill didn't fail");
    }
}
777