xref: /aosp_15_r20/external/crosvm/jail/src/helpers.rs (revision bb4ee6a4ae7042d18b07a98463b9c8b875e44b39)
1 // Copyright 2017 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #![deny(missing_docs)]
6 #![allow(dead_code)]
7 
8 use std::path::Path;
9 use std::str;
10 
11 use anyhow::bail;
12 use anyhow::Context;
13 use anyhow::Result;
14 #[cfg(feature = "seccomp_trace")]
15 use base::debug;
16 use base::getegid;
17 use base::geteuid;
18 #[cfg(feature = "seccomp_trace")]
19 use base::warn;
20 use libc::c_ulong;
21 use minijail::Minijail;
22 #[cfg(not(feature = "seccomp_trace"))]
23 use once_cell::sync::Lazy;
24 #[cfg(feature = "seccomp_trace")]
25 use static_assertions::assert_eq_size;
26 #[cfg(feature = "seccomp_trace")]
27 use zerocopy::AsBytes;
28 
29 use crate::config::JailConfig;
30 
31 // ANDROID: b/246968493
32 #[cfg(not(feature = "seccomp_trace"))]
33 static EMBEDDED_BPFS: Lazy<std::collections::HashMap<&str, Vec<u8>>> =
34     Lazy::new(|| std::collections::HashMap::<&str, Vec<u8>>::new());
35 
/// Most devices don't need to open many fds.
///
/// This is the open-file limit that [simple_jail] passes to [create_sandbox_minijail].
pub const MAX_OPEN_FILES_DEFAULT: u64 = 1024;
/// The max open files for gpu processes.
///
/// This is the open-file limit that [create_gpu_minijail] passes to [create_sandbox_minijail].
const MAX_OPEN_FILES_FOR_GPU: u64 = 32768;
/// The max open files for jail warden, matching FD_RAW_FAILURE.
///
/// Not referenced within this file.
pub const MAX_OPEN_FILES_FOR_JAIL_WARDEN: u64 = 65536;
42 
/// The user in the jail to run as.
///
/// The value is plain data (at most a pair of numeric ids), so it can be freely copied, compared
/// and printed for diagnostics.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RunAsUser {
    /// Do not specify the user
    Unspecified,
    /// Runs as the same user in the jail as the current user.
    CurrentUser,
    /// Runs as the root user in the jail.
    Root,
    /// Runs as the specified uid and gid, given in that order.
    /// This requires `SandboxConfig::ugid_map` to be set.
    Specified(u32, u32),
}
55 
/// Config for the sandbox to be created by [Minijail].
///
/// Build it with [SandboxConfig::new], then adjust the public fields as needed before handing it
/// to [create_sandbox_minijail] or [create_gpu_minijail]. The seccomp related fields are private
/// and are only filled in by the constructor.
pub struct SandboxConfig<'a> {
    /// Whether or not to drop all capabilities in the sandbox.
    ///
    /// Set to `true` by [SandboxConfig::new].
    pub limit_caps: bool,
    /// Copied from `JailConfig::seccomp_log_failures`. When set, the `.policy` file is parsed
    /// even if a pre-compiled `.bpf` file exists, and seccomp filter failure logging is enabled.
    log_failures: bool,
    /// Directory that holds the seccomp policy files. When `None` (and the `seccomp_trace`
    /// feature is disabled), the embedded BPF programs are used instead.
    seccomp_policy_dir: Option<&'a Path>,
    /// Name of the seccomp policy. It is joined to `seccomp_policy_dir` with its extension
    /// replaced by `bpf` or `policy`, or used as the lookup key for the embedded BPF programs.
    seccomp_policy_name: &'a str,
    /// The pair of `uid_map` and `gid_map`.
    pub ugid_map: Option<(&'a str, &'a str)>,
    /// The remount mode instead of default MS_PRIVATE.
    pub remount_mode: Option<c_ulong>,
    /// Whether to use empty net namespace. Enabled by default.
    pub namespace_net: bool,
    /// Whether or not to configure the jail to support bind-mounts.
    ///
    /// Note that most device processes deny `open(2)` and `openat(2)` by seccomp policy and just
    /// returns `ENOENT`. Passing opened file descriptors is recommended over opening files in the
    /// sandbox.
    ///
    /// Set to `false` by [SandboxConfig::new].
    pub bind_mounts: bool,
    /// Specify the user in the jail to run as.
    ///
    /// Set to [RunAsUser::Unspecified] by [SandboxConfig::new].
    pub run_as: RunAsUser,
}
78 
79 impl<'a> SandboxConfig<'a> {
80     /// Creates [SandboxConfig].
new(jail_config: &'a JailConfig, policy: &'a str) -> Self81     pub fn new(jail_config: &'a JailConfig, policy: &'a str) -> Self {
82         Self {
83             limit_caps: true,
84             log_failures: jail_config.seccomp_log_failures,
85             seccomp_policy_dir: jail_config.seccomp_policy_dir.as_ref().map(Path::new),
86             seccomp_policy_name: policy,
87             ugid_map: None,
88             remount_mode: None,
89             namespace_net: true,
90             bind_mounts: false,
91             run_as: RunAsUser::Unspecified,
92         }
93     }
94 }
95 
/// Wrapper that cleans up a [Minijail] when it is dropped
///
/// The `Drop` implementation calls `kill()` on the wrapped jail and ignores the outcome. The
/// inner jail is a public field, so it can still be used directly through `.0`.
pub struct ScopedMinijail(pub Minijail);
98 
99 impl Drop for ScopedMinijail {
drop(&mut self)100     fn drop(&mut self) {
101         let _ = self.0.kill();
102     }
103 }
104 
105 /// Creates a [Minijail] instance which just changes the root using pivot_root(2) path and
106 /// `max_open_files` using `RLIMIT_NOFILE`.
107 ///
108 /// If `root` path is "/", the minijail don't change the root.
109 ///
110 /// # Arguments
111 ///
112 /// * `root` - The root path to be changed to by minijail.
113 /// * `max_open_files` - The maximum number of file descriptors to allow a jailed process to open.
114 #[allow(clippy::unnecessary_cast)]
create_base_minijail(root: &Path, max_open_files: u64) -> Result<Minijail>115 pub fn create_base_minijail(root: &Path, max_open_files: u64) -> Result<Minijail> {
116     // Validate new root directory. Path::is_dir() also checks the existence.
117     if !root.is_dir() {
118         bail!("{:?} is not a directory, cannot create jail", root);
119     }
120     // chroot accepts absolute path only.
121     if !root.is_absolute() {
122         bail!("{:?} is not absolute path", root);
123     }
124 
125     let mut jail = Minijail::new().context("failed to jail device")?;
126 
127     // Only pivot_root if we are not re-using the current root directory.
128     if root != Path::new("/") {
129         // Run in a new mount namespace.
130         jail.namespace_vfs();
131         jail.enter_pivot_root(root)
132             .context("failed to pivot root device")?;
133     }
134 
135     jail.set_rlimit(libc::RLIMIT_NOFILE as i32, max_open_files, max_open_files)
136         .context("error setting max open files")?;
137 
138     Ok(jail)
139 }
140 
141 /// Creates a [Minijail] instance which just invokes a jail process and sets
142 /// `max_open_files` using `RLIMIT_NOFILE`. This is helpful with crosvm process
143 /// runs as a non-root user without SYS_ADMIN capabilities.
144 ///
145 /// Unlike `create_base_minijail`, this function doesn't call `pivot_root`
146 /// and `mount namespace`. So, it runs as a non-root user without
147 /// SYS_ADMIN capabilities.
148 ///
149 /// Note that since there is no file system isolation provided by this function,
150 /// caller of this function should enforce other security mechanisum such as selinux
151 /// on the host to protect directories.
152 ///
153 /// # Arguments
154 ///
155 /// * `root` - The root path to checked before the process is jailed
156 /// * `max_open_files` - The maximum number of file descriptors to allow a jailed process to open.
157 #[allow(clippy::unnecessary_cast)]
create_base_minijail_without_pivot_root( root: &Path, max_open_files: u64, ) -> Result<Minijail>158 pub fn create_base_minijail_without_pivot_root(
159     root: &Path,
160     max_open_files: u64,
161 ) -> Result<Minijail> {
162     // Validate new root directory. Path::is_dir() also checks the existence.
163     if !root.is_dir() {
164         bail!("{:?} is not a directory, cannot create jail", root);
165     }
166     if !root.is_absolute() {
167         bail!("{:?} is not absolute path", root);
168     }
169 
170     let mut jail = Minijail::new().context("failed to jail device")?;
171     jail.set_rlimit(libc::RLIMIT_NOFILE as i32, max_open_files, max_open_files)
172         .context("error setting max open files")?;
173 
174     Ok(jail)
175 }
176 
/// Creates a [Minijail] instance which creates a sandbox.
///
/// The jail starts from [create_base_minijail] and is then configured with pid, user, mount and
/// (optionally) network namespaces, the requested user and id maps, `no_new_privs`, and a seccomp
/// filter selected from `config`.
///
/// # Arguments
///
/// * `root` - The root path to be changed to by minijail.
/// * `max_open_files` - The maximum number of file descriptors to allow a jailed process to open.
/// * `config` - The [SandboxConfig] to control details of the sandbox.
///
/// # Errors
///
/// Returns an error if the base jail cannot be created, if setting an id map or the tmpfs mount
/// fails, or if the seccomp policy cannot be found or parsed.
///
/// # Panics
///
/// With the `seccomp_trace` feature enabled, panics if the built-in log-everything filter is
/// rejected by `parse_seccomp_bytes`.
pub fn create_sandbox_minijail(
    root: &Path,
    max_open_files: u64,
    config: &SandboxConfig,
) -> Result<Minijail> {
    let mut jail = create_base_minijail(root, max_open_files)?;

    jail.namespace_pids();
    jail.namespace_user();
    jail.namespace_user_disable_setgroups();
    if config.limit_caps {
        // Don't need any capabilities.
        jail.use_caps(0);
    }
    // Decide which user the jailed process runs as.
    match config.run_as {
        RunAsUser::Unspecified => {
            if config.bind_mounts && config.ugid_map.is_none() {
                // Minijail requires to set user/group map to mount extra directories.
                add_current_user_to_jail(&mut jail)?;
            }
        }
        RunAsUser::CurrentUser => {
            add_current_user_to_jail(&mut jail)?;
        }
        RunAsUser::Root => {
            // Add the current user as root in the jail.
            let crosvm_uid = geteuid();
            let crosvm_gid = getegid();
            jail.uidmap(&format!("0 {} 1", crosvm_uid))
                .context("error setting UID map")?;
            jail.gidmap(&format!("0 {} 1", crosvm_gid))
                .context("error setting GID map")?;
        }
        RunAsUser::Specified(uid, gid) => {
            // Ids of 0 are left alone; only non-zero ids are switched to.
            if uid != 0 {
                jail.change_uid(uid)
            }
            if gid != 0 {
                jail.change_gid(gid)
            }
        }
    }
    if config.bind_mounts {
        // Create a tmpfs in the device's root directory so that we can bind mount files.
        // The size=67108864 is size=64*1024*1024 or size=64MB.
        // TODO(b/267581374): Use appropriate size for tmpfs.
        jail.mount_with_data(
            Path::new("none"),
            Path::new("/"),
            "tmpfs",
            (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize,
            "size=67108864",
        )?;
    }
    // Caller supplied id maps are applied after the `run_as` handling above.
    if let Some((uid_map, gid_map)) = config.ugid_map {
        jail.uidmap(uid_map).context("error setting UID map")?;
        jail.gidmap(gid_map).context("error setting GID map")?;
    }
    // Run in a new mount namespace. `create_base_minijail` only requests one when `root` is not
    // "/", so it is requested unconditionally here.
    jail.namespace_vfs();

    if config.namespace_net {
        // Run in an empty network namespace.
        jail.namespace_net();
    }

    // Don't allow the device to gain new privileges.
    jail.no_new_privs();

    // Debug-only build: ignore the configured policy and install a one-instruction filter that
    // returns SECCOMP_RET_LOG for every syscall.
    #[cfg(feature = "seccomp_trace")]
    {
        #[repr(C)]
        #[derive(AsBytes)]
        struct sock_filter {
            /* Filter block */
            code: u16, /* Actual filter code */
            jt: u8,    /* Jump true */
            jf: u8,    /* Jump false */
            k: u32,    /* Generic multiuse field */
        }

        // BPF constant is defined in https://elixir.bootlin.com/linux/latest/source/include/uapi/linux/bpf_common.h
        // BPF parser/assembler is defined in https://elixir.bootlin.com/linux/v4.9/source/tools/net/bpf_exp.y
        const SECCOMP_RET_TRACE: u32 = 0x7ff00000;
        const SECCOMP_RET_LOG: u32 = 0x7ffc0000;
        const BPF_RET: u16 = 0x06;
        const BPF_K: u16 = 0x00;

        // return SECCOMP_RET_LOG for all syscalls
        const FILTER_RET_LOG_BLOCK: sock_filter = sock_filter {
            code: BPF_RET | BPF_K,
            jt: 0,
            jf: 0,
            k: SECCOMP_RET_LOG,
        };

        warn!("The running crosvm is compiled with seccomp_trace feature, and is striclty used for debugging purpose only. DO NOT USE IN PRODUCTION!!!");
        debug!(
            "seccomp_trace {{\"event\": \"minijail_create\", \"name\": \"{}\", \"jail_addr\": \"0x{:x}\"}}",
            config.seccomp_policy_name,
            read_jail_addr(&jail),
        );
        jail.parse_seccomp_bytes(FILTER_RET_LOG_BLOCK.as_bytes())
            .unwrap();
    }

    // Regular build: load the policy from `seccomp_policy_dir` when one is configured, otherwise
    // fall back to the embedded BPF program of the same name.
    #[cfg(not(feature = "seccomp_trace"))]
    if let Some(seccomp_policy_dir) = config.seccomp_policy_dir {
        let seccomp_policy_path = seccomp_policy_dir.join(config.seccomp_policy_name);
        // By default we'll prioritize using the pre-compiled .bpf over the .policy file (the .bpf
        // is expected to be compiled using "trap" as the failure behavior instead of the default
        // "kill" behavior) when a policy path is supplied in the command line arguments. Otherwise
        // the built-in pre-compiled policies will be used.
        // Refer to the code comment for the "seccomp-log-failures" command-line parameter for an
        // explanation about why the |log_failures| flag forces the use of .policy files (and the
        // build-time alternative to this run-time flag).
        let bpf_policy_file = seccomp_policy_path.with_extension("bpf");
        if bpf_policy_file.exists() && !config.log_failures {
            jail.parse_seccomp_program(&bpf_policy_file)
                .with_context(|| {
                    format!(
                        "failed to parse precompiled seccomp policy: {}",
                        bpf_policy_file.display()
                    )
                })?;
        } else {
            // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP, which will correctly
            // kill the entire device process if a worker thread commits a seccomp violation.
            jail.set_seccomp_filter_tsync();
            if config.log_failures {
                jail.log_seccomp_filter_failures();
            }
            // Despite the variable name, this is the path of the textual `.policy` file.
            let bpf_policy_file = seccomp_policy_path.with_extension("policy");
            jail.parse_seccomp_filters(&bpf_policy_file)
                .with_context(|| {
                    format!(
                        "failed to parse seccomp policy: {}",
                        bpf_policy_file.display()
                    )
                })?;
        }
    } else {
        set_embedded_bpf_program(&mut jail, config.seccomp_policy_name)?;
    }

    jail.use_seccomp_filter();
    // Don't do init setup.
    jail.run_as_init();
    // Set up requested remount mode instead of default MS_PRIVATE.
    if let Some(mode) = config.remount_mode {
        jail.set_remount_mode(mode);
    }

    Ok(jail)
}
339 
340 /// Creates a basic [Minijail] if `jail_config` is present.
341 ///
342 /// Returns `None` if `jail_config` is none.
simple_jail(jail_config: &Option<JailConfig>, policy: &str) -> Result<Option<Minijail>>343 pub fn simple_jail(jail_config: &Option<JailConfig>, policy: &str) -> Result<Option<Minijail>> {
344     if let Some(jail_config) = jail_config {
345         let config = SandboxConfig::new(jail_config, policy);
346         Ok(Some(create_sandbox_minijail(
347             &jail_config.pivot_root,
348             MAX_OPEN_FILES_DEFAULT,
349             &config,
350         )?))
351     } else {
352         Ok(None)
353     }
354 }
355 
356 /// Creates [Minijail] for gpu processes.
create_gpu_minijail( root: &Path, config: &SandboxConfig, render_node_only: bool, ) -> Result<Minijail>357 pub fn create_gpu_minijail(
358     root: &Path,
359     config: &SandboxConfig,
360     render_node_only: bool,
361 ) -> Result<Minijail> {
362     let mut jail = create_sandbox_minijail(root, MAX_OPEN_FILES_FOR_GPU, config)?;
363 
364     // Device nodes required for DRM.
365     let sys_dev_char_path = Path::new("/sys/dev/char");
366     jail.mount_bind(sys_dev_char_path, sys_dev_char_path, false)?;
367 
368     // Necessary for CGROUP control of the vGPU threads
369     // This is not necessary UNLESS one wants to make use
370     // of the gpu cgroup command line options.
371     let sys_cpuset_path = Path::new("/sys/fs/cgroup/cpuset");
372     if sys_cpuset_path.exists() {
373         jail.mount_bind(sys_cpuset_path, sys_cpuset_path, true)?;
374     }
375 
376     let sys_devices_path = Path::new("/sys/devices");
377     jail.mount_bind(sys_devices_path, sys_devices_path, false)?;
378 
379     jail_mount_bind_drm(&mut jail, render_node_only)?;
380 
381     // If the ARM specific devices exist on the host, bind mount them in.
382     let mali0_path = Path::new("/dev/mali0");
383     if mali0_path.exists() {
384         jail.mount_bind(mali0_path, mali0_path, true)?;
385     }
386 
387     let pvr_sync_path = Path::new("/dev/pvr_sync");
388     if pvr_sync_path.exists() {
389         jail.mount_bind(pvr_sync_path, pvr_sync_path, true)?;
390     }
391 
392     // If the udmabuf driver exists on the host, bind mount it in.
393     let udmabuf_path = Path::new("/dev/udmabuf");
394     if udmabuf_path.exists() {
395         jail.mount_bind(udmabuf_path, udmabuf_path, true)?;
396     }
397 
398     // Libraries that are required when mesa drivers are dynamically loaded.
399     jail_mount_bind_if_exists(
400         &mut jail,
401         &[
402             "/usr/lib",
403             "/usr/lib64",
404             "/lib",
405             "/lib64",
406             "/usr/share/drirc.d",
407             "/usr/share/glvnd",
408             "/usr/share/libdrm",
409             "/usr/share/vulkan",
410         ],
411     )?;
412 
413     // pvr driver requires read access to /proc/self/task/*/comm.
414     mount_proc(&mut jail)?;
415 
416     // To enable perfetto tracing, we need to give access to the perfetto service IPC
417     // endpoints.
418     let perfetto_path = Path::new("/run/perfetto");
419     if perfetto_path.exists() {
420         jail.mount_bind(perfetto_path, perfetto_path, true)?;
421     }
422 
423     Ok(jail)
424 }
425 
426 /// Selectively bind mount drm nodes into `jail` based on `render_node_only`
427 ///
428 /// This function will not return an error if drm nodes don't exist
jail_mount_bind_drm(jail: &mut Minijail, render_node_only: bool) -> Result<()>429 pub fn jail_mount_bind_drm(jail: &mut Minijail, render_node_only: bool) -> Result<()> {
430     if render_node_only {
431         const DRM_NUM_NODES: u32 = 63;
432         const DRM_RENDER_NODE_START: u32 = 128;
433         for offset in 0..DRM_NUM_NODES {
434             let path_str = format!("/dev/dri/renderD{}", DRM_RENDER_NODE_START + offset);
435             let drm_dri_path = Path::new(&path_str);
436             if !drm_dri_path.exists() {
437                 break;
438             }
439             jail.mount_bind(drm_dri_path, drm_dri_path, false)?;
440         }
441     } else {
442         let drm_dri_path = Path::new("/dev/dri");
443         if drm_dri_path.exists() {
444             jail.mount_bind(drm_dri_path, drm_dri_path, false)?;
445         }
446     }
447 
448     Ok(())
449 }
450 
451 /// Mirror-mount all the directories in `dirs` into `jail` on a best-effort basis.
452 ///
453 /// This function will not return an error if any of the directories in `dirs` is missing.
jail_mount_bind_if_exists<P: AsRef<std::ffi::OsStr>>( jail: &mut Minijail, dirs: &[P], ) -> Result<()>454 pub fn jail_mount_bind_if_exists<P: AsRef<std::ffi::OsStr>>(
455     jail: &mut Minijail,
456     dirs: &[P],
457 ) -> Result<()> {
458     for dir in dirs {
459         let dir_path = Path::new(dir);
460         if dir_path.exists() {
461             jail.mount_bind(dir_path, dir_path, false)?;
462         }
463     }
464 
465     Ok(())
466 }
467 
468 /// Mount proc in the sandbox.
mount_proc(jail: &mut Minijail) -> Result<()>469 pub fn mount_proc(jail: &mut Minijail) -> Result<()> {
470     jail.mount(
471         Path::new("proc"),
472         Path::new("/proc"),
473         "proc",
474         (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC | libc::MS_RDONLY) as usize,
475     )?;
476     Ok(())
477 }
478 
/// Read minijail internal struct address for uniquely identifying and tracking jail's lifetime
///
/// The returned value is the single pointer-sized word that makes up the [Minijail] object.
#[cfg(feature = "seccomp_trace")]
pub fn read_jail_addr(jail: &Minijail) -> usize {
    // This relies on minijail's rust object containing nothing but a pointer to the C jail
    // struct. The size half of that assumption is enforced at compile time.
    assert_eq_size!(Minijail, usize);
    let jail_word = (jail as *const Minijail).cast::<usize>();
    // SAFETY: `jail` is a valid reference and the static assert above guarantees the object is
    // exactly `usize` bytes, so the read stays within its bounds.
    unsafe { jail_word.read() }
}
487 
488 /// Set the uid/gid for the jailed process and give a basic id map. This is
489 /// required for bind mounts to work.
add_current_user_to_jail(jail: &mut Minijail) -> Result<()>490 fn add_current_user_to_jail(jail: &mut Minijail) -> Result<()> {
491     let crosvm_uid = geteuid();
492     let crosvm_gid = getegid();
493 
494     jail.uidmap(&format!("{0} {0} 1", crosvm_uid))
495         .context("error setting UID map")?;
496     jail.gidmap(&format!("{0} {0} 1", crosvm_gid))
497         .context("error setting GID map")?;
498 
499     if crosvm_uid != 0 {
500         jail.change_uid(crosvm_uid);
501     }
502     if crosvm_gid != 0 {
503         jail.change_gid(crosvm_gid);
504     }
505     Ok(())
506 }
507 
508 /// Set the seccomp policy for a jail from embedded bpfs
set_embedded_bpf_program(jail: &mut Minijail, seccomp_policy_name: &str) -> Result<()>509 pub fn set_embedded_bpf_program(jail: &mut Minijail, seccomp_policy_name: &str) -> Result<()> {
510     let bpf_program = EMBEDDED_BPFS.get(seccomp_policy_name).with_context(|| {
511         format!(
512             "failed to find embedded seccomp policy: {}",
513             seccomp_policy_name
514         )
515     })?;
516     jail.parse_seccomp_bytes(bpf_program).with_context(|| {
517         format!(
518             "failed to parse embedded seccomp policy: {}",
519             seccomp_policy_name
520         )
521     })?;
522     Ok(())
523 }
524