1 // Copyright 2017 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #![deny(missing_docs)]
6 #![allow(dead_code)]
7
8 use std::path::Path;
9 use std::str;
10
11 use anyhow::bail;
12 use anyhow::Context;
13 use anyhow::Result;
14 #[cfg(feature = "seccomp_trace")]
15 use base::debug;
16 use base::getegid;
17 use base::geteuid;
18 #[cfg(feature = "seccomp_trace")]
19 use base::warn;
20 use libc::c_ulong;
21 use minijail::Minijail;
22 #[cfg(not(feature = "seccomp_trace"))]
23 use once_cell::sync::Lazy;
24 #[cfg(feature = "seccomp_trace")]
25 use static_assertions::assert_eq_size;
26 #[cfg(feature = "seccomp_trace")]
27 use zerocopy::AsBytes;
28
29 use crate::config::JailConfig;
30
31 // ANDROID: b/246968493
32 #[cfg(not(feature = "seccomp_trace"))]
33 static EMBEDDED_BPFS: Lazy<std::collections::HashMap<&str, Vec<u8>>> =
34 Lazy::new(|| std::collections::HashMap::<&str, Vec<u8>>::new());
35
/// Default limit on open file descriptors; most devices don't need to open many fds.
pub const MAX_OPEN_FILES_DEFAULT: u64 = 1024;
/// The max open files for gpu processes.
const MAX_OPEN_FILES_FOR_GPU: u64 = 32 * 1024;
/// The max open files for jail warden, matching FD_RAW_FAILURE.
pub const MAX_OPEN_FILES_FOR_JAIL_WARDEN: u64 = 64 * 1024;
42
/// The user in the jail to run as.
///
/// The type is a small plain value, so it derives `Debug` (for logging), `Clone`/`Copy`
/// (cheap to pass by value) and `PartialEq`/`Eq` (so callers and tests can compare variants).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RunAsUser {
    /// Do not specify the user
    Unspecified,
    /// Runs as the same user in the jail as the current user.
    CurrentUser,
    /// Runs as the root user in the jail.
    Root,
    /// Runs as the specified uid and gid, given in that order.
    /// This requires `SandboxConfig::ugid_map` to be set.
    Specified(u32, u32),
}
55
/// Config for the sandbox to be created by [Minijail].
///
/// The seccomp-related fields are private and are filled in by `SandboxConfig::new`; the
/// public fields can be adjusted by the caller after construction.
pub struct SandboxConfig<'a> {
    /// Whether or not to drop all capabilities in the sandbox.
    pub limit_caps: bool,
    /// Whether seccomp filter failures should be logged. When set, the sandbox is built from
    /// the `.policy` file rather than a precompiled `.bpf` file.
    log_failures: bool,
    /// Directory containing the seccomp policy files. When `None`, the embedded policy with
    /// the matching name is used instead.
    seccomp_policy_dir: Option<&'a Path>,
    /// Name of the seccomp policy: the file stem inside `seccomp_policy_dir`, or the lookup
    /// key for the embedded policies.
    seccomp_policy_name: &'a str,
    /// The pair of `uid_map` and `gid_map`.
    pub ugid_map: Option<(&'a str, &'a str)>,
    /// The remount mode instead of default MS_PRIVATE.
    pub remount_mode: Option<c_ulong>,
    /// Whether to use empty net namespace. Enabled by default.
    pub namespace_net: bool,
    /// Whether or not to configure the jail to support bind-mounts.
    ///
    /// Note that most device processes deny `open(2)` and `openat(2)` by seccomp policy and just
    /// returns `ENOENT`. Passing opened file descriptors is recommended over opening files in the
    /// sandbox.
    pub bind_mounts: bool,
    /// Specify the user in the jail to run as.
    pub run_as: RunAsUser,
}
78
79 impl<'a> SandboxConfig<'a> {
80 /// Creates [SandboxConfig].
new(jail_config: &'a JailConfig, policy: &'a str) -> Self81 pub fn new(jail_config: &'a JailConfig, policy: &'a str) -> Self {
82 Self {
83 limit_caps: true,
84 log_failures: jail_config.seccomp_log_failures,
85 seccomp_policy_dir: jail_config.seccomp_policy_dir.as_ref().map(Path::new),
86 seccomp_policy_name: policy,
87 ugid_map: None,
88 remount_mode: None,
89 namespace_net: true,
90 bind_mounts: false,
91 run_as: RunAsUser::Unspecified,
92 }
93 }
94 }
95
96 /// Wrapper that cleans up a [Minijail] when it is dropped
97 pub struct ScopedMinijail(pub Minijail);
98
99 impl Drop for ScopedMinijail {
drop(&mut self)100 fn drop(&mut self) {
101 let _ = self.0.kill();
102 }
103 }
104
105 /// Creates a [Minijail] instance which just changes the root using pivot_root(2) path and
106 /// `max_open_files` using `RLIMIT_NOFILE`.
107 ///
108 /// If `root` path is "/", the minijail don't change the root.
109 ///
110 /// # Arguments
111 ///
112 /// * `root` - The root path to be changed to by minijail.
113 /// * `max_open_files` - The maximum number of file descriptors to allow a jailed process to open.
114 #[allow(clippy::unnecessary_cast)]
create_base_minijail(root: &Path, max_open_files: u64) -> Result<Minijail>115 pub fn create_base_minijail(root: &Path, max_open_files: u64) -> Result<Minijail> {
116 // Validate new root directory. Path::is_dir() also checks the existence.
117 if !root.is_dir() {
118 bail!("{:?} is not a directory, cannot create jail", root);
119 }
120 // chroot accepts absolute path only.
121 if !root.is_absolute() {
122 bail!("{:?} is not absolute path", root);
123 }
124
125 let mut jail = Minijail::new().context("failed to jail device")?;
126
127 // Only pivot_root if we are not re-using the current root directory.
128 if root != Path::new("/") {
129 // Run in a new mount namespace.
130 jail.namespace_vfs();
131 jail.enter_pivot_root(root)
132 .context("failed to pivot root device")?;
133 }
134
135 jail.set_rlimit(libc::RLIMIT_NOFILE as i32, max_open_files, max_open_files)
136 .context("error setting max open files")?;
137
138 Ok(jail)
139 }
140
141 /// Creates a [Minijail] instance which just invokes a jail process and sets
142 /// `max_open_files` using `RLIMIT_NOFILE`. This is helpful with crosvm process
143 /// runs as a non-root user without SYS_ADMIN capabilities.
144 ///
145 /// Unlike `create_base_minijail`, this function doesn't call `pivot_root`
146 /// and `mount namespace`. So, it runs as a non-root user without
147 /// SYS_ADMIN capabilities.
148 ///
149 /// Note that since there is no file system isolation provided by this function,
150 /// caller of this function should enforce other security mechanisum such as selinux
151 /// on the host to protect directories.
152 ///
153 /// # Arguments
154 ///
155 /// * `root` - The root path to checked before the process is jailed
156 /// * `max_open_files` - The maximum number of file descriptors to allow a jailed process to open.
157 #[allow(clippy::unnecessary_cast)]
create_base_minijail_without_pivot_root( root: &Path, max_open_files: u64, ) -> Result<Minijail>158 pub fn create_base_minijail_without_pivot_root(
159 root: &Path,
160 max_open_files: u64,
161 ) -> Result<Minijail> {
162 // Validate new root directory. Path::is_dir() also checks the existence.
163 if !root.is_dir() {
164 bail!("{:?} is not a directory, cannot create jail", root);
165 }
166 if !root.is_absolute() {
167 bail!("{:?} is not absolute path", root);
168 }
169
170 let mut jail = Minijail::new().context("failed to jail device")?;
171 jail.set_rlimit(libc::RLIMIT_NOFILE as i32, max_open_files, max_open_files)
172 .context("error setting max open files")?;
173
174 Ok(jail)
175 }
176
177 /// Creates a [Minijail] instance which creates a sandbox.
178 ///
179 /// # Arguments
180 ///
181 /// * `root` - The root path to be changed to by minijail.
182 /// * `max_open_files` - The maximum number of file descriptors to allow a jailed process to open.
183 /// * `config` - The [SandboxConfig] to control details of the sandbox.
create_sandbox_minijail( root: &Path, max_open_files: u64, config: &SandboxConfig, ) -> Result<Minijail>184 pub fn create_sandbox_minijail(
185 root: &Path,
186 max_open_files: u64,
187 config: &SandboxConfig,
188 ) -> Result<Minijail> {
189 let mut jail = create_base_minijail(root, max_open_files)?;
190
191 jail.namespace_pids();
192 jail.namespace_user();
193 jail.namespace_user_disable_setgroups();
194 if config.limit_caps {
195 // Don't need any capabilities.
196 jail.use_caps(0);
197 }
198 match config.run_as {
199 RunAsUser::Unspecified => {
200 if config.bind_mounts && config.ugid_map.is_none() {
201 // Minijail requires to set user/group map to mount extra directories.
202 add_current_user_to_jail(&mut jail)?;
203 }
204 }
205 RunAsUser::CurrentUser => {
206 add_current_user_to_jail(&mut jail)?;
207 }
208 RunAsUser::Root => {
209 // Add the current user as root in the jail.
210 let crosvm_uid = geteuid();
211 let crosvm_gid = getegid();
212 jail.uidmap(&format!("0 {} 1", crosvm_uid))
213 .context("error setting UID map")?;
214 jail.gidmap(&format!("0 {} 1", crosvm_gid))
215 .context("error setting GID map")?;
216 }
217 RunAsUser::Specified(uid, gid) => {
218 if uid != 0 {
219 jail.change_uid(uid)
220 }
221 if gid != 0 {
222 jail.change_gid(gid)
223 }
224 }
225 }
226 if config.bind_mounts {
227 // Create a tmpfs in the device's root directory so that we can bind mount files.
228 // The size=67108864 is size=64*1024*1024 or size=64MB.
229 // TODO(b/267581374): Use appropriate size for tmpfs.
230 jail.mount_with_data(
231 Path::new("none"),
232 Path::new("/"),
233 "tmpfs",
234 (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize,
235 "size=67108864",
236 )?;
237 }
238 if let Some((uid_map, gid_map)) = config.ugid_map {
239 jail.uidmap(uid_map).context("error setting UID map")?;
240 jail.gidmap(gid_map).context("error setting GID map")?;
241 }
242 // Run in a new mount namespace.
243 jail.namespace_vfs();
244
245 if config.namespace_net {
246 // Run in an empty network namespace.
247 jail.namespace_net();
248 }
249
250 // Don't allow the device to gain new privileges.
251 jail.no_new_privs();
252
253 #[cfg(feature = "seccomp_trace")]
254 {
255 #[repr(C)]
256 #[derive(AsBytes)]
257 struct sock_filter {
258 /* Filter block */
259 code: u16, /* Actual filter code */
260 jt: u8, /* Jump true */
261 jf: u8, /* Jump false */
262 k: u32, /* Generic multiuse field */
263 }
264
265 // BPF constant is defined in https://elixir.bootlin.com/linux/latest/source/include/uapi/linux/bpf_common.h
266 // BPF parser/assembler is defined in https://elixir.bootlin.com/linux/v4.9/source/tools/net/bpf_exp.y
267 const SECCOMP_RET_TRACE: u32 = 0x7ff00000;
268 const SECCOMP_RET_LOG: u32 = 0x7ffc0000;
269 const BPF_RET: u16 = 0x06;
270 const BPF_K: u16 = 0x00;
271
272 // return SECCOMP_RET_LOG for all syscalls
273 const FILTER_RET_LOG_BLOCK: sock_filter = sock_filter {
274 code: BPF_RET | BPF_K,
275 jt: 0,
276 jf: 0,
277 k: SECCOMP_RET_LOG,
278 };
279
280 warn!("The running crosvm is compiled with seccomp_trace feature, and is striclty used for debugging purpose only. DO NOT USE IN PRODUCTION!!!");
281 debug!(
282 "seccomp_trace {{\"event\": \"minijail_create\", \"name\": \"{}\", \"jail_addr\": \"0x{:x}\"}}",
283 config.seccomp_policy_name,
284 read_jail_addr(&jail),
285 );
286 jail.parse_seccomp_bytes(FILTER_RET_LOG_BLOCK.as_bytes())
287 .unwrap();
288 }
289
290 #[cfg(not(feature = "seccomp_trace"))]
291 if let Some(seccomp_policy_dir) = config.seccomp_policy_dir {
292 let seccomp_policy_path = seccomp_policy_dir.join(config.seccomp_policy_name);
293 // By default we'll prioritize using the pre-compiled .bpf over the .policy file (the .bpf
294 // is expected to be compiled using "trap" as the failure behavior instead of the default
295 // "kill" behavior) when a policy path is supplied in the command line arugments. Otherwise
296 // the built-in pre-compiled policies will be used.
297 // Refer to the code comment for the "seccomp-log-failures" command-line parameter for an
298 // explanation about why the |log_failures| flag forces the use of .policy files (and the
299 // build-time alternative to this run-time flag).
300 let bpf_policy_file = seccomp_policy_path.with_extension("bpf");
301 if bpf_policy_file.exists() && !config.log_failures {
302 jail.parse_seccomp_program(&bpf_policy_file)
303 .with_context(|| {
304 format!(
305 "failed to parse precompiled seccomp policy: {}",
306 bpf_policy_file.display()
307 )
308 })?;
309 } else {
310 // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP, which will correctly
311 // kill the entire device process if a worker thread commits a seccomp violation.
312 jail.set_seccomp_filter_tsync();
313 if config.log_failures {
314 jail.log_seccomp_filter_failures();
315 }
316 let bpf_policy_file = seccomp_policy_path.with_extension("policy");
317 jail.parse_seccomp_filters(&bpf_policy_file)
318 .with_context(|| {
319 format!(
320 "failed to parse seccomp policy: {}",
321 bpf_policy_file.display()
322 )
323 })?;
324 }
325 } else {
326 set_embedded_bpf_program(&mut jail, config.seccomp_policy_name)?;
327 }
328
329 jail.use_seccomp_filter();
330 // Don't do init setup.
331 jail.run_as_init();
332 // Set up requested remount mode instead of default MS_PRIVATE.
333 if let Some(mode) = config.remount_mode {
334 jail.set_remount_mode(mode);
335 }
336
337 Ok(jail)
338 }
339
340 /// Creates a basic [Minijail] if `jail_config` is present.
341 ///
342 /// Returns `None` if `jail_config` is none.
simple_jail(jail_config: &Option<JailConfig>, policy: &str) -> Result<Option<Minijail>>343 pub fn simple_jail(jail_config: &Option<JailConfig>, policy: &str) -> Result<Option<Minijail>> {
344 if let Some(jail_config) = jail_config {
345 let config = SandboxConfig::new(jail_config, policy);
346 Ok(Some(create_sandbox_minijail(
347 &jail_config.pivot_root,
348 MAX_OPEN_FILES_DEFAULT,
349 &config,
350 )?))
351 } else {
352 Ok(None)
353 }
354 }
355
356 /// Creates [Minijail] for gpu processes.
create_gpu_minijail( root: &Path, config: &SandboxConfig, render_node_only: bool, ) -> Result<Minijail>357 pub fn create_gpu_minijail(
358 root: &Path,
359 config: &SandboxConfig,
360 render_node_only: bool,
361 ) -> Result<Minijail> {
362 let mut jail = create_sandbox_minijail(root, MAX_OPEN_FILES_FOR_GPU, config)?;
363
364 // Device nodes required for DRM.
365 let sys_dev_char_path = Path::new("/sys/dev/char");
366 jail.mount_bind(sys_dev_char_path, sys_dev_char_path, false)?;
367
368 // Necessary for CGROUP control of the vGPU threads
369 // This is not necessary UNLESS one wants to make use
370 // of the gpu cgroup command line options.
371 let sys_cpuset_path = Path::new("/sys/fs/cgroup/cpuset");
372 if sys_cpuset_path.exists() {
373 jail.mount_bind(sys_cpuset_path, sys_cpuset_path, true)?;
374 }
375
376 let sys_devices_path = Path::new("/sys/devices");
377 jail.mount_bind(sys_devices_path, sys_devices_path, false)?;
378
379 jail_mount_bind_drm(&mut jail, render_node_only)?;
380
381 // If the ARM specific devices exist on the host, bind mount them in.
382 let mali0_path = Path::new("/dev/mali0");
383 if mali0_path.exists() {
384 jail.mount_bind(mali0_path, mali0_path, true)?;
385 }
386
387 let pvr_sync_path = Path::new("/dev/pvr_sync");
388 if pvr_sync_path.exists() {
389 jail.mount_bind(pvr_sync_path, pvr_sync_path, true)?;
390 }
391
392 // If the udmabuf driver exists on the host, bind mount it in.
393 let udmabuf_path = Path::new("/dev/udmabuf");
394 if udmabuf_path.exists() {
395 jail.mount_bind(udmabuf_path, udmabuf_path, true)?;
396 }
397
398 // Libraries that are required when mesa drivers are dynamically loaded.
399 jail_mount_bind_if_exists(
400 &mut jail,
401 &[
402 "/usr/lib",
403 "/usr/lib64",
404 "/lib",
405 "/lib64",
406 "/usr/share/drirc.d",
407 "/usr/share/glvnd",
408 "/usr/share/libdrm",
409 "/usr/share/vulkan",
410 ],
411 )?;
412
413 // pvr driver requires read access to /proc/self/task/*/comm.
414 mount_proc(&mut jail)?;
415
416 // To enable perfetto tracing, we need to give access to the perfetto service IPC
417 // endpoints.
418 let perfetto_path = Path::new("/run/perfetto");
419 if perfetto_path.exists() {
420 jail.mount_bind(perfetto_path, perfetto_path, true)?;
421 }
422
423 Ok(jail)
424 }
425
426 /// Selectively bind mount drm nodes into `jail` based on `render_node_only`
427 ///
428 /// This function will not return an error if drm nodes don't exist
jail_mount_bind_drm(jail: &mut Minijail, render_node_only: bool) -> Result<()>429 pub fn jail_mount_bind_drm(jail: &mut Minijail, render_node_only: bool) -> Result<()> {
430 if render_node_only {
431 const DRM_NUM_NODES: u32 = 63;
432 const DRM_RENDER_NODE_START: u32 = 128;
433 for offset in 0..DRM_NUM_NODES {
434 let path_str = format!("/dev/dri/renderD{}", DRM_RENDER_NODE_START + offset);
435 let drm_dri_path = Path::new(&path_str);
436 if !drm_dri_path.exists() {
437 break;
438 }
439 jail.mount_bind(drm_dri_path, drm_dri_path, false)?;
440 }
441 } else {
442 let drm_dri_path = Path::new("/dev/dri");
443 if drm_dri_path.exists() {
444 jail.mount_bind(drm_dri_path, drm_dri_path, false)?;
445 }
446 }
447
448 Ok(())
449 }
450
451 /// Mirror-mount all the directories in `dirs` into `jail` on a best-effort basis.
452 ///
453 /// This function will not return an error if any of the directories in `dirs` is missing.
jail_mount_bind_if_exists<P: AsRef<std::ffi::OsStr>>( jail: &mut Minijail, dirs: &[P], ) -> Result<()>454 pub fn jail_mount_bind_if_exists<P: AsRef<std::ffi::OsStr>>(
455 jail: &mut Minijail,
456 dirs: &[P],
457 ) -> Result<()> {
458 for dir in dirs {
459 let dir_path = Path::new(dir);
460 if dir_path.exists() {
461 jail.mount_bind(dir_path, dir_path, false)?;
462 }
463 }
464
465 Ok(())
466 }
467
468 /// Mount proc in the sandbox.
mount_proc(jail: &mut Minijail) -> Result<()>469 pub fn mount_proc(jail: &mut Minijail) -> Result<()> {
470 jail.mount(
471 Path::new("proc"),
472 Path::new("/proc"),
473 "proc",
474 (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC | libc::MS_RDONLY) as usize,
475 )?;
476 Ok(())
477 }
478
/// Reads the first pointer-sized word of `jail`, used as an identifier for tracking the jail's
/// lifetime in seccomp_trace logs.
///
/// This relies on minijail's rust object containing only a pointer to the C jail struct.
#[cfg(feature = "seccomp_trace")]
pub fn read_jail_addr(jail: &Minijail) -> usize {
    // Compile-time guard: the read below is only in bounds if `Minijail` is exactly one
    // `usize` wide.
    assert_eq_size!(Minijail, usize);

    let word_ptr = (jail as *const Minijail).cast::<usize>();
    // SAFETY: `word_ptr` comes from a valid reference and the static assert above guarantees
    // that reading one `usize` stays within the bounds of `Minijail`.
    unsafe { *word_ptr }
}
487
488 /// Set the uid/gid for the jailed process and give a basic id map. This is
489 /// required for bind mounts to work.
add_current_user_to_jail(jail: &mut Minijail) -> Result<()>490 fn add_current_user_to_jail(jail: &mut Minijail) -> Result<()> {
491 let crosvm_uid = geteuid();
492 let crosvm_gid = getegid();
493
494 jail.uidmap(&format!("{0} {0} 1", crosvm_uid))
495 .context("error setting UID map")?;
496 jail.gidmap(&format!("{0} {0} 1", crosvm_gid))
497 .context("error setting GID map")?;
498
499 if crosvm_uid != 0 {
500 jail.change_uid(crosvm_uid);
501 }
502 if crosvm_gid != 0 {
503 jail.change_gid(crosvm_gid);
504 }
505 Ok(())
506 }
507
508 /// Set the seccomp policy for a jail from embedded bpfs
set_embedded_bpf_program(jail: &mut Minijail, seccomp_policy_name: &str) -> Result<()>509 pub fn set_embedded_bpf_program(jail: &mut Minijail, seccomp_policy_name: &str) -> Result<()> {
510 let bpf_program = EMBEDDED_BPFS.get(seccomp_policy_name).with_context(|| {
511 format!(
512 "failed to find embedded seccomp policy: {}",
513 seccomp_policy_name
514 )
515 })?;
516 jail.parse_seccomp_bytes(bpf_program).with_context(|| {
517 format!(
518 "failed to parse embedded seccomp policy: {}",
519 seccomp_policy_name
520 )
521 })?;
522 Ok(())
523 }
524