xref: /aosp_15_r20/external/libcap/cap/cap.go (revision 2810ac1b38eead2603277920c78344c84ddf3aff)
1// Package cap provides all the Linux Capabilities userspace library API
2// bindings in native Go.
3//
4// Capabilities are a feature of the Linux kernel that allow fine
5// grain permissions to perform privileged operations. Privileged
6// operations are required to do irregular system level operations
7// from code. You can read more about how Capabilities are intended to
8// work here:
9//
10//   https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/33528.pdf
11//
12// This package supports native Go bindings for all the features
13// described in that paper as well as supporting subsequent changes to
14// the kernel for other styles of inheritable Capability.
15//
16// Some simple things you can do with this package are:
17//
18//   // Read and display the capabilities of the running process
19//   c := cap.GetProc()
//   log.Printf("this process has these caps: %q", c)
21//
22//   // Drop any privilege a process might have (including for root,
23//   // but note root 'owns' a lot of system files so a cap-limited
24//   // root can still do considerable damage to a running system).
25//   old := cap.GetProc()
26//   empty := cap.NewSet()
27//   if err := empty.SetProc(); err != nil {
28//       log.Fatalf("failed to drop privilege: %q -> %q: %v", old, empty, err)
29//   }
30//   now := cap.GetProc()
31//   if cf, _ := now.Cf(empty); cf != 0 {
32//       log.Fatalf("failed to fully drop privilege: have=%q, wanted=%q", now, empty)
33//   }
34//
35// The "cap" package operates with POSIX semantics for security
// state. That is, all OS threads are kept in sync at all times. The
37// package "kernel.org/pub/linux/libs/security/libcap/psx" is used to
38// implement POSIX semantics system calls that manipulate thread state
39// uniformly over the whole Go (and any CGo linked) process runtime.
40//
41// Note, if the Go runtime syscall interface contains the Linux
42// variant syscall.AllThreadsSyscall() API (it debuted in go1.16 see
43// https://github.com/golang/go/issues/1435 for its history) then the
44// "libcap/psx" package will use that to invoke Capability setting
45// system calls in pure Go binaries. With such an enhanced Go runtime,
46// to force this behavior, use the CGO_ENABLED=0 environment variable.
47//
48// POSIX semantics are more secure than trying to manage privilege at
49// a thread level when those threads share a common memory image as
50// they do under Linux: it is trivial to exploit a vulnerability in
// one thread of a process to cause execution on any other
52// thread. So, any imbalance in security state, in such cases will
53// readily create an opportunity for a privilege escalation
54// vulnerability.
55//
56// POSIX semantics also work well with Go, which deliberately tries to
57// insulate the user from worrying about the number of OS threads that
58// are actually running in their program. Indeed, Go can efficiently
59// launch and manage tens of thousands of concurrent goroutines
60// without bogging the program or wider system down. It does this by
61// aggressively migrating idle threads to make progress on unblocked
62// goroutines. So, inconsistent security state across OS threads can
63// also lead to program misbehavior.
64//
65// The only exception to this process-wide common security state is
66// the cap.Launcher related functionality. This briefly locks an OS
67// thread to a goroutine in order to launch another executable - the
68// robust implementation of this kind of support is quite subtle, so
69// please read its documentation carefully, if you find that you need
70// it.
71//
72// See https://sites.google.com/site/fullycapable/ for recent updates,
73// some more complete walk-through examples of ways of using
74// 'cap.Set's etc and information on how to file bugs.
75//
// Copyright (c) 2019-21 Andrew G. Morgan <morgan@kernel.org>
77//
78// The cap and psx packages are licensed with a (you choose) BSD
79// 3-clause or GPL2. See LICENSE file for details.
80package cap // import "kernel.org/pub/linux/libs/security/libcap/cap"
81
82import (
83	"errors"
84	"sort"
85	"sync"
86	"syscall"
87	"unsafe"
88)
89
// Value is the type of a single capability (or permission) bit.
type Value uint

// Flag names one of the three Value dimensions held in a Set. It is
// also used in the (*IAB).Fill() method for changing the Bounding and
// Ambient Vectors.
type Flag uint

// Effective, Permitted, Inheritable are the three Flags of Values
// held in a Set. They are numbered 0, 1 and 2 respectively; those
// numbers are also used as bit positions by the Diff masks.
const (
	Effective Flag = iota
	Permitted
	Inheritable
)

// Diff summarizes the result of the (*Set).Cf() function.
type Diff uint

// One Diff mask bit per Flag, placed at the bit position given by the
// numeric value of that Flag.
const (
	effectiveDiff   = Diff(1) << Effective
	permittedDiff   = Diff(1) << Permitted
	inheritableDiff = Diff(1) << Inheritable
)
114
115// String identifies a Flag value by its conventional "e", "p" or "i"
116// string abbreviation.
117func (f Flag) String() string {
118	switch f {
119	case Effective:
120		return "e"
121	case Permitted:
122		return "p"
123	case Inheritable:
124		return "i"
125	default:
126		return "<Error>"
127	}
128}
129
130// data holds a 32-bit slice of the compressed bitmaps of capability
131// sets as understood by the kernel.
132type data [Inheritable + 1]uint32
133
134// Set is an opaque capabilities container for a set of system
135// capbilities. It holds individually addressable capability Value's
136// for the three capability Flag's. See GetFlag() and SetFlag() for
137// how to adjust them individually, and Clear() and ClearFlag() for
138// how to do bulk operations.
139//
140// For admin tasks associated with managing namespace specific file
141// capabilities, Set can also support a namespace-root-UID value which
142// defaults to zero. See GetNSOwner() and SetNSOwner().
143type Set struct {
144	// mu protects all other members of a Set.
145	mu sync.RWMutex
146
147	// flat holds Flag Value bitmaps for all capabilities
148	// associated with this Set.
149	flat []data
150
151	// Linux specific
152	nsRoot int
153}
154
// Kernel capability ABI magic values known to this package.
const (
	kv1 = 0x19980330 // First iteration of process capabilities (32 bits).
	kv2 = 0x20071026 // First iteration of process and file capabilities (64 bits) - deprecated.
	kv3 = 0x20080522 // Most recently supported process and file capabilities (64 bits).
)

// Session-wide state discovered once, on first use, by
// (*syscaller).cInit().
var (
	// startUp guards the one-time setting of magic, words and
	// maxValues.
	startUp sync.Once

	// magic holds the preferred magic number for the kernel ABI.
	magic uint32

	// words holds the number of uint32's associated with each
	// capability Flag for this session.
	words int

	// maxValues holds the number of bit values that are named by
	// the running kernel. This is generally expected to match
	// ValueCount which is autogenerated at packaging time.
	maxValues uint
)
179
// header is the first argument handed, by pointer, to the capget and
// capset system calls issued by this package. The field order and
// widths presumably mirror the kernel's own header structure, so they
// should not be rearranged (NOTE(review): confirm against capget(2)).
type header struct {
	// magic is the capability ABI version (one of the kv* values).
	magic uint32
	// pid is the target process; GetPID documents 0 as an alias
	// for the current process.
	pid int32
}
184
// syscaller is a type for abstracting syscalls. It bundles four
// function values: 3-argument and 6-argument flavors of a "read"
// call and of a "write" call.
//
// The r* variants are for reading state, and can be parallelized;
// the w* variants need to be serialized so all OS threads can share
// state.
type syscaller struct {
	r3 func(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno)
	w3 func(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno)
	r6 func(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err syscall.Errno)
	w6 func(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err syscall.Errno)
}
194
195// caprcall provides a pointer etc wrapper for the system calls
196// associated with getcap.
197//go:uintptrescapes
198func (sc *syscaller) caprcall(call uintptr, h *header, d []data) error {
199	x := uintptr(0)
200	if d != nil {
201		x = uintptr(unsafe.Pointer(&d[0]))
202	}
203	_, _, err := sc.r3(call, uintptr(unsafe.Pointer(h)), x, 0)
204	if err != 0 {
205		return err
206	}
207	return nil
208}
209
210// capwcall provides a pointer etc wrapper for the system calls
211// associated with setcap.
212//go:uintptrescapes
213func (sc *syscaller) capwcall(call uintptr, h *header, d []data) error {
214	x := uintptr(0)
215	if d != nil {
216		x = uintptr(unsafe.Pointer(&d[0]))
217	}
218	_, _, err := sc.w3(call, uintptr(unsafe.Pointer(h)), x, 0)
219	if err != 0 {
220		return err
221	}
222	return nil
223}
224
225// prctlrcall provides a wrapper for the prctl systemcalls that only
226// read kernel state. There is a limited number of arguments needed
227// and the caller should use 0 for those not needed.
228func (sc *syscaller) prctlrcall(prVal, v1, v2 uintptr) (int, error) {
229	r, _, err := sc.r3(syscall.SYS_PRCTL, prVal, v1, v2)
230	if err != 0 {
231		return int(r), err
232	}
233	return int(r), nil
234}
235
236// prctlrcall6 provides a wrapper for the prctl systemcalls that only
237// read kernel state and require 6 arguments - ambient cap API, I'm
238// looking at you. There is a limited number of arguments needed and
239// the caller should use 0 for those not needed.
240func (sc *syscaller) prctlrcall6(prVal, v1, v2, v3, v4, v5 uintptr) (int, error) {
241	r, _, err := sc.r6(syscall.SYS_PRCTL, prVal, v1, v2, v3, v4, v5)
242	if err != 0 {
243		return int(r), err
244	}
245	return int(r), nil
246}
247
248// prctlwcall provides a wrapper for the prctl systemcalls that
249// write/modify kernel state. Where available, these will use the
250// POSIX semantics fixup system calls. There is a limited number of
251// arguments needed and the caller should use 0 for those not needed.
252func (sc *syscaller) prctlwcall(prVal, v1, v2 uintptr) (int, error) {
253	r, _, err := sc.w3(syscall.SYS_PRCTL, prVal, v1, v2)
254	if err != 0 {
255		return int(r), err
256	}
257	return int(r), nil
258}
259
260// prctlwcall6 provides a wrapper for the prctl systemcalls that
261// write/modify kernel state and require 6 arguments - ambient cap
262// API, I'm looking at you. (Where available, these will use the POSIX
263// semantics fixup system calls). There is a limited number of
264// arguments needed and the caller should use 0 for those not needed.
265func (sc *syscaller) prctlwcall6(prVal, v1, v2, v3, v4, v5 uintptr) (int, error) {
266	r, _, err := sc.w6(syscall.SYS_PRCTL, prVal, v1, v2, v3, v4, v5)
267	if err != 0 {
268		return int(r), err
269	}
270	return int(r), nil
271}
272
273// cInit performs the lazy identification of the capability vintage of
274// the running system.
275func (sc *syscaller) cInit() {
276	h := &header{
277		magic: kv3,
278	}
279	sc.caprcall(syscall.SYS_CAPGET, h, nil)
280	magic = h.magic
281	switch magic {
282	case kv1:
283		words = 1
284	case kv2, kv3:
285		words = 2
286	default:
287		// Fall back to a known good version.
288		magic = kv3
289		words = 2
290	}
291	// Use the bounding set to evaluate which capabilities exist.
292	maxValues = uint(sort.Search(32*words, func(n int) bool {
293		_, err := GetBound(Value(n))
294		return err != nil
295	}))
296	if maxValues == 0 {
297		// Fall back to using the largest value defined at build time.
298		maxValues = NamedCount
299	}
300}
301
302// MaxBits returns the number of kernel-named capabilities discovered
303// at runtime in the current system.
304func MaxBits() Value {
305	startUp.Do(multisc.cInit)
306	return Value(maxValues)
307}
308
309// NewSet returns an empty capability set.
310func NewSet() *Set {
311	startUp.Do(multisc.cInit)
312	return &Set{
313		flat: make([]data, words),
314	}
315}
316
317// ErrBadSet indicates a nil pointer was used for a *Set, or the
318// request of the Set is invalid in some way.
319var ErrBadSet = errors.New("bad capability set")
320
321// good confirms that c looks valid.
322func (c *Set) good() error {
323	if c == nil || len(c.flat) == 0 {
324		return ErrBadSet
325	}
326	return nil
327}
328
329// Dup returns a copy of the specified capability set.
330func (c *Set) Dup() (*Set, error) {
331	if err := c.good(); err != nil {
332		return nil, err
333	}
334	n := NewSet()
335	c.mu.RLock()
336	defer c.mu.RUnlock()
337	copy(n.flat, c.flat)
338	n.nsRoot = c.nsRoot
339	return n, nil
340}
341
342// GetPID returns the capability set associated with the target process
343// id; pid=0 is an alias for current.
344func GetPID(pid int) (*Set, error) {
345	v := NewSet()
346	if err := multisc.caprcall(syscall.SYS_CAPGET, &header{magic: magic, pid: int32(pid)}, v.flat); err != nil {
347		return nil, err
348	}
349	return v, nil
350}
351
352// GetProc returns the capability Set of the current process. If the
353// kernel is unable to determine the Set associated with the current
354// process, the function panic()s.
355func GetProc() *Set {
356	c, err := GetPID(0)
357	if err != nil {
358		panic(err)
359	}
360	return c
361}
362
363// setProc uses syscaller to set process capabilities.  Note, c is
364// either private to or (read) locked by the caller.
365func (sc *syscaller) setProc(c *Set) error {
366	return sc.capwcall(syscall.SYS_CAPSET, &header{magic: magic}, c.flat)
367}
368
369// SetProc attempts to set the capability Set of the current
370// process. The kernel will perform permission checks and an error
371// will be returned if the attempt fails. Should the attempt fail
372// no process capabilities will have been modified.
373//
374// Note, the general behavior of this call is to set the
375// process-shared capabilities. However, when called from a callback
376// function as part of a (*Launcher).Launch(), the call only sets the
377// capabilities of the thread being used to perform the launch.
378func (c *Set) SetProc() error {
379	if err := c.good(); err != nil {
380		return err
381	}
382	state, sc := scwStateSC()
383	defer scwSetState(launchBlocked, state, -1)
384	c.mu.RLock()
385	defer c.mu.RUnlock()
386	return sc.setProc(c)
387}
388
389// defines from uapi/linux/prctl.h
390const (
391	prCapBSetRead = 23
392	prCapBSetDrop = 24
393)
394
395// GetBound determines if a specific capability is currently part of
396// the local bounding set. On systems where the bounding set Value is
397// not present, this function returns an error.
398func GetBound(val Value) (bool, error) {
399	v, err := multisc.prctlrcall(prCapBSetRead, uintptr(val), 0)
400	if err != nil {
401		return false, err
402	}
403	return v > 0, nil
404}
405
406//go:uintptrescapes
407func (sc *syscaller) dropBound(val ...Value) error {
408	for _, v := range val {
409		if _, err := sc.prctlwcall(prCapBSetDrop, uintptr(v), 0); err != nil {
410			return err
411		}
412	}
413	return nil
414}
415
416// DropBound attempts to suppress bounding set Values. The kernel will
417// never allow a bounding set Value bit to be raised once successfully
418// dropped. However, dropping requires the current process is
419// sufficiently capable (usually via cap.SETPCAP being raised in the
420// Effective flag of the process' Set). Note, the drops are performed
421// in order and if one bounding value cannot be dropped, the function
422// returns immediately with an error which may leave the system in an
423// ill-defined state. The caller can determine where things went wrong
424// using GetBound().
425func DropBound(val ...Value) error {
426	state, sc := scwStateSC()
427	defer scwSetState(launchBlocked, state, -1)
428	return sc.dropBound(val...)
429}
430
431// defines from uapi/linux/prctl.h
432const (
433	prCapAmbient = 47
434
435	prCapAmbientIsSet    = 1
436	prCapAmbientRaise    = 2
437	prCapAmbientLower    = 3
438	prCapAmbientClearAll = 4
439)
440
441// GetAmbient determines if a specific capability is currently part of
442// the local ambient set. On systems where the ambient set Value is
443// not present, this function returns an error.
444func GetAmbient(val Value) (bool, error) {
445	r, err := multisc.prctlrcall6(prCapAmbient, prCapAmbientIsSet, uintptr(val), 0, 0, 0)
446	return r > 0, err
447}
448
449//go:uintptrescapes
450func (sc *syscaller) setAmbient(enable bool, val ...Value) error {
451	dir := uintptr(prCapAmbientLower)
452	if enable {
453		dir = prCapAmbientRaise
454	}
455	for _, v := range val {
456		_, err := sc.prctlwcall6(prCapAmbient, dir, uintptr(v), 0, 0, 0)
457		if err != nil {
458			return err
459		}
460	}
461	return nil
462}
463
464// SetAmbient attempts to set a specific Value bit to the state,
465// enable. This function will return an error if insufficient
466// permission is available to perform this task. The settings are
467// performed in order and the function returns immediately an error is
468// detected. Use GetAmbient() to unravel where things went
469// wrong. Note, the cap package manages an abstraction IAB that
470// captures all three inheritable vectors in a single type. Consider
471// using that.
472func SetAmbient(enable bool, val ...Value) error {
473	state, sc := scwStateSC()
474	defer scwSetState(launchBlocked, state, -1)
475	return sc.setAmbient(enable, val...)
476}
477
478func (sc *syscaller) resetAmbient() error {
479	var v bool
480	var err error
481
482	for c := Value(0); !v; c++ {
483		if v, err = GetAmbient(c); err != nil {
484			// no non-zero values found.
485			return nil
486		}
487	}
488	_, err = sc.prctlwcall6(prCapAmbient, prCapAmbientClearAll, 0, 0, 0, 0)
489	return err
490}
491
492// ResetAmbient attempts to ensure the Ambient set is fully
493// cleared. It works by first reading the set and if it finds any bits
494// raised it will attempt a reset. The test before attempting a reset
495// behavior is a workaround for situations where the Ambient API is
496// locked, but a reset is not actually needed. No Ambient bit not
497// already raised in both the Permitted and Inheritable Set is allowed
498// to be raised by the kernel.
499func ResetAmbient() error {
500	state, sc := scwStateSC()
501	defer scwSetState(launchBlocked, state, -1)
502	return sc.resetAmbient()
503}
504