// Package cap provides all the Linux Capabilities userspace library API
// bindings in native Go.
//
// Capabilities are a feature of the Linux kernel that allows fine
// grain permissions to perform privileged operations. Privileged
// operations are required to do irregular system level operations
// from code. You can read more about how Capabilities are intended to
// work here:
//
//   https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/33528.pdf
//
// This package supports native Go bindings for all the features
// described in that paper as well as supporting subsequent changes to
// the kernel for other styles of inheritable Capability.
//
// Some simple things you can do with this package are:
//
//	// Read and display the capabilities of the running process
//	c := cap.GetProc()
//	log.Printf("this process has these caps: %q", c)
//
//	// Drop any privilege a process might have (including for root,
//	// but note root 'owns' a lot of system files so a cap-limited
//	// root can still do considerable damage to a running system).
//	old := cap.GetProc()
//	empty := cap.NewSet()
//	if err := empty.SetProc(); err != nil {
//		log.Fatalf("failed to drop privilege: %q -> %q: %v", old, empty, err)
//	}
//	now := cap.GetProc()
//	if cf, _ := now.Cf(empty); cf != 0 {
//		log.Fatalf("failed to fully drop privilege: have=%q, wanted=%q", now, empty)
//	}
//
// The "cap" package operates with POSIX semantics for security
// state. That is all OS threads are kept in sync at all times. The
// package "kernel.org/pub/linux/libs/security/libcap/psx" is used to
// implement POSIX semantics system calls that manipulate thread state
// uniformly over the whole Go (and any CGo linked) process runtime.
//
// Note, if the Go runtime syscall interface contains the Linux
// variant syscall.AllThreadsSyscall() API (it debuted in go1.16 see
// https://github.com/golang/go/issues/1435 for its history) then the
// "libcap/psx" package will use that to invoke Capability setting
// system calls in pure Go binaries. With such an enhanced Go runtime,
// to force this behavior, use the CGO_ENABLED=0 environment variable.
//
// POSIX semantics are more secure than trying to manage privilege at
// a thread level when those threads share a common memory image as
// they do under Linux: it is trivial to exploit a vulnerability in
// one thread of a process to cause execution on any other
// thread. So, any imbalance in security state, in such cases will
// readily create an opportunity for a privilege escalation
// vulnerability.
//
// POSIX semantics also work well with Go, which deliberately tries to
// insulate the user from worrying about the number of OS threads that
// are actually running in their program. Indeed, Go can efficiently
// launch and manage tens of thousands of concurrent goroutines
// without bogging the program or wider system down. It does this by
// aggressively migrating idle threads to make progress on unblocked
// goroutines. So, inconsistent security state across OS threads can
// also lead to program misbehavior.
//
// The only exception to this process-wide common security state is
// the cap.Launcher related functionality. This briefly locks an OS
// thread to a goroutine in order to launch another executable - the
// robust implementation of this kind of support is quite subtle, so
// please read its documentation carefully, if you find that you need
// it.
//
// See https://sites.google.com/site/fullycapable/ for recent updates,
// some more complete walk-through examples of ways of using
// 'cap.Set's etc and information on how to file bugs.
75// 76// Copyright (c) 2019-21 Andrew G. Morgan <[email protected]> 77// 78// The cap and psx packages are licensed with a (you choose) BSD 79// 3-clause or GPL2. See LICENSE file for details. 80package cap // import "kernel.org/pub/linux/libs/security/libcap/cap" 81 82import ( 83 "errors" 84 "sort" 85 "sync" 86 "syscall" 87 "unsafe" 88) 89 90// Value is the type of a single capability (or permission) bit. 91type Value uint 92 93// Flag is the type of one of the three Value dimensions held in a 94// Set. It is also used in the (*IAB).Fill() method for changing the 95// Bounding and Ambient Vectors. 96type Flag uint 97 98// Effective, Permitted, Inheritable are the three Flags of Values 99// held in a Set. 100const ( 101 Effective Flag = iota 102 Permitted 103 Inheritable 104) 105 106// Diff summarizes the result of the (*Set).Cf() function. 107type Diff uint 108 109const ( 110 effectiveDiff Diff = 1 << Effective 111 permittedDiff Diff = 1 << Permitted 112 inheritableDiff Diff = 1 << Inheritable 113) 114 115// String identifies a Flag value by its conventional "e", "p" or "i" 116// string abbreviation. 117func (f Flag) String() string { 118 switch f { 119 case Effective: 120 return "e" 121 case Permitted: 122 return "p" 123 case Inheritable: 124 return "i" 125 default: 126 return "<Error>" 127 } 128} 129 130// data holds a 32-bit slice of the compressed bitmaps of capability 131// sets as understood by the kernel. 132type data [Inheritable + 1]uint32 133 134// Set is an opaque capabilities container for a set of system 135// capbilities. It holds individually addressable capability Value's 136// for the three capability Flag's. See GetFlag() and SetFlag() for 137// how to adjust them individually, and Clear() and ClearFlag() for 138// how to do bulk operations. 139// 140// For admin tasks associated with managing namespace specific file 141// capabilities, Set can also support a namespace-root-UID value which 142// defaults to zero. 
See GetNSOwner() and SetNSOwner(). 143type Set struct { 144 // mu protects all other members of a Set. 145 mu sync.RWMutex 146 147 // flat holds Flag Value bitmaps for all capabilities 148 // associated with this Set. 149 flat []data 150 151 // Linux specific 152 nsRoot int 153} 154 155// Various known kernel magic values. 156const ( 157 kv1 = 0x19980330 // First iteration of process capabilities (32 bits). 158 kv2 = 0x20071026 // First iteration of process and file capabilities (64 bits) - deprecated. 159 kv3 = 0x20080522 // Most recently supported process and file capabilities (64 bits). 160) 161 162var ( 163 // startUp protects setting of the following values: magic, 164 // words, maxValues. 165 startUp sync.Once 166 167 // magic holds the preferred magic number for the kernel ABI. 168 magic uint32 169 170 // words holds the number of uint32's associated with each 171 // capability Flag for this session. 172 words int 173 174 // maxValues holds the number of bit values that are named by 175 // the running kernel. This is generally expected to match 176 // ValueCount which is autogenerated at packaging time. 177 maxValues uint 178) 179 180type header struct { 181 magic uint32 182 pid int32 183} 184 185// syscaller is a type for abstracting syscalls. The r* variants are 186// for reading state, and can be parallelized, the w* variants need to 187// be serialized so all OS threads can share state. 188type syscaller struct { 189 r3 func(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno) 190 w3 func(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno) 191 r6 func(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err syscall.Errno) 192 w6 func(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err syscall.Errno) 193} 194 195// caprcall provides a pointer etc wrapper for the system calls 196// associated with getcap. 
197//go:uintptrescapes 198func (sc *syscaller) caprcall(call uintptr, h *header, d []data) error { 199 x := uintptr(0) 200 if d != nil { 201 x = uintptr(unsafe.Pointer(&d[0])) 202 } 203 _, _, err := sc.r3(call, uintptr(unsafe.Pointer(h)), x, 0) 204 if err != 0 { 205 return err 206 } 207 return nil 208} 209 210// capwcall provides a pointer etc wrapper for the system calls 211// associated with setcap. 212//go:uintptrescapes 213func (sc *syscaller) capwcall(call uintptr, h *header, d []data) error { 214 x := uintptr(0) 215 if d != nil { 216 x = uintptr(unsafe.Pointer(&d[0])) 217 } 218 _, _, err := sc.w3(call, uintptr(unsafe.Pointer(h)), x, 0) 219 if err != 0 { 220 return err 221 } 222 return nil 223} 224 225// prctlrcall provides a wrapper for the prctl systemcalls that only 226// read kernel state. There is a limited number of arguments needed 227// and the caller should use 0 for those not needed. 228func (sc *syscaller) prctlrcall(prVal, v1, v2 uintptr) (int, error) { 229 r, _, err := sc.r3(syscall.SYS_PRCTL, prVal, v1, v2) 230 if err != 0 { 231 return int(r), err 232 } 233 return int(r), nil 234} 235 236// prctlrcall6 provides a wrapper for the prctl systemcalls that only 237// read kernel state and require 6 arguments - ambient cap API, I'm 238// looking at you. There is a limited number of arguments needed and 239// the caller should use 0 for those not needed. 240func (sc *syscaller) prctlrcall6(prVal, v1, v2, v3, v4, v5 uintptr) (int, error) { 241 r, _, err := sc.r6(syscall.SYS_PRCTL, prVal, v1, v2, v3, v4, v5) 242 if err != 0 { 243 return int(r), err 244 } 245 return int(r), nil 246} 247 248// prctlwcall provides a wrapper for the prctl systemcalls that 249// write/modify kernel state. Where available, these will use the 250// POSIX semantics fixup system calls. There is a limited number of 251// arguments needed and the caller should use 0 for those not needed. 
252func (sc *syscaller) prctlwcall(prVal, v1, v2 uintptr) (int, error) { 253 r, _, err := sc.w3(syscall.SYS_PRCTL, prVal, v1, v2) 254 if err != 0 { 255 return int(r), err 256 } 257 return int(r), nil 258} 259 260// prctlwcall6 provides a wrapper for the prctl systemcalls that 261// write/modify kernel state and require 6 arguments - ambient cap 262// API, I'm looking at you. (Where available, these will use the POSIX 263// semantics fixup system calls). There is a limited number of 264// arguments needed and the caller should use 0 for those not needed. 265func (sc *syscaller) prctlwcall6(prVal, v1, v2, v3, v4, v5 uintptr) (int, error) { 266 r, _, err := sc.w6(syscall.SYS_PRCTL, prVal, v1, v2, v3, v4, v5) 267 if err != 0 { 268 return int(r), err 269 } 270 return int(r), nil 271} 272 273// cInit performs the lazy identification of the capability vintage of 274// the running system. 275func (sc *syscaller) cInit() { 276 h := &header{ 277 magic: kv3, 278 } 279 sc.caprcall(syscall.SYS_CAPGET, h, nil) 280 magic = h.magic 281 switch magic { 282 case kv1: 283 words = 1 284 case kv2, kv3: 285 words = 2 286 default: 287 // Fall back to a known good version. 288 magic = kv3 289 words = 2 290 } 291 // Use the bounding set to evaluate which capabilities exist. 292 maxValues = uint(sort.Search(32*words, func(n int) bool { 293 _, err := GetBound(Value(n)) 294 return err != nil 295 })) 296 if maxValues == 0 { 297 // Fall back to using the largest value defined at build time. 298 maxValues = NamedCount 299 } 300} 301 302// MaxBits returns the number of kernel-named capabilities discovered 303// at runtime in the current system. 304func MaxBits() Value { 305 startUp.Do(multisc.cInit) 306 return Value(maxValues) 307} 308 309// NewSet returns an empty capability set. 
310func NewSet() *Set { 311 startUp.Do(multisc.cInit) 312 return &Set{ 313 flat: make([]data, words), 314 } 315} 316 317// ErrBadSet indicates a nil pointer was used for a *Set, or the 318// request of the Set is invalid in some way. 319var ErrBadSet = errors.New("bad capability set") 320 321// good confirms that c looks valid. 322func (c *Set) good() error { 323 if c == nil || len(c.flat) == 0 { 324 return ErrBadSet 325 } 326 return nil 327} 328 329// Dup returns a copy of the specified capability set. 330func (c *Set) Dup() (*Set, error) { 331 if err := c.good(); err != nil { 332 return nil, err 333 } 334 n := NewSet() 335 c.mu.RLock() 336 defer c.mu.RUnlock() 337 copy(n.flat, c.flat) 338 n.nsRoot = c.nsRoot 339 return n, nil 340} 341 342// GetPID returns the capability set associated with the target process 343// id; pid=0 is an alias for current. 344func GetPID(pid int) (*Set, error) { 345 v := NewSet() 346 if err := multisc.caprcall(syscall.SYS_CAPGET, &header{magic: magic, pid: int32(pid)}, v.flat); err != nil { 347 return nil, err 348 } 349 return v, nil 350} 351 352// GetProc returns the capability Set of the current process. If the 353// kernel is unable to determine the Set associated with the current 354// process, the function panic()s. 355func GetProc() *Set { 356 c, err := GetPID(0) 357 if err != nil { 358 panic(err) 359 } 360 return c 361} 362 363// setProc uses syscaller to set process capabilities. Note, c is 364// either private to or (read) locked by the caller. 365func (sc *syscaller) setProc(c *Set) error { 366 return sc.capwcall(syscall.SYS_CAPSET, &header{magic: magic}, c.flat) 367} 368 369// SetProc attempts to set the capability Set of the current 370// process. The kernel will perform permission checks and an error 371// will be returned if the attempt fails. Should the attempt fail 372// no process capabilities will have been modified. 373// 374// Note, the general behavior of this call is to set the 375// process-shared capabilities. 
However, when called from a callback 376// function as part of a (*Launcher).Launch(), the call only sets the 377// capabilities of the thread being used to perform the launch. 378func (c *Set) SetProc() error { 379 if err := c.good(); err != nil { 380 return err 381 } 382 state, sc := scwStateSC() 383 defer scwSetState(launchBlocked, state, -1) 384 c.mu.RLock() 385 defer c.mu.RUnlock() 386 return sc.setProc(c) 387} 388 389// defines from uapi/linux/prctl.h 390const ( 391 prCapBSetRead = 23 392 prCapBSetDrop = 24 393) 394 395// GetBound determines if a specific capability is currently part of 396// the local bounding set. On systems where the bounding set Value is 397// not present, this function returns an error. 398func GetBound(val Value) (bool, error) { 399 v, err := multisc.prctlrcall(prCapBSetRead, uintptr(val), 0) 400 if err != nil { 401 return false, err 402 } 403 return v > 0, nil 404} 405 406//go:uintptrescapes 407func (sc *syscaller) dropBound(val ...Value) error { 408 for _, v := range val { 409 if _, err := sc.prctlwcall(prCapBSetDrop, uintptr(v), 0); err != nil { 410 return err 411 } 412 } 413 return nil 414} 415 416// DropBound attempts to suppress bounding set Values. The kernel will 417// never allow a bounding set Value bit to be raised once successfully 418// dropped. However, dropping requires the current process is 419// sufficiently capable (usually via cap.SETPCAP being raised in the 420// Effective flag of the process' Set). Note, the drops are performed 421// in order and if one bounding value cannot be dropped, the function 422// returns immediately with an error which may leave the system in an 423// ill-defined state. The caller can determine where things went wrong 424// using GetBound(). 425func DropBound(val ...Value) error { 426 state, sc := scwStateSC() 427 defer scwSetState(launchBlocked, state, -1) 428 return sc.dropBound(val...) 
429} 430 431// defines from uapi/linux/prctl.h 432const ( 433 prCapAmbient = 47 434 435 prCapAmbientIsSet = 1 436 prCapAmbientRaise = 2 437 prCapAmbientLower = 3 438 prCapAmbientClearAll = 4 439) 440 441// GetAmbient determines if a specific capability is currently part of 442// the local ambient set. On systems where the ambient set Value is 443// not present, this function returns an error. 444func GetAmbient(val Value) (bool, error) { 445 r, err := multisc.prctlrcall6(prCapAmbient, prCapAmbientIsSet, uintptr(val), 0, 0, 0) 446 return r > 0, err 447} 448 449//go:uintptrescapes 450func (sc *syscaller) setAmbient(enable bool, val ...Value) error { 451 dir := uintptr(prCapAmbientLower) 452 if enable { 453 dir = prCapAmbientRaise 454 } 455 for _, v := range val { 456 _, err := sc.prctlwcall6(prCapAmbient, dir, uintptr(v), 0, 0, 0) 457 if err != nil { 458 return err 459 } 460 } 461 return nil 462} 463 464// SetAmbient attempts to set a specific Value bit to the state, 465// enable. This function will return an error if insufficient 466// permission is available to perform this task. The settings are 467// performed in order and the function returns immediately an error is 468// detected. Use GetAmbient() to unravel where things went 469// wrong. Note, the cap package manages an abstraction IAB that 470// captures all three inheritable vectors in a single type. Consider 471// using that. 472func SetAmbient(enable bool, val ...Value) error { 473 state, sc := scwStateSC() 474 defer scwSetState(launchBlocked, state, -1) 475 return sc.setAmbient(enable, val...) 476} 477 478func (sc *syscaller) resetAmbient() error { 479 var v bool 480 var err error 481 482 for c := Value(0); !v; c++ { 483 if v, err = GetAmbient(c); err != nil { 484 // no non-zero values found. 485 return nil 486 } 487 } 488 _, err = sc.prctlwcall6(prCapAmbient, prCapAmbientClearAll, 0, 0, 0, 0) 489 return err 490} 491 492// ResetAmbient attempts to ensure the Ambient set is fully 493// cleared. 
It works by first reading the set and if it finds any bits 494// raised it will attempt a reset. The test before attempting a reset 495// behavior is a workaround for situations where the Ambient API is 496// locked, but a reset is not actually needed. No Ambient bit not 497// already raised in both the Permitted and Inheritable Set is allowed 498// to be raised by the kernel. 499func ResetAmbient() error { 500 state, sc := scwStateSC() 501 defer scwSetState(launchBlocked, state, -1) 502 return sc.resetAmbient() 503} 504