1// Copyright 2009 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5// Semaphore implementation exposed to Go. 6// Intended use is provide a sleep and wakeup 7// primitive that can be used in the contended case 8// of other synchronization primitives. 9// Thus it targets the same goal as Linux's futex, 10// but it has much simpler semantics. 11// 12// That is, don't think of these as semaphores. 13// Think of them as a way to implement sleep and wakeup 14// such that every sleep is paired with a single wakeup, 15// even if, due to races, the wakeup happens before the sleep. 16// 17// See Mullender and Cox, ``Semaphores in Plan 9,'' 18// https://swtch.com/semaphore.pdf 19 20package runtime 21 22import ( 23 "internal/cpu" 24 "internal/runtime/atomic" 25 "unsafe" 26) 27 28// Asynchronous semaphore for sync.Mutex. 29 30// A semaRoot holds a balanced tree of sudog with distinct addresses (s.elem). 31// Each of those sudog may in turn point (through s.waitlink) to a list 32// of other sudogs waiting on the same address. 33// The operations on the inner lists of sudogs with the same address 34// are all O(1). The scanning of the top-level semaRoot list is O(log n), 35// where n is the number of distinct addresses with goroutines blocked 36// on them that hash to the given semaRoot. 37// See golang.org/issue/17953 for a program that worked badly 38// before we introduced the second level of list, and 39// BenchmarkSemTable/OneAddrCollision/* for a benchmark that exercises this. 40type semaRoot struct { 41 lock mutex 42 treap *sudog // root of balanced tree of unique waiters. 43 nwait atomic.Uint32 // Number of waiters. Read w/o the lock. 44} 45 46var semtable semTable 47 48// Prime to not correlate with any user patterns. 49const semTabSize = 251 50 51type semTable [semTabSize]struct { 52 root semaRoot 53 pad [cpu.CacheLinePadSize - unsafe.Sizeof(semaRoot{})]byte 54} 55 56func (t *semTable) rootFor(addr *uint32) *semaRoot { 57 return &t[(uintptr(unsafe.Pointer(addr))>>3)%semTabSize].root 58} 59 60// sync_runtime_Semacquire should be an internal detail, 61// but widely used packages access it using linkname. 62// Notable members of the hall of shame include: 63// - gvisor.dev/gvisor 64// - github.com/sagernet/gvisor 65// 66// Do not remove or change the type signature. 67// See go.dev/issue/67401. 68// 69//go:linkname sync_runtime_Semacquire sync.runtime_Semacquire 70func sync_runtime_Semacquire(addr *uint32) { 71 semacquire1(addr, false, semaBlockProfile, 0, waitReasonSemacquire) 72} 73 74//go:linkname poll_runtime_Semacquire internal/poll.runtime_Semacquire 75func poll_runtime_Semacquire(addr *uint32) { 76 semacquire1(addr, false, semaBlockProfile, 0, waitReasonSemacquire) 77} 78 79// sync_runtime_Semrelease should be an internal detail, 80// but widely used packages access it using linkname. 81// Notable members of the hall of shame include: 82// - gvisor.dev/gvisor 83// - github.com/sagernet/gvisor 84// 85// Do not remove or change the type signature. 86// See go.dev/issue/67401. 87// 88//go:linkname sync_runtime_Semrelease sync.runtime_Semrelease 89func sync_runtime_Semrelease(addr *uint32, handoff bool, skipframes int) { 90 semrelease1(addr, handoff, skipframes) 91} 92 93//go:linkname sync_runtime_SemacquireMutex sync.runtime_SemacquireMutex 94func sync_runtime_SemacquireMutex(addr *uint32, lifo bool, skipframes int) { 95 semacquire1(addr, lifo, semaBlockProfile|semaMutexProfile, skipframes, waitReasonSyncMutexLock) 96} 97 98//go:linkname sync_runtime_SemacquireRWMutexR sync.runtime_SemacquireRWMutexR 99func sync_runtime_SemacquireRWMutexR(addr *uint32, lifo bool, skipframes int) { 100 semacquire1(addr, lifo, semaBlockProfile|semaMutexProfile, skipframes, waitReasonSyncRWMutexRLock) 101} 102 103//go:linkname sync_runtime_SemacquireRWMutex sync.runtime_SemacquireRWMutex 104func sync_runtime_SemacquireRWMutex(addr *uint32, lifo bool, skipframes int) { 105 semacquire1(addr, lifo, semaBlockProfile|semaMutexProfile, skipframes, waitReasonSyncRWMutexLock) 106} 107 108//go:linkname poll_runtime_Semrelease internal/poll.runtime_Semrelease 109func poll_runtime_Semrelease(addr *uint32) { 110 semrelease(addr) 111} 112 113func readyWithTime(s *sudog, traceskip int) { 114 if s.releasetime != 0 { 115 s.releasetime = cputicks() 116 } 117 goready(s.g, traceskip) 118} 119 120type semaProfileFlags int 121 122const ( 123 semaBlockProfile semaProfileFlags = 1 << iota 124 semaMutexProfile 125) 126 127// Called from runtime. 128func semacquire(addr *uint32) { 129 semacquire1(addr, false, 0, 0, waitReasonSemacquire) 130} 131 132func semacquire1(addr *uint32, lifo bool, profile semaProfileFlags, skipframes int, reason waitReason) { 133 gp := getg() 134 if gp != gp.m.curg { 135 throw("semacquire not on the G stack") 136 } 137 138 // Easy case. 139 if cansemacquire(addr) { 140 return 141 } 142 143 // Harder case: 144 // increment waiter count 145 // try cansemacquire one more time, return if succeeded 146 // enqueue itself as a waiter 147 // sleep 148 // (waiter descriptor is dequeued by signaler) 149 s := acquireSudog() 150 root := semtable.rootFor(addr) 151 t0 := int64(0) 152 s.releasetime = 0 153 s.acquiretime = 0 154 s.ticket = 0 155 if profile&semaBlockProfile != 0 && blockprofilerate > 0 { 156 t0 = cputicks() 157 s.releasetime = -1 158 } 159 if profile&semaMutexProfile != 0 && mutexprofilerate > 0 { 160 if t0 == 0 { 161 t0 = cputicks() 162 } 163 s.acquiretime = t0 164 } 165 for { 166 lockWithRank(&root.lock, lockRankRoot) 167 // Add ourselves to nwait to disable "easy case" in semrelease. 168 root.nwait.Add(1) 169 // Check cansemacquire to avoid missed wakeup. 170 if cansemacquire(addr) { 171 root.nwait.Add(-1) 172 unlock(&root.lock) 173 break 174 } 175 // Any semrelease after the cansemacquire knows we're waiting 176 // (we set nwait above), so go to sleep. 177 root.queue(addr, s, lifo) 178 goparkunlock(&root.lock, reason, traceBlockSync, 4+skipframes) 179 if s.ticket != 0 || cansemacquire(addr) { 180 break 181 } 182 } 183 if s.releasetime > 0 { 184 blockevent(s.releasetime-t0, 3+skipframes) 185 } 186 releaseSudog(s) 187} 188 189func semrelease(addr *uint32) { 190 semrelease1(addr, false, 0) 191} 192 193func semrelease1(addr *uint32, handoff bool, skipframes int) { 194 root := semtable.rootFor(addr) 195 atomic.Xadd(addr, 1) 196 197 // Easy case: no waiters? 198 // This check must happen after the xadd, to avoid a missed wakeup 199 // (see loop in semacquire). 200 if root.nwait.Load() == 0 { 201 return 202 } 203 204 // Harder case: search for a waiter and wake it. 205 lockWithRank(&root.lock, lockRankRoot) 206 if root.nwait.Load() == 0 { 207 // The count is already consumed by another goroutine, 208 // so no need to wake up another goroutine. 209 unlock(&root.lock) 210 return 211 } 212 s, t0, tailtime := root.dequeue(addr) 213 if s != nil { 214 root.nwait.Add(-1) 215 } 216 unlock(&root.lock) 217 if s != nil { // May be slow or even yield, so unlock first 218 acquiretime := s.acquiretime 219 if acquiretime != 0 { 220 // Charge contention that this (delayed) unlock caused. 221 // If there are N more goroutines waiting beyond the 222 // one that's waking up, charge their delay as well, so that 223 // contention holding up many goroutines shows up as 224 // more costly than contention holding up a single goroutine. 225 // It would take O(N) time to calculate how long each goroutine 226 // has been waiting, so instead we charge avg(head-wait, tail-wait)*N. 227 // head-wait is the longest wait and tail-wait is the shortest. 228 // (When we do a lifo insertion, we preserve this property by 229 // copying the old head's acquiretime into the inserted new head. 230 // In that case the overall average may be slightly high, but that's fine: 231 // the average of the ends is only an approximation to the actual 232 // average anyway.) 233 // The root.dequeue above changed the head and tail acquiretime 234 // to the current time, so the next unlock will not re-count this contention. 235 dt0 := t0 - acquiretime 236 dt := dt0 237 if s.waiters != 0 { 238 dtail := t0 - tailtime 239 dt += (dtail + dt0) / 2 * int64(s.waiters) 240 } 241 mutexevent(dt, 3+skipframes) 242 } 243 if s.ticket != 0 { 244 throw("corrupted semaphore ticket") 245 } 246 if handoff && cansemacquire(addr) { 247 s.ticket = 1 248 } 249 readyWithTime(s, 5+skipframes) 250 if s.ticket == 1 && getg().m.locks == 0 { 251 // Direct G handoff 252 // readyWithTime has added the waiter G as runnext in the 253 // current P; we now call the scheduler so that we start running 254 // the waiter G immediately. 255 // Note that waiter inherits our time slice: this is desirable 256 // to avoid having a highly contended semaphore hog the P 257 // indefinitely. goyield is like Gosched, but it emits a 258 // "preempted" trace event instead and, more importantly, puts 259 // the current G on the local runq instead of the global one. 260 // We only do this in the starving regime (handoff=true), as in 261 // the non-starving case it is possible for a different waiter 262 // to acquire the semaphore while we are yielding/scheduling, 263 // and this would be wasteful. We wait instead to enter starving 264 // regime, and then we start to do direct handoffs of ticket and 265 // P. 266 // See issue 33747 for discussion. 267 goyield() 268 } 269 } 270} 271 272func cansemacquire(addr *uint32) bool { 273 for { 274 v := atomic.Load(addr) 275 if v == 0 { 276 return false 277 } 278 if atomic.Cas(addr, v, v-1) { 279 return true 280 } 281 } 282} 283 284// queue adds s to the blocked goroutines in semaRoot. 285func (root *semaRoot) queue(addr *uint32, s *sudog, lifo bool) { 286 s.g = getg() 287 s.elem = unsafe.Pointer(addr) 288 s.next = nil 289 s.prev = nil 290 s.waiters = 0 291 292 var last *sudog 293 pt := &root.treap 294 for t := *pt; t != nil; t = *pt { 295 if t.elem == unsafe.Pointer(addr) { 296 // Already have addr in list. 297 if lifo { 298 // Substitute s in t's place in treap. 299 *pt = s 300 s.ticket = t.ticket 301 s.acquiretime = t.acquiretime // preserve head acquiretime as oldest time 302 s.parent = t.parent 303 s.prev = t.prev 304 s.next = t.next 305 if s.prev != nil { 306 s.prev.parent = s 307 } 308 if s.next != nil { 309 s.next.parent = s 310 } 311 // Add t first in s's wait list. 312 s.waitlink = t 313 s.waittail = t.waittail 314 if s.waittail == nil { 315 s.waittail = t 316 } 317 s.waiters = t.waiters 318 if s.waiters+1 != 0 { 319 s.waiters++ 320 } 321 t.parent = nil 322 t.prev = nil 323 t.next = nil 324 t.waittail = nil 325 } else { 326 // Add s to end of t's wait list. 327 if t.waittail == nil { 328 t.waitlink = s 329 } else { 330 t.waittail.waitlink = s 331 } 332 t.waittail = s 333 s.waitlink = nil 334 if t.waiters+1 != 0 { 335 t.waiters++ 336 } 337 } 338 return 339 } 340 last = t 341 if uintptr(unsafe.Pointer(addr)) < uintptr(t.elem) { 342 pt = &t.prev 343 } else { 344 pt = &t.next 345 } 346 } 347 348 // Add s as new leaf in tree of unique addrs. 349 // The balanced tree is a treap using ticket as the random heap priority. 350 // That is, it is a binary tree ordered according to the elem addresses, 351 // but then among the space of possible binary trees respecting those 352 // addresses, it is kept balanced on average by maintaining a heap ordering 353 // on the ticket: s.ticket <= both s.prev.ticket and s.next.ticket. 354 // https://en.wikipedia.org/wiki/Treap 355 // https://faculty.washington.edu/aragon/pubs/rst89.pdf 356 // 357 // s.ticket compared with zero in couple of places, therefore set lowest bit. 358 // It will not affect treap's quality noticeably. 359 s.ticket = cheaprand() | 1 360 s.parent = last 361 *pt = s 362 363 // Rotate up into tree according to ticket (priority). 364 for s.parent != nil && s.parent.ticket > s.ticket { 365 if s.parent.prev == s { 366 root.rotateRight(s.parent) 367 } else { 368 if s.parent.next != s { 369 panic("semaRoot queue") 370 } 371 root.rotateLeft(s.parent) 372 } 373 } 374} 375 376// dequeue searches for and finds the first goroutine 377// in semaRoot blocked on addr. 378// If the sudog was being profiled, dequeue returns the time 379// at which it was woken up as now. Otherwise now is 0. 380// If there are additional entries in the wait list, dequeue 381// returns tailtime set to the last entry's acquiretime. 382// Otherwise tailtime is found.acquiretime. 383func (root *semaRoot) dequeue(addr *uint32) (found *sudog, now, tailtime int64) { 384 ps := &root.treap 385 s := *ps 386 for ; s != nil; s = *ps { 387 if s.elem == unsafe.Pointer(addr) { 388 goto Found 389 } 390 if uintptr(unsafe.Pointer(addr)) < uintptr(s.elem) { 391 ps = &s.prev 392 } else { 393 ps = &s.next 394 } 395 } 396 return nil, 0, 0 397 398Found: 399 now = int64(0) 400 if s.acquiretime != 0 { 401 now = cputicks() 402 } 403 if t := s.waitlink; t != nil { 404 // Substitute t, also waiting on addr, for s in root tree of unique addrs. 405 *ps = t 406 t.ticket = s.ticket 407 t.parent = s.parent 408 t.prev = s.prev 409 if t.prev != nil { 410 t.prev.parent = t 411 } 412 t.next = s.next 413 if t.next != nil { 414 t.next.parent = t 415 } 416 if t.waitlink != nil { 417 t.waittail = s.waittail 418 } else { 419 t.waittail = nil 420 } 421 t.waiters = s.waiters 422 if t.waiters > 1 { 423 t.waiters-- 424 } 425 // Set head and tail acquire time to 'now', 426 // because the caller will take care of charging 427 // the delays before now for all entries in the list. 428 t.acquiretime = now 429 tailtime = s.waittail.acquiretime 430 s.waittail.acquiretime = now 431 s.waitlink = nil 432 s.waittail = nil 433 } else { 434 // Rotate s down to be leaf of tree for removal, respecting priorities. 435 for s.next != nil || s.prev != nil { 436 if s.next == nil || s.prev != nil && s.prev.ticket < s.next.ticket { 437 root.rotateRight(s) 438 } else { 439 root.rotateLeft(s) 440 } 441 } 442 // Remove s, now a leaf. 443 if s.parent != nil { 444 if s.parent.prev == s { 445 s.parent.prev = nil 446 } else { 447 s.parent.next = nil 448 } 449 } else { 450 root.treap = nil 451 } 452 tailtime = s.acquiretime 453 } 454 s.parent = nil 455 s.elem = nil 456 s.next = nil 457 s.prev = nil 458 s.ticket = 0 459 return s, now, tailtime 460} 461 462// rotateLeft rotates the tree rooted at node x. 463// turning (x a (y b c)) into (y (x a b) c). 464func (root *semaRoot) rotateLeft(x *sudog) { 465 // p -> (x a (y b c)) 466 p := x.parent 467 y := x.next 468 b := y.prev 469 470 y.prev = x 471 x.parent = y 472 x.next = b 473 if b != nil { 474 b.parent = x 475 } 476 477 y.parent = p 478 if p == nil { 479 root.treap = y 480 } else if p.prev == x { 481 p.prev = y 482 } else { 483 if p.next != x { 484 throw("semaRoot rotateLeft") 485 } 486 p.next = y 487 } 488} 489 490// rotateRight rotates the tree rooted at node y. 491// turning (y (x a b) c) into (x a (y b c)). 492func (root *semaRoot) rotateRight(y *sudog) { 493 // p -> (y (x a b) c) 494 p := y.parent 495 x := y.prev 496 b := x.next 497 498 x.next = y 499 y.parent = x 500 y.prev = b 501 if b != nil { 502 b.parent = y 503 } 504 505 x.parent = p 506 if p == nil { 507 root.treap = x 508 } else if p.prev == y { 509 p.prev = x 510 } else { 511 if p.next != y { 512 throw("semaRoot rotateRight") 513 } 514 p.next = x 515 } 516} 517 518// notifyList is a ticket-based notification list used to implement sync.Cond. 519// 520// It must be kept in sync with the sync package. 521type notifyList struct { 522 // wait is the ticket number of the next waiter. It is atomically 523 // incremented outside the lock. 524 wait atomic.Uint32 525 526 // notify is the ticket number of the next waiter to be notified. It can 527 // be read outside the lock, but is only written to with lock held. 528 // 529 // Both wait & notify can wrap around, and such cases will be correctly 530 // handled as long as their "unwrapped" difference is bounded by 2^31. 531 // For this not to be the case, we'd need to have 2^31+ goroutines 532 // blocked on the same condvar, which is currently not possible. 533 notify uint32 534 535 // List of parked waiters. 536 lock mutex 537 head *sudog 538 tail *sudog 539} 540 541// less checks if a < b, considering a & b running counts that may overflow the 542// 32-bit range, and that their "unwrapped" difference is always less than 2^31. 543func less(a, b uint32) bool { 544 return int32(a-b) < 0 545} 546 547// notifyListAdd adds the caller to a notify list such that it can receive 548// notifications. The caller must eventually call notifyListWait to wait for 549// such a notification, passing the returned ticket number. 550// 551//go:linkname notifyListAdd sync.runtime_notifyListAdd 552func notifyListAdd(l *notifyList) uint32 { 553 // This may be called concurrently, for example, when called from 554 // sync.Cond.Wait while holding a RWMutex in read mode. 555 return l.wait.Add(1) - 1 556} 557 558// notifyListWait waits for a notification. If one has been sent since 559// notifyListAdd was called, it returns immediately. Otherwise, it blocks. 560// 561//go:linkname notifyListWait sync.runtime_notifyListWait 562func notifyListWait(l *notifyList, t uint32) { 563 lockWithRank(&l.lock, lockRankNotifyList) 564 565 // Return right away if this ticket has already been notified. 566 if less(t, l.notify) { 567 unlock(&l.lock) 568 return 569 } 570 571 // Enqueue itself. 572 s := acquireSudog() 573 s.g = getg() 574 s.ticket = t 575 s.releasetime = 0 576 t0 := int64(0) 577 if blockprofilerate > 0 { 578 t0 = cputicks() 579 s.releasetime = -1 580 } 581 if l.tail == nil { 582 l.head = s 583 } else { 584 l.tail.next = s 585 } 586 l.tail = s 587 goparkunlock(&l.lock, waitReasonSyncCondWait, traceBlockCondWait, 3) 588 if t0 != 0 { 589 blockevent(s.releasetime-t0, 2) 590 } 591 releaseSudog(s) 592} 593 594// notifyListNotifyAll notifies all entries in the list. 595// 596//go:linkname notifyListNotifyAll sync.runtime_notifyListNotifyAll 597func notifyListNotifyAll(l *notifyList) { 598 // Fast-path: if there are no new waiters since the last notification 599 // we don't need to acquire the lock. 600 if l.wait.Load() == atomic.Load(&l.notify) { 601 return 602 } 603 604 // Pull the list out into a local variable, waiters will be readied 605 // outside the lock. 606 lockWithRank(&l.lock, lockRankNotifyList) 607 s := l.head 608 l.head = nil 609 l.tail = nil 610 611 // Update the next ticket to be notified. We can set it to the current 612 // value of wait because any previous waiters are already in the list 613 // or will notice that they have already been notified when trying to 614 // add themselves to the list. 615 atomic.Store(&l.notify, l.wait.Load()) 616 unlock(&l.lock) 617 618 // Go through the local list and ready all waiters. 619 for s != nil { 620 next := s.next 621 s.next = nil 622 readyWithTime(s, 4) 623 s = next 624 } 625} 626 627// notifyListNotifyOne notifies one entry in the list. 628// 629//go:linkname notifyListNotifyOne sync.runtime_notifyListNotifyOne 630func notifyListNotifyOne(l *notifyList) { 631 // Fast-path: if there are no new waiters since the last notification 632 // we don't need to acquire the lock at all. 633 if l.wait.Load() == atomic.Load(&l.notify) { 634 return 635 } 636 637 lockWithRank(&l.lock, lockRankNotifyList) 638 639 // Re-check under the lock if we need to do anything. 640 t := l.notify 641 if t == l.wait.Load() { 642 unlock(&l.lock) 643 return 644 } 645 646 // Update the next notify ticket number. 647 atomic.Store(&l.notify, t+1) 648 649 // Try to find the g that needs to be notified. 650 // If it hasn't made it to the list yet we won't find it, 651 // but it won't park itself once it sees the new notify number. 652 // 653 // This scan looks linear but essentially always stops quickly. 654 // Because g's queue separately from taking numbers, 655 // there may be minor reorderings in the list, but we 656 // expect the g we're looking for to be near the front. 657 // The g has others in front of it on the list only to the 658 // extent that it lost the race, so the iteration will not 659 // be too long. This applies even when the g is missing: 660 // it hasn't yet gotten to sleep and has lost the race to 661 // the (few) other g's that we find on the list. 662 for p, s := (*sudog)(nil), l.head; s != nil; p, s = s, s.next { 663 if s.ticket == t { 664 n := s.next 665 if p != nil { 666 p.next = n 667 } else { 668 l.head = n 669 } 670 if n == nil { 671 l.tail = p 672 } 673 unlock(&l.lock) 674 s.next = nil 675 readyWithTime(s, 4) 676 return 677 } 678 } 679 unlock(&l.lock) 680} 681 682//go:linkname notifyListCheck sync.runtime_notifyListCheck 683func notifyListCheck(sz uintptr) { 684 if sz != unsafe.Sizeof(notifyList{}) { 685 print("runtime: bad notifyList size - sync=", sz, " runtime=", unsafe.Sizeof(notifyList{}), "\n") 686 throw("bad notifyList size") 687 } 688} 689 690//go:linkname sync_nanotime sync.runtime_nanotime 691func sync_nanotime() int64 { 692 return nanotime() 693} 694