1// Copyright 2009 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Semaphore implementation exposed to Go.
6// Intended use is provide a sleep and wakeup
7// primitive that can be used in the contended case
8// of other synchronization primitives.
9// Thus it targets the same goal as Linux's futex,
10// but it has much simpler semantics.
11//
12// That is, don't think of these as semaphores.
13// Think of them as a way to implement sleep and wakeup
14// such that every sleep is paired with a single wakeup,
15// even if, due to races, the wakeup happens before the sleep.
16//
17// See Mullender and Cox, ``Semaphores in Plan 9,''
18// https://swtch.com/semaphore.pdf
19
20package runtime
21
22import (
23	"internal/cpu"
24	"internal/runtime/atomic"
25	"unsafe"
26)
27
28// Asynchronous semaphore for sync.Mutex.
29
30// A semaRoot holds a balanced tree of sudog with distinct addresses (s.elem).
31// Each of those sudog may in turn point (through s.waitlink) to a list
32// of other sudogs waiting on the same address.
33// The operations on the inner lists of sudogs with the same address
34// are all O(1). The scanning of the top-level semaRoot list is O(log n),
35// where n is the number of distinct addresses with goroutines blocked
36// on them that hash to the given semaRoot.
37// See golang.org/issue/17953 for a program that worked badly
38// before we introduced the second level of list, and
39// BenchmarkSemTable/OneAddrCollision/* for a benchmark that exercises this.
40type semaRoot struct {
41	lock  mutex
42	treap *sudog        // root of balanced tree of unique waiters.
43	nwait atomic.Uint32 // Number of waiters. Read w/o the lock.
44}
45
46var semtable semTable
47
48// Prime to not correlate with any user patterns.
49const semTabSize = 251
50
51type semTable [semTabSize]struct {
52	root semaRoot
53	pad  [cpu.CacheLinePadSize - unsafe.Sizeof(semaRoot{})]byte
54}
55
56func (t *semTable) rootFor(addr *uint32) *semaRoot {
57	return &t[(uintptr(unsafe.Pointer(addr))>>3)%semTabSize].root
58}
59
60// sync_runtime_Semacquire should be an internal detail,
61// but widely used packages access it using linkname.
62// Notable members of the hall of shame include:
63//   - gvisor.dev/gvisor
64//   - github.com/sagernet/gvisor
65//
66// Do not remove or change the type signature.
67// See go.dev/issue/67401.
68//
69//go:linkname sync_runtime_Semacquire sync.runtime_Semacquire
70func sync_runtime_Semacquire(addr *uint32) {
71	semacquire1(addr, false, semaBlockProfile, 0, waitReasonSemacquire)
72}
73
74//go:linkname poll_runtime_Semacquire internal/poll.runtime_Semacquire
75func poll_runtime_Semacquire(addr *uint32) {
76	semacquire1(addr, false, semaBlockProfile, 0, waitReasonSemacquire)
77}
78
79// sync_runtime_Semrelease should be an internal detail,
80// but widely used packages access it using linkname.
81// Notable members of the hall of shame include:
82//   - gvisor.dev/gvisor
83//   - github.com/sagernet/gvisor
84//
85// Do not remove or change the type signature.
86// See go.dev/issue/67401.
87//
88//go:linkname sync_runtime_Semrelease sync.runtime_Semrelease
89func sync_runtime_Semrelease(addr *uint32, handoff bool, skipframes int) {
90	semrelease1(addr, handoff, skipframes)
91}
92
93//go:linkname sync_runtime_SemacquireMutex sync.runtime_SemacquireMutex
94func sync_runtime_SemacquireMutex(addr *uint32, lifo bool, skipframes int) {
95	semacquire1(addr, lifo, semaBlockProfile|semaMutexProfile, skipframes, waitReasonSyncMutexLock)
96}
97
98//go:linkname sync_runtime_SemacquireRWMutexR sync.runtime_SemacquireRWMutexR
99func sync_runtime_SemacquireRWMutexR(addr *uint32, lifo bool, skipframes int) {
100	semacquire1(addr, lifo, semaBlockProfile|semaMutexProfile, skipframes, waitReasonSyncRWMutexRLock)
101}
102
103//go:linkname sync_runtime_SemacquireRWMutex sync.runtime_SemacquireRWMutex
104func sync_runtime_SemacquireRWMutex(addr *uint32, lifo bool, skipframes int) {
105	semacquire1(addr, lifo, semaBlockProfile|semaMutexProfile, skipframes, waitReasonSyncRWMutexLock)
106}
107
108//go:linkname poll_runtime_Semrelease internal/poll.runtime_Semrelease
109func poll_runtime_Semrelease(addr *uint32) {
110	semrelease(addr)
111}
112
113func readyWithTime(s *sudog, traceskip int) {
114	if s.releasetime != 0 {
115		s.releasetime = cputicks()
116	}
117	goready(s.g, traceskip)
118}
119
120type semaProfileFlags int
121
122const (
123	semaBlockProfile semaProfileFlags = 1 << iota
124	semaMutexProfile
125)
126
127// Called from runtime.
128func semacquire(addr *uint32) {
129	semacquire1(addr, false, 0, 0, waitReasonSemacquire)
130}
131
132func semacquire1(addr *uint32, lifo bool, profile semaProfileFlags, skipframes int, reason waitReason) {
133	gp := getg()
134	if gp != gp.m.curg {
135		throw("semacquire not on the G stack")
136	}
137
138	// Easy case.
139	if cansemacquire(addr) {
140		return
141	}
142
143	// Harder case:
144	//	increment waiter count
145	//	try cansemacquire one more time, return if succeeded
146	//	enqueue itself as a waiter
147	//	sleep
148	//	(waiter descriptor is dequeued by signaler)
149	s := acquireSudog()
150	root := semtable.rootFor(addr)
151	t0 := int64(0)
152	s.releasetime = 0
153	s.acquiretime = 0
154	s.ticket = 0
155	if profile&semaBlockProfile != 0 && blockprofilerate > 0 {
156		t0 = cputicks()
157		s.releasetime = -1
158	}
159	if profile&semaMutexProfile != 0 && mutexprofilerate > 0 {
160		if t0 == 0 {
161			t0 = cputicks()
162		}
163		s.acquiretime = t0
164	}
165	for {
166		lockWithRank(&root.lock, lockRankRoot)
167		// Add ourselves to nwait to disable "easy case" in semrelease.
168		root.nwait.Add(1)
169		// Check cansemacquire to avoid missed wakeup.
170		if cansemacquire(addr) {
171			root.nwait.Add(-1)
172			unlock(&root.lock)
173			break
174		}
175		// Any semrelease after the cansemacquire knows we're waiting
176		// (we set nwait above), so go to sleep.
177		root.queue(addr, s, lifo)
178		goparkunlock(&root.lock, reason, traceBlockSync, 4+skipframes)
179		if s.ticket != 0 || cansemacquire(addr) {
180			break
181		}
182	}
183	if s.releasetime > 0 {
184		blockevent(s.releasetime-t0, 3+skipframes)
185	}
186	releaseSudog(s)
187}
188
189func semrelease(addr *uint32) {
190	semrelease1(addr, false, 0)
191}
192
193func semrelease1(addr *uint32, handoff bool, skipframes int) {
194	root := semtable.rootFor(addr)
195	atomic.Xadd(addr, 1)
196
197	// Easy case: no waiters?
198	// This check must happen after the xadd, to avoid a missed wakeup
199	// (see loop in semacquire).
200	if root.nwait.Load() == 0 {
201		return
202	}
203
204	// Harder case: search for a waiter and wake it.
205	lockWithRank(&root.lock, lockRankRoot)
206	if root.nwait.Load() == 0 {
207		// The count is already consumed by another goroutine,
208		// so no need to wake up another goroutine.
209		unlock(&root.lock)
210		return
211	}
212	s, t0, tailtime := root.dequeue(addr)
213	if s != nil {
214		root.nwait.Add(-1)
215	}
216	unlock(&root.lock)
217	if s != nil { // May be slow or even yield, so unlock first
218		acquiretime := s.acquiretime
219		if acquiretime != 0 {
220			// Charge contention that this (delayed) unlock caused.
221			// If there are N more goroutines waiting beyond the
222			// one that's waking up, charge their delay as well, so that
223			// contention holding up many goroutines shows up as
224			// more costly than contention holding up a single goroutine.
225			// It would take O(N) time to calculate how long each goroutine
226			// has been waiting, so instead we charge avg(head-wait, tail-wait)*N.
227			// head-wait is the longest wait and tail-wait is the shortest.
228			// (When we do a lifo insertion, we preserve this property by
229			// copying the old head's acquiretime into the inserted new head.
230			// In that case the overall average may be slightly high, but that's fine:
231			// the average of the ends is only an approximation to the actual
232			// average anyway.)
233			// The root.dequeue above changed the head and tail acquiretime
234			// to the current time, so the next unlock will not re-count this contention.
235			dt0 := t0 - acquiretime
236			dt := dt0
237			if s.waiters != 0 {
238				dtail := t0 - tailtime
239				dt += (dtail + dt0) / 2 * int64(s.waiters)
240			}
241			mutexevent(dt, 3+skipframes)
242		}
243		if s.ticket != 0 {
244			throw("corrupted semaphore ticket")
245		}
246		if handoff && cansemacquire(addr) {
247			s.ticket = 1
248		}
249		readyWithTime(s, 5+skipframes)
250		if s.ticket == 1 && getg().m.locks == 0 {
251			// Direct G handoff
252			// readyWithTime has added the waiter G as runnext in the
253			// current P; we now call the scheduler so that we start running
254			// the waiter G immediately.
255			// Note that waiter inherits our time slice: this is desirable
256			// to avoid having a highly contended semaphore hog the P
257			// indefinitely. goyield is like Gosched, but it emits a
258			// "preempted" trace event instead and, more importantly, puts
259			// the current G on the local runq instead of the global one.
260			// We only do this in the starving regime (handoff=true), as in
261			// the non-starving case it is possible for a different waiter
262			// to acquire the semaphore while we are yielding/scheduling,
263			// and this would be wasteful. We wait instead to enter starving
264			// regime, and then we start to do direct handoffs of ticket and
265			// P.
266			// See issue 33747 for discussion.
267			goyield()
268		}
269	}
270}
271
272func cansemacquire(addr *uint32) bool {
273	for {
274		v := atomic.Load(addr)
275		if v == 0 {
276			return false
277		}
278		if atomic.Cas(addr, v, v-1) {
279			return true
280		}
281	}
282}
283
284// queue adds s to the blocked goroutines in semaRoot.
285func (root *semaRoot) queue(addr *uint32, s *sudog, lifo bool) {
286	s.g = getg()
287	s.elem = unsafe.Pointer(addr)
288	s.next = nil
289	s.prev = nil
290	s.waiters = 0
291
292	var last *sudog
293	pt := &root.treap
294	for t := *pt; t != nil; t = *pt {
295		if t.elem == unsafe.Pointer(addr) {
296			// Already have addr in list.
297			if lifo {
298				// Substitute s in t's place in treap.
299				*pt = s
300				s.ticket = t.ticket
301				s.acquiretime = t.acquiretime // preserve head acquiretime as oldest time
302				s.parent = t.parent
303				s.prev = t.prev
304				s.next = t.next
305				if s.prev != nil {
306					s.prev.parent = s
307				}
308				if s.next != nil {
309					s.next.parent = s
310				}
311				// Add t first in s's wait list.
312				s.waitlink = t
313				s.waittail = t.waittail
314				if s.waittail == nil {
315					s.waittail = t
316				}
317				s.waiters = t.waiters
318				if s.waiters+1 != 0 {
319					s.waiters++
320				}
321				t.parent = nil
322				t.prev = nil
323				t.next = nil
324				t.waittail = nil
325			} else {
326				// Add s to end of t's wait list.
327				if t.waittail == nil {
328					t.waitlink = s
329				} else {
330					t.waittail.waitlink = s
331				}
332				t.waittail = s
333				s.waitlink = nil
334				if t.waiters+1 != 0 {
335					t.waiters++
336				}
337			}
338			return
339		}
340		last = t
341		if uintptr(unsafe.Pointer(addr)) < uintptr(t.elem) {
342			pt = &t.prev
343		} else {
344			pt = &t.next
345		}
346	}
347
348	// Add s as new leaf in tree of unique addrs.
349	// The balanced tree is a treap using ticket as the random heap priority.
350	// That is, it is a binary tree ordered according to the elem addresses,
351	// but then among the space of possible binary trees respecting those
352	// addresses, it is kept balanced on average by maintaining a heap ordering
353	// on the ticket: s.ticket <= both s.prev.ticket and s.next.ticket.
354	// https://en.wikipedia.org/wiki/Treap
355	// https://faculty.washington.edu/aragon/pubs/rst89.pdf
356	//
357	// s.ticket compared with zero in couple of places, therefore set lowest bit.
358	// It will not affect treap's quality noticeably.
359	s.ticket = cheaprand() | 1
360	s.parent = last
361	*pt = s
362
363	// Rotate up into tree according to ticket (priority).
364	for s.parent != nil && s.parent.ticket > s.ticket {
365		if s.parent.prev == s {
366			root.rotateRight(s.parent)
367		} else {
368			if s.parent.next != s {
369				panic("semaRoot queue")
370			}
371			root.rotateLeft(s.parent)
372		}
373	}
374}
375
376// dequeue searches for and finds the first goroutine
377// in semaRoot blocked on addr.
378// If the sudog was being profiled, dequeue returns the time
379// at which it was woken up as now. Otherwise now is 0.
380// If there are additional entries in the wait list, dequeue
381// returns tailtime set to the last entry's acquiretime.
382// Otherwise tailtime is found.acquiretime.
383func (root *semaRoot) dequeue(addr *uint32) (found *sudog, now, tailtime int64) {
384	ps := &root.treap
385	s := *ps
386	for ; s != nil; s = *ps {
387		if s.elem == unsafe.Pointer(addr) {
388			goto Found
389		}
390		if uintptr(unsafe.Pointer(addr)) < uintptr(s.elem) {
391			ps = &s.prev
392		} else {
393			ps = &s.next
394		}
395	}
396	return nil, 0, 0
397
398Found:
399	now = int64(0)
400	if s.acquiretime != 0 {
401		now = cputicks()
402	}
403	if t := s.waitlink; t != nil {
404		// Substitute t, also waiting on addr, for s in root tree of unique addrs.
405		*ps = t
406		t.ticket = s.ticket
407		t.parent = s.parent
408		t.prev = s.prev
409		if t.prev != nil {
410			t.prev.parent = t
411		}
412		t.next = s.next
413		if t.next != nil {
414			t.next.parent = t
415		}
416		if t.waitlink != nil {
417			t.waittail = s.waittail
418		} else {
419			t.waittail = nil
420		}
421		t.waiters = s.waiters
422		if t.waiters > 1 {
423			t.waiters--
424		}
425		// Set head and tail acquire time to 'now',
426		// because the caller will take care of charging
427		// the delays before now for all entries in the list.
428		t.acquiretime = now
429		tailtime = s.waittail.acquiretime
430		s.waittail.acquiretime = now
431		s.waitlink = nil
432		s.waittail = nil
433	} else {
434		// Rotate s down to be leaf of tree for removal, respecting priorities.
435		for s.next != nil || s.prev != nil {
436			if s.next == nil || s.prev != nil && s.prev.ticket < s.next.ticket {
437				root.rotateRight(s)
438			} else {
439				root.rotateLeft(s)
440			}
441		}
442		// Remove s, now a leaf.
443		if s.parent != nil {
444			if s.parent.prev == s {
445				s.parent.prev = nil
446			} else {
447				s.parent.next = nil
448			}
449		} else {
450			root.treap = nil
451		}
452		tailtime = s.acquiretime
453	}
454	s.parent = nil
455	s.elem = nil
456	s.next = nil
457	s.prev = nil
458	s.ticket = 0
459	return s, now, tailtime
460}
461
462// rotateLeft rotates the tree rooted at node x.
463// turning (x a (y b c)) into (y (x a b) c).
464func (root *semaRoot) rotateLeft(x *sudog) {
465	// p -> (x a (y b c))
466	p := x.parent
467	y := x.next
468	b := y.prev
469
470	y.prev = x
471	x.parent = y
472	x.next = b
473	if b != nil {
474		b.parent = x
475	}
476
477	y.parent = p
478	if p == nil {
479		root.treap = y
480	} else if p.prev == x {
481		p.prev = y
482	} else {
483		if p.next != x {
484			throw("semaRoot rotateLeft")
485		}
486		p.next = y
487	}
488}
489
490// rotateRight rotates the tree rooted at node y.
491// turning (y (x a b) c) into (x a (y b c)).
492func (root *semaRoot) rotateRight(y *sudog) {
493	// p -> (y (x a b) c)
494	p := y.parent
495	x := y.prev
496	b := x.next
497
498	x.next = y
499	y.parent = x
500	y.prev = b
501	if b != nil {
502		b.parent = y
503	}
504
505	x.parent = p
506	if p == nil {
507		root.treap = x
508	} else if p.prev == y {
509		p.prev = x
510	} else {
511		if p.next != y {
512			throw("semaRoot rotateRight")
513		}
514		p.next = x
515	}
516}
517
518// notifyList is a ticket-based notification list used to implement sync.Cond.
519//
520// It must be kept in sync with the sync package.
521type notifyList struct {
522	// wait is the ticket number of the next waiter. It is atomically
523	// incremented outside the lock.
524	wait atomic.Uint32
525
526	// notify is the ticket number of the next waiter to be notified. It can
527	// be read outside the lock, but is only written to with lock held.
528	//
529	// Both wait & notify can wrap around, and such cases will be correctly
530	// handled as long as their "unwrapped" difference is bounded by 2^31.
531	// For this not to be the case, we'd need to have 2^31+ goroutines
532	// blocked on the same condvar, which is currently not possible.
533	notify uint32
534
535	// List of parked waiters.
536	lock mutex
537	head *sudog
538	tail *sudog
539}
540
541// less checks if a < b, considering a & b running counts that may overflow the
542// 32-bit range, and that their "unwrapped" difference is always less than 2^31.
543func less(a, b uint32) bool {
544	return int32(a-b) < 0
545}
546
547// notifyListAdd adds the caller to a notify list such that it can receive
548// notifications. The caller must eventually call notifyListWait to wait for
549// such a notification, passing the returned ticket number.
550//
551//go:linkname notifyListAdd sync.runtime_notifyListAdd
552func notifyListAdd(l *notifyList) uint32 {
553	// This may be called concurrently, for example, when called from
554	// sync.Cond.Wait while holding a RWMutex in read mode.
555	return l.wait.Add(1) - 1
556}
557
558// notifyListWait waits for a notification. If one has been sent since
559// notifyListAdd was called, it returns immediately. Otherwise, it blocks.
560//
561//go:linkname notifyListWait sync.runtime_notifyListWait
562func notifyListWait(l *notifyList, t uint32) {
563	lockWithRank(&l.lock, lockRankNotifyList)
564
565	// Return right away if this ticket has already been notified.
566	if less(t, l.notify) {
567		unlock(&l.lock)
568		return
569	}
570
571	// Enqueue itself.
572	s := acquireSudog()
573	s.g = getg()
574	s.ticket = t
575	s.releasetime = 0
576	t0 := int64(0)
577	if blockprofilerate > 0 {
578		t0 = cputicks()
579		s.releasetime = -1
580	}
581	if l.tail == nil {
582		l.head = s
583	} else {
584		l.tail.next = s
585	}
586	l.tail = s
587	goparkunlock(&l.lock, waitReasonSyncCondWait, traceBlockCondWait, 3)
588	if t0 != 0 {
589		blockevent(s.releasetime-t0, 2)
590	}
591	releaseSudog(s)
592}
593
594// notifyListNotifyAll notifies all entries in the list.
595//
596//go:linkname notifyListNotifyAll sync.runtime_notifyListNotifyAll
597func notifyListNotifyAll(l *notifyList) {
598	// Fast-path: if there are no new waiters since the last notification
599	// we don't need to acquire the lock.
600	if l.wait.Load() == atomic.Load(&l.notify) {
601		return
602	}
603
604	// Pull the list out into a local variable, waiters will be readied
605	// outside the lock.
606	lockWithRank(&l.lock, lockRankNotifyList)
607	s := l.head
608	l.head = nil
609	l.tail = nil
610
611	// Update the next ticket to be notified. We can set it to the current
612	// value of wait because any previous waiters are already in the list
613	// or will notice that they have already been notified when trying to
614	// add themselves to the list.
615	atomic.Store(&l.notify, l.wait.Load())
616	unlock(&l.lock)
617
618	// Go through the local list and ready all waiters.
619	for s != nil {
620		next := s.next
621		s.next = nil
622		readyWithTime(s, 4)
623		s = next
624	}
625}
626
627// notifyListNotifyOne notifies one entry in the list.
628//
629//go:linkname notifyListNotifyOne sync.runtime_notifyListNotifyOne
630func notifyListNotifyOne(l *notifyList) {
631	// Fast-path: if there are no new waiters since the last notification
632	// we don't need to acquire the lock at all.
633	if l.wait.Load() == atomic.Load(&l.notify) {
634		return
635	}
636
637	lockWithRank(&l.lock, lockRankNotifyList)
638
639	// Re-check under the lock if we need to do anything.
640	t := l.notify
641	if t == l.wait.Load() {
642		unlock(&l.lock)
643		return
644	}
645
646	// Update the next notify ticket number.
647	atomic.Store(&l.notify, t+1)
648
649	// Try to find the g that needs to be notified.
650	// If it hasn't made it to the list yet we won't find it,
651	// but it won't park itself once it sees the new notify number.
652	//
653	// This scan looks linear but essentially always stops quickly.
654	// Because g's queue separately from taking numbers,
655	// there may be minor reorderings in the list, but we
656	// expect the g we're looking for to be near the front.
657	// The g has others in front of it on the list only to the
658	// extent that it lost the race, so the iteration will not
659	// be too long. This applies even when the g is missing:
660	// it hasn't yet gotten to sleep and has lost the race to
661	// the (few) other g's that we find on the list.
662	for p, s := (*sudog)(nil), l.head; s != nil; p, s = s, s.next {
663		if s.ticket == t {
664			n := s.next
665			if p != nil {
666				p.next = n
667			} else {
668				l.head = n
669			}
670			if n == nil {
671				l.tail = p
672			}
673			unlock(&l.lock)
674			s.next = nil
675			readyWithTime(s, 4)
676			return
677		}
678	}
679	unlock(&l.lock)
680}
681
682//go:linkname notifyListCheck sync.runtime_notifyListCheck
683func notifyListCheck(sz uintptr) {
684	if sz != unsafe.Sizeof(notifyList{}) {
685		print("runtime: bad notifyList size - sync=", sz, " runtime=", unsafe.Sizeof(notifyList{}), "\n")
686		throw("bad notifyList size")
687	}
688}
689
690//go:linkname sync_nanotime sync.runtime_nanotime
691func sync_nanotime() int64 {
692	return nanotime()
693}
694