1// Copyright 2009 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Malloc profiling.
6// Patterned after tcmalloc's algorithms; shorter code.
7
8package runtime
9
10import (
11	"internal/abi"
12	"internal/goarch"
13	"internal/profilerecord"
14	"internal/runtime/atomic"
15	"runtime/internal/sys"
16	"unsafe"
17)
18
19// NOTE(rsc): Everything here could use cas if contention became an issue.
20var (
21	// profInsertLock protects changes to the start of all *bucket linked lists
22	profInsertLock mutex
23	// profBlockLock protects the contents of every blockRecord struct
24	profBlockLock mutex
25	// profMemActiveLock protects the active field of every memRecord struct
26	profMemActiveLock mutex
27	// profMemFutureLock is a set of locks that protect the respective elements
28	// of the future array of every memRecord struct
29	profMemFutureLock [len(memRecord{}.future)]mutex
30)
31
32// All memory allocations are local and do not escape outside of the profiler.
33// The profiler is forbidden from referring to garbage-collected memory.
34
35const (
36	// profile types
37	memProfile bucketType = 1 + iota
38	blockProfile
39	mutexProfile
40
41	// size of bucket hash table
42	buckHashSize = 179999
43
44	// maxSkip is to account for deferred inline expansion
45	// when using frame pointer unwinding. We record the stack
46	// with "physical" frame pointers but handle skipping "logical"
47	// frames at some point after collecting the stack. So
48	// we need extra space in order to avoid getting fewer than the
49	// desired maximum number of frames after expansion.
50	// This should be at least as large as the largest skip value
	// used for profiling; otherwise stacks may be truncated inconsistently.
52	maxSkip = 5
53
54	// maxProfStackDepth is the highest valid value for debug.profstackdepth.
55	// It's used for the bucket.stk func.
56	// TODO(fg): can we get rid of this?
57	maxProfStackDepth = 1024
58)
59
60type bucketType int
61
62// A bucket holds per-call-stack profiling information.
63// The representation is a bit sleazy, inherited from C.
64// This struct defines the bucket header. It is followed in
65// memory by the stack words and then the actual record
66// data, either a memRecord or a blockRecord.
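//
// Schematically (an illustrative sketch of the layout, not byte-exact):
//
//	| bucket header | stk[0] ... stk[nstk-1] | memRecord or blockRecord |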
67//
68// Per-call-stack profiling information.
69// Lookup by hashing call stack into a linked-list hash table.
70//
71// None of the fields in this bucket header are modified after
72// creation, including its next and allnext links.
73//
74// No heap pointers.
75type bucket struct {
76	_       sys.NotInHeap
77	next    *bucket
78	allnext *bucket
79	typ     bucketType // memBucket or blockBucket (includes mutexProfile)
80	hash    uintptr
81	size    uintptr
82	nstk    uintptr
83}
84
85// A memRecord is the bucket data for a bucket of type memProfile,
86// part of the memory profile.
87type memRecord struct {
88	// The following complex 3-stage scheme of stats accumulation
89	// is required to obtain a consistent picture of mallocs and frees
90	// for some point in time.
91	// The problem is that mallocs come in real time, while frees
	// come only after a GC during concurrent sweeping. So if we counted
	// them naively, we would get a skew toward mallocs.
94	//
95	// Hence, we delay information to get consistent snapshots as
96	// of mark termination. Allocations count toward the next mark
97	// termination's snapshot, while sweep frees count toward the
98	// previous mark termination's snapshot:
99	//
100	//              MT          MT          MT          MT
101	//             .·|         .·|         .·|         .·|
102	//          .·˙  |      .·˙  |      .·˙  |      .·˙  |
103	//       .·˙     |   .·˙     |   .·˙     |   .·˙     |
104	//    .·˙        |.·˙        |.·˙        |.·˙        |
105	//
106	//       alloc → ▲ ← free
107	//               ┠┅┅┅┅┅┅┅┅┅┅┅P
108	//       C+2     →    C+1    →  C
109	//
110	//                   alloc → ▲ ← free
111	//                           ┠┅┅┅┅┅┅┅┅┅┅┅P
112	//                   C+2     →    C+1    →  C
113	//
114	// Since we can't publish a consistent snapshot until all of
115	// the sweep frees are accounted for, we wait until the next
116	// mark termination ("MT" above) to publish the previous mark
117	// termination's snapshot ("P" above). To do this, allocation
118	// and free events are accounted to *future* heap profile
119	// cycles ("C+n" above) and we only publish a cycle once all
120	// of the events from that cycle must be done. Specifically:
121	//
122	// Mallocs are accounted to cycle C+2.
123	// Explicit frees are accounted to cycle C+2.
124	// GC frees (done during sweeping) are accounted to cycle C+1.
125	//
126	// After mark termination, we increment the global heap
127	// profile cycle counter and accumulate the stats from cycle C
128	// into the active profile.
129
130	// active is the currently published profile. A profiling
	// cycle can be accumulated into active once it's complete.
132	active memRecordCycle
133
134	// future records the profile events we're counting for cycles
	// that have not yet been published. This is a ring buffer
136	// indexed by the global heap profile cycle C and stores
137	// cycles C, C+1, and C+2. Unlike active, these counts are
138	// only for a single cycle; they are not cumulative across
139	// cycles.
140	//
141	// We store cycle C here because there's a window between when
142	// C becomes the active cycle and when we've flushed it to
143	// active.
144	future [3]memRecordCycle
145}
146
// A memRecordCycle tracks the malloc and free counts accumulated during
// one heap profile cycle.
148type memRecordCycle struct {
149	allocs, frees           uintptr
150	alloc_bytes, free_bytes uintptr
151}
152
153// add accumulates b into a. It does not zero b.
154func (a *memRecordCycle) add(b *memRecordCycle) {
155	a.allocs += b.allocs
156	a.frees += b.frees
157	a.alloc_bytes += b.alloc_bytes
158	a.free_bytes += b.free_bytes
159}
160
161// A blockRecord is the bucket data for a bucket of type blockProfile,
162// which is used in blocking and mutex profiles.
163type blockRecord struct {
	count  float64 // number of events, scaled to account for the sampling probability
	cycles int64   // total cycles of delay, scaled to account for the sampling probability
166}
167
168var (
169	mbuckets atomic.UnsafePointer // *bucket, memory profile buckets
170	bbuckets atomic.UnsafePointer // *bucket, blocking profile buckets
171	xbuckets atomic.UnsafePointer // *bucket, mutex profile buckets
172	buckhash atomic.UnsafePointer // *buckhashArray
173
174	mProfCycle mProfCycleHolder
175)
176
177type buckhashArray [buckHashSize]atomic.UnsafePointer // *bucket
178
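// mProfCycleWrap is the modulus at which the heap profile cycle counter
// wraps. It is a multiple of len(memRecord{}.future) so that indexing the
// future ring with cycle%len(future) stays consistent across the wrap
// (see mProfCycleHolder.increment).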
179const mProfCycleWrap = uint32(len(memRecord{}.future)) * (2 << 24)
180
181// mProfCycleHolder holds the global heap profile cycle number (wrapped at
182// mProfCycleWrap, stored starting at bit 1), and a flag (stored at bit 0) to
183// indicate whether future[cycle] in all buckets has been queued to flush into
184// the active profile.
185type mProfCycleHolder struct {
186	value atomic.Uint32
187}
188
189// read returns the current cycle count.
190func (c *mProfCycleHolder) read() (cycle uint32) {
191	v := c.value.Load()
192	cycle = v >> 1
193	return cycle
194}
195
196// setFlushed sets the flushed flag. It returns the current cycle count and the
197// previous value of the flushed flag.
198func (c *mProfCycleHolder) setFlushed() (cycle uint32, alreadyFlushed bool) {
199	for {
200		prev := c.value.Load()
201		cycle = prev >> 1
202		alreadyFlushed = (prev & 0x1) != 0
203		next := prev | 0x1
204		if c.value.CompareAndSwap(prev, next) {
205			return cycle, alreadyFlushed
206		}
207	}
208}
209
210// increment increases the cycle count by one, wrapping the value at
211// mProfCycleWrap. It clears the flushed flag.
212func (c *mProfCycleHolder) increment() {
213	// We explicitly wrap mProfCycle rather than depending on
214	// uint wraparound because the memRecord.future ring does not
215	// itself wrap at a power of two.
216	for {
217		prev := c.value.Load()
218		cycle := prev >> 1
219		cycle = (cycle + 1) % mProfCycleWrap
220		next := cycle << 1
221		if c.value.CompareAndSwap(prev, next) {
222			break
223		}
224	}
225}
226
227// newBucket allocates a bucket with the given type and number of stack entries.
228func newBucket(typ bucketType, nstk int) *bucket {
229	size := unsafe.Sizeof(bucket{}) + uintptr(nstk)*unsafe.Sizeof(uintptr(0))
230	switch typ {
231	default:
232		throw("invalid profile bucket type")
233	case memProfile:
234		size += unsafe.Sizeof(memRecord{})
235	case blockProfile, mutexProfile:
236		size += unsafe.Sizeof(blockRecord{})
237	}
238
239	b := (*bucket)(persistentalloc(size, 0, &memstats.buckhash_sys))
240	b.typ = typ
241	b.nstk = uintptr(nstk)
242	return b
243}
244
// stk returns the slice in b holding the stack. The caller can assume that the
246// backing array is immutable.
247func (b *bucket) stk() []uintptr {
248	stk := (*[maxProfStackDepth]uintptr)(add(unsafe.Pointer(b), unsafe.Sizeof(*b)))
249	if b.nstk > maxProfStackDepth {
250		// prove that slicing works; otherwise a failure requires a P
251		throw("bad profile stack count")
252	}
253	return stk[:b.nstk:b.nstk]
254}
255
256// mp returns the memRecord associated with the memProfile bucket b.
257func (b *bucket) mp() *memRecord {
258	if b.typ != memProfile {
259		throw("bad use of bucket.mp")
260	}
261	data := add(unsafe.Pointer(b), unsafe.Sizeof(*b)+b.nstk*unsafe.Sizeof(uintptr(0)))
262	return (*memRecord)(data)
263}
264
265// bp returns the blockRecord associated with the blockProfile bucket b.
266func (b *bucket) bp() *blockRecord {
267	if b.typ != blockProfile && b.typ != mutexProfile {
268		throw("bad use of bucket.bp")
269	}
270	data := add(unsafe.Pointer(b), unsafe.Sizeof(*b)+b.nstk*unsafe.Sizeof(uintptr(0)))
271	return (*blockRecord)(data)
272}
273
// stkbucket returns the bucket for stk[0:nstk], allocating a new bucket if needed.
275func stkbucket(typ bucketType, size uintptr, stk []uintptr, alloc bool) *bucket {
276	bh := (*buckhashArray)(buckhash.Load())
277	if bh == nil {
278		lock(&profInsertLock)
279		// check again under the lock
280		bh = (*buckhashArray)(buckhash.Load())
281		if bh == nil {
282			bh = (*buckhashArray)(sysAlloc(unsafe.Sizeof(buckhashArray{}), &memstats.buckhash_sys))
283			if bh == nil {
284				throw("runtime: cannot allocate memory")
285			}
286			buckhash.StoreNoWB(unsafe.Pointer(bh))
287		}
288		unlock(&profInsertLock)
289	}
290
291	// Hash stack.
292	var h uintptr
293	for _, pc := range stk {
294		h += pc
295		h += h << 10
296		h ^= h >> 6
297	}
298	// hash in size
299	h += size
300	h += h << 10
301	h ^= h >> 6
302	// finalize
303	h += h << 3
304	h ^= h >> 11
305
306	i := int(h % buckHashSize)
307	// first check optimistically, without the lock
308	for b := (*bucket)(bh[i].Load()); b != nil; b = b.next {
309		if b.typ == typ && b.hash == h && b.size == size && eqslice(b.stk(), stk) {
310			return b
311		}
312	}
313
314	if !alloc {
315		return nil
316	}
317
318	lock(&profInsertLock)
319	// check again under the insertion lock
320	for b := (*bucket)(bh[i].Load()); b != nil; b = b.next {
321		if b.typ == typ && b.hash == h && b.size == size && eqslice(b.stk(), stk) {
322			unlock(&profInsertLock)
323			return b
324		}
325	}
326
327	// Create new bucket.
328	b := newBucket(typ, len(stk))
329	copy(b.stk(), stk)
330	b.hash = h
331	b.size = size
332
333	var allnext *atomic.UnsafePointer
334	if typ == memProfile {
335		allnext = &mbuckets
336	} else if typ == mutexProfile {
337		allnext = &xbuckets
338	} else {
339		allnext = &bbuckets
340	}
341
342	b.next = (*bucket)(bh[i].Load())
343	b.allnext = (*bucket)(allnext.Load())
344
345	bh[i].StoreNoWB(unsafe.Pointer(b))
346	allnext.StoreNoWB(unsafe.Pointer(b))
347
348	unlock(&profInsertLock)
349	return b
350}
351
352func eqslice(x, y []uintptr) bool {
353	if len(x) != len(y) {
354		return false
355	}
356	for i, xi := range x {
357		if xi != y[i] {
358			return false
359		}
360	}
361	return true
362}
363
364// mProf_NextCycle publishes the next heap profile cycle and creates a
365// fresh heap profile cycle. This operation is fast and can be done
366// during STW. The caller must call mProf_Flush before calling
367// mProf_NextCycle again.
368//
369// This is called by mark termination during STW so allocations and
370// frees after the world is started again count towards a new heap
371// profiling cycle.
372func mProf_NextCycle() {
373	mProfCycle.increment()
374}
375
376// mProf_Flush flushes the events from the current heap profiling
377// cycle into the active profile. After this it is safe to start a new
378// heap profiling cycle with mProf_NextCycle.
379//
380// This is called by GC after mark termination starts the world. In
381// contrast with mProf_NextCycle, this is somewhat expensive, but safe
382// to do concurrently.
383func mProf_Flush() {
384	cycle, alreadyFlushed := mProfCycle.setFlushed()
385	if alreadyFlushed {
386		return
387	}
388
389	index := cycle % uint32(len(memRecord{}.future))
390	lock(&profMemActiveLock)
391	lock(&profMemFutureLock[index])
392	mProf_FlushLocked(index)
393	unlock(&profMemFutureLock[index])
394	unlock(&profMemActiveLock)
395}
396
397// mProf_FlushLocked flushes the events from the heap profiling cycle at index
398// into the active profile. The caller must hold the lock for the active profile
399// (profMemActiveLock) and for the profiling cycle at index
400// (profMemFutureLock[index]).
401func mProf_FlushLocked(index uint32) {
402	assertLockHeld(&profMemActiveLock)
403	assertLockHeld(&profMemFutureLock[index])
404	head := (*bucket)(mbuckets.Load())
405	for b := head; b != nil; b = b.allnext {
406		mp := b.mp()
407
408		// Flush cycle C into the published profile and clear
409		// it for reuse.
410		mpc := &mp.future[index]
411		mp.active.add(mpc)
412		*mpc = memRecordCycle{}
413	}
414}
415
416// mProf_PostSweep records that all sweep frees for this GC cycle have
417// completed. This has the effect of publishing the heap profile
418// snapshot as of the last mark termination without advancing the heap
419// profile cycle.
420func mProf_PostSweep() {
421	// Flush cycle C+1 to the active profile so everything as of
422	// the last mark termination becomes visible. *Don't* advance
423	// the cycle, since we're still accumulating allocs in cycle
424	// C+2, which have to become C+1 in the next mark termination
425	// and so on.
426	cycle := mProfCycle.read() + 1
427
428	index := cycle % uint32(len(memRecord{}.future))
429	lock(&profMemActiveLock)
430	lock(&profMemFutureLock[index])
431	mProf_FlushLocked(index)
432	unlock(&profMemFutureLock[index])
433	unlock(&profMemActiveLock)
434}
435
436// Called by malloc to record a profiled block.
437func mProf_Malloc(mp *m, p unsafe.Pointer, size uintptr) {
438	if mp.profStack == nil {
439		// mp.profStack is nil if we happen to sample an allocation during the
440		// initialization of mp. This case is rare, so we just ignore such
441		// allocations. Change MemProfileRate to 1 if you need to reproduce such
442		// cases for testing purposes.
443		return
444	}
445	// Only use the part of mp.profStack we need and ignore the extra space
446	// reserved for delayed inline expansion with frame pointer unwinding.
447	nstk := callers(4, mp.profStack[:debug.profstackdepth])
448	index := (mProfCycle.read() + 2) % uint32(len(memRecord{}.future))
449
450	b := stkbucket(memProfile, size, mp.profStack[:nstk], true)
451	mr := b.mp()
452	mpc := &mr.future[index]
453
454	lock(&profMemFutureLock[index])
455	mpc.allocs++
456	mpc.alloc_bytes += size
457	unlock(&profMemFutureLock[index])
458
459	// Setprofilebucket locks a bunch of other mutexes, so we call it outside of
460	// the profiler locks. This reduces potential contention and chances of
461	// deadlocks. Since the object must be alive during the call to
462	// mProf_Malloc, it's fine to do this non-atomically.
463	systemstack(func() {
464		setprofilebucket(p, b)
465	})
466}
467
468// Called when freeing a profiled block.
469func mProf_Free(b *bucket, size uintptr) {
470	index := (mProfCycle.read() + 1) % uint32(len(memRecord{}.future))
471
472	mp := b.mp()
473	mpc := &mp.future[index]
474
475	lock(&profMemFutureLock[index])
476	mpc.frees++
477	mpc.free_bytes += size
478	unlock(&profMemFutureLock[index])
479}
480
481var blockprofilerate uint64 // in CPU ticks
482
483// SetBlockProfileRate controls the fraction of goroutine blocking events
484// that are reported in the blocking profile. The profiler aims to sample
485// an average of one blocking event per rate nanoseconds spent blocked.
486//
487// To include every blocking event in the profile, pass rate = 1.
488// To turn off profiling entirely, pass rate <= 0.
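//
// For example, to sample on average one blocking event per millisecond
// spent blocked (an illustrative setting, not a recommendation):
//
//	runtime.SetBlockProfileRate(1_000_000) // rate is in nanoseconds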
489func SetBlockProfileRate(rate int) {
490	var r int64
491	if rate <= 0 {
492		r = 0 // disable profiling
493	} else if rate == 1 {
494		r = 1 // profile everything
495	} else {
496		// convert ns to cycles, use float64 to prevent overflow during multiplication
497		r = int64(float64(rate) * float64(ticksPerSecond()) / (1000 * 1000 * 1000))
498		if r == 0 {
499			r = 1
500		}
501	}
502
503	atomic.Store64(&blockprofilerate, uint64(r))
504}
505
506func blockevent(cycles int64, skip int) {
507	if cycles <= 0 {
508		cycles = 1
509	}
510
511	rate := int64(atomic.Load64(&blockprofilerate))
512	if blocksampled(cycles, rate) {
513		saveblockevent(cycles, rate, skip+1, blockProfile)
514	}
515}
516
517// blocksampled returns true for all events where cycles >= rate. Shorter
518// events have a cycles/rate random chance of returning true.
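//
// For example (illustrative numbers), with rate=100 an event that lasted
// 40 cycles is kept with probability of roughly 40/100, while any event of
// 100 or more cycles is always kept.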
519func blocksampled(cycles, rate int64) bool {
520	if rate <= 0 || (rate > cycles && cheaprand64()%rate > cycles) {
521		return false
522	}
523	return true
524}
525
526// saveblockevent records a profile event of the type specified by which.
527// cycles is the quantity associated with this event and rate is the sampling rate,
528// used to adjust the cycles value in the manner determined by the profile type.
529// skip is the number of frames to omit from the traceback associated with the event.
530// The traceback will be recorded from the stack of the goroutine associated with the current m.
531// skip should be positive if this event is recorded from the current stack
532// (e.g. when this is not called from a system stack)
533func saveblockevent(cycles, rate int64, skip int, which bucketType) {
534	if debug.profstackdepth == 0 {
535		// profstackdepth is set to 0 by the user, so mp.profStack is nil and we
536		// can't record a stack trace.
537		return
538	}
539	if skip > maxSkip {
540		print("requested skip=", skip)
541		throw("invalid skip value")
542	}
543	gp := getg()
544	mp := acquirem() // we must not be preempted while accessing profstack
545
546	var nstk int
547	if tracefpunwindoff() || gp.m.hasCgoOnStack() {
548		if gp.m.curg == nil || gp.m.curg == gp {
549			nstk = callers(skip, mp.profStack)
550		} else {
551			nstk = gcallers(gp.m.curg, skip, mp.profStack)
552		}
553	} else {
554		if gp.m.curg == nil || gp.m.curg == gp {
555			if skip > 0 {
556				// We skip one fewer frame than the provided value for frame
557				// pointer unwinding because the skip value includes the current
558				// frame, whereas the saved frame pointer will give us the
559				// caller's return address first (so, not including
560				// saveblockevent)
561				skip -= 1
562			}
563			nstk = fpTracebackPartialExpand(skip, unsafe.Pointer(getfp()), mp.profStack)
564		} else {
565			mp.profStack[0] = gp.m.curg.sched.pc
566			nstk = 1 + fpTracebackPartialExpand(skip, unsafe.Pointer(gp.m.curg.sched.bp), mp.profStack[1:])
567		}
568	}
569
570	saveBlockEventStack(cycles, rate, mp.profStack[:nstk], which)
571	releasem(mp)
572}
573
574// fpTracebackPartialExpand records a call stack obtained starting from fp.
575// This function will skip the given number of frames, properly accounting for
576// inlining, and save remaining frames as "physical" return addresses. The
577// consumer should later use CallersFrames or similar to expand inline frames.
578func fpTracebackPartialExpand(skip int, fp unsafe.Pointer, pcBuf []uintptr) int {
579	var n int
580	lastFuncID := abi.FuncIDNormal
581	skipOrAdd := func(retPC uintptr) bool {
582		if skip > 0 {
583			skip--
584		} else if n < len(pcBuf) {
585			pcBuf[n] = retPC
586			n++
587		}
588		return n < len(pcBuf)
589	}
590	for n < len(pcBuf) && fp != nil {
591		// return addr sits one word above the frame pointer
592		pc := *(*uintptr)(unsafe.Pointer(uintptr(fp) + goarch.PtrSize))
593
594		if skip > 0 {
595			callPC := pc - 1
596			fi := findfunc(callPC)
597			u, uf := newInlineUnwinder(fi, callPC)
598			for ; uf.valid(); uf = u.next(uf) {
599				sf := u.srcFunc(uf)
600				if sf.funcID == abi.FuncIDWrapper && elideWrapperCalling(lastFuncID) {
601					// ignore wrappers
602				} else if more := skipOrAdd(uf.pc + 1); !more {
603					return n
604				}
605				lastFuncID = sf.funcID
606			}
607		} else {
608			// We've skipped the desired number of frames, so no need
609			// to perform further inline expansion now.
610			pcBuf[n] = pc
611			n++
612		}
613
614		// follow the frame pointer to the next one
615		fp = unsafe.Pointer(*(*uintptr)(fp))
616	}
617	return n
618}
619
620// lockTimer assists with profiling contention on runtime-internal locks.
621//
622// There are several steps between the time that an M experiences contention and
623// when that contention may be added to the profile. This comes from our
624// constraints: We need to keep the critical section of each lock small,
625// especially when those locks are contended. The reporting code cannot acquire
626// new locks until the M has released all other locks, which means no memory
627// allocations and encourages use of (temporary) M-local storage.
628//
629// The M will have space for storing one call stack that caused contention, and
630// for the magnitude of that contention. It will also have space to store the
631// magnitude of additional contention the M caused, since it only has space to
632// remember one call stack and might encounter several contention events before
633// it releases all of its locks and is thus able to transfer the local buffer
634// into the profile.
635//
636// The M will collect the call stack when it unlocks the contended lock. That
637// minimizes the impact on the critical section of the contended lock, and
638// matches the mutex profile's behavior for contention in sync.Mutex: measured
639// at the Unlock method.
640//
641// The profile for contention on sync.Mutex blames the caller of Unlock for the
642// amount of contention experienced by the callers of Lock which had to wait.
643// When there are several critical sections, this allows identifying which of
644// them is responsible.
645//
646// Matching that behavior for runtime-internal locks will require identifying
647// which Ms are blocked on the mutex. The semaphore-based implementation is
648// ready to allow that, but the futex-based implementation will require a bit
649// more work. Until then, we report contention on runtime-internal locks with a
650// call stack taken from the unlock call (like the rest of the user-space
651// "mutex" profile), but assign it a duration value based on how long the
652// previous lock call took (like the user-space "block" profile).
653//
654// Thus, reporting the call stacks of runtime-internal lock contention is
655// guarded by GODEBUG for now. Set GODEBUG=runtimecontentionstacks=1 to enable.
656//
657// TODO(rhysh): plumb through the delay duration, remove GODEBUG, update comment
658//
659// The M will track this by storing a pointer to the lock; lock/unlock pairs for
660// runtime-internal locks are always on the same M.
661//
662// Together, that demands several steps for recording contention. First, when
663// finally acquiring a contended lock, the M decides whether it should plan to
664// profile that event by storing a pointer to the lock in its "to be profiled
665// upon unlock" field. If that field is already set, it uses the relative
666// magnitudes to weight a random choice between itself and the other lock, with
667// the loser's time being added to the "additional contention" field. Otherwise
668// if the M's call stack buffer is occupied, it does the comparison against that
669// sample's magnitude.
670//
671// Second, having unlocked a mutex the M checks to see if it should capture the
672// call stack into its local buffer. Finally, when the M unlocks its last mutex,
673// it transfers the local buffer into the profile. As part of that step, it also
674// transfers any "additional contention" time to the profile. Any lock
675// contention that it experiences while adding samples to the profile will be
676// recorded later as "additional contention" and not include a call stack, to
677// avoid an echo.
678type lockTimer struct {
679	lock      *mutex
680	timeRate  int64
681	timeStart int64
682	tickStart int64
683}
684
685func (lt *lockTimer) begin() {
686	rate := int64(atomic.Load64(&mutexprofilerate))
687
688	lt.timeRate = gTrackingPeriod
689	if rate != 0 && rate < lt.timeRate {
690		lt.timeRate = rate
691	}
692	if int64(cheaprand())%lt.timeRate == 0 {
693		lt.timeStart = nanotime()
694	}
695
696	if rate > 0 && int64(cheaprand())%rate == 0 {
697		lt.tickStart = cputicks()
698	}
699}
700
701func (lt *lockTimer) end() {
702	gp := getg()
703
704	if lt.timeStart != 0 {
705		nowTime := nanotime()
706		gp.m.mLockProfile.waitTime.Add((nowTime - lt.timeStart) * lt.timeRate)
707	}
708
709	if lt.tickStart != 0 {
710		nowTick := cputicks()
711		gp.m.mLockProfile.recordLock(nowTick-lt.tickStart, lt.lock)
712	}
713}
714
715type mLockProfile struct {
716	waitTime   atomic.Int64 // total nanoseconds spent waiting in runtime.lockWithRank
717	stack      []uintptr    // stack that experienced contention in runtime.lockWithRank
718	pending    uintptr      // *mutex that experienced contention (to be traceback-ed)
719	cycles     int64        // cycles attributable to "pending" (if set), otherwise to "stack"
720	cyclesLost int64        // contention for which we weren't able to record a call stack
721	disabled   bool         // attribute all time to "lost"
722}
723
724func (prof *mLockProfile) recordLock(cycles int64, l *mutex) {
725	if cycles <= 0 {
726		return
727	}
728
729	if prof.disabled {
730		// We're experiencing contention while attempting to report contention.
731		// Make a note of its magnitude, but don't allow it to be the sole cause
732		// of another contention report.
733		prof.cyclesLost += cycles
734		return
735	}
736
737	if uintptr(unsafe.Pointer(l)) == prof.pending {
738		// Optimization: we'd already planned to profile this same lock (though
739		// possibly from a different unlock site).
740		prof.cycles += cycles
741		return
742	}
743
744	if prev := prof.cycles; prev > 0 {
745		// We can only store one call stack for runtime-internal lock contention
746		// on this M, and we've already got one. Decide which should stay, and
747		// add the other to the report for runtime._LostContendedRuntimeLock.
748		prevScore := uint64(cheaprand64()) % uint64(prev)
749		thisScore := uint64(cheaprand64()) % uint64(cycles)
750		if prevScore > thisScore {
751			prof.cyclesLost += cycles
752			return
753		} else {
754			prof.cyclesLost += prev
755		}
756	}
757	// Saving the *mutex as a uintptr is safe because:
758	//  - lockrank_on.go does this too, which gives it regular exercise
759	//  - the lock would only move if it's stack allocated, which means it
760	//      cannot experience multi-M contention
761	prof.pending = uintptr(unsafe.Pointer(l))
762	prof.cycles = cycles
763}
764
765// From unlock2, we might not be holding a p in this code.
766//
767//go:nowritebarrierrec
768func (prof *mLockProfile) recordUnlock(l *mutex) {
769	if uintptr(unsafe.Pointer(l)) == prof.pending {
770		prof.captureStack()
771	}
772	if gp := getg(); gp.m.locks == 1 && gp.m.mLockProfile.cycles != 0 {
773		prof.store()
774	}
775}
776
777func (prof *mLockProfile) captureStack() {
778	if debug.profstackdepth == 0 {
779		// profstackdepth is set to 0 by the user, so mp.profStack is nil and we
780		// can't record a stack trace.
781		return
782	}
783
784	skip := 3 // runtime.(*mLockProfile).recordUnlock runtime.unlock2 runtime.unlockWithRank
785	if staticLockRanking {
786		// When static lock ranking is enabled, we'll always be on the system
787		// stack at this point. There will be a runtime.unlockWithRank.func1
788		// frame, and if the call to runtime.unlock took place on a user stack
789		// then there'll also be a runtime.systemstack frame. To keep stack
790		// traces somewhat consistent whether or not static lock ranking is
791		// enabled, we'd like to skip those. But it's hard to tell how long
792		// we've been on the system stack so accept an extra frame in that case,
793		// with a leaf of "runtime.unlockWithRank runtime.unlock" instead of
794		// "runtime.unlock".
795		skip += 1 // runtime.unlockWithRank.func1
796	}
797	prof.pending = 0
798
799	prof.stack[0] = logicalStackSentinel
800	if debug.runtimeContentionStacks.Load() == 0 {
801		prof.stack[1] = abi.FuncPCABIInternal(_LostContendedRuntimeLock) + sys.PCQuantum
802		prof.stack[2] = 0
803		return
804	}
805
806	var nstk int
807	gp := getg()
808	sp := getcallersp()
809	pc := getcallerpc()
810	systemstack(func() {
811		var u unwinder
812		u.initAt(pc, sp, 0, gp, unwindSilentErrors|unwindJumpStack)
813		nstk = 1 + tracebackPCs(&u, skip, prof.stack[1:])
814	})
815	if nstk < len(prof.stack) {
816		prof.stack[nstk] = 0
817	}
818}
819
820func (prof *mLockProfile) store() {
821	// Report any contention we experience within this function as "lost"; it's
822	// important that the act of reporting a contention event not lead to a
823	// reportable contention event. This also means we can use prof.stack
824	// without copying, since it won't change during this function.
825	mp := acquirem()
826	prof.disabled = true
827
828	nstk := int(debug.profstackdepth)
829	for i := 0; i < nstk; i++ {
830		if pc := prof.stack[i]; pc == 0 {
831			nstk = i
832			break
833		}
834	}
835
836	cycles, lost := prof.cycles, prof.cyclesLost
837	prof.cycles, prof.cyclesLost = 0, 0
838
839	rate := int64(atomic.Load64(&mutexprofilerate))
840	saveBlockEventStack(cycles, rate, prof.stack[:nstk], mutexProfile)
841	if lost > 0 {
842		lostStk := [...]uintptr{
843			logicalStackSentinel,
844			abi.FuncPCABIInternal(_LostContendedRuntimeLock) + sys.PCQuantum,
845		}
846		saveBlockEventStack(lost, rate, lostStk[:], mutexProfile)
847	}
848
849	prof.disabled = false
850	releasem(mp)
851}
852
853func saveBlockEventStack(cycles, rate int64, stk []uintptr, which bucketType) {
854	b := stkbucket(which, 0, stk, true)
855	bp := b.bp()
856
857	lock(&profBlockLock)
858	// We want to up-scale the count and cycles according to the
859	// probability that the event was sampled. For block profile events,
860	// the sample probability is 1 if cycles >= rate, and cycles / rate
861	// otherwise. For mutex profile events, the sample probability is 1 / rate.
862	// We scale the events by 1 / (probability the event was sampled).
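	// For example (illustrative numbers): a block event with cycles=30
	// sampled at rate=100 was kept with probability ~30/100, so it adds
	// 100/30 to count and 100 to cycles; a mutex event sampled at rate=5
	// adds 5 to count and 5*cycles to cycles.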
863	if which == blockProfile && cycles < rate {
864		// Remove sampling bias, see discussion on http://golang.org/cl/299991.
865		bp.count += float64(rate) / float64(cycles)
866		bp.cycles += rate
867	} else if which == mutexProfile {
868		bp.count += float64(rate)
869		bp.cycles += rate * cycles
870	} else {
871		bp.count++
872		bp.cycles += cycles
873	}
874	unlock(&profBlockLock)
875}
876
877var mutexprofilerate uint64 // fraction sampled
878
879// SetMutexProfileFraction controls the fraction of mutex contention events
880// that are reported in the mutex profile. On average 1/rate events are
881// reported. The previous rate is returned.
882//
883// To turn off profiling entirely, pass rate 0.
884// To just read the current rate, pass rate < 0.
885// (For n>1 the details of sampling may change.)
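//
// For example, to report on average one out of every five contention events
// and later restore the previous setting (an illustrative sketch):
//
//	prev := runtime.SetMutexProfileFraction(5)
//	defer runtime.SetMutexProfileFraction(prev)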
886func SetMutexProfileFraction(rate int) int {
887	if rate < 0 {
888		return int(mutexprofilerate)
889	}
890	old := mutexprofilerate
891	atomic.Store64(&mutexprofilerate, uint64(rate))
892	return int(old)
893}
894
895//go:linkname mutexevent sync.event
896func mutexevent(cycles int64, skip int) {
897	if cycles < 0 {
898		cycles = 0
899	}
900	rate := int64(atomic.Load64(&mutexprofilerate))
901	if rate > 0 && cheaprand64()%rate == 0 {
902		saveblockevent(cycles, rate, skip+1, mutexProfile)
903	}
904}
905
906// Go interface to profile data.
907
908// A StackRecord describes a single execution stack.
909type StackRecord struct {
910	Stack0 [32]uintptr // stack trace for this record; ends at first 0 entry
911}
912
913// Stack returns the stack trace associated with the record,
914// a prefix of r.Stack0.
915func (r *StackRecord) Stack() []uintptr {
916	for i, v := range r.Stack0 {
917		if v == 0 {
918			return r.Stack0[0:i]
919		}
920	}
921	return r.Stack0[0:]
922}
923
924// MemProfileRate controls the fraction of memory allocations
925// that are recorded and reported in the memory profile.
926// The profiler aims to sample an average of
927// one allocation per MemProfileRate bytes allocated.
928//
929// To include every allocated block in the profile, set MemProfileRate to 1.
930// To turn off profiling entirely, set MemProfileRate to 0.
931//
932// The tools that process the memory profiles assume that the
933// profile rate is constant across the lifetime of the program
934// and equal to the current value. Programs that change the
935// memory profiling rate should do so just once, as early as
936// possible in the execution of the program (for example,
937// at the beginning of main).
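//
// For example, to sample roughly one allocation per 64 KiB allocated
// (an illustrative setting), assign the rate once at the start of main:
//
//	func main() {
//		runtime.MemProfileRate = 64 * 1024
//		// ...
//	}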
938var MemProfileRate int = 512 * 1024
939
940// disableMemoryProfiling is set by the linker if memory profiling
941// is not used and the link type guarantees nobody else could use it
942// elsewhere.
943// We check if the runtime.memProfileInternal symbol is present.
944var disableMemoryProfiling bool
945
946// A MemProfileRecord describes the live objects allocated
947// by a particular call sequence (stack trace).
948type MemProfileRecord struct {
949	AllocBytes, FreeBytes     int64       // number of bytes allocated, freed
950	AllocObjects, FreeObjects int64       // number of objects allocated, freed
951	Stack0                    [32]uintptr // stack trace for this record; ends at first 0 entry
952}
953
954// InUseBytes returns the number of bytes in use (AllocBytes - FreeBytes).
955func (r *MemProfileRecord) InUseBytes() int64 { return r.AllocBytes - r.FreeBytes }
956
957// InUseObjects returns the number of objects in use (AllocObjects - FreeObjects).
958func (r *MemProfileRecord) InUseObjects() int64 {
959	return r.AllocObjects - r.FreeObjects
960}
961
962// Stack returns the stack trace associated with the record,
963// a prefix of r.Stack0.
964func (r *MemProfileRecord) Stack() []uintptr {
965	for i, v := range r.Stack0 {
966		if v == 0 {
967			return r.Stack0[0:i]
968		}
969	}
970	return r.Stack0[0:]
971}
972
973// MemProfile returns a profile of memory allocated and freed per allocation
974// site.
975//
976// MemProfile returns n, the number of records in the current memory profile.
977// If len(p) >= n, MemProfile copies the profile into p and returns n, true.
978// If len(p) < n, MemProfile does not change p and returns n, false.
979//
980// If inuseZero is true, the profile includes allocation records
981// where r.AllocBytes > 0 but r.AllocBytes == r.FreeBytes.
982// These are sites where memory was allocated, but it has all
983// been released back to the runtime.
984//
985// The returned profile may be up to two garbage collection cycles old.
986// This is to avoid skewing the profile toward allocations; because
987// allocations happen in real time but frees are delayed until the garbage
988// collector performs sweeping, the profile only accounts for allocations
989// that have had a chance to be freed by the garbage collector.
990//
991// Most clients should use the runtime/pprof package or
992// the testing package's -test.memprofile flag instead
993// of calling MemProfile directly.
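//
// Callers that do use MemProfile directly typically retry with a larger
// slice until ok is true; a minimal sketch (the +50 slack is arbitrary):
//
//	n, ok := runtime.MemProfile(nil, true)
//	var p []runtime.MemProfileRecord
//	for !ok {
//		p = make([]runtime.MemProfileRecord, n+50)
//		n, ok = runtime.MemProfile(p, true)
//	}
//	p = p[:n]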
994func MemProfile(p []MemProfileRecord, inuseZero bool) (n int, ok bool) {
995	return memProfileInternal(len(p), inuseZero, func(r profilerecord.MemProfileRecord) {
996		copyMemProfileRecord(&p[0], r)
997		p = p[1:]
998	})
999}
1000
1001// memProfileInternal returns the number of records n in the profile. If there
// are no more than size records, copyFn is invoked for each record, and ok returns
1003// true.
1004//
// The linker sets disableMemoryProfiling to true to disable memory profiling
1006// if this function is not reachable. Mark it noinline to ensure the symbol exists.
1007// (This function is big and normally not inlined anyway.)
1008// See also disableMemoryProfiling above and cmd/link/internal/ld/lib.go:linksetup.
1009//
1010//go:noinline
1011func memProfileInternal(size int, inuseZero bool, copyFn func(profilerecord.MemProfileRecord)) (n int, ok bool) {
1012	cycle := mProfCycle.read()
1013	// If we're between mProf_NextCycle and mProf_Flush, take care
1014	// of flushing to the active profile so we only have to look
1015	// at the active profile below.
1016	index := cycle % uint32(len(memRecord{}.future))
1017	lock(&profMemActiveLock)
1018	lock(&profMemFutureLock[index])
1019	mProf_FlushLocked(index)
1020	unlock(&profMemFutureLock[index])
1021	clear := true
1022	head := (*bucket)(mbuckets.Load())
1023	for b := head; b != nil; b = b.allnext {
1024		mp := b.mp()
1025		if inuseZero || mp.active.alloc_bytes != mp.active.free_bytes {
1026			n++
1027		}
1028		if mp.active.allocs != 0 || mp.active.frees != 0 {
1029			clear = false
1030		}
1031	}
1032	if clear {
1033		// Absolutely no data, suggesting that a garbage collection
1034		// has not yet happened. In order to allow profiling when
1035		// garbage collection is disabled from the beginning of execution,
1036		// accumulate all of the cycles, and recount buckets.
1037		n = 0
1038		for b := head; b != nil; b = b.allnext {
1039			mp := b.mp()
1040			for c := range mp.future {
1041				lock(&profMemFutureLock[c])
1042				mp.active.add(&mp.future[c])
1043				mp.future[c] = memRecordCycle{}
1044				unlock(&profMemFutureLock[c])
1045			}
1046			if inuseZero || mp.active.alloc_bytes != mp.active.free_bytes {
1047				n++
1048			}
1049		}
1050	}
1051	if n <= size {
1052		ok = true
1053		for b := head; b != nil; b = b.allnext {
1054			mp := b.mp()
1055			if inuseZero || mp.active.alloc_bytes != mp.active.free_bytes {
1056				r := profilerecord.MemProfileRecord{
1057					AllocBytes:   int64(mp.active.alloc_bytes),
1058					FreeBytes:    int64(mp.active.free_bytes),
1059					AllocObjects: int64(mp.active.allocs),
1060					FreeObjects:  int64(mp.active.frees),
1061					Stack:        b.stk(),
1062				}
1063				copyFn(r)
1064			}
1065		}
1066	}
1067	unlock(&profMemActiveLock)
1068	return
1069}
1070
1071func copyMemProfileRecord(dst *MemProfileRecord, src profilerecord.MemProfileRecord) {
1072	dst.AllocBytes = src.AllocBytes
1073	dst.FreeBytes = src.FreeBytes
1074	dst.AllocObjects = src.AllocObjects
1075	dst.FreeObjects = src.FreeObjects
1076	if raceenabled {
1077		racewriterangepc(unsafe.Pointer(&dst.Stack0[0]), unsafe.Sizeof(dst.Stack0), getcallerpc(), abi.FuncPCABIInternal(MemProfile))
1078	}
1079	if msanenabled {
1080		msanwrite(unsafe.Pointer(&dst.Stack0[0]), unsafe.Sizeof(dst.Stack0))
1081	}
1082	if asanenabled {
1083		asanwrite(unsafe.Pointer(&dst.Stack0[0]), unsafe.Sizeof(dst.Stack0))
1084	}
1085	i := copy(dst.Stack0[:], src.Stack)
1086	clear(dst.Stack0[i:])
1087}
1088
1089//go:linkname pprof_memProfileInternal
1090func pprof_memProfileInternal(p []profilerecord.MemProfileRecord, inuseZero bool) (n int, ok bool) {
1091	return memProfileInternal(len(p), inuseZero, func(r profilerecord.MemProfileRecord) {
1092		p[0] = r
1093		p = p[1:]
1094	})
1095}
1096
1097func iterate_memprof(fn func(*bucket, uintptr, *uintptr, uintptr, uintptr, uintptr)) {
1098	lock(&profMemActiveLock)
1099	head := (*bucket)(mbuckets.Load())
1100	for b := head; b != nil; b = b.allnext {
1101		mp := b.mp()
1102		fn(b, b.nstk, &b.stk()[0], b.size, mp.active.allocs, mp.active.frees)
1103	}
1104	unlock(&profMemActiveLock)
1105}
1106
// BlockProfileRecord describes blocking events originating
1108// at a particular call sequence (stack trace).
1109type BlockProfileRecord struct {
1110	Count  int64
1111	Cycles int64
1112	StackRecord
1113}
1114
1115// BlockProfile returns n, the number of records in the current blocking profile.
1116// If len(p) >= n, BlockProfile copies the profile into p and returns n, true.
1117// If len(p) < n, BlockProfile does not change p and returns n, false.
1118//
1119// Most clients should use the [runtime/pprof] package or
1120// the [testing] package's -test.blockprofile flag instead
1121// of calling BlockProfile directly.
1122func BlockProfile(p []BlockProfileRecord) (n int, ok bool) {
1123	var m int
1124	n, ok = blockProfileInternal(len(p), func(r profilerecord.BlockProfileRecord) {
1125		copyBlockProfileRecord(&p[m], r)
1126		m++
1127	})
1128	if ok {
1129		expandFrames(p[:n])
1130	}
1131	return
1132}
1133
1134func expandFrames(p []BlockProfileRecord) {
1135	expandedStack := makeProfStack()
1136	for i := range p {
1137		cf := CallersFrames(p[i].Stack())
1138		j := 0
1139		for ; j < len(expandedStack); j++ {
1140			f, more := cf.Next()
1141			// f.PC is a "call PC", but later consumers will expect
1142			// "return PCs"
1143			expandedStack[j] = f.PC + 1
1144			if !more {
1145				break
1146			}
1147		}
1148		k := copy(p[i].Stack0[:], expandedStack[:j])
1149		clear(p[i].Stack0[k:])
1150	}
1151}
1152
1153// blockProfileInternal returns the number of records n in the profile. If there
// are no more than size records, copyFn is invoked for each record, and ok returns
1155// true.
1156func blockProfileInternal(size int, copyFn func(profilerecord.BlockProfileRecord)) (n int, ok bool) {
1157	lock(&profBlockLock)
1158	head := (*bucket)(bbuckets.Load())
1159	for b := head; b != nil; b = b.allnext {
1160		n++
1161	}
1162	if n <= size {
1163		ok = true
1164		for b := head; b != nil; b = b.allnext {
1165			bp := b.bp()
1166			r := profilerecord.BlockProfileRecord{
1167				Count:  int64(bp.count),
1168				Cycles: bp.cycles,
1169				Stack:  b.stk(),
1170			}
1171			// Prevent callers from having to worry about division by zero errors.
1172			// See discussion on http://golang.org/cl/299991.
1173			if r.Count == 0 {
1174				r.Count = 1
1175			}
1176			copyFn(r)
1177		}
1178	}
1179	unlock(&profBlockLock)
1180	return
1181}
1182
1183// copyBlockProfileRecord copies the sample values and call stack from src to dst.
1184// The call stack is copied as-is. The caller is responsible for handling inline
1185// expansion, needed when the call stack was collected with frame pointer unwinding.
1186func copyBlockProfileRecord(dst *BlockProfileRecord, src profilerecord.BlockProfileRecord) {
1187	dst.Count = src.Count
1188	dst.Cycles = src.Cycles
1189	if raceenabled {
1190		racewriterangepc(unsafe.Pointer(&dst.Stack0[0]), unsafe.Sizeof(dst.Stack0), getcallerpc(), abi.FuncPCABIInternal(BlockProfile))
1191	}
1192	if msanenabled {
1193		msanwrite(unsafe.Pointer(&dst.Stack0[0]), unsafe.Sizeof(dst.Stack0))
1194	}
1195	if asanenabled {
1196		asanwrite(unsafe.Pointer(&dst.Stack0[0]), unsafe.Sizeof(dst.Stack0))
1197	}
1198	// We just copy the stack here without inline expansion
1199	// (needed if frame pointer unwinding is used)
1200	// since this function is called under the profile lock,
1201	// and doing something that might allocate can violate lock ordering.
1202	i := copy(dst.Stack0[:], src.Stack)
1203	clear(dst.Stack0[i:])
1204}
1205
1206//go:linkname pprof_blockProfileInternal
1207func pprof_blockProfileInternal(p []profilerecord.BlockProfileRecord) (n int, ok bool) {
1208	return blockProfileInternal(len(p), func(r profilerecord.BlockProfileRecord) {
1209		p[0] = r
1210		p = p[1:]
1211	})
1212}
1213
1214// MutexProfile returns n, the number of records in the current mutex profile.
1215// If len(p) >= n, MutexProfile copies the profile into p and returns n, true.
1216// Otherwise, MutexProfile does not change p, and returns n, false.
1217//
1218// Most clients should use the [runtime/pprof] package
1219// instead of calling MutexProfile directly.
1220func MutexProfile(p []BlockProfileRecord) (n int, ok bool) {
1221	var m int
1222	n, ok = mutexProfileInternal(len(p), func(r profilerecord.BlockProfileRecord) {
1223		copyBlockProfileRecord(&p[m], r)
1224		m++
1225	})
1226	if ok {
1227		expandFrames(p[:n])
1228	}
1229	return
1230}
1231
1232// mutexProfileInternal returns the number of records n in the profile. If there
// are no more than size records, copyFn is invoked for each record, and ok returns
1234// true.
1235func mutexProfileInternal(size int, copyFn func(profilerecord.BlockProfileRecord)) (n int, ok bool) {
1236	lock(&profBlockLock)
1237	head := (*bucket)(xbuckets.Load())
1238	for b := head; b != nil; b = b.allnext {
1239		n++
1240	}
1241	if n <= size {
1242		ok = true
1243		for b := head; b != nil; b = b.allnext {
1244			bp := b.bp()
1245			r := profilerecord.BlockProfileRecord{
1246				Count:  int64(bp.count),
1247				Cycles: bp.cycles,
1248				Stack:  b.stk(),
1249			}
1250			copyFn(r)
1251		}
1252	}
1253	unlock(&profBlockLock)
1254	return
1255}
1256
1257//go:linkname pprof_mutexProfileInternal
1258func pprof_mutexProfileInternal(p []profilerecord.BlockProfileRecord) (n int, ok bool) {
1259	return mutexProfileInternal(len(p), func(r profilerecord.BlockProfileRecord) {
1260		p[0] = r
1261		p = p[1:]
1262	})
1263}
1264
1265// ThreadCreateProfile returns n, the number of records in the thread creation profile.
1266// If len(p) >= n, ThreadCreateProfile copies the profile into p and returns n, true.
1267// If len(p) < n, ThreadCreateProfile does not change p and returns n, false.
1268//
1269// Most clients should use the runtime/pprof package instead
1270// of calling ThreadCreateProfile directly.
1271func ThreadCreateProfile(p []StackRecord) (n int, ok bool) {
1272	return threadCreateProfileInternal(len(p), func(r profilerecord.StackRecord) {
1273		copy(p[0].Stack0[:], r.Stack)
1274		p = p[1:]
1275	})
1276}
1277
1278// threadCreateProfileInternal returns the number of records n in the profile.
// If there are no more than size records, copyFn is invoked for each record, and
1280// ok returns true.
1281func threadCreateProfileInternal(size int, copyFn func(profilerecord.StackRecord)) (n int, ok bool) {
1282	first := (*m)(atomic.Loadp(unsafe.Pointer(&allm)))
1283	for mp := first; mp != nil; mp = mp.alllink {
1284		n++
1285	}
1286	if n <= size {
1287		ok = true
1288		for mp := first; mp != nil; mp = mp.alllink {
1289			r := profilerecord.StackRecord{Stack: mp.createstack[:]}
1290			copyFn(r)
1291		}
1292	}
1293	return
1294}
1295
1296//go:linkname pprof_threadCreateInternal
1297func pprof_threadCreateInternal(p []profilerecord.StackRecord) (n int, ok bool) {
1298	return threadCreateProfileInternal(len(p), func(r profilerecord.StackRecord) {
1299		p[0] = r
1300		p = p[1:]
1301	})
1302}
1303
1304//go:linkname pprof_goroutineProfileWithLabels
1305func pprof_goroutineProfileWithLabels(p []profilerecord.StackRecord, labels []unsafe.Pointer) (n int, ok bool) {
1306	return goroutineProfileWithLabels(p, labels)
1307}
1308
1309// labels may be nil. If labels is non-nil, it must have the same length as p.
1310func goroutineProfileWithLabels(p []profilerecord.StackRecord, labels []unsafe.Pointer) (n int, ok bool) {
1311	if labels != nil && len(labels) != len(p) {
1312		labels = nil
1313	}
1314
1315	return goroutineProfileWithLabelsConcurrent(p, labels)
1316}
1317
1318var goroutineProfile = struct {
1319	sema    uint32
1320	active  bool
1321	offset  atomic.Int64
1322	records []profilerecord.StackRecord
1323	labels  []unsafe.Pointer
1324}{
1325	sema: 1,
1326}
1327
1328// goroutineProfileState indicates the status of a goroutine's stack for the
1329// current in-progress goroutine profile. Goroutines' stacks are initially
1330// "Absent" from the profile, and end up "Satisfied" by the time the profile is
1331// complete. While a goroutine's stack is being captured, its
1332// goroutineProfileState will be "InProgress" and it will not be able to run
1333// until the capture completes and the state moves to "Satisfied".
1334//
// Some goroutines (the finalizer goroutine, which at various times can be
// either a "system" or a "user" goroutine; the goroutine that is
// coordinating the profile; and any goroutines created during the profile)
// move directly to the "Satisfied" state.
1339type goroutineProfileState uint32
1340
1341const (
1342	goroutineProfileAbsent goroutineProfileState = iota
1343	goroutineProfileInProgress
1344	goroutineProfileSatisfied
1345)
1346
1347type goroutineProfileStateHolder atomic.Uint32
1348
1349func (p *goroutineProfileStateHolder) Load() goroutineProfileState {
1350	return goroutineProfileState((*atomic.Uint32)(p).Load())
1351}
1352
1353func (p *goroutineProfileStateHolder) Store(value goroutineProfileState) {
1354	(*atomic.Uint32)(p).Store(uint32(value))
1355}
1356
1357func (p *goroutineProfileStateHolder) CompareAndSwap(old, new goroutineProfileState) bool {
1358	return (*atomic.Uint32)(p).CompareAndSwap(uint32(old), uint32(new))
1359}
1360
1361func goroutineProfileWithLabelsConcurrent(p []profilerecord.StackRecord, labels []unsafe.Pointer) (n int, ok bool) {
1362	if len(p) == 0 {
1363		// An empty slice is obviously too small. Return a rough
1364		// allocation estimate without bothering to STW. As long as
1365		// this is close, then we'll only need to STW once (on the next
1366		// call).
1367		return int(gcount()), false
1368	}
1369
1370	semacquire(&goroutineProfile.sema)
1371
1372	ourg := getg()
1373
1374	pcbuf := makeProfStack() // see saveg() for explanation
1375	stw := stopTheWorld(stwGoroutineProfile)
1376	// Using gcount while the world is stopped should give us a consistent view
1377	// of the number of live goroutines, minus the number of goroutines that are
1378	// alive and permanently marked as "system". But to make this count agree
1379	// with what we'd get from isSystemGoroutine, we need special handling for
1380	// goroutines that can vary between user and system to ensure that the count
1381	// doesn't change during the collection. So, check the finalizer goroutine
1382	// in particular.
1383	n = int(gcount())
1384	if fingStatus.Load()&fingRunningFinalizer != 0 {
1385		n++
1386	}
1387
1388	if n > len(p) {
1389		// There's not enough space in p to store the whole profile, so (per the
1390		// contract of runtime.GoroutineProfile) we're not allowed to write to p
1391		// at all and must return n, false.
1392		startTheWorld(stw)
1393		semrelease(&goroutineProfile.sema)
1394		return n, false
1395	}
1396
1397	// Save current goroutine.
1398	sp := getcallersp()
1399	pc := getcallerpc()
1400	systemstack(func() {
1401		saveg(pc, sp, ourg, &p[0], pcbuf)
1402	})
1403	if labels != nil {
1404		labels[0] = ourg.labels
1405	}
1406	ourg.goroutineProfiled.Store(goroutineProfileSatisfied)
1407	goroutineProfile.offset.Store(1)
1408
1409	// Prepare for all other goroutines to enter the profile. Aside from ourg,
1410	// every goroutine struct in the allgs list has its goroutineProfiled field
1411	// cleared. Any goroutine created from this point on (while
1412	// goroutineProfile.active is set) will start with its goroutineProfiled
1413	// field set to goroutineProfileSatisfied.
1414	goroutineProfile.active = true
1415	goroutineProfile.records = p
1416	goroutineProfile.labels = labels
1417	// The finalizer goroutine needs special handling because it can vary over
1418	// time between being a user goroutine (eligible for this profile) and a
1419	// system goroutine (to be excluded). Pick one before restarting the world.
1420	if fing != nil {
1421		fing.goroutineProfiled.Store(goroutineProfileSatisfied)
1422		if readgstatus(fing) != _Gdead && !isSystemGoroutine(fing, false) {
1423			doRecordGoroutineProfile(fing, pcbuf)
1424		}
1425	}
1426	startTheWorld(stw)
1427
1428	// Visit each goroutine that existed as of the startTheWorld call above.
1429	//
1430	// New goroutines may not be in this list, but we didn't want to know about
1431	// them anyway. If they do appear in this list (via reusing a dead goroutine
1432	// struct, or racing to launch between the world restarting and us getting
1433	// the list), they will already have their goroutineProfiled field set to
1434	// goroutineProfileSatisfied before their state transitions out of _Gdead.
1435	//
1436	// Any goroutine that the scheduler tries to execute concurrently with this
1437	// call will start by adding itself to the profile (before the act of
1438	// executing can cause any changes in its stack).
1439	forEachGRace(func(gp1 *g) {
1440		tryRecordGoroutineProfile(gp1, pcbuf, Gosched)
1441	})
1442
1443	stw = stopTheWorld(stwGoroutineProfileCleanup)
1444	endOffset := goroutineProfile.offset.Swap(0)
1445	goroutineProfile.active = false
1446	goroutineProfile.records = nil
1447	goroutineProfile.labels = nil
1448	startTheWorld(stw)
1449
1450	// Restore the invariant that every goroutine struct in allgs has its
1451	// goroutineProfiled field cleared.
1452	forEachGRace(func(gp1 *g) {
1453		gp1.goroutineProfiled.Store(goroutineProfileAbsent)
1454	})
1455
1456	if raceenabled {
1457		raceacquire(unsafe.Pointer(&labelSync))
1458	}
1459
1460	if n != int(endOffset) {
1461		// It's a big surprise that the number of goroutines changed while we
1462		// were collecting the profile. But probably better to return a
1463		// truncated profile than to crash the whole process.
1464		//
1465		// For instance, needm moves a goroutine out of the _Gdead state and so
1466		// might be able to change the goroutine count without interacting with
1467		// the scheduler. For code like that, the race windows are small and the
1468		// combination of features is uncommon, so it's hard to be (and remain)
1469		// sure we've caught them all.
1470	}
1471
1472	semrelease(&goroutineProfile.sema)
1473	return n, true
1474}
1475
1476// tryRecordGoroutineProfileWB asserts that write barriers are allowed and calls
1477// tryRecordGoroutineProfile.
1478//
1479//go:yeswritebarrierrec
1480func tryRecordGoroutineProfileWB(gp1 *g) {
1481	if getg().m.p.ptr() == nil {
1482		throw("no P available, write barriers are forbidden")
1483	}
1484	tryRecordGoroutineProfile(gp1, nil, osyield)
1485}
1486
1487// tryRecordGoroutineProfile ensures that gp1 has the appropriate representation
1488// in the current goroutine profile: either that it should not be profiled, or
1489// that a snapshot of its call stack and labels are now in the profile.
1490func tryRecordGoroutineProfile(gp1 *g, pcbuf []uintptr, yield func()) {
1491	if readgstatus(gp1) == _Gdead {
1492		// Dead goroutines should not appear in the profile. Goroutines that
1493		// start while profile collection is active will get goroutineProfiled
1494		// set to goroutineProfileSatisfied before transitioning out of _Gdead,
1495		// so here we check _Gdead first.
1496		return
1497	}
1498	if isSystemGoroutine(gp1, true) {
1499		// System goroutines should not appear in the profile. (The finalizer
1500		// goroutine is marked as "already profiled".)
1501		return
1502	}
1503
1504	for {
1505		prev := gp1.goroutineProfiled.Load()
1506		if prev == goroutineProfileSatisfied {
1507			// This goroutine is already in the profile (or is new since the
1508			// start of collection, so shouldn't appear in the profile).
1509			break
1510		}
1511		if prev == goroutineProfileInProgress {
1512			// Something else is adding gp1 to the goroutine profile right now.
1513			// Give that a moment to finish.
1514			yield()
1515			continue
1516		}
1517
1518		// While we have gp1.goroutineProfiled set to
1519		// goroutineProfileInProgress, gp1 may appear _Grunnable but will not
1520		// actually be able to run. Disable preemption for ourselves, to make
1521		// sure we finish profiling gp1 right away instead of leaving it stuck
1522		// in this limbo.
1523		mp := acquirem()
1524		if gp1.goroutineProfiled.CompareAndSwap(goroutineProfileAbsent, goroutineProfileInProgress) {
1525			doRecordGoroutineProfile(gp1, pcbuf)
1526			gp1.goroutineProfiled.Store(goroutineProfileSatisfied)
1527		}
1528		releasem(mp)
1529	}
1530}
1531
1532// doRecordGoroutineProfile writes gp1's call stack and labels to an in-progress
1533// goroutine profile. Preemption is disabled.
1534//
1535// This may be called via tryRecordGoroutineProfile in two ways: by the
1536// goroutine that is coordinating the goroutine profile (running on its own
1537// stack), or from the scheduler in preparation to execute gp1 (running on the
1538// system stack).
1539func doRecordGoroutineProfile(gp1 *g, pcbuf []uintptr) {
1540	if readgstatus(gp1) == _Grunning {
1541		print("doRecordGoroutineProfile gp1=", gp1.goid, "\n")
1542		throw("cannot read stack of running goroutine")
1543	}
1544
1545	offset := int(goroutineProfile.offset.Add(1)) - 1
1546
1547	if offset >= len(goroutineProfile.records) {
1548		// Should be impossible, but better to return a truncated profile than
1549		// to crash the entire process at this point. Instead, deal with it in
1550		// goroutineProfileWithLabelsConcurrent where we have more context.
1551		return
1552	}
1553
1554	// saveg calls gentraceback, which may call cgo traceback functions. When
1555	// called from the scheduler, this is on the system stack already so
1556	// traceback.go:cgoContextPCs will avoid calling back into the scheduler.
1557	//
1558	// When called from the goroutine coordinating the profile, we still have
1559	// set gp1.goroutineProfiled to goroutineProfileInProgress and so are still
1560	// preventing it from being truly _Grunnable. So we'll use the system stack
1561	// to avoid schedule delays.
1562	systemstack(func() { saveg(^uintptr(0), ^uintptr(0), gp1, &goroutineProfile.records[offset], pcbuf) })
1563
1564	if goroutineProfile.labels != nil {
1565		goroutineProfile.labels[offset] = gp1.labels
1566	}
1567}
1568
1569func goroutineProfileWithLabelsSync(p []profilerecord.StackRecord, labels []unsafe.Pointer) (n int, ok bool) {
1570	gp := getg()
1571
1572	isOK := func(gp1 *g) bool {
1573		// Checking isSystemGoroutine here makes GoroutineProfile
1574		// consistent with both NumGoroutine and Stack.
1575		return gp1 != gp && readgstatus(gp1) != _Gdead && !isSystemGoroutine(gp1, false)
1576	}
1577
1578	pcbuf := makeProfStack() // see saveg() for explanation
1579	stw := stopTheWorld(stwGoroutineProfile)
1580
1581	// World is stopped, no locking required.
1582	n = 1
1583	forEachGRace(func(gp1 *g) {
1584		if isOK(gp1) {
1585			n++
1586		}
1587	})
1588
1589	if n <= len(p) {
1590		ok = true
1591		r, lbl := p, labels
1592
1593		// Save current goroutine.
1594		sp := getcallersp()
1595		pc := getcallerpc()
1596		systemstack(func() {
1597			saveg(pc, sp, gp, &r[0], pcbuf)
1598		})
1599		r = r[1:]
1600
1601		// If we have a place to put our goroutine labelmap, insert it there.
1602		if labels != nil {
1603			lbl[0] = gp.labels
1604			lbl = lbl[1:]
1605		}
1606
1607		// Save other goroutines.
1608		forEachGRace(func(gp1 *g) {
1609			if !isOK(gp1) {
1610				return
1611			}
1612
1613			if len(r) == 0 {
1614				// Should be impossible, but better to return a
1615				// truncated profile than to crash the entire process.
1616				return
1617			}
1618			// saveg calls gentraceback, which may call cgo traceback functions.
1619			// The world is stopped, so it cannot use cgocall (which will be
1620			// blocked at exitsyscall). Do it on the system stack so it won't
			// call into the scheduler (see traceback.go:cgoContextPCs).
1622			systemstack(func() { saveg(^uintptr(0), ^uintptr(0), gp1, &r[0], pcbuf) })
1623			if labels != nil {
1624				lbl[0] = gp1.labels
1625				lbl = lbl[1:]
1626			}
1627			r = r[1:]
1628		})
1629	}
1630
1631	if raceenabled {
1632		raceacquire(unsafe.Pointer(&labelSync))
1633	}
1634
1635	startTheWorld(stw)
1636	return n, ok
1637}
1638
1639// GoroutineProfile returns n, the number of records in the active goroutine stack profile.
1640// If len(p) >= n, GoroutineProfile copies the profile into p and returns n, true.
1641// If len(p) < n, GoroutineProfile does not change p and returns n, false.
1642//
1643// Most clients should use the [runtime/pprof] package instead
1644// of calling GoroutineProfile directly.
1645func GoroutineProfile(p []StackRecord) (n int, ok bool) {
1646	records := make([]profilerecord.StackRecord, len(p))
1647	n, ok = goroutineProfileInternal(records)
1648	if !ok {
1649		return
1650	}
1651	for i, mr := range records[0:n] {
1652		copy(p[i].Stack0[:], mr.Stack)
1653	}
1654	return
1655}
1656
1657func goroutineProfileInternal(p []profilerecord.StackRecord) (n int, ok bool) {
1658	return goroutineProfileWithLabels(p, nil)
1659}
1660
1661func saveg(pc, sp uintptr, gp *g, r *profilerecord.StackRecord, pcbuf []uintptr) {
1662	// To reduce memory usage, we want to allocate a r.Stack that is just big
1663	// enough to hold gp's stack trace. Naively we might achieve this by
1664	// recording our stack trace into mp.profStack, and then allocating a
1665	// r.Stack of the right size. However, mp.profStack is also used for
1666	// allocation profiling, so it could get overwritten if the slice allocation
1667	// gets profiled. So instead we record the stack trace into a temporary
1668	// pcbuf which is usually given to us by our caller. When it's not, we have
1669	// to allocate one here. This will only happen for goroutines that were in a
1670	// syscall when the goroutine profile started or for goroutines that manage
1671	// to execute before we finish iterating over all the goroutines.
1672	if pcbuf == nil {
1673		pcbuf = makeProfStack()
1674	}
1675
1676	var u unwinder
1677	u.initAt(pc, sp, 0, gp, unwindSilentErrors)
1678	n := tracebackPCs(&u, 0, pcbuf)
1679	r.Stack = make([]uintptr, n)
1680	copy(r.Stack, pcbuf)
1681}
1682
1683// Stack formats a stack trace of the calling goroutine into buf
1684// and returns the number of bytes written to buf.
1685// If all is true, Stack formats stack traces of all other goroutines
1686// into buf after the trace for the current goroutine.
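//
// For example, to dump the stacks of all goroutines to standard error
// (an illustrative sketch; the buffer size is a guess and long traces
// may be truncated):
//
//	buf := make([]byte, 1<<20)
//	n := runtime.Stack(buf, true)
//	os.Stderr.Write(buf[:n])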
1687func Stack(buf []byte, all bool) int {
1688	var stw worldStop
1689	if all {
1690		stw = stopTheWorld(stwAllGoroutinesStack)
1691	}
1692
1693	n := 0
1694	if len(buf) > 0 {
1695		gp := getg()
1696		sp := getcallersp()
1697		pc := getcallerpc()
1698		systemstack(func() {
1699			g0 := getg()
1700			// Force traceback=1 to override GOTRACEBACK setting,
1701			// so that Stack's results are consistent.
1702			// GOTRACEBACK is only about crash dumps.
1703			g0.m.traceback = 1
1704			g0.writebuf = buf[0:0:len(buf)]
1705			goroutineheader(gp)
1706			traceback(pc, sp, 0, gp)
1707			if all {
1708				tracebackothers(gp)
1709			}
1710			g0.m.traceback = 0
1711			n = len(g0.writebuf)
1712			g0.writebuf = nil
1713		})
1714	}
1715
1716	if all {
1717		startTheWorld(stw)
1718	}
1719	return n
1720}
1721