1// Copyright 2023 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Go execution tracer.
6// The tracer captures a wide range of execution events like goroutine
7// creation/blocking/unblocking, syscall enter/exit/block, GC-related events,
// changes of heap size, processor start/stop, etc., and writes them to a buffer
// in a compact form. A nanosecond-precision timestamp and a stack trace are
// captured for most events.
11//
// Tracer invariants (to keep the synchronization sound):
13// - An m that has a trace buffer must be on either the allm or sched.freem lists.
14// - Any trace buffer mutation must either be happening in traceAdvance or between
15//   a traceAcquire and a subsequent traceRelease.
16// - traceAdvance cannot return until the previous generation's buffers are all flushed.
17//
18// See https://go.dev/issue/60773 for a link to the full design.
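//
// For illustration only (this sketch is not part of the runtime), most clients
// drive the tracer through the runtime/trace package rather than calling
// StartTrace/ReadTrace/StopTrace directly; "out.trace" is just an example name:
//
//	f, err := os.Create("out.trace")
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer f.Close()
//	if err := trace.Start(f); err != nil { // runtime/trace
//		log.Fatal(err)
//	}
//	defer trace.Stop()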
19
20package runtime
21
22import (
23	"internal/runtime/atomic"
24	"unsafe"
25)
26
27// Trace state.
28
29// trace is global tracing context.
30var trace struct {
31	// trace.lock must only be acquired on the system stack where
32	// stack splits cannot happen while it is held.
33	lock mutex
34
35	// Trace buffer management.
36	//
	// First we check the empty list for any free buffers. If there are none,
	// buffers are allocated directly from the OS. Once they're filled up and/or
39	// flushed, they end up on the full queue for trace.gen%2.
40	//
41	// The trace reader takes buffers off the full list one-by-one and
42	// places them into reading until they're finished being read from.
43	// Then they're placed onto the empty list.
44	//
45	// Protected by trace.lock.
46	reading       *traceBuf // buffer currently handed off to user
47	empty         *traceBuf // stack of empty buffers
48	full          [2]traceBufQueue
49	workAvailable atomic.Bool
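
	// Roughly, a buffer's lifecycle looks like this (a sketch of the flow
	// described above, not extra state):
	//
	//	empty list -> per-M write buffer -> full[gen%2] -> reading -> empty list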
50
51	// State for the trace reader goroutine.
52	//
53	// Protected by trace.lock.
54	readerGen     atomic.Uintptr // the generation the reader is currently reading for
55	flushedGen    atomic.Uintptr // the last completed generation
56	headerWritten bool           // whether ReadTrace has emitted trace header
57
58	// doneSema is used to synchronize the reader and traceAdvance. Specifically,
59	// it notifies traceAdvance that the reader is done with a generation.
60	// Both semaphores are 0 by default (so, acquires block). traceAdvance
61	// attempts to acquire for gen%2 after flushing the last buffers for gen.
62	// Meanwhile the reader releases the sema for gen%2 when it has finished
63	// processing gen.
64	doneSema [2]uint32
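
	// A sketch of the handshake for a given generation gen (the real code is
	// in traceAdvance and readTrace0 in this file):
	//
	//	// trace reader, once it has read all of gen's buffers:
	//	semrelease(&trace.doneSema[gen%2])
	//
	//	// traceAdvance, after flushing gen's last buffers:
	//	semacquire(&trace.doneSema[gen%2]) // blocks until the reader is done with gen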
65
66	// Trace data tables for deduplicating data going into the trace.
67	// There are 2 of each: one for gen%2, one for 1-gen%2.
68	stackTab  [2]traceStackTable  // maps stack traces to unique ids
69	stringTab [2]traceStringTable // maps strings to unique ids
70	typeTab   [2]traceTypeTable   // maps type pointers to unique ids
71
72	// cpuLogRead accepts CPU profile samples from the signal handler where
73	// they're generated. There are two profBufs here: one for gen%2, one for
74	// 1-gen%2. These profBufs use a three-word header to hold the IDs of the P, G,
75	// and M (respectively) that were active at the time of the sample. Because
76	// profBuf uses a record with all zeros in its header to indicate overflow,
	// we ensure the P field is always non-zero: the ID of a real P is stored
	// starting at bit 1, with bit 0 set. Samples that arrive while no P is
	// running (such as near syscalls) set the first header field to 0b10.
	// This careful handling of the first header field allows us to store the ID
	// of the active G directly in the second field, even though that will be 0
82	// when sampling g0.
83	//
84	// Initialization and teardown of these fields is protected by traceAdvanceSema.
85	cpuLogRead  [2]*profBuf
86	signalLock  atomic.Uint32              // protects use of the following member, only usable in signal handlers
87	cpuLogWrite [2]atomic.Pointer[profBuf] // copy of cpuLogRead for use in signal handlers, set without signalLock
88	cpuSleep    *wakeableSleep
89	cpuLogDone  <-chan struct{}
90	cpuBuf      [2]*traceBuf
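
	// For example, the first header word follows the convention above (an
	// illustrative sketch, not the actual sampling code; hdr, pp, and the
	// decoded P ID are placeholder names):
	//
	//	hdr[0] = uint64(pp.id)<<1 | 0b1 // sample taken while running on P pp
	//	hdr[0] = 0b10                   // sample taken with no P (e.g. near a syscall)
	//	// decoding: if hdr[0]&0b1 != 0, the P's ID is hdr[0]>>1.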
91
92	reader atomic.Pointer[g] // goroutine that called ReadTrace, or nil
93
94	// Fast mappings from enumerations to string IDs that are prepopulated
95	// in the trace.
96	markWorkerLabels [2][len(gcMarkWorkerModeStrings)]traceArg
97	goStopReasons    [2][len(traceGoStopReasonStrings)]traceArg
98	goBlockReasons   [2][len(traceBlockReasonStrings)]traceArg
99
100	// enabled indicates whether tracing is enabled, but it is only an optimization,
101	// NOT the source of truth on whether tracing is enabled. Tracing is only truly
102	// enabled if gen != 0. This is used as an optimistic fast path check.
103	//
104	// Transitioning this value from true -> false is easy (once gen is 0)
105	// because it's OK for enabled to have a stale "true" value. traceAcquire will
106	// always double-check gen.
107	//
108	// Transitioning this value from false -> true is harder. We need to make sure
109	// this is observable as true strictly before gen != 0. To maintain this invariant
110	// we only make this transition with the world stopped and use the store to gen
111	// as a publication barrier.
112	enabled bool
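
	// A sketch of the fast-path pattern this protocol supports (the real
	// checks live in traceEnabled and traceAcquire):
	//
	//	if !trace.enabled { // unsynchronized; a stale value can only be a stale true
	//		return          // tracing is definitely off
	//	}
	//	if trace.gen.Load() == 0 { // authoritative double-check
	//		return
	//	}
	//	// ... tracing is on ...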
113
114	// enabledWithAllocFree is set if debug.traceallocfree is != 0 when tracing begins.
115	// It follows the same synchronization protocol as enabled.
116	enabledWithAllocFree bool
117
118	// Trace generation counter.
119	gen            atomic.Uintptr
120	lastNonZeroGen uintptr // last non-zero value of gen
121
	// shutdown is set when we are waiting for the trace reader to finish after setting gen to 0.
123	//
124	// Writes protected by trace.lock.
125	shutdown atomic.Bool
126
127	// Number of goroutines in syscall exiting slow path.
128	exitingSyscall atomic.Int32
129
130	// seqGC is the sequence counter for GC begin/end.
131	//
132	// Mutated only during stop-the-world.
133	seqGC uint64
134
135	// minPageHeapAddr is the minimum address of the page heap when tracing started.
136	minPageHeapAddr uint64
137
138	// debugMalloc is the value of debug.malloc before tracing began.
139	debugMalloc bool
140}
141
142// Trace public API.
143
144var (
145	traceAdvanceSema  uint32 = 1
146	traceShutdownSema uint32 = 1
147)
148
149// StartTrace enables tracing for the current process.
150// While tracing, the data will be buffered and available via [ReadTrace].
151// StartTrace returns an error if tracing is already enabled.
152// Most clients should use the [runtime/trace] package or the [testing] package's
153// -test.trace flag instead of calling StartTrace directly.
154func StartTrace() error {
155	if traceEnabled() || traceShuttingDown() {
156		return errorString("tracing is already enabled")
157	}
158	// Block until cleanup of the last trace is done.
159	semacquire(&traceShutdownSema)
160	semrelease(&traceShutdownSema)
161
162	// Hold traceAdvanceSema across trace start, since we'll want it on
163	// the other side of tracing being enabled globally.
164	semacquire(&traceAdvanceSema)
165
166	// Initialize CPU profile -> trace ingestion.
167	traceInitReadCPU()
168
169	// Compute the first generation for this StartTrace.
170	//
171	// Note: we start from the last non-zero generation rather than 1 so we
172	// can avoid resetting all the arrays indexed by gen%2 or gen%3. There's
173	// more than one of each per m, p, and goroutine.
174	firstGen := traceNextGen(trace.lastNonZeroGen)
175
176	// Reset GC sequencer.
177	trace.seqGC = 1
178
179	// Reset trace reader state.
180	trace.headerWritten = false
181	trace.readerGen.Store(firstGen)
182	trace.flushedGen.Store(0)
183
184	// Register some basic strings in the string tables.
185	traceRegisterLabelsAndReasons(firstGen)
186
187	// Stop the world.
188	//
189	// The purpose of stopping the world is to make sure that no goroutine is in a
190	// context where it could emit an event by bringing all goroutines to a safe point
191	// with no opportunity to transition.
192	//
	// The exceptions to this rule are goroutines that are concurrently exiting a syscall.
194	// Those will all be forced into the syscalling slow path, and we'll just make sure
195	// that we don't observe any goroutines in that critical section before starting
196	// the world again.
197	//
198	// A good follow-up question to this is why stopping the world is necessary at all
	// given that we have traceAcquire and traceRelease. Unfortunately, those only help
	// us when tracing is already active (for performance, the tracing seqlock is left
	// untouched when tracing is off). The main issue here is subtle: we're going to
202	// want to obtain a correct starting status for each goroutine, but there are windows
203	// of time in which we could read and emit an incorrect status. Specifically:
204	//
205	//	trace := traceAcquire()
206	//  // <----> problem window
207	//	casgstatus(gp, _Gwaiting, _Grunnable)
208	//	if trace.ok() {
209	//		trace.GoUnpark(gp, 2)
210	//		traceRelease(trace)
211	//	}
212	//
213	// More precisely, if we readgstatus for a gp while another goroutine is in the problem
214	// window and that goroutine didn't observe that tracing had begun, then we might write
215	// a GoStatus(GoWaiting) event for that goroutine, but it won't trace an event marking
216	// the transition from GoWaiting to GoRunnable. The trace will then be broken, because
217	// future events will be emitted assuming the tracer sees GoRunnable.
218	//
219	// In short, what we really need here is to make sure that the next time *any goroutine*
220	// hits a traceAcquire, it sees that the trace is enabled.
221	//
222	// Note also that stopping the world is necessary to make sure sweep-related events are
223	// coherent. Since the world is stopped and sweeps are non-preemptible, we can never start
224	// the world and see an unpaired sweep 'end' event. Other parts of the tracer rely on this.
225	stw := stopTheWorld(stwStartTrace)
226
227	// Prevent sysmon from running any code that could generate events.
228	lock(&sched.sysmonlock)
229
230	// Grab the minimum page heap address. All Ps are stopped, so it's safe to read this since
231	// nothing can allocate heap memory.
232	trace.minPageHeapAddr = uint64(mheap_.pages.inUse.ranges[0].base.addr())
233
234	// Reset mSyscallID on all Ps while we have them stationary and the trace is disabled.
235	for _, pp := range allp {
236		pp.trace.mSyscallID = -1
237	}
238
239	// Start tracing.
240	//
241	// Set trace.enabled. This is *very* subtle. We need to maintain the invariant that if
242	// trace.gen != 0, then trace.enabled is always observed as true. Simultaneously, for
243	// performance, we need trace.enabled to be read without any synchronization.
244	//
	// We ensure this is safe by stopping the world, which acts as a global barrier on almost
	// every M, and explicitly synchronize with any other Ms that could be running concurrently
	// with us. Today, there are only two such cases:
	// - sysmon, which we synchronize with by acquiring sysmonlock.
249	// - goroutines exiting syscalls, which we synchronize with via trace.exitingSyscall.
250	//
251	// After trace.gen is updated, other Ms may start creating trace buffers and emitting
252	// data into them.
253	trace.enabled = true
254	if debug.traceallocfree.Load() != 0 {
255		// Enable memory events since the GODEBUG is set.
256		trace.debugMalloc = debug.malloc
257		trace.enabledWithAllocFree = true
258		debug.malloc = true
259	}
260	trace.gen.Store(firstGen)
261
262	// Wait for exitingSyscall to drain.
263	//
264	// It may not monotonically decrease to zero, but in the limit it will always become
265	// zero because the world is stopped and there are no available Ps for syscall-exited
266	// goroutines to run on.
267	//
268	// Because we set gen before checking this, and because exitingSyscall is always incremented
	// *before* traceAcquire (which checks gen), we can be certain that once exitingSyscall is zero,
	// any goroutine that goes to exit a syscall from then on *must* observe the new gen as
271	// well as trace.enabled being set to true.
272	//
273	// The critical section on each goroutine here is going to be quite short, so the likelihood
274	// that we observe a zero value is high.
275	for trace.exitingSyscall.Load() != 0 {
276		osyield()
277	}
278
279	// Record some initial pieces of information.
280	//
281	// N.B. This will also emit a status event for this goroutine.
282	tl := traceAcquire()
283	tl.Gomaxprocs(gomaxprocs)  // Get this as early in the trace as possible. See comment in traceAdvance.
284	tl.STWStart(stwStartTrace) // We didn't trace this above, so trace it now.
285
286	// Record the fact that a GC is active, if applicable.
287	if gcphase == _GCmark || gcphase == _GCmarktermination {
288		tl.GCActive()
289	}
290
291	// Dump a snapshot of memory, if enabled.
292	if trace.enabledWithAllocFree {
293		traceSnapshotMemory(firstGen)
294	}
295
296	// Record the heap goal so we have it at the very beginning of the trace.
297	tl.HeapGoal()
298
299	// Make sure a ProcStatus is emitted for every P, while we're here.
300	for _, pp := range allp {
301		tl.writer().writeProcStatusForP(pp, pp == tl.mp.p.ptr()).end()
302	}
303	traceRelease(tl)
304
305	unlock(&sched.sysmonlock)
306	startTheWorld(stw)
307
308	traceStartReadCPU()
309	traceAdvancer.start()
310
311	semrelease(&traceAdvanceSema)
312	return nil
313}
314
315// StopTrace stops tracing, if it was previously enabled.
316// StopTrace only returns after all the reads for the trace have completed.
317func StopTrace() {
318	traceAdvance(true)
319}
320
321// traceAdvance moves tracing to the next generation, and cleans up the current generation,
322// ensuring that it's flushed out before returning. If stopTrace is true, it disables tracing
323// altogether instead of advancing to the next generation.
324//
325// traceAdvanceSema must not be held.
326//
327// traceAdvance is called by golang.org/x/exp/trace using linkname.
328//
329//go:linkname traceAdvance
330func traceAdvance(stopTrace bool) {
331	semacquire(&traceAdvanceSema)
332
333	// Get the gen that we're advancing from. In this function we don't really care much
334	// about the generation we're advancing _into_ since we'll do all the cleanup in this
335	// generation for the next advancement.
336	gen := trace.gen.Load()
337	if gen == 0 {
		// We may end up here if traceAdvance is called concurrently with StopTrace.
339		semrelease(&traceAdvanceSema)
340		return
341	}
342
343	// Write an EvFrequency event for this generation.
344	//
345	// N.B. This may block for quite a while to get a good frequency estimate, so make sure we do
346	// this here and not e.g. on the trace reader.
347	traceFrequency(gen)
348
349	// Collect all the untraced Gs.
350	type untracedG struct {
351		gp           *g
352		goid         uint64
353		mid          int64
354		stackID      uint64
355		status       uint32
356		waitreason   waitReason
357		inMarkAssist bool
358	}
359	var untracedGs []untracedG
360	forEachGRace(func(gp *g) {
361		// Make absolutely sure all Gs are ready for the next
362		// generation. We need to do this even for dead Gs because
		// they may come alive with a new identity, and their status-traced
		// bookkeeping might end up being stale.
365		// We may miss totally new goroutines, but they'll always
366		// have clean bookkeeping.
367		gp.trace.readyNextGen(gen)
368		// If the status was traced, nothing else to do.
369		if gp.trace.statusWasTraced(gen) {
370			return
371		}
372		// Scribble down information about this goroutine.
373		ug := untracedG{gp: gp, mid: -1}
374		systemstack(func() {
375			me := getg().m.curg
376			// We don't have to handle this G status transition because we
377			// already eliminated ourselves from consideration above.
378			casGToWaitingForGC(me, _Grunning, waitReasonTraceGoroutineStatus)
379			// We need to suspend and take ownership of the G to safely read its
380			// goid. Note that we can't actually emit the event at this point
381			// because we might stop the G in a window where it's unsafe to write
382			// events based on the G's status. We need the global trace buffer flush
383			// coming up to make sure we're not racing with the G.
384			//
385			// It should be very unlikely that we try to preempt a running G here.
386			// The only situation that we might is that we're racing with a G
387			// that's running for the first time in this generation. Therefore,
388			// this should be relatively fast.
389			s := suspendG(gp)
390			if !s.dead {
391				ug.goid = s.g.goid
392				if s.g.m != nil {
393					ug.mid = int64(s.g.m.procid)
394				}
395				ug.status = readgstatus(s.g) &^ _Gscan
396				ug.waitreason = s.g.waitreason
397				ug.inMarkAssist = s.g.inMarkAssist
398				ug.stackID = traceStack(0, gp, gen)
399			}
400			resumeG(s)
401			casgstatus(me, _Gwaiting, _Grunning)
402		})
403		if ug.goid != 0 {
404			untracedGs = append(untracedGs, ug)
405		}
406	})
407
408	if !stopTrace {
409		// Re-register runtime goroutine labels and stop/block reasons.
410		traceRegisterLabelsAndReasons(traceNextGen(gen))
411	}
412
413	// Now that we've done some of the heavy stuff, prevent the world from stopping.
414	// This is necessary to ensure the consistency of the STW events. If we're feeling
415	// adventurous we could lift this restriction and add a STWActive event, but the
416	// cost of maintaining this consistency is low. We're not going to hold this semaphore
417	// for very long and most STW periods are very short.
418	// Once we hold worldsema, prevent preemption as well so we're not interrupted partway
419	// through this. We want to get this done as soon as possible.
420	semacquire(&worldsema)
421	mp := acquirem()
422
423	// Advance the generation or stop the trace.
424	trace.lastNonZeroGen = gen
425	if stopTrace {
426		systemstack(func() {
427			// Ordering is important here. Set shutdown first, then disable tracing,
428			// so that conditions like (traceEnabled() || traceShuttingDown()) have
429			// no opportunity to be false. Hold the trace lock so this update appears
430			// atomic to the trace reader.
431			lock(&trace.lock)
432			trace.shutdown.Store(true)
433			trace.gen.Store(0)
434			unlock(&trace.lock)
435
436			// Clear trace.enabled. It is totally OK for this value to be stale,
437			// because traceAcquire will always double-check gen.
438			trace.enabled = false
439		})
440	} else {
441		trace.gen.Store(traceNextGen(gen))
442	}
443
444	// Emit a ProcsChange event so we have one on record for each generation.
445	// Let's emit it as soon as possible so that downstream tools can rely on the value
446	// being there fairly soon in a generation.
447	//
448	// It's important that we do this before allowing stop-the-worlds again,
449	// because the procs count could change.
450	if !stopTrace {
451		tl := traceAcquire()
452		tl.Gomaxprocs(gomaxprocs)
453		traceRelease(tl)
454	}
455
456	// Emit a GCActive event in the new generation if necessary.
457	//
458	// It's important that we do this before allowing stop-the-worlds again,
459	// because that could emit global GC-related events.
460	if !stopTrace && (gcphase == _GCmark || gcphase == _GCmarktermination) {
461		tl := traceAcquire()
462		tl.GCActive()
463		traceRelease(tl)
464	}
465
	// Preemption is OK again after this. It's fine if the world stops again, too.
467	// We're just cleaning up the last generation after this point.
468	//
469	// We also don't care if the GC starts again after this for the same reasons.
470	releasem(mp)
471	semrelease(&worldsema)
472
473	// Snapshot allm and freem.
474	//
475	// Snapshotting after the generation counter update is sufficient.
476	// Because an m must be on either allm or sched.freem if it has an active trace
477	// buffer, new threads added to allm after this point must necessarily observe
478	// the new generation number (sched.lock acts as a barrier).
479	//
480	// Threads that exit before this point and are on neither list explicitly
481	// flush their own buffers in traceThreadDestroy.
482	//
483	// Snapshotting freem is necessary because Ms can continue to emit events
484	// while they're still on that list. Removal from sched.freem is serialized with
485	// this snapshot, so either we'll capture an m on sched.freem and race with
486	// the removal to flush its buffers (resolved by traceThreadDestroy acquiring
487	// the thread's seqlock, which one of us must win, so at least its old gen buffer
488	// will be flushed in time for the new generation) or it will have flushed its
489	// buffers before we snapshotted it to begin with.
490	lock(&sched.lock)
491	mToFlush := allm
492	for mp := mToFlush; mp != nil; mp = mp.alllink {
493		mp.trace.link = mp.alllink
494	}
495	for mp := sched.freem; mp != nil; mp = mp.freelink {
496		mp.trace.link = mToFlush
497		mToFlush = mp
498	}
499	unlock(&sched.lock)
500
501	// Iterate over our snapshot, flushing every buffer until we're done.
502	//
503	// Because trace writers read the generation while the seqlock is
504	// held, we can be certain that when there are no writers there are
505	// also no stale generation values left. Therefore, it's safe to flush
506	// any buffers that remain in that generation's slot.
507	const debugDeadlock = false
508	systemstack(func() {
509		// Track iterations for some rudimentary deadlock detection.
510		i := 0
511		detectedDeadlock := false
512
513		for mToFlush != nil {
514			prev := &mToFlush
515			for mp := *prev; mp != nil; {
516				if mp.trace.seqlock.Load()%2 != 0 {
517					// The M is writing. Come back to it later.
518					prev = &mp.trace.link
519					mp = mp.trace.link
520					continue
521				}
522				// Flush the trace buffer.
523				//
524				// trace.lock needed for traceBufFlush, but also to synchronize
525				// with traceThreadDestroy, which flushes both buffers unconditionally.
526				lock(&trace.lock)
527				bufp := &mp.trace.buf[gen%2]
528				if *bufp != nil {
529					traceBufFlush(*bufp, gen)
530					*bufp = nil
531				}
532				unlock(&trace.lock)
533
534				// Remove the m from the flush list.
535				*prev = mp.trace.link
536				mp.trace.link = nil
537				mp = *prev
538			}
539			// Yield only if we're going to be going around the loop again.
540			if mToFlush != nil {
541				osyield()
542			}
543
544			if debugDeadlock {
545				// Try to detect a deadlock. We probably shouldn't loop here
546				// this many times.
547				if i > 100000 && !detectedDeadlock {
548					detectedDeadlock = true
549					println("runtime: failing to flush")
550					for mp := mToFlush; mp != nil; mp = mp.trace.link {
551						print("runtime: m=", mp.id, "\n")
552					}
553				}
554				i++
555			}
556		}
557	})
558
559	// At this point, the old generation is fully flushed minus stack and string
560	// tables, CPU samples, and goroutines that haven't run at all during the last
561	// generation.
562
563	// Check to see if any Gs still haven't had events written out for them.
564	statusWriter := unsafeTraceWriter(gen, nil)
565	for _, ug := range untracedGs {
566		if ug.gp.trace.statusWasTraced(gen) {
567			// It was traced, we don't need to do anything.
568			continue
569		}
570		// It still wasn't traced. Because we ensured all Ms stopped writing trace
571		// events to the last generation, that must mean the G never had its status
572		// traced in gen between when we recorded it and now. If that's true, the goid
		// and status we recorded then are exactly what we want right now.
574		status := goStatusToTraceGoStatus(ug.status, ug.waitreason)
575		statusWriter = statusWriter.writeGoStatus(ug.goid, ug.mid, status, ug.inMarkAssist, ug.stackID)
576	}
577	statusWriter.flush().end()
578
579	// Read everything out of the last gen's CPU profile buffer.
580	traceReadCPU(gen)
581
582	// Flush CPU samples, stacks, and strings for the last generation. This is safe,
583	// because we're now certain no M is writing to the last generation.
584	//
585	// Ordering is important here. traceCPUFlush may generate new stacks and dumping
586	// stacks may generate new strings.
587	traceCPUFlush(gen)
588	trace.stackTab[gen%2].dump(gen)
589	trace.typeTab[gen%2].dump(gen)
590	trace.stringTab[gen%2].reset(gen)
591
592	// That's it. This generation is done producing buffers.
593	systemstack(func() {
594		lock(&trace.lock)
595		trace.flushedGen.Store(gen)
596		unlock(&trace.lock)
597	})
598
599	// Perform status reset on dead Ps because they just appear as idle.
600	//
601	// Preventing preemption is sufficient to access allp safely. allp is only
602	// mutated by GOMAXPROCS calls, which require a STW.
603	//
604	// TODO(mknyszek): Consider explicitly emitting ProcCreate and ProcDestroy
605	// events to indicate whether a P exists, rather than just making its
606	// existence implicit.
607	mp = acquirem()
608	for _, pp := range allp[len(allp):cap(allp)] {
609		pp.trace.readyNextGen(traceNextGen(gen))
610	}
611	releasem(mp)
612
613	if stopTrace {
614		// Acquire the shutdown sema to begin the shutdown process.
615		semacquire(&traceShutdownSema)
616
617		// Finish off CPU profile reading.
618		traceStopReadCPU()
619
620		// Reset debug.malloc if necessary. Note that this is set in a racy
621		// way; that's OK. Some mallocs may still enter into the debug.malloc
622		// block, but they won't generate events because tracing is disabled.
623		// That is, it's OK if mallocs read a stale debug.malloc or
624		// trace.enabledWithAllocFree value.
625		if trace.enabledWithAllocFree {
626			trace.enabledWithAllocFree = false
627			debug.malloc = trace.debugMalloc
628		}
629	} else {
630		// Go over each P and emit a status event for it if necessary.
631		//
632		// We do this at the beginning of the new generation instead of the
633		// end like we do for goroutines because forEachP doesn't give us a
634		// hook to skip Ps that have already been traced. Since we have to
635		// preempt all Ps anyway, might as well stay consistent with StartTrace
636		// which does this during the STW.
637		semacquire(&worldsema)
638		forEachP(waitReasonTraceProcStatus, func(pp *p) {
639			tl := traceAcquire()
640			if !pp.trace.statusWasTraced(tl.gen) {
641				tl.writer().writeProcStatusForP(pp, false).end()
642			}
643			traceRelease(tl)
644		})
645		semrelease(&worldsema)
646	}
647
648	// Block until the trace reader has finished processing the last generation.
649	semacquire(&trace.doneSema[gen%2])
650	if raceenabled {
651		raceacquire(unsafe.Pointer(&trace.doneSema[gen%2]))
652	}
653
654	// Double-check that things look as we expect after advancing and perform some
655	// final cleanup if the trace has fully stopped.
656	systemstack(func() {
657		lock(&trace.lock)
658		if !trace.full[gen%2].empty() {
659			throw("trace: non-empty full trace buffer for done generation")
660		}
661		if stopTrace {
662			if !trace.full[1-(gen%2)].empty() {
663				throw("trace: non-empty full trace buffer for next generation")
664			}
665			if trace.reading != nil || trace.reader.Load() != nil {
666				throw("trace: reading after shutdown")
667			}
668			// Free all the empty buffers.
669			for trace.empty != nil {
670				buf := trace.empty
671				trace.empty = buf.link
672				sysFree(unsafe.Pointer(buf), unsafe.Sizeof(*buf), &memstats.other_sys)
673			}
674			// Clear trace.shutdown and other flags.
675			trace.headerWritten = false
676			trace.shutdown.Store(false)
677		}
678		unlock(&trace.lock)
679	})
680
681	if stopTrace {
682		// Clear the sweep state on every P for the next time tracing is enabled.
683		//
684		// It may be stale in the next trace because we may have ended tracing in
685		// the middle of a sweep on a P.
686		//
687		// It's fine not to call forEachP here because tracing is disabled and we
688		// know at this point that nothing is calling into the tracer, but we do
689		// need to look at dead Ps too just because GOMAXPROCS could have been called
690		// at any point since we stopped tracing, and we have to ensure there's no
691		// bad state on dead Ps too. Prevent a STW and a concurrent GOMAXPROCS that
692		// might mutate allp by making ourselves briefly non-preemptible.
693		mp := acquirem()
694		for _, pp := range allp[:cap(allp)] {
695			pp.trace.inSweep = false
696			pp.trace.maySweep = false
697			pp.trace.swept = 0
698			pp.trace.reclaimed = 0
699		}
700		releasem(mp)
701	}
702
703	// Release the advance semaphore. If stopTrace is true we're still holding onto
704	// traceShutdownSema.
705	//
706	// Do a direct handoff. Don't let one caller of traceAdvance starve
707	// other calls to traceAdvance.
708	semrelease1(&traceAdvanceSema, true, 0)
709
710	if stopTrace {
711		// Stop the traceAdvancer. We can't be holding traceAdvanceSema here because
712		// we'll deadlock (we're blocked on the advancer goroutine exiting, but it
713		// may be currently trying to acquire traceAdvanceSema).
714		traceAdvancer.stop()
715		semrelease(&traceShutdownSema)
716	}
717}
718
719func traceNextGen(gen uintptr) uintptr {
720	if gen == ^uintptr(0) {
		// gen is used both mod 2 and mod 3, and we want both patterns to continue when we loop around.
722		// ^uint32(0) and ^uint64(0) are both odd and multiples of 3. Therefore the next generation
723		// we want is even and one more than a multiple of 3. The smallest such number is 4.
724		return 4
725	}
726	return gen + 1
727}
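
// A quick sanity check of the wrap-around choice (illustrative only, never
// executed by the runtime):
//
//	last := ^uintptr(0)        // last%2 == 1, last%3 == 0
//	next := traceNextGen(last) // == 4, so next%2 == 0 and next%3 == 1,
//	                           // just as if gen had been incremented without wrapping.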
728
729// traceRegisterLabelsAndReasons re-registers mark worker labels and
730// goroutine stop/block reasons in the string table for the provided
731// generation. Note: the provided generation must not have started yet.
732func traceRegisterLabelsAndReasons(gen uintptr) {
733	for i, label := range gcMarkWorkerModeStrings[:] {
734		trace.markWorkerLabels[gen%2][i] = traceArg(trace.stringTab[gen%2].put(gen, label))
735	}
736	for i, str := range traceBlockReasonStrings[:] {
737		trace.goBlockReasons[gen%2][i] = traceArg(trace.stringTab[gen%2].put(gen, str))
738	}
739	for i, str := range traceGoStopReasonStrings[:] {
740		trace.goStopReasons[gen%2][i] = traceArg(trace.stringTab[gen%2].put(gen, str))
741	}
742}
743
744// ReadTrace returns the next chunk of binary tracing data, blocking until data
745// is available. If tracing is turned off and all the data accumulated while it
746// was on has been returned, ReadTrace returns nil. The caller must copy the
747// returned data before calling ReadTrace again.
748// ReadTrace must be called from one goroutine at a time.
749func ReadTrace() []byte {
750top:
751	var buf []byte
752	var park bool
753	systemstack(func() {
754		buf, park = readTrace0()
755	})
756	if park {
757		gopark(func(gp *g, _ unsafe.Pointer) bool {
758			if !trace.reader.CompareAndSwapNoWB(nil, gp) {
759				// We're racing with another reader.
760				// Wake up and handle this case.
761				return false
762			}
763
764			if g2 := traceReader(); gp == g2 {
765				// New data arrived between unlocking
766				// and the CAS and we won the wake-up
767				// race, so wake up directly.
768				return false
769			} else if g2 != nil {
770				printlock()
771				println("runtime: got trace reader", g2, g2.goid)
772				throw("unexpected trace reader")
773			}
774
775			return true
776		}, nil, waitReasonTraceReaderBlocked, traceBlockSystemGoroutine, 2)
777		goto top
778	}
779
780	return buf
781}
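
// A minimal consumer loop (an illustrative sketch, not part of the runtime;
// most clients should use the runtime/trace package instead):
//
//	var out []byte
//	for {
//		data := runtime.ReadTrace()
//		if data == nil {
//			break // tracing has stopped and all buffered data was returned
//		}
//		out = append(out, data...) // copy: data is invalid after the next call
//	}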
782
783// readTrace0 is ReadTrace's continuation on g0. This must run on the
784// system stack because it acquires trace.lock.
785//
786//go:systemstack
787func readTrace0() (buf []byte, park bool) {
788	if raceenabled {
789		// g0 doesn't have a race context. Borrow the user G's.
790		if getg().racectx != 0 {
791			throw("expected racectx == 0")
792		}
793		getg().racectx = getg().m.curg.racectx
794		// (This defer should get open-coded, which is safe on
795		// the system stack.)
796		defer func() { getg().racectx = 0 }()
797	}
798
799	// This function must not allocate while holding trace.lock:
	// allocation can call into the heap allocator, which will try to emit a trace
	// event while holding the heap lock.
802	lock(&trace.lock)
803
804	if trace.reader.Load() != nil {
		// More than one goroutine is reading the trace. This is bad.
		// But we'd rather not crash the program because of tracing,
		// since tracing can be enabled at runtime on production servers.
808		unlock(&trace.lock)
809		println("runtime: ReadTrace called from multiple goroutines simultaneously")
810		return nil, false
811	}
812	// Recycle the old buffer.
813	if buf := trace.reading; buf != nil {
814		buf.link = trace.empty
815		trace.empty = buf
816		trace.reading = nil
817	}
818	// Write trace header.
819	if !trace.headerWritten {
820		trace.headerWritten = true
821		unlock(&trace.lock)
822		return []byte("go 1.23 trace\x00\x00\x00"), false
823	}
824
825	// Read the next buffer.
826
827	if trace.readerGen.Load() == 0 {
828		trace.readerGen.Store(1)
829	}
830	var gen uintptr
831	for {
832		assertLockHeld(&trace.lock)
833		gen = trace.readerGen.Load()
834
835		// Check to see if we need to block for more data in this generation
836		// or if we need to move our generation forward.
837		if !trace.full[gen%2].empty() {
838			break
839		}
840		// Most of the time readerGen is one generation ahead of flushedGen, as the
841		// current generation is being read from. Then, once the last buffer is flushed
842		// into readerGen, flushedGen will rise to meet it. At this point, the tracer
843		// is waiting on the reader to finish flushing the last generation so that it
844		// can continue to advance.
845		if trace.flushedGen.Load() == gen {
846			if trace.shutdown.Load() {
847				unlock(&trace.lock)
848
849				// Wake up anyone waiting for us to be done with this generation.
850				//
851				// Do this after reading trace.shutdown, because the thread we're
852				// waking up is going to clear trace.shutdown.
853				if raceenabled {
					// Model synchronization on trace.doneSema, which the race
					// detector does not see. This is required to avoid false
					// race reports on the writer passed to trace.Start.
857					racerelease(unsafe.Pointer(&trace.doneSema[gen%2]))
858				}
859				semrelease(&trace.doneSema[gen%2])
860
861				// We're shutting down, and the last generation is fully
862				// read. We're done.
863				return nil, false
864			}
865			// The previous gen has had all of its buffers flushed, and
866			// there's nothing else for us to read. Advance the generation
867			// we're reading from and try again.
868			trace.readerGen.Store(trace.gen.Load())
869			unlock(&trace.lock)
870
871			// Wake up anyone waiting for us to be done with this generation.
872			//
873			// Do this after reading gen to make sure we can't have the trace
874			// advance until we've read it.
875			if raceenabled {
876				// See comment above in the shutdown case.
877				racerelease(unsafe.Pointer(&trace.doneSema[gen%2]))
878			}
879			semrelease(&trace.doneSema[gen%2])
880
881			// Reacquire the lock and go back to the top of the loop.
882			lock(&trace.lock)
883			continue
884		}
885		// Wait for new data.
886		//
887		// We don't simply use a note because the scheduler
888		// executes this goroutine directly when it wakes up
889		// (also a note would consume an M).
890		//
891		// Before we drop the lock, clear the workAvailable flag. Work can
892		// only be queued with trace.lock held, so this is at least true until
893		// we drop the lock.
894		trace.workAvailable.Store(false)
895		unlock(&trace.lock)
896		return nil, true
897	}
898	// Pull a buffer.
899	tbuf := trace.full[gen%2].pop()
900	trace.reading = tbuf
901	unlock(&trace.lock)
902	return tbuf.arr[:tbuf.pos], false
903}
904
905// traceReader returns the trace reader that should be woken up, if any.
906// Callers should first check (traceEnabled() || traceShuttingDown()).
907//
908// This must run on the system stack because it acquires trace.lock.
909//
910//go:systemstack
911func traceReader() *g {
912	gp := traceReaderAvailable()
913	if gp == nil || !trace.reader.CompareAndSwapNoWB(gp, nil) {
914		return nil
915	}
916	return gp
917}
918
919// traceReaderAvailable returns the trace reader if it is not currently
920// scheduled and should be. Callers should first check that
921// (traceEnabled() || traceShuttingDown()) is true.
922func traceReaderAvailable() *g {
923	// There are three conditions under which we definitely want to schedule
924	// the reader:
925	// - The reader is lagging behind in finishing off the last generation.
926	//   In this case, trace buffers could even be empty, but the trace
927	//   advancer will be waiting on the reader, so we have to make sure
928	//   to schedule the reader ASAP.
	// - The reader has pending work to process for its reader generation
930	//   (assuming readerGen is not lagging behind). Note that we also want
931	//   to be careful *not* to schedule the reader if there's no work to do.
932	// - The trace is shutting down. The trace stopper blocks on the reader
933	//   to finish, much like trace advancement.
934	//
935	// We also want to be careful not to schedule the reader if there's no
936	// reason to.
937	if trace.flushedGen.Load() == trace.readerGen.Load() || trace.workAvailable.Load() || trace.shutdown.Load() {
938		return trace.reader.Load()
939	}
940	return nil
941}
942
943// Trace advancer goroutine.
944var traceAdvancer traceAdvancerState
945
946type traceAdvancerState struct {
947	timer *wakeableSleep
948	done  chan struct{}
949}
950
951// start starts a new traceAdvancer.
952func (s *traceAdvancerState) start() {
953	// Start a goroutine to periodically advance the trace generation.
954	s.done = make(chan struct{})
955	s.timer = newWakeableSleep()
956	go func() {
957		for traceEnabled() {
958			// Set a timer to wake us up
959			s.timer.sleep(int64(debug.traceadvanceperiod))
960
961			// Try to advance the trace.
962			traceAdvance(false)
963		}
964		s.done <- struct{}{}
965	}()
966}
967
968// stop stops a traceAdvancer and blocks until it exits.
969func (s *traceAdvancerState) stop() {
970	s.timer.wake()
971	<-s.done
972	close(s.done)
973	s.timer.close()
974}
975
// defaultTraceAdvancePeriod is the approximate period between
// new generations.
978const defaultTraceAdvancePeriod = 1e9 // 1 second.
979
980// wakeableSleep manages a wakeable goroutine sleep.
981//
// Users of this type must create it with newWakeableSleep before first
// use and call close to free up resources. Once close is called, a new
// wakeableSleep must be created before another use.
985type wakeableSleep struct {
986	timer *timer
987
988	// lock protects access to wakeup, but not send/recv on it.
989	lock   mutex
990	wakeup chan struct{}
991}
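
// Typical usage (an illustrative sketch mirroring traceAdvancer above; period
// and keepRunning are placeholder names):
//
//	s := newWakeableSleep()
//	go func() {
//		for keepRunning() {
//			s.sleep(period) // returns early if s.wake is called
//			// ... periodic work ...
//		}
//	}()
//	...
//	s.wake() // cut the current sleep short
//	// wait for the sleeping goroutine to exit, then:
//	s.close() // safe only once nothing sleeps on s and nothing more will call wake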
992
993// newWakeableSleep initializes a new wakeableSleep and returns it.
994func newWakeableSleep() *wakeableSleep {
995	s := new(wakeableSleep)
996	lockInit(&s.lock, lockRankWakeableSleep)
997	s.wakeup = make(chan struct{}, 1)
998	s.timer = new(timer)
999	f := func(s any, _ uintptr, _ int64) {
1000		s.(*wakeableSleep).wake()
1001	}
1002	s.timer.init(f, s)
1003	return s
1004}
1005
1006// sleep sleeps for the provided duration in nanoseconds or until
1007// another goroutine calls wake.
1008//
1009// Must not be called by more than one goroutine at a time and
1010// must not be called concurrently with close.
1011func (s *wakeableSleep) sleep(ns int64) {
1012	s.timer.reset(nanotime()+ns, 0)
1013	lock(&s.lock)
1014	if raceenabled {
1015		raceacquire(unsafe.Pointer(&s.lock))
1016	}
1017	wakeup := s.wakeup
1018	if raceenabled {
1019		racerelease(unsafe.Pointer(&s.lock))
1020	}
1021	unlock(&s.lock)
1022	<-wakeup
1023	s.timer.stop()
1024}
1025
1026// wake awakens any goroutine sleeping on the timer.
1027//
1028// Safe for concurrent use with all other methods.
1029func (s *wakeableSleep) wake() {
1030	// Grab the wakeup channel, which may be nil if we're
1031	// racing with close.
1032	lock(&s.lock)
1033	if raceenabled {
1034		raceacquire(unsafe.Pointer(&s.lock))
1035	}
1036	if s.wakeup != nil {
1037		// Non-blocking send.
1038		//
1039		// Others may also write to this channel and we don't
1040		// want to block on the receiver waking up. This also
1041		// effectively batches together wakeup notifications.
1042		select {
1043		case s.wakeup <- struct{}{}:
1044		default:
1045		}
1046	}
1047	if raceenabled {
1048		racerelease(unsafe.Pointer(&s.lock))
1049	}
1050	unlock(&s.lock)
1051}
1052
1053// close wakes any goroutine sleeping on the timer and prevents
1054// further sleeping on it.
1055//
1056// Once close is called, the wakeableSleep must no longer be used.
1057//
1058// It must only be called once no goroutine is sleeping on the
1059// timer *and* nothing else will call wake concurrently.
1060func (s *wakeableSleep) close() {
1061	// Set wakeup to nil so that a late timer ends up being a no-op.
1062	lock(&s.lock)
1063	if raceenabled {
1064		raceacquire(unsafe.Pointer(&s.lock))
1065	}
1066	wakeup := s.wakeup
1067	s.wakeup = nil
1068
1069	// Close the channel.
1070	close(wakeup)
1071
1072	if raceenabled {
1073		racerelease(unsafe.Pointer(&s.lock))
1074	}
1075	unlock(&s.lock)
1077}
1078