// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// CPU profile -> trace

package runtime

// traceInitReadCPU initializes CPU profile -> tracer state for tracing.
//
// It creates the profBufs used to pass CPU samples from the signal
// handler to the tracer.
func traceInitReadCPU() {
	if traceEnabled() {
		throw("traceInitReadCPU called with trace enabled")
	}
	// Create new profBufs for CPU samples that will be emitted as events.
	// Format: after the timestamp, the header is [pp.id, gp.goid, mp.procid].
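	// There are two of each buffer, indexed by trace generation parity
	// (gen%2), so samples from an old generation can be drained while the
	// next generation starts writing.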
	trace.cpuLogRead[0] = newProfBuf(3, profBufWordCount, profBufTagCount)
	trace.cpuLogRead[1] = newProfBuf(3, profBufWordCount, profBufTagCount)
	// We must not acquire trace.signalLock outside of a signal handler: a
	// profiling signal may arrive at any time and try to acquire it, leading to
	// deadlock. Because we can't use that lock to protect updates to
	// trace.cpuLogWrite (it protects only use of the structure the pointer
	// references), reads and writes of the pointer must be atomic. (And
	// although this field is never the sole pointer to the profBuf value, it's
	// best to allow a write barrier here.)
	trace.cpuLogWrite[0].Store(trace.cpuLogRead[0])
	trace.cpuLogWrite[1].Store(trace.cpuLogRead[1])
}

// traceStartReadCPU creates a goroutine to start reading CPU profile
// data into an active trace.
//
// traceAdvanceSema must be held.
func traceStartReadCPU() {
	if !traceEnabled() {
		throw("traceStartReadCPU called with trace disabled")
	}
	// Spin up the logger goroutine.
	trace.cpuSleep = newWakeableSleep()
	done := make(chan struct{}, 1)
	go func() {
		for traceEnabled() {
			// Sleep here because traceReadCPU is non-blocking. This mirrors
			// how the runtime/pprof package obtains CPU profile data.
			//
			// We can't do a blocking read here because Darwin can't do a
			// wakeup from a signal handler, so all CPU profiling is just
			// non-blocking. See #61768 for more details.
			//
			// Like the runtime/pprof package, even if that bug didn't exist
			// we would still want to do a goroutine-level sleep in between
			// reads to avoid frequent wakeups.
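			//
			// The sleep argument is in nanoseconds, so this wakes up to
			// read roughly every 100ms.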
			trace.cpuSleep.sleep(100_000_000)

			tl := traceAcquire()
			if !tl.ok() {
				// Tracing disabled.
				break
			}
			keepGoing := traceReadCPU(tl.gen)
			traceRelease(tl)
			if !keepGoing {
				break
			}
		}
		done <- struct{}{}
	}()
	trace.cpuLogDone = done
}

// traceStopReadCPU blocks until the trace CPU reading goroutine exits.
//
// traceAdvanceSema must be held, and tracing must be disabled.
func traceStopReadCPU() {
	if traceEnabled() {
		throw("traceStopReadCPU called with trace enabled")
	}

	// Once we close the profbuf, we'll be in one of two situations:
	// - The logger goroutine has already exited because it observed
	//   that the trace is disabled.
	// - The logger goroutine is asleep.
	//
	// Wake the goroutine so it can observe that the profBuf is
	// closed and exit.
	trace.cpuLogWrite[0].Store(nil)
	trace.cpuLogWrite[1].Store(nil)
	trace.cpuLogRead[0].close()
	trace.cpuLogRead[1].close()
	trace.cpuSleep.wake()

	// Wait until the logger goroutine exits.
	<-trace.cpuLogDone

	// Clear state for the next trace.
	trace.cpuLogDone = nil
	trace.cpuLogRead[0] = nil
	trace.cpuLogRead[1] = nil
	trace.cpuSleep.close()
}

// traceReadCPU attempts to read from trace.cpuLogRead[gen%2] and write
// into the trace. Returns true if there might be more to read or false
// if the profBuf is closed or the caller should otherwise stop reading.
//
// The caller is responsible for ensuring that gen does not change. Either
// the caller must be in a traceAcquire/traceRelease block, or must be calling
// with traceAdvanceSema held.
//
// No more than one goroutine may be in traceReadCPU for the same
// profBuf at a time.
//
// Must not run on the system stack because profBuf.read performs race
// operations.
func traceReadCPU(gen uintptr) bool {
	var pcBuf [traceStackSize]uintptr

	data, tags, eof := trace.cpuLogRead[gen%2].read(profBufNonBlocking)
	for len(data) > 0 {
		// A complete record is at least 5 words: the record length, the
		// timestamp, and the three header words written by traceCPUSample.
		if len(data) < 5 || data[0] > uint64(len(data)) {
			break // truncated profile
		}
		if data[0] < 5 || tags != nil && len(tags) < 1 {
			break // malformed profile
		}
		if len(tags) < 1 {
			break // mismatched profile records and tags
		}

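		// Each record has the form:
		//
		//	[len, timestamp, P info, goid, mpid, stack PCs...]
		//
		// where len is the total length of the record in words, counting
		// itself, and the P info word holds ppid<<1, with the low bit set
		// if there was a P.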
		// Deserialize the data in the profile buffer.
		recordLen := data[0]
		timestamp := data[1]
		ppid := data[2] >> 1
		if hasP := (data[2] & 0b1) != 0; !hasP {
			ppid = ^uint64(0)
		}
		goid := data[3]
		mpid := data[4]
		stk := data[5:recordLen]

		// Overflow records always have headers of all zeroes.
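		// (In an overflow record, the single stack slot holds the number
		// of samples the profBuf dropped.)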
		isOverflowRecord := len(stk) == 1 && data[2] == 0 && data[3] == 0 && data[4] == 0

		// Move the data iterator forward.
		data = data[recordLen:]
		// No support here for reporting goroutine tags at the moment; if
		// that information is to be part of the execution trace, we'd
		// probably want to see when the tags are applied and when they
		// change, instead of only seeing them when we get a CPU sample.
		tags = tags[1:]

		if isOverflowRecord {
			// Looks like an overflow record from the profBuf. Not much to
			// do here, we only want to report full records.
			continue
		}

		// Construct the stack for insertion to the stack table.
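		// pcBuf[0] is set to logicalStackSentinel to indicate that the PCs
		// that follow are a logical stack requiring no further processing.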
		nstk := 1
		pcBuf[0] = logicalStackSentinel
		for ; nstk < len(pcBuf) && nstk-1 < len(stk); nstk++ {
			pcBuf[nstk] = uintptr(stk[nstk-1])
		}

		// Write out a trace event.
		w := unsafeTraceWriter(gen, trace.cpuBuf[gen%2])

		// Ensure we have a place to write to.
		var flushed bool
		w, flushed = w.ensure(2 + 5*traceBytesPerNumber /* traceEvCPUSamples + traceEvCPUSample + timestamp + g + m + p + stack ID */)
		if flushed {
			// Annotate the batch as containing CPU samples.
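			// (A flush begins a new batch, and this event byte tells the
			// trace parser that the batch holds CPU sample data.)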
			w.byte(byte(traceEvCPUSamples))
		}

		// Add the stack to the table.
		stackID := trace.stackTab[gen%2].put(pcBuf[:nstk])

		// Write out the CPU sample.
		w.byte(byte(traceEvCPUSample))
		w.varint(timestamp)
		w.varint(mpid)
		w.varint(ppid)
		w.varint(goid)
		w.varint(stackID)

		trace.cpuBuf[gen%2] = w.traceBuf
	}
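	// eof is set once the profBuf has been closed and fully drained;
	// returning false tells the logger goroutine to stop reading.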
	return !eof
}

// traceCPUFlush flushes trace.cpuBuf[gen%2]. The caller must be certain that gen
// has completed and that there are no more writers to it.
func traceCPUFlush(gen uintptr) {
	// Flush any remaining trace buffers containing CPU samples.
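	// traceBufFlush requires trace.lock to be held, so the flush is done
	// on the system stack with the lock acquired around it.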
	if buf := trace.cpuBuf[gen%2]; buf != nil {
		systemstack(func() {
			lock(&trace.lock)
			traceBufFlush(buf, gen)
			unlock(&trace.lock)
			trace.cpuBuf[gen%2] = nil
		})
	}
}

// traceCPUSample writes a CPU profile sample stack to the execution tracer's
// profiling buffer. It is called from a signal handler, so is limited in what
// it can do. mp must be the thread that is currently stopped in a signal.
func traceCPUSample(gp *g, mp *m, pp *p, stk []uintptr) {
	if !traceEnabled() {
		// Tracing is usually turned off; don't spend time acquiring the signal
		// lock unless it's active.
		return
	}
	if mp == nil {
		// Drop samples that don't have an identifiable thread. We can't render
		// this in any useful way anyway.
		return
	}

	// We're going to conditionally write to one of two buffers based on the
	// generation. To make sure we write to the correct one, we need to make
	// sure this thread's trace seqlock is held. If it already is, then we're
	// in the tracer and we can just take advantage of that. If it isn't, then
	// we need to acquire it and read the generation.
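	//
	// (An even seqlock value means this M's seqlock is not held; an odd
	// value means it is.)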
	locked := false
	if mp.trace.seqlock.Load()%2 == 0 {
		mp.trace.seqlock.Add(1)
		locked = true
	}
	gen := trace.gen.Load()
	if gen == 0 {
		// Tracing is disabled, as it turns out. Release the seqlock if necessary
		// and exit.
		if locked {
			mp.trace.seqlock.Add(1)
		}
		return
	}

	now := traceClockNow()
	// The "header" here is the ID of the P that was running the profiled code,
	// followed by the IDs of the goroutine and M. (For normal CPU profiling,
	// the header is usually the number of samples with the given stack.) Near
	// syscalls, pp may be nil. Reporting goid of 0 is fine for either g0 or a
	// nil gp.
	var hdr [3]uint64
	if pp != nil {
		// Overflow records in profBuf have all header values set to zero. Make
		// sure that real headers have at least one bit set.
		hdr[0] = uint64(pp.id)<<1 | 0b1
	} else {
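		// No P: leave the hasP bit clear, but set another bit so the
		// header is still distinguishable from an overflow record's
		// all-zero header.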
		hdr[0] = 0b10
	}
	if gp != nil {
		hdr[1] = gp.goid
	}
	hdr[2] = uint64(mp.procid)

	// Allow only one writer at a time.
	for !trace.signalLock.CompareAndSwap(0, 1) {
		// TODO: Is it safe to osyield here? https://go.dev/issue/52672
		osyield()
	}

	if log := trace.cpuLogWrite[gen%2].Load(); log != nil {
		// Note: we don't pass a tag pointer here (how should profiling tags
		// interact with the execution tracer?), but if we did we'd need to be
		// careful about write barriers. See the long comment in profBuf.write.
		log.write(nil, int64(now), hdr[:], stk)
	}

	trace.signalLock.Store(0)

	// Release the seqlock if we acquired it earlier.
	if locked {
		mp.trace.seqlock.Add(1)
	}
}