1// Copyright 2023 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package trace
6
7import (
8	"cmp"
9	"slices"
10	"strings"
11	"time"
12)
13
14// Summary is the analysis result produced by the summarizer.
15type Summary struct {
16	Goroutines map[GoID]*GoroutineSummary
17	Tasks      map[TaskID]*UserTaskSummary
18}
19
20// GoroutineSummary contains statistics and execution details of a single goroutine.
21// (For v2 traces.)
22type GoroutineSummary struct {
23	ID           GoID
24	Name         string // A non-unique human-friendly identifier for the goroutine.
25	PC           uint64 // The first PC we saw for the entry function of the goroutine
26	CreationTime Time   // Timestamp of the first appearance in the trace.
27	StartTime    Time   // Timestamp of the first time it started running. 0 if the goroutine never ran.
28	EndTime      Time   // Timestamp of when the goroutine exited. 0 if the goroutine never exited.
29
30	// List of regions in the goroutine, sorted based on the start time.
31	Regions []*UserRegionSummary
32
33	// Statistics of execution time during the goroutine execution.
34	GoroutineExecStats
35
36	// goroutineSummary is state used just for computing this structure.
37	// It's dropped before being returned to the caller.
38	//
39	// More specifically, if it's nil, it indicates that this summary has
40	// already been finalized.
41	*goroutineSummary
42}
43
44// UserTaskSummary represents a task in the trace.
45type UserTaskSummary struct {
46	ID       TaskID
47	Name     string
48	Parent   *UserTaskSummary // nil if the parent is unknown.
49	Children []*UserTaskSummary
50
51	// Task begin event. An EventTaskBegin event or nil.
52	Start *Event
53
54	// End end event. Normally EventTaskEnd event or nil.
55	End *Event
56
57	// Logs is a list of EventLog events associated with the task.
58	Logs []*Event
59
60	// List of regions in the task, sorted based on the start time.
61	Regions []*UserRegionSummary
62
63	// Goroutines is the set of goroutines associated with this task.
64	Goroutines map[GoID]*GoroutineSummary
65}
66
67// Complete returns true if we have complete information about the task
68// from the trace: both a start and an end.
69func (s *UserTaskSummary) Complete() bool {
70	return s.Start != nil && s.End != nil
71}
72
73// Descendents returns a slice consisting of itself (always the first task returned),
74// and the transitive closure of all of its children.
75func (s *UserTaskSummary) Descendents() []*UserTaskSummary {
76	descendents := []*UserTaskSummary{s}
77	for _, child := range s.Children {
78		descendents = append(descendents, child.Descendents()...)
79	}
80	return descendents
81}
82
83// UserRegionSummary represents a region and goroutine execution stats
84// while the region was active. (For v2 traces.)
85type UserRegionSummary struct {
86	TaskID TaskID
87	Name   string
88
89	// Region start event. Normally EventRegionBegin event or nil,
90	// but can be a state transition event from NotExist or Undetermined
91	// if the region is a synthetic region representing task inheritance
92	// from the parent goroutine.
93	Start *Event
94
95	// Region end event. Normally EventRegionEnd event or nil,
96	// but can be a state transition event to NotExist if the goroutine
97	// terminated without explicitly ending the region.
98	End *Event
99
100	GoroutineExecStats
101}
102
103// GoroutineExecStats contains statistics about a goroutine's execution
104// during a period of time.
105type GoroutineExecStats struct {
106	// These stats are all non-overlapping.
107	ExecTime          time.Duration
108	SchedWaitTime     time.Duration
109	BlockTimeByReason map[string]time.Duration
110	SyscallTime       time.Duration
111	SyscallBlockTime  time.Duration
112
113	// TotalTime is the duration of the goroutine's presence in the trace.
114	// Necessarily overlaps with other stats.
115	TotalTime time.Duration
116
117	// Total time the goroutine spent in certain ranges; may overlap
118	// with other stats.
119	RangeTime map[string]time.Duration
120}
121
122func (s GoroutineExecStats) NonOverlappingStats() map[string]time.Duration {
123	stats := map[string]time.Duration{
124		"Execution time":         s.ExecTime,
125		"Sched wait time":        s.SchedWaitTime,
126		"Syscall execution time": s.SyscallTime,
127		"Block time (syscall)":   s.SyscallBlockTime,
128		"Unknown time":           s.UnknownTime(),
129	}
130	for reason, dt := range s.BlockTimeByReason {
131		stats["Block time ("+reason+")"] += dt
132	}
133	// N.B. Don't include RangeTime or TotalTime; they overlap with these other
134	// stats.
135	return stats
136}
137
138// UnknownTime returns whatever isn't accounted for in TotalTime.
139func (s GoroutineExecStats) UnknownTime() time.Duration {
140	sum := s.ExecTime + s.SchedWaitTime + s.SyscallTime +
141		s.SyscallBlockTime
142	for _, dt := range s.BlockTimeByReason {
143		sum += dt
144	}
145	// N.B. Don't include range time. Ranges overlap with
146	// other stats, whereas these stats are non-overlapping.
147	if sum < s.TotalTime {
148		return s.TotalTime - sum
149	}
150	return 0
151}
152
153// sub returns the stats v-s.
154func (s GoroutineExecStats) sub(v GoroutineExecStats) (r GoroutineExecStats) {
155	r = s.clone()
156	r.ExecTime -= v.ExecTime
157	r.SchedWaitTime -= v.SchedWaitTime
158	for reason := range s.BlockTimeByReason {
159		r.BlockTimeByReason[reason] -= v.BlockTimeByReason[reason]
160	}
161	r.SyscallTime -= v.SyscallTime
162	r.SyscallBlockTime -= v.SyscallBlockTime
163	r.TotalTime -= v.TotalTime
164	for name := range s.RangeTime {
165		r.RangeTime[name] -= v.RangeTime[name]
166	}
167	return r
168}
169
170func (s GoroutineExecStats) clone() (r GoroutineExecStats) {
171	r = s
172	r.BlockTimeByReason = make(map[string]time.Duration)
173	for reason, dt := range s.BlockTimeByReason {
174		r.BlockTimeByReason[reason] = dt
175	}
176	r.RangeTime = make(map[string]time.Duration)
177	for name, dt := range s.RangeTime {
178		r.RangeTime[name] = dt
179	}
180	return r
181}
182
183// snapshotStat returns the snapshot of the goroutine execution statistics.
184// This is called as we process the ordered trace event stream. lastTs is used
185// to process pending statistics if this is called before any goroutine end event.
186func (g *GoroutineSummary) snapshotStat(lastTs Time) (ret GoroutineExecStats) {
187	ret = g.GoroutineExecStats.clone()
188
189	if g.goroutineSummary == nil {
190		return ret // Already finalized; no pending state.
191	}
192
193	// Set the total time if necessary.
194	if g.TotalTime == 0 {
195		ret.TotalTime = lastTs.Sub(g.CreationTime)
196	}
197
198	// Add in time since lastTs.
199	if g.lastStartTime != 0 {
200		ret.ExecTime += lastTs.Sub(g.lastStartTime)
201	}
202	if g.lastRunnableTime != 0 {
203		ret.SchedWaitTime += lastTs.Sub(g.lastRunnableTime)
204	}
205	if g.lastBlockTime != 0 {
206		ret.BlockTimeByReason[g.lastBlockReason] += lastTs.Sub(g.lastBlockTime)
207	}
208	if g.lastSyscallTime != 0 {
209		ret.SyscallTime += lastTs.Sub(g.lastSyscallTime)
210	}
211	if g.lastSyscallBlockTime != 0 {
212		ret.SchedWaitTime += lastTs.Sub(g.lastSyscallBlockTime)
213	}
214	for name, ts := range g.lastRangeTime {
215		ret.RangeTime[name] += lastTs.Sub(ts)
216	}
217	return ret
218}
219
220// finalize is called when processing a goroutine end event or at
221// the end of trace processing. This finalizes the execution stat
222// and any active regions in the goroutine, in which case trigger is nil.
223func (g *GoroutineSummary) finalize(lastTs Time, trigger *Event) {
224	if trigger != nil {
225		g.EndTime = trigger.Time()
226	}
227	finalStat := g.snapshotStat(lastTs)
228
229	g.GoroutineExecStats = finalStat
230
231	// System goroutines are never part of regions, even though they
232	// "inherit" a task due to creation (EvGoCreate) from within a region.
233	// This may happen e.g. if the first GC is triggered within a region,
234	// starting the GC worker goroutines.
235	if !IsSystemGoroutine(g.Name) {
236		for _, s := range g.activeRegions {
237			s.End = trigger
238			s.GoroutineExecStats = finalStat.sub(s.GoroutineExecStats)
239			g.Regions = append(g.Regions, s)
240		}
241	}
242	*(g.goroutineSummary) = goroutineSummary{}
243}
244
245// goroutineSummary is a private part of GoroutineSummary that is required only during analysis.
246type goroutineSummary struct {
247	lastStartTime        Time
248	lastRunnableTime     Time
249	lastBlockTime        Time
250	lastBlockReason      string
251	lastSyscallTime      Time
252	lastSyscallBlockTime Time
253	lastRangeTime        map[string]Time
254	activeRegions        []*UserRegionSummary // stack of active regions
255}
256
257// Summarizer constructs per-goroutine time statistics for v2 traces.
258type Summarizer struct {
259	// gs contains the map of goroutine summaries we're building up to return to the caller.
260	gs map[GoID]*GoroutineSummary
261
262	// tasks contains the map of task summaries we're building up to return to the caller.
263	tasks map[TaskID]*UserTaskSummary
264
265	// syscallingP and syscallingG represent a binding between a P and G in a syscall.
266	// Used to correctly identify and clean up after syscalls (blocking or otherwise).
267	syscallingP map[ProcID]GoID
268	syscallingG map[GoID]ProcID
269
270	// rangesP is used for optimistic tracking of P-based ranges for goroutines.
271	//
272	// It's a best-effort mapping of an active range on a P to the goroutine we think
273	// is associated with it.
274	rangesP map[rangeP]GoID
275
276	lastTs Time // timestamp of the last event processed.
277	syncTs Time // timestamp of the last sync event processed (or the first timestamp in the trace).
278}
279
280// NewSummarizer creates a new struct to build goroutine stats from a trace.
281func NewSummarizer() *Summarizer {
282	return &Summarizer{
283		gs:          make(map[GoID]*GoroutineSummary),
284		tasks:       make(map[TaskID]*UserTaskSummary),
285		syscallingP: make(map[ProcID]GoID),
286		syscallingG: make(map[GoID]ProcID),
287		rangesP:     make(map[rangeP]GoID),
288	}
289}
290
291type rangeP struct {
292	id   ProcID
293	name string
294}
295
296// Event feeds a single event into the stats summarizer.
297func (s *Summarizer) Event(ev *Event) {
298	if s.syncTs == 0 {
299		s.syncTs = ev.Time()
300	}
301	s.lastTs = ev.Time()
302
303	switch ev.Kind() {
304	// Record sync time for the RangeActive events.
305	case EventSync:
306		s.syncTs = ev.Time()
307
308	// Handle state transitions.
309	case EventStateTransition:
310		st := ev.StateTransition()
311		switch st.Resource.Kind {
312		// Handle goroutine transitions, which are the meat of this computation.
313		case ResourceGoroutine:
314			id := st.Resource.Goroutine()
315			old, new := st.Goroutine()
316			if old == new {
317				// Skip these events; they're not telling us anything new.
318				break
319			}
320
321			// Handle transition out.
322			g := s.gs[id]
323			switch old {
324			case GoUndetermined, GoNotExist:
325				g = &GoroutineSummary{ID: id, goroutineSummary: &goroutineSummary{}}
326				// If we're coming out of GoUndetermined, then the creation time is the
327				// time of the last sync.
328				if old == GoUndetermined {
329					g.CreationTime = s.syncTs
330				} else {
331					g.CreationTime = ev.Time()
332				}
333				// The goroutine is being created, or it's being named for the first time.
334				g.lastRangeTime = make(map[string]Time)
335				g.BlockTimeByReason = make(map[string]time.Duration)
336				g.RangeTime = make(map[string]time.Duration)
337
338				// When a goroutine is newly created, inherit the task
339				// of the active region. For ease handling of this
340				// case, we create a fake region description with the
341				// task id. This isn't strictly necessary as this
342				// goroutine may not be associated with the task, but
343				// it can be convenient to see all children created
344				// during a region.
345				//
346				// N.B. ev.Goroutine() will always be NoGoroutine for the
347				// Undetermined case, so this is will simply not fire.
348				if creatorG := s.gs[ev.Goroutine()]; creatorG != nil && len(creatorG.activeRegions) > 0 {
349					regions := creatorG.activeRegions
350					s := regions[len(regions)-1]
351					g.activeRegions = []*UserRegionSummary{{TaskID: s.TaskID, Start: ev}}
352				}
353				s.gs[g.ID] = g
354			case GoRunning:
355				// Record execution time as we transition out of running
356				g.ExecTime += ev.Time().Sub(g.lastStartTime)
357				g.lastStartTime = 0
358			case GoWaiting:
359				// Record block time as we transition out of waiting.
360				if g.lastBlockTime != 0 {
361					g.BlockTimeByReason[g.lastBlockReason] += ev.Time().Sub(g.lastBlockTime)
362					g.lastBlockTime = 0
363				}
364			case GoRunnable:
365				// Record sched latency time as we transition out of runnable.
366				if g.lastRunnableTime != 0 {
367					g.SchedWaitTime += ev.Time().Sub(g.lastRunnableTime)
368					g.lastRunnableTime = 0
369				}
370			case GoSyscall:
371				// Record syscall execution time and syscall block time as we transition out of syscall.
372				if g.lastSyscallTime != 0 {
373					if g.lastSyscallBlockTime != 0 {
374						g.SyscallBlockTime += ev.Time().Sub(g.lastSyscallBlockTime)
375						g.SyscallTime += g.lastSyscallBlockTime.Sub(g.lastSyscallTime)
376					} else {
377						g.SyscallTime += ev.Time().Sub(g.lastSyscallTime)
378					}
379					g.lastSyscallTime = 0
380					g.lastSyscallBlockTime = 0
381
382					// Clear the syscall map.
383					delete(s.syscallingP, s.syscallingG[id])
384					delete(s.syscallingG, id)
385				}
386			}
387
388			// The goroutine hasn't been identified yet. Take the transition stack
389			// and identify the goroutine by the root frame of that stack.
390			// This root frame will be identical for all transitions on this
391			// goroutine, because it represents its immutable start point.
392			if g.Name == "" {
393				stk := st.Stack
394				if stk != NoStack {
395					var frame StackFrame
396					var ok bool
397					stk.Frames(func(f StackFrame) bool {
398						frame = f
399						ok = true
400						return true
401					})
402					if ok {
403						// NB: this PC won't actually be consistent for
404						// goroutines which existed at the start of the
405						// trace. The UI doesn't use it directly; this
406						// mainly serves as an indication that we
407						// actually saw a call stack for the goroutine
408						g.PC = frame.PC
409						g.Name = frame.Func
410					}
411				}
412			}
413
414			// Handle transition in.
415			switch new {
416			case GoRunning:
417				// We started running. Record it.
418				g.lastStartTime = ev.Time()
419				if g.StartTime == 0 {
420					g.StartTime = ev.Time()
421				}
422			case GoRunnable:
423				g.lastRunnableTime = ev.Time()
424			case GoWaiting:
425				if st.Reason != "forever" {
426					g.lastBlockTime = ev.Time()
427					g.lastBlockReason = st.Reason
428					break
429				}
430				// "Forever" is like goroutine death.
431				fallthrough
432			case GoNotExist:
433				g.finalize(ev.Time(), ev)
434			case GoSyscall:
435				s.syscallingP[ev.Proc()] = id
436				s.syscallingG[id] = ev.Proc()
437				g.lastSyscallTime = ev.Time()
438			}
439
440		// Handle procs to detect syscall blocking, which si identifiable as a
441		// proc going idle while the goroutine it was attached to is in a syscall.
442		case ResourceProc:
443			id := st.Resource.Proc()
444			old, new := st.Proc()
445			if old != new && new == ProcIdle {
446				if goid, ok := s.syscallingP[id]; ok {
447					g := s.gs[goid]
448					g.lastSyscallBlockTime = ev.Time()
449					delete(s.syscallingP, id)
450				}
451			}
452		}
453
454	// Handle ranges of all kinds.
455	case EventRangeBegin, EventRangeActive:
456		r := ev.Range()
457		var g *GoroutineSummary
458		switch r.Scope.Kind {
459		case ResourceGoroutine:
460			// Simple goroutine range. We attribute the entire range regardless of
461			// goroutine stats. Lots of situations are still identifiable, e.g. a
462			// goroutine blocked often in mark assist will have both high mark assist
463			// and high block times. Those interested in a deeper view can look at the
464			// trace viewer.
465			g = s.gs[r.Scope.Goroutine()]
466		case ResourceProc:
467			// N.B. These ranges are not actually bound to the goroutine, they're
468			// bound to the P. But if we happen to be on the P the whole time, let's
469			// try to attribute it to the goroutine. (e.g. GC sweeps are here.)
470			g = s.gs[ev.Goroutine()]
471			if g != nil {
472				s.rangesP[rangeP{id: r.Scope.Proc(), name: r.Name}] = ev.Goroutine()
473			}
474		}
475		if g == nil {
476			break
477		}
478		if ev.Kind() == EventRangeActive {
479			if ts := g.lastRangeTime[r.Name]; ts != 0 {
480				g.RangeTime[r.Name] += s.syncTs.Sub(ts)
481			}
482			g.lastRangeTime[r.Name] = s.syncTs
483		} else {
484			g.lastRangeTime[r.Name] = ev.Time()
485		}
486	case EventRangeEnd:
487		r := ev.Range()
488		var g *GoroutineSummary
489		switch r.Scope.Kind {
490		case ResourceGoroutine:
491			g = s.gs[r.Scope.Goroutine()]
492		case ResourceProc:
493			rp := rangeP{id: r.Scope.Proc(), name: r.Name}
494			if goid, ok := s.rangesP[rp]; ok {
495				if goid == ev.Goroutine() {
496					// As the comment in the RangeBegin case states, this is only OK
497					// if we finish on the same goroutine we started on.
498					g = s.gs[goid]
499				}
500				delete(s.rangesP, rp)
501			}
502		}
503		if g == nil {
504			break
505		}
506		ts := g.lastRangeTime[r.Name]
507		if ts == 0 {
508			break
509		}
510		g.RangeTime[r.Name] += ev.Time().Sub(ts)
511		delete(g.lastRangeTime, r.Name)
512
513	// Handle user-defined regions.
514	case EventRegionBegin:
515		g := s.gs[ev.Goroutine()]
516		r := ev.Region()
517		region := &UserRegionSummary{
518			Name:               r.Type,
519			TaskID:             r.Task,
520			Start:              ev,
521			GoroutineExecStats: g.snapshotStat(ev.Time()),
522		}
523		g.activeRegions = append(g.activeRegions, region)
524		// Associate the region and current goroutine to the task.
525		task := s.getOrAddTask(r.Task)
526		task.Regions = append(task.Regions, region)
527		task.Goroutines[g.ID] = g
528	case EventRegionEnd:
529		g := s.gs[ev.Goroutine()]
530		r := ev.Region()
531		var sd *UserRegionSummary
532		if regionStk := g.activeRegions; len(regionStk) > 0 {
533			// Pop the top region from the stack since that's what must have ended.
534			n := len(regionStk)
535			sd = regionStk[n-1]
536			regionStk = regionStk[:n-1]
537			g.activeRegions = regionStk
538			// N.B. No need to add the region to a task; the EventRegionBegin already handled it.
539		} else {
540			// This is an "end" without a start. Just fabricate the region now.
541			sd = &UserRegionSummary{Name: r.Type, TaskID: r.Task}
542			// Associate the region and current goroutine to the task.
543			task := s.getOrAddTask(r.Task)
544			task.Goroutines[g.ID] = g
545			task.Regions = append(task.Regions, sd)
546		}
547		sd.GoroutineExecStats = g.snapshotStat(ev.Time()).sub(sd.GoroutineExecStats)
548		sd.End = ev
549		g.Regions = append(g.Regions, sd)
550
551	// Handle tasks and logs.
552	case EventTaskBegin, EventTaskEnd:
553		// Initialize the task.
554		t := ev.Task()
555		task := s.getOrAddTask(t.ID)
556		task.Name = t.Type
557		task.Goroutines[ev.Goroutine()] = s.gs[ev.Goroutine()]
558		if ev.Kind() == EventTaskBegin {
559			task.Start = ev
560		} else {
561			task.End = ev
562		}
563		// Initialize the parent, if one exists and it hasn't been done yet.
564		// We need to avoid doing it twice, otherwise we could appear twice
565		// in the parent's Children list.
566		if t.Parent != NoTask && task.Parent == nil {
567			parent := s.getOrAddTask(t.Parent)
568			task.Parent = parent
569			parent.Children = append(parent.Children, task)
570		}
571	case EventLog:
572		log := ev.Log()
573		// Just add the log to the task. We'll create the task if it
574		// doesn't exist (it's just been mentioned now).
575		task := s.getOrAddTask(log.Task)
576		task.Goroutines[ev.Goroutine()] = s.gs[ev.Goroutine()]
577		task.Logs = append(task.Logs, ev)
578	}
579}
580
581func (s *Summarizer) getOrAddTask(id TaskID) *UserTaskSummary {
582	task := s.tasks[id]
583	if task == nil {
584		task = &UserTaskSummary{ID: id, Goroutines: make(map[GoID]*GoroutineSummary)}
585		s.tasks[id] = task
586	}
587	return task
588}
589
590// Finalize indicates to the summarizer that we're done processing the trace.
591// It cleans up any remaining state and returns the full summary.
592func (s *Summarizer) Finalize() *Summary {
593	for _, g := range s.gs {
594		g.finalize(s.lastTs, nil)
595
596		// Sort based on region start time.
597		slices.SortFunc(g.Regions, func(a, b *UserRegionSummary) int {
598			x := a.Start
599			y := b.Start
600			if x == nil {
601				if y == nil {
602					return 0
603				}
604				return -1
605			}
606			if y == nil {
607				return +1
608			}
609			return cmp.Compare(x.Time(), y.Time())
610		})
611		g.goroutineSummary = nil
612	}
613	return &Summary{
614		Goroutines: s.gs,
615		Tasks:      s.tasks,
616	}
617}
618
619// RelatedGoroutinesV2 finds a set of goroutines related to goroutine goid for v2 traces.
620// The association is based on whether they have synchronized with each other in the Go
621// scheduler (one has unblocked another).
622func RelatedGoroutinesV2(events []Event, goid GoID) map[GoID]struct{} {
623	// Process all the events, looking for transitions of goroutines
624	// out of GoWaiting. If there was an active goroutine when this
625	// happened, then we know that active goroutine unblocked another.
626	// Scribble all these down so we can process them.
627	type unblockEdge struct {
628		operator GoID
629		operand  GoID
630	}
631	var unblockEdges []unblockEdge
632	for _, ev := range events {
633		if ev.Goroutine() == NoGoroutine {
634			continue
635		}
636		if ev.Kind() != EventStateTransition {
637			continue
638		}
639		st := ev.StateTransition()
640		if st.Resource.Kind != ResourceGoroutine {
641			continue
642		}
643		id := st.Resource.Goroutine()
644		old, new := st.Goroutine()
645		if old == new || old != GoWaiting {
646			continue
647		}
648		unblockEdges = append(unblockEdges, unblockEdge{
649			operator: ev.Goroutine(),
650			operand:  id,
651		})
652	}
653	// Compute the transitive closure of depth 2 of goroutines that have unblocked each other
654	// (starting from goid).
655	gmap := make(map[GoID]struct{})
656	gmap[goid] = struct{}{}
657	for i := 0; i < 2; i++ {
658		// Copy the map.
659		gmap1 := make(map[GoID]struct{})
660		for g := range gmap {
661			gmap1[g] = struct{}{}
662		}
663		for _, edge := range unblockEdges {
664			if _, ok := gmap[edge.operand]; ok {
665				gmap1[edge.operator] = struct{}{}
666			}
667		}
668		gmap = gmap1
669	}
670	return gmap
671}
672
673func IsSystemGoroutine(entryFn string) bool {
674	// This mimics runtime.isSystemGoroutine as closely as
675	// possible.
676	// Also, locked g in extra M (with empty entryFn) is system goroutine.
677	return entryFn == "" || entryFn != "runtime.main" && strings.HasPrefix(entryFn, "runtime.")
678}
679