1// Copyright 2023 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5package trace 6 7import ( 8 "cmp" 9 "slices" 10 "strings" 11 "time" 12) 13 14// Summary is the analysis result produced by the summarizer. 15type Summary struct { 16 Goroutines map[GoID]*GoroutineSummary 17 Tasks map[TaskID]*UserTaskSummary 18} 19 20// GoroutineSummary contains statistics and execution details of a single goroutine. 21// (For v2 traces.) 22type GoroutineSummary struct { 23 ID GoID 24 Name string // A non-unique human-friendly identifier for the goroutine. 25 PC uint64 // The first PC we saw for the entry function of the goroutine 26 CreationTime Time // Timestamp of the first appearance in the trace. 27 StartTime Time // Timestamp of the first time it started running. 0 if the goroutine never ran. 28 EndTime Time // Timestamp of when the goroutine exited. 0 if the goroutine never exited. 29 30 // List of regions in the goroutine, sorted based on the start time. 31 Regions []*UserRegionSummary 32 33 // Statistics of execution time during the goroutine execution. 34 GoroutineExecStats 35 36 // goroutineSummary is state used just for computing this structure. 37 // It's dropped before being returned to the caller. 38 // 39 // More specifically, if it's nil, it indicates that this summary has 40 // already been finalized. 41 *goroutineSummary 42} 43 44// UserTaskSummary represents a task in the trace. 45type UserTaskSummary struct { 46 ID TaskID 47 Name string 48 Parent *UserTaskSummary // nil if the parent is unknown. 49 Children []*UserTaskSummary 50 51 // Task begin event. An EventTaskBegin event or nil. 52 Start *Event 53 54 // End end event. Normally EventTaskEnd event or nil. 55 End *Event 56 57 // Logs is a list of EventLog events associated with the task. 58 Logs []*Event 59 60 // List of regions in the task, sorted based on the start time. 61 Regions []*UserRegionSummary 62 63 // Goroutines is the set of goroutines associated with this task. 64 Goroutines map[GoID]*GoroutineSummary 65} 66 67// Complete returns true if we have complete information about the task 68// from the trace: both a start and an end. 69func (s *UserTaskSummary) Complete() bool { 70 return s.Start != nil && s.End != nil 71} 72 73// Descendents returns a slice consisting of itself (always the first task returned), 74// and the transitive closure of all of its children. 75func (s *UserTaskSummary) Descendents() []*UserTaskSummary { 76 descendents := []*UserTaskSummary{s} 77 for _, child := range s.Children { 78 descendents = append(descendents, child.Descendents()...) 79 } 80 return descendents 81} 82 83// UserRegionSummary represents a region and goroutine execution stats 84// while the region was active. (For v2 traces.) 85type UserRegionSummary struct { 86 TaskID TaskID 87 Name string 88 89 // Region start event. Normally EventRegionBegin event or nil, 90 // but can be a state transition event from NotExist or Undetermined 91 // if the region is a synthetic region representing task inheritance 92 // from the parent goroutine. 93 Start *Event 94 95 // Region end event. Normally EventRegionEnd event or nil, 96 // but can be a state transition event to NotExist if the goroutine 97 // terminated without explicitly ending the region. 98 End *Event 99 100 GoroutineExecStats 101} 102 103// GoroutineExecStats contains statistics about a goroutine's execution 104// during a period of time. 105type GoroutineExecStats struct { 106 // These stats are all non-overlapping. 107 ExecTime time.Duration 108 SchedWaitTime time.Duration 109 BlockTimeByReason map[string]time.Duration 110 SyscallTime time.Duration 111 SyscallBlockTime time.Duration 112 113 // TotalTime is the duration of the goroutine's presence in the trace. 114 // Necessarily overlaps with other stats. 115 TotalTime time.Duration 116 117 // Total time the goroutine spent in certain ranges; may overlap 118 // with other stats. 119 RangeTime map[string]time.Duration 120} 121 122func (s GoroutineExecStats) NonOverlappingStats() map[string]time.Duration { 123 stats := map[string]time.Duration{ 124 "Execution time": s.ExecTime, 125 "Sched wait time": s.SchedWaitTime, 126 "Syscall execution time": s.SyscallTime, 127 "Block time (syscall)": s.SyscallBlockTime, 128 "Unknown time": s.UnknownTime(), 129 } 130 for reason, dt := range s.BlockTimeByReason { 131 stats["Block time ("+reason+")"] += dt 132 } 133 // N.B. Don't include RangeTime or TotalTime; they overlap with these other 134 // stats. 135 return stats 136} 137 138// UnknownTime returns whatever isn't accounted for in TotalTime. 139func (s GoroutineExecStats) UnknownTime() time.Duration { 140 sum := s.ExecTime + s.SchedWaitTime + s.SyscallTime + 141 s.SyscallBlockTime 142 for _, dt := range s.BlockTimeByReason { 143 sum += dt 144 } 145 // N.B. Don't include range time. Ranges overlap with 146 // other stats, whereas these stats are non-overlapping. 147 if sum < s.TotalTime { 148 return s.TotalTime - sum 149 } 150 return 0 151} 152 153// sub returns the stats v-s. 154func (s GoroutineExecStats) sub(v GoroutineExecStats) (r GoroutineExecStats) { 155 r = s.clone() 156 r.ExecTime -= v.ExecTime 157 r.SchedWaitTime -= v.SchedWaitTime 158 for reason := range s.BlockTimeByReason { 159 r.BlockTimeByReason[reason] -= v.BlockTimeByReason[reason] 160 } 161 r.SyscallTime -= v.SyscallTime 162 r.SyscallBlockTime -= v.SyscallBlockTime 163 r.TotalTime -= v.TotalTime 164 for name := range s.RangeTime { 165 r.RangeTime[name] -= v.RangeTime[name] 166 } 167 return r 168} 169 170func (s GoroutineExecStats) clone() (r GoroutineExecStats) { 171 r = s 172 r.BlockTimeByReason = make(map[string]time.Duration) 173 for reason, dt := range s.BlockTimeByReason { 174 r.BlockTimeByReason[reason] = dt 175 } 176 r.RangeTime = make(map[string]time.Duration) 177 for name, dt := range s.RangeTime { 178 r.RangeTime[name] = dt 179 } 180 return r 181} 182 183// snapshotStat returns the snapshot of the goroutine execution statistics. 184// This is called as we process the ordered trace event stream. lastTs is used 185// to process pending statistics if this is called before any goroutine end event. 186func (g *GoroutineSummary) snapshotStat(lastTs Time) (ret GoroutineExecStats) { 187 ret = g.GoroutineExecStats.clone() 188 189 if g.goroutineSummary == nil { 190 return ret // Already finalized; no pending state. 191 } 192 193 // Set the total time if necessary. 194 if g.TotalTime == 0 { 195 ret.TotalTime = lastTs.Sub(g.CreationTime) 196 } 197 198 // Add in time since lastTs. 199 if g.lastStartTime != 0 { 200 ret.ExecTime += lastTs.Sub(g.lastStartTime) 201 } 202 if g.lastRunnableTime != 0 { 203 ret.SchedWaitTime += lastTs.Sub(g.lastRunnableTime) 204 } 205 if g.lastBlockTime != 0 { 206 ret.BlockTimeByReason[g.lastBlockReason] += lastTs.Sub(g.lastBlockTime) 207 } 208 if g.lastSyscallTime != 0 { 209 ret.SyscallTime += lastTs.Sub(g.lastSyscallTime) 210 } 211 if g.lastSyscallBlockTime != 0 { 212 ret.SchedWaitTime += lastTs.Sub(g.lastSyscallBlockTime) 213 } 214 for name, ts := range g.lastRangeTime { 215 ret.RangeTime[name] += lastTs.Sub(ts) 216 } 217 return ret 218} 219 220// finalize is called when processing a goroutine end event or at 221// the end of trace processing. This finalizes the execution stat 222// and any active regions in the goroutine, in which case trigger is nil. 223func (g *GoroutineSummary) finalize(lastTs Time, trigger *Event) { 224 if trigger != nil { 225 g.EndTime = trigger.Time() 226 } 227 finalStat := g.snapshotStat(lastTs) 228 229 g.GoroutineExecStats = finalStat 230 231 // System goroutines are never part of regions, even though they 232 // "inherit" a task due to creation (EvGoCreate) from within a region. 233 // This may happen e.g. if the first GC is triggered within a region, 234 // starting the GC worker goroutines. 235 if !IsSystemGoroutine(g.Name) { 236 for _, s := range g.activeRegions { 237 s.End = trigger 238 s.GoroutineExecStats = finalStat.sub(s.GoroutineExecStats) 239 g.Regions = append(g.Regions, s) 240 } 241 } 242 *(g.goroutineSummary) = goroutineSummary{} 243} 244 245// goroutineSummary is a private part of GoroutineSummary that is required only during analysis. 246type goroutineSummary struct { 247 lastStartTime Time 248 lastRunnableTime Time 249 lastBlockTime Time 250 lastBlockReason string 251 lastSyscallTime Time 252 lastSyscallBlockTime Time 253 lastRangeTime map[string]Time 254 activeRegions []*UserRegionSummary // stack of active regions 255} 256 257// Summarizer constructs per-goroutine time statistics for v2 traces. 258type Summarizer struct { 259 // gs contains the map of goroutine summaries we're building up to return to the caller. 260 gs map[GoID]*GoroutineSummary 261 262 // tasks contains the map of task summaries we're building up to return to the caller. 263 tasks map[TaskID]*UserTaskSummary 264 265 // syscallingP and syscallingG represent a binding between a P and G in a syscall. 266 // Used to correctly identify and clean up after syscalls (blocking or otherwise). 267 syscallingP map[ProcID]GoID 268 syscallingG map[GoID]ProcID 269 270 // rangesP is used for optimistic tracking of P-based ranges for goroutines. 271 // 272 // It's a best-effort mapping of an active range on a P to the goroutine we think 273 // is associated with it. 274 rangesP map[rangeP]GoID 275 276 lastTs Time // timestamp of the last event processed. 277 syncTs Time // timestamp of the last sync event processed (or the first timestamp in the trace). 278} 279 280// NewSummarizer creates a new struct to build goroutine stats from a trace. 281func NewSummarizer() *Summarizer { 282 return &Summarizer{ 283 gs: make(map[GoID]*GoroutineSummary), 284 tasks: make(map[TaskID]*UserTaskSummary), 285 syscallingP: make(map[ProcID]GoID), 286 syscallingG: make(map[GoID]ProcID), 287 rangesP: make(map[rangeP]GoID), 288 } 289} 290 291type rangeP struct { 292 id ProcID 293 name string 294} 295 296// Event feeds a single event into the stats summarizer. 297func (s *Summarizer) Event(ev *Event) { 298 if s.syncTs == 0 { 299 s.syncTs = ev.Time() 300 } 301 s.lastTs = ev.Time() 302 303 switch ev.Kind() { 304 // Record sync time for the RangeActive events. 305 case EventSync: 306 s.syncTs = ev.Time() 307 308 // Handle state transitions. 309 case EventStateTransition: 310 st := ev.StateTransition() 311 switch st.Resource.Kind { 312 // Handle goroutine transitions, which are the meat of this computation. 313 case ResourceGoroutine: 314 id := st.Resource.Goroutine() 315 old, new := st.Goroutine() 316 if old == new { 317 // Skip these events; they're not telling us anything new. 318 break 319 } 320 321 // Handle transition out. 322 g := s.gs[id] 323 switch old { 324 case GoUndetermined, GoNotExist: 325 g = &GoroutineSummary{ID: id, goroutineSummary: &goroutineSummary{}} 326 // If we're coming out of GoUndetermined, then the creation time is the 327 // time of the last sync. 328 if old == GoUndetermined { 329 g.CreationTime = s.syncTs 330 } else { 331 g.CreationTime = ev.Time() 332 } 333 // The goroutine is being created, or it's being named for the first time. 334 g.lastRangeTime = make(map[string]Time) 335 g.BlockTimeByReason = make(map[string]time.Duration) 336 g.RangeTime = make(map[string]time.Duration) 337 338 // When a goroutine is newly created, inherit the task 339 // of the active region. For ease handling of this 340 // case, we create a fake region description with the 341 // task id. This isn't strictly necessary as this 342 // goroutine may not be associated with the task, but 343 // it can be convenient to see all children created 344 // during a region. 345 // 346 // N.B. ev.Goroutine() will always be NoGoroutine for the 347 // Undetermined case, so this is will simply not fire. 348 if creatorG := s.gs[ev.Goroutine()]; creatorG != nil && len(creatorG.activeRegions) > 0 { 349 regions := creatorG.activeRegions 350 s := regions[len(regions)-1] 351 g.activeRegions = []*UserRegionSummary{{TaskID: s.TaskID, Start: ev}} 352 } 353 s.gs[g.ID] = g 354 case GoRunning: 355 // Record execution time as we transition out of running 356 g.ExecTime += ev.Time().Sub(g.lastStartTime) 357 g.lastStartTime = 0 358 case GoWaiting: 359 // Record block time as we transition out of waiting. 360 if g.lastBlockTime != 0 { 361 g.BlockTimeByReason[g.lastBlockReason] += ev.Time().Sub(g.lastBlockTime) 362 g.lastBlockTime = 0 363 } 364 case GoRunnable: 365 // Record sched latency time as we transition out of runnable. 366 if g.lastRunnableTime != 0 { 367 g.SchedWaitTime += ev.Time().Sub(g.lastRunnableTime) 368 g.lastRunnableTime = 0 369 } 370 case GoSyscall: 371 // Record syscall execution time and syscall block time as we transition out of syscall. 372 if g.lastSyscallTime != 0 { 373 if g.lastSyscallBlockTime != 0 { 374 g.SyscallBlockTime += ev.Time().Sub(g.lastSyscallBlockTime) 375 g.SyscallTime += g.lastSyscallBlockTime.Sub(g.lastSyscallTime) 376 } else { 377 g.SyscallTime += ev.Time().Sub(g.lastSyscallTime) 378 } 379 g.lastSyscallTime = 0 380 g.lastSyscallBlockTime = 0 381 382 // Clear the syscall map. 383 delete(s.syscallingP, s.syscallingG[id]) 384 delete(s.syscallingG, id) 385 } 386 } 387 388 // The goroutine hasn't been identified yet. Take the transition stack 389 // and identify the goroutine by the root frame of that stack. 390 // This root frame will be identical for all transitions on this 391 // goroutine, because it represents its immutable start point. 392 if g.Name == "" { 393 stk := st.Stack 394 if stk != NoStack { 395 var frame StackFrame 396 var ok bool 397 stk.Frames(func(f StackFrame) bool { 398 frame = f 399 ok = true 400 return true 401 }) 402 if ok { 403 // NB: this PC won't actually be consistent for 404 // goroutines which existed at the start of the 405 // trace. The UI doesn't use it directly; this 406 // mainly serves as an indication that we 407 // actually saw a call stack for the goroutine 408 g.PC = frame.PC 409 g.Name = frame.Func 410 } 411 } 412 } 413 414 // Handle transition in. 415 switch new { 416 case GoRunning: 417 // We started running. Record it. 418 g.lastStartTime = ev.Time() 419 if g.StartTime == 0 { 420 g.StartTime = ev.Time() 421 } 422 case GoRunnable: 423 g.lastRunnableTime = ev.Time() 424 case GoWaiting: 425 if st.Reason != "forever" { 426 g.lastBlockTime = ev.Time() 427 g.lastBlockReason = st.Reason 428 break 429 } 430 // "Forever" is like goroutine death. 431 fallthrough 432 case GoNotExist: 433 g.finalize(ev.Time(), ev) 434 case GoSyscall: 435 s.syscallingP[ev.Proc()] = id 436 s.syscallingG[id] = ev.Proc() 437 g.lastSyscallTime = ev.Time() 438 } 439 440 // Handle procs to detect syscall blocking, which si identifiable as a 441 // proc going idle while the goroutine it was attached to is in a syscall. 442 case ResourceProc: 443 id := st.Resource.Proc() 444 old, new := st.Proc() 445 if old != new && new == ProcIdle { 446 if goid, ok := s.syscallingP[id]; ok { 447 g := s.gs[goid] 448 g.lastSyscallBlockTime = ev.Time() 449 delete(s.syscallingP, id) 450 } 451 } 452 } 453 454 // Handle ranges of all kinds. 455 case EventRangeBegin, EventRangeActive: 456 r := ev.Range() 457 var g *GoroutineSummary 458 switch r.Scope.Kind { 459 case ResourceGoroutine: 460 // Simple goroutine range. We attribute the entire range regardless of 461 // goroutine stats. Lots of situations are still identifiable, e.g. a 462 // goroutine blocked often in mark assist will have both high mark assist 463 // and high block times. Those interested in a deeper view can look at the 464 // trace viewer. 465 g = s.gs[r.Scope.Goroutine()] 466 case ResourceProc: 467 // N.B. These ranges are not actually bound to the goroutine, they're 468 // bound to the P. But if we happen to be on the P the whole time, let's 469 // try to attribute it to the goroutine. (e.g. GC sweeps are here.) 470 g = s.gs[ev.Goroutine()] 471 if g != nil { 472 s.rangesP[rangeP{id: r.Scope.Proc(), name: r.Name}] = ev.Goroutine() 473 } 474 } 475 if g == nil { 476 break 477 } 478 if ev.Kind() == EventRangeActive { 479 if ts := g.lastRangeTime[r.Name]; ts != 0 { 480 g.RangeTime[r.Name] += s.syncTs.Sub(ts) 481 } 482 g.lastRangeTime[r.Name] = s.syncTs 483 } else { 484 g.lastRangeTime[r.Name] = ev.Time() 485 } 486 case EventRangeEnd: 487 r := ev.Range() 488 var g *GoroutineSummary 489 switch r.Scope.Kind { 490 case ResourceGoroutine: 491 g = s.gs[r.Scope.Goroutine()] 492 case ResourceProc: 493 rp := rangeP{id: r.Scope.Proc(), name: r.Name} 494 if goid, ok := s.rangesP[rp]; ok { 495 if goid == ev.Goroutine() { 496 // As the comment in the RangeBegin case states, this is only OK 497 // if we finish on the same goroutine we started on. 498 g = s.gs[goid] 499 } 500 delete(s.rangesP, rp) 501 } 502 } 503 if g == nil { 504 break 505 } 506 ts := g.lastRangeTime[r.Name] 507 if ts == 0 { 508 break 509 } 510 g.RangeTime[r.Name] += ev.Time().Sub(ts) 511 delete(g.lastRangeTime, r.Name) 512 513 // Handle user-defined regions. 514 case EventRegionBegin: 515 g := s.gs[ev.Goroutine()] 516 r := ev.Region() 517 region := &UserRegionSummary{ 518 Name: r.Type, 519 TaskID: r.Task, 520 Start: ev, 521 GoroutineExecStats: g.snapshotStat(ev.Time()), 522 } 523 g.activeRegions = append(g.activeRegions, region) 524 // Associate the region and current goroutine to the task. 525 task := s.getOrAddTask(r.Task) 526 task.Regions = append(task.Regions, region) 527 task.Goroutines[g.ID] = g 528 case EventRegionEnd: 529 g := s.gs[ev.Goroutine()] 530 r := ev.Region() 531 var sd *UserRegionSummary 532 if regionStk := g.activeRegions; len(regionStk) > 0 { 533 // Pop the top region from the stack since that's what must have ended. 534 n := len(regionStk) 535 sd = regionStk[n-1] 536 regionStk = regionStk[:n-1] 537 g.activeRegions = regionStk 538 // N.B. No need to add the region to a task; the EventRegionBegin already handled it. 539 } else { 540 // This is an "end" without a start. Just fabricate the region now. 541 sd = &UserRegionSummary{Name: r.Type, TaskID: r.Task} 542 // Associate the region and current goroutine to the task. 543 task := s.getOrAddTask(r.Task) 544 task.Goroutines[g.ID] = g 545 task.Regions = append(task.Regions, sd) 546 } 547 sd.GoroutineExecStats = g.snapshotStat(ev.Time()).sub(sd.GoroutineExecStats) 548 sd.End = ev 549 g.Regions = append(g.Regions, sd) 550 551 // Handle tasks and logs. 552 case EventTaskBegin, EventTaskEnd: 553 // Initialize the task. 554 t := ev.Task() 555 task := s.getOrAddTask(t.ID) 556 task.Name = t.Type 557 task.Goroutines[ev.Goroutine()] = s.gs[ev.Goroutine()] 558 if ev.Kind() == EventTaskBegin { 559 task.Start = ev 560 } else { 561 task.End = ev 562 } 563 // Initialize the parent, if one exists and it hasn't been done yet. 564 // We need to avoid doing it twice, otherwise we could appear twice 565 // in the parent's Children list. 566 if t.Parent != NoTask && task.Parent == nil { 567 parent := s.getOrAddTask(t.Parent) 568 task.Parent = parent 569 parent.Children = append(parent.Children, task) 570 } 571 case EventLog: 572 log := ev.Log() 573 // Just add the log to the task. We'll create the task if it 574 // doesn't exist (it's just been mentioned now). 575 task := s.getOrAddTask(log.Task) 576 task.Goroutines[ev.Goroutine()] = s.gs[ev.Goroutine()] 577 task.Logs = append(task.Logs, ev) 578 } 579} 580 581func (s *Summarizer) getOrAddTask(id TaskID) *UserTaskSummary { 582 task := s.tasks[id] 583 if task == nil { 584 task = &UserTaskSummary{ID: id, Goroutines: make(map[GoID]*GoroutineSummary)} 585 s.tasks[id] = task 586 } 587 return task 588} 589 590// Finalize indicates to the summarizer that we're done processing the trace. 591// It cleans up any remaining state and returns the full summary. 592func (s *Summarizer) Finalize() *Summary { 593 for _, g := range s.gs { 594 g.finalize(s.lastTs, nil) 595 596 // Sort based on region start time. 597 slices.SortFunc(g.Regions, func(a, b *UserRegionSummary) int { 598 x := a.Start 599 y := b.Start 600 if x == nil { 601 if y == nil { 602 return 0 603 } 604 return -1 605 } 606 if y == nil { 607 return +1 608 } 609 return cmp.Compare(x.Time(), y.Time()) 610 }) 611 g.goroutineSummary = nil 612 } 613 return &Summary{ 614 Goroutines: s.gs, 615 Tasks: s.tasks, 616 } 617} 618 619// RelatedGoroutinesV2 finds a set of goroutines related to goroutine goid for v2 traces. 620// The association is based on whether they have synchronized with each other in the Go 621// scheduler (one has unblocked another). 622func RelatedGoroutinesV2(events []Event, goid GoID) map[GoID]struct{} { 623 // Process all the events, looking for transitions of goroutines 624 // out of GoWaiting. If there was an active goroutine when this 625 // happened, then we know that active goroutine unblocked another. 626 // Scribble all these down so we can process them. 627 type unblockEdge struct { 628 operator GoID 629 operand GoID 630 } 631 var unblockEdges []unblockEdge 632 for _, ev := range events { 633 if ev.Goroutine() == NoGoroutine { 634 continue 635 } 636 if ev.Kind() != EventStateTransition { 637 continue 638 } 639 st := ev.StateTransition() 640 if st.Resource.Kind != ResourceGoroutine { 641 continue 642 } 643 id := st.Resource.Goroutine() 644 old, new := st.Goroutine() 645 if old == new || old != GoWaiting { 646 continue 647 } 648 unblockEdges = append(unblockEdges, unblockEdge{ 649 operator: ev.Goroutine(), 650 operand: id, 651 }) 652 } 653 // Compute the transitive closure of depth 2 of goroutines that have unblocked each other 654 // (starting from goid). 655 gmap := make(map[GoID]struct{}) 656 gmap[goid] = struct{}{} 657 for i := 0; i < 2; i++ { 658 // Copy the map. 659 gmap1 := make(map[GoID]struct{}) 660 for g := range gmap { 661 gmap1[g] = struct{}{} 662 } 663 for _, edge := range unblockEdges { 664 if _, ok := gmap[edge.operand]; ok { 665 gmap1[edge.operator] = struct{}{} 666 } 667 } 668 gmap = gmap1 669 } 670 return gmap 671} 672 673func IsSystemGoroutine(entryFn string) bool { 674 // This mimics runtime.isSystemGoroutine as closely as 675 // possible. 676 // Also, locked g in extra M (with empty entryFn) is system goroutine. 677 return entryFn == "" || entryFn != "runtime.main" && strings.HasPrefix(entryFn, "runtime.") 678} 679