// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package runtime

import (
	"internal/abi"
	"internal/cpu"
	"internal/goarch"
	"internal/goos"
	"internal/runtime/atomic"
	"internal/runtime/exithook"
	"internal/stringslite"
	"runtime/internal/sys"
	"unsafe"
)

// set using cmd/go/internal/modload.ModInfoProg
var modinfo string

// Goroutine scheduler
// The scheduler's job is to distribute ready-to-run goroutines over worker threads.
//
// The main concepts are:
// G - goroutine.
// M - worker thread, or machine.
// P - processor, a resource that is required to execute Go code.
//     M must have an associated P to execute Go code, however it can be
//     blocked or in a syscall w/o an associated P.
//
// Design doc at https://golang.org/s/go11sched.

// Worker thread parking/unparking.
// We need to balance between keeping enough running worker threads to utilize
// available hardware parallelism and parking excessive running worker threads
// to conserve CPU resources and power. This is not simple for two reasons:
// (1) scheduler state is intentionally distributed (in particular, per-P work
// queues), so it is not possible to compute global predicates on fast paths;
// (2) for optimal thread management we would need to know the future (don't park
// a worker thread when a new goroutine will be readied in near future).
//
// Three rejected approaches that would work badly:
// 1. Centralize all scheduler state (would inhibit scalability).
// 2. Direct goroutine handoff. That is, when we ready a new goroutine and there
//    is a spare P, unpark a thread and hand off the P and the goroutine to it.
//    This would lead to thread state thrashing, as the thread that readied the
//    goroutine can be out of work the very next moment, and we would need to park it.
//    Also, it would destroy locality of computation as we want to preserve
//    dependent goroutines on the same thread; and introduce additional latency.
// 3. Unpark an additional thread whenever we ready a goroutine and there is an
//    idle P, but don't do handoff. This would lead to excessive thread parking/
//    unparking as the additional threads will instantly park without discovering
//    any work to do.
//
// The current approach:
//
// This approach applies to three primary sources of potential work: readying a
// goroutine, new/modified-earlier timers, and idle-priority GC. See below for
// additional details.
//
// We unpark an additional thread when we submit work if (this is wakep()):
// 1. There is an idle P, and
// 2. There are no "spinning" worker threads.
//
// A worker thread is considered spinning if it is out of local work and did
// not find work in the global run queue or netpoller; the spinning state is
// denoted in m.spinning and in sched.nmspinning. Threads unparked this way are
// also considered spinning; we don't do goroutine handoff so such threads are
// out of work initially. Spinning threads spin on looking for work in per-P
// run queues and timer heaps or from the GC before parking. If a spinning
// thread finds work it takes itself out of the spinning state and proceeds to
// execution. If it does not find work it takes itself out of the spinning
// state and then parks.
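//
// As an illustrative sketch only (simplified; the real, race-aware check is
// in wakep), the "unpark on submit" condition above amounts to:
//
//	if sched.npidle.Load() > 0 && sched.nmspinning.Load() == 0 {
//		// unpark (or create) one worker thread in the spinning state
//	}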
//
// If there is at least one spinning thread (sched.nmspinning>0), we don't
// unpark new threads when submitting work. To compensate for that, if the last
// spinning thread finds work and stops spinning, it must unpark a new spinning
// thread. This approach smooths out unjustified spikes of thread unparking,
// but at the same time guarantees eventual maximal CPU parallelism
// utilization.
//
// The main implementation complication is that we need to be very careful
// during spinning->non-spinning thread transition. This transition can race
// with submission of new work, and either one part or another needs to unpark
// another worker thread. If they both fail to do that, we can end up with
// semi-persistent CPU underutilization.
//
// The general pattern for submission is:
// 1. Submit work to the local or global run queue, timer heap, or GC state.
// 2. #StoreLoad-style memory barrier.
// 3. Check sched.nmspinning.
//
// The general pattern for spinning->non-spinning transition is:
// 1. Decrement nmspinning.
// 2. #StoreLoad-style memory barrier.
// 3. Check all per-P work queues and GC for new work.
//
// Note that all this complexity does not apply to the global run queue as we are
// not sloppy about thread unparking when submitting to the global queue. Also see
// comments for nmspinning manipulation.
//
// How these different sources of work behave varies, though it doesn't affect
// the synchronization approach:
// * Ready goroutine: this is an obvious source of work; the goroutine is
//   immediately ready and must run on some thread eventually.
// * New/modified-earlier timer: The current timer implementation (see time.go)
//   uses netpoll in a thread with no work available to wait for the soonest
//   timer. If there is no thread waiting, we want a new spinning thread to go
//   wait.
// * Idle-priority GC: The GC wakes a stopped idle thread to contribute to
//   background GC work (note: currently disabled per golang.org/issue/19112).
//   Also see golang.org/issue/44313, as this should be extended to all GC
//   workers.

var (
	m0           m
	g0           g
	mcache0      *mcache
	raceprocctx0 uintptr
	raceFiniLock mutex
)

// This slice records the initializing tasks that need to be
// done to start up the runtime. It is built by the linker.
var runtime_inittasks []*initTask

// main_init_done is a signal used by cgocallbackg that initialization
// has been completed. It is made before _cgo_notify_runtime_init_done,
// so all cgo calls can rely on it existing. When main_init is complete,
// it is closed, meaning cgocallbackg can reliably receive from it.
var main_init_done chan bool

//go:linkname main_main main.main
func main_main()

// mainStarted indicates that the main M has started.
var mainStarted bool

// runtimeInitTime is the nanotime() at which the runtime started.
var runtimeInitTime int64

// Value to use for signal mask for newly created M's.
var initSigmask sigset

// The main goroutine.
func main() {
	mp := getg().m

	// Racectx of m0->g0 is used only as the parent of the main goroutine.
	// It must not be used for anything else.
	mp.g0.racectx = 0

	// Max stack size is 1 GB on 64-bit, 250 MB on 32-bit.
	// Using decimal instead of binary GB and MB because
	// they look nicer in the stack overflow failure message.
	if goarch.PtrSize == 8 {
		maxstacksize = 1000000000
	} else {
		maxstacksize = 250000000
	}

	// An upper limit for max stack size. Used to avoid random crashes
	// after calling SetMaxStack and trying to allocate a stack that is too big,
	// since stackalloc works with 32-bit sizes.
	maxstackceiling = 2 * maxstacksize

	// Allow newproc to start new Ms.
	mainStarted = true

	if haveSysmon {
		systemstack(func() {
			newm(sysmon, nil, -1)
		})
	}

	// Lock the main goroutine onto this, the main OS thread,
	// during initialization. Most programs won't care, but a few
	// do require certain calls to be made by the main thread.
	// Those can arrange for main.main to run in the main thread
	// by calling runtime.LockOSThread during initialization
	// to preserve the lock.
	lockOSThread()

	if mp != &m0 {
		throw("runtime.main not on m0")
	}

	// Record when the world started.
	// Must be before doInit for tracing init.
	runtimeInitTime = nanotime()
	if runtimeInitTime == 0 {
		throw("nanotime returning zero")
	}

	if debug.inittrace != 0 {
		inittrace.id = getg().goid
		inittrace.active = true
	}

	doInit(runtime_inittasks) // Must be before defer.

	// Defer unlock so that runtime.Goexit during init does the unlock too.
	needUnlock := true
	defer func() {
		if needUnlock {
			unlockOSThread()
		}
	}()

	gcenable()

	main_init_done = make(chan bool)
	if iscgo {
		if _cgo_pthread_key_created == nil {
			throw("_cgo_pthread_key_created missing")
		}

		if _cgo_thread_start == nil {
			throw("_cgo_thread_start missing")
		}
		if GOOS != "windows" {
			if _cgo_setenv == nil {
				throw("_cgo_setenv missing")
			}
			if _cgo_unsetenv == nil {
				throw("_cgo_unsetenv missing")
			}
		}
		if _cgo_notify_runtime_init_done == nil {
			throw("_cgo_notify_runtime_init_done missing")
		}

		// Set the x_crosscall2_ptr C function pointer variable to point to crosscall2.
		if set_crosscall2 == nil {
			throw("set_crosscall2 missing")
		}
		set_crosscall2()

		// Start the template thread in case we enter Go from
		// a C-created thread and need to create a new thread.
		startTemplateThread()
		cgocall(_cgo_notify_runtime_init_done, nil)
	}

	// Run the initializing tasks. Depending on build mode this
	// list can arrive a few different ways, but it will always
	// contain the init tasks computed by the linker for all the
	// packages in the program (excluding those added at runtime
	// by package plugin). Run through the modules in dependency
	// order (the order they are initialized by the dynamic
	// loader, i.e. they are added to the moduledata linked list).
	for m := &firstmoduledata; m != nil; m = m.next {
		doInit(m.inittasks)
	}

	// Disable init tracing after main init is done to avoid overhead
	// of collecting statistics in malloc and newproc.
	inittrace.active = false

	close(main_init_done)

	needUnlock = false
	unlockOSThread()

	if isarchive || islibrary {
		// A program compiled with -buildmode=c-archive or c-shared
		// has a main, but it is not executed.
		return
	}
	fn := main_main // make an indirect call, as the linker doesn't know the address of the main package when laying down the runtime
	fn()
	if raceenabled {
		runExitHooks(0) // run hooks now, since racefini does not return
		racefini()
	}

	// Make racy client program work: if panicking on
	// another goroutine at the same time as main returns,
	// let the other goroutine finish printing the panic trace.
	// Once it does, it will exit. See issues 3934 and 20018.
	if runningPanicDefers.Load() != 0 {
		// Running deferred functions should not take long.
		for c := 0; c < 1000; c++ {
			if runningPanicDefers.Load() == 0 {
				break
			}
			Gosched()
		}
	}
	if panicking.Load() != 0 {
		gopark(nil, nil, waitReasonPanicWait, traceBlockForever, 1)
	}
	runExitHooks(0)

	exit(0)
	for {
		var x *int32
		*x = 0
	}
}

// os_beforeExit is called from os.Exit(0).
//
//go:linkname os_beforeExit os.runtime_beforeExit
func os_beforeExit(exitCode int) {
	runExitHooks(exitCode)
	if exitCode == 0 && raceenabled {
		racefini()
	}
}

func init() {
	exithook.Gosched = Gosched
	exithook.Goid = func() uint64 { return getg().goid }
	exithook.Throw = throw
}

func runExitHooks(code int) {
	exithook.Run(code)
}

// start forcegc helper goroutine
func init() {
	go forcegchelper()
}

func forcegchelper() {
	forcegc.g = getg()
	lockInit(&forcegc.lock, lockRankForcegc)
	for {
		lock(&forcegc.lock)
		if forcegc.idle.Load() {
			throw("forcegc: phase error")
		}
		forcegc.idle.Store(true)
		goparkunlock(&forcegc.lock, waitReasonForceGCIdle, traceBlockSystemGoroutine, 1)
		// this goroutine is explicitly resumed by sysmon
		if debug.gctrace > 0 {
			println("GC forced")
		}
		// Time-triggered, fully concurrent.
		gcStart(gcTrigger{kind: gcTriggerTime, now: nanotime()})
	}
}

// Gosched yields the processor, allowing other goroutines to run. It does not
// suspend the current goroutine, so execution resumes automatically.
//
//go:nosplit
func Gosched() {
	checkTimeouts()
	mcall(gosched_m)
}

// goschedguarded yields the processor like gosched, but also checks
// for forbidden states and opts out of the yield in those cases.
//
//go:nosplit
func goschedguarded() {
	mcall(goschedguarded_m)
}

// goschedIfBusy yields the processor like gosched, but only does so if
// there are no idle Ps or if we're on the only P and there's nothing in
// the run queue. In both cases, there is freely available idle time.
//
//go:nosplit
func goschedIfBusy() {
	gp := getg()
	// Call gosched if gp.preempt is set; we may be in a tight loop that
	// doesn't otherwise yield.
	if !gp.preempt && sched.npidle.Load() > 0 {
		return
	}
	mcall(gosched_m)
}

// Puts the current goroutine into a waiting state and calls unlockf on the
// system stack.
//
// If unlockf returns false, the goroutine is resumed.
//
// unlockf must not access this G's stack, as it may be moved between
// the call to gopark and the call to unlockf.
//
// Note that because unlockf is called after putting the G into a waiting
// state, the G may have already been readied by the time unlockf is called
// unless there is external synchronization preventing the G from being
// readied. If unlockf returns false, it must guarantee that the G cannot be
// externally readied.
//
// Reason explains why the goroutine has been parked. It is displayed in stack
// traces and heap dumps. Reasons should be unique and descriptive. Do not
// re-use reasons, add new ones.
//
// gopark should be an internal detail,
// but widely used packages access it using linkname.
// Notable members of the hall of shame include:
//   - gvisor.dev/gvisor
//   - github.com/sagernet/gvisor
//
// Do not remove or change the type signature.
// See go.dev/issue/67401.
//
//go:linkname gopark
func gopark(unlockf func(*g, unsafe.Pointer) bool, lock unsafe.Pointer, reason waitReason, traceReason traceBlockReason, traceskip int) {
	if reason != waitReasonSleep {
		checkTimeouts() // timeouts may expire while two goroutines keep the scheduler busy
	}
	mp := acquirem()
	gp := mp.curg
	status := readgstatus(gp)
	if status != _Grunning && status != _Gscanrunning {
		throw("gopark: bad g status")
	}
	mp.waitlock = lock
	mp.waitunlockf = unlockf
	gp.waitreason = reason
	mp.waitTraceBlockReason = traceReason
	mp.waitTraceSkip = traceskip
	releasem(mp)
	// can't do anything that might move the G between Ms here.
	mcall(park_m)
}

// Puts the current goroutine into a waiting state and unlocks the lock.
// The goroutine can be made runnable again by calling goready(gp).
func goparkunlock(lock *mutex, reason waitReason, traceReason traceBlockReason, traceskip int) {
	gopark(parkunlock_c, unsafe.Pointer(lock), reason, traceReason, traceskip)
}

// goready should be an internal detail,
// but widely used packages access it using linkname.
// Notable members of the hall of shame include:
//   - gvisor.dev/gvisor
//   - github.com/sagernet/gvisor
//
// Do not remove or change the type signature.
// See go.dev/issue/67401.
//
//go:linkname goready
func goready(gp *g, traceskip int) {
	systemstack(func() {
		ready(gp, traceskip, true)
	})
}

//go:nosplit
func acquireSudog() *sudog {
	// Delicate dance: the semaphore implementation calls
	// acquireSudog, acquireSudog calls new(sudog),
	// new calls malloc, malloc can call the garbage collector,
	// and the garbage collector calls the semaphore implementation
	// in stopTheWorld.
	// Break the cycle by doing acquirem/releasem around new(sudog).
	// The acquirem/releasem increments m.locks during new(sudog),
	// which keeps the garbage collector from being invoked.
	mp := acquirem()
	pp := mp.p.ptr()
	if len(pp.sudogcache) == 0 {
		lock(&sched.sudoglock)
		// First, try to grab a batch from central cache.
		for len(pp.sudogcache) < cap(pp.sudogcache)/2 && sched.sudogcache != nil {
			s := sched.sudogcache
			sched.sudogcache = s.next
			s.next = nil
			pp.sudogcache = append(pp.sudogcache, s)
		}
		unlock(&sched.sudoglock)
		// If the central cache is empty, allocate a new one.
		if len(pp.sudogcache) == 0 {
			pp.sudogcache = append(pp.sudogcache, new(sudog))
		}
	}
	n := len(pp.sudogcache)
	s := pp.sudogcache[n-1]
	pp.sudogcache[n-1] = nil
	pp.sudogcache = pp.sudogcache[:n-1]
	if s.elem != nil {
		throw("acquireSudog: found s.elem != nil in cache")
	}
	releasem(mp)
	return s
}

//go:nosplit
func releaseSudog(s *sudog) {
	if s.elem != nil {
		throw("runtime: sudog with non-nil elem")
	}
	if s.isSelect {
		throw("runtime: sudog with non-false isSelect")
	}
	if s.next != nil {
		throw("runtime: sudog with non-nil next")
	}
	if s.prev != nil {
		throw("runtime: sudog with non-nil prev")
	}
	if s.waitlink != nil {
		throw("runtime: sudog with non-nil waitlink")
	}
	if s.c != nil {
		throw("runtime: sudog with non-nil c")
	}
	gp := getg()
	if gp.param != nil {
		throw("runtime: releaseSudog with non-nil gp.param")
	}
	mp := acquirem() // avoid rescheduling to another P
	pp := mp.p.ptr()
	if len(pp.sudogcache) == cap(pp.sudogcache) {
		// Transfer half of local cache to the central cache.
		var first, last *sudog
		for len(pp.sudogcache) > cap(pp.sudogcache)/2 {
			n := len(pp.sudogcache)
			p := pp.sudogcache[n-1]
			pp.sudogcache[n-1] = nil
			pp.sudogcache = pp.sudogcache[:n-1]
			if first == nil {
				first = p
			} else {
				last.next = p
			}
			last = p
		}
		lock(&sched.sudoglock)
		last.next = sched.sudogcache
		sched.sudogcache = first
		unlock(&sched.sudoglock)
	}
	pp.sudogcache = append(pp.sudogcache, s)
	releasem(mp)
}

// called from assembly.
func badmcall(fn func(*g)) {
	throw("runtime: mcall called on m->g0 stack")
}

func badmcall2(fn func(*g)) {
	throw("runtime: mcall function returned")
}

func badreflectcall() {
	panic(plainError("arg size to reflect.call more than 1GB"))
}

//go:nosplit
//go:nowritebarrierrec
func badmorestackg0() {
	if !crashStackImplemented {
		writeErrStr("fatal: morestack on g0\n")
		return
	}

	g := getg()
	switchToCrashStack(func() {
		print("runtime: morestack on g0, stack [", hex(g.stack.lo), " ", hex(g.stack.hi), "], sp=", hex(g.sched.sp), ", called from\n")
		g.m.traceback = 2 // include pc and sp in stack trace
		traceback1(g.sched.pc, g.sched.sp, g.sched.lr, g, 0)
		print("\n")

		throw("morestack on g0")
	})
}

//go:nosplit
//go:nowritebarrierrec
func badmorestackgsignal() {
	writeErrStr("fatal: morestack on gsignal\n")
}

//go:nosplit
func badctxt() {
	throw("ctxt != 0")
}

// gcrash is a fake g that can be used when crashing due to bad
// stack conditions.
var gcrash g

var crashingG atomic.Pointer[g]

// Switch to crashstack and call fn, with special handling of
// concurrent and recursive cases.
//
// Nosplit as it is called in a bad stack condition (we know
// morestack would fail).
//
//go:nosplit
//go:nowritebarrierrec
func switchToCrashStack(fn func()) {
	me := getg()
	if crashingG.CompareAndSwapNoWB(nil, me) {
		switchToCrashStack0(fn) // should never return
		abort()
	}
	if crashingG.Load() == me {
		// recursive crashing. too bad.
		writeErrStr("fatal: recursive switchToCrashStack\n")
		abort()
	}
	// Another g is crashing. Give it some time, hopefully it will finish traceback.
	usleep_no_g(100)
	writeErrStr("fatal: concurrent switchToCrashStack\n")
	abort()
}

// Disable crash stack on Windows for now. Apparently, throwing an exception
// on a non-system-allocated crash stack causes EXCEPTION_STACK_OVERFLOW and
// hangs the process (see issue 63938).
const crashStackImplemented = GOOS != "windows"

//go:noescape
func switchToCrashStack0(fn func()) // in assembly

func lockedOSThread() bool {
	gp := getg()
	return gp.lockedm != 0 && gp.m.lockedg != 0
}

var (
	// allgs contains all Gs ever created (including dead Gs), and thus
	// never shrinks.
	//
	// Access via the slice is protected by allglock or stop-the-world.
	// Readers that cannot take the lock may (carefully!) use the atomic
	// variables below.
	allglock mutex
	allgs    []*g

	// allglen and allgptr are atomic variables that contain len(allgs) and
	// &allgs[0] respectively. Proper ordering depends on totally-ordered
	// loads and stores. Writes are protected by allglock.
	//
	// allgptr is updated before allglen. Readers should read allglen
	// before allgptr to ensure that allglen is always <= len(allgptr). New
	// Gs appended during the race can be missed. For a consistent view of
	// all Gs, allglock must be held.
	//
	// allgptr copies should always be stored as a concrete type or
	// unsafe.Pointer, not uintptr, to ensure that GC can still reach it
	// even if it points to a stale array.
	allglen uintptr
	allgptr **g
)

func allgadd(gp *g) {
	if readgstatus(gp) == _Gidle {
		throw("allgadd: bad status Gidle")
	}

	lock(&allglock)
	allgs = append(allgs, gp)
	if &allgs[0] != allgptr {
		atomicstorep(unsafe.Pointer(&allgptr), unsafe.Pointer(&allgs[0]))
	}
	atomic.Storeuintptr(&allglen, uintptr(len(allgs)))
	unlock(&allglock)
}

// allGsSnapshot returns a snapshot of the slice of all Gs.
//
// The world must be stopped or allglock must be held.
func allGsSnapshot() []*g {
	assertWorldStoppedOrLockHeld(&allglock)

	// Because the world is stopped or allglock is held, allgadd
	// cannot happen concurrently with this. allgs grows
	// monotonically and existing entries never change, so we can
	// simply return a copy of the slice header. For added safety,
	// we trim everything past len because that can still change.
	return allgs[:len(allgs):len(allgs)]
}

// atomicAllG returns &allgs[0] and len(allgs) for use with atomicAllGIndex.
func atomicAllG() (**g, uintptr) {
	length := atomic.Loaduintptr(&allglen)
	ptr := (**g)(atomic.Loadp(unsafe.Pointer(&allgptr)))
	return ptr, length
}

// atomicAllGIndex returns ptr[i] with the allgptr returned from atomicAllG.
func atomicAllGIndex(ptr **g, i uintptr) *g {
	return *(**g)(add(unsafe.Pointer(ptr), i*goarch.PtrSize))
}

// forEachG calls fn on every G from allgs.
//
// forEachG takes a lock to exclude concurrent addition of new Gs.
func forEachG(fn func(gp *g)) {
	lock(&allglock)
	for _, gp := range allgs {
		fn(gp)
	}
	unlock(&allglock)
}

// forEachGRace calls fn on every G from allgs.
//
// forEachGRace avoids locking, but does not exclude addition of new Gs during
// execution, which may be missed.
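//
// An illustrative (hypothetical) use, e.g. from a diagnostic path that must
// not take locks:
//
//	forEachGRace(func(gp *g) {
//		print("goid ", gp.goid, "\n")
//	})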
func forEachGRace(fn func(gp *g)) {
	ptr, length := atomicAllG()
	for i := uintptr(0); i < length; i++ {
		gp := atomicAllGIndex(ptr, i)
		fn(gp)
	}
	return
}

const (
	// Number of goroutine ids to grab from sched.goidgen to local per-P cache at once.
	// 16 seems to provide enough amortization, but other than that it's a mostly arbitrary number.
	_GoidCacheBatch = 16
)

// cpuinit sets up CPU feature flags and calls internal/cpu.Initialize. env should be the complete
// value of the GODEBUG environment variable.
func cpuinit(env string) {
	switch GOOS {
	case "aix", "darwin", "ios", "dragonfly", "freebsd", "netbsd", "openbsd", "illumos", "solaris", "linux":
		cpu.DebugOptions = true
	}
	cpu.Initialize(env)

	// CPU feature support variables are used in code generated by the compiler
	// to guard execution of instructions that cannot be assumed to be always supported.
	switch GOARCH {
	case "386", "amd64":
		x86HasPOPCNT = cpu.X86.HasPOPCNT
		x86HasSSE41 = cpu.X86.HasSSE41
		x86HasFMA = cpu.X86.HasFMA

	case "arm":
		armHasVFPv4 = cpu.ARM.HasVFPv4

	case "arm64":
		arm64HasATOMICS = cpu.ARM64.HasATOMICS
	}
}

// getGodebugEarly extracts the environment variable GODEBUG from the environment on
// Unix-like operating systems and returns it. This function exists to extract GODEBUG
// early before much of the runtime is initialized.
func getGodebugEarly() string {
	const prefix = "GODEBUG="
	var env string
	switch GOOS {
	case "aix", "darwin", "ios", "dragonfly", "freebsd", "netbsd", "openbsd", "illumos", "solaris", "linux":
		// Similar to goenv_unix but extracts the environment value for
		// GODEBUG directly.
		// TODO(moehrmann): remove when general goenvs() can be called before cpuinit()
		n := int32(0)
		for argv_index(argv, argc+1+n) != nil {
			n++
		}

		for i := int32(0); i < n; i++ {
			p := argv_index(argv, argc+1+i)
			s := unsafe.String(p, findnull(p))

			if stringslite.HasPrefix(s, prefix) {
				env = gostring(p)[len(prefix):]
				break
			}
		}
	}
	return env
}

// The bootstrap sequence is:
//
//	call osinit
//	call schedinit
//	make & queue new G
//	call runtime·mstart
//
// The new G calls runtime·main.
func schedinit() {
	lockInit(&sched.lock, lockRankSched)
	lockInit(&sched.sysmonlock, lockRankSysmon)
	lockInit(&sched.deferlock, lockRankDefer)
	lockInit(&sched.sudoglock, lockRankSudog)
	lockInit(&deadlock, lockRankDeadlock)
	lockInit(&paniclk, lockRankPanic)
	lockInit(&allglock, lockRankAllg)
	lockInit(&allpLock, lockRankAllp)
	lockInit(&reflectOffs.lock, lockRankReflectOffs)
	lockInit(&finlock, lockRankFin)
	lockInit(&cpuprof.lock, lockRankCpuprof)
	allocmLock.init(lockRankAllocmR, lockRankAllocmRInternal, lockRankAllocmW)
	execLock.init(lockRankExecR, lockRankExecRInternal, lockRankExecW)
	traceLockInit()
	// Enforce that this lock is always a leaf lock.
	// All of this lock's critical sections should be
	// extremely short.
	lockInit(&memstats.heapStats.noPLock, lockRankLeafRank)

	// raceinit must be the first call to race detector.
	// In particular, it must be done before mallocinit below calls racemapshadow.
	gp := getg()
	if raceenabled {
		gp.racectx, raceprocctx0 = raceinit()
	}

	sched.maxmcount = 10000
	crashFD.Store(^uintptr(0))

	// The world starts stopped.
	worldStopped()

	ticks.init() // run as early as possible
	moduledataverify()
	stackinit()
	mallocinit()
	godebug := getGodebugEarly()
	cpuinit(godebug) // must run before alginit
	randinit()       // must run before alginit, mcommoninit
	alginit()        // maps, hash, rand must not be used before this call
	mcommoninit(gp.m, -1)
	modulesinit()   // provides activeModules
	typelinksinit() // uses maps, activeModules
	itabsinit()     // uses activeModules
	stkobjinit()    // must run before GC starts

	sigsave(&gp.m.sigmask)
	initSigmask = gp.m.sigmask

	goargs()
	goenvs()
	secure()
	checkfds()
	parsedebugvars()
	gcinit()

	// Allocate stack space that can be used when crashing due to bad stack
	// conditions, e.g. morestack on g0.
	gcrash.stack = stackalloc(16384)
	gcrash.stackguard0 = gcrash.stack.lo + 1000
	gcrash.stackguard1 = gcrash.stack.lo + 1000

	// if disableMemoryProfiling is set, update MemProfileRate to 0 to turn off memprofile.
	// Note: parsedebugvars may update MemProfileRate, but when disableMemoryProfiling is
	// set to true by the linker, it means that nothing is consuming the profile, so it is
	// safe to set MemProfileRate to 0.
	if disableMemoryProfiling {
		MemProfileRate = 0
	}

	// mcommoninit runs before parsedebugvars, so init profstacks again.
	mProfStackInit(gp.m)

	lock(&sched.lock)
	sched.lastpoll.Store(nanotime())
	procs := ncpu
	if n, ok := atoi32(gogetenv("GOMAXPROCS")); ok && n > 0 {
		procs = n
	}
	if procresize(procs) != nil {
		throw("unknown runnable goroutine during bootstrap")
	}
	unlock(&sched.lock)

	// World is effectively started now, as P's can run.
	worldStarted()

	if buildVersion == "" {
		// Condition should never trigger. This code just serves
		// to ensure runtime·buildVersion is kept in the resulting binary.
		buildVersion = "unknown"
	}
	if len(modinfo) == 1 {
		// Condition should never trigger. This code just serves
		// to ensure runtime·modinfo is kept in the resulting binary.
		modinfo = ""
	}
}

func dumpgstatus(gp *g) {
	thisg := getg()
	print("runtime: gp: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n")
	print("runtime: getg: g=", thisg, ", goid=", thisg.goid, ", g->atomicstatus=", readgstatus(thisg), "\n")
}

// sched.lock must be held.
func checkmcount() {
	assertLockHeld(&sched.lock)

	// Exclude extra M's, which are used for cgocallback from threads
	// created in C.
	//
	// The purpose of the SetMaxThreads limit is to avoid accidental fork
	// bomb from something like millions of goroutines blocking on system
	// calls, causing the runtime to create millions of threads. By
	// definition, this isn't a problem for threads created in C, so we
	// exclude them from the limit. See https://go.dev/issue/60004.
	count := mcount() - int32(extraMInUse.Load()) - int32(extraMLength.Load())
	if count > sched.maxmcount {
		print("runtime: program exceeds ", sched.maxmcount, "-thread limit\n")
		throw("thread exhaustion")
	}
}

// mReserveID returns the next ID to use for a new m. This new m is immediately
// considered 'running' by checkdead.
//
// sched.lock must be held.
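//
// Illustrative call pattern (a simplified sketch of what mcommoninit below does):
//
//	lock(&sched.lock)
//	id := mReserveID()
//	unlock(&sched.lock)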
func mReserveID() int64 {
	assertLockHeld(&sched.lock)

	if sched.mnext+1 < sched.mnext {
		throw("runtime: thread ID overflow")
	}
	id := sched.mnext
	sched.mnext++
	checkmcount()
	return id
}

// Pre-allocated ID may be passed as 'id', or omitted by passing -1.
func mcommoninit(mp *m, id int64) {
	gp := getg()

	// g0 stack won't make sense for user (and is not necessarily unwindable).
	if gp != gp.m.g0 {
		callers(1, mp.createstack[:])
	}

	lock(&sched.lock)

	if id >= 0 {
		mp.id = id
	} else {
		mp.id = mReserveID()
	}

	mrandinit(mp)

	mpreinit(mp)
	if mp.gsignal != nil {
		mp.gsignal.stackguard1 = mp.gsignal.stack.lo + stackGuard
	}

	// Add to allm so garbage collector doesn't free g->m
	// when it is just in a register or thread-local storage.
	mp.alllink = allm

	// NumCgoCall() and others iterate over allm w/o schedlock,
	// so we need to publish it safely.
	atomicstorep(unsafe.Pointer(&allm), unsafe.Pointer(mp))
	unlock(&sched.lock)

	// Allocate memory to hold a cgo traceback if the cgo call crashes.
	if iscgo || GOOS == "solaris" || GOOS == "illumos" || GOOS == "windows" {
		mp.cgoCallers = new(cgoCallers)
	}
	mProfStackInit(mp)
}

// mProfStackInit is used to eagerly initialize stack trace buffers for
// profiling. Lazy allocation would have to deal with reentrancy issues in
// malloc and runtime locks for mLockProfile.
// TODO(mknyszek): Implement lazy allocation if this becomes a problem.
func mProfStackInit(mp *m) {
	if debug.profstackdepth == 0 {
		// debug.profstack is set to 0 by the user, or we're being called from
		// schedinit before parsedebugvars.
		return
	}
	mp.profStack = makeProfStackFP()
	mp.mLockProfile.stack = makeProfStackFP()
}

// makeProfStackFP creates a buffer large enough to hold a maximum-sized stack
// trace as well as any additional frames needed for frame pointer unwinding
// with delayed inline expansion.
func makeProfStackFP() []uintptr {
	// The "1" term is to account for the first stack entry being
	// taken up by a "skip" sentinel value for profilers which
	// defer inline frame expansion until the profile is reported.
	// The "maxSkip" term is for frame pointer unwinding, where we
	// want to end up with debug.profstackdepth frames but will discard
	// some "physical" frames to account for skipping.
	return make([]uintptr, 1+maxSkip+debug.profstackdepth)
}

// makeProfStack returns a buffer large enough to hold a maximum-sized stack
// trace.
func makeProfStack() []uintptr { return make([]uintptr, debug.profstackdepth) }

//go:linkname pprof_makeProfStack
func pprof_makeProfStack() []uintptr { return makeProfStack() }

func (mp *m) becomeSpinning() {
	mp.spinning = true
	sched.nmspinning.Add(1)
	sched.needspinning.Store(0)
}

func (mp *m) hasCgoOnStack() bool {
	return mp.ncgo > 0 || mp.isextra
}

const (
	// osHasLowResTimer indicates that the platform's internal timer system has a low resolution,
	// typically on the order of 1 ms or more.
	osHasLowResTimer = GOOS == "windows" || GOOS == "openbsd" || GOOS == "netbsd"

	// osHasLowResClockInt is osHasLowResClock but in integer form, so it can be used to create
	// constants conditionally.
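	//
	// For example, a hypothetical (illustrative only, not defined in the runtime)
	// constant could scale a wait threshold by clock resolution:
	//
	//	const minWait = (1-osHasLowResClockInt)*100e3 + osHasLowResClockInt*2e6 // nanoseconds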
	osHasLowResClockInt = goos.IsWindows

	// osHasLowResClock indicates that timestamps produced by nanotime on the platform have a
	// low resolution, typically on the order of 1 ms or more.
	osHasLowResClock = osHasLowResClockInt > 0
)

// Mark gp ready to run.
func ready(gp *g, traceskip int, next bool) {
	status := readgstatus(gp)

	// Mark runnable.
	mp := acquirem() // disable preemption because it can be holding p in a local var
	if status&^_Gscan != _Gwaiting {
		dumpgstatus(gp)
		throw("bad g->status in ready")
	}

	// status is Gwaiting or Gscanwaiting, make Grunnable and put on runq
	trace := traceAcquire()
	casgstatus(gp, _Gwaiting, _Grunnable)
	if trace.ok() {
		trace.GoUnpark(gp, traceskip)
		traceRelease(trace)
	}
	runqput(mp.p.ptr(), gp, next)
	wakep()
	releasem(mp)
}

// freezeStopWait is a large value that freezetheworld sets
// sched.stopwait to in order to request that all Gs permanently stop.
const freezeStopWait = 0x7fffffff

// freezing is set to non-zero if the runtime is trying to freeze the
// world.
var freezing atomic.Bool

// Similar to stopTheWorld but best-effort and can be called several times.
// There is no reverse operation; it is used during crashing.
// This function must not lock any mutexes.
func freezetheworld() {
	freezing.Store(true)
	if debug.dontfreezetheworld > 0 {
		// Don't preempt Ps to stop goroutines. That will perturb
		// scheduler state, making debugging more difficult. Instead,
		// allow goroutines to continue execution.
		//
		// fatalpanic will tracebackothers to trace all goroutines. It
		// is unsafe to trace a running goroutine, so tracebackothers
		// will skip running goroutines. That is OK and expected, we
		// expect users of dontfreezetheworld to use core files anyway.
		//
		// However, allowing the scheduler to continue running freely
		// introduces a race: a goroutine may be stopped when
		// tracebackothers checks its status, and then start running
		// later when we are in the middle of traceback, potentially
		// causing a crash.
		//
		// To mitigate this, when an M naturally enters the scheduler,
		// schedule checks if freezing is set and if so stops
		// execution. This guarantees that while Gs can transition from
		// running to stopped, they can never transition from stopped
		// to running.
		//
		// The sleep here allows racing Ms that missed freezing and are
		// about to run a G to complete the transition to running
		// before we start traceback.
		usleep(1000)
		return
	}

	// stopwait and preemption requests can be lost
	// due to races with concurrently executing threads,
	// so try several times
	for i := 0; i < 5; i++ {
		// this should tell the scheduler to not start any new goroutines
		sched.stopwait = freezeStopWait
		sched.gcwaiting.Store(true)
		// this should stop running goroutines
		if !preemptall() {
			break // no running goroutines
		}
		usleep(1000)
	}
	// to be sure
	usleep(1000)
	preemptall()
	usleep(1000)
}

// All reads and writes of g's status go through readgstatus, casgstatus,
// castogscanstatus, casfrom_Gscanstatus.
//
//go:nosplit
func readgstatus(gp *g) uint32 {
	return gp.atomicstatus.Load()
}

// The Gscanstatuses are acting like locks and this releases them.
// If it proves to be a performance hit we should be able to make these
// simple atomic stores but for now we are going to throw if
// we see an inconsistent state.
func casfrom_Gscanstatus(gp *g, oldval, newval uint32) {
	success := false

	// Check that transition is valid.
	switch oldval {
	default:
		print("runtime: casfrom_Gscanstatus bad oldval gp=", gp, ", oldval=", hex(oldval), ", newval=", hex(newval), "\n")
		dumpgstatus(gp)
		throw("casfrom_Gscanstatus:top gp->status is not in scan state")
	case _Gscanrunnable,
		_Gscanwaiting,
		_Gscanrunning,
		_Gscansyscall,
		_Gscanpreempted:
		if newval == oldval&^_Gscan {
			success = gp.atomicstatus.CompareAndSwap(oldval, newval)
		}
	}
	if !success {
		print("runtime: casfrom_Gscanstatus failed gp=", gp, ", oldval=", hex(oldval), ", newval=", hex(newval), "\n")
		dumpgstatus(gp)
		throw("casfrom_Gscanstatus: gp->status is not in scan state")
	}
	releaseLockRankAndM(lockRankGscan)
}

// This will return false if the gp is not in the expected status and the cas fails.
// This acts like a lock acquire while the casfromgstatus acts like a lock release.
func castogscanstatus(gp *g, oldval, newval uint32) bool {
	switch oldval {
	case _Grunnable,
		_Grunning,
		_Gwaiting,
		_Gsyscall:
		if newval == oldval|_Gscan {
			r := gp.atomicstatus.CompareAndSwap(oldval, newval)
			if r {
				acquireLockRankAndM(lockRankGscan)
			}
			return r

		}
	}
	print("runtime: castogscanstatus oldval=", hex(oldval), " newval=", hex(newval), "\n")
	throw("castogscanstatus")
	panic("not reached")
}

// casgstatusAlwaysTrack is a debug flag that causes casgstatus to always track
// various latencies on every transition instead of sampling them.
var casgstatusAlwaysTrack = false

// If asked to move to or from a Gscanstatus this will throw. Use the castogscanstatus
// and casfrom_Gscanstatus instead.
// casgstatus will loop if the g->atomicstatus is in a Gscan status until the routine that
// put it in the Gscan state is finished.
//
//go:nosplit
func casgstatus(gp *g, oldval, newval uint32) {
	if (oldval&_Gscan != 0) || (newval&_Gscan != 0) || oldval == newval {
		systemstack(func() {
			// Call on the systemstack to prevent print and throw from counting
			// against the nosplit stack reservation.
			print("runtime: casgstatus: oldval=", hex(oldval), " newval=", hex(newval), "\n")
			throw("casgstatus: bad incoming values")
		})
	}

	lockWithRankMayAcquire(nil, lockRankGscan)

	// See https://golang.org/cl/21503 for justification of the yield delay.
	const yieldDelay = 5 * 1000
	var nextYield int64

	// loop if gp->atomicstatus is in a scan state giving
	// GC time to finish and change the state to oldval.
	for i := 0; !gp.atomicstatus.CompareAndSwap(oldval, newval); i++ {
		if oldval == _Gwaiting && gp.atomicstatus.Load() == _Grunnable {
			systemstack(func() {
				// Call on the systemstack to prevent throw from counting
				// against the nosplit stack reservation.
				throw("casgstatus: waiting for Gwaiting but is Grunnable")
			})
		}
		if i == 0 {
			nextYield = nanotime() + yieldDelay
		}
		if nanotime() < nextYield {
			for x := 0; x < 10 && gp.atomicstatus.Load() != oldval; x++ {
				procyield(1)
			}
		} else {
			osyield()
			nextYield = nanotime() + yieldDelay/2
		}
	}

	if oldval == _Grunning {
		// Track every gTrackingPeriod time a goroutine transitions out of running.
		if casgstatusAlwaysTrack || gp.trackingSeq%gTrackingPeriod == 0 {
			gp.tracking = true
		}
		gp.trackingSeq++
	}
	if !gp.tracking {
		return
	}

	// Handle various kinds of tracking.
	//
	// Currently:
	// - Time spent in runnable.
	// - Time spent blocked on a sync.Mutex or sync.RWMutex.
	switch oldval {
	case _Grunnable:
		// We transitioned out of runnable, so measure how much
		// time we spent in this state and add it to
		// runnableTime.
		now := nanotime()
		gp.runnableTime += now - gp.trackingStamp
		gp.trackingStamp = 0
	case _Gwaiting:
		if !gp.waitreason.isMutexWait() {
			// Not blocking on a lock.
			break
		}
		// Blocking on a lock, measure it. Note that because we're
		// sampling, we have to multiply by our sampling period to get
		// a more representative estimate of the absolute value.
		// gTrackingPeriod also represents an accurate sampling period
		// because we can only enter this state from _Grunning.
		now := nanotime()
		sched.totalMutexWaitTime.Add((now - gp.trackingStamp) * gTrackingPeriod)
		gp.trackingStamp = 0
	}
	switch newval {
	case _Gwaiting:
		if !gp.waitreason.isMutexWait() {
			// Not blocking on a lock.
			break
		}
		// Blocking on a lock. Write down the timestamp.
		now := nanotime()
		gp.trackingStamp = now
	case _Grunnable:
		// We just transitioned into runnable, so record what
		// time that happened.
		now := nanotime()
		gp.trackingStamp = now
	case _Grunning:
		// We're transitioning into running, so turn off
		// tracking and record how much time we spent in
		// runnable.
		gp.tracking = false
		sched.timeToRun.record(gp.runnableTime)
		gp.runnableTime = 0
	}
}

// casGToWaiting transitions gp from old to _Gwaiting, and sets the wait reason.
//
// Use this over casgstatus when possible to ensure that a waitreason is set.
func casGToWaiting(gp *g, old uint32, reason waitReason) {
	// Set the wait reason before calling casgstatus, because casgstatus will use it.
	gp.waitreason = reason
	casgstatus(gp, old, _Gwaiting)
}

// casGToWaitingForGC transitions gp from old to _Gwaiting, and sets the wait reason.
// The wait reason must be a valid isWaitingForGC wait reason.
//
// Use this over casgstatus when possible to ensure that a waitreason is set.
func casGToWaitingForGC(gp *g, old uint32, reason waitReason) {
	if !reason.isWaitingForGC() {
		throw("casGToWaitingForGC with non-isWaitingForGC wait reason")
	}
	casGToWaiting(gp, old, reason)
}

// casgstatus(gp, oldstatus, Gcopystack), assuming oldstatus is Gwaiting or Grunnable.
// Returns old status. Cannot call casgstatus directly, because we are racing with an
// async wakeup that might come in from netpoll. If we see Gwaiting from the readgstatus,
// it might have become Grunnable by the time we get to the cas. If we called casgstatus,
// it would loop waiting for the status to go back to Gwaiting, which it never will.
//
//go:nosplit
func casgcopystack(gp *g) uint32 {
	for {
		oldstatus := readgstatus(gp) &^ _Gscan
		if oldstatus != _Gwaiting && oldstatus != _Grunnable {
			throw("copystack: bad status, not Gwaiting or Grunnable")
		}
		if gp.atomicstatus.CompareAndSwap(oldstatus, _Gcopystack) {
			return oldstatus
		}
	}
}

// casGToPreemptScan transitions gp from _Grunning to _Gscan|_Gpreempted.
//
// TODO(austin): This is the only status operation that both changes
// the status and locks the _Gscan bit. Rethink this.
func casGToPreemptScan(gp *g, old, new uint32) {
	if old != _Grunning || new != _Gscan|_Gpreempted {
		throw("bad g transition")
	}
	acquireLockRankAndM(lockRankGscan)
	for !gp.atomicstatus.CompareAndSwap(_Grunning, _Gscan|_Gpreempted) {
	}
}

// casGFromPreempted attempts to transition gp from _Gpreempted to
// _Gwaiting. If successful, the caller is responsible for
// re-scheduling gp.
func casGFromPreempted(gp *g, old, new uint32) bool {
	if old != _Gpreempted || new != _Gwaiting {
		throw("bad g transition")
	}
	gp.waitreason = waitReasonPreempted
	return gp.atomicstatus.CompareAndSwap(_Gpreempted, _Gwaiting)
}

// stwReason is an enumeration of reasons the world is stopping.
type stwReason uint8

// Reasons to stop-the-world.
//
// Avoid reusing reasons and add new ones instead.
const (
	stwUnknown                     stwReason = iota // "unknown"
	stwGCMarkTerm                                   // "GC mark termination"
	stwGCSweepTerm                                  // "GC sweep termination"
	stwWriteHeapDump                                // "write heap dump"
	stwGoroutineProfile                             // "goroutine profile"
	stwGoroutineProfileCleanup                      // "goroutine profile cleanup"
	stwAllGoroutinesStack                           // "all goroutines stack trace"
	stwReadMemStats                                 // "read mem stats"
	stwAllThreadsSyscall                            // "AllThreadsSyscall"
	stwGOMAXPROCS                                   // "GOMAXPROCS"
	stwStartTrace                                   // "start trace"
	stwStopTrace                                    // "stop trace"
	stwForTestCountPagesInUse                       // "CountPagesInUse (test)"
	stwForTestReadMetricsSlow                       // "ReadMetricsSlow (test)"
	stwForTestReadMemStatsSlow                      // "ReadMemStatsSlow (test)"
	stwForTestPageCachePagesLeaked                  // "PageCachePagesLeaked (test)"
	stwForTestResetDebugLog                         // "ResetDebugLog (test)"
)

func (r stwReason) String() string {
	return stwReasonStrings[r]
}

func (r stwReason) isGC() bool {
	return r == stwGCMarkTerm || r == stwGCSweepTerm
}

// If you add to this list, also add it to src/internal/trace/parser.go.
// If you change the values of any of the stw* constants, bump the trace
// version number and make a copy of this.
var stwReasonStrings = [...]string{
	stwUnknown:                     "unknown",
	stwGCMarkTerm:                  "GC mark termination",
	stwGCSweepTerm:                 "GC sweep termination",
	stwWriteHeapDump:               "write heap dump",
	stwGoroutineProfile:            "goroutine profile",
	stwGoroutineProfileCleanup:     "goroutine profile cleanup",
	stwAllGoroutinesStack:          "all goroutines stack trace",
	stwReadMemStats:                "read mem stats",
	stwAllThreadsSyscall:           "AllThreadsSyscall",
	stwGOMAXPROCS:                  "GOMAXPROCS",
	stwStartTrace:                  "start trace",
	stwStopTrace:                   "stop trace",
	stwForTestCountPagesInUse:      "CountPagesInUse (test)",
	stwForTestReadMetricsSlow:      "ReadMetricsSlow (test)",
	stwForTestReadMemStatsSlow:     "ReadMemStatsSlow (test)",
	stwForTestPageCachePagesLeaked: "PageCachePagesLeaked (test)",
	stwForTestResetDebugLog:        "ResetDebugLog (test)",
}

// worldStop provides context from the stop-the-world required by the
// start-the-world.
type worldStop struct {
	reason           stwReason
	startedStopping  int64
	finishedStopping int64
	stoppingCPUTime  int64
}

// Temporary variable for stopTheWorld, when it can't write to the stack.
//
// Protected by worldsema.
var stopTheWorldContext worldStop

// stopTheWorld stops all P's from executing goroutines, interrupting
// all goroutines at GC safe points and records reason as the reason
// for the stop. On return, only the current goroutine's P is running.
// stopTheWorld must not be called from a system stack and the caller
// must not hold worldsema. The caller must call startTheWorld when
// other P's should resume execution.
//
// stopTheWorld is safe for multiple goroutines to call at the
// same time. Each will execute its own stop, and the stops will
// be serialized.
//
// This is also used by routines that do stack dumps. If the system is
// in panic or being exited, this may not reliably stop all
// goroutines.
//
// Returns the STW context. When starting the world, this context must be
// passed to startTheWorld.
func stopTheWorld(reason stwReason) worldStop {
	semacquire(&worldsema)
	gp := getg()
	gp.m.preemptoff = reason.String()
	systemstack(func() {
		// Mark the goroutine which called stopTheWorld preemptible so its
		// stack may be scanned.
		// This lets a mark worker scan us while we try to stop the world
		// since otherwise we could get in a mutual preemption deadlock.
		// We must not modify anything on the G stack because a stack shrink
		// may occur. A stack shrink is otherwise OK though because in order
		// to return from this function (and to leave the system stack) we
		// must have preempted all goroutines, including any attempting
		// to scan our stack, in which case, any stack shrinking will
		// have already completed by the time we exit.
		//
		// N.B. The execution tracer is not aware of this status
		// transition and handles it specially based on the
		// wait reason.
		casGToWaitingForGC(gp, _Grunning, waitReasonStoppingTheWorld)
		stopTheWorldContext = stopTheWorldWithSema(reason) // avoid write to stack
		casgstatus(gp, _Gwaiting, _Grunning)
	})
	return stopTheWorldContext
}

// startTheWorld undoes the effects of stopTheWorld.
//
// w must be the worldStop returned by stopTheWorld.
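//
// Illustrative pairing (a simplified sketch, not a specific caller):
//
//	stw := stopTheWorld(stwReadMemStats)
//	// ... inspect state while the world is stopped ...
//	startTheWorld(stw)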
func startTheWorld(w worldStop) {
	systemstack(func() { startTheWorldWithSema(0, w) })

	// worldsema must be held over startTheWorldWithSema to ensure
	// gomaxprocs cannot change while worldsema is held.
	//
	// Release worldsema with direct handoff to the next waiter, but
	// acquirem so that semrelease1 doesn't try to yield our time.
	//
	// Otherwise if e.g. ReadMemStats is being called in a loop,
	// it might stomp on other attempts to stop the world, such as
	// for starting or ending GC. The operation this blocks is
	// so heavy-weight that we should just try to be as fair as
	// possible here.
	//
	// We don't want to just allow us to get preempted between now
	// and releasing the semaphore because then we keep everyone
	// (including, for example, GCs) waiting longer.
	mp := acquirem()
	mp.preemptoff = ""
	semrelease1(&worldsema, true, 0)
	releasem(mp)
}

// stopTheWorldGC has the same effect as stopTheWorld, but blocks
// until the GC is not running. It also blocks a GC from starting
// until startTheWorldGC is called.
func stopTheWorldGC(reason stwReason) worldStop {
	semacquire(&gcsema)
	return stopTheWorld(reason)
}

// startTheWorldGC undoes the effects of stopTheWorldGC.
//
// w must be the worldStop returned by stopTheWorld.
func startTheWorldGC(w worldStop) {
	startTheWorld(w)
	semrelease(&gcsema)
}

// Holding worldsema grants an M the right to try to stop the world.
var worldsema uint32 = 1

// Holding gcsema grants the M the right to block a GC, and blocks
// until the current GC is done. In particular, it prevents gomaxprocs
// from changing concurrently.
//
// TODO(mknyszek): Once gomaxprocs and the execution tracer can handle
// being changed/enabled during a GC, remove this.
var gcsema uint32 = 1

// stopTheWorldWithSema is the core implementation of stopTheWorld.
// The caller is responsible for acquiring worldsema and disabling
// preemption first and then should call stopTheWorldWithSema on the
// system stack:
//
//	semacquire(&worldsema, 0)
//	m.preemptoff = "reason"
//	var stw worldStop
//	systemstack(func() {
//		stw = stopTheWorldWithSema(reason)
//	})
//
// When finished, the caller must either call startTheWorld or undo
// these three operations separately:
//
//	m.preemptoff = ""
//	systemstack(func() {
//		now = startTheWorldWithSema(stw)
//	})
//	semrelease(&worldsema)
//
// It is allowed to acquire worldsema once and then execute multiple
// startTheWorldWithSema/stopTheWorldWithSema pairs.
// Other P's are able to execute between successive calls to
// startTheWorldWithSema and stopTheWorldWithSema.
// Holding worldsema causes any other goroutines invoking
// stopTheWorld to block.
//
// Returns the STW context. When starting the world, this context must be
// passed to startTheWorldWithSema.
func stopTheWorldWithSema(reason stwReason) worldStop {
	trace := traceAcquire()
	if trace.ok() {
		trace.STWStart(reason)
		traceRelease(trace)
	}
	gp := getg()

	// If we hold a lock, then we won't be able to stop another M
	// that is blocked trying to acquire the lock.
	if gp.m.locks > 0 {
		throw("stopTheWorld: holding locks")
	}

	lock(&sched.lock)
	start := nanotime() // exclude time waiting for sched.lock from start and total time metrics.
	sched.stopwait = gomaxprocs
	sched.gcwaiting.Store(true)
	preemptall()
	// stop current P
	gp.m.p.ptr().status = _Pgcstop // Pgcstop is only diagnostic.
	gp.m.p.ptr().gcStopTime = start
	sched.stopwait--
	// try to retake all P's in Psyscall status
	trace = traceAcquire()
	for _, pp := range allp {
		s := pp.status
		if s == _Psyscall && atomic.Cas(&pp.status, s, _Pgcstop) {
			if trace.ok() {
				trace.ProcSteal(pp, false)
			}
			pp.syscalltick++
			pp.gcStopTime = nanotime()
			sched.stopwait--
		}
	}
	if trace.ok() {
		traceRelease(trace)
	}

	// stop idle P's
	now := nanotime()
	for {
		pp, _ := pidleget(now)
		if pp == nil {
			break
		}
		pp.status = _Pgcstop
		pp.gcStopTime = nanotime()
		sched.stopwait--
	}
	wait := sched.stopwait > 0
	unlock(&sched.lock)

	// wait for remaining P's to stop voluntarily
	if wait {
		for {
			// wait for 100us, then try to re-preempt in case of any races
			if notetsleep(&sched.stopnote, 100*1000) {
				noteclear(&sched.stopnote)
				break
			}
			preemptall()
		}
	}

	finish := nanotime()
	startTime := finish - start
	if reason.isGC() {
		sched.stwStoppingTimeGC.record(startTime)
	} else {
		sched.stwStoppingTimeOther.record(startTime)
	}

	// Double-check we actually stopped everything, and all the invariants hold.
	// Also accumulate all the time spent by each P in _Pgcstop up to the point
	// where everything was stopped. This will be accumulated into the total pause
	// CPU time by the caller.
	stoppingCPUTime := int64(0)
	bad := ""
	if sched.stopwait != 0 {
		bad = "stopTheWorld: not stopped (stopwait != 0)"
	} else {
		for _, pp := range allp {
			if pp.status != _Pgcstop {
				bad = "stopTheWorld: not stopped (status != _Pgcstop)"
			}
			if pp.gcStopTime == 0 && bad == "" {
				bad = "stopTheWorld: broken CPU time accounting"
			}
			stoppingCPUTime += finish - pp.gcStopTime
			pp.gcStopTime = 0
		}
	}
	if freezing.Load() {
		// Some other thread is panicking. This can cause the
		// sanity checks above to fail if the panic happens in
		// the signal handler on a stopped thread. Either way,
		// we should halt this thread.
		lock(&deadlock)
		lock(&deadlock)
	}
	if bad != "" {
		throw(bad)
	}

	worldStopped()

	return worldStop{
		reason:           reason,
		startedStopping:  start,
		finishedStopping: finish,
		stoppingCPUTime:  stoppingCPUTime,
	}
}

// reason is the same STW reason passed to stopTheWorld. start is the start
// time returned by stopTheWorld.
//
// now is the current time; prefer to pass 0 to capture a fresh timestamp.
//
// startTheWorldWithSema returns now.
1659func startTheWorldWithSema(now int64, w worldStop) int64 { 1660 assertWorldStopped() 1661 1662 mp := acquirem() // disable preemption because it can be holding p in a local var 1663 if netpollinited() { 1664 list, delta := netpoll(0) // non-blocking 1665 injectglist(&list) 1666 netpollAdjustWaiters(delta) 1667 } 1668 lock(&sched.lock) 1669 1670 procs := gomaxprocs 1671 if newprocs != 0 { 1672 procs = newprocs 1673 newprocs = 0 1674 } 1675 p1 := procresize(procs) 1676 sched.gcwaiting.Store(false) 1677 if sched.sysmonwait.Load() { 1678 sched.sysmonwait.Store(false) 1679 notewakeup(&sched.sysmonnote) 1680 } 1681 unlock(&sched.lock) 1682 1683 worldStarted() 1684 1685 for p1 != nil { 1686 p := p1 1687 p1 = p1.link.ptr() 1688 if p.m != 0 { 1689 mp := p.m.ptr() 1690 p.m = 0 1691 if mp.nextp != 0 { 1692 throw("startTheWorld: inconsistent mp->nextp") 1693 } 1694 mp.nextp.set(p) 1695 notewakeup(&mp.park) 1696 } else { 1697 // Start M to run P. Do not start another M below. 1698 newm(nil, p, -1) 1699 } 1700 } 1701 1702 // Capture start-the-world time before doing clean-up tasks. 1703 if now == 0 { 1704 now = nanotime() 1705 } 1706 totalTime := now - w.startedStopping 1707 if w.reason.isGC() { 1708 sched.stwTotalTimeGC.record(totalTime) 1709 } else { 1710 sched.stwTotalTimeOther.record(totalTime) 1711 } 1712 trace := traceAcquire() 1713 if trace.ok() { 1714 trace.STWDone() 1715 traceRelease(trace) 1716 } 1717 1718 // Wakeup an additional proc in case we have excessive runnable goroutines 1719 // in local queues or in the global queue. If we don't, the proc will park itself. 1720 // If we have lots of excessive work, resetspinning will unpark additional procs as necessary. 1721 wakep() 1722 1723 releasem(mp) 1724 1725 return now 1726} 1727 1728// usesLibcall indicates whether this runtime performs system calls 1729// via libcall. 1730func usesLibcall() bool { 1731 switch GOOS { 1732 case "aix", "darwin", "illumos", "ios", "solaris", "windows": 1733 return true 1734 case "openbsd": 1735 return GOARCH != "mips64" 1736 } 1737 return false 1738} 1739 1740// mStackIsSystemAllocated indicates whether this runtime starts on a 1741// system-allocated stack. 1742func mStackIsSystemAllocated() bool { 1743 switch GOOS { 1744 case "aix", "darwin", "plan9", "illumos", "ios", "solaris", "windows": 1745 return true 1746 case "openbsd": 1747 return GOARCH != "mips64" 1748 } 1749 return false 1750} 1751 1752// mstart is the entry-point for new Ms. 1753// It is written in assembly, uses ABI0, is marked TOPFRAME, and calls mstart0. 1754func mstart() 1755 1756// mstart0 is the Go entry-point for new Ms. 1757// This must not split the stack because we may not even have stack 1758// bounds set up yet. 1759// 1760// May run during STW (because it doesn't have a P yet), so write 1761// barriers are not allowed. 1762// 1763//go:nosplit 1764//go:nowritebarrierrec 1765func mstart0() { 1766 gp := getg() 1767 1768 osStack := gp.stack.lo == 0 1769 if osStack { 1770 // Initialize stack bounds from system stack. 1771 // Cgo may have left stack size in stack.hi. 1772 // minit may update the stack bounds. 1773 // 1774 // Note: these bounds may not be very accurate. 1775 // We set hi to &size, but there are things above 1776 // it. The 1024 is supposed to compensate this, 1777 // but is somewhat arbitrary. 
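		//
		// For example (illustrative only): if cgo left a hypothetical
		// 64 KiB size in stack.hi, the code below computes roughly
		//
		//	size = 64 << 10
		//	hi   = uintptr(unsafe.Pointer(&size)) // near the current SP
		//	lo   = hi - size + 1024
		//
		// i.e. the recorded range is 1024 bytes smaller than size,
		// compensating for the stack already used above &size.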
1778 size := gp.stack.hi 1779 if size == 0 { 1780 size = 16384 * sys.StackGuardMultiplier 1781 } 1782 gp.stack.hi = uintptr(noescape(unsafe.Pointer(&size))) 1783 gp.stack.lo = gp.stack.hi - size + 1024 1784 } 1785 // Initialize stack guard so that we can start calling regular 1786 // Go code. 1787 gp.stackguard0 = gp.stack.lo + stackGuard 1788 // This is the g0, so we can also call go:systemstack 1789 // functions, which check stackguard1. 1790 gp.stackguard1 = gp.stackguard0 1791 mstart1() 1792 1793 // Exit this thread. 1794 if mStackIsSystemAllocated() { 1795 // Windows, Solaris, illumos, Darwin, AIX and Plan 9 always system-allocate 1796 // the stack, but put it in gp.stack before mstart, 1797 // so the logic above hasn't set osStack yet. 1798 osStack = true 1799 } 1800 mexit(osStack) 1801} 1802 1803// The go:noinline is to guarantee the getcallerpc/getcallersp below are safe, 1804// so that we can set up g0.sched to return to the call of mstart1 above. 1805// 1806//go:noinline 1807func mstart1() { 1808 gp := getg() 1809 1810 if gp != gp.m.g0 { 1811 throw("bad runtime·mstart") 1812 } 1813 1814 // Set up m.g0.sched as a label returning to just 1815 // after the mstart1 call in mstart0 above, for use by goexit0 and mcall. 1816 // We're never coming back to mstart1 after we call schedule, 1817 // so other calls can reuse the current frame. 1818 // And goexit0 does a gogo that needs to return from mstart1 1819 // and let mstart0 exit the thread. 1820 gp.sched.g = guintptr(unsafe.Pointer(gp)) 1821 gp.sched.pc = getcallerpc() 1822 gp.sched.sp = getcallersp() 1823 1824 asminit() 1825 minit() 1826 1827 // Install signal handlers; after minit so that minit can 1828 // prepare the thread to be able to handle the signals. 1829 if gp.m == &m0 { 1830 mstartm0() 1831 } 1832 1833 if fn := gp.m.mstartfn; fn != nil { 1834 fn() 1835 } 1836 1837 if gp.m != &m0 { 1838 acquirep(gp.m.nextp.ptr()) 1839 gp.m.nextp = 0 1840 } 1841 schedule() 1842} 1843 1844// mstartm0 implements part of mstart1 that only runs on the m0. 1845// 1846// Write barriers are allowed here because we know the GC can't be 1847// running yet, so they'll be no-ops. 1848// 1849//go:yeswritebarrierrec 1850func mstartm0() { 1851 // Create an extra M for callbacks on threads not created by Go. 1852 // An extra M is also needed on Windows for callbacks created by 1853 // syscall.NewCallback. See issue #6751 for details. 1854 if (iscgo || GOOS == "windows") && !cgoHasExtraM { 1855 cgoHasExtraM = true 1856 newextram() 1857 } 1858 initsig(false) 1859} 1860 1861// mPark causes a thread to park itself, returning once woken. 1862// 1863//go:nosplit 1864func mPark() { 1865 gp := getg() 1866 notesleep(&gp.m.park) 1867 noteclear(&gp.m.park) 1868} 1869 1870// mexit tears down and exits the current thread. 1871// 1872// Don't call this directly to exit the thread, since it must run at 1873// the top of the thread stack. Instead, use gogo(&gp.m.g0.sched) to 1874// unwind the stack to the point that exits the thread. 1875// 1876// It is entered with m.p != nil, so write barriers are allowed. It 1877// will release the P before exiting. 1878// 1879//go:yeswritebarrierrec 1880func mexit(osStack bool) { 1881 mp := getg().m 1882 1883 if mp == &m0 { 1884 // This is the main thread. Just wedge it. 1885 // 1886 // On Linux, exiting the main thread puts the process 1887 // into a non-waitable zombie state. On Plan 9, 1888 // exiting the main thread unblocks wait even though 1889 // other threads are still running. 
On Solaris we can 1890 // neither exitThread nor return from mstart. Other 1891 // bad things probably happen on other platforms. 1892 // 1893 // We could try to clean up this M more before wedging 1894 // it, but that complicates signal handling. 1895 handoffp(releasep()) 1896 lock(&sched.lock) 1897 sched.nmfreed++ 1898 checkdead() 1899 unlock(&sched.lock) 1900 mPark() 1901 throw("locked m0 woke up") 1902 } 1903 1904 sigblock(true) 1905 unminit() 1906 1907 // Free the gsignal stack. 1908 if mp.gsignal != nil { 1909 stackfree(mp.gsignal.stack) 1910 // On some platforms, when calling into VDSO (e.g. nanotime) 1911 // we store our g on the gsignal stack, if there is one. 1912 // Now the stack is freed, unlink it from the m, so we 1913 // won't write to it when calling VDSO code. 1914 mp.gsignal = nil 1915 } 1916 1917 // Remove m from allm. 1918 lock(&sched.lock) 1919 for pprev := &allm; *pprev != nil; pprev = &(*pprev).alllink { 1920 if *pprev == mp { 1921 *pprev = mp.alllink 1922 goto found 1923 } 1924 } 1925 throw("m not found in allm") 1926found: 1927 // Events must not be traced after this point. 1928 1929 // Delay reaping m until it's done with the stack. 1930 // 1931 // Put mp on the free list, though it will not be reaped while freeWait 1932 // is freeMWait. mp is no longer reachable via allm, so even if it is 1933 // on an OS stack, we must keep a reference to mp alive so that the GC 1934 // doesn't free mp while we are still using it. 1935 // 1936 // Note that the free list must not be linked through alllink because 1937 // some functions walk allm without locking, so may be using alllink. 1938 // 1939 // N.B. It's important that the M appears on the free list simultaneously 1940 // with it being removed so that the tracer can find it. 1941 mp.freeWait.Store(freeMWait) 1942 mp.freelink = sched.freem 1943 sched.freem = mp 1944 unlock(&sched.lock) 1945 1946 atomic.Xadd64(&ncgocall, int64(mp.ncgocall)) 1947 sched.totalRuntimeLockWaitTime.Add(mp.mLockProfile.waitTime.Load()) 1948 1949 // Release the P. 1950 handoffp(releasep()) 1951 // After this point we must not have write barriers. 1952 1953 // Invoke the deadlock detector. This must happen after 1954 // handoffp because it may have started a new M to take our 1955 // P's work. 1956 lock(&sched.lock) 1957 sched.nmfreed++ 1958 checkdead() 1959 unlock(&sched.lock) 1960 1961 if GOOS == "darwin" || GOOS == "ios" { 1962 // Make sure pendingPreemptSignals is correct when an M exits. 1963 // For #41702. 1964 if mp.signalPending.Load() != 0 { 1965 pendingPreemptSignals.Add(-1) 1966 } 1967 } 1968 1969 // Destroy all allocated resources. After this is called, we may no 1970 // longer take any locks. 1971 mdestroy(mp) 1972 1973 if osStack { 1974 // No more uses of mp, so it is safe to drop the reference. 1975 mp.freeWait.Store(freeMRef) 1976 1977 // Return from mstart and let the system thread 1978 // library free the g0 stack and terminate the thread. 1979 return 1980 } 1981 1982 // mstart is the thread's entry point, so there's nothing to 1983 // return to. Exit the thread directly. exitThread will clear 1984 // m.freeWait when it's done with the stack and the m can be 1985 // reaped. 1986 exitThread(&mp.freeWait) 1987} 1988 1989// forEachP calls fn(p) for every P p when p reaches a GC safe point. 1990// If a P is currently executing code, this will bring the P to a GC 1991// safe point and execute fn on that P. 
If the P is not executing code 1992// (it is idle or in a syscall), this will call fn(p) directly while 1993// preventing the P from exiting its state. This does not ensure that 1994// fn will run on every CPU executing Go code, but it acts as a global 1995// memory barrier. GC uses this as a "ragged barrier." 1996// 1997// The caller must hold worldsema. fn must not refer to any 1998// part of the current goroutine's stack, since the GC may move it. 1999func forEachP(reason waitReason, fn func(*p)) { 2000 systemstack(func() { 2001 gp := getg().m.curg 2002 // Mark the user stack as preemptible so that it may be scanned. 2003 // Otherwise, our attempt to force all P's to a safepoint could 2004 // result in a deadlock as we attempt to preempt a worker that's 2005 // trying to preempt us (e.g. for a stack scan). 2006 // 2007 // N.B. The execution tracer is not aware of this status 2008 // transition and handles it specially based on the 2009 // wait reason. 2010 casGToWaitingForGC(gp, _Grunning, reason) 2011 forEachPInternal(fn) 2012 casgstatus(gp, _Gwaiting, _Grunning) 2013 }) 2014} 2015 2016// forEachPInternal calls fn(p) for every P p when p reaches a GC safe point. 2017// It is the internal implementation of forEachP. 2018// 2019// The caller must hold worldsema and either must ensure that a GC is not 2020// running (otherwise this may deadlock with the GC trying to preempt this P) 2021// or it must leave its goroutine in a preemptible state before it switches 2022// to the systemstack. Due to these restrictions, prefer forEachP when possible. 2023// 2024//go:systemstack 2025func forEachPInternal(fn func(*p)) { 2026 mp := acquirem() 2027 pp := getg().m.p.ptr() 2028 2029 lock(&sched.lock) 2030 if sched.safePointWait != 0 { 2031 throw("forEachP: sched.safePointWait != 0") 2032 } 2033 sched.safePointWait = gomaxprocs - 1 2034 sched.safePointFn = fn 2035 2036 // Ask all Ps to run the safe point function. 2037 for _, p2 := range allp { 2038 if p2 != pp { 2039 atomic.Store(&p2.runSafePointFn, 1) 2040 } 2041 } 2042 preemptall() 2043 2044 // Any P entering _Pidle or _Psyscall from now on will observe 2045 // p.runSafePointFn == 1 and will call runSafePointFn when 2046 // changing its status to _Pidle/_Psyscall. 2047 2048 // Run safe point function for all idle Ps. sched.pidle will 2049 // not change because we hold sched.lock. 2050 for p := sched.pidle.ptr(); p != nil; p = p.link.ptr() { 2051 if atomic.Cas(&p.runSafePointFn, 1, 0) { 2052 fn(p) 2053 sched.safePointWait-- 2054 } 2055 } 2056 2057 wait := sched.safePointWait > 0 2058 unlock(&sched.lock) 2059 2060 // Run fn for the current P. 2061 fn(pp) 2062 2063 // Force Ps currently in _Psyscall into _Pidle and hand them 2064 // off to induce safe point function execution. 2065 for _, p2 := range allp { 2066 s := p2.status 2067 2068 // We need to be fine-grained about tracing here, since handoffp 2069 // might call into the tracer, and the tracer is non-reentrant. 2070 trace := traceAcquire() 2071 if s == _Psyscall && p2.runSafePointFn == 1 && atomic.Cas(&p2.status, s, _Pidle) { 2072 if trace.ok() { 2073 // It's important that we traceRelease before we call handoffp, which may also traceAcquire. 2074 trace.ProcSteal(p2, false) 2075 traceRelease(trace) 2076 } 2077 p2.syscalltick++ 2078 handoffp(p2) 2079 } else if trace.ok() { 2080 traceRelease(trace) 2081 } 2082 } 2083 2084 // Wait for remaining Ps to run fn. 2085 if wait { 2086 for { 2087 // Wait for 100us, then try to re-preempt in 2088 // case of any races. 2089 // 2090 // Requires system stack. 
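			//
			// The loop below is equivalent to (illustrative only):
			//
			//	for !notetsleep(&sched.safePointNote, 100*1000) {
			//		preemptall() // nudge Ps that missed the first request
			//	}
			//	noteclear(&sched.safePointNote)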
2091 if notetsleep(&sched.safePointNote, 100*1000) { 2092 noteclear(&sched.safePointNote) 2093 break 2094 } 2095 preemptall() 2096 } 2097 } 2098 if sched.safePointWait != 0 { 2099 throw("forEachP: not done") 2100 } 2101 for _, p2 := range allp { 2102 if p2.runSafePointFn != 0 { 2103 throw("forEachP: P did not run fn") 2104 } 2105 } 2106 2107 lock(&sched.lock) 2108 sched.safePointFn = nil 2109 unlock(&sched.lock) 2110 releasem(mp) 2111} 2112 2113// runSafePointFn runs the safe point function, if any, for this P. 2114// This should be called like 2115// 2116// if getg().m.p.runSafePointFn != 0 { 2117// runSafePointFn() 2118// } 2119// 2120// runSafePointFn must be checked on any transition in to _Pidle or 2121// _Psyscall to avoid a race where forEachP sees that the P is running 2122// just before the P goes into _Pidle/_Psyscall and neither forEachP 2123// nor the P run the safe-point function. 2124func runSafePointFn() { 2125 p := getg().m.p.ptr() 2126 // Resolve the race between forEachP running the safe-point 2127 // function on this P's behalf and this P running the 2128 // safe-point function directly. 2129 if !atomic.Cas(&p.runSafePointFn, 1, 0) { 2130 return 2131 } 2132 sched.safePointFn(p) 2133 lock(&sched.lock) 2134 sched.safePointWait-- 2135 if sched.safePointWait == 0 { 2136 notewakeup(&sched.safePointNote) 2137 } 2138 unlock(&sched.lock) 2139} 2140 2141// When running with cgo, we call _cgo_thread_start 2142// to start threads for us so that we can play nicely with 2143// foreign code. 2144var cgoThreadStart unsafe.Pointer 2145 2146type cgothreadstart struct { 2147 g guintptr 2148 tls *uint64 2149 fn unsafe.Pointer 2150} 2151 2152// Allocate a new m unassociated with any thread. 2153// Can use p for allocation context if needed. 2154// fn is recorded as the new m's m.mstartfn. 2155// id is optional pre-allocated m ID. Omit by passing -1. 2156// 2157// This function is allowed to have write barriers even if the caller 2158// isn't because it borrows pp. 2159// 2160//go:yeswritebarrierrec 2161func allocm(pp *p, fn func(), id int64) *m { 2162 allocmLock.rlock() 2163 2164 // The caller owns pp, but we may borrow (i.e., acquirep) it. We must 2165 // disable preemption to ensure it is not stolen, which would make the 2166 // caller lose ownership. 2167 acquirem() 2168 2169 gp := getg() 2170 if gp.m.p == 0 { 2171 acquirep(pp) // temporarily borrow p for mallocs in this function 2172 } 2173 2174 // Release the free M list. We need to do this somewhere and 2175 // this may free up a stack we can use. 2176 if sched.freem != nil { 2177 lock(&sched.lock) 2178 var newList *m 2179 for freem := sched.freem; freem != nil; { 2180 // Wait for freeWait to indicate that freem's stack is unused. 2181 wait := freem.freeWait.Load() 2182 if wait == freeMWait { 2183 next := freem.freelink 2184 freem.freelink = newList 2185 newList = freem 2186 freem = next 2187 continue 2188 } 2189 // Drop any remaining trace resources. 2190 // Ms can continue to emit events all the way until wait != freeMWait, 2191 // so it's only safe to call traceThreadDestroy at this point. 2192 if traceEnabled() || traceShuttingDown() { 2193 traceThreadDestroy(freem) 2194 } 2195 // Free the stack if needed. For freeMRef, there is 2196 // nothing to do except drop freem from the sched.freem 2197 // list. 2198 if wait == freeMStack { 2199 // stackfree must be on the system stack, but allocm is 2200 // reachable off the system stack transitively from 2201 // startm. 
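				//
				// (Recap, illustrative only: freeMWait means the stack may
				// still be in use, so the M stays on the list above;
				// freeMStack, handled here, means the runtime-allocated g0
				// stack must be freed; freeMRef means the stack was
				// system-allocated and there is nothing to free.)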
2202 systemstack(func() { 2203 stackfree(freem.g0.stack) 2204 }) 2205 } 2206 freem = freem.freelink 2207 } 2208 sched.freem = newList 2209 unlock(&sched.lock) 2210 } 2211 2212 mp := new(m) 2213 mp.mstartfn = fn 2214 mcommoninit(mp, id) 2215 2216 // In case of cgo or Solaris or illumos or Darwin, pthread_create will make us a stack. 2217 // Windows and Plan 9 will layout sched stack on OS stack. 2218 if iscgo || mStackIsSystemAllocated() { 2219 mp.g0 = malg(-1) 2220 } else { 2221 mp.g0 = malg(16384 * sys.StackGuardMultiplier) 2222 } 2223 mp.g0.m = mp 2224 2225 if pp == gp.m.p.ptr() { 2226 releasep() 2227 } 2228 2229 releasem(gp.m) 2230 allocmLock.runlock() 2231 return mp 2232} 2233 2234// needm is called when a cgo callback happens on a 2235// thread without an m (a thread not created by Go). 2236// In this case, needm is expected to find an m to use 2237// and return with m, g initialized correctly. 2238// Since m and g are not set now (likely nil, but see below) 2239// needm is limited in what routines it can call. In particular 2240// it can only call nosplit functions (textflag 7) and cannot 2241// do any scheduling that requires an m. 2242// 2243// In order to avoid needing heavy lifting here, we adopt 2244// the following strategy: there is a stack of available m's 2245// that can be stolen. Using compare-and-swap 2246// to pop from the stack has ABA races, so we simulate 2247// a lock by doing an exchange (via Casuintptr) to steal the stack 2248// head and replace the top pointer with MLOCKED (1). 2249// This serves as a simple spin lock that we can use even 2250// without an m. The thread that locks the stack in this way 2251// unlocks the stack by storing a valid stack head pointer. 2252// 2253// In order to make sure that there is always an m structure 2254// available to be stolen, we maintain the invariant that there 2255// is always one more than needed. At the beginning of the 2256// program (if cgo is in use) the list is seeded with a single m. 2257// If needm finds that it has taken the last m off the list, its job 2258// is - once it has installed its own m so that it can do things like 2259// allocate memory - to create a spare m and put it on the list. 2260// 2261// Each of these extra m's also has a g0 and a curg that are 2262// pressed into service as the scheduling stack and current 2263// goroutine for the duration of the cgo callback. 2264// 2265// It calls dropm to put the m back on the list, 2266// 1. when the callback is done with the m in non-pthread platforms, 2267// 2. or when the C thread exiting on pthread platforms. 2268// 2269// The signal argument indicates whether we're called from a signal 2270// handler. 2271// 2272//go:nosplit 2273func needm(signal bool) { 2274 if (iscgo || GOOS == "windows") && !cgoHasExtraM { 2275 // Can happen if C/C++ code calls Go from a global ctor. 2276 // Can also happen on Windows if a global ctor uses a 2277 // callback created by syscall.NewCallback. See issue #6751 2278 // for details. 2279 // 2280 // Can not throw, because scheduler is not initialized yet. 2281 writeErrStr("fatal error: cgo callback before cgo call\n") 2282 exit(1) 2283 } 2284 2285 // Save and block signals before getting an M. 2286 // The signal handler may call needm itself, 2287 // and we must avoid a deadlock. Also, once g is installed, 2288 // any incoming signals will try to execute, 2289 // but we won't have the sigaltstack settings and other data 2290 // set up appropriately until the end of minit, which will 2291 // unblock the signals. 
This is the same dance as when 2292 // starting a new m to run Go code via newosproc. 2293 var sigmask sigset 2294 sigsave(&sigmask) 2295 sigblock(false) 2296 2297 // getExtraM is safe here because of the invariant above, 2298 // that the extra list always contains or will soon contain 2299 // at least one m. 2300 mp, last := getExtraM() 2301 2302 // Set needextram when we've just emptied the list, 2303 // so that the eventual call into cgocallbackg will 2304 // allocate a new m for the extra list. We delay the 2305 // allocation until then so that it can be done 2306 // after exitsyscall makes sure it is okay to be 2307 // running at all (that is, there's no garbage collection 2308 // running right now). 2309 mp.needextram = last 2310 2311 // Store the original signal mask for use by minit. 2312 mp.sigmask = sigmask 2313 2314 // Install TLS on some platforms (previously setg 2315 // would do this if necessary). 2316 osSetupTLS(mp) 2317 2318 // Install g (= m->g0) and set the stack bounds 2319 // to match the current stack. 2320 setg(mp.g0) 2321 sp := getcallersp() 2322 callbackUpdateSystemStack(mp, sp, signal) 2323 2324 // Should mark we are already in Go now. 2325 // Otherwise, we may call needm again when we get a signal, before cgocallbackg1, 2326 // which means the extram list may be empty, that will cause a deadlock. 2327 mp.isExtraInC = false 2328 2329 // Initialize this thread to use the m. 2330 asminit() 2331 minit() 2332 2333 // Emit a trace event for this dead -> syscall transition, 2334 // but only if we're not in a signal handler. 2335 // 2336 // N.B. the tracer can run on a bare M just fine, we just have 2337 // to make sure to do this before setg(nil) and unminit. 2338 var trace traceLocker 2339 if !signal { 2340 trace = traceAcquire() 2341 } 2342 2343 // mp.curg is now a real goroutine. 2344 casgstatus(mp.curg, _Gdead, _Gsyscall) 2345 sched.ngsys.Add(-1) 2346 2347 if !signal { 2348 if trace.ok() { 2349 trace.GoCreateSyscall(mp.curg) 2350 traceRelease(trace) 2351 } 2352 } 2353 mp.isExtraInSig = signal 2354} 2355 2356// Acquire an extra m and bind it to the C thread when a pthread key has been created. 2357// 2358//go:nosplit 2359func needAndBindM() { 2360 needm(false) 2361 2362 if _cgo_pthread_key_created != nil && *(*uintptr)(_cgo_pthread_key_created) != 0 { 2363 cgoBindM() 2364 } 2365} 2366 2367// newextram allocates m's and puts them on the extra list. 2368// It is called with a working local m, so that it can do things 2369// like call schedlock and allocate. 2370func newextram() { 2371 c := extraMWaiters.Swap(0) 2372 if c > 0 { 2373 for i := uint32(0); i < c; i++ { 2374 oneNewExtraM() 2375 } 2376 } else if extraMLength.Load() == 0 { 2377 // Make sure there is at least one extra M. 2378 oneNewExtraM() 2379 } 2380} 2381 2382// oneNewExtraM allocates an m and puts it on the extra list. 2383func oneNewExtraM() { 2384 // Create extra goroutine locked to extra m. 2385 // The goroutine is the context in which the cgo callback will run. 2386 // The sched.pc will never be returned to, but setting it to 2387 // goexit makes clear to the traceback routines where 2388 // the goroutine stack ends. 
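	//
	// Illustrative sketch (not part of the runtime) of how the M and G
	// created here are later used by a cgo callback on a foreign thread:
	//
	//	needm(false)       // pops an extra M, installs its g0, runs minit
	//	cgocallbackg(...)  // runs the Go callback on mp.curg
	//	dropm()            // returns the M to the extra list (or, on
	//	                   // pthread platforms, the thread-exit destructor
	//	                   // calls dropm later)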
2389 mp := allocm(nil, nil, -1) 2390 gp := malg(4096) 2391 gp.sched.pc = abi.FuncPCABI0(goexit) + sys.PCQuantum 2392 gp.sched.sp = gp.stack.hi 2393 gp.sched.sp -= 4 * goarch.PtrSize // extra space in case of reads slightly beyond frame 2394 gp.sched.lr = 0 2395 gp.sched.g = guintptr(unsafe.Pointer(gp)) 2396 gp.syscallpc = gp.sched.pc 2397 gp.syscallsp = gp.sched.sp 2398 gp.stktopsp = gp.sched.sp 2399 // malg returns status as _Gidle. Change to _Gdead before 2400 // adding to allg where GC can see it. We use _Gdead to hide 2401 // this from tracebacks and stack scans since it isn't a 2402 // "real" goroutine until needm grabs it. 2403 casgstatus(gp, _Gidle, _Gdead) 2404 gp.m = mp 2405 mp.curg = gp 2406 mp.isextra = true 2407 // mark we are in C by default. 2408 mp.isExtraInC = true 2409 mp.lockedInt++ 2410 mp.lockedg.set(gp) 2411 gp.lockedm.set(mp) 2412 gp.goid = sched.goidgen.Add(1) 2413 if raceenabled { 2414 gp.racectx = racegostart(abi.FuncPCABIInternal(newextram) + sys.PCQuantum) 2415 } 2416 // put on allg for garbage collector 2417 allgadd(gp) 2418 2419 // gp is now on the allg list, but we don't want it to be 2420 // counted by gcount. It would be more "proper" to increment 2421 // sched.ngfree, but that requires locking. Incrementing ngsys 2422 // has the same effect. 2423 sched.ngsys.Add(1) 2424 2425 // Add m to the extra list. 2426 addExtraM(mp) 2427} 2428 2429// dropm puts the current m back onto the extra list. 2430// 2431// 1. On systems without pthreads, like Windows 2432// dropm is called when a cgo callback has called needm but is now 2433// done with the callback and returning back into the non-Go thread. 2434// 2435// The main expense here is the call to signalstack to release the 2436// m's signal stack, and then the call to needm on the next callback 2437// from this thread. It is tempting to try to save the m for next time, 2438// which would eliminate both these costs, but there might not be 2439// a next time: the current thread (which Go does not control) might exit. 2440// If we saved the m for that thread, there would be an m leak each time 2441// such a thread exited. Instead, we acquire and release an m on each 2442// call. These should typically not be scheduling operations, just a few 2443// atomics, so the cost should be small. 2444// 2445// 2. On systems with pthreads 2446// dropm is called while a non-Go thread is exiting. 2447// We allocate a pthread per-thread variable using pthread_key_create, 2448// to register a thread-exit-time destructor. 2449// And store the g into a thread-specific value associated with the pthread key, 2450// when first return back to C. 2451// So that the destructor would invoke dropm while the non-Go thread is exiting. 2452// This is much faster since it avoids expensive signal-related syscalls. 2453// 2454// This always runs without a P, so //go:nowritebarrierrec is required. 2455// 2456// This may run with a different stack than was recorded in g0 (there is no 2457// call to callbackUpdateSystemStack prior to dropm), so this must be 2458// //go:nosplit to avoid the stack bounds check. 2459// 2460//go:nowritebarrierrec 2461//go:nosplit 2462func dropm() { 2463 // Clear m and g, and return m to the extra list. 2464 // After the call to setg we can only call nosplit functions 2465 // with no pointer manipulation. 2466 mp := getg().m 2467 2468 // Emit a trace event for this syscall -> dead transition. 2469 // 2470 // N.B. 
the tracer can run on a bare M just fine, we just have 2471 // to make sure to do this before setg(nil) and unminit. 2472 var trace traceLocker 2473 if !mp.isExtraInSig { 2474 trace = traceAcquire() 2475 } 2476 2477 // Return mp.curg to dead state. 2478 casgstatus(mp.curg, _Gsyscall, _Gdead) 2479 mp.curg.preemptStop = false 2480 sched.ngsys.Add(1) 2481 2482 if !mp.isExtraInSig { 2483 if trace.ok() { 2484 trace.GoDestroySyscall() 2485 traceRelease(trace) 2486 } 2487 } 2488 2489 // Trash syscalltick so that it doesn't line up with mp.old.syscalltick anymore. 2490 // 2491 // In the new tracer, we model needm and dropm and a goroutine being created and 2492 // destroyed respectively. The m then might get reused with a different procid but 2493 // still with a reference to oldp, and still with the same syscalltick. The next 2494 // time a G is "created" in needm, it'll return and quietly reacquire its P from a 2495 // different m with a different procid, which will confuse the trace parser. By 2496 // trashing syscalltick, we ensure that it'll appear as if we lost the P to the 2497 // tracer parser and that we just reacquired it. 2498 // 2499 // Trash the value by decrementing because that gets us as far away from the value 2500 // the syscall exit code expects as possible. Setting to zero is risky because 2501 // syscalltick could already be zero (and in fact, is initialized to zero). 2502 mp.syscalltick-- 2503 2504 // Reset trace state unconditionally. This goroutine is being 'destroyed' 2505 // from the perspective of the tracer. 2506 mp.curg.trace.reset() 2507 2508 // Flush all the M's buffers. This is necessary because the M might 2509 // be used on a different thread with a different procid, so we have 2510 // to make sure we don't write into the same buffer. 2511 if traceEnabled() || traceShuttingDown() { 2512 // Acquire sched.lock across thread destruction. One of the invariants of the tracer 2513 // is that a thread cannot disappear from the tracer's view (allm or freem) without 2514 // it noticing, so it requires that sched.lock be held over traceThreadDestroy. 2515 // 2516 // This isn't strictly necessary in this case, because this thread never leaves allm, 2517 // but the critical section is short and dropm is rare on pthread platforms, so just 2518 // take the lock and play it safe. traceThreadDestroy also asserts that the lock is held. 2519 lock(&sched.lock) 2520 traceThreadDestroy(mp) 2521 unlock(&sched.lock) 2522 } 2523 mp.isExtraInSig = false 2524 2525 // Block signals before unminit. 2526 // Unminit unregisters the signal handling stack (but needs g on some systems). 2527 // Setg(nil) clears g, which is the signal handler's cue not to run Go handlers. 2528 // It's important not to try to handle a signal between those two steps. 2529 sigmask := mp.sigmask 2530 sigblock(false) 2531 unminit() 2532 2533 setg(nil) 2534 2535 // Clear g0 stack bounds to ensure that needm always refreshes the 2536 // bounds when reusing this M. 2537 g0 := mp.g0 2538 g0.stack.hi = 0 2539 g0.stack.lo = 0 2540 g0.stackguard0 = 0 2541 g0.stackguard1 = 0 2542 2543 putExtraM(mp) 2544 2545 msigrestore(sigmask) 2546} 2547 2548// bindm store the g0 of the current m into a thread-specific value. 2549// 2550// We allocate a pthread per-thread variable using pthread_key_create, 2551// to register a thread-exit-time destructor. 2552// We are here setting the thread-specific value of the pthread key, to enable the destructor. 2553// So that the pthread_key_destructor would dropm while the C thread is exiting. 
2554// 2555// And the saved g will be used in pthread_key_destructor, 2556// since the g stored in the TLS by Go might be cleared in some platforms, 2557// before the destructor invoked, so, we restore g by the stored g, before dropm. 2558// 2559// We store g0 instead of m, to make the assembly code simpler, 2560// since we need to restore g0 in runtime.cgocallback. 2561// 2562// On systems without pthreads, like Windows, bindm shouldn't be used. 2563// 2564// NOTE: this always runs without a P, so, nowritebarrierrec required. 2565// 2566//go:nosplit 2567//go:nowritebarrierrec 2568func cgoBindM() { 2569 if GOOS == "windows" || GOOS == "plan9" { 2570 fatal("bindm in unexpected GOOS") 2571 } 2572 g := getg() 2573 if g.m.g0 != g { 2574 fatal("the current g is not g0") 2575 } 2576 if _cgo_bindm != nil { 2577 asmcgocall(_cgo_bindm, unsafe.Pointer(g)) 2578 } 2579} 2580 2581// A helper function for EnsureDropM. 2582// 2583// getm should be an internal detail, 2584// but widely used packages access it using linkname. 2585// Notable members of the hall of shame include: 2586// - fortio.org/log 2587// 2588// Do not remove or change the type signature. 2589// See go.dev/issue/67401. 2590// 2591//go:linkname getm 2592func getm() uintptr { 2593 return uintptr(unsafe.Pointer(getg().m)) 2594} 2595 2596var ( 2597 // Locking linked list of extra M's, via mp.schedlink. Must be accessed 2598 // only via lockextra/unlockextra. 2599 // 2600 // Can't be atomic.Pointer[m] because we use an invalid pointer as a 2601 // "locked" sentinel value. M's on this list remain visible to the GC 2602 // because their mp.curg is on allgs. 2603 extraM atomic.Uintptr 2604 // Number of M's in the extraM list. 2605 extraMLength atomic.Uint32 2606 // Number of waiters in lockextra. 2607 extraMWaiters atomic.Uint32 2608 2609 // Number of extra M's in use by threads. 2610 extraMInUse atomic.Uint32 2611) 2612 2613// lockextra locks the extra list and returns the list head. 2614// The caller must unlock the list by storing a new list head 2615// to extram. If nilokay is true, then lockextra will 2616// return a nil list head if that's what it finds. If nilokay is false, 2617// lockextra will keep waiting until the list head is no longer nil. 2618// 2619//go:nosplit 2620func lockextra(nilokay bool) *m { 2621 const locked = 1 2622 2623 incr := false 2624 for { 2625 old := extraM.Load() 2626 if old == locked { 2627 osyield_no_g() 2628 continue 2629 } 2630 if old == 0 && !nilokay { 2631 if !incr { 2632 // Add 1 to the number of threads 2633 // waiting for an M. 2634 // This is cleared by newextram. 2635 extraMWaiters.Add(1) 2636 incr = true 2637 } 2638 usleep_no_g(1) 2639 continue 2640 } 2641 if extraM.CompareAndSwap(old, locked) { 2642 return (*m)(unsafe.Pointer(old)) 2643 } 2644 osyield_no_g() 2645 continue 2646 } 2647} 2648 2649//go:nosplit 2650func unlockextra(mp *m, delta int32) { 2651 extraMLength.Add(delta) 2652 extraM.Store(uintptr(unsafe.Pointer(mp))) 2653} 2654 2655// Return an M from the extra M list. Returns last == true if the list becomes 2656// empty because of this call. 2657// 2658// Spins waiting for an extra M, so caller must ensure that the list always 2659// contains or will soon contain at least one M. 2660// 2661//go:nosplit 2662func getExtraM() (mp *m, last bool) { 2663 mp = lockextra(false) 2664 extraMInUse.Add(1) 2665 unlockextra(mp.schedlink.ptr(), -1) 2666 return mp, mp.schedlink.ptr() == nil 2667} 2668 2669// Returns an extra M back to the list. mp must be from getExtraM. 
Newly 2670// allocated M's should use addExtraM. 2671// 2672//go:nosplit 2673func putExtraM(mp *m) { 2674 extraMInUse.Add(-1) 2675 addExtraM(mp) 2676} 2677 2678// Adds a newly allocated M to the extra M list. 2679// 2680//go:nosplit 2681func addExtraM(mp *m) { 2682 mnext := lockextra(true) 2683 mp.schedlink.set(mnext) 2684 unlockextra(mp, 1) 2685} 2686 2687var ( 2688 // allocmLock is locked for read when creating new Ms in allocm and their 2689 // addition to allm. Thus acquiring this lock for write blocks the 2690 // creation of new Ms. 2691 allocmLock rwmutex 2692 2693 // execLock serializes exec and clone to avoid bugs or unspecified 2694 // behaviour around exec'ing while creating/destroying threads. See 2695 // issue #19546. 2696 execLock rwmutex 2697) 2698 2699// These errors are reported (via writeErrStr) by some OS-specific 2700// versions of newosproc and newosproc0. 2701const ( 2702 failthreadcreate = "runtime: failed to create new OS thread\n" 2703 failallocatestack = "runtime: failed to allocate stack for the new OS thread\n" 2704) 2705 2706// newmHandoff contains a list of m structures that need new OS threads. 2707// This is used by newm in situations where newm itself can't safely 2708// start an OS thread. 2709var newmHandoff struct { 2710 lock mutex 2711 2712 // newm points to a list of M structures that need new OS 2713 // threads. The list is linked through m.schedlink. 2714 newm muintptr 2715 2716 // waiting indicates that wake needs to be notified when an m 2717 // is put on the list. 2718 waiting bool 2719 wake note 2720 2721 // haveTemplateThread indicates that the templateThread has 2722 // been started. This is not protected by lock. Use cas to set 2723 // to 1. 2724 haveTemplateThread uint32 2725} 2726 2727// Create a new m. It will start off with a call to fn, or else the scheduler. 2728// fn needs to be static and not a heap allocated closure. 2729// May run with m.p==nil, so write barriers are not allowed. 2730// 2731// id is optional pre-allocated m ID. Omit by passing -1. 2732// 2733//go:nowritebarrierrec 2734func newm(fn func(), pp *p, id int64) { 2735 // allocm adds a new M to allm, but they do not start until created by 2736 // the OS in newm1 or the template thread. 2737 // 2738 // doAllThreadsSyscall requires that every M in allm will eventually 2739 // start and be signal-able, even with a STW. 2740 // 2741 // Disable preemption here until we start the thread to ensure that 2742 // newm is not preempted between allocm and starting the new thread, 2743 // ensuring that anything added to allm is guaranteed to eventually 2744 // start. 2745 acquirem() 2746 2747 mp := allocm(pp, fn, id) 2748 mp.nextp.set(pp) 2749 mp.sigmask = initSigmask 2750 if gp := getg(); gp != nil && gp.m != nil && (gp.m.lockedExt != 0 || gp.m.incgo) && GOOS != "plan9" { 2751 // We're on a locked M or a thread that may have been 2752 // started by C. The kernel state of this thread may 2753 // be strange (the user may have locked it for that 2754 // purpose). We don't want to clone that into another 2755 // thread. Instead, ask a known-good thread to create 2756 // the thread for us. 2757 // 2758 // This is disabled on Plan 9. See golang.org/issue/22227. 2759 // 2760 // TODO: This may be unnecessary on Windows, which 2761 // doesn't model thread creation off fork. 
2762 lock(&newmHandoff.lock) 2763 if newmHandoff.haveTemplateThread == 0 { 2764 throw("on a locked thread with no template thread") 2765 } 2766 mp.schedlink = newmHandoff.newm 2767 newmHandoff.newm.set(mp) 2768 if newmHandoff.waiting { 2769 newmHandoff.waiting = false 2770 notewakeup(&newmHandoff.wake) 2771 } 2772 unlock(&newmHandoff.lock) 2773 // The M has not started yet, but the template thread does not 2774 // participate in STW, so it will always process queued Ms and 2775 // it is safe to releasem. 2776 releasem(getg().m) 2777 return 2778 } 2779 newm1(mp) 2780 releasem(getg().m) 2781} 2782 2783func newm1(mp *m) { 2784 if iscgo { 2785 var ts cgothreadstart 2786 if _cgo_thread_start == nil { 2787 throw("_cgo_thread_start missing") 2788 } 2789 ts.g.set(mp.g0) 2790 ts.tls = (*uint64)(unsafe.Pointer(&mp.tls[0])) 2791 ts.fn = unsafe.Pointer(abi.FuncPCABI0(mstart)) 2792 if msanenabled { 2793 msanwrite(unsafe.Pointer(&ts), unsafe.Sizeof(ts)) 2794 } 2795 if asanenabled { 2796 asanwrite(unsafe.Pointer(&ts), unsafe.Sizeof(ts)) 2797 } 2798 execLock.rlock() // Prevent process clone. 2799 asmcgocall(_cgo_thread_start, unsafe.Pointer(&ts)) 2800 execLock.runlock() 2801 return 2802 } 2803 execLock.rlock() // Prevent process clone. 2804 newosproc(mp) 2805 execLock.runlock() 2806} 2807 2808// startTemplateThread starts the template thread if it is not already 2809// running. 2810// 2811// The calling thread must itself be in a known-good state. 2812func startTemplateThread() { 2813 if GOARCH == "wasm" { // no threads on wasm yet 2814 return 2815 } 2816 2817 // Disable preemption to guarantee that the template thread will be 2818 // created before a park once haveTemplateThread is set. 2819 mp := acquirem() 2820 if !atomic.Cas(&newmHandoff.haveTemplateThread, 0, 1) { 2821 releasem(mp) 2822 return 2823 } 2824 newm(templateThread, nil, -1) 2825 releasem(mp) 2826} 2827 2828// templateThread is a thread in a known-good state that exists solely 2829// to start new threads in known-good states when the calling thread 2830// may not be in a good state. 2831// 2832// Many programs never need this, so templateThread is started lazily 2833// when we first enter a state that might lead to running on a thread 2834// in an unknown state. 2835// 2836// templateThread runs on an M without a P, so it must not have write 2837// barriers. 2838// 2839//go:nowritebarrierrec 2840func templateThread() { 2841 lock(&sched.lock) 2842 sched.nmsys++ 2843 checkdead() 2844 unlock(&sched.lock) 2845 2846 for { 2847 lock(&newmHandoff.lock) 2848 for newmHandoff.newm != 0 { 2849 newm := newmHandoff.newm.ptr() 2850 newmHandoff.newm = 0 2851 unlock(&newmHandoff.lock) 2852 for newm != nil { 2853 next := newm.schedlink.ptr() 2854 newm.schedlink = 0 2855 newm1(newm) 2856 newm = next 2857 } 2858 lock(&newmHandoff.lock) 2859 } 2860 newmHandoff.waiting = true 2861 noteclear(&newmHandoff.wake) 2862 unlock(&newmHandoff.lock) 2863 notesleep(&newmHandoff.wake) 2864 } 2865} 2866 2867// Stops execution of the current m until new work is available. 2868// Returns with acquired P. 2869func stopm() { 2870 gp := getg() 2871 2872 if gp.m.locks != 0 { 2873 throw("stopm holding locks") 2874 } 2875 if gp.m.p != 0 { 2876 throw("stopm holding p") 2877 } 2878 if gp.m.spinning { 2879 throw("stopm spinning") 2880 } 2881 2882 lock(&sched.lock) 2883 mput(gp.m) 2884 unlock(&sched.lock) 2885 mPark() 2886 acquirep(gp.m.nextp.ptr()) 2887 gp.m.nextp = 0 2888} 2889 2890func mspinning() { 2891 // startm's caller incremented nmspinning. Set the new M's spinning. 
2892 getg().m.spinning = true 2893} 2894 2895// Schedules some M to run the p (creates an M if necessary). 2896// If p==nil, tries to get an idle P, if no idle P's does nothing. 2897// May run with m.p==nil, so write barriers are not allowed. 2898// If spinning is set, the caller has incremented nmspinning and must provide a 2899// P. startm will set m.spinning in the newly started M. 2900// 2901// Callers passing a non-nil P must call from a non-preemptible context. See 2902// comment on acquirem below. 2903// 2904// Argument lockheld indicates whether the caller already acquired the 2905// scheduler lock. Callers holding the lock when making the call must pass 2906// true. The lock might be temporarily dropped, but will be reacquired before 2907// returning. 2908// 2909// Must not have write barriers because this may be called without a P. 2910// 2911//go:nowritebarrierrec 2912func startm(pp *p, spinning, lockheld bool) { 2913 // Disable preemption. 2914 // 2915 // Every owned P must have an owner that will eventually stop it in the 2916 // event of a GC stop request. startm takes transient ownership of a P 2917 // (either from argument or pidleget below) and transfers ownership to 2918 // a started M, which will be responsible for performing the stop. 2919 // 2920 // Preemption must be disabled during this transient ownership, 2921 // otherwise the P this is running on may enter GC stop while still 2922 // holding the transient P, leaving that P in limbo and deadlocking the 2923 // STW. 2924 // 2925 // Callers passing a non-nil P must already be in non-preemptible 2926 // context, otherwise such preemption could occur on function entry to 2927 // startm. Callers passing a nil P may be preemptible, so we must 2928 // disable preemption before acquiring a P from pidleget below. 2929 mp := acquirem() 2930 if !lockheld { 2931 lock(&sched.lock) 2932 } 2933 if pp == nil { 2934 if spinning { 2935 // TODO(prattmic): All remaining calls to this function 2936 // with _p_ == nil could be cleaned up to find a P 2937 // before calling startm. 2938 throw("startm: P required for spinning=true") 2939 } 2940 pp, _ = pidleget(0) 2941 if pp == nil { 2942 if !lockheld { 2943 unlock(&sched.lock) 2944 } 2945 releasem(mp) 2946 return 2947 } 2948 } 2949 nmp := mget() 2950 if nmp == nil { 2951 // No M is available, we must drop sched.lock and call newm. 2952 // However, we already own a P to assign to the M. 2953 // 2954 // Once sched.lock is released, another G (e.g., in a syscall), 2955 // could find no idle P while checkdead finds a runnable G but 2956 // no running M's because this new M hasn't started yet, thus 2957 // throwing in an apparent deadlock. 2958 // This apparent deadlock is possible when startm is called 2959 // from sysmon, which doesn't count as a running M. 2960 // 2961 // Avoid this situation by pre-allocating the ID for the new M, 2962 // thus marking it as 'running' before we drop sched.lock. This 2963 // new M will eventually run the scheduler to execute any 2964 // queued G's. 2965 id := mReserveID() 2966 unlock(&sched.lock) 2967 2968 var fn func() 2969 if spinning { 2970 // The caller incremented nmspinning, so set m.spinning in the new M. 2971 fn = mspinning 2972 } 2973 newm(fn, pp, id) 2974 2975 if lockheld { 2976 lock(&sched.lock) 2977 } 2978 // Ownership transfer of pp committed by start in newm. 2979 // Preemption is now safe. 
2980 releasem(mp) 2981 return 2982 } 2983 if !lockheld { 2984 unlock(&sched.lock) 2985 } 2986 if nmp.spinning { 2987 throw("startm: m is spinning") 2988 } 2989 if nmp.nextp != 0 { 2990 throw("startm: m has p") 2991 } 2992 if spinning && !runqempty(pp) { 2993 throw("startm: p has runnable gs") 2994 } 2995 // The caller incremented nmspinning, so set m.spinning in the new M. 2996 nmp.spinning = spinning 2997 nmp.nextp.set(pp) 2998 notewakeup(&nmp.park) 2999 // Ownership transfer of pp committed by wakeup. Preemption is now 3000 // safe. 3001 releasem(mp) 3002} 3003 3004// Hands off P from syscall or locked M. 3005// Always runs without a P, so write barriers are not allowed. 3006// 3007//go:nowritebarrierrec 3008func handoffp(pp *p) { 3009 // handoffp must start an M in any situation where 3010 // findrunnable would return a G to run on pp. 3011 3012 // if it has local work, start it straight away 3013 if !runqempty(pp) || sched.runqsize != 0 { 3014 startm(pp, false, false) 3015 return 3016 } 3017 // if there's trace work to do, start it straight away 3018 if (traceEnabled() || traceShuttingDown()) && traceReaderAvailable() != nil { 3019 startm(pp, false, false) 3020 return 3021 } 3022 // if it has GC work, start it straight away 3023 if gcBlackenEnabled != 0 && gcMarkWorkAvailable(pp) { 3024 startm(pp, false, false) 3025 return 3026 } 3027 // no local work, check that there are no spinning/idle M's, 3028 // otherwise our help is not required 3029 if sched.nmspinning.Load()+sched.npidle.Load() == 0 && sched.nmspinning.CompareAndSwap(0, 1) { // TODO: fast atomic 3030 sched.needspinning.Store(0) 3031 startm(pp, true, false) 3032 return 3033 } 3034 lock(&sched.lock) 3035 if sched.gcwaiting.Load() { 3036 pp.status = _Pgcstop 3037 pp.gcStopTime = nanotime() 3038 sched.stopwait-- 3039 if sched.stopwait == 0 { 3040 notewakeup(&sched.stopnote) 3041 } 3042 unlock(&sched.lock) 3043 return 3044 } 3045 if pp.runSafePointFn != 0 && atomic.Cas(&pp.runSafePointFn, 1, 0) { 3046 sched.safePointFn(pp) 3047 sched.safePointWait-- 3048 if sched.safePointWait == 0 { 3049 notewakeup(&sched.safePointNote) 3050 } 3051 } 3052 if sched.runqsize != 0 { 3053 unlock(&sched.lock) 3054 startm(pp, false, false) 3055 return 3056 } 3057 // If this is the last running P and nobody is polling network, 3058 // need to wakeup another M to poll network. 3059 if sched.npidle.Load() == gomaxprocs-1 && sched.lastpoll.Load() != 0 { 3060 unlock(&sched.lock) 3061 startm(pp, false, false) 3062 return 3063 } 3064 3065 // The scheduler lock cannot be held when calling wakeNetPoller below 3066 // because wakeNetPoller may call wakep which may call startm. 3067 when := pp.timers.wakeTime() 3068 pidleput(pp, 0) 3069 unlock(&sched.lock) 3070 3071 if when != 0 { 3072 wakeNetPoller(when) 3073 } 3074} 3075 3076// Tries to add one more P to execute G's. 3077// Called when a G is made runnable (newproc, ready). 3078// Must be called with a P. 3079// 3080// wakep should be an internal detail, 3081// but widely used packages access it using linkname. 3082// Notable members of the hall of shame include: 3083// - gvisor.dev/gvisor 3084// 3085// Do not remove or change the type signature. 3086// See go.dev/issue/67401. 3087// 3088//go:linkname wakep 3089func wakep() { 3090 // Be conservative about spinning threads, only start one if none exist 3091 // already. 3092 if sched.nmspinning.Load() != 0 || !sched.nmspinning.CompareAndSwap(0, 1) { 3093 return 3094 } 3095 3096 // Disable preemption until ownership of pp transfers to the next M in 3097 // startm. 
Otherwise preemption here would leave pp stuck waiting to 3098 // enter _Pgcstop. 3099 // 3100 // See preemption comment on acquirem in startm for more details. 3101 mp := acquirem() 3102 3103 var pp *p 3104 lock(&sched.lock) 3105 pp, _ = pidlegetSpinning(0) 3106 if pp == nil { 3107 if sched.nmspinning.Add(-1) < 0 { 3108 throw("wakep: negative nmspinning") 3109 } 3110 unlock(&sched.lock) 3111 releasem(mp) 3112 return 3113 } 3114 // Since we always have a P, the race in the "No M is available" 3115 // comment in startm doesn't apply during the small window between the 3116 // unlock here and lock in startm. A checkdead in between will always 3117 // see at least one running M (ours). 3118 unlock(&sched.lock) 3119 3120 startm(pp, true, false) 3121 3122 releasem(mp) 3123} 3124 3125// Stops execution of the current m that is locked to a g until the g is runnable again. 3126// Returns with acquired P. 3127func stoplockedm() { 3128 gp := getg() 3129 3130 if gp.m.lockedg == 0 || gp.m.lockedg.ptr().lockedm.ptr() != gp.m { 3131 throw("stoplockedm: inconsistent locking") 3132 } 3133 if gp.m.p != 0 { 3134 // Schedule another M to run this p. 3135 pp := releasep() 3136 handoffp(pp) 3137 } 3138 incidlelocked(1) 3139 // Wait until another thread schedules lockedg again. 3140 mPark() 3141 status := readgstatus(gp.m.lockedg.ptr()) 3142 if status&^_Gscan != _Grunnable { 3143 print("runtime:stoplockedm: lockedg (atomicstatus=", status, ") is not Grunnable or Gscanrunnable\n") 3144 dumpgstatus(gp.m.lockedg.ptr()) 3145 throw("stoplockedm: not runnable") 3146 } 3147 acquirep(gp.m.nextp.ptr()) 3148 gp.m.nextp = 0 3149} 3150 3151// Schedules the locked m to run the locked gp. 3152// May run during STW, so write barriers are not allowed. 3153// 3154//go:nowritebarrierrec 3155func startlockedm(gp *g) { 3156 mp := gp.lockedm.ptr() 3157 if mp == getg().m { 3158 throw("startlockedm: locked to me") 3159 } 3160 if mp.nextp != 0 { 3161 throw("startlockedm: m has p") 3162 } 3163 // directly handoff current P to the locked m 3164 incidlelocked(-1) 3165 pp := releasep() 3166 mp.nextp.set(pp) 3167 notewakeup(&mp.park) 3168 stopm() 3169} 3170 3171// Stops the current m for stopTheWorld. 3172// Returns when the world is restarted. 3173func gcstopm() { 3174 gp := getg() 3175 3176 if !sched.gcwaiting.Load() { 3177 throw("gcstopm: not waiting for gc") 3178 } 3179 if gp.m.spinning { 3180 gp.m.spinning = false 3181 // OK to just drop nmspinning here, 3182 // startTheWorld will unpark threads as necessary. 3183 if sched.nmspinning.Add(-1) < 0 { 3184 throw("gcstopm: negative nmspinning") 3185 } 3186 } 3187 pp := releasep() 3188 lock(&sched.lock) 3189 pp.status = _Pgcstop 3190 pp.gcStopTime = nanotime() 3191 sched.stopwait-- 3192 if sched.stopwait == 0 { 3193 notewakeup(&sched.stopnote) 3194 } 3195 unlock(&sched.lock) 3196 stopm() 3197} 3198 3199// Schedules gp to run on the current M. 3200// If inheritTime is true, gp inherits the remaining time in the 3201// current time slice. Otherwise, it starts a new time slice. 3202// Never returns. 3203// 3204// Write barriers are allowed because this is called immediately after 3205// acquiring a P in several places. 3206// 3207//go:yeswritebarrierrec 3208func execute(gp *g, inheritTime bool) { 3209 mp := getg().m 3210 3211 if goroutineProfile.active { 3212 // Make sure that gp has had its stack written out to the goroutine 3213 // profile, exactly as it was when the goroutine profiler first stopped 3214 // the world. 
3215 tryRecordGoroutineProfile(gp, nil, osyield) 3216 } 3217 3218 // Assign gp.m before entering _Grunning so running Gs have an 3219 // M. 3220 mp.curg = gp 3221 gp.m = mp 3222 casgstatus(gp, _Grunnable, _Grunning) 3223 gp.waitsince = 0 3224 gp.preempt = false 3225 gp.stackguard0 = gp.stack.lo + stackGuard 3226 if !inheritTime { 3227 mp.p.ptr().schedtick++ 3228 } 3229 3230 // Check whether the profiler needs to be turned on or off. 3231 hz := sched.profilehz 3232 if mp.profilehz != hz { 3233 setThreadCPUProfiler(hz) 3234 } 3235 3236 trace := traceAcquire() 3237 if trace.ok() { 3238 trace.GoStart() 3239 traceRelease(trace) 3240 } 3241 3242 gogo(&gp.sched) 3243} 3244 3245// Finds a runnable goroutine to execute. 3246// Tries to steal from other P's, get g from local or global queue, poll network. 3247// tryWakeP indicates that the returned goroutine is not normal (GC worker, trace 3248// reader) so the caller should try to wake a P. 3249func findRunnable() (gp *g, inheritTime, tryWakeP bool) { 3250 mp := getg().m 3251 3252 // The conditions here and in handoffp must agree: if 3253 // findrunnable would return a G to run, handoffp must start 3254 // an M. 3255 3256top: 3257 pp := mp.p.ptr() 3258 if sched.gcwaiting.Load() { 3259 gcstopm() 3260 goto top 3261 } 3262 if pp.runSafePointFn != 0 { 3263 runSafePointFn() 3264 } 3265 3266 // now and pollUntil are saved for work stealing later, 3267 // which may steal timers. It's important that between now 3268 // and then, nothing blocks, so these numbers remain mostly 3269 // relevant. 3270 now, pollUntil, _ := pp.timers.check(0) 3271 3272 // Try to schedule the trace reader. 3273 if traceEnabled() || traceShuttingDown() { 3274 gp := traceReader() 3275 if gp != nil { 3276 trace := traceAcquire() 3277 casgstatus(gp, _Gwaiting, _Grunnable) 3278 if trace.ok() { 3279 trace.GoUnpark(gp, 0) 3280 traceRelease(trace) 3281 } 3282 return gp, false, true 3283 } 3284 } 3285 3286 // Try to schedule a GC worker. 3287 if gcBlackenEnabled != 0 { 3288 gp, tnow := gcController.findRunnableGCWorker(pp, now) 3289 if gp != nil { 3290 return gp, false, true 3291 } 3292 now = tnow 3293 } 3294 3295 // Check the global runnable queue once in a while to ensure fairness. 3296 // Otherwise two goroutines can completely occupy the local runqueue 3297 // by constantly respawning each other. 3298 if pp.schedtick%61 == 0 && sched.runqsize > 0 { 3299 lock(&sched.lock) 3300 gp := globrunqget(pp, 1) 3301 unlock(&sched.lock) 3302 if gp != nil { 3303 return gp, false, false 3304 } 3305 } 3306 3307 // Wake up the finalizer G. 3308 if fingStatus.Load()&(fingWait|fingWake) == fingWait|fingWake { 3309 if gp := wakefing(); gp != nil { 3310 ready(gp, 0, true) 3311 } 3312 } 3313 if *cgo_yield != nil { 3314 asmcgocall(*cgo_yield, nil) 3315 } 3316 3317 // local runq 3318 if gp, inheritTime := runqget(pp); gp != nil { 3319 return gp, inheritTime, false 3320 } 3321 3322 // global runq 3323 if sched.runqsize != 0 { 3324 lock(&sched.lock) 3325 gp := globrunqget(pp, 0) 3326 unlock(&sched.lock) 3327 if gp != nil { 3328 return gp, false, false 3329 } 3330 } 3331 3332 // Poll network. 3333 // This netpoll is only an optimization before we resort to stealing. 3334 // We can safely skip it if there are no waiters or a thread is blocked 3335 // in netpoll already. If there is any kind of logical race with that 3336 // blocked thread (e.g. it has already returned from netpoll, but does 3337 // not set lastpoll yet), this thread will do blocking netpoll below 3338 // anyway. 
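	//
	// In other words (illustrative summary), the gate below requires all of:
	//
	//	netpollinited()            // the poller has been initialized
	//	netpollAnyWaiters()        // some goroutine is blocked on the poller
	//	sched.lastpoll.Load() != 0 // no thread is currently blocked in netpoll
	//
	// before paying for even a non-blocking poll.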
3339 if netpollinited() && netpollAnyWaiters() && sched.lastpoll.Load() != 0 { 3340 if list, delta := netpoll(0); !list.empty() { // non-blocking 3341 gp := list.pop() 3342 injectglist(&list) 3343 netpollAdjustWaiters(delta) 3344 trace := traceAcquire() 3345 casgstatus(gp, _Gwaiting, _Grunnable) 3346 if trace.ok() { 3347 trace.GoUnpark(gp, 0) 3348 traceRelease(trace) 3349 } 3350 return gp, false, false 3351 } 3352 } 3353 3354 // Spinning Ms: steal work from other Ps. 3355 // 3356 // Limit the number of spinning Ms to half the number of busy Ps. 3357 // This is necessary to prevent excessive CPU consumption when 3358 // GOMAXPROCS>>1 but the program parallelism is low. 3359 if mp.spinning || 2*sched.nmspinning.Load() < gomaxprocs-sched.npidle.Load() { 3360 if !mp.spinning { 3361 mp.becomeSpinning() 3362 } 3363 3364 gp, inheritTime, tnow, w, newWork := stealWork(now) 3365 if gp != nil { 3366 // Successfully stole. 3367 return gp, inheritTime, false 3368 } 3369 if newWork { 3370 // There may be new timer or GC work; restart to 3371 // discover. 3372 goto top 3373 } 3374 3375 now = tnow 3376 if w != 0 && (pollUntil == 0 || w < pollUntil) { 3377 // Earlier timer to wait for. 3378 pollUntil = w 3379 } 3380 } 3381 3382 // We have nothing to do. 3383 // 3384 // If we're in the GC mark phase, can safely scan and blacken objects, 3385 // and have work to do, run idle-time marking rather than give up the P. 3386 if gcBlackenEnabled != 0 && gcMarkWorkAvailable(pp) && gcController.addIdleMarkWorker() { 3387 node := (*gcBgMarkWorkerNode)(gcBgMarkWorkerPool.pop()) 3388 if node != nil { 3389 pp.gcMarkWorkerMode = gcMarkWorkerIdleMode 3390 gp := node.gp.ptr() 3391 3392 trace := traceAcquire() 3393 casgstatus(gp, _Gwaiting, _Grunnable) 3394 if trace.ok() { 3395 trace.GoUnpark(gp, 0) 3396 traceRelease(trace) 3397 } 3398 return gp, false, false 3399 } 3400 gcController.removeIdleMarkWorker() 3401 } 3402 3403 // wasm only: 3404 // If a callback returned and no other goroutine is awake, 3405 // then wake event handler goroutine which pauses execution 3406 // until a callback was triggered. 3407 gp, otherReady := beforeIdle(now, pollUntil) 3408 if gp != nil { 3409 trace := traceAcquire() 3410 casgstatus(gp, _Gwaiting, _Grunnable) 3411 if trace.ok() { 3412 trace.GoUnpark(gp, 0) 3413 traceRelease(trace) 3414 } 3415 return gp, false, false 3416 } 3417 if otherReady { 3418 goto top 3419 } 3420 3421 // Before we drop our P, make a snapshot of the allp slice, 3422 // which can change underfoot once we no longer block 3423 // safe-points. We don't need to snapshot the contents because 3424 // everything up to cap(allp) is immutable. 3425 allpSnapshot := allp 3426 // Also snapshot masks. Value changes are OK, but we can't allow 3427 // len to change out from under us. 3428 idlepMaskSnapshot := idlepMask 3429 timerpMaskSnapshot := timerpMask 3430 3431 // return P and block 3432 lock(&sched.lock) 3433 if sched.gcwaiting.Load() || pp.runSafePointFn != 0 { 3434 unlock(&sched.lock) 3435 goto top 3436 } 3437 if sched.runqsize != 0 { 3438 gp := globrunqget(pp, 0) 3439 unlock(&sched.lock) 3440 return gp, false, false 3441 } 3442 if !mp.spinning && sched.needspinning.Load() == 1 { 3443 // See "Delicate dance" comment below. 
3444 mp.becomeSpinning() 3445 unlock(&sched.lock) 3446 goto top 3447 } 3448 if releasep() != pp { 3449 throw("findrunnable: wrong p") 3450 } 3451 now = pidleput(pp, now) 3452 unlock(&sched.lock) 3453 3454 // Delicate dance: thread transitions from spinning to non-spinning 3455 // state, potentially concurrently with submission of new work. We must 3456 // drop nmspinning first and then check all sources again (with 3457 // #StoreLoad memory barrier in between). If we do it the other way 3458 // around, another thread can submit work after we've checked all 3459 // sources but before we drop nmspinning; as a result nobody will 3460 // unpark a thread to run the work. 3461 // 3462 // This applies to the following sources of work: 3463 // 3464 // * Goroutines added to the global or a per-P run queue. 3465 // * New/modified-earlier timers on a per-P timer heap. 3466 // * Idle-priority GC work (barring golang.org/issue/19112). 3467 // 3468 // If we discover new work below, we need to restore m.spinning as a 3469 // signal for resetspinning to unpark a new worker thread (because 3470 // there can be more than one starving goroutine). 3471 // 3472 // However, if after discovering new work we also observe no idle Ps 3473 // (either here or in resetspinning), we have a problem. We may be 3474 // racing with a non-spinning M in the block above, having found no 3475 // work and preparing to release its P and park. Allowing that P to go 3476 // idle will result in loss of work conservation (idle P while there is 3477 // runnable work). This could result in complete deadlock in the 3478 // unlikely event that we discover new work (from netpoll) right as we 3479 // are racing with _all_ other Ps going idle. 3480 // 3481 // We use sched.needspinning to synchronize with non-spinning Ms going 3482 // idle. If needspinning is set when they are about to drop their P, 3483 // they abort the drop and instead become a new spinning M on our 3484 // behalf. If we are not racing and the system is truly fully loaded 3485 // then no spinning threads are required, and the next thread to 3486 // naturally become spinning will clear the flag. 3487 // 3488 // Also see "Worker thread parking/unparking" comment at the top of the 3489 // file. 3490 wasSpinning := mp.spinning 3491 if mp.spinning { 3492 mp.spinning = false 3493 if sched.nmspinning.Add(-1) < 0 { 3494 throw("findrunnable: negative nmspinning") 3495 } 3496 3497 // Note the for correctness, only the last M transitioning from 3498 // spinning to non-spinning must perform these rechecks to 3499 // ensure no missed work. However, the runtime has some cases 3500 // of transient increments of nmspinning that are decremented 3501 // without going through this path, so we must be conservative 3502 // and perform the check on all spinning Ms. 3503 // 3504 // See https://go.dev/issue/43997. 3505 3506 // Check global and P runqueues again. 3507 3508 lock(&sched.lock) 3509 if sched.runqsize != 0 { 3510 pp, _ := pidlegetSpinning(0) 3511 if pp != nil { 3512 gp := globrunqget(pp, 0) 3513 if gp == nil { 3514 throw("global runq empty with non-zero runqsize") 3515 } 3516 unlock(&sched.lock) 3517 acquirep(pp) 3518 mp.becomeSpinning() 3519 return gp, false, false 3520 } 3521 } 3522 unlock(&sched.lock) 3523 3524 pp := checkRunqsNoP(allpSnapshot, idlepMaskSnapshot) 3525 if pp != nil { 3526 acquirep(pp) 3527 mp.becomeSpinning() 3528 goto top 3529 } 3530 3531 // Check for idle-priority GC work again. 
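// The ordering rule above (publish work, then check for sleepers; announce
// sleeping, then re-check for work) is not specific to the runtime. A
// standalone, single-producer/single-consumer sketch of the same pattern,
// where push, pop, sleeping, and wake are illustrative names and a buffered
// channel stands in for the runtime's thread park/unpark:
//
//	package main
//
//	import (
//		"fmt"
//		"sync"
//		"sync/atomic"
//	)
//
//	var (
//		mu       sync.Mutex
//		queue    []int
//		sleeping atomic.Int32
//		wake     = make(chan struct{}, 1)
//	)
//
//	func push(v int) {
//		mu.Lock()
//		queue = append(queue, v)
//		mu.Unlock()
//		if sleeping.Load() == 1 { // check sleepers only after publishing
//			select {
//			case wake <- struct{}{}:
//			default:
//			}
//		}
//	}
//
//	func pop() (int, bool) {
//		mu.Lock()
//		defer mu.Unlock()
//		if len(queue) == 0 {
//			return 0, false
//		}
//		v := queue[0]
//		queue = queue[1:]
//		return v, true
//	}
//
//	func main() {
//		const n = 5
//		done := make(chan struct{})
//		go func() { // consumer
//			defer close(done)
//			for got := 0; got < n; {
//				if v, ok := pop(); ok {
//					fmt.Println("got", v)
//					got++
//					continue
//				}
//				sleeping.Store(1)
//				if v, ok := pop(); ok { // re-check only after announcing
//					fmt.Println("got", v)
//					got++
//				} else {
//					<-wake // park; spurious wakeups are harmless
//				}
//				sleeping.Store(0)
//			}
//		}()
//		for i := 0; i < n; i++ { // producer
//			push(i)
//		}
//		<-done
//	}
//
// Either the producer observes sleeping==1 and sends a wakeup, or the
// consumer's re-check observes the freshly pushed item; doing the checks
// before the updates is exactly the lost-wakeup race described above.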
3532 pp, gp := checkIdleGCNoP() 3533 if pp != nil { 3534 acquirep(pp) 3535 mp.becomeSpinning() 3536 3537 // Run the idle worker. 3538 pp.gcMarkWorkerMode = gcMarkWorkerIdleMode 3539 trace := traceAcquire() 3540 casgstatus(gp, _Gwaiting, _Grunnable) 3541 if trace.ok() { 3542 trace.GoUnpark(gp, 0) 3543 traceRelease(trace) 3544 } 3545 return gp, false, false 3546 } 3547 3548 // Finally, check for timer creation or expiry concurrently with 3549 // transitioning from spinning to non-spinning. 3550 // 3551 // Note that we cannot use checkTimers here because it calls 3552 // adjusttimers which may need to allocate memory, and that isn't 3553 // allowed when we don't have an active P. 3554 pollUntil = checkTimersNoP(allpSnapshot, timerpMaskSnapshot, pollUntil) 3555 } 3556 3557 // Poll network until next timer. 3558 if netpollinited() && (netpollAnyWaiters() || pollUntil != 0) && sched.lastpoll.Swap(0) != 0 { 3559 sched.pollUntil.Store(pollUntil) 3560 if mp.p != 0 { 3561 throw("findrunnable: netpoll with p") 3562 } 3563 if mp.spinning { 3564 throw("findrunnable: netpoll with spinning") 3565 } 3566 delay := int64(-1) 3567 if pollUntil != 0 { 3568 if now == 0 { 3569 now = nanotime() 3570 } 3571 delay = pollUntil - now 3572 if delay < 0 { 3573 delay = 0 3574 } 3575 } 3576 if faketime != 0 { 3577 // When using fake time, just poll. 3578 delay = 0 3579 } 3580 list, delta := netpoll(delay) // block until new work is available 3581 // Refresh now again, after potentially blocking. 3582 now = nanotime() 3583 sched.pollUntil.Store(0) 3584 sched.lastpoll.Store(now) 3585 if faketime != 0 && list.empty() { 3586 // Using fake time and nothing is ready; stop M. 3587 // When all M's stop, checkdead will call timejump. 3588 stopm() 3589 goto top 3590 } 3591 lock(&sched.lock) 3592 pp, _ := pidleget(now) 3593 unlock(&sched.lock) 3594 if pp == nil { 3595 injectglist(&list) 3596 netpollAdjustWaiters(delta) 3597 } else { 3598 acquirep(pp) 3599 if !list.empty() { 3600 gp := list.pop() 3601 injectglist(&list) 3602 netpollAdjustWaiters(delta) 3603 trace := traceAcquire() 3604 casgstatus(gp, _Gwaiting, _Grunnable) 3605 if trace.ok() { 3606 trace.GoUnpark(gp, 0) 3607 traceRelease(trace) 3608 } 3609 return gp, false, false 3610 } 3611 if wasSpinning { 3612 mp.becomeSpinning() 3613 } 3614 goto top 3615 } 3616 } else if pollUntil != 0 && netpollinited() { 3617 pollerPollUntil := sched.pollUntil.Load() 3618 if pollerPollUntil == 0 || pollerPollUntil > pollUntil { 3619 netpollBreak() 3620 } 3621 } 3622 stopm() 3623 goto top 3624} 3625 3626// pollWork reports whether there is non-background work this P could 3627// be doing. This is a fairly lightweight check to be used for 3628// background work loops, like idle GC. It checks a subset of the 3629// conditions checked by the actual scheduler. 3630func pollWork() bool { 3631 if sched.runqsize != 0 { 3632 return true 3633 } 3634 p := getg().m.p.ptr() 3635 if !runqempty(p) { 3636 return true 3637 } 3638 if netpollinited() && netpollAnyWaiters() && sched.lastpoll.Load() != 0 { 3639 if list, delta := netpoll(0); !list.empty() { 3640 injectglist(&list) 3641 netpollAdjustWaiters(delta) 3642 return true 3643 } 3644 } 3645 return false 3646} 3647 3648// stealWork attempts to steal a runnable goroutine or timer from any P. 3649// 3650// If newWork is true, new work may have been readied. 3651// 3652// If now is not 0 it is the current time. stealWork returns the passed time or 3653// the current time if now was passed as 0. 
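// stealWork visits the other Ps in a pseudo-random order produced by
// stealOrder. The underlying idea, enumerating 0..n-1 from a random start
// with a step coprime to n so that every P is visited exactly once, can be
// sketched independently of the runtime's randomOrder type (names here are
// illustrative):
//
//	package main
//
//	import "fmt"
//
//	func gcd(a, b int) int {
//		for b != 0 {
//			a, b = b, a%b
//		}
//		return a
//	}
//
//	// enumerate visits each of 0..n-1 exactly once, starting at start and
//	// advancing by step; step must be coprime with n for full coverage.
//	func enumerate(n, start, step int) []int {
//		order := make([]int, 0, n)
//		for k, i := 0, start%n; k < n; k, i = k+1, (i+step)%n {
//			order = append(order, i)
//		}
//		return order
//	}
//
//	func main() {
//		n := 8
//		step := 3
//		for gcd(step, n) != 1 {
//			step++
//		}
//		fmt.Println(enumerate(n, 5, step)) // [5 0 3 6 1 4 7 2]
//	}
//
// Randomizing both the start and the step keeps concurrent stealers from
// converging on the same victim P in the same order.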
3654func stealWork(now int64) (gp *g, inheritTime bool, rnow, pollUntil int64, newWork bool) { 3655 pp := getg().m.p.ptr() 3656 3657 ranTimer := false 3658 3659 const stealTries = 4 3660 for i := 0; i < stealTries; i++ { 3661 stealTimersOrRunNextG := i == stealTries-1 3662 3663 for enum := stealOrder.start(cheaprand()); !enum.done(); enum.next() { 3664 if sched.gcwaiting.Load() { 3665 // GC work may be available. 3666 return nil, false, now, pollUntil, true 3667 } 3668 p2 := allp[enum.position()] 3669 if pp == p2 { 3670 continue 3671 } 3672 3673 // Steal timers from p2. This call to checkTimers is the only place 3674 // where we might hold a lock on a different P's timers. We do this 3675 // once on the last pass before checking runnext because stealing 3676 // from the other P's runnext should be the last resort, so if there 3677 // are timers to steal do that first. 3678 // 3679 // We only check timers on one of the stealing iterations because 3680 // the time stored in now doesn't change in this loop and checking 3681 // the timers for each P more than once with the same value of now 3682 // is probably a waste of time. 3683 // 3684 // timerpMask tells us whether the P may have timers at all. If it 3685 // can't, no need to check at all. 3686 if stealTimersOrRunNextG && timerpMask.read(enum.position()) { 3687 tnow, w, ran := p2.timers.check(now) 3688 now = tnow 3689 if w != 0 && (pollUntil == 0 || w < pollUntil) { 3690 pollUntil = w 3691 } 3692 if ran { 3693 // Running the timers may have 3694 // made an arbitrary number of G's 3695 // ready and added them to this P's 3696 // local run queue. That invalidates 3697 // the assumption of runqsteal 3698 // that it always has room to add 3699 // stolen G's. So check now if there 3700 // is a local G to run. 3701 if gp, inheritTime := runqget(pp); gp != nil { 3702 return gp, inheritTime, now, pollUntil, ranTimer 3703 } 3704 ranTimer = true 3705 } 3706 } 3707 3708 // Don't bother to attempt to steal if p2 is idle. 3709 if !idlepMask.read(enum.position()) { 3710 if gp := runqsteal(pp, p2, stealTimersOrRunNextG); gp != nil { 3711 return gp, false, now, pollUntil, ranTimer 3712 } 3713 } 3714 } 3715 } 3716 3717 // No goroutines found to steal. Regardless, running a timer may have 3718 // made some goroutine ready that we missed. Indicate the next timer to 3719 // wait for. 3720 return nil, false, now, pollUntil, ranTimer 3721} 3722 3723// Check all Ps for a runnable G to steal. 3724// 3725// On entry we have no P. If a G is available to steal and a P is available, 3726// the P is returned which the caller should acquire and attempt to steal the 3727// work to. 3728func checkRunqsNoP(allpSnapshot []*p, idlepMaskSnapshot pMask) *p { 3729 for id, p2 := range allpSnapshot { 3730 if !idlepMaskSnapshot.read(uint32(id)) && !runqempty(p2) { 3731 lock(&sched.lock) 3732 pp, _ := pidlegetSpinning(0) 3733 if pp == nil { 3734 // Can't get a P, don't bother checking remaining Ps. 3735 unlock(&sched.lock) 3736 return nil 3737 } 3738 unlock(&sched.lock) 3739 return pp 3740 } 3741 } 3742 3743 // No work available. 3744 return nil 3745} 3746 3747// Check all Ps for a timer expiring sooner than pollUntil. 3748// 3749// Returns updated pollUntil value. 
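// The pollUntil convention used here and in stealWork is: 0 means "no timer",
// and otherwise the value is the earliest wake-up time seen so far. A small
// standalone sketch of that merge rule (mergeWake is an illustrative name):
//
//	package main
//
//	import "fmt"
//
//	func mergeWake(pollUntil int64, wakeTimes ...int64) int64 {
//		for _, w := range wakeTimes {
//			if w != 0 && (pollUntil == 0 || w < pollUntil) {
//				pollUntil = w
//			}
//		}
//		return pollUntil
//	}
//
//	func main() {
//		fmt.Println(mergeWake(0, 0, 500, 300, 0, 800)) // 300
//	}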
3750func checkTimersNoP(allpSnapshot []*p, timerpMaskSnapshot pMask, pollUntil int64) int64 { 3751 for id, p2 := range allpSnapshot { 3752 if timerpMaskSnapshot.read(uint32(id)) { 3753 w := p2.timers.wakeTime() 3754 if w != 0 && (pollUntil == 0 || w < pollUntil) { 3755 pollUntil = w 3756 } 3757 } 3758 } 3759 3760 return pollUntil 3761} 3762 3763// Check for idle-priority GC, without a P on entry. 3764// 3765// If some GC work, a P, and a worker G are all available, the P and G will be 3766// returned. The returned P has not been wired yet. 3767func checkIdleGCNoP() (*p, *g) { 3768 // N.B. Since we have no P, gcBlackenEnabled may change at any time; we 3769 // must check again after acquiring a P. As an optimization, we also check 3770 // if an idle mark worker is needed at all. This is OK here, because if we 3771 // observe that one isn't needed, at least one is currently running. Even if 3772 // it stops running, its own journey into the scheduler should schedule it 3773 // again, if need be (at which point, this check will pass, if relevant). 3774 if atomic.Load(&gcBlackenEnabled) == 0 || !gcController.needIdleMarkWorker() { 3775 return nil, nil 3776 } 3777 if !gcMarkWorkAvailable(nil) { 3778 return nil, nil 3779 } 3780 3781 // Work is available; we can start an idle GC worker only if there is 3782 // an available P and available worker G. 3783 // 3784 // We can attempt to acquire these in either order, though both have 3785 // synchronization concerns (see below). Workers are almost always 3786 // available (see comment in findRunnableGCWorker for the one case 3787 // there may be none). Since we're slightly less likely to find a P, 3788 // check for that first. 3789 // 3790 // Synchronization: note that we must hold sched.lock until we are 3791 // committed to keeping it. Otherwise we cannot put the unnecessary P 3792 // back in sched.pidle without performing the full set of idle 3793 // transition checks. 3794 // 3795 // If we were to check gcBgMarkWorkerPool first, we must somehow handle 3796 // the assumption in gcControllerState.findRunnableGCWorker that an 3797 // empty gcBgMarkWorkerPool is only possible if gcMarkDone is running. 3798 lock(&sched.lock) 3799 pp, now := pidlegetSpinning(0) 3800 if pp == nil { 3801 unlock(&sched.lock) 3802 return nil, nil 3803 } 3804 3805 // Now that we own a P, gcBlackenEnabled can't change (as it requires STW). 3806 if gcBlackenEnabled == 0 || !gcController.addIdleMarkWorker() { 3807 pidleput(pp, now) 3808 unlock(&sched.lock) 3809 return nil, nil 3810 } 3811 3812 node := (*gcBgMarkWorkerNode)(gcBgMarkWorkerPool.pop()) 3813 if node == nil { 3814 pidleput(pp, now) 3815 unlock(&sched.lock) 3816 gcController.removeIdleMarkWorker() 3817 return nil, nil 3818 } 3819 3820 unlock(&sched.lock) 3821 3822 return pp, node.gp.ptr() 3823} 3824 3825// wakeNetPoller wakes up the thread sleeping in the network poller if it isn't 3826// going to wake up before the when argument; or it wakes an idle P to service 3827// timers and the network poller if there isn't one already. 3828func wakeNetPoller(when int64) { 3829 if sched.lastpoll.Load() == 0 { 3830 // In findrunnable we ensure that when polling the pollUntil 3831 // field is either zero or the time to which the current 3832 // poll is expected to run. This can have a spurious wakeup 3833 // but should never miss a wakeup. 
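// At the user level this is what keeps a timer that is created later, but
// fires earlier, from being delayed behind a thread already sleeping in
// netpoll with a longer timeout. An illustrative program (durations are
// arbitrary):
//
//	package main
//
//	import (
//		"fmt"
//		"time"
//	)
//
//	func main() {
//		long := time.After(500 * time.Millisecond)
//		short := time.After(50 * time.Millisecond) // created later, fires earlier
//		start := time.Now()
//		select {
//		case <-short:
//			fmt.Println("short timer fired after", time.Since(start).Round(10*time.Millisecond))
//		case <-long:
//			fmt.Println("long timer fired first (unexpected)")
//		}
//	}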
3834 pollerPollUntil := sched.pollUntil.Load() 3835 if pollerPollUntil == 0 || pollerPollUntil > when { 3836 netpollBreak() 3837 } 3838 } else { 3839 // There are no threads in the network poller, try to get 3840 // one there so it can handle new timers. 3841 if GOOS != "plan9" { // Temporary workaround - see issue #42303. 3842 wakep() 3843 } 3844 } 3845} 3846 3847func resetspinning() { 3848 gp := getg() 3849 if !gp.m.spinning { 3850 throw("resetspinning: not a spinning m") 3851 } 3852 gp.m.spinning = false 3853 nmspinning := sched.nmspinning.Add(-1) 3854 if nmspinning < 0 { 3855 throw("findrunnable: negative nmspinning") 3856 } 3857 // M wakeup policy is deliberately somewhat conservative, so check if we 3858 // need to wakeup another P here. See "Worker thread parking/unparking" 3859 // comment at the top of the file for details. 3860 wakep() 3861} 3862 3863// injectglist adds each runnable G on the list to some run queue, 3864// and clears glist. If there is no current P, they are added to the 3865// global queue, and up to npidle M's are started to run them. 3866// Otherwise, for each idle P, this adds a G to the global queue 3867// and starts an M. Any remaining G's are added to the current P's 3868// local run queue. 3869// This may temporarily acquire sched.lock. 3870// Can run concurrently with GC. 3871func injectglist(glist *gList) { 3872 if glist.empty() { 3873 return 3874 } 3875 trace := traceAcquire() 3876 if trace.ok() { 3877 for gp := glist.head.ptr(); gp != nil; gp = gp.schedlink.ptr() { 3878 trace.GoUnpark(gp, 0) 3879 } 3880 traceRelease(trace) 3881 } 3882 3883 // Mark all the goroutines as runnable before we put them 3884 // on the run queues. 3885 head := glist.head.ptr() 3886 var tail *g 3887 qsize := 0 3888 for gp := head; gp != nil; gp = gp.schedlink.ptr() { 3889 tail = gp 3890 qsize++ 3891 casgstatus(gp, _Gwaiting, _Grunnable) 3892 } 3893 3894 // Turn the gList into a gQueue. 3895 var q gQueue 3896 q.head.set(head) 3897 q.tail.set(tail) 3898 *glist = gList{} 3899 3900 startIdle := func(n int) { 3901 for i := 0; i < n; i++ { 3902 mp := acquirem() // See comment in startm. 3903 lock(&sched.lock) 3904 3905 pp, _ := pidlegetSpinning(0) 3906 if pp == nil { 3907 unlock(&sched.lock) 3908 releasem(mp) 3909 break 3910 } 3911 3912 startm(pp, false, true) 3913 unlock(&sched.lock) 3914 releasem(mp) 3915 } 3916 } 3917 3918 pp := getg().m.p.ptr() 3919 if pp == nil { 3920 lock(&sched.lock) 3921 globrunqputbatch(&q, int32(qsize)) 3922 unlock(&sched.lock) 3923 startIdle(qsize) 3924 return 3925 } 3926 3927 npidle := int(sched.npidle.Load()) 3928 var ( 3929 globq gQueue 3930 n int 3931 ) 3932 for n = 0; n < npidle && !q.empty(); n++ { 3933 g := q.pop() 3934 globq.pushBack(g) 3935 } 3936 if n > 0 { 3937 lock(&sched.lock) 3938 globrunqputbatch(&globq, int32(n)) 3939 unlock(&sched.lock) 3940 startIdle(n) 3941 qsize -= n 3942 } 3943 3944 if !q.empty() { 3945 runqputbatch(pp, &q, qsize) 3946 } 3947 3948 // Some P's might have become idle after we loaded `sched.npidle` 3949 // but before any goroutines were added to the queue, which could 3950 // lead to idle P's when there is work available in the global queue. 3951 // That could potentially last until other goroutines become ready 3952 // to run. 
That said, we need to find a way to hedge 3953 // 3954 // Calling wakep() here is the best bet, it will do nothing in the 3955 // common case (no racing on `sched.npidle`), while it could wake one 3956 // more P to execute G's, which might end up with >1 P's: the first one 3957 // wakes another P and so forth until there is no more work, but this 3958 // ought to be an extremely rare case. 3959 // 3960 // Also see "Worker thread parking/unparking" comment at the top of the file for details. 3961 wakep() 3962} 3963 3964// One round of scheduler: find a runnable goroutine and execute it. 3965// Never returns. 3966func schedule() { 3967 mp := getg().m 3968 3969 if mp.locks != 0 { 3970 throw("schedule: holding locks") 3971 } 3972 3973 if mp.lockedg != 0 { 3974 stoplockedm() 3975 execute(mp.lockedg.ptr(), false) // Never returns. 3976 } 3977 3978 // We should not schedule away from a g that is executing a cgo call, 3979 // since the cgo call is using the m's g0 stack. 3980 if mp.incgo { 3981 throw("schedule: in cgo") 3982 } 3983 3984top: 3985 pp := mp.p.ptr() 3986 pp.preempt = false 3987 3988 // Safety check: if we are spinning, the run queue should be empty. 3989 // Check this before calling checkTimers, as that might call 3990 // goready to put a ready goroutine on the local run queue. 3991 if mp.spinning && (pp.runnext != 0 || pp.runqhead != pp.runqtail) { 3992 throw("schedule: spinning with local work") 3993 } 3994 3995 gp, inheritTime, tryWakeP := findRunnable() // blocks until work is available 3996 3997 if debug.dontfreezetheworld > 0 && freezing.Load() { 3998 // See comment in freezetheworld. We don't want to perturb 3999 // scheduler state, so we didn't gcstopm in findRunnable, but 4000 // also don't want to allow new goroutines to run. 4001 // 4002 // Deadlock here rather than in the findRunnable loop so if 4003 // findRunnable is stuck in a loop we don't perturb that 4004 // either. 4005 lock(&deadlock) 4006 lock(&deadlock) 4007 } 4008 4009 // This thread is going to run a goroutine and is not spinning anymore, 4010 // so if it was marked as spinning we need to reset it now and potentially 4011 // start a new spinning M. 4012 if mp.spinning { 4013 resetspinning() 4014 } 4015 4016 if sched.disable.user && !schedEnabled(gp) { 4017 // Scheduling of this goroutine is disabled. Put it on 4018 // the list of pending runnable goroutines for when we 4019 // re-enable user scheduling and look again. 4020 lock(&sched.lock) 4021 if schedEnabled(gp) { 4022 // Something re-enabled scheduling while we 4023 // were acquiring the lock. 4024 unlock(&sched.lock) 4025 } else { 4026 sched.disable.runnable.pushBack(gp) 4027 sched.disable.n++ 4028 unlock(&sched.lock) 4029 goto top 4030 } 4031 } 4032 4033 // If about to schedule a not-normal goroutine (a GCworker or tracereader), 4034 // wake a P if there is one. 4035 if tryWakeP { 4036 wakep() 4037 } 4038 if gp.lockedm != 0 { 4039 // Hands off own p to the locked m, 4040 // then blocks waiting for a new p. 4041 startlockedm(gp) 4042 goto top 4043 } 4044 4045 execute(gp, inheritTime) 4046} 4047 4048// dropg removes the association between m and the current goroutine m->curg (gp for short). 4049// Typically a caller sets gp's status away from Grunning and then 4050// immediately calls dropg to finish the job. The caller is also responsible 4051// for arranging that gp will be restarted using ready at an 4052// appropriate time. 
After calling dropg and arranging for gp to be 4053// readied later, the caller can do other work but eventually should 4054// call schedule to restart the scheduling of goroutines on this m. 4055func dropg() { 4056 gp := getg() 4057 4058 setMNoWB(&gp.m.curg.m, nil) 4059 setGNoWB(&gp.m.curg, nil) 4060} 4061 4062func parkunlock_c(gp *g, lock unsafe.Pointer) bool { 4063 unlock((*mutex)(lock)) 4064 return true 4065} 4066 4067// park continuation on g0. 4068func park_m(gp *g) { 4069 mp := getg().m 4070 4071 trace := traceAcquire() 4072 4073 if trace.ok() { 4074 // Trace the event before the transition. It may take a 4075 // stack trace, but we won't own the stack after the 4076 // transition anymore. 4077 trace.GoPark(mp.waitTraceBlockReason, mp.waitTraceSkip) 4078 } 4079 // N.B. Not using casGToWaiting here because the waitreason is 4080 // set by park_m's caller. 4081 casgstatus(gp, _Grunning, _Gwaiting) 4082 if trace.ok() { 4083 traceRelease(trace) 4084 } 4085 4086 dropg() 4087 4088 if fn := mp.waitunlockf; fn != nil { 4089 ok := fn(gp, mp.waitlock) 4090 mp.waitunlockf = nil 4091 mp.waitlock = nil 4092 if !ok { 4093 trace := traceAcquire() 4094 casgstatus(gp, _Gwaiting, _Grunnable) 4095 if trace.ok() { 4096 trace.GoUnpark(gp, 2) 4097 traceRelease(trace) 4098 } 4099 execute(gp, true) // Schedule it back, never returns. 4100 } 4101 } 4102 schedule() 4103} 4104 4105func goschedImpl(gp *g, preempted bool) { 4106 trace := traceAcquire() 4107 status := readgstatus(gp) 4108 if status&^_Gscan != _Grunning { 4109 dumpgstatus(gp) 4110 throw("bad g status") 4111 } 4112 if trace.ok() { 4113 // Trace the event before the transition. It may take a 4114 // stack trace, but we won't own the stack after the 4115 // transition anymore. 4116 if preempted { 4117 trace.GoPreempt() 4118 } else { 4119 trace.GoSched() 4120 } 4121 } 4122 casgstatus(gp, _Grunning, _Grunnable) 4123 if trace.ok() { 4124 traceRelease(trace) 4125 } 4126 4127 dropg() 4128 lock(&sched.lock) 4129 globrunqput(gp) 4130 unlock(&sched.lock) 4131 4132 if mainStarted { 4133 wakep() 4134 } 4135 4136 schedule() 4137} 4138 4139// Gosched continuation on g0. 4140func gosched_m(gp *g) { 4141 goschedImpl(gp, false) 4142} 4143 4144// goschedguarded is a forbidden-states-avoided version of gosched_m. 4145func goschedguarded_m(gp *g) { 4146 if !canPreemptM(gp.m) { 4147 gogo(&gp.sched) // never return 4148 } 4149 goschedImpl(gp, false) 4150} 4151 4152func gopreempt_m(gp *g) { 4153 goschedImpl(gp, true) 4154} 4155 4156// preemptPark parks gp and puts it in _Gpreempted. 4157// 4158//go:systemstack 4159func preemptPark(gp *g) { 4160 status := readgstatus(gp) 4161 if status&^_Gscan != _Grunning { 4162 dumpgstatus(gp) 4163 throw("bad g status") 4164 } 4165 4166 if gp.asyncSafePoint { 4167 // Double-check that async preemption does not 4168 // happen in SPWRITE assembly functions. 4169 // isAsyncSafePoint must exclude this case. 4170 f := findfunc(gp.sched.pc) 4171 if !f.valid() { 4172 throw("preempt at unknown pc") 4173 } 4174 if f.flag&abi.FuncFlagSPWrite != 0 { 4175 println("runtime: unexpected SPWRITE function", funcname(f), "in async preempt") 4176 throw("preempt SPWRITE") 4177 } 4178 } 4179 4180 // Transition from _Grunning to _Gscan|_Gpreempted. We can't 4181 // be in _Grunning when we dropg because then we'd be running 4182 // without an M, but the moment we're in _Gpreempted, 4183 // something could claim this G before we've fully cleaned it 4184 // up. Hence, we set the scan bit to lock down further 4185 // transitions until we can dropg. 
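// As an aside, the goschedImpl path above is what runtime.Gosched reaches
// from user code. A minimal illustration; GOMAXPROCS(1) is used only to make
// the effect of the yield easy to observe:
//
//	package main
//
//	import (
//		"fmt"
//		"runtime"
//	)
//
//	func main() {
//		runtime.GOMAXPROCS(1)
//		done := make(chan struct{})
//		go func() {
//			fmt.Println("background goroutine ran")
//			close(done)
//		}()
//		// Gosched parks main on the global run queue (globrunqput in
//		// goschedImpl) and runs the scheduler, which typically picks up the
//		// goroutine created above before returning to main.
//		runtime.Gosched()
//		fmt.Println("main continues")
//		<-done
//	}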
4186 casGToPreemptScan(gp, _Grunning, _Gscan|_Gpreempted) 4187 dropg() 4188 4189 // Be careful about how we trace this next event. The ordering 4190 // is subtle. 4191 // 4192 // The moment we CAS into _Gpreempted, suspendG could CAS to 4193 // _Gwaiting, do its work, and ready the goroutine. All of 4194 // this could happen before we even get the chance to emit 4195 // an event. The end result is that the events could appear 4196 // out of order, and the tracer generally assumes the scheduler 4197 // takes care of the ordering between GoPark and GoUnpark. 4198 // 4199 // The answer here is simple: emit the event while we still hold 4200 // the _Gscan bit on the goroutine. We still need to traceAcquire 4201 // and traceRelease across the CAS because the tracer could be 4202 // what's calling suspendG in the first place, and we want the 4203 // CAS and event emission to appear atomic to the tracer. 4204 trace := traceAcquire() 4205 if trace.ok() { 4206 trace.GoPark(traceBlockPreempted, 0) 4207 } 4208 casfrom_Gscanstatus(gp, _Gscan|_Gpreempted, _Gpreempted) 4209 if trace.ok() { 4210 traceRelease(trace) 4211 } 4212 schedule() 4213} 4214 4215// goyield is like Gosched, but it: 4216// - emits a GoPreempt trace event instead of a GoSched trace event 4217// - puts the current G on the runq of the current P instead of the globrunq 4218// 4219// goyield should be an internal detail, 4220// but widely used packages access it using linkname. 4221// Notable members of the hall of shame include: 4222// - gvisor.dev/gvisor 4223// - github.com/sagernet/gvisor 4224// 4225// Do not remove or change the type signature. 4226// See go.dev/issue/67401. 4227// 4228//go:linkname goyield 4229func goyield() { 4230 checkTimeouts() 4231 mcall(goyield_m) 4232} 4233 4234func goyield_m(gp *g) { 4235 trace := traceAcquire() 4236 pp := gp.m.p.ptr() 4237 if trace.ok() { 4238 // Trace the event before the transition. It may take a 4239 // stack trace, but we won't own the stack after the 4240 // transition anymore. 4241 trace.GoPreempt() 4242 } 4243 casgstatus(gp, _Grunning, _Grunnable) 4244 if trace.ok() { 4245 traceRelease(trace) 4246 } 4247 dropg() 4248 runqput(pp, gp, false) 4249 schedule() 4250} 4251 4252// Finishes execution of the current goroutine. 4253func goexit1() { 4254 if raceenabled { 4255 racegoend() 4256 } 4257 trace := traceAcquire() 4258 if trace.ok() { 4259 trace.GoEnd() 4260 traceRelease(trace) 4261 } 4262 mcall(goexit0) 4263} 4264 4265// goexit continuation on g0. 4266func goexit0(gp *g) { 4267 gdestroy(gp) 4268 schedule() 4269} 4270 4271func gdestroy(gp *g) { 4272 mp := getg().m 4273 pp := mp.p.ptr() 4274 4275 casgstatus(gp, _Grunning, _Gdead) 4276 gcController.addScannableStack(pp, -int64(gp.stack.hi-gp.stack.lo)) 4277 if isSystemGoroutine(gp, false) { 4278 sched.ngsys.Add(-1) 4279 } 4280 gp.m = nil 4281 locked := gp.lockedm != 0 4282 gp.lockedm = 0 4283 mp.lockedg = 0 4284 gp.preemptStop = false 4285 gp.paniconfault = false 4286 gp._defer = nil // should be true already but just in case. 4287 gp._panic = nil // non-nil for Goexit during panic. points at stack-allocated data. 4288 gp.writebuf = nil 4289 gp.waitreason = waitReasonZero 4290 gp.param = nil 4291 gp.labels = nil 4292 gp.timer = nil 4293 4294 if gcBlackenEnabled != 0 && gp.gcAssistBytes > 0 { 4295 // Flush assist credit to the global pool. This gives 4296 // better information to pacing if the application is 4297 // rapidly creating an exiting goroutines. 
4298 assistWorkPerByte := gcController.assistWorkPerByte.Load() 4299 scanCredit := int64(assistWorkPerByte * float64(gp.gcAssistBytes)) 4300 gcController.bgScanCredit.Add(scanCredit) 4301 gp.gcAssistBytes = 0 4302 } 4303 4304 dropg() 4305 4306 if GOARCH == "wasm" { // no threads yet on wasm 4307 gfput(pp, gp) 4308 return 4309 } 4310 4311 if locked && mp.lockedInt != 0 { 4312 print("runtime: mp.lockedInt = ", mp.lockedInt, "\n") 4313 throw("exited a goroutine internally locked to the OS thread") 4314 } 4315 gfput(pp, gp) 4316 if locked { 4317 // The goroutine may have locked this thread because 4318 // it put it in an unusual kernel state. Kill it 4319 // rather than returning it to the thread pool. 4320 4321 // Return to mstart, which will release the P and exit 4322 // the thread. 4323 if GOOS != "plan9" { // See golang.org/issue/22227. 4324 gogo(&mp.g0.sched) 4325 } else { 4326 // Clear lockedExt on plan9 since we may end up re-using 4327 // this thread. 4328 mp.lockedExt = 0 4329 } 4330 } 4331} 4332 4333// save updates getg().sched to refer to pc and sp so that a following 4334// gogo will restore pc and sp. 4335// 4336// save must not have write barriers because invoking a write barrier 4337// can clobber getg().sched. 4338// 4339//go:nosplit 4340//go:nowritebarrierrec 4341func save(pc, sp, bp uintptr) { 4342 gp := getg() 4343 4344 if gp == gp.m.g0 || gp == gp.m.gsignal { 4345 // m.g0.sched is special and must describe the context 4346 // for exiting the thread. mstart1 writes to it directly. 4347 // m.gsignal.sched should not be used at all. 4348 // This check makes sure save calls do not accidentally 4349 // run in contexts where they'd write to system g's. 4350 throw("save on system g not allowed") 4351 } 4352 4353 gp.sched.pc = pc 4354 gp.sched.sp = sp 4355 gp.sched.lr = 0 4356 gp.sched.ret = 0 4357 gp.sched.bp = bp 4358 // We need to ensure ctxt is zero, but can't have a write 4359 // barrier here. However, it should always already be zero. 4360 // Assert that. 4361 if gp.sched.ctxt != nil { 4362 badctxt() 4363 } 4364} 4365 4366// The goroutine g is about to enter a system call. 4367// Record that it's not using the cpu anymore. 4368// This is called only from the go syscall library and cgocall, 4369// not from the low-level system calls used by the runtime. 4370// 4371// Entersyscall cannot split the stack: the save must 4372// make g->sched refer to the caller's stack segment, because 4373// entersyscall is going to return immediately after. 4374// 4375// Nothing entersyscall calls can split the stack either. 4376// We cannot safely move the stack during an active call to syscall, 4377// because we do not know which of the uintptr arguments are 4378// really pointers (back into the stack). 4379// In practice, this means that we make the fast path run through 4380// entersyscall doing no-split things, and the slow path has to use systemstack 4381// to run bigger things on the system stack. 4382// 4383// reentersyscall is the entry point used by cgo callbacks, where explicitly 4384// saved SP and PC are restored. This is needed when exitsyscall will be called 4385// from a function further up in the call stack than the parent, as g->syscallsp 4386// must always point to a valid stack frame. entersyscall below is the normal 4387// entry point for syscalls, which obtains the SP and PC from the caller. 
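// At the user level, the entersyscall/exitsyscall pair is what a direct
// syscall-package call goes through. An illustrative, Unix-only sketch using
// a blocking pipe (I/O through os and net normally takes the netpoller path
// instead, so a raw syscall is used here on purpose):
//
//	//go:build unix
//
//	package main
//
//	import (
//		"fmt"
//		"syscall"
//		"time"
//	)
//
//	func main() {
//		var p [2]int
//		if err := syscall.Pipe(p[:]); err != nil {
//			panic(err)
//		}
//		go func() {
//			// Keeps running while main is blocked in read(2) below: main's
//			// P is left in _Psyscall and, if the call lasts long enough,
//			// sysmon retakes it so other goroutines are not starved.
//			time.Sleep(100 * time.Millisecond)
//			syscall.Write(p[1], []byte("hello"))
//		}()
//		buf := make([]byte, 8)
//		n, err := syscall.Read(p[0], buf) // enters a blocking system call
//		if err != nil {
//			panic(err)
//		}
//		fmt.Println(string(buf[:n]))
//	}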
4388// 4389//go:nosplit 4390func reentersyscall(pc, sp, bp uintptr) { 4391 trace := traceAcquire() 4392 gp := getg() 4393 4394 // Disable preemption because during this function g is in Gsyscall status, 4395 // but can have inconsistent g->sched, do not let GC observe it. 4396 gp.m.locks++ 4397 4398 // Entersyscall must not call any function that might split/grow the stack. 4399 // (See details in comment above.) 4400 // Catch calls that might, by replacing the stack guard with something that 4401 // will trip any stack check and leaving a flag to tell newstack to die. 4402 gp.stackguard0 = stackPreempt 4403 gp.throwsplit = true 4404 4405 // Leave SP around for GC and traceback. 4406 save(pc, sp, bp) 4407 gp.syscallsp = sp 4408 gp.syscallpc = pc 4409 gp.syscallbp = bp 4410 casgstatus(gp, _Grunning, _Gsyscall) 4411 if staticLockRanking { 4412 // When doing static lock ranking casgstatus can call 4413 // systemstack which clobbers g.sched. 4414 save(pc, sp, bp) 4415 } 4416 if gp.syscallsp < gp.stack.lo || gp.stack.hi < gp.syscallsp { 4417 systemstack(func() { 4418 print("entersyscall inconsistent sp ", hex(gp.syscallsp), " [", hex(gp.stack.lo), ",", hex(gp.stack.hi), "]\n") 4419 throw("entersyscall") 4420 }) 4421 } 4422 if gp.syscallbp != 0 && gp.syscallbp < gp.stack.lo || gp.stack.hi < gp.syscallbp { 4423 systemstack(func() { 4424 print("entersyscall inconsistent bp ", hex(gp.syscallbp), " [", hex(gp.stack.lo), ",", hex(gp.stack.hi), "]\n") 4425 throw("entersyscall") 4426 }) 4427 } 4428 4429 if trace.ok() { 4430 systemstack(func() { 4431 trace.GoSysCall() 4432 traceRelease(trace) 4433 }) 4434 // systemstack itself clobbers g.sched.{pc,sp} and we might 4435 // need them later when the G is genuinely blocked in a 4436 // syscall 4437 save(pc, sp, bp) 4438 } 4439 4440 if sched.sysmonwait.Load() { 4441 systemstack(entersyscall_sysmon) 4442 save(pc, sp, bp) 4443 } 4444 4445 if gp.m.p.ptr().runSafePointFn != 0 { 4446 // runSafePointFn may stack split if run on this stack 4447 systemstack(runSafePointFn) 4448 save(pc, sp, bp) 4449 } 4450 4451 gp.m.syscalltick = gp.m.p.ptr().syscalltick 4452 pp := gp.m.p.ptr() 4453 pp.m = 0 4454 gp.m.oldp.set(pp) 4455 gp.m.p = 0 4456 atomic.Store(&pp.status, _Psyscall) 4457 if sched.gcwaiting.Load() { 4458 systemstack(entersyscall_gcwait) 4459 save(pc, sp, bp) 4460 } 4461 4462 gp.m.locks-- 4463} 4464 4465// Standard syscall entry used by the go syscall library and normal cgo calls. 4466// 4467// This is exported via linkname to assembly in the syscall package and x/sys. 4468// 4469// Other packages should not be accessing entersyscall directly, 4470// but widely used packages access it using linkname. 4471// Notable members of the hall of shame include: 4472// - gvisor.dev/gvisor 4473// 4474// Do not remove or change the type signature. 4475// See go.dev/issue/67401. 4476// 4477//go:nosplit 4478//go:linkname entersyscall 4479func entersyscall() { 4480 // N.B. getcallerfp cannot be written directly as argument in the call 4481 // to reentersyscall because it forces spilling the other arguments to 4482 // the stack. This results in exceeding the nosplit stack requirements 4483 // on some platforms. 
4484 fp := getcallerfp() 4485 reentersyscall(getcallerpc(), getcallersp(), fp) 4486} 4487 4488func entersyscall_sysmon() { 4489 lock(&sched.lock) 4490 if sched.sysmonwait.Load() { 4491 sched.sysmonwait.Store(false) 4492 notewakeup(&sched.sysmonnote) 4493 } 4494 unlock(&sched.lock) 4495} 4496 4497func entersyscall_gcwait() { 4498 gp := getg() 4499 pp := gp.m.oldp.ptr() 4500 4501 lock(&sched.lock) 4502 trace := traceAcquire() 4503 if sched.stopwait > 0 && atomic.Cas(&pp.status, _Psyscall, _Pgcstop) { 4504 if trace.ok() { 4505 // This is a steal in the new tracer. While it's very likely 4506 // that we were the ones to put this P into _Psyscall, between 4507 // then and now it's totally possible it had been stolen and 4508 // then put back into _Psyscall for us to acquire here. In such 4509 // case ProcStop would be incorrect. 4510 // 4511 // TODO(mknyszek): Consider emitting a ProcStop instead when 4512 // gp.m.syscalltick == pp.syscalltick, since then we know we never 4513 // lost the P. 4514 trace.ProcSteal(pp, true) 4515 traceRelease(trace) 4516 } 4517 pp.gcStopTime = nanotime() 4518 pp.syscalltick++ 4519 if sched.stopwait--; sched.stopwait == 0 { 4520 notewakeup(&sched.stopnote) 4521 } 4522 } else if trace.ok() { 4523 traceRelease(trace) 4524 } 4525 unlock(&sched.lock) 4526} 4527 4528// The same as entersyscall(), but with a hint that the syscall is blocking. 4529 4530// entersyscallblock should be an internal detail, 4531// but widely used packages access it using linkname. 4532// Notable members of the hall of shame include: 4533// - gvisor.dev/gvisor 4534// 4535// Do not remove or change the type signature. 4536// See go.dev/issue/67401. 4537// 4538//go:linkname entersyscallblock 4539//go:nosplit 4540func entersyscallblock() { 4541 gp := getg() 4542 4543 gp.m.locks++ // see comment in entersyscall 4544 gp.throwsplit = true 4545 gp.stackguard0 = stackPreempt // see comment in entersyscall 4546 gp.m.syscalltick = gp.m.p.ptr().syscalltick 4547 gp.m.p.ptr().syscalltick++ 4548 4549 // Leave SP around for GC and traceback. 4550 pc := getcallerpc() 4551 sp := getcallersp() 4552 bp := getcallerfp() 4553 save(pc, sp, bp) 4554 gp.syscallsp = gp.sched.sp 4555 gp.syscallpc = gp.sched.pc 4556 gp.syscallbp = gp.sched.bp 4557 if gp.syscallsp < gp.stack.lo || gp.stack.hi < gp.syscallsp { 4558 sp1 := sp 4559 sp2 := gp.sched.sp 4560 sp3 := gp.syscallsp 4561 systemstack(func() { 4562 print("entersyscallblock inconsistent sp ", hex(sp1), " ", hex(sp2), " ", hex(sp3), " [", hex(gp.stack.lo), ",", hex(gp.stack.hi), "]\n") 4563 throw("entersyscallblock") 4564 }) 4565 } 4566 casgstatus(gp, _Grunning, _Gsyscall) 4567 if gp.syscallsp < gp.stack.lo || gp.stack.hi < gp.syscallsp { 4568 systemstack(func() { 4569 print("entersyscallblock inconsistent sp ", hex(sp), " ", hex(gp.sched.sp), " ", hex(gp.syscallsp), " [", hex(gp.stack.lo), ",", hex(gp.stack.hi), "]\n") 4570 throw("entersyscallblock") 4571 }) 4572 } 4573 if gp.syscallbp != 0 && gp.syscallbp < gp.stack.lo || gp.stack.hi < gp.syscallbp { 4574 systemstack(func() { 4575 print("entersyscallblock inconsistent bp ", hex(bp), " ", hex(gp.sched.bp), " ", hex(gp.syscallbp), " [", hex(gp.stack.lo), ",", hex(gp.stack.hi), "]\n") 4576 throw("entersyscallblock") 4577 }) 4578 } 4579 4580 systemstack(entersyscallblock_handoff) 4581 4582 // Resave for traceback during blocked call. 
4583 save(getcallerpc(), getcallersp(), getcallerfp()) 4584 4585 gp.m.locks-- 4586} 4587 4588func entersyscallblock_handoff() { 4589 trace := traceAcquire() 4590 if trace.ok() { 4591 trace.GoSysCall() 4592 traceRelease(trace) 4593 } 4594 handoffp(releasep()) 4595} 4596 4597// The goroutine g exited its system call. 4598// Arrange for it to run on a cpu again. 4599// This is called only from the go syscall library, not 4600// from the low-level system calls used by the runtime. 4601// 4602// Write barriers are not allowed because our P may have been stolen. 4603// 4604// This is exported via linkname to assembly in the syscall package. 4605// 4606// exitsyscall should be an internal detail, 4607// but widely used packages access it using linkname. 4608// Notable members of the hall of shame include: 4609// - gvisor.dev/gvisor 4610// 4611// Do not remove or change the type signature. 4612// See go.dev/issue/67401. 4613// 4614//go:nosplit 4615//go:nowritebarrierrec 4616//go:linkname exitsyscall 4617func exitsyscall() { 4618 gp := getg() 4619 4620 gp.m.locks++ // see comment in entersyscall 4621 if getcallersp() > gp.syscallsp { 4622 throw("exitsyscall: syscall frame is no longer valid") 4623 } 4624 4625 gp.waitsince = 0 4626 oldp := gp.m.oldp.ptr() 4627 gp.m.oldp = 0 4628 if exitsyscallfast(oldp) { 4629 // When exitsyscallfast returns success, we have a P so can now use 4630 // write barriers 4631 if goroutineProfile.active { 4632 // Make sure that gp has had its stack written out to the goroutine 4633 // profile, exactly as it was when the goroutine profiler first 4634 // stopped the world. 4635 systemstack(func() { 4636 tryRecordGoroutineProfileWB(gp) 4637 }) 4638 } 4639 trace := traceAcquire() 4640 if trace.ok() { 4641 lostP := oldp != gp.m.p.ptr() || gp.m.syscalltick != gp.m.p.ptr().syscalltick 4642 systemstack(func() { 4643 // Write out syscall exit eagerly. 4644 // 4645 // It's important that we write this *after* we know whether we 4646 // lost our P or not (determined by exitsyscallfast). 4647 trace.GoSysExit(lostP) 4648 if lostP { 4649 // We lost the P at some point, even though we got it back here. 4650 // Trace that we're starting again, because there was a traceGoSysBlock 4651 // call somewhere in exitsyscallfast (indicating that this goroutine 4652 // had blocked) and we're about to start running again. 4653 trace.GoStart() 4654 } 4655 }) 4656 } 4657 // There's a cpu for us, so we can run. 4658 gp.m.p.ptr().syscalltick++ 4659 // We need to cas the status and scan before resuming... 4660 casgstatus(gp, _Gsyscall, _Grunning) 4661 if trace.ok() { 4662 traceRelease(trace) 4663 } 4664 4665 // Garbage collector isn't running (since we are), 4666 // so okay to clear syscallsp. 4667 gp.syscallsp = 0 4668 gp.m.locks-- 4669 if gp.preempt { 4670 // restore the preemption request in case we've cleared it in newstack 4671 gp.stackguard0 = stackPreempt 4672 } else { 4673 // otherwise restore the real stackGuard, we've spoiled it in entersyscall/entersyscallblock 4674 gp.stackguard0 = gp.stack.lo + stackGuard 4675 } 4676 gp.throwsplit = false 4677 4678 if sched.disable.user && !schedEnabled(gp) { 4679 // Scheduling of this goroutine is disabled. 4680 Gosched() 4681 } 4682 4683 return 4684 } 4685 4686 gp.m.locks-- 4687 4688 // Call the scheduler. 4689 mcall(exitsyscall0) 4690 4691 // Scheduler returned, so we're allowed to run now. 4692 // Delete the syscallsp information that we left for 4693 // the garbage collector during the system call. 
4694 // Must wait until now because until gosched returns 4695 // we don't know for sure that the garbage collector 4696 // is not running. 4697 gp.syscallsp = 0 4698 gp.m.p.ptr().syscalltick++ 4699 gp.throwsplit = false 4700} 4701 4702//go:nosplit 4703func exitsyscallfast(oldp *p) bool { 4704 // Freezetheworld sets stopwait but does not retake P's. 4705 if sched.stopwait == freezeStopWait { 4706 return false 4707 } 4708 4709 // Try to re-acquire the last P. 4710 trace := traceAcquire() 4711 if oldp != nil && oldp.status == _Psyscall && atomic.Cas(&oldp.status, _Psyscall, _Pidle) { 4712 // There's a cpu for us, so we can run. 4713 wirep(oldp) 4714 exitsyscallfast_reacquired(trace) 4715 if trace.ok() { 4716 traceRelease(trace) 4717 } 4718 return true 4719 } 4720 if trace.ok() { 4721 traceRelease(trace) 4722 } 4723 4724 // Try to get any other idle P. 4725 if sched.pidle != 0 { 4726 var ok bool 4727 systemstack(func() { 4728 ok = exitsyscallfast_pidle() 4729 }) 4730 if ok { 4731 return true 4732 } 4733 } 4734 return false 4735} 4736 4737// exitsyscallfast_reacquired is the exitsyscall path on which this G 4738// has successfully reacquired the P it was running on before the 4739// syscall. 4740// 4741//go:nosplit 4742func exitsyscallfast_reacquired(trace traceLocker) { 4743 gp := getg() 4744 if gp.m.syscalltick != gp.m.p.ptr().syscalltick { 4745 if trace.ok() { 4746 // The p was retaken and then enter into syscall again (since gp.m.syscalltick has changed). 4747 // traceGoSysBlock for this syscall was already emitted, 4748 // but here we effectively retake the p from the new syscall running on the same p. 4749 systemstack(func() { 4750 // We're stealing the P. It's treated 4751 // as if it temporarily stopped running. Then, start running. 4752 trace.ProcSteal(gp.m.p.ptr(), true) 4753 trace.ProcStart() 4754 }) 4755 } 4756 gp.m.p.ptr().syscalltick++ 4757 } 4758} 4759 4760func exitsyscallfast_pidle() bool { 4761 lock(&sched.lock) 4762 pp, _ := pidleget(0) 4763 if pp != nil && sched.sysmonwait.Load() { 4764 sched.sysmonwait.Store(false) 4765 notewakeup(&sched.sysmonnote) 4766 } 4767 unlock(&sched.lock) 4768 if pp != nil { 4769 acquirep(pp) 4770 return true 4771 } 4772 return false 4773} 4774 4775// exitsyscall slow path on g0. 4776// Failed to acquire P, enqueue gp as runnable. 4777// 4778// Called via mcall, so gp is the calling g from this M. 4779// 4780//go:nowritebarrierrec 4781func exitsyscall0(gp *g) { 4782 var trace traceLocker 4783 traceExitingSyscall() 4784 trace = traceAcquire() 4785 casgstatus(gp, _Gsyscall, _Grunnable) 4786 traceExitedSyscall() 4787 if trace.ok() { 4788 // Write out syscall exit eagerly. 4789 // 4790 // It's important that we write this *after* we know whether we 4791 // lost our P or not (determined by exitsyscallfast). 4792 trace.GoSysExit(true) 4793 traceRelease(trace) 4794 } 4795 dropg() 4796 lock(&sched.lock) 4797 var pp *p 4798 if schedEnabled(gp) { 4799 pp, _ = pidleget(0) 4800 } 4801 var locked bool 4802 if pp == nil { 4803 globrunqput(gp) 4804 4805 // Below, we stoplockedm if gp is locked. globrunqput releases 4806 // ownership of gp, so we must check if gp is locked prior to 4807 // committing the release by unlocking sched.lock, otherwise we 4808 // could race with another M transitioning gp from unlocked to 4809 // locked. 
4810 locked = gp.lockedm != 0 4811 } else if sched.sysmonwait.Load() { 4812 sched.sysmonwait.Store(false) 4813 notewakeup(&sched.sysmonnote) 4814 } 4815 unlock(&sched.lock) 4816 if pp != nil { 4817 acquirep(pp) 4818 execute(gp, false) // Never returns. 4819 } 4820 if locked { 4821 // Wait until another thread schedules gp and so m again. 4822 // 4823 // N.B. lockedm must be this M, as this g was running on this M 4824 // before entersyscall. 4825 stoplockedm() 4826 execute(gp, false) // Never returns. 4827 } 4828 stopm() 4829 schedule() // Never returns. 4830} 4831 4832// Called from syscall package before fork. 4833// 4834// syscall_runtime_BeforeFork is for package syscall, 4835// but widely used packages access it using linkname. 4836// Notable members of the hall of shame include: 4837// - github.com/containerd/containerd 4838// - gvisor.dev/gvisor 4839// 4840// Do not remove or change the type signature. 4841// See go.dev/issue/67401. 4842// 4843//go:linkname syscall_runtime_BeforeFork syscall.runtime_BeforeFork 4844//go:nosplit 4845func syscall_runtime_BeforeFork() { 4846 gp := getg().m.curg 4847 4848 // Block signals during a fork, so that the child does not run 4849 // a signal handler before exec if a signal is sent to the process 4850 // group. See issue #18600. 4851 gp.m.locks++ 4852 sigsave(&gp.m.sigmask) 4853 sigblock(false) 4854 4855 // This function is called before fork in syscall package. 4856 // Code between fork and exec must not allocate memory nor even try to grow stack. 4857 // Here we spoil g.stackguard0 to reliably detect any attempts to grow stack. 4858 // runtime_AfterFork will undo this in parent process, but not in child. 4859 gp.stackguard0 = stackFork 4860} 4861 4862// Called from syscall package after fork in parent. 4863// 4864// syscall_runtime_AfterFork is for package syscall, 4865// but widely used packages access it using linkname. 4866// Notable members of the hall of shame include: 4867// - github.com/containerd/containerd 4868// - gvisor.dev/gvisor 4869// 4870// Do not remove or change the type signature. 4871// See go.dev/issue/67401. 4872// 4873//go:linkname syscall_runtime_AfterFork syscall.runtime_AfterFork 4874//go:nosplit 4875func syscall_runtime_AfterFork() { 4876 gp := getg().m.curg 4877 4878 // See the comments in beforefork. 4879 gp.stackguard0 = gp.stack.lo + stackGuard 4880 4881 msigrestore(gp.m.sigmask) 4882 4883 gp.m.locks-- 4884} 4885 4886// inForkedChild is true while manipulating signals in the child process. 4887// This is used to avoid calling libc functions in case we are using vfork. 4888var inForkedChild bool 4889 4890// Called from syscall package after fork in child. 4891// It resets non-sigignored signals to the default handler, and 4892// restores the signal mask in preparation for the exec. 4893// 4894// Because this might be called during a vfork, and therefore may be 4895// temporarily sharing address space with the parent process, this must 4896// not change any global variables or calling into C code that may do so. 4897// 4898// syscall_runtime_AfterForkInChild is for package syscall, 4899// but widely used packages access it using linkname. 4900// Notable members of the hall of shame include: 4901// - github.com/containerd/containerd 4902// - gvisor.dev/gvisor 4903// 4904// Do not remove or change the type signature. 4905// See go.dev/issue/67401. 
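// The usual way user code reaches these hooks is indirectly through os/exec
// (or os.StartProcess), which forks and execs via the syscall package. An
// illustrative, Unix-flavored example:
//
//	package main
//
//	import (
//		"fmt"
//		"os/exec"
//	)
//
//	func main() {
//		// BeforeFork/AfterFork run in the parent around the fork; on
//		// platforms that go through libc, AfterForkInChild runs in the
//		// child before it execs.
//		out, err := exec.Command("echo", "hello from the child").Output()
//		if err != nil {
//			fmt.Println("exec failed:", err)
//			return
//		}
//		fmt.Print(string(out))
//	}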
4906// 4907//go:linkname syscall_runtime_AfterForkInChild syscall.runtime_AfterForkInChild 4908//go:nosplit 4909//go:nowritebarrierrec 4910func syscall_runtime_AfterForkInChild() { 4911 // It's OK to change the global variable inForkedChild here 4912 // because we are going to change it back. There is no race here, 4913 // because if we are sharing address space with the parent process, 4914 // then the parent process can not be running concurrently. 4915 inForkedChild = true 4916 4917 clearSignalHandlers() 4918 4919 // When we are the child we are the only thread running, 4920 // so we know that nothing else has changed gp.m.sigmask. 4921 msigrestore(getg().m.sigmask) 4922 4923 inForkedChild = false 4924} 4925 4926// pendingPreemptSignals is the number of preemption signals 4927// that have been sent but not received. This is only used on Darwin. 4928// For #41702. 4929var pendingPreemptSignals atomic.Int32 4930 4931// Called from syscall package before Exec. 4932// 4933//go:linkname syscall_runtime_BeforeExec syscall.runtime_BeforeExec 4934func syscall_runtime_BeforeExec() { 4935 // Prevent thread creation during exec. 4936 execLock.lock() 4937 4938 // On Darwin, wait for all pending preemption signals to 4939 // be received. See issue #41702. 4940 if GOOS == "darwin" || GOOS == "ios" { 4941 for pendingPreemptSignals.Load() > 0 { 4942 osyield() 4943 } 4944 } 4945} 4946 4947// Called from syscall package after Exec. 4948// 4949//go:linkname syscall_runtime_AfterExec syscall.runtime_AfterExec 4950func syscall_runtime_AfterExec() { 4951 execLock.unlock() 4952} 4953 4954// Allocate a new g, with a stack big enough for stacksize bytes. 4955func malg(stacksize int32) *g { 4956 newg := new(g) 4957 if stacksize >= 0 { 4958 stacksize = round2(stackSystem + stacksize) 4959 systemstack(func() { 4960 newg.stack = stackalloc(uint32(stacksize)) 4961 }) 4962 newg.stackguard0 = newg.stack.lo + stackGuard 4963 newg.stackguard1 = ^uintptr(0) 4964 // Clear the bottom word of the stack. We record g 4965 // there on gsignal stack during VDSO on ARM and ARM64. 4966 *(*uintptr)(unsafe.Pointer(newg.stack.lo)) = 0 4967 } 4968 return newg 4969} 4970 4971// Create a new g running fn. 4972// Put it on the queue of g's waiting to run. 4973// The compiler turns a go statement into a call to this. 4974func newproc(fn *funcval) { 4975 gp := getg() 4976 pc := getcallerpc() 4977 systemstack(func() { 4978 newg := newproc1(fn, gp, pc, false, waitReasonZero) 4979 4980 pp := getg().m.p.ptr() 4981 runqput(pp, newg, true) 4982 4983 if mainStarted { 4984 wakep() 4985 } 4986 }) 4987} 4988 4989// Create a new g in state _Grunnable (or _Gwaiting if parked is true), starting at fn. 4990// callerpc is the address of the go statement that created this. The caller is responsible 4991// for adding the new g to the scheduler. If parked is true, waitreason must be non-zero. 4992func newproc1(fn *funcval, callergp *g, callerpc uintptr, parked bool, waitreason waitReason) *g { 4993 if fn == nil { 4994 fatal("go of nil func value") 4995 } 4996 4997 mp := acquirem() // disable preemption because we hold M and P in local vars. 4998 pp := mp.p.ptr() 4999 newg := gfget(pp) 5000 if newg == nil { 5001 newg = malg(stackMin) 5002 casgstatus(newg, _Gidle, _Gdead) 5003 allgadd(newg) // publishes with a g->status of Gdead so GC scanner doesn't look at uninitialized stack. 
5004 } 5005 if newg.stack.hi == 0 { 5006 throw("newproc1: newg missing stack") 5007 } 5008 5009 if readgstatus(newg) != _Gdead { 5010 throw("newproc1: new g is not Gdead") 5011 } 5012 5013 totalSize := uintptr(4*goarch.PtrSize + sys.MinFrameSize) // extra space in case of reads slightly beyond frame 5014 totalSize = alignUp(totalSize, sys.StackAlign) 5015 sp := newg.stack.hi - totalSize 5016 if usesLR { 5017 // caller's LR 5018 *(*uintptr)(unsafe.Pointer(sp)) = 0 5019 prepGoExitFrame(sp) 5020 } 5021 if GOARCH == "arm64" { 5022 // caller's FP 5023 *(*uintptr)(unsafe.Pointer(sp - goarch.PtrSize)) = 0 5024 } 5025 5026 memclrNoHeapPointers(unsafe.Pointer(&newg.sched), unsafe.Sizeof(newg.sched)) 5027 newg.sched.sp = sp 5028 newg.stktopsp = sp 5029 newg.sched.pc = abi.FuncPCABI0(goexit) + sys.PCQuantum // +PCQuantum so that previous instruction is in same function 5030 newg.sched.g = guintptr(unsafe.Pointer(newg)) 5031 gostartcallfn(&newg.sched, fn) 5032 newg.parentGoid = callergp.goid 5033 newg.gopc = callerpc 5034 newg.ancestors = saveAncestors(callergp) 5035 newg.startpc = fn.fn 5036 if isSystemGoroutine(newg, false) { 5037 sched.ngsys.Add(1) 5038 } else { 5039 // Only user goroutines inherit pprof labels. 5040 if mp.curg != nil { 5041 newg.labels = mp.curg.labels 5042 } 5043 if goroutineProfile.active { 5044 // A concurrent goroutine profile is running. It should include 5045 // exactly the set of goroutines that were alive when the goroutine 5046 // profiler first stopped the world. That does not include newg, so 5047 // mark it as not needing a profile before transitioning it from 5048 // _Gdead. 5049 newg.goroutineProfiled.Store(goroutineProfileSatisfied) 5050 } 5051 } 5052 // Track initial transition? 5053 newg.trackingSeq = uint8(cheaprand()) 5054 if newg.trackingSeq%gTrackingPeriod == 0 { 5055 newg.tracking = true 5056 } 5057 gcController.addScannableStack(pp, int64(newg.stack.hi-newg.stack.lo)) 5058 5059 // Get a goid and switch to runnable. Make all this atomic to the tracer. 5060 trace := traceAcquire() 5061 var status uint32 = _Grunnable 5062 if parked { 5063 status = _Gwaiting 5064 newg.waitreason = waitreason 5065 } 5066 casgstatus(newg, _Gdead, status) 5067 if pp.goidcache == pp.goidcacheend { 5068 // Sched.goidgen is the last allocated id, 5069 // this batch must be [sched.goidgen+1, sched.goidgen+GoidCacheBatch]. 5070 // At startup sched.goidgen=0, so main goroutine receives goid=1. 5071 pp.goidcache = sched.goidgen.Add(_GoidCacheBatch) 5072 pp.goidcache -= _GoidCacheBatch - 1 5073 pp.goidcacheend = pp.goidcache + _GoidCacheBatch 5074 } 5075 newg.goid = pp.goidcache 5076 pp.goidcache++ 5077 newg.trace.reset() 5078 if trace.ok() { 5079 trace.GoCreate(newg, newg.startpc, parked) 5080 traceRelease(trace) 5081 } 5082 5083 // Set up race context. 5084 if raceenabled { 5085 newg.racectx = racegostart(callerpc) 5086 newg.raceignore = 0 5087 if newg.labels != nil { 5088 // See note in proflabel.go on labelSync's role in synchronizing 5089 // with the reads in the signal handler. 5090 racereleasemergeg(newg, unsafe.Pointer(&labelSync)) 5091 } 5092 } 5093 releasem(mp) 5094 5095 return newg 5096} 5097 5098// saveAncestors copies previous ancestors of the given caller g and 5099// includes info for the current caller into a new set of tracebacks for 5100// a g being created. 5101func saveAncestors(callergp *g) *[]ancestorInfo { 5102 // Copy all prior info, except for the root goroutine (goid 0). 
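// The ancestry recorded here is only collected when the tracebackancestors
// GODEBUG setting is non-zero, and it then shows up in panics and tracebacks.
// An illustrative program (run with GODEBUG=tracebackancestors=5):
//
//	package main
//
//	import "time"
//
//	func main() {
//		go func() {
//			panic("show my ancestry")
//		}()
//		time.Sleep(time.Second)
//	}
//
// The panic output for the child goroutine is followed by an
// "[originating from goroutine 1]" section pointing at the go statement in
// main, built from the ancestorInfo saved by this function.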
5103 if debug.tracebackancestors <= 0 || callergp.goid == 0 { 5104 return nil 5105 } 5106 var callerAncestors []ancestorInfo 5107 if callergp.ancestors != nil { 5108 callerAncestors = *callergp.ancestors 5109 } 5110 n := int32(len(callerAncestors)) + 1 5111 if n > debug.tracebackancestors { 5112 n = debug.tracebackancestors 5113 } 5114 ancestors := make([]ancestorInfo, n) 5115 copy(ancestors[1:], callerAncestors) 5116 5117 var pcs [tracebackInnerFrames]uintptr 5118 npcs := gcallers(callergp, 0, pcs[:]) 5119 ipcs := make([]uintptr, npcs) 5120 copy(ipcs, pcs[:]) 5121 ancestors[0] = ancestorInfo{ 5122 pcs: ipcs, 5123 goid: callergp.goid, 5124 gopc: callergp.gopc, 5125 } 5126 5127 ancestorsp := new([]ancestorInfo) 5128 *ancestorsp = ancestors 5129 return ancestorsp 5130} 5131 5132// Put on gfree list. 5133// If local list is too long, transfer a batch to the global list. 5134func gfput(pp *p, gp *g) { 5135 if readgstatus(gp) != _Gdead { 5136 throw("gfput: bad status (not Gdead)") 5137 } 5138 5139 stksize := gp.stack.hi - gp.stack.lo 5140 5141 if stksize != uintptr(startingStackSize) { 5142 // non-standard stack size - free it. 5143 stackfree(gp.stack) 5144 gp.stack.lo = 0 5145 gp.stack.hi = 0 5146 gp.stackguard0 = 0 5147 } 5148 5149 pp.gFree.push(gp) 5150 pp.gFree.n++ 5151 if pp.gFree.n >= 64 { 5152 var ( 5153 inc int32 5154 stackQ gQueue 5155 noStackQ gQueue 5156 ) 5157 for pp.gFree.n >= 32 { 5158 gp := pp.gFree.pop() 5159 pp.gFree.n-- 5160 if gp.stack.lo == 0 { 5161 noStackQ.push(gp) 5162 } else { 5163 stackQ.push(gp) 5164 } 5165 inc++ 5166 } 5167 lock(&sched.gFree.lock) 5168 sched.gFree.noStack.pushAll(noStackQ) 5169 sched.gFree.stack.pushAll(stackQ) 5170 sched.gFree.n += inc 5171 unlock(&sched.gFree.lock) 5172 } 5173} 5174 5175// Get from gfree list. 5176// If local list is empty, grab a batch from global list. 5177func gfget(pp *p) *g { 5178retry: 5179 if pp.gFree.empty() && (!sched.gFree.stack.empty() || !sched.gFree.noStack.empty()) { 5180 lock(&sched.gFree.lock) 5181 // Move a batch of free Gs to the P. 5182 for pp.gFree.n < 32 { 5183 // Prefer Gs with stacks. 5184 gp := sched.gFree.stack.pop() 5185 if gp == nil { 5186 gp = sched.gFree.noStack.pop() 5187 if gp == nil { 5188 break 5189 } 5190 } 5191 sched.gFree.n-- 5192 pp.gFree.push(gp) 5193 pp.gFree.n++ 5194 } 5195 unlock(&sched.gFree.lock) 5196 goto retry 5197 } 5198 gp := pp.gFree.pop() 5199 if gp == nil { 5200 return nil 5201 } 5202 pp.gFree.n-- 5203 if gp.stack.lo != 0 && gp.stack.hi-gp.stack.lo != uintptr(startingStackSize) { 5204 // Deallocate old stack. We kept it in gfput because it was the 5205 // right size when the goroutine was put on the free list, but 5206 // the right size has changed since then. 5207 systemstack(func() { 5208 stackfree(gp.stack) 5209 gp.stack.lo = 0 5210 gp.stack.hi = 0 5211 gp.stackguard0 = 0 5212 }) 5213 } 5214 if gp.stack.lo == 0 { 5215 // Stack was deallocated in gfput or just above. Allocate a new one. 5216 systemstack(func() { 5217 gp.stack = stackalloc(startingStackSize) 5218 }) 5219 gp.stackguard0 = gp.stack.lo + stackGuard 5220 } else { 5221 if raceenabled { 5222 racemalloc(unsafe.Pointer(gp.stack.lo), gp.stack.hi-gp.stack.lo) 5223 } 5224 if msanenabled { 5225 msanmalloc(unsafe.Pointer(gp.stack.lo), gp.stack.hi-gp.stack.lo) 5226 } 5227 if asanenabled { 5228 asanunpoison(unsafe.Pointer(gp.stack.lo), gp.stack.hi-gp.stack.lo) 5229 } 5230 } 5231 return gp 5232} 5233 5234// Purge all cached G's from gfree list to the global list. 
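// The per-P gFree list with a batched spill to sched.gFree is the same
// local-cache-plus-global-pool pattern that sync.Pool exposes to user code
// (as an analogy only; the implementations are separate). A minimal example:
//
//	package main
//
//	import (
//		"fmt"
//		"sync"
//	)
//
//	var bufPool = sync.Pool{
//		New: func() any { return make([]byte, 0, 1024) },
//	}
//
//	func main() {
//		b := bufPool.Get().([]byte)
//		b = append(b[:0], "reused buffer"...)
//		fmt.Println(string(b))
//		bufPool.Put(b)
//	}
//
// Like gfget/gfput, most Get/Put calls hit a per-P cache and only
// occasionally touch shared state.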
5235func gfpurge(pp *p) { 5236 var ( 5237 inc int32 5238 stackQ gQueue 5239 noStackQ gQueue 5240 ) 5241 for !pp.gFree.empty() { 5242 gp := pp.gFree.pop() 5243 pp.gFree.n-- 5244 if gp.stack.lo == 0 { 5245 noStackQ.push(gp) 5246 } else { 5247 stackQ.push(gp) 5248 } 5249 inc++ 5250 } 5251 lock(&sched.gFree.lock) 5252 sched.gFree.noStack.pushAll(noStackQ) 5253 sched.gFree.stack.pushAll(stackQ) 5254 sched.gFree.n += inc 5255 unlock(&sched.gFree.lock) 5256} 5257 5258// Breakpoint executes a breakpoint trap. 5259func Breakpoint() { 5260 breakpoint() 5261} 5262 5263// dolockOSThread is called by LockOSThread and lockOSThread below 5264// after they modify m.locked. Do not allow preemption during this call, 5265// or else the m might be different in this function than in the caller. 5266// 5267//go:nosplit 5268func dolockOSThread() { 5269 if GOARCH == "wasm" { 5270 return // no threads on wasm yet 5271 } 5272 gp := getg() 5273 gp.m.lockedg.set(gp) 5274 gp.lockedm.set(gp.m) 5275} 5276 5277// LockOSThread wires the calling goroutine to its current operating system thread. 5278// The calling goroutine will always execute in that thread, 5279// and no other goroutine will execute in it, 5280// until the calling goroutine has made as many calls to 5281// [UnlockOSThread] as to LockOSThread. 5282// If the calling goroutine exits without unlocking the thread, 5283// the thread will be terminated. 5284// 5285// All init functions are run on the startup thread. Calling LockOSThread 5286// from an init function will cause the main function to be invoked on 5287// that thread. 5288// 5289// A goroutine should call LockOSThread before calling OS services or 5290// non-Go library functions that depend on per-thread state. 5291// 5292//go:nosplit 5293func LockOSThread() { 5294 if atomic.Load(&newmHandoff.haveTemplateThread) == 0 && GOOS != "plan9" { 5295 // If we need to start a new thread from the locked 5296 // thread, we need the template thread. Start it now 5297 // while we're in a known-good state. 5298 startTemplateThread() 5299 } 5300 gp := getg() 5301 gp.m.lockedExt++ 5302 if gp.m.lockedExt == 0 { 5303 gp.m.lockedExt-- 5304 panic("LockOSThread nesting overflow") 5305 } 5306 dolockOSThread() 5307} 5308 5309//go:nosplit 5310func lockOSThread() { 5311 getg().m.lockedInt++ 5312 dolockOSThread() 5313} 5314 5315// dounlockOSThread is called by UnlockOSThread and unlockOSThread below 5316// after they update m->locked. Do not allow preemption during this call, 5317// or else the m might be in different in this function than in the caller. 5318// 5319//go:nosplit 5320func dounlockOSThread() { 5321 if GOARCH == "wasm" { 5322 return // no threads on wasm yet 5323 } 5324 gp := getg() 5325 if gp.m.lockedInt != 0 || gp.m.lockedExt != 0 { 5326 return 5327 } 5328 gp.m.lockedg = 0 5329 gp.lockedm = 0 5330} 5331 5332// UnlockOSThread undoes an earlier call to LockOSThread. 5333// If this drops the number of active LockOSThread calls on the 5334// calling goroutine to zero, it unwires the calling goroutine from 5335// its fixed operating system thread. 5336// If there are no active LockOSThread calls, this is a no-op. 5337// 5338// Before calling UnlockOSThread, the caller must ensure that the OS 5339// thread is suitable for running other goroutines. If the caller made 5340// any permanent changes to the state of the thread that would affect 5341// other goroutines, it should not call this function and thus leave 5342// the goroutine locked to the OS thread until the goroutine (and 5343// hence the thread) exits. 
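//
// For illustration (a sketch, not part of this package's documented API), the
// usual shape is to pair the two calls around the thread-affine work and to
// skip the unlock when the thread has been permanently altered;
// doThreadAffineWork below is a hypothetical helper that reports whether it
// left the thread unsuitable for other goroutines:
//
//	go func() {
//		runtime.LockOSThread()
//		// Per-thread state (thread-local storage, a C library that
//		// must always be called from the same thread, ...) is safe
//		// to use from here on.
//		broken := doThreadAffineWork()
//		if broken {
//			// Leave the goroutine wired; the thread terminates
//			// when this goroutine exits.
//			return
//		}
//		runtime.UnlockOSThread()
//	}()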
5344// 5345//go:nosplit 5346func UnlockOSThread() { 5347 gp := getg() 5348 if gp.m.lockedExt == 0 { 5349 return 5350 } 5351 gp.m.lockedExt-- 5352 dounlockOSThread() 5353} 5354 5355//go:nosplit 5356func unlockOSThread() { 5357 gp := getg() 5358 if gp.m.lockedInt == 0 { 5359 systemstack(badunlockosthread) 5360 } 5361 gp.m.lockedInt-- 5362 dounlockOSThread() 5363} 5364 5365func badunlockosthread() { 5366 throw("runtime: internal error: misuse of lockOSThread/unlockOSThread") 5367} 5368 5369func gcount() int32 { 5370 n := int32(atomic.Loaduintptr(&allglen)) - sched.gFree.n - sched.ngsys.Load() 5371 for _, pp := range allp { 5372 n -= pp.gFree.n 5373 } 5374 5375 // All these variables can be changed concurrently, so the result can be inconsistent. 5376 // But at least the current goroutine is running. 5377 if n < 1 { 5378 n = 1 5379 } 5380 return n 5381} 5382 5383func mcount() int32 { 5384 return int32(sched.mnext - sched.nmfreed) 5385} 5386 5387var prof struct { 5388 signalLock atomic.Uint32 5389 5390 // Must hold signalLock to write. Reads may be lock-free, but 5391 // signalLock should be taken to synchronize with changes. 5392 hz atomic.Int32 5393} 5394 5395func _System() { _System() } 5396func _ExternalCode() { _ExternalCode() } 5397func _LostExternalCode() { _LostExternalCode() } 5398func _GC() { _GC() } 5399func _LostSIGPROFDuringAtomic64() { _LostSIGPROFDuringAtomic64() } 5400func _LostContendedRuntimeLock() { _LostContendedRuntimeLock() } 5401func _VDSO() { _VDSO() } 5402 5403// Called if we receive a SIGPROF signal. 5404// Called by the signal handler, may run during STW. 5405// 5406//go:nowritebarrierrec 5407func sigprof(pc, sp, lr uintptr, gp *g, mp *m) { 5408 if prof.hz.Load() == 0 { 5409 return 5410 } 5411 5412 // If mp.profilehz is 0, then profiling is not enabled for this thread. 5413 // We must check this to avoid a deadlock between setcpuprofilerate 5414 // and the call to cpuprof.add, below. 5415 if mp != nil && mp.profilehz == 0 { 5416 return 5417 } 5418 5419 // On mips{,le}/arm, 64bit atomics are emulated with spinlocks, in 5420 // internal/runtime/atomic. If SIGPROF arrives while the program is inside 5421 // the critical section, it creates a deadlock (when writing the sample). 5422 // As a workaround, create a counter of SIGPROFs while in critical section 5423 // to store the count, and pass it to sigprof.add() later when SIGPROF is 5424 // received from somewhere else (with _LostSIGPROFDuringAtomic64 as pc). 5425 if GOARCH == "mips" || GOARCH == "mipsle" || GOARCH == "arm" { 5426 if f := findfunc(pc); f.valid() { 5427 if stringslite.HasPrefix(funcname(f), "internal/runtime/atomic") { 5428 cpuprof.lostAtomic++ 5429 return 5430 } 5431 } 5432 if GOARCH == "arm" && goarm < 7 && GOOS == "linux" && pc&0xffff0000 == 0xffff0000 { 5433 // internal/runtime/atomic functions call into kernel 5434 // helpers on arm < 7. See 5435 // internal/runtime/atomic/sys_linux_arm.s. 5436 cpuprof.lostAtomic++ 5437 return 5438 } 5439 } 5440 5441 // Profiling runs concurrently with GC, so it must not allocate. 5442 // Set a trap in case the code does allocate. 5443 // Note that on windows, one thread takes profiles of all the 5444 // other threads, so mp is usually not getg().m. 5445 // In fact mp may not even be stopped. 5446 // See golang.org/issue/17165. 
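	// For context (illustrative, not a complete list): the profiling signal
	// handled here is usually requested from user code through runtime/pprof,
	// e.g. (os, log, and runtime/pprof imports assumed)
	//
	//	f, err := os.Create("cpu.pprof")
	//	if err != nil {
	//		log.Fatal(err)
	//	}
	//	if err := pprof.StartCPUProfile(f); err != nil { // 100 Hz by default
	//		log.Fatal(err)
	//	}
	//	defer pprof.StopCPUProfile()
	//
	// or through a direct call to runtime.SetCPUProfileRate, both of which
	// end up in setcpuprofilerate below.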
5447 getg().m.mallocing++ 5448 5449 var u unwinder 5450 var stk [maxCPUProfStack]uintptr 5451 n := 0 5452 if mp.ncgo > 0 && mp.curg != nil && mp.curg.syscallpc != 0 && mp.curg.syscallsp != 0 { 5453 cgoOff := 0 5454 // Check cgoCallersUse to make sure that we are not 5455 // interrupting other code that is fiddling with 5456 // cgoCallers. We are running in a signal handler 5457 // with all signals blocked, so we don't have to worry 5458 // about any other code interrupting us. 5459 if mp.cgoCallersUse.Load() == 0 && mp.cgoCallers != nil && mp.cgoCallers[0] != 0 { 5460 for cgoOff < len(mp.cgoCallers) && mp.cgoCallers[cgoOff] != 0 { 5461 cgoOff++ 5462 } 5463 n += copy(stk[:], mp.cgoCallers[:cgoOff]) 5464 mp.cgoCallers[0] = 0 5465 } 5466 5467 // Collect Go stack that leads to the cgo call. 5468 u.initAt(mp.curg.syscallpc, mp.curg.syscallsp, 0, mp.curg, unwindSilentErrors) 5469 } else if usesLibcall() && mp.libcallg != 0 && mp.libcallpc != 0 && mp.libcallsp != 0 { 5470 // Libcall, i.e. runtime syscall on windows. 5471 // Collect Go stack that leads to the call. 5472 u.initAt(mp.libcallpc, mp.libcallsp, 0, mp.libcallg.ptr(), unwindSilentErrors) 5473 } else if mp != nil && mp.vdsoSP != 0 { 5474 // VDSO call, e.g. nanotime1 on Linux. 5475 // Collect Go stack that leads to the call. 5476 u.initAt(mp.vdsoPC, mp.vdsoSP, 0, gp, unwindSilentErrors|unwindJumpStack) 5477 } else { 5478 u.initAt(pc, sp, lr, gp, unwindSilentErrors|unwindTrap|unwindJumpStack) 5479 } 5480 n += tracebackPCs(&u, 0, stk[n:]) 5481 5482 if n <= 0 { 5483 // Normal traceback is impossible or has failed. 5484 // Account it against abstract "System" or "GC". 5485 n = 2 5486 if inVDSOPage(pc) { 5487 pc = abi.FuncPCABIInternal(_VDSO) + sys.PCQuantum 5488 } else if pc > firstmoduledata.etext { 5489 // "ExternalCode" is better than "etext". 5490 pc = abi.FuncPCABIInternal(_ExternalCode) + sys.PCQuantum 5491 } 5492 stk[0] = pc 5493 if mp.preemptoff != "" { 5494 stk[1] = abi.FuncPCABIInternal(_GC) + sys.PCQuantum 5495 } else { 5496 stk[1] = abi.FuncPCABIInternal(_System) + sys.PCQuantum 5497 } 5498 } 5499 5500 if prof.hz.Load() != 0 { 5501 // Note: it can happen on Windows that we interrupted a system thread 5502 // with no g, so gp could nil. The other nil checks are done out of 5503 // caution, but not expected to be nil in practice. 5504 var tagPtr *unsafe.Pointer 5505 if gp != nil && gp.m != nil && gp.m.curg != nil { 5506 tagPtr = &gp.m.curg.labels 5507 } 5508 cpuprof.add(tagPtr, stk[:n]) 5509 5510 gprof := gp 5511 var mp *m 5512 var pp *p 5513 if gp != nil && gp.m != nil { 5514 if gp.m.curg != nil { 5515 gprof = gp.m.curg 5516 } 5517 mp = gp.m 5518 pp = gp.m.p.ptr() 5519 } 5520 traceCPUSample(gprof, mp, pp, stk[:n]) 5521 } 5522 getg().m.mallocing-- 5523} 5524 5525// setcpuprofilerate sets the CPU profiling rate to hz times per second. 5526// If hz <= 0, setcpuprofilerate turns off CPU profiling. 5527func setcpuprofilerate(hz int32) { 5528 // Force sane arguments. 5529 if hz < 0 { 5530 hz = 0 5531 } 5532 5533 // Disable preemption, otherwise we can be rescheduled to another thread 5534 // that has profiling enabled. 5535 gp := getg() 5536 gp.m.locks++ 5537 5538 // Stop profiler on this thread so that it is safe to lock prof. 5539 // if a profiling signal came in while we had prof locked, 5540 // it would deadlock. 
5541 setThreadCPUProfiler(0) 5542 5543 for !prof.signalLock.CompareAndSwap(0, 1) { 5544 osyield() 5545 } 5546 if prof.hz.Load() != hz { 5547 setProcessCPUProfiler(hz) 5548 prof.hz.Store(hz) 5549 } 5550 prof.signalLock.Store(0) 5551 5552 lock(&sched.lock) 5553 sched.profilehz = hz 5554 unlock(&sched.lock) 5555 5556 if hz != 0 { 5557 setThreadCPUProfiler(hz) 5558 } 5559 5560 gp.m.locks-- 5561} 5562 5563// init initializes pp, which may be a freshly allocated p or a 5564// previously destroyed p, and transitions it to status _Pgcstop. 5565func (pp *p) init(id int32) { 5566 pp.id = id 5567 pp.status = _Pgcstop 5568 pp.sudogcache = pp.sudogbuf[:0] 5569 pp.deferpool = pp.deferpoolbuf[:0] 5570 pp.wbBuf.reset() 5571 if pp.mcache == nil { 5572 if id == 0 { 5573 if mcache0 == nil { 5574 throw("missing mcache?") 5575 } 5576 // Use the bootstrap mcache0. Only one P will get 5577 // mcache0: the one with ID 0. 5578 pp.mcache = mcache0 5579 } else { 5580 pp.mcache = allocmcache() 5581 } 5582 } 5583 if raceenabled && pp.raceprocctx == 0 { 5584 if id == 0 { 5585 pp.raceprocctx = raceprocctx0 5586 raceprocctx0 = 0 // bootstrap 5587 } else { 5588 pp.raceprocctx = raceproccreate() 5589 } 5590 } 5591 lockInit(&pp.timers.mu, lockRankTimers) 5592 5593 // This P may get timers when it starts running. Set the mask here 5594 // since the P may not go through pidleget (notably P 0 on startup). 5595 timerpMask.set(id) 5596 // Similarly, we may not go through pidleget before this P starts 5597 // running if it is P 0 on startup. 5598 idlepMask.clear(id) 5599} 5600 5601// destroy releases all of the resources associated with pp and 5602// transitions it to status _Pdead. 5603// 5604// sched.lock must be held and the world must be stopped. 5605func (pp *p) destroy() { 5606 assertLockHeld(&sched.lock) 5607 assertWorldStopped() 5608 5609 // Move all runnable goroutines to the global queue 5610 for pp.runqhead != pp.runqtail { 5611 // Pop from tail of local queue 5612 pp.runqtail-- 5613 gp := pp.runq[pp.runqtail%uint32(len(pp.runq))].ptr() 5614 // Push onto head of global queue 5615 globrunqputhead(gp) 5616 } 5617 if pp.runnext != 0 { 5618 globrunqputhead(pp.runnext.ptr()) 5619 pp.runnext = 0 5620 } 5621 5622 // Move all timers to the local P. 5623 getg().m.p.ptr().timers.take(&pp.timers) 5624 5625 // Flush p's write barrier buffer. 5626 if gcphase != _GCoff { 5627 wbBufFlush1(pp) 5628 pp.gcw.dispose() 5629 } 5630 for i := range pp.sudogbuf { 5631 pp.sudogbuf[i] = nil 5632 } 5633 pp.sudogcache = pp.sudogbuf[:0] 5634 pp.pinnerCache = nil 5635 for j := range pp.deferpoolbuf { 5636 pp.deferpoolbuf[j] = nil 5637 } 5638 pp.deferpool = pp.deferpoolbuf[:0] 5639 systemstack(func() { 5640 for i := 0; i < pp.mspancache.len; i++ { 5641 // Safe to call since the world is stopped. 5642 mheap_.spanalloc.free(unsafe.Pointer(pp.mspancache.buf[i])) 5643 } 5644 pp.mspancache.len = 0 5645 lock(&mheap_.lock) 5646 pp.pcache.flush(&mheap_.pages) 5647 unlock(&mheap_.lock) 5648 }) 5649 freemcache(pp.mcache) 5650 pp.mcache = nil 5651 gfpurge(pp) 5652 if raceenabled { 5653 if pp.timers.raceCtx != 0 { 5654 // The race detector code uses a callback to fetch 5655 // the proc context, so arrange for that callback 5656 // to see the right thing. 5657 // This hack only works because we are the only 5658 // thread running. 
5659 mp := getg().m 5660 phold := mp.p.ptr() 5661 mp.p.set(pp) 5662 5663 racectxend(pp.timers.raceCtx) 5664 pp.timers.raceCtx = 0 5665 5666 mp.p.set(phold) 5667 } 5668 raceprocdestroy(pp.raceprocctx) 5669 pp.raceprocctx = 0 5670 } 5671 pp.gcAssistTime = 0 5672 pp.status = _Pdead 5673} 5674 5675// Change number of processors. 5676// 5677// sched.lock must be held, and the world must be stopped. 5678// 5679// gcworkbufs must not be being modified by either the GC or the write barrier 5680// code, so the GC must not be running if the number of Ps actually changes. 5681// 5682// Returns list of Ps with local work, they need to be scheduled by the caller. 5683func procresize(nprocs int32) *p { 5684 assertLockHeld(&sched.lock) 5685 assertWorldStopped() 5686 5687 old := gomaxprocs 5688 if old < 0 || nprocs <= 0 { 5689 throw("procresize: invalid arg") 5690 } 5691 trace := traceAcquire() 5692 if trace.ok() { 5693 trace.Gomaxprocs(nprocs) 5694 traceRelease(trace) 5695 } 5696 5697 // update statistics 5698 now := nanotime() 5699 if sched.procresizetime != 0 { 5700 sched.totaltime += int64(old) * (now - sched.procresizetime) 5701 } 5702 sched.procresizetime = now 5703 5704 maskWords := (nprocs + 31) / 32 5705 5706 // Grow allp if necessary. 5707 if nprocs > int32(len(allp)) { 5708 // Synchronize with retake, which could be running 5709 // concurrently since it doesn't run on a P. 5710 lock(&allpLock) 5711 if nprocs <= int32(cap(allp)) { 5712 allp = allp[:nprocs] 5713 } else { 5714 nallp := make([]*p, nprocs) 5715 // Copy everything up to allp's cap so we 5716 // never lose old allocated Ps. 5717 copy(nallp, allp[:cap(allp)]) 5718 allp = nallp 5719 } 5720 5721 if maskWords <= int32(cap(idlepMask)) { 5722 idlepMask = idlepMask[:maskWords] 5723 timerpMask = timerpMask[:maskWords] 5724 } else { 5725 nidlepMask := make([]uint32, maskWords) 5726 // No need to copy beyond len, old Ps are irrelevant. 5727 copy(nidlepMask, idlepMask) 5728 idlepMask = nidlepMask 5729 5730 ntimerpMask := make([]uint32, maskWords) 5731 copy(ntimerpMask, timerpMask) 5732 timerpMask = ntimerpMask 5733 } 5734 unlock(&allpLock) 5735 } 5736 5737 // initialize new P's 5738 for i := old; i < nprocs; i++ { 5739 pp := allp[i] 5740 if pp == nil { 5741 pp = new(p) 5742 } 5743 pp.init(i) 5744 atomicstorep(unsafe.Pointer(&allp[i]), unsafe.Pointer(pp)) 5745 } 5746 5747 gp := getg() 5748 if gp.m.p != 0 && gp.m.p.ptr().id < nprocs { 5749 // continue to use the current P 5750 gp.m.p.ptr().status = _Prunning 5751 gp.m.p.ptr().mcache.prepareForSweep() 5752 } else { 5753 // release the current P and acquire allp[0]. 5754 // 5755 // We must do this before destroying our current P 5756 // because p.destroy itself has write barriers, so we 5757 // need to do that from a valid P. 5758 if gp.m.p != 0 { 5759 trace := traceAcquire() 5760 if trace.ok() { 5761 // Pretend that we were descheduled 5762 // and then scheduled again to keep 5763 // the trace consistent. 5764 trace.GoSched() 5765 trace.ProcStop(gp.m.p.ptr()) 5766 traceRelease(trace) 5767 } 5768 gp.m.p.ptr().m = 0 5769 } 5770 gp.m.p = 0 5771 pp := allp[0] 5772 pp.m = 0 5773 pp.status = _Pidle 5774 acquirep(pp) 5775 trace := traceAcquire() 5776 if trace.ok() { 5777 trace.GoStart() 5778 traceRelease(trace) 5779 } 5780 } 5781 5782 // g.m.p is now set, so we no longer need mcache0 for bootstrapping. 
5783 mcache0 = nil 5784 5785 // release resources from unused P's 5786 for i := nprocs; i < old; i++ { 5787 pp := allp[i] 5788 pp.destroy() 5789 // can't free P itself because it can be referenced by an M in syscall 5790 } 5791 5792 // Trim allp. 5793 if int32(len(allp)) != nprocs { 5794 lock(&allpLock) 5795 allp = allp[:nprocs] 5796 idlepMask = idlepMask[:maskWords] 5797 timerpMask = timerpMask[:maskWords] 5798 unlock(&allpLock) 5799 } 5800 5801 var runnablePs *p 5802 for i := nprocs - 1; i >= 0; i-- { 5803 pp := allp[i] 5804 if gp.m.p.ptr() == pp { 5805 continue 5806 } 5807 pp.status = _Pidle 5808 if runqempty(pp) { 5809 pidleput(pp, now) 5810 } else { 5811 pp.m.set(mget()) 5812 pp.link.set(runnablePs) 5813 runnablePs = pp 5814 } 5815 } 5816 stealOrder.reset(uint32(nprocs)) 5817 var int32p *int32 = &gomaxprocs // make compiler check that gomaxprocs is an int32 5818 atomic.Store((*uint32)(unsafe.Pointer(int32p)), uint32(nprocs)) 5819 if old != nprocs { 5820 // Notify the limiter that the amount of procs has changed. 5821 gcCPULimiter.resetCapacity(now, nprocs) 5822 } 5823 return runnablePs 5824} 5825 5826// Associate p and the current m. 5827// 5828// This function is allowed to have write barriers even if the caller 5829// isn't because it immediately acquires pp. 5830// 5831//go:yeswritebarrierrec 5832func acquirep(pp *p) { 5833 // Do the part that isn't allowed to have write barriers. 5834 wirep(pp) 5835 5836 // Have p; write barriers now allowed. 5837 5838 // Perform deferred mcache flush before this P can allocate 5839 // from a potentially stale mcache. 5840 pp.mcache.prepareForSweep() 5841 5842 trace := traceAcquire() 5843 if trace.ok() { 5844 trace.ProcStart() 5845 traceRelease(trace) 5846 } 5847} 5848 5849// wirep is the first step of acquirep, which actually associates the 5850// current M to pp. This is broken out so we can disallow write 5851// barriers for this part, since we don't yet have a P. 5852// 5853//go:nowritebarrierrec 5854//go:nosplit 5855func wirep(pp *p) { 5856 gp := getg() 5857 5858 if gp.m.p != 0 { 5859 // Call on the systemstack to avoid a nosplit overflow build failure 5860 // on some platforms when built with -N -l. See #64113. 5861 systemstack(func() { 5862 throw("wirep: already in go") 5863 }) 5864 } 5865 if pp.m != 0 || pp.status != _Pidle { 5866 // Call on the systemstack to avoid a nosplit overflow build failure 5867 // on some platforms when built with -N -l. See #64113. 5868 systemstack(func() { 5869 id := int64(0) 5870 if pp.m != 0 { 5871 id = pp.m.ptr().id 5872 } 5873 print("wirep: p->m=", pp.m, "(", id, ") p->status=", pp.status, "\n") 5874 throw("wirep: invalid p state") 5875 }) 5876 } 5877 gp.m.p.set(pp) 5878 pp.m.set(gp.m) 5879 pp.status = _Prunning 5880} 5881 5882// Disassociate p and the current m. 5883func releasep() *p { 5884 trace := traceAcquire() 5885 if trace.ok() { 5886 trace.ProcStop(getg().m.p.ptr()) 5887 traceRelease(trace) 5888 } 5889 return releasepNoTrace() 5890} 5891 5892// Disassociate p and the current m without tracing an event. 
5893func releasepNoTrace() *p { 5894 gp := getg() 5895 5896 if gp.m.p == 0 { 5897 throw("releasep: invalid arg") 5898 } 5899 pp := gp.m.p.ptr() 5900 if pp.m.ptr() != gp.m || pp.status != _Prunning { 5901 print("releasep: m=", gp.m, " m->p=", gp.m.p.ptr(), " p->m=", hex(pp.m), " p->status=", pp.status, "\n") 5902 throw("releasep: invalid p state") 5903 } 5904 gp.m.p = 0 5905 pp.m = 0 5906 pp.status = _Pidle 5907 return pp 5908} 5909 5910func incidlelocked(v int32) { 5911 lock(&sched.lock) 5912 sched.nmidlelocked += v 5913 if v > 0 { 5914 checkdead() 5915 } 5916 unlock(&sched.lock) 5917} 5918 5919// Check for deadlock situation. 5920// The check is based on number of running M's, if 0 -> deadlock. 5921// sched.lock must be held. 5922func checkdead() { 5923 assertLockHeld(&sched.lock) 5924 5925 // For -buildmode=c-shared or -buildmode=c-archive it's OK if 5926 // there are no running goroutines. The calling program is 5927 // assumed to be running. 5928 if islibrary || isarchive { 5929 return 5930 } 5931 5932 // If we are dying because of a signal caught on an already idle thread, 5933 // freezetheworld will cause all running threads to block. 5934 // And runtime will essentially enter into deadlock state, 5935 // except that there is a thread that will call exit soon. 5936 if panicking.Load() > 0 { 5937 return 5938 } 5939 5940 // If we are not running under cgo, but we have an extra M then account 5941 // for it. (It is possible to have an extra M on Windows without cgo to 5942 // accommodate callbacks created by syscall.NewCallback. See issue #6751 5943 // for details.) 5944 var run0 int32 5945 if !iscgo && cgoHasExtraM && extraMLength.Load() > 0 { 5946 run0 = 1 5947 } 5948 5949 run := mcount() - sched.nmidle - sched.nmidlelocked - sched.nmsys 5950 if run > run0 { 5951 return 5952 } 5953 if run < 0 { 5954 print("runtime: checkdead: nmidle=", sched.nmidle, " nmidlelocked=", sched.nmidlelocked, " mcount=", mcount(), " nmsys=", sched.nmsys, "\n") 5955 unlock(&sched.lock) 5956 throw("checkdead: inconsistent counts") 5957 } 5958 5959 grunning := 0 5960 forEachG(func(gp *g) { 5961 if isSystemGoroutine(gp, false) { 5962 return 5963 } 5964 s := readgstatus(gp) 5965 switch s &^ _Gscan { 5966 case _Gwaiting, 5967 _Gpreempted: 5968 grunning++ 5969 case _Grunnable, 5970 _Grunning, 5971 _Gsyscall: 5972 print("runtime: checkdead: find g ", gp.goid, " in status ", s, "\n") 5973 unlock(&sched.lock) 5974 throw("checkdead: runnable g") 5975 } 5976 }) 5977 if grunning == 0 { // possible if main goroutine calls runtime·Goexit() 5978 unlock(&sched.lock) // unlock so that GODEBUG=scheddetail=1 doesn't hang 5979 fatal("no goroutines (main called runtime.Goexit) - deadlock!") 5980 } 5981 5982 // Maybe jump time forward for playground. 5983 if faketime != 0 { 5984 if when := timeSleepUntil(); when < maxWhen { 5985 faketime = when 5986 5987 // Start an M to steal the timer. 5988 pp, _ := pidleget(faketime) 5989 if pp == nil { 5990 // There should always be a free P since 5991 // nothing is running. 5992 unlock(&sched.lock) 5993 throw("checkdead: no p for timer") 5994 } 5995 mp := mget() 5996 if mp == nil { 5997 // There should always be a free M since 5998 // nothing is running. 5999 unlock(&sched.lock) 6000 throw("checkdead: no m for timer") 6001 } 6002 // M must be spinning to steal. We set this to be 6003 // explicit, but since this is the only M it would 6004 // become spinning on its own anyways. 
6005 sched.nmspinning.Add(1) 6006 mp.spinning = true 6007 mp.nextp.set(pp) 6008 notewakeup(&mp.park) 6009 return 6010 } 6011 } 6012 6013 // There are no goroutines running, so we can look at the P's. 6014 for _, pp := range allp { 6015 if len(pp.timers.heap) > 0 { 6016 return 6017 } 6018 } 6019 6020 unlock(&sched.lock) // unlock so that GODEBUG=scheddetail=1 doesn't hang 6021 fatal("all goroutines are asleep - deadlock!") 6022} 6023 6024// forcegcperiod is the maximum time in nanoseconds between garbage 6025// collections. If we go this long without a garbage collection, one 6026// is forced to run. 6027// 6028// This is a variable for testing purposes. It normally doesn't change. 6029var forcegcperiod int64 = 2 * 60 * 1e9 6030 6031// needSysmonWorkaround is true if the workaround for 6032// golang.org/issue/42515 is needed on NetBSD. 6033var needSysmonWorkaround bool = false 6034 6035// haveSysmon indicates whether there is sysmon thread support. 6036// 6037// No threads on wasm yet, so no sysmon. 6038const haveSysmon = GOARCH != "wasm" 6039 6040// Always runs without a P, so write barriers are not allowed. 6041// 6042//go:nowritebarrierrec 6043func sysmon() { 6044 lock(&sched.lock) 6045 sched.nmsys++ 6046 checkdead() 6047 unlock(&sched.lock) 6048 6049 lasttrace := int64(0) 6050 idle := 0 // how many cycles in succession we had not wokeup somebody 6051 delay := uint32(0) 6052 6053 for { 6054 if idle == 0 { // start with 20us sleep... 6055 delay = 20 6056 } else if idle > 50 { // start doubling the sleep after 1ms... 6057 delay *= 2 6058 } 6059 if delay > 10*1000 { // up to 10ms 6060 delay = 10 * 1000 6061 } 6062 usleep(delay) 6063 6064 // sysmon should not enter deep sleep if schedtrace is enabled so that 6065 // it can print that information at the right time. 6066 // 6067 // It should also not enter deep sleep if there are any active P's so 6068 // that it can retake P's from syscalls, preempt long running G's, and 6069 // poll the network if all P's are busy for long stretches. 6070 // 6071 // It should wakeup from deep sleep if any P's become active either due 6072 // to exiting a syscall or waking up due to a timer expiring so that it 6073 // can resume performing those duties. If it wakes from a syscall it 6074 // resets idle and delay as a bet that since it had retaken a P from a 6075 // syscall before, it may need to do it again shortly after the 6076 // application starts work again. It does not reset idle when waking 6077 // from a timer to avoid adding system load to applications that spend 6078 // most of their time sleeping. 6079 now := nanotime() 6080 if debug.schedtrace <= 0 && (sched.gcwaiting.Load() || sched.npidle.Load() == gomaxprocs) { 6081 lock(&sched.lock) 6082 if sched.gcwaiting.Load() || sched.npidle.Load() == gomaxprocs { 6083 syscallWake := false 6084 next := timeSleepUntil() 6085 if next > now { 6086 sched.sysmonwait.Store(true) 6087 unlock(&sched.lock) 6088 // Make wake-up period small enough 6089 // for the sampling to be correct. 
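			// Concretely (illustrative arithmetic): forcegcperiod is
			// 2 minutes, so the deep sleep below is capped at
			// forcegcperiod/2 = 60s, and is shortened to next-now
			// when a timer is due sooner than that.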
6090 sleep := forcegcperiod / 2 6091 if next-now < sleep { 6092 sleep = next - now 6093 } 6094 shouldRelax := sleep >= osRelaxMinNS 6095 if shouldRelax { 6096 osRelax(true) 6097 } 6098 syscallWake = notetsleep(&sched.sysmonnote, sleep) 6099 if shouldRelax { 6100 osRelax(false) 6101 } 6102 lock(&sched.lock) 6103 sched.sysmonwait.Store(false) 6104 noteclear(&sched.sysmonnote) 6105 } 6106 if syscallWake { 6107 idle = 0 6108 delay = 20 6109 } 6110 } 6111 unlock(&sched.lock) 6112 } 6113 6114 lock(&sched.sysmonlock) 6115 // Update now in case we blocked on sysmonnote or spent a long time 6116 // blocked on schedlock or sysmonlock above. 6117 now = nanotime() 6118 6119 // trigger libc interceptors if needed 6120 if *cgo_yield != nil { 6121 asmcgocall(*cgo_yield, nil) 6122 } 6123 // poll network if not polled for more than 10ms 6124 lastpoll := sched.lastpoll.Load() 6125 if netpollinited() && lastpoll != 0 && lastpoll+10*1000*1000 < now { 6126 sched.lastpoll.CompareAndSwap(lastpoll, now) 6127 list, delta := netpoll(0) // non-blocking - returns list of goroutines 6128 if !list.empty() { 6129 // Need to decrement number of idle locked M's 6130 // (pretending that one more is running) before injectglist. 6131 // Otherwise it can lead to the following situation: 6132 // injectglist grabs all P's but before it starts M's to run the P's, 6133 // another M returns from syscall, finishes running its G, 6134 // observes that there is no work to do and no other running M's 6135 // and reports deadlock. 6136 incidlelocked(-1) 6137 injectglist(&list) 6138 incidlelocked(1) 6139 netpollAdjustWaiters(delta) 6140 } 6141 } 6142 if GOOS == "netbsd" && needSysmonWorkaround { 6143 // netpoll is responsible for waiting for timer 6144 // expiration, so we typically don't have to worry 6145 // about starting an M to service timers. (Note that 6146 // sleep for timeSleepUntil above simply ensures sysmon 6147 // starts running again when that timer expiration may 6148 // cause Go code to run again). 6149 // 6150 // However, netbsd has a kernel bug that sometimes 6151 // misses netpollBreak wake-ups, which can lead to 6152 // unbounded delays servicing timers. If we detect this 6153 // overrun, then startm to get something to handle the 6154 // timer. 6155 // 6156 // See issue 42515 and 6157 // https://gnats.netbsd.org/cgi-bin/query-pr-single.pl?number=50094. 6158 if next := timeSleepUntil(); next < now { 6159 startm(nil, false, false) 6160 } 6161 } 6162 if scavenger.sysmonWake.Load() != 0 { 6163 // Kick the scavenger awake if someone requested it. 6164 scavenger.wake() 6165 } 6166 // retake P's blocked in syscalls 6167 // and preempt long running G's 6168 if retake(now) != 0 { 6169 idle = 0 6170 } else { 6171 idle++ 6172 } 6173 // check if we need to force a GC 6174 if t := (gcTrigger{kind: gcTriggerTime, now: now}); t.test() && forcegc.idle.Load() { 6175 lock(&forcegc.lock) 6176 forcegc.idle.Store(false) 6177 var list gList 6178 list.push(forcegc.g) 6179 injectglist(&list) 6180 unlock(&forcegc.lock) 6181 } 6182 if debug.schedtrace > 0 && lasttrace+int64(debug.schedtrace)*1000000 <= now { 6183 lasttrace = now 6184 schedtrace(debug.scheddetail > 0) 6185 } 6186 unlock(&sched.sysmonlock) 6187 } 6188} 6189 6190type sysmontick struct { 6191 schedtick uint32 6192 syscalltick uint32 6193 schedwhen int64 6194 syscallwhen int64 6195} 6196 6197// forcePreemptNS is the time slice given to a G before it is 6198// preempted. 
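// For illustration (a sketch, not part of the runtime): on platforms with
// asynchronous preemption (preemptMSupported, Go 1.14 and later), even a loop
// with no function calls is preempted after roughly this time slice, which is
// why a program like the one below still makes progress on a single P; with
// GODEBUG=asyncpreemptoff=1 it would typically hang instead:
//
//	package main
//
//	import (
//		"fmt"
//		"runtime"
//		"time"
//	)
//
//	func main() {
//		runtime.GOMAXPROCS(1)
//		go func() {
//			for { // busy loop with no preemption points of its own
//			}
//		}()
//		time.Sleep(100 * time.Millisecond) // plenty of 10ms slices
//		fmt.Println("main goroutine still runs")
//	}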
6199const forcePreemptNS = 10 * 1000 * 1000 // 10ms 6200 6201func retake(now int64) uint32 { 6202 n := 0 6203 // Prevent allp slice changes. This lock will be completely 6204 // uncontended unless we're already stopping the world. 6205 lock(&allpLock) 6206 // We can't use a range loop over allp because we may 6207 // temporarily drop the allpLock. Hence, we need to re-fetch 6208 // allp each time around the loop. 6209 for i := 0; i < len(allp); i++ { 6210 pp := allp[i] 6211 if pp == nil { 6212 // This can happen if procresize has grown 6213 // allp but not yet created new Ps. 6214 continue 6215 } 6216 pd := &pp.sysmontick 6217 s := pp.status 6218 sysretake := false 6219 if s == _Prunning || s == _Psyscall { 6220 // Preempt G if it's running on the same schedtick for 6221 // too long. This could be from a single long-running 6222 // goroutine or a sequence of goroutines run via 6223 // runnext, which share a single schedtick time slice. 6224 t := int64(pp.schedtick) 6225 if int64(pd.schedtick) != t { 6226 pd.schedtick = uint32(t) 6227 pd.schedwhen = now 6228 } else if pd.schedwhen+forcePreemptNS <= now { 6229 preemptone(pp) 6230 // In case of syscall, preemptone() doesn't 6231 // work, because there is no M wired to P. 6232 sysretake = true 6233 } 6234 } 6235 if s == _Psyscall { 6236 // Retake P from syscall if it's there for more than 1 sysmon tick (at least 20us). 6237 t := int64(pp.syscalltick) 6238 if !sysretake && int64(pd.syscalltick) != t { 6239 pd.syscalltick = uint32(t) 6240 pd.syscallwhen = now 6241 continue 6242 } 6243 // On the one hand we don't want to retake Ps if there is no other work to do, 6244 // but on the other hand we want to retake them eventually 6245 // because they can prevent the sysmon thread from deep sleep. 6246 if runqempty(pp) && sched.nmspinning.Load()+sched.npidle.Load() > 0 && pd.syscallwhen+10*1000*1000 > now { 6247 continue 6248 } 6249 // Drop allpLock so we can take sched.lock. 6250 unlock(&allpLock) 6251 // Need to decrement number of idle locked M's 6252 // (pretending that one more is running) before the CAS. 6253 // Otherwise the M from which we retake can exit the syscall, 6254 // increment nmidle and report deadlock. 6255 incidlelocked(-1) 6256 trace := traceAcquire() 6257 if atomic.Cas(&pp.status, s, _Pidle) { 6258 if trace.ok() { 6259 trace.ProcSteal(pp, false) 6260 traceRelease(trace) 6261 } 6262 n++ 6263 pp.syscalltick++ 6264 handoffp(pp) 6265 } else if trace.ok() { 6266 traceRelease(trace) 6267 } 6268 incidlelocked(1) 6269 lock(&allpLock) 6270 } 6271 } 6272 unlock(&allpLock) 6273 return uint32(n) 6274} 6275 6276// Tell all goroutines that they have been preempted and they should stop. 6277// This function is purely best-effort. It can fail to inform a goroutine if a 6278// processor just started running it. 6279// No locks need to be held. 6280// Returns true if preemption request was issued to at least one goroutine. 6281func preemptall() bool { 6282 res := false 6283 for _, pp := range allp { 6284 if pp.status != _Prunning { 6285 continue 6286 } 6287 if preemptone(pp) { 6288 res = true 6289 } 6290 } 6291 return res 6292} 6293 6294// Tell the goroutine running on processor P to stop. 6295// This function is purely best-effort. It can incorrectly fail to inform the 6296// goroutine. It can inform the wrong goroutine. Even if it informs the 6297// correct goroutine, that goroutine might ignore the request if it is 6298// simultaneously executing newstack. 6299// No lock needs to be held. 6300// Returns true if preemption request was issued. 
6301// The actual preemption will happen at some point in the future 6302// and will be indicated by the gp->status no longer being 6303// Grunning 6304func preemptone(pp *p) bool { 6305 mp := pp.m.ptr() 6306 if mp == nil || mp == getg().m { 6307 return false 6308 } 6309 gp := mp.curg 6310 if gp == nil || gp == mp.g0 { 6311 return false 6312 } 6313 6314 gp.preempt = true 6315 6316 // Every call in a goroutine checks for stack overflow by 6317 // comparing the current stack pointer to gp->stackguard0. 6318 // Setting gp->stackguard0 to StackPreempt folds 6319 // preemption into the normal stack overflow check. 6320 gp.stackguard0 = stackPreempt 6321 6322 // Request an async preemption of this P. 6323 if preemptMSupported && debug.asyncpreemptoff == 0 { 6324 pp.preempt = true 6325 preemptM(mp) 6326 } 6327 6328 return true 6329} 6330 6331var starttime int64 6332 6333func schedtrace(detailed bool) { 6334 now := nanotime() 6335 if starttime == 0 { 6336 starttime = now 6337 } 6338 6339 lock(&sched.lock) 6340 print("SCHED ", (now-starttime)/1e6, "ms: gomaxprocs=", gomaxprocs, " idleprocs=", sched.npidle.Load(), " threads=", mcount(), " spinningthreads=", sched.nmspinning.Load(), " needspinning=", sched.needspinning.Load(), " idlethreads=", sched.nmidle, " runqueue=", sched.runqsize) 6341 if detailed { 6342 print(" gcwaiting=", sched.gcwaiting.Load(), " nmidlelocked=", sched.nmidlelocked, " stopwait=", sched.stopwait, " sysmonwait=", sched.sysmonwait.Load(), "\n") 6343 } 6344 // We must be careful while reading data from P's, M's and G's. 6345 // Even if we hold schedlock, most data can be changed concurrently. 6346 // E.g. (p->m ? p->m->id : -1) can crash if p->m changes from non-nil to nil. 6347 for i, pp := range allp { 6348 mp := pp.m.ptr() 6349 h := atomic.Load(&pp.runqhead) 6350 t := atomic.Load(&pp.runqtail) 6351 if detailed { 6352 print(" P", i, ": status=", pp.status, " schedtick=", pp.schedtick, " syscalltick=", pp.syscalltick, " m=") 6353 if mp != nil { 6354 print(mp.id) 6355 } else { 6356 print("nil") 6357 } 6358 print(" runqsize=", t-h, " gfreecnt=", pp.gFree.n, " timerslen=", len(pp.timers.heap), "\n") 6359 } else { 6360 // In non-detailed mode format lengths of per-P run queues as: 6361 // [len1 len2 len3 len4] 6362 print(" ") 6363 if i == 0 { 6364 print("[") 6365 } 6366 print(t - h) 6367 if i == len(allp)-1 { 6368 print("]\n") 6369 } 6370 } 6371 } 6372 6373 if !detailed { 6374 unlock(&sched.lock) 6375 return 6376 } 6377 6378 for mp := allm; mp != nil; mp = mp.alllink { 6379 pp := mp.p.ptr() 6380 print(" M", mp.id, ": p=") 6381 if pp != nil { 6382 print(pp.id) 6383 } else { 6384 print("nil") 6385 } 6386 print(" curg=") 6387 if mp.curg != nil { 6388 print(mp.curg.goid) 6389 } else { 6390 print("nil") 6391 } 6392 print(" mallocing=", mp.mallocing, " throwing=", mp.throwing, " preemptoff=", mp.preemptoff, " locks=", mp.locks, " dying=", mp.dying, " spinning=", mp.spinning, " blocked=", mp.blocked, " lockedg=") 6393 if lockedg := mp.lockedg.ptr(); lockedg != nil { 6394 print(lockedg.goid) 6395 } else { 6396 print("nil") 6397 } 6398 print("\n") 6399 } 6400 6401 forEachG(func(gp *g) { 6402 print(" G", gp.goid, ": status=", readgstatus(gp), "(", gp.waitreason.String(), ") m=") 6403 if gp.m != nil { 6404 print(gp.m.id) 6405 } else { 6406 print("nil") 6407 } 6408 print(" lockedm=") 6409 if lockedm := gp.lockedm.ptr(); lockedm != nil { 6410 print(lockedm.id) 6411 } else { 6412 print("nil") 6413 } 6414 print("\n") 6415 }) 6416 unlock(&sched.lock) 6417} 6418 6419// schedEnableUser enables or 
disables the scheduling of user 6420// goroutines. 6421// 6422// This does not stop already running user goroutines, so the caller 6423// should first stop the world when disabling user goroutines. 6424func schedEnableUser(enable bool) { 6425 lock(&sched.lock) 6426 if sched.disable.user == !enable { 6427 unlock(&sched.lock) 6428 return 6429 } 6430 sched.disable.user = !enable 6431 if enable { 6432 n := sched.disable.n 6433 sched.disable.n = 0 6434 globrunqputbatch(&sched.disable.runnable, n) 6435 unlock(&sched.lock) 6436 for ; n != 0 && sched.npidle.Load() != 0; n-- { 6437 startm(nil, false, false) 6438 } 6439 } else { 6440 unlock(&sched.lock) 6441 } 6442} 6443 6444// schedEnabled reports whether gp should be scheduled. It returns 6445// false is scheduling of gp is disabled. 6446// 6447// sched.lock must be held. 6448func schedEnabled(gp *g) bool { 6449 assertLockHeld(&sched.lock) 6450 6451 if sched.disable.user { 6452 return isSystemGoroutine(gp, true) 6453 } 6454 return true 6455} 6456 6457// Put mp on midle list. 6458// sched.lock must be held. 6459// May run during STW, so write barriers are not allowed. 6460// 6461//go:nowritebarrierrec 6462func mput(mp *m) { 6463 assertLockHeld(&sched.lock) 6464 6465 mp.schedlink = sched.midle 6466 sched.midle.set(mp) 6467 sched.nmidle++ 6468 checkdead() 6469} 6470 6471// Try to get an m from midle list. 6472// sched.lock must be held. 6473// May run during STW, so write barriers are not allowed. 6474// 6475//go:nowritebarrierrec 6476func mget() *m { 6477 assertLockHeld(&sched.lock) 6478 6479 mp := sched.midle.ptr() 6480 if mp != nil { 6481 sched.midle = mp.schedlink 6482 sched.nmidle-- 6483 } 6484 return mp 6485} 6486 6487// Put gp on the global runnable queue. 6488// sched.lock must be held. 6489// May run during STW, so write barriers are not allowed. 6490// 6491//go:nowritebarrierrec 6492func globrunqput(gp *g) { 6493 assertLockHeld(&sched.lock) 6494 6495 sched.runq.pushBack(gp) 6496 sched.runqsize++ 6497} 6498 6499// Put gp at the head of the global runnable queue. 6500// sched.lock must be held. 6501// May run during STW, so write barriers are not allowed. 6502// 6503//go:nowritebarrierrec 6504func globrunqputhead(gp *g) { 6505 assertLockHeld(&sched.lock) 6506 6507 sched.runq.push(gp) 6508 sched.runqsize++ 6509} 6510 6511// Put a batch of runnable goroutines on the global runnable queue. 6512// This clears *batch. 6513// sched.lock must be held. 6514// May run during STW, so write barriers are not allowed. 6515// 6516//go:nowritebarrierrec 6517func globrunqputbatch(batch *gQueue, n int32) { 6518 assertLockHeld(&sched.lock) 6519 6520 sched.runq.pushBackAll(*batch) 6521 sched.runqsize += n 6522 *batch = gQueue{} 6523} 6524 6525// Try get a batch of G's from the global runnable queue. 6526// sched.lock must be held. 6527func globrunqget(pp *p, max int32) *g { 6528 assertLockHeld(&sched.lock) 6529 6530 if sched.runqsize == 0 { 6531 return nil 6532 } 6533 6534 n := sched.runqsize/gomaxprocs + 1 6535 if n > sched.runqsize { 6536 n = sched.runqsize 6537 } 6538 if max > 0 && n > max { 6539 n = max 6540 } 6541 if n > int32(len(pp.runq))/2 { 6542 n = int32(len(pp.runq)) / 2 6543 } 6544 6545 sched.runqsize -= n 6546 6547 gp := sched.runq.pop() 6548 n-- 6549 for ; n > 0; n-- { 6550 gp1 := sched.runq.pop() 6551 runqput(pp, gp1, false) 6552 } 6553 return gp 6554} 6555 6556// pMask is an atomic bitstring with one bit per P. 6557type pMask []uint32 6558 6559// read returns true if P id's bit is set. 
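// For example (illustrative arithmetic): read(40) inspects bit 8 of p[1],
// since word = 40/32 = 1 and mask = 1<<(40%32) = 1<<8.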
6560func (p pMask) read(id uint32) bool { 6561 word := id / 32 6562 mask := uint32(1) << (id % 32) 6563 return (atomic.Load(&p[word]) & mask) != 0 6564} 6565 6566// set sets P id's bit. 6567func (p pMask) set(id int32) { 6568 word := id / 32 6569 mask := uint32(1) << (id % 32) 6570 atomic.Or(&p[word], mask) 6571} 6572 6573// clear clears P id's bit. 6574func (p pMask) clear(id int32) { 6575 word := id / 32 6576 mask := uint32(1) << (id % 32) 6577 atomic.And(&p[word], ^mask) 6578} 6579 6580// pidleput puts p on the _Pidle list. now must be a relatively recent call 6581// to nanotime or zero. Returns now or the current time if now was zero. 6582// 6583// This releases ownership of p. Once sched.lock is released it is no longer 6584// safe to use p. 6585// 6586// sched.lock must be held. 6587// 6588// May run during STW, so write barriers are not allowed. 6589// 6590//go:nowritebarrierrec 6591func pidleput(pp *p, now int64) int64 { 6592 assertLockHeld(&sched.lock) 6593 6594 if !runqempty(pp) { 6595 throw("pidleput: P has non-empty run queue") 6596 } 6597 if now == 0 { 6598 now = nanotime() 6599 } 6600 if pp.timers.len.Load() == 0 { 6601 timerpMask.clear(pp.id) 6602 } 6603 idlepMask.set(pp.id) 6604 pp.link = sched.pidle 6605 sched.pidle.set(pp) 6606 sched.npidle.Add(1) 6607 if !pp.limiterEvent.start(limiterEventIdle, now) { 6608 throw("must be able to track idle limiter event") 6609 } 6610 return now 6611} 6612 6613// pidleget tries to get a p from the _Pidle list, acquiring ownership. 6614// 6615// sched.lock must be held. 6616// 6617// May run during STW, so write barriers are not allowed. 6618// 6619//go:nowritebarrierrec 6620func pidleget(now int64) (*p, int64) { 6621 assertLockHeld(&sched.lock) 6622 6623 pp := sched.pidle.ptr() 6624 if pp != nil { 6625 // Timer may get added at any time now. 6626 if now == 0 { 6627 now = nanotime() 6628 } 6629 timerpMask.set(pp.id) 6630 idlepMask.clear(pp.id) 6631 sched.pidle = pp.link 6632 sched.npidle.Add(-1) 6633 pp.limiterEvent.stop(limiterEventIdle, now) 6634 } 6635 return pp, now 6636} 6637 6638// pidlegetSpinning tries to get a p from the _Pidle list, acquiring ownership. 6639// This is called by spinning Ms (or callers than need a spinning M) that have 6640// found work. If no P is available, this must synchronized with non-spinning 6641// Ms that may be preparing to drop their P without discovering this work. 6642// 6643// sched.lock must be held. 6644// 6645// May run during STW, so write barriers are not allowed. 6646// 6647//go:nowritebarrierrec 6648func pidlegetSpinning(now int64) (*p, int64) { 6649 assertLockHeld(&sched.lock) 6650 6651 pp, now := pidleget(now) 6652 if pp == nil { 6653 // See "Delicate dance" comment in findrunnable. We found work 6654 // that we cannot take, we must synchronize with non-spinning 6655 // Ms that may be preparing to drop their P. 6656 sched.needspinning.Store(1) 6657 return nil, now 6658 } 6659 6660 return pp, now 6661} 6662 6663// runqempty reports whether pp has no Gs on its local run queue. 6664// It never returns true spuriously. 6665func runqempty(pp *p) bool { 6666 // Defend against a race where 1) pp has G1 in runqnext but runqhead == runqtail, 6667 // 2) runqput on pp kicks G1 to the runq, 3) runqget on pp empties runqnext. 6668 // Simply observing that runqhead == runqtail and then observing that runqnext == nil 6669 // does not mean the queue is empty. 
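	// The loop below therefore re-reads runqtail after loading runnext: the
	// runqput in step 2 bumps runqtail when it moves G1 into the runq, so an
	// unchanged runqtail means head, tail, and runnext were read from a
	// consistent snapshot and the emptiness answer can be trusted.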
6670 for { 6671 head := atomic.Load(&pp.runqhead) 6672 tail := atomic.Load(&pp.runqtail) 6673 runnext := atomic.Loaduintptr((*uintptr)(unsafe.Pointer(&pp.runnext))) 6674 if tail == atomic.Load(&pp.runqtail) { 6675 return head == tail && runnext == 0 6676 } 6677 } 6678} 6679 6680// To shake out latent assumptions about scheduling order, 6681// we introduce some randomness into scheduling decisions 6682// when running with the race detector. 6683// The need for this was made obvious by changing the 6684// (deterministic) scheduling order in Go 1.5 and breaking 6685// many poorly-written tests. 6686// With the randomness here, as long as the tests pass 6687// consistently with -race, they shouldn't have latent scheduling 6688// assumptions. 6689const randomizeScheduler = raceenabled 6690 6691// runqput tries to put g on the local runnable queue. 6692// If next is false, runqput adds g to the tail of the runnable queue. 6693// If next is true, runqput puts g in the pp.runnext slot. 6694// If the run queue is full, runnext puts g on the global queue. 6695// Executed only by the owner P. 6696func runqput(pp *p, gp *g, next bool) { 6697 if !haveSysmon && next { 6698 // A runnext goroutine shares the same time slice as the 6699 // current goroutine (inheritTime from runqget). To prevent a 6700 // ping-pong pair of goroutines from starving all others, we 6701 // depend on sysmon to preempt "long-running goroutines". That 6702 // is, any set of goroutines sharing the same time slice. 6703 // 6704 // If there is no sysmon, we must avoid runnext entirely or 6705 // risk starvation. 6706 next = false 6707 } 6708 if randomizeScheduler && next && randn(2) == 0 { 6709 next = false 6710 } 6711 6712 if next { 6713 retryNext: 6714 oldnext := pp.runnext 6715 if !pp.runnext.cas(oldnext, guintptr(unsafe.Pointer(gp))) { 6716 goto retryNext 6717 } 6718 if oldnext == 0 { 6719 return 6720 } 6721 // Kick the old runnext out to the regular run queue. 6722 gp = oldnext.ptr() 6723 } 6724 6725retry: 6726 h := atomic.LoadAcq(&pp.runqhead) // load-acquire, synchronize with consumers 6727 t := pp.runqtail 6728 if t-h < uint32(len(pp.runq)) { 6729 pp.runq[t%uint32(len(pp.runq))].set(gp) 6730 atomic.StoreRel(&pp.runqtail, t+1) // store-release, makes the item available for consumption 6731 return 6732 } 6733 if runqputslow(pp, gp, h, t) { 6734 return 6735 } 6736 // the queue is not full, now the put above must succeed 6737 goto retry 6738} 6739 6740// Put g and a batch of work from local runnable queue on global queue. 6741// Executed only by the owner P. 6742func runqputslow(pp *p, gp *g, h, t uint32) bool { 6743 var batch [len(pp.runq)/2 + 1]*g 6744 6745 // First, grab a batch from local queue. 6746 n := t - h 6747 n = n / 2 6748 if n != uint32(len(pp.runq)/2) { 6749 throw("runqputslow: queue is not full") 6750 } 6751 for i := uint32(0); i < n; i++ { 6752 batch[i] = pp.runq[(h+i)%uint32(len(pp.runq))].ptr() 6753 } 6754 if !atomic.CasRel(&pp.runqhead, h, h+n) { // cas-release, commits consume 6755 return false 6756 } 6757 batch[n] = gp 6758 6759 if randomizeScheduler { 6760 for i := uint32(1); i <= n; i++ { 6761 j := cheaprandn(i + 1) 6762 batch[i], batch[j] = batch[j], batch[i] 6763 } 6764 } 6765 6766 // Link the goroutines. 6767 for i := uint32(0); i < n; i++ { 6768 batch[i].schedlink.set(batch[i+1]) 6769 } 6770 var q gQueue 6771 q.head.set(batch[0]) 6772 q.tail.set(batch[n]) 6773 6774 // Now put the batch on global queue. 
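	// (Illustrative numbers: the per-P ring holds 256 G's, so a full queue
	// gives n = 128, and the batch pushed below is 129 G's, the displaced
	// half plus gp, all moved under a single sched.lock acquisition.)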
6775 lock(&sched.lock) 6776 globrunqputbatch(&q, int32(n+1)) 6777 unlock(&sched.lock) 6778 return true 6779} 6780 6781// runqputbatch tries to put all the G's on q on the local runnable queue. 6782// If the queue is full, they are put on the global queue; in that case 6783// this will temporarily acquire the scheduler lock. 6784// Executed only by the owner P. 6785func runqputbatch(pp *p, q *gQueue, qsize int) { 6786 h := atomic.LoadAcq(&pp.runqhead) 6787 t := pp.runqtail 6788 n := uint32(0) 6789 for !q.empty() && t-h < uint32(len(pp.runq)) { 6790 gp := q.pop() 6791 pp.runq[t%uint32(len(pp.runq))].set(gp) 6792 t++ 6793 n++ 6794 } 6795 qsize -= int(n) 6796 6797 if randomizeScheduler { 6798 off := func(o uint32) uint32 { 6799 return (pp.runqtail + o) % uint32(len(pp.runq)) 6800 } 6801 for i := uint32(1); i < n; i++ { 6802 j := cheaprandn(i + 1) 6803 pp.runq[off(i)], pp.runq[off(j)] = pp.runq[off(j)], pp.runq[off(i)] 6804 } 6805 } 6806 6807 atomic.StoreRel(&pp.runqtail, t) 6808 if !q.empty() { 6809 lock(&sched.lock) 6810 globrunqputbatch(q, int32(qsize)) 6811 unlock(&sched.lock) 6812 } 6813} 6814 6815// Get g from local runnable queue. 6816// If inheritTime is true, gp should inherit the remaining time in the 6817// current time slice. Otherwise, it should start a new time slice. 6818// Executed only by the owner P. 6819func runqget(pp *p) (gp *g, inheritTime bool) { 6820 // If there's a runnext, it's the next G to run. 6821 next := pp.runnext 6822 // If the runnext is non-0 and the CAS fails, it could only have been stolen by another P, 6823 // because other Ps can race to set runnext to 0, but only the current P can set it to non-0. 6824 // Hence, there's no need to retry this CAS if it fails. 6825 if next != 0 && pp.runnext.cas(next, 0) { 6826 return next.ptr(), true 6827 } 6828 6829 for { 6830 h := atomic.LoadAcq(&pp.runqhead) // load-acquire, synchronize with other consumers 6831 t := pp.runqtail 6832 if t == h { 6833 return nil, false 6834 } 6835 gp := pp.runq[h%uint32(len(pp.runq))].ptr() 6836 if atomic.CasRel(&pp.runqhead, h, h+1) { // cas-release, commits consume 6837 return gp, false 6838 } 6839 } 6840} 6841 6842// runqdrain drains the local runnable queue of pp and returns all goroutines in it. 6843// Executed only by the owner P. 6844func runqdrain(pp *p) (drainQ gQueue, n uint32) { 6845 oldNext := pp.runnext 6846 if oldNext != 0 && pp.runnext.cas(oldNext, 0) { 6847 drainQ.pushBack(oldNext.ptr()) 6848 n++ 6849 } 6850 6851retry: 6852 h := atomic.LoadAcq(&pp.runqhead) // load-acquire, synchronize with other consumers 6853 t := pp.runqtail 6854 qn := t - h 6855 if qn == 0 { 6856 return 6857 } 6858 if qn > uint32(len(pp.runq)) { // read inconsistent h and t 6859 goto retry 6860 } 6861 6862 if !atomic.CasRel(&pp.runqhead, h, h+qn) { // cas-release, commits consume 6863 goto retry 6864 } 6865 6866 // We've inverted the order in which it gets G's from the local P's runnable queue 6867 // and then advances the head pointer because we don't want to mess up the statuses of G's 6868 // while runqdrain() and runqsteal() are running in parallel. 6869 // Thus we should advance the head pointer before draining the local P into a gQueue, 6870 // so that we can update any gp.schedlink only after we take the full ownership of G, 6871 // meanwhile, other P's can't access to all G's in local P's runnable queue and steal them. 6872 // See https://groups.google.com/g/golang-dev/c/0pTKxEKhHSc/m/6Q85QjdVBQAJ for more details. 
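	// Concretely (illustrative): if the copy loop below ran before the CAS
	// above, a concurrent runqsteal could claim the same slots and hand those
	// G's to another P while this P is still writing their schedlink fields
	// via drainQ.pushBack; claiming [h, h+qn) first makes every G in that
	// window exclusively ours before its schedlink is touched.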
6873 for i := uint32(0); i < qn; i++ { 6874 gp := pp.runq[(h+i)%uint32(len(pp.runq))].ptr() 6875 drainQ.pushBack(gp) 6876 n++ 6877 } 6878 return 6879} 6880 6881// Grabs a batch of goroutines from pp's runnable queue into batch. 6882// Batch is a ring buffer starting at batchHead. 6883// Returns number of grabbed goroutines. 6884// Can be executed by any P. 6885func runqgrab(pp *p, batch *[256]guintptr, batchHead uint32, stealRunNextG bool) uint32 { 6886 for { 6887 h := atomic.LoadAcq(&pp.runqhead) // load-acquire, synchronize with other consumers 6888 t := atomic.LoadAcq(&pp.runqtail) // load-acquire, synchronize with the producer 6889 n := t - h 6890 n = n - n/2 6891 if n == 0 { 6892 if stealRunNextG { 6893 // Try to steal from pp.runnext. 6894 if next := pp.runnext; next != 0 { 6895 if pp.status == _Prunning { 6896 // Sleep to ensure that pp isn't about to run the g 6897 // we are about to steal. 6898 // The important use case here is when the g running 6899 // on pp ready()s another g and then almost 6900 // immediately blocks. Instead of stealing runnext 6901 // in this window, back off to give pp a chance to 6902 // schedule runnext. This will avoid thrashing gs 6903 // between different Ps. 6904 // A sync chan send/recv takes ~50ns as of time of 6905 // writing, so 3us gives ~50x overshoot. 6906 if !osHasLowResTimer { 6907 usleep(3) 6908 } else { 6909 // On some platforms system timer granularity is 6910 // 1-15ms, which is way too much for this 6911 // optimization. So just yield. 6912 osyield() 6913 } 6914 } 6915 if !pp.runnext.cas(next, 0) { 6916 continue 6917 } 6918 batch[batchHead%uint32(len(batch))] = next 6919 return 1 6920 } 6921 } 6922 return 0 6923 } 6924 if n > uint32(len(pp.runq)/2) { // read inconsistent h and t 6925 continue 6926 } 6927 for i := uint32(0); i < n; i++ { 6928 g := pp.runq[(h+i)%uint32(len(pp.runq))] 6929 batch[(batchHead+i)%uint32(len(batch))] = g 6930 } 6931 if atomic.CasRel(&pp.runqhead, h, h+n) { // cas-release, commits consume 6932 return n 6933 } 6934 } 6935} 6936 6937// Steal half of elements from local runnable queue of p2 6938// and put onto local runnable queue of p. 6939// Returns one of the stolen elements (or nil if failed). 6940func runqsteal(pp, p2 *p, stealRunNextG bool) *g { 6941 t := pp.runqtail 6942 n := runqgrab(p2, &pp.runq, t, stealRunNextG) 6943 if n == 0 { 6944 return nil 6945 } 6946 n-- 6947 gp := pp.runq[(t+n)%uint32(len(pp.runq))].ptr() 6948 if n == 0 { 6949 return gp 6950 } 6951 h := atomic.LoadAcq(&pp.runqhead) // load-acquire, synchronize with consumers 6952 if t-h+n >= uint32(len(pp.runq)) { 6953 throw("runqsteal: runq overflow") 6954 } 6955 atomic.StoreRel(&pp.runqtail, t+n) // store-release, makes the item available for consumption 6956 return gp 6957} 6958 6959// A gQueue is a dequeue of Gs linked through g.schedlink. A G can only 6960// be on one gQueue or gList at a time. 6961type gQueue struct { 6962 head guintptr 6963 tail guintptr 6964} 6965 6966// empty reports whether q is empty. 6967func (q *gQueue) empty() bool { 6968 return q.head == 0 6969} 6970 6971// push adds gp to the head of q. 6972func (q *gQueue) push(gp *g) { 6973 gp.schedlink = q.head 6974 q.head.set(gp) 6975 if q.tail == 0 { 6976 q.tail.set(gp) 6977 } 6978} 6979 6980// pushBack adds gp to the tail of q. 6981func (q *gQueue) pushBack(gp *g) { 6982 gp.schedlink = 0 6983 if q.tail != 0 { 6984 q.tail.ptr().schedlink.set(gp) 6985 } else { 6986 q.head.set(gp) 6987 } 6988 q.tail.set(gp) 6989} 6990 6991// pushBackAll adds all Gs in q2 to the tail of q. 
After this q2 must 6992// not be used. 6993func (q *gQueue) pushBackAll(q2 gQueue) { 6994 if q2.tail == 0 { 6995 return 6996 } 6997 q2.tail.ptr().schedlink = 0 6998 if q.tail != 0 { 6999 q.tail.ptr().schedlink = q2.head 7000 } else { 7001 q.head = q2.head 7002 } 7003 q.tail = q2.tail 7004} 7005 7006// pop removes and returns the head of queue q. It returns nil if 7007// q is empty. 7008func (q *gQueue) pop() *g { 7009 gp := q.head.ptr() 7010 if gp != nil { 7011 q.head = gp.schedlink 7012 if q.head == 0 { 7013 q.tail = 0 7014 } 7015 } 7016 return gp 7017} 7018 7019// popList takes all Gs in q and returns them as a gList. 7020func (q *gQueue) popList() gList { 7021 stack := gList{q.head} 7022 *q = gQueue{} 7023 return stack 7024} 7025 7026// A gList is a list of Gs linked through g.schedlink. A G can only be 7027// on one gQueue or gList at a time. 7028type gList struct { 7029 head guintptr 7030} 7031 7032// empty reports whether l is empty. 7033func (l *gList) empty() bool { 7034 return l.head == 0 7035} 7036 7037// push adds gp to the head of l. 7038func (l *gList) push(gp *g) { 7039 gp.schedlink = l.head 7040 l.head.set(gp) 7041} 7042 7043// pushAll prepends all Gs in q to l. 7044func (l *gList) pushAll(q gQueue) { 7045 if !q.empty() { 7046 q.tail.ptr().schedlink = l.head 7047 l.head = q.head 7048 } 7049} 7050 7051// pop removes and returns the head of l. If l is empty, it returns nil. 7052func (l *gList) pop() *g { 7053 gp := l.head.ptr() 7054 if gp != nil { 7055 l.head = gp.schedlink 7056 } 7057 return gp 7058} 7059 7060//go:linkname setMaxThreads runtime/debug.setMaxThreads 7061func setMaxThreads(in int) (out int) { 7062 lock(&sched.lock) 7063 out = int(sched.maxmcount) 7064 if in > 0x7fffffff { // MaxInt32 7065 sched.maxmcount = 0x7fffffff 7066 } else { 7067 sched.maxmcount = int32(in) 7068 } 7069 checkmcount() 7070 unlock(&sched.lock) 7071 return 7072} 7073 7074// procPin should be an internal detail, 7075// but widely used packages access it using linkname. 7076// Notable members of the hall of shame include: 7077// - github.com/bytedance/gopkg 7078// - github.com/choleraehyq/pid 7079// - github.com/songzhibin97/gkit 7080// 7081// Do not remove or change the type signature. 7082// See go.dev/issue/67401. 7083// 7084//go:linkname procPin 7085//go:nosplit 7086func procPin() int { 7087 gp := getg() 7088 mp := gp.m 7089 7090 mp.locks++ 7091 return int(mp.p.ptr().id) 7092} 7093 7094// procUnpin should be an internal detail, 7095// but widely used packages access it using linkname. 7096// Notable members of the hall of shame include: 7097// - github.com/bytedance/gopkg 7098// - github.com/choleraehyq/pid 7099// - github.com/songzhibin97/gkit 7100// 7101// Do not remove or change the type signature. 7102// See go.dev/issue/67401. 
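//
// For context, an illustrative sketch of the per-P sharding pattern those
// packages build on top of the pin/unpin pair (shards and inc are
// hypothetical, sync/atomic assumed, shown only to explain why the symbols
// get linknamed):
//
//	var shards [64]atomic.Int64 // roughly one counter per P
//
//	func inc() {
//		pid := procPin() // stable P id, preemption disabled
//		shards[pid%len(shards)].Add(1)
//		procUnpin()
//	}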
// procUnpin should be an internal detail,
// but widely used packages access it using linkname.
// Notable members of the hall of shame include:
//   - github.com/bytedance/gopkg
//   - github.com/choleraehyq/pid
//   - github.com/songzhibin97/gkit
//
// Do not remove or change the type signature.
// See go.dev/issue/67401.
//
//go:linkname procUnpin
//go:nosplit
func procUnpin() {
	gp := getg()
	gp.m.locks--
}

//go:linkname sync_runtime_procPin sync.runtime_procPin
//go:nosplit
func sync_runtime_procPin() int {
	return procPin()
}

//go:linkname sync_runtime_procUnpin sync.runtime_procUnpin
//go:nosplit
func sync_runtime_procUnpin() {
	procUnpin()
}

//go:linkname sync_atomic_runtime_procPin sync/atomic.runtime_procPin
//go:nosplit
func sync_atomic_runtime_procPin() int {
	return procPin()
}

//go:linkname sync_atomic_runtime_procUnpin sync/atomic.runtime_procUnpin
//go:nosplit
func sync_atomic_runtime_procUnpin() {
	procUnpin()
}

// Active spinning for sync.Mutex.
//
// sync_runtime_canSpin should be an internal detail,
// but widely used packages access it using linkname.
// Notable members of the hall of shame include:
//   - github.com/livekit/protocol
//   - github.com/sagernet/gvisor
//   - gvisor.dev/gvisor
//
// Do not remove or change the type signature.
// See go.dev/issue/67401.
//
//go:linkname sync_runtime_canSpin sync.runtime_canSpin
//go:nosplit
func sync_runtime_canSpin(i int) bool {
	// sync.Mutex is cooperative, so we are conservative with spinning.
	// Spin only a few times, and only if we are running on a multicore
	// machine, GOMAXPROCS>1, there is at least one other running P, and
	// the local runq is empty. Unlike the runtime mutex, we don't do
	// passive spinning here, because there can be work on the global
	// runq or on other Ps.
	if i >= active_spin || ncpu <= 1 || gomaxprocs <= sched.npidle.Load()+sched.nmspinning.Load()+1 {
		return false
	}
	if p := getg().m.p.ptr(); !runqempty(p) {
		return false
	}
	return true
}

// sync_runtime_doSpin should be an internal detail,
// but widely used packages access it using linkname.
// Notable members of the hall of shame include:
//   - github.com/livekit/protocol
//   - github.com/sagernet/gvisor
//   - gvisor.dev/gvisor
//
// Do not remove or change the type signature.
// See go.dev/issue/67401.
//
//go:linkname sync_runtime_doSpin sync.runtime_doSpin
//go:nosplit
func sync_runtime_doSpin() {
	procyield(active_spin_cnt)
}
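
// The sketch below is illustrative only and is not called by the runtime.
// It shows the shape of the spin-then-block pattern that canSpin/doSpin are
// designed for: try the fast path, busy-wait briefly while canSpin still
// approves, then give up the CPU instead of spinning indefinitely.
// exampleSpinLock and its bare osyield fallback are hypothetical and exist
// only for this example; sync.Mutex parks on a semaphore at that point
// rather than yielding in a loop.
func exampleSpinLock(state *uint32) {
	iter := 0
	for {
		if atomic.Cas(state, 0, 1) {
			return // acquired
		}
		if sync_runtime_canSpin(iter) {
			sync_runtime_doSpin() // brief on-CPU busy wait (procyield)
			iter++
			continue
		}
		// Spinning is no longer profitable (too many iterations, a
		// single CPU, or no other running P); stop burning cycles.
		osyield()
		iter = 0
	}
}
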
var stealOrder randomOrder

// randomOrder/randomEnum are helper types for randomized work stealing.
// They allow enumerating all Ps in different pseudo-random orders without
// repetition. The algorithm is based on the fact that if X and GOMAXPROCS
// are coprime, then the sequence (i + X) % GOMAXPROCS visits every value
// in [0, GOMAXPROCS) exactly once before repeating. For example, with
// GOMAXPROCS=6 and X=5, starting at 2 the sequence is 2, 1, 0, 5, 4, 3.
type randomOrder struct {
	count    uint32
	coprimes []uint32
}

type randomEnum struct {
	i     uint32
	count uint32
	pos   uint32
	inc   uint32
}

func (ord *randomOrder) reset(count uint32) {
	ord.count = count
	ord.coprimes = ord.coprimes[:0]
	for i := uint32(1); i <= count; i++ {
		if gcd(i, count) == 1 {
			ord.coprimes = append(ord.coprimes, i)
		}
	}
}

func (ord *randomOrder) start(i uint32) randomEnum {
	return randomEnum{
		count: ord.count,
		pos:   i % ord.count,
		inc:   ord.coprimes[i/ord.count%uint32(len(ord.coprimes))],
	}
}

func (enum *randomEnum) done() bool {
	return enum.i == enum.count
}

func (enum *randomEnum) next() {
	enum.i++
	enum.pos = (enum.pos + enum.inc) % enum.count
}

func (enum *randomEnum) position() uint32 {
	return enum.pos
}

func gcd(a, b uint32) uint32 {
	for b != 0 {
		a, b = b, a%b
	}
	return a
}

// An initTask represents the set of initializations that need to be done for a package.
// Keep in sync with ../../test/noinit.go:initTask
type initTask struct {
	state uint32 // 0 = uninitialized, 1 = in progress, 2 = done
	nfns  uint32
	// followed by nfns pcs, uintptr sized, one per init function to run
}

// inittrace stores statistics for init functions which are
// updated by malloc and newproc when active is true.
var inittrace tracestat

type tracestat struct {
	active bool   // init tracing activation status
	id     uint64 // init goroutine id
	allocs uint64 // heap allocations
	bytes  uint64 // heap allocated bytes
}

func doInit(ts []*initTask) {
	for _, t := range ts {
		doInit1(t)
	}
}

func doInit1(t *initTask) {
	switch t.state {
	case 2: // fully initialized
		return
	case 1: // initialization in progress
		throw("recursive call during initialization - linker skew")
	default: // not initialized yet
		t.state = 1 // initialization in progress

		var (
			start  int64
			before tracestat
		)

		if inittrace.active {
			start = nanotime()
			// Load stats non-atomically since inittrace is updated only by this init goroutine.
			before = inittrace
		}

		if t.nfns == 0 {
			// We should have pruned all of these in the linker.
			throw("inittask with no functions")
		}

		firstFunc := add(unsafe.Pointer(t), 8)
		for i := uint32(0); i < t.nfns; i++ {
			p := add(firstFunc, uintptr(i)*goarch.PtrSize)
			f := *(*func())(unsafe.Pointer(&p))
			f()
		}

		if inittrace.active {
			end := nanotime()
			// Load stats non-atomically since inittrace is updated only by this init goroutine.
			after := inittrace

			f := *(*func())(unsafe.Pointer(&firstFunc))
			pkg := funcpkgpath(findfunc(abi.FuncPCABIInternal(f)))

			var sbuf [24]byte
			print("init ", pkg, " @")
			print(string(fmtNSAsMS(sbuf[:], uint64(start-runtimeInitTime))), " ms, ")
			print(string(fmtNSAsMS(sbuf[:], uint64(end-start))), " ms clock, ")
			print(string(itoa(sbuf[:], after.bytes-before.bytes)), " bytes, ")
			print(string(itoa(sbuf[:], after.allocs-before.allocs)), " allocs")
			print("\n")
		}

		t.state = 2 // initialization done
	}
}
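
// Layout note (illustrative sketch, not load-bearing): the linker emits each
// initTask as two uint32 words followed immediately by nfns code pointers,
// which is why doInit1 finds the first pc at a fixed offset of 8 bytes and
// the i'th at 8 + i*goarch.PtrSize:
//
//	offset 0         4         8            8+PtrSize
//	       +---------+---------+------------+------------+----
//	       |  state  |  nfns   | pc of fn 0 | pc of fn 1 | ...
//	       +---------+---------+------------+------------+----
//
// Each pc slot holds the entry address of one init function, so doInit1
// treats the address of the slot as a funcval in order to call it through
// an ordinary func() value.
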