1// Copyright 2009 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Garbage collector (GC).
6//
// The GC runs concurrently with mutator threads, is type accurate (aka precise), and allows multiple
// GC threads to run in parallel. It is a concurrent mark and sweep that uses a write barrier. It is
9// non-generational and non-compacting. Allocation is done using size segregated per P allocation
10// areas to minimize fragmentation while eliminating locks in the common case.
11//
12// The algorithm decomposes into several steps.
13// This is a high level description of the algorithm being used. For an overview of GC a good
14// place to start is Richard Jones' gchandbook.org.
15//
16// The algorithm's intellectual heritage includes Dijkstra's on-the-fly algorithm, see
17// Edsger W. Dijkstra, Leslie Lamport, A. J. Martin, C. S. Scholten, and E. F. M. Steffens. 1978.
18// On-the-fly garbage collection: an exercise in cooperation. Commun. ACM 21, 11 (November 1978),
19// 966-975.
20// For journal quality proofs that these steps are complete, correct, and terminate see
21// Hudson, R., and Moss, J.E.B. Copying Garbage Collection without stopping the world.
22// Concurrency and Computation: Practice and Experience 15(3-5), 2003.
23//
24// 1. GC performs sweep termination.
25//
26//    a. Stop the world. This causes all Ps to reach a GC safe-point.
27//
28//    b. Sweep any unswept spans. There will only be unswept spans if
29//    this GC cycle was forced before the expected time.
30//
31// 2. GC performs the mark phase.
32//
33//    a. Prepare for the mark phase by setting gcphase to _GCmark
34//    (from _GCoff), enabling the write barrier, enabling mutator
35//    assists, and enqueueing root mark jobs. No objects may be
36//    scanned until all Ps have enabled the write barrier, which is
37//    accomplished using STW.
38//
39//    b. Start the world. From this point, GC work is done by mark
40//    workers started by the scheduler and by assists performed as
41//    part of allocation. The write barrier shades both the
42//    overwritten pointer and the new pointer value for any pointer
43//    writes (see mbarrier.go for details). Newly allocated objects
44//    are immediately marked black.
45//
46//    c. GC performs root marking jobs. This includes scanning all
47//    stacks, shading all globals, and shading any heap pointers in
48//    off-heap runtime data structures. Scanning a stack stops a
49//    goroutine, shades any pointers found on its stack, and then
50//    resumes the goroutine.
51//
52//    d. GC drains the work queue of grey objects, scanning each grey
53//    object to black and shading all pointers found in the object
54//    (which in turn may add those pointers to the work queue).
55//
56//    e. Because GC work is spread across local caches, GC uses a
57//    distributed termination algorithm to detect when there are no
58//    more root marking jobs or grey objects (see gcMarkDone). At this
59//    point, GC transitions to mark termination.
60//
61// 3. GC performs mark termination.
62//
63//    a. Stop the world.
64//
65//    b. Set gcphase to _GCmarktermination, and disable workers and
66//    assists.
67//
68//    c. Perform housekeeping like flushing mcaches.
69//
70// 4. GC performs the sweep phase.
71//
72//    a. Prepare for the sweep phase by setting gcphase to _GCoff,
73//    setting up sweep state and disabling the write barrier.
74//
75//    b. Start the world. From this point on, newly allocated objects
76//    are white, and allocating sweeps spans before use if necessary.
77//
78//    c. GC does concurrent sweeping in the background and in response
79//    to allocation. See description below.
80//
81// 5. When sufficient allocation has taken place, replay the sequence
82// starting with 1 above. See discussion of GC rate below.
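//
// In terms of the gcphase values defined later in this file, the cycle is,
// schematically:
//
//	_GCoff --[STW sweep termination, enable write barrier]--> _GCmark
//	_GCmark --[concurrent mark, then STW mark termination]--> _GCmarktermination
//	_GCmarktermination --[disable write barrier, start sweep]--> _GCoff
//
// This is only a summary of the steps above, not additional behavior.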
83
84// Concurrent sweep.
85//
86// The sweep phase proceeds concurrently with normal program execution.
87// The heap is swept span-by-span both lazily (when a goroutine needs another span)
88// and concurrently in a background goroutine (this helps programs that are not CPU bound).
89// At the end of STW mark termination all spans are marked as "needs sweeping".
90//
91// The background sweeper goroutine simply sweeps spans one-by-one.
92//
93// To avoid requesting more OS memory while there are unswept spans, when a
94// goroutine needs another span, it first attempts to reclaim that much memory
95// by sweeping. When a goroutine needs to allocate a new small-object span, it
96// sweeps small-object spans for the same object size until it frees at least
// one object. When a goroutine needs to allocate a large-object span from the heap,
// it sweeps spans until it frees at least that many pages into the heap. There is
99// one case where this may not suffice: if a goroutine sweeps and frees two
100// nonadjacent one-page spans to the heap, it will allocate a new two-page
101// span, but there can still be other one-page unswept spans which could be
102// combined into a two-page span.
103//
104// It's critical to ensure that no operations proceed on unswept spans (that would corrupt
105// mark bits in GC bitmap). During GC all mcaches are flushed into the central cache,
106// so they are empty. When a goroutine grabs a new span into mcache, it sweeps it.
107// When a goroutine explicitly frees an object or sets a finalizer, it ensures that
108// the span is swept (either by sweeping it, or by waiting for the concurrent sweep to finish).
109// The finalizer goroutine is kicked off only when all spans are swept.
110// When the next GC starts, it sweeps all not-yet-swept spans (if any).
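//
// As a minimal sketch of the "sweep before growing" idea for a large
// allocation needing npages (illustrative only; the real logic lives in
// mgcsweep.go and mheap.go):
//
//	for freed := uintptr(0); freed < npages; {
//		n := sweepone() // pages returned to the heap, or ^uintptr(0) if nothing left
//		if n == ^uintptr(0) {
//			break // no unswept spans remain
//		}
//		freed += n
//	}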
111
112// GC rate.
113// Next GC is after we've allocated an extra amount of memory proportional to
114// the amount already in use. The proportion is controlled by GOGC environment variable
115// (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M
116// (this mark is computed by the gcController.heapGoal method). This keeps the GC cost in
117// linear proportion to the allocation cost. Adjusting GOGC just changes the linear constant
118// (and also the amount of extra memory used).
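//
// A minimal sketch of the proportional goal, considering only the GOGC term
// (the real pacer in mgcpacer.go also folds in stack and globals scan work
// and GOMEMLIMIT):
//
//	// goal ≈ live + live*GOGC/100; 4M live with GOGC=100 gives an 8M goal.
//	func heapGoalSketch(live, gogc uint64) uint64 {
//		return live + live*gogc/100
//	}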
119
120// Oblets
121//
122// In order to prevent long pauses while scanning large objects and to
123// improve parallelism, the garbage collector breaks up scan jobs for
124// objects larger than maxObletBytes into "oblets" of at most
125// maxObletBytes. When scanning encounters the beginning of a large
126// object, it scans only the first oblet and enqueues the remaining
127// oblets as new scan jobs.
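//
// A minimal sketch of the split, where base and size are the large object's
// address and length (illustrative; the real scan loop is in mgcmark.go's
// scanobject, and enqueueObletScanJob is a hypothetical stand-in for pushing
// the oblet pointer onto the work queue):
//
//	if size > maxObletBytes {
//		// Enqueue the tail oblets as separate jobs; scan only the head now.
//		for oblet := base + maxObletBytes; oblet < base+size; oblet += maxObletBytes {
//			enqueueObletScanJob(oblet) // hypothetical helper
//		}
//		size = maxObletBytes
//	}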
128
129package runtime
130
131import (
132	"internal/cpu"
133	"internal/runtime/atomic"
134	"unsafe"
135)
136
137const (
138	_DebugGC      = 0
139	_FinBlockSize = 4 * 1024
140
141	// concurrentSweep is a debug flag. Disabling this flag
142	// ensures all spans are swept while the world is stopped.
143	concurrentSweep = true
144
145	// debugScanConservative enables debug logging for stack
146	// frames that are scanned conservatively.
147	debugScanConservative = false
148
149	// sweepMinHeapDistance is a lower bound on the heap distance
150	// (in bytes) reserved for concurrent sweeping between GC
151	// cycles.
152	sweepMinHeapDistance = 1024 * 1024
153)
154
155// heapObjectsCanMove always returns false in the current garbage collector.
156// It exists for go4.org/unsafe/assume-no-moving-gc, which is an
157// unfortunate idea that had an even more unfortunate implementation.
158// Every time a new Go release happened, the package stopped building,
159// and the authors had to add a new file with a new //go:build line, and
160// then the entire ecosystem of packages with that as a dependency had to
161// explicitly update to the new version. Many packages depend on
162// assume-no-moving-gc transitively, through paths like
163// inet.af/netaddr -> go4.org/intern -> assume-no-moving-gc.
164// This was causing a significant amount of friction around each new
165// release, so we added this bool for the package to //go:linkname
166// instead. The bool is still unfortunate, but it's not as bad as
167// breaking the ecosystem on every new release.
168//
169// If the Go garbage collector ever does move heap objects, we can set
170// this to true to break all the programs using assume-no-moving-gc.
171//
172//go:linkname heapObjectsCanMove
173func heapObjectsCanMove() bool {
174	return false
175}
176
177func gcinit() {
178	if unsafe.Sizeof(workbuf{}) != _WorkbufSize {
179		throw("size of Workbuf is suboptimal")
180	}
181	// No sweep on the first cycle.
182	sweep.active.state.Store(sweepDrainedMask)
183
184	// Initialize GC pacer state.
185	// Use the environment variable GOGC for the initial gcPercent value.
186	// Use the environment variable GOMEMLIMIT for the initial memoryLimit value.
187	gcController.init(readGOGC(), readGOMEMLIMIT())
188
189	work.startSema = 1
190	work.markDoneSema = 1
191	lockInit(&work.sweepWaiters.lock, lockRankSweepWaiters)
192	lockInit(&work.assistQueue.lock, lockRankAssistQueue)
193	lockInit(&work.wbufSpans.lock, lockRankWbufSpans)
194}
195
196// gcenable is called after the bulk of the runtime initialization,
197// just before we're about to start letting user code run.
198// It kicks off the background sweeper goroutine, the background
199// scavenger goroutine, and enables GC.
200func gcenable() {
201	// Kick off sweeping and scavenging.
202	c := make(chan int, 2)
203	go bgsweep(c)
204	go bgscavenge(c)
205	<-c
206	<-c
207	memstats.enablegc = true // now that runtime is initialized, GC is okay
208}
209
210// Garbage collector phase.
// Indicates to the write barrier and synchronization code which tasks to perform.
212var gcphase uint32
213
214// The compiler knows about this variable.
215// If you change it, you must change builtin/runtime.go, too.
216// If you change the first four bytes, you must also change the write
217// barrier insertion code.
218//
219// writeBarrier should be an internal detail,
220// but widely used packages access it using linkname.
221// Notable members of the hall of shame include:
222//   - github.com/bytedance/sonic
223//   - github.com/cloudwego/frugal
224//
225// Do not remove or change the type signature.
226// See go.dev/issue/67401.
227//
228//go:linkname writeBarrier
229var writeBarrier struct {
230	enabled bool    // compiler emits a check of this before calling write barrier
231	pad     [3]byte // compiler uses 32-bit load for "enabled" field
232	alignme uint64  // guarantee alignment so that compiler can use a 32 or 64-bit load
233}
234
235// gcBlackenEnabled is 1 if mutator assists and background mark
236// workers are allowed to blacken objects. This must only be set when
237// gcphase == _GCmark.
238var gcBlackenEnabled uint32
239
240const (
241	_GCoff             = iota // GC not running; sweeping in background, write barrier disabled
242	_GCmark                   // GC marking roots and workbufs: allocate black, write barrier ENABLED
	_GCmarktermination        // GC mark termination: allocate black, Ps help GC, write barrier ENABLED
244)
245
246//go:nosplit
247func setGCPhase(x uint32) {
248	atomic.Store(&gcphase, x)
249	writeBarrier.enabled = gcphase == _GCmark || gcphase == _GCmarktermination
250}
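
// Conceptually, the compiler turns a heap pointer write *slot = ptr into
// something like the following sketch (the real barrier code lives in
// mbarrier.go and mwbbuf.go; shade here is illustrative shorthand for the
// buffered write barrier, not the actual ABI):
//
//	if writeBarrier.enabled {
//		shade(*slot) // old value: Yuasa-style deletion barrier
//		shade(ptr)   // new value: Dijkstra-style insertion barrier
//	}
//	*slot = ptr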
251
252// gcMarkWorkerMode represents the mode that a concurrent mark worker
253// should operate in.
254//
255// Concurrent marking happens through four different mechanisms. One
256// is mutator assists, which happen in response to allocations and are
257// not scheduled. The other three are variations in the per-P mark
258// workers and are distinguished by gcMarkWorkerMode.
259type gcMarkWorkerMode int
260
261const (
262	// gcMarkWorkerNotWorker indicates that the next scheduled G is not
263	// starting work and the mode should be ignored.
264	gcMarkWorkerNotWorker gcMarkWorkerMode = iota
265
266	// gcMarkWorkerDedicatedMode indicates that the P of a mark
267	// worker is dedicated to running that mark worker. The mark
268	// worker should run without preemption.
269	gcMarkWorkerDedicatedMode
270
271	// gcMarkWorkerFractionalMode indicates that a P is currently
272	// running the "fractional" mark worker. The fractional worker
273	// is necessary when GOMAXPROCS*gcBackgroundUtilization is not
274	// an integer and using only dedicated workers would result in
275	// utilization too far from the target of gcBackgroundUtilization.
276	// The fractional worker should run until it is preempted and
277	// will be scheduled to pick up the fractional part of
278	// GOMAXPROCS*gcBackgroundUtilization.
279	gcMarkWorkerFractionalMode
280
281	// gcMarkWorkerIdleMode indicates that a P is running the mark
282	// worker because it has nothing else to do. The idle worker
283	// should run until it is preempted and account its time
284	// against gcController.idleMarkTime.
285	gcMarkWorkerIdleMode
286)
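
// For example (illustrative numbers): with GOMAXPROCS=6 and a background
// utilization target of 0.25, the goal is 1.5 workers' worth of CPU. Dedicated
// workers can only supply an integer number of workers, so the remaining half
// worker's worth of CPU time is picked up by fractional workers that run only
// part of the time.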
287
// gcMarkWorkerModeStrings are the string labels of gcMarkWorkerModes
289// to use in execution traces.
290var gcMarkWorkerModeStrings = [...]string{
291	"Not worker",
292	"GC (dedicated)",
293	"GC (fractional)",
294	"GC (idle)",
295}
296
297// pollFractionalWorkerExit reports whether a fractional mark worker
298// should self-preempt. It assumes it is called from the fractional
299// worker.
300func pollFractionalWorkerExit() bool {
301	// This should be kept in sync with the fractional worker
302	// scheduler logic in findRunnableGCWorker.
303	now := nanotime()
304	delta := now - gcController.markStartTime
305	if delta <= 0 {
306		return true
307	}
308	p := getg().m.p.ptr()
309	selfTime := p.gcFractionalMarkTime + (now - p.gcMarkWorkerStartTime)
310	// Add some slack to the utilization goal so that the
311	// fractional worker isn't behind again the instant it exits.
312	return float64(selfTime)/float64(delta) > 1.2*gcController.fractionalUtilizationGoal
313}
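
// For example (illustrative numbers): with fractionalUtilizationGoal = 0.05,
// a fractional worker that has accounted for 7ns of every 100ns since the
// mark phase started has selfTime/delta = 0.07 > 1.2*0.05 = 0.06, so it
// reports that it should self-preempt.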
314
315var work workType
316
317type workType struct {
318	full  lfstack          // lock-free list of full blocks workbuf
319	_     cpu.CacheLinePad // prevents false-sharing between full and empty
320	empty lfstack          // lock-free list of empty blocks workbuf
321	_     cpu.CacheLinePad // prevents false-sharing between empty and nproc/nwait
322
323	wbufSpans struct {
324		lock mutex
325		// free is a list of spans dedicated to workbufs, but
326		// that don't currently contain any workbufs.
327		free mSpanList
328		// busy is a list of all spans containing workbufs on
329		// one of the workbuf lists.
330		busy mSpanList
331	}
332
333	// Restore 64-bit alignment on 32-bit.
334	_ uint32
335
336	// bytesMarked is the number of bytes marked this cycle. This
337	// includes bytes blackened in scanned objects, noscan objects
338	// that go straight to black, and permagrey objects scanned by
339	// markroot during the concurrent scan phase. This is updated
340	// atomically during the cycle. Updates may be batched
341	// arbitrarily, since the value is only read at the end of the
342	// cycle.
343	//
344	// Because of benign races during marking, this number may not
345	// be the exact number of marked bytes, but it should be very
346	// close.
347	//
348	// Put this field here because it needs 64-bit atomic access
349	// (and thus 8-byte alignment even on 32-bit architectures).
350	bytesMarked uint64
351
352	markrootNext uint32 // next markroot job
353	markrootJobs uint32 // number of markroot jobs
354
355	nproc  uint32
356	tstart int64
357	nwait  uint32
358
359	// Number of roots of various root types. Set by gcMarkRootPrepare.
360	//
361	// nStackRoots == len(stackRoots), but we have nStackRoots for
362	// consistency.
363	nDataRoots, nBSSRoots, nSpanRoots, nStackRoots int
364
365	// Base indexes of each root type. Set by gcMarkRootPrepare.
366	baseData, baseBSS, baseSpans, baseStacks, baseEnd uint32
367
368	// stackRoots is a snapshot of all of the Gs that existed
369	// before the beginning of concurrent marking. The backing
370	// store of this must not be modified because it might be
371	// shared with allgs.
372	stackRoots []*g
373
374	// Each type of GC state transition is protected by a lock.
375	// Since multiple threads can simultaneously detect the state
376	// transition condition, any thread that detects a transition
377	// condition must acquire the appropriate transition lock,
378	// re-check the transition condition and return if it no
379	// longer holds or perform the transition if it does.
380	// Likewise, any transition must invalidate the transition
381	// condition before releasing the lock. This ensures that each
382	// transition is performed by exactly one thread and threads
383	// that need the transition to happen block until it has
384	// happened.
385	//
386	// startSema protects the transition from "off" to mark or
387	// mark termination.
388	startSema uint32
389	// markDoneSema protects transitions from mark to mark termination.
390	markDoneSema uint32
391
	// Background mark completion signaling
	bgMarkDone uint32 // cas to 1 when at a background mark completion point
394
395	// mode is the concurrency mode of the current GC cycle.
396	mode gcMode
397
398	// userForced indicates the current GC cycle was forced by an
399	// explicit user call.
400	userForced bool
401
402	// initialHeapLive is the value of gcController.heapLive at the
403	// beginning of this GC cycle.
404	initialHeapLive uint64
405
406	// assistQueue is a queue of assists that are blocked because
	// there was neither enough credit to steal nor enough work to
408	// do.
409	assistQueue struct {
410		lock mutex
411		q    gQueue
412	}
413
414	// sweepWaiters is a list of blocked goroutines to wake when
415	// we transition from mark termination to sweep.
416	sweepWaiters struct {
417		lock mutex
418		list gList
419	}
420
421	// cycles is the number of completed GC cycles, where a GC
422	// cycle is sweep termination, mark, mark termination, and
423	// sweep. This differs from memstats.numgc, which is
424	// incremented at mark termination.
425	cycles atomic.Uint32
426
427	// Timing/utilization stats for this cycle.
428	stwprocs, maxprocs                 int32
429	tSweepTerm, tMark, tMarkTerm, tEnd int64 // nanotime() of phase start
430
431	// pauseNS is the total STW time this cycle, measured as the time between
432	// when stopping began (just before trying to stop Ps) and just after the
433	// world started again.
434	pauseNS int64
435
436	// debug.gctrace heap sizes for this cycle.
437	heap0, heap1, heap2 uint64
438
439	// Cumulative estimated CPU usage.
440	cpuStats
441}
442
443// GC runs a garbage collection and blocks the caller until the
444// garbage collection is complete. It may also block the entire
445// program.
446func GC() {
447	// We consider a cycle to be: sweep termination, mark, mark
448	// termination, and sweep. This function shouldn't return
449	// until a full cycle has been completed, from beginning to
450	// end. Hence, we always want to finish up the current cycle
451	// and start a new one. That means:
452	//
453	// 1. In sweep termination, mark, or mark termination of cycle
454	// N, wait until mark termination N completes and transitions
455	// to sweep N.
456	//
457	// 2. In sweep N, help with sweep N.
458	//
459	// At this point we can begin a full cycle N+1.
460	//
461	// 3. Trigger cycle N+1 by starting sweep termination N+1.
462	//
463	// 4. Wait for mark termination N+1 to complete.
464	//
465	// 5. Help with sweep N+1 until it's done.
466	//
467	// This all has to be written to deal with the fact that the
468	// GC may move ahead on its own. For example, when we block
469	// until mark termination N, we may wake up in cycle N+2.
470
471	// Wait until the current sweep termination, mark, and mark
472	// termination complete.
473	n := work.cycles.Load()
474	gcWaitOnMark(n)
475
476	// We're now in sweep N or later. Trigger GC cycle N+1, which
477	// will first finish sweep N if necessary and then enter sweep
478	// termination N+1.
479	gcStart(gcTrigger{kind: gcTriggerCycle, n: n + 1})
480
481	// Wait for mark termination N+1 to complete.
482	gcWaitOnMark(n + 1)
483
484	// Finish sweep N+1 before returning. We do this both to
485	// complete the cycle and because runtime.GC() is often used
486	// as part of tests and benchmarks to get the system into a
487	// relatively stable and isolated state.
488	for work.cycles.Load() == n+1 && sweepone() != ^uintptr(0) {
489		Gosched()
490	}
491
492	// Callers may assume that the heap profile reflects the
493	// just-completed cycle when this returns (historically this
494	// happened because this was a STW GC), but right now the
495	// profile still reflects mark termination N, not N+1.
496	//
497	// As soon as all of the sweep frees from cycle N+1 are done,
498	// we can go ahead and publish the heap profile.
499	//
500	// First, wait for sweeping to finish. (We know there are no
501	// more spans on the sweep queue, but we may be concurrently
502	// sweeping spans, so we have to wait.)
503	for work.cycles.Load() == n+1 && !isSweepDone() {
504		Gosched()
505	}
506
507	// Now we're really done with sweeping, so we can publish the
508	// stable heap profile. Only do this if we haven't already hit
509	// another mark termination.
510	mp := acquirem()
511	cycle := work.cycles.Load()
512	if cycle == n+1 || (gcphase == _GCmark && cycle == n+2) {
513		mProf_PostSweep()
514	}
515	releasem(mp)
516}
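
// For example, a test or benchmark might call GC to reach a relatively stable
// heap before taking measurements (illustrative user code, not part of the
// runtime):
//
//	runtime.GC()
//	var ms runtime.MemStats
//	runtime.ReadMemStats(&ms)
//	// ms now reflects the heap after the completed cycle's sweep.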
517
518// gcWaitOnMark blocks until GC finishes the Nth mark phase. If GC has
519// already completed this mark phase, it returns immediately.
520func gcWaitOnMark(n uint32) {
521	for {
522		// Disable phase transitions.
523		lock(&work.sweepWaiters.lock)
524		nMarks := work.cycles.Load()
525		if gcphase != _GCmark {
526			// We've already completed this cycle's mark.
527			nMarks++
528		}
529		if nMarks > n {
530			// We're done.
531			unlock(&work.sweepWaiters.lock)
532			return
533		}
534
535		// Wait until sweep termination, mark, and mark
536		// termination of cycle N complete.
537		work.sweepWaiters.list.push(getg())
538		goparkunlock(&work.sweepWaiters.lock, waitReasonWaitForGCCycle, traceBlockUntilGCEnds, 1)
539	}
540}
541
542// gcMode indicates how concurrent a GC cycle should be.
543type gcMode int
544
545const (
546	gcBackgroundMode gcMode = iota // concurrent GC and sweep
547	gcForceMode                    // stop-the-world GC now, concurrent sweep
548	gcForceBlockMode               // stop-the-world GC now and STW sweep (forced by user)
549)
550
551// A gcTrigger is a predicate for starting a GC cycle. Specifically,
552// it is an exit condition for the _GCoff phase.
553type gcTrigger struct {
554	kind gcTriggerKind
555	now  int64  // gcTriggerTime: current time
556	n    uint32 // gcTriggerCycle: cycle number to start
557}
558
559type gcTriggerKind int
560
561const (
562	// gcTriggerHeap indicates that a cycle should be started when
563	// the heap size reaches the trigger heap size computed by the
564	// controller.
565	gcTriggerHeap gcTriggerKind = iota
566
567	// gcTriggerTime indicates that a cycle should be started when
568	// it's been more than forcegcperiod nanoseconds since the
569	// previous GC cycle.
570	gcTriggerTime
571
572	// gcTriggerCycle indicates that a cycle should be started if
573	// we have not yet started cycle number gcTrigger.n (relative
574	// to work.cycles).
575	gcTriggerCycle
576)
577
578// test reports whether the trigger condition is satisfied, meaning
579// that the exit condition for the _GCoff phase has been met. The exit
580// condition should be tested when allocating.
581func (t gcTrigger) test() bool {
582	if !memstats.enablegc || panicking.Load() != 0 || gcphase != _GCoff {
583		return false
584	}
585	switch t.kind {
586	case gcTriggerHeap:
587		trigger, _ := gcController.trigger()
588		return gcController.heapLive.Load() >= trigger
589	case gcTriggerTime:
590		if gcController.gcPercent.Load() < 0 {
591			return false
592		}
593		lastgc := int64(atomic.Load64(&memstats.last_gc_nanotime))
594		return lastgc != 0 && t.now-lastgc > forcegcperiod
595	case gcTriggerCycle:
596		// t.n > work.cycles, but accounting for wraparound.
597		return int32(t.n-work.cycles.Load()) > 0
598	}
599	return true
600}
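
// For example, the signed comparison in the gcTriggerCycle case stays correct
// across uint32 wraparound: with work.cycles = 0xffffffff and t.n = 2,
// int32(t.n-work.cycles) = 3 > 0 and the trigger fires, while with
// work.cycles = 2 and t.n = 0xfffffffe the difference is negative and the
// trigger does not fire.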
601
602// gcStart starts the GC. It transitions from _GCoff to _GCmark (if
603// debug.gcstoptheworld == 0) or performs all of GC (if
604// debug.gcstoptheworld != 0).
605//
606// This may return without performing this transition in some cases,
607// such as when called on a system stack or with locks held.
608func gcStart(trigger gcTrigger) {
609	// Since this is called from malloc and malloc is called in
610	// the guts of a number of libraries that might be holding
611	// locks, don't attempt to start GC in non-preemptible or
612	// potentially unstable situations.
613	mp := acquirem()
614	if gp := getg(); gp == mp.g0 || mp.locks > 1 || mp.preemptoff != "" {
615		releasem(mp)
616		return
617	}
618	releasem(mp)
619	mp = nil
620
621	// Pick up the remaining unswept/not being swept spans concurrently
622	//
623	// This shouldn't happen if we're being invoked in background
624	// mode since proportional sweep should have just finished
625	// sweeping everything, but rounding errors, etc, may leave a
626	// few spans unswept. In forced mode, this is necessary since
627	// GC can be forced at any point in the sweeping cycle.
628	//
629	// We check the transition condition continuously here in case
	// this G gets delayed into the next GC cycle.
631	for trigger.test() && sweepone() != ^uintptr(0) {
632	}
633
634	// Perform GC initialization and the sweep termination
635	// transition.
636	semacquire(&work.startSema)
637	// Re-check transition condition under transition lock.
638	if !trigger.test() {
639		semrelease(&work.startSema)
640		return
641	}
642
643	// In gcstoptheworld debug mode, upgrade the mode accordingly.
644	// We do this after re-checking the transition condition so
645	// that multiple goroutines that detect the heap trigger don't
646	// start multiple STW GCs.
647	mode := gcBackgroundMode
648	if debug.gcstoptheworld == 1 {
649		mode = gcForceMode
650	} else if debug.gcstoptheworld == 2 {
651		mode = gcForceBlockMode
652	}
653
654	// Ok, we're doing it! Stop everybody else
655	semacquire(&gcsema)
656	semacquire(&worldsema)
657
658	// For stats, check if this GC was forced by the user.
659	// Update it under gcsema to avoid gctrace getting wrong values.
660	work.userForced = trigger.kind == gcTriggerCycle
661
662	trace := traceAcquire()
663	if trace.ok() {
664		trace.GCStart()
665		traceRelease(trace)
666	}
667
668	// Check that all Ps have finished deferred mcache flushes.
669	for _, p := range allp {
670		if fg := p.mcache.flushGen.Load(); fg != mheap_.sweepgen {
671			println("runtime: p", p.id, "flushGen", fg, "!= sweepgen", mheap_.sweepgen)
672			throw("p mcache not flushed")
673		}
674	}
675
676	gcBgMarkStartWorkers()
677
678	systemstack(gcResetMarkState)
679
680	work.stwprocs, work.maxprocs = gomaxprocs, gomaxprocs
681	if work.stwprocs > ncpu {
682		// This is used to compute CPU time of the STW phases,
683		// so it can't be more than ncpu, even if GOMAXPROCS is.
684		work.stwprocs = ncpu
685	}
686	work.heap0 = gcController.heapLive.Load()
687	work.pauseNS = 0
688	work.mode = mode
689
690	now := nanotime()
691	work.tSweepTerm = now
692	var stw worldStop
693	systemstack(func() {
694		stw = stopTheWorldWithSema(stwGCSweepTerm)
695	})
696
697	// Accumulate fine-grained stopping time.
698	work.cpuStats.accumulateGCPauseTime(stw.stoppingCPUTime, 1)
699
700	// Finish sweep before we start concurrent scan.
701	systemstack(func() {
702		finishsweep_m()
703	})
704
	// clearpools before we start the GC. If we wait, the memory will not be
706	// reclaimed until the next GC cycle.
707	clearpools()
708
709	work.cycles.Add(1)
710
711	// Assists and workers can start the moment we start
712	// the world.
713	gcController.startCycle(now, int(gomaxprocs), trigger)
714
715	// Notify the CPU limiter that assists may begin.
716	gcCPULimiter.startGCTransition(true, now)
717
718	// In STW mode, disable scheduling of user Gs. This may also
719	// disable scheduling of this goroutine, so it may block as
720	// soon as we start the world again.
721	if mode != gcBackgroundMode {
722		schedEnableUser(false)
723	}
724
725	// Enter concurrent mark phase and enable
726	// write barriers.
727	//
728	// Because the world is stopped, all Ps will
729	// observe that write barriers are enabled by
730	// the time we start the world and begin
731	// scanning.
732	//
733	// Write barriers must be enabled before assists are
734	// enabled because they must be enabled before
735	// any non-leaf heap objects are marked. Since
736	// allocations are blocked until assists can
737	// happen, we want to enable assists as early as
738	// possible.
739	setGCPhase(_GCmark)
740
741	gcBgMarkPrepare() // Must happen before assists are enabled.
742	gcMarkRootPrepare()
743
744	// Mark all active tinyalloc blocks. Since we're
745	// allocating from these, they need to be black like
746	// other allocations. The alternative is to blacken
747	// the tiny block on every allocation from it, which
748	// would slow down the tiny allocator.
749	gcMarkTinyAllocs()
750
751	// At this point all Ps have enabled the write
752	// barrier, thus maintaining the no white to
753	// black invariant. Enable mutator assists to
754	// put back-pressure on fast allocating
755	// mutators.
756	atomic.Store(&gcBlackenEnabled, 1)
757
758	// In STW mode, we could block the instant systemstack
759	// returns, so make sure we're not preemptible.
760	mp = acquirem()
761
762	// Update the CPU stats pause time.
763	//
764	// Use maxprocs instead of stwprocs here because the total time
765	// computed in the CPU stats is based on maxprocs, and we want them
766	// to be comparable.
767	work.cpuStats.accumulateGCPauseTime(nanotime()-stw.finishedStopping, work.maxprocs)
768
769	// Concurrent mark.
770	systemstack(func() {
771		now = startTheWorldWithSema(0, stw)
772		work.pauseNS += now - stw.startedStopping
773		work.tMark = now
774
775		// Release the CPU limiter.
776		gcCPULimiter.finishGCTransition(now)
777	})
778
779	// Release the world sema before Gosched() in STW mode
780	// because we will need to reacquire it later but before
781	// this goroutine becomes runnable again, and we could
782	// self-deadlock otherwise.
783	semrelease(&worldsema)
784	releasem(mp)
785
786	// Make sure we block instead of returning to user code
787	// in STW mode.
788	if mode != gcBackgroundMode {
789		Gosched()
790	}
791
792	semrelease(&work.startSema)
793}
794
795// gcMarkDoneFlushed counts the number of P's with flushed work.
796//
797// Ideally this would be a captured local in gcMarkDone, but forEachP
798// escapes its callback closure, so it can't capture anything.
799//
800// This is protected by markDoneSema.
801var gcMarkDoneFlushed uint32
802
803// gcMarkDone transitions the GC from mark to mark termination if all
804// reachable objects have been marked (that is, there are no grey
805// objects and can be no more in the future). Otherwise, it flushes
806// all local work to the global queues where it can be discovered by
807// other workers.
808//
809// This should be called when all local mark work has been drained and
810// there are no remaining workers. Specifically, when
811//
812//	work.nwait == work.nproc && !gcMarkWorkAvailable(p)
813//
814// The calling context must be preemptible.
815//
816// Flushing local work is important because idle Ps may have local
817// work queued. This is the only way to make that work visible and
818// drive GC to completion.
819//
820// It is explicitly okay to have write barriers in this function. If
821// it does transition to mark termination, then all reachable objects
822// have been marked, so the write barrier cannot shade any more
823// objects.
824func gcMarkDone() {
825	// Ensure only one thread is running the ragged barrier at a
826	// time.
827	semacquire(&work.markDoneSema)
828
829top:
830	// Re-check transition condition under transition lock.
831	//
832	// It's critical that this checks the global work queues are
833	// empty before performing the ragged barrier. Otherwise,
834	// there could be global work that a P could take after the P
835	// has passed the ragged barrier.
836	if !(gcphase == _GCmark && work.nwait == work.nproc && !gcMarkWorkAvailable(nil)) {
837		semrelease(&work.markDoneSema)
838		return
839	}
840
841	// forEachP needs worldsema to execute, and we'll need it to
842	// stop the world later, so acquire worldsema now.
843	semacquire(&worldsema)
844
845	// Flush all local buffers and collect flushedWork flags.
846	gcMarkDoneFlushed = 0
847	forEachP(waitReasonGCMarkTermination, func(pp *p) {
848		// Flush the write barrier buffer, since this may add
849		// work to the gcWork.
850		wbBufFlush1(pp)
851
852		// Flush the gcWork, since this may create global work
853		// and set the flushedWork flag.
854		//
855		// TODO(austin): Break up these workbufs to
856		// better distribute work.
857		pp.gcw.dispose()
858		// Collect the flushedWork flag.
859		if pp.gcw.flushedWork {
860			atomic.Xadd(&gcMarkDoneFlushed, 1)
861			pp.gcw.flushedWork = false
862		}
863	})
864
865	if gcMarkDoneFlushed != 0 {
866		// More grey objects were discovered since the
867		// previous termination check, so there may be more
868		// work to do. Keep going. It's possible the
869		// transition condition became true again during the
870		// ragged barrier, so re-check it.
871		semrelease(&worldsema)
872		goto top
873	}
874
875	// There was no global work, no local work, and no Ps
876	// communicated work since we took markDoneSema. Therefore
877	// there are no grey objects and no more objects can be
878	// shaded. Transition to mark termination.
879	now := nanotime()
880	work.tMarkTerm = now
881	getg().m.preemptoff = "gcing"
882	var stw worldStop
883	systemstack(func() {
884		stw = stopTheWorldWithSema(stwGCMarkTerm)
885	})
	// The gcphase is _GCmark; it will transition to _GCmarktermination
	// below. The important thing is that the write barrier remains active until
	// all marking is complete. This includes writes made by the GC.
889
890	// Accumulate fine-grained stopping time.
891	work.cpuStats.accumulateGCPauseTime(stw.stoppingCPUTime, 1)
892
893	// There is sometimes work left over when we enter mark termination due
894	// to write barriers performed after the completion barrier above.
895	// Detect this and resume concurrent mark. This is obviously
896	// unfortunate.
897	//
898	// See issue #27993 for details.
899	//
900	// Switch to the system stack to call wbBufFlush1, though in this case
901	// it doesn't matter because we're non-preemptible anyway.
902	restart := false
903	systemstack(func() {
904		for _, p := range allp {
905			wbBufFlush1(p)
906			if !p.gcw.empty() {
907				restart = true
908				break
909			}
910		}
911	})
912	if restart {
913		getg().m.preemptoff = ""
914		systemstack(func() {
915			// Accumulate the time we were stopped before we had to start again.
916			work.cpuStats.accumulateGCPauseTime(nanotime()-stw.finishedStopping, work.maxprocs)
917
918			// Start the world again.
919			now := startTheWorldWithSema(0, stw)
920			work.pauseNS += now - stw.startedStopping
921		})
922		semrelease(&worldsema)
923		goto top
924	}
925
926	gcComputeStartingStackSize()
927
928	// Disable assists and background workers. We must do
929	// this before waking blocked assists.
930	atomic.Store(&gcBlackenEnabled, 0)
931
932	// Notify the CPU limiter that GC assists will now cease.
933	gcCPULimiter.startGCTransition(false, now)
934
935	// Wake all blocked assists. These will run when we
936	// start the world again.
937	gcWakeAllAssists()
938
939	// Likewise, release the transition lock. Blocked
940	// workers and assists will run when we start the
941	// world again.
942	semrelease(&work.markDoneSema)
943
944	// In STW mode, re-enable user goroutines. These will be
945	// queued to run after we start the world.
946	schedEnableUser(true)
947
	// endCycle depends on all gcWork cache stats being flushed.
	// The termination algorithm above ensured this, up to any
	// allocations made since the ragged barrier.
951	gcController.endCycle(now, int(gomaxprocs), work.userForced)
952
953	// Perform mark termination. This will restart the world.
954	gcMarkTermination(stw)
955}
956
957// World must be stopped and mark assists and background workers must be
958// disabled.
959func gcMarkTermination(stw worldStop) {
960	// Start marktermination (write barrier remains enabled for now).
961	setGCPhase(_GCmarktermination)
962
963	work.heap1 = gcController.heapLive.Load()
964	startTime := nanotime()
965
966	mp := acquirem()
967	mp.preemptoff = "gcing"
968	mp.traceback = 2
969	curgp := mp.curg
970	// N.B. The execution tracer is not aware of this status
971	// transition and handles it specially based on the
972	// wait reason.
973	casGToWaitingForGC(curgp, _Grunning, waitReasonGarbageCollection)
974
975	// Run gc on the g0 stack. We do this so that the g stack
976	// we're currently running on will no longer change. Cuts
977	// the root set down a bit (g0 stacks are not scanned, and
978	// we don't need to scan gc's internal state).  We also
979	// need to switch to g0 so we can shrink the stack.
980	systemstack(func() {
981		gcMark(startTime)
982		// Must return immediately.
983		// The outer function's stack may have moved
984		// during gcMark (it shrinks stacks, including the
985		// outer function's stack), so we must not refer
986		// to any of its variables. Return back to the
987		// non-system stack to pick up the new addresses
988		// before continuing.
989	})
990
991	var stwSwept bool
992	systemstack(func() {
993		work.heap2 = work.bytesMarked
994		if debug.gccheckmark > 0 {
995			// Run a full non-parallel, stop-the-world
996			// mark using checkmark bits, to check that we
997			// didn't forget to mark anything during the
998			// concurrent mark process.
999			startCheckmarks()
1000			gcResetMarkState()
1001			gcw := &getg().m.p.ptr().gcw
1002			gcDrain(gcw, 0)
1003			wbBufFlush1(getg().m.p.ptr())
1004			gcw.dispose()
1005			endCheckmarks()
1006		}
1007
1008		// marking is complete so we can turn the write barrier off
1009		setGCPhase(_GCoff)
1010		stwSwept = gcSweep(work.mode)
1011	})
1012
1013	mp.traceback = 0
1014	casgstatus(curgp, _Gwaiting, _Grunning)
1015
1016	trace := traceAcquire()
1017	if trace.ok() {
1018		trace.GCDone()
1019		traceRelease(trace)
1020	}
1021
1022	// all done
1023	mp.preemptoff = ""
1024
1025	if gcphase != _GCoff {
1026		throw("gc done but gcphase != _GCoff")
1027	}
1028
1029	// Record heapInUse for scavenger.
1030	memstats.lastHeapInUse = gcController.heapInUse.load()
1031
1032	// Update GC trigger and pacing, as well as downstream consumers
1033	// of this pacing information, for the next cycle.
1034	systemstack(gcControllerCommit)
1035
1036	// Update timing memstats
1037	now := nanotime()
1038	sec, nsec, _ := time_now()
1039	unixNow := sec*1e9 + int64(nsec)
1040	work.pauseNS += now - stw.startedStopping
1041	work.tEnd = now
1042	atomic.Store64(&memstats.last_gc_unix, uint64(unixNow)) // must be Unix time to make sense to user
1043	atomic.Store64(&memstats.last_gc_nanotime, uint64(now)) // monotonic time for us
1044	memstats.pause_ns[memstats.numgc%uint32(len(memstats.pause_ns))] = uint64(work.pauseNS)
1045	memstats.pause_end[memstats.numgc%uint32(len(memstats.pause_end))] = uint64(unixNow)
1046	memstats.pause_total_ns += uint64(work.pauseNS)
1047
1048	// Accumulate CPU stats.
1049	//
1050	// Use maxprocs instead of stwprocs for GC pause time because the total time
1051	// computed in the CPU stats is based on maxprocs, and we want them to be
1052	// comparable.
1053	//
1054	// Pass gcMarkPhase=true to accumulate so we can get all the latest GC CPU stats
1055	// in there too.
1056	work.cpuStats.accumulateGCPauseTime(now-stw.finishedStopping, work.maxprocs)
1057	work.cpuStats.accumulate(now, true)
1058
1059	// Compute overall GC CPU utilization.
1060	// Omit idle marking time from the overall utilization here since it's "free".
1061	memstats.gc_cpu_fraction = float64(work.cpuStats.GCTotalTime-work.cpuStats.GCIdleTime) / float64(work.cpuStats.TotalTime)
1062
1063	// Reset assist time and background time stats.
1064	//
1065	// Do this now, instead of at the start of the next GC cycle, because
1066	// these two may keep accumulating even if the GC is not active.
1067	scavenge.assistTime.Store(0)
1068	scavenge.backgroundTime.Store(0)
1069
1070	// Reset idle time stat.
1071	sched.idleTime.Store(0)
1072
1073	if work.userForced {
1074		memstats.numforcedgc++
1075	}
1076
1077	// Bump GC cycle count and wake goroutines waiting on sweep.
1078	lock(&work.sweepWaiters.lock)
1079	memstats.numgc++
1080	injectglist(&work.sweepWaiters.list)
1081	unlock(&work.sweepWaiters.lock)
1082
1083	// Increment the scavenge generation now.
1084	//
1085	// This moment represents peak heap in use because we're
1086	// about to start sweeping.
1087	mheap_.pages.scav.index.nextGen()
1088
1089	// Release the CPU limiter.
1090	gcCPULimiter.finishGCTransition(now)
1091
1092	// Finish the current heap profiling cycle and start a new
1093	// heap profiling cycle. We do this before starting the world
1094	// so events don't leak into the wrong cycle.
1095	mProf_NextCycle()
1096
1097	// There may be stale spans in mcaches that need to be swept.
1098	// Those aren't tracked in any sweep lists, so we need to
1099	// count them against sweep completion until we ensure all
1100	// those spans have been forced out.
1101	//
1102	// If gcSweep fully swept the heap (for example if the sweep
1103	// is not concurrent due to a GODEBUG setting), then we expect
1104	// the sweepLocker to be invalid, since sweeping is done.
1105	//
1106	// N.B. Below we might duplicate some work from gcSweep; this is
1107	// fine as all that work is idempotent within a GC cycle, and
1108	// we're still holding worldsema so a new cycle can't start.
1109	sl := sweep.active.begin()
1110	if !stwSwept && !sl.valid {
1111		throw("failed to set sweep barrier")
1112	} else if stwSwept && sl.valid {
1113		throw("non-concurrent sweep failed to drain all sweep queues")
1114	}
1115
1116	systemstack(func() {
1117		// The memstats updated above must be updated with the world
1118		// stopped to ensure consistency of some values, such as
1119		// sched.idleTime and sched.totaltime. memstats also include
		// the pause time (work.pauseNS), forcing computation of the
1121		// total pause time before the pause actually ends.
1122		//
1123		// Here we reuse the same now for start the world so that the
1124		// time added to /sched/pauses/total/gc:seconds will be
1125		// consistent with the value in memstats.
1126		startTheWorldWithSema(now, stw)
1127	})
1128
1129	// Flush the heap profile so we can start a new cycle next GC.
1130	// This is relatively expensive, so we don't do it with the
1131	// world stopped.
1132	mProf_Flush()
1133
1134	// Prepare workbufs for freeing by the sweeper. We do this
1135	// asynchronously because it can take non-trivial time.
1136	prepareFreeWorkbufs()
1137
1138	// Free stack spans. This must be done between GC cycles.
1139	systemstack(freeStackSpans)
1140
1141	// Ensure all mcaches are flushed. Each P will flush its own
1142	// mcache before allocating, but idle Ps may not. Since this
1143	// is necessary to sweep all spans, we need to ensure all
1144	// mcaches are flushed before we start the next GC cycle.
1145	//
1146	// While we're here, flush the page cache for idle Ps to avoid
1147	// having pages get stuck on them. These pages are hidden from
1148	// the scavenger, so in small idle heaps a significant amount
1149	// of additional memory might be held onto.
1150	//
1151	// Also, flush the pinner cache, to avoid leaking that memory
1152	// indefinitely.
1153	forEachP(waitReasonFlushProcCaches, func(pp *p) {
1154		pp.mcache.prepareForSweep()
1155		if pp.status == _Pidle {
1156			systemstack(func() {
1157				lock(&mheap_.lock)
1158				pp.pcache.flush(&mheap_.pages)
1159				unlock(&mheap_.lock)
1160			})
1161		}
1162		pp.pinnerCache = nil
1163	})
1164	if sl.valid {
1165		// Now that we've swept stale spans in mcaches, they don't
1166		// count against unswept spans.
1167		//
1168		// Note: this sweepLocker may not be valid if sweeping had
1169		// already completed during the STW. See the corresponding
1170		// begin() call that produced sl.
1171		sweep.active.end(sl)
1172	}
1173
1174	// Print gctrace before dropping worldsema. As soon as we drop
1175	// worldsema another cycle could start and smash the stats
1176	// we're trying to print.
1177	if debug.gctrace > 0 {
1178		util := int(memstats.gc_cpu_fraction * 100)
1179
1180		var sbuf [24]byte
1181		printlock()
1182		print("gc ", memstats.numgc,
1183			" @", string(itoaDiv(sbuf[:], uint64(work.tSweepTerm-runtimeInitTime)/1e6, 3)), "s ",
1184			util, "%: ")
1185		prev := work.tSweepTerm
1186		for i, ns := range []int64{work.tMark, work.tMarkTerm, work.tEnd} {
1187			if i != 0 {
1188				print("+")
1189			}
1190			print(string(fmtNSAsMS(sbuf[:], uint64(ns-prev))))
1191			prev = ns
1192		}
1193		print(" ms clock, ")
1194		for i, ns := range []int64{
1195			int64(work.stwprocs) * (work.tMark - work.tSweepTerm),
1196			gcController.assistTime.Load(),
1197			gcController.dedicatedMarkTime.Load() + gcController.fractionalMarkTime.Load(),
1198			gcController.idleMarkTime.Load(),
1199			int64(work.stwprocs) * (work.tEnd - work.tMarkTerm),
1200		} {
1201			if i == 2 || i == 3 {
1202				// Separate mark time components with /.
1203				print("/")
1204			} else if i != 0 {
1205				print("+")
1206			}
1207			print(string(fmtNSAsMS(sbuf[:], uint64(ns))))
1208		}
1209		print(" ms cpu, ",
1210			work.heap0>>20, "->", work.heap1>>20, "->", work.heap2>>20, " MB, ",
1211			gcController.lastHeapGoal>>20, " MB goal, ",
1212			gcController.lastStackScan.Load()>>20, " MB stacks, ",
1213			gcController.globalsScan.Load()>>20, " MB globals, ",
1214			work.maxprocs, " P")
1215		if work.userForced {
1216			print(" (forced)")
1217		}
1218		print("\n")
1219		printunlock()
1220	}
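
	// An illustrative line produced by the code above (numbers are made up):
	//
	//	gc 7 @2.104s 1%: 0.12+1.9+0.021 ms clock, 0.96+0.45/3.8/0.043+0.17 ms cpu, 12->13->6 MB, 14 MB goal, 0 MB stacks, 0 MB globals, 8 P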
1221
1222	// Set any arena chunks that were deferred to fault.
1223	lock(&userArenaState.lock)
1224	faultList := userArenaState.fault
1225	userArenaState.fault = nil
1226	unlock(&userArenaState.lock)
1227	for _, lc := range faultList {
1228		lc.mspan.setUserArenaChunkToFault()
1229	}
1230
1231	// Enable huge pages on some metadata if we cross a heap threshold.
1232	if gcController.heapGoal() > minHeapForMetadataHugePages {
1233		systemstack(func() {
1234			mheap_.enableMetadataHugePages()
1235		})
1236	}
1237
1238	semrelease(&worldsema)
1239	semrelease(&gcsema)
1240	// Careful: another GC cycle may start now.
1241
1242	releasem(mp)
1243	mp = nil
1244
1245	// now that gc is done, kick off finalizer thread if needed
1246	if !concurrentSweep {
1247		// give the queued finalizers, if any, a chance to run
1248		Gosched()
1249	}
1250}
1251
1252// gcBgMarkStartWorkers prepares background mark worker goroutines. These
// goroutines will not run until the mark phase, but they must be started while
// the world is not stopped and from a regular G stack. The caller must hold
1255// worldsema.
1256func gcBgMarkStartWorkers() {
1257	// Background marking is performed by per-P G's. Ensure that each P has
1258	// a background GC G.
1259	//
1260	// Worker Gs don't exit if gomaxprocs is reduced. If it is raised
1261	// again, we can reuse the old workers; no need to create new workers.
1262	if gcBgMarkWorkerCount >= gomaxprocs {
1263		return
1264	}
1265
1266	// Increment mp.locks when allocating. We are called within gcStart,
1267	// and thus must not trigger another gcStart via an allocation. gcStart
1268	// bails when allocating with locks held, so simulate that for these
1269	// allocations.
1270	//
1271	// TODO(prattmic): cleanup gcStart to use a more explicit "in gcStart"
1272	// check for bailing.
1273	mp := acquirem()
1274	ready := make(chan struct{}, 1)
1275	releasem(mp)
1276
1277	for gcBgMarkWorkerCount < gomaxprocs {
1278		mp := acquirem() // See above, we allocate a closure here.
1279		go gcBgMarkWorker(ready)
1280		releasem(mp)
1281
1282		// N.B. we intentionally wait on each goroutine individually
1283		// rather than starting all in a batch and then waiting once
1284		// afterwards. By running one goroutine at a time, we can take
1285		// advantage of runnext to bounce back and forth between
1286		// workers and this goroutine. In an overloaded application,
1287		// this can reduce GC start latency by prioritizing these
1288		// goroutines rather than waiting on the end of the run queue.
1289		<-ready
1290		// The worker is now guaranteed to be added to the pool before
1291		// its P's next findRunnableGCWorker.
1292
1293		gcBgMarkWorkerCount++
1294	}
1295}
1296
1297// gcBgMarkPrepare sets up state for background marking.
1298// Mutator assists must not yet be enabled.
1299func gcBgMarkPrepare() {
1300	// Background marking will stop when the work queues are empty
1301	// and there are no more workers (note that, since this is
1302	// concurrent, this may be a transient state, but mark
1303	// termination will clean it up). Between background workers
1304	// and assists, we don't really know how many workers there
1305	// will be, so we pretend to have an arbitrarily large number
1306	// of workers, almost all of which are "waiting". While a
1307	// worker is working it decrements nwait. If nproc == nwait,
1308	// there are no workers.
1309	work.nproc = ^uint32(0)
1310	work.nwait = ^uint32(0)
1311}
1312
1313// gcBgMarkWorkerNode is an entry in the gcBgMarkWorkerPool. It points to a single
1314// gcBgMarkWorker goroutine.
1315type gcBgMarkWorkerNode struct {
1316	// Unused workers are managed in a lock-free stack. This field must be first.
1317	node lfnode
1318
1319	// The g of this worker.
1320	gp guintptr
1321
1322	// Release this m on park. This is used to communicate with the unlock
1323	// function, which cannot access the G's stack. It is unused outside of
1324	// gcBgMarkWorker().
1325	m muintptr
1326}
1327
1328func gcBgMarkWorker(ready chan struct{}) {
1329	gp := getg()
1330
1331	// We pass node to a gopark unlock function, so it can't be on
1332	// the stack (see gopark). Prevent deadlock from recursively
1333	// starting GC by disabling preemption.
1334	gp.m.preemptoff = "GC worker init"
1335	node := new(gcBgMarkWorkerNode)
1336	gp.m.preemptoff = ""
1337
1338	node.gp.set(gp)
1339
1340	node.m.set(acquirem())
1341
1342	ready <- struct{}{}
1343	// After this point, the background mark worker is generally scheduled
1344	// cooperatively by gcController.findRunnableGCWorker. While performing
1345	// work on the P, preemption is disabled because we are working on
1346	// P-local work buffers. When the preempt flag is set, this puts itself
1347	// into _Gwaiting to be woken up by gcController.findRunnableGCWorker
1348	// at the appropriate time.
1349	//
1350	// When preemption is enabled (e.g., while in gcMarkDone), this worker
1351	// may be preempted and schedule as a _Grunnable G from a runq. That is
1352	// fine; it will eventually gopark again for further scheduling via
1353	// findRunnableGCWorker.
1354	//
1355	// Since we disable preemption before notifying ready, we guarantee that
1356	// this G will be in the worker pool for the next findRunnableGCWorker.
1357	// This isn't strictly necessary, but it reduces latency between
1358	// _GCmark starting and the workers starting.
1359
1360	for {
1361		// Go to sleep until woken by
1362		// gcController.findRunnableGCWorker.
1363		gopark(func(g *g, nodep unsafe.Pointer) bool {
1364			node := (*gcBgMarkWorkerNode)(nodep)
1365
1366			if mp := node.m.ptr(); mp != nil {
1367				// The worker G is no longer running; release
1368				// the M.
1369				//
1370				// N.B. it is _safe_ to release the M as soon
1371				// as we are no longer performing P-local mark
1372				// work.
1373				//
1374				// However, since we cooperatively stop work
1375				// when gp.preempt is set, if we releasem in
1376				// the loop then the following call to gopark
1377				// would immediately preempt the G. This is
1378				// also safe, but inefficient: the G must
1379				// schedule again only to enter gopark and park
1380				// again. Thus, we defer the release until
1381				// after parking the G.
1382				releasem(mp)
1383			}
1384
1385			// Release this G to the pool.
1386			gcBgMarkWorkerPool.push(&node.node)
1387			// Note that at this point, the G may immediately be
1388			// rescheduled and may be running.
1389			return true
1390		}, unsafe.Pointer(node), waitReasonGCWorkerIdle, traceBlockSystemGoroutine, 0)
1391
1392		// Preemption must not occur here, or another G might see
1393		// p.gcMarkWorkerMode.
1394
1395		// Disable preemption so we can use the gcw. If the
1396		// scheduler wants to preempt us, we'll stop draining,
1397		// dispose the gcw, and then preempt.
1398		node.m.set(acquirem())
1399		pp := gp.m.p.ptr() // P can't change with preemption disabled.
1400
1401		if gcBlackenEnabled == 0 {
1402			println("worker mode", pp.gcMarkWorkerMode)
1403			throw("gcBgMarkWorker: blackening not enabled")
1404		}
1405
1406		if pp.gcMarkWorkerMode == gcMarkWorkerNotWorker {
1407			throw("gcBgMarkWorker: mode not set")
1408		}
1409
1410		startTime := nanotime()
1411		pp.gcMarkWorkerStartTime = startTime
1412		var trackLimiterEvent bool
1413		if pp.gcMarkWorkerMode == gcMarkWorkerIdleMode {
1414			trackLimiterEvent = pp.limiterEvent.start(limiterEventIdleMarkWork, startTime)
1415		}
1416
1417		decnwait := atomic.Xadd(&work.nwait, -1)
1418		if decnwait == work.nproc {
1419			println("runtime: work.nwait=", decnwait, "work.nproc=", work.nproc)
1420			throw("work.nwait was > work.nproc")
1421		}
1422
1423		systemstack(func() {
1424			// Mark our goroutine preemptible so its stack
1425			// can be scanned. This lets two mark workers
1426			// scan each other (otherwise, they would
1427			// deadlock). We must not modify anything on
1428			// the G stack. However, stack shrinking is
1429			// disabled for mark workers, so it is safe to
1430			// read from the G stack.
1431			//
1432			// N.B. The execution tracer is not aware of this status
1433			// transition and handles it specially based on the
1434			// wait reason.
1435			casGToWaitingForGC(gp, _Grunning, waitReasonGCWorkerActive)
1436			switch pp.gcMarkWorkerMode {
1437			default:
1438				throw("gcBgMarkWorker: unexpected gcMarkWorkerMode")
1439			case gcMarkWorkerDedicatedMode:
1440				gcDrainMarkWorkerDedicated(&pp.gcw, true)
1441				if gp.preempt {
1442					// We were preempted. This is
1443					// a useful signal to kick
1444					// everything out of the run
1445					// queue so it can run
1446					// somewhere else.
1447					if drainQ, n := runqdrain(pp); n > 0 {
1448						lock(&sched.lock)
1449						globrunqputbatch(&drainQ, int32(n))
1450						unlock(&sched.lock)
1451					}
1452				}
1453				// Go back to draining, this time
1454				// without preemption.
1455				gcDrainMarkWorkerDedicated(&pp.gcw, false)
1456			case gcMarkWorkerFractionalMode:
1457				gcDrainMarkWorkerFractional(&pp.gcw)
1458			case gcMarkWorkerIdleMode:
1459				gcDrainMarkWorkerIdle(&pp.gcw)
1460			}
1461			casgstatus(gp, _Gwaiting, _Grunning)
1462		})
1463
1464		// Account for time and mark us as stopped.
1465		now := nanotime()
1466		duration := now - startTime
1467		gcController.markWorkerStop(pp.gcMarkWorkerMode, duration)
1468		if trackLimiterEvent {
1469			pp.limiterEvent.stop(limiterEventIdleMarkWork, now)
1470		}
1471		if pp.gcMarkWorkerMode == gcMarkWorkerFractionalMode {
1472			atomic.Xaddint64(&pp.gcFractionalMarkTime, duration)
1473		}
1474
1475		// Was this the last worker and did we run out
1476		// of work?
1477		incnwait := atomic.Xadd(&work.nwait, +1)
1478		if incnwait > work.nproc {
1479			println("runtime: p.gcMarkWorkerMode=", pp.gcMarkWorkerMode,
1480				"work.nwait=", incnwait, "work.nproc=", work.nproc)
1481			throw("work.nwait > work.nproc")
1482		}
1483
1484		// We'll releasem after this point and thus this P may run
1485		// something else. We must clear the worker mode to avoid
1486		// attributing the mode to a different (non-worker) G in
1487		// traceGoStart.
1488		pp.gcMarkWorkerMode = gcMarkWorkerNotWorker
1489
1490		// If this worker reached a background mark completion
1491		// point, signal the main GC goroutine.
1492		if incnwait == work.nproc && !gcMarkWorkAvailable(nil) {
1493			// We don't need the P-local buffers here, allow
1494			// preemption because we may schedule like a regular
1495			// goroutine in gcMarkDone (block on locks, etc).
1496			releasem(node.m.ptr())
1497			node.m.set(nil)
1498
1499			gcMarkDone()
1500		}
1501	}
1502}
1503
1504// gcMarkWorkAvailable reports whether executing a mark worker
1505// on p is potentially useful. p may be nil, in which case it only
1506// checks the global sources of work.
1507func gcMarkWorkAvailable(p *p) bool {
1508	if p != nil && !p.gcw.empty() {
1509		return true
1510	}
1511	if !work.full.empty() {
1512		return true // global work available
1513	}
1514	if work.markrootNext < work.markrootJobs {
1515		return true // root scan work available
1516	}
1517	return false
1518}
1519
// gcMark runs the mark (or, for concurrent GC, mark termination).
// All gcWork caches must be empty.
// STW is in effect at this point.
1523func gcMark(startTime int64) {
1524	if gcphase != _GCmarktermination {
1525		throw("in gcMark expecting to see gcphase as _GCmarktermination")
1526	}
1527	work.tstart = startTime
1528
1529	// Check that there's no marking work remaining.
1530	if work.full != 0 || work.markrootNext < work.markrootJobs {
1531		print("runtime: full=", hex(work.full), " next=", work.markrootNext, " jobs=", work.markrootJobs, " nDataRoots=", work.nDataRoots, " nBSSRoots=", work.nBSSRoots, " nSpanRoots=", work.nSpanRoots, " nStackRoots=", work.nStackRoots, "\n")
1532		panic("non-empty mark queue after concurrent mark")
1533	}
1534
1535	if debug.gccheckmark > 0 {
1536		// This is expensive when there's a large number of
1537		// Gs, so only do it if checkmark is also enabled.
1538		gcMarkRootCheck()
1539	}
1540
1541	// Drop allg snapshot. allgs may have grown, in which case
1542	// this is the only reference to the old backing store and
1543	// there's no need to keep it around.
1544	work.stackRoots = nil
1545
1546	// Clear out buffers and double-check that all gcWork caches
1547	// are empty. This should be ensured by gcMarkDone before we
1548	// enter mark termination.
1549	//
1550	// TODO: We could clear out buffers just before mark if this
1551	// has a non-negligible impact on STW time.
1552	for _, p := range allp {
1553		// The write barrier may have buffered pointers since
1554		// the gcMarkDone barrier. However, since the barrier
1555		// ensured all reachable objects were marked, all of
1556		// these must be pointers to black objects. Hence we
1557		// can just discard the write barrier buffer.
1558		if debug.gccheckmark > 0 {
1559			// For debugging, flush the buffer and make
1560			// sure it really was all marked.
1561			wbBufFlush1(p)
1562		} else {
1563			p.wbBuf.reset()
1564		}
1565
1566		gcw := &p.gcw
1567		if !gcw.empty() {
1568			printlock()
1569			print("runtime: P ", p.id, " flushedWork ", gcw.flushedWork)
1570			if gcw.wbuf1 == nil {
1571				print(" wbuf1=<nil>")
1572			} else {
1573				print(" wbuf1.n=", gcw.wbuf1.nobj)
1574			}
1575			if gcw.wbuf2 == nil {
1576				print(" wbuf2=<nil>")
1577			} else {
1578				print(" wbuf2.n=", gcw.wbuf2.nobj)
1579			}
1580			print("\n")
1581			throw("P has cached GC work at end of mark termination")
1582		}
1583		// There may still be cached empty buffers, which we
1584		// need to flush since we're going to free them. Also,
1585		// there may be non-zero stats because we allocated
1586		// black after the gcMarkDone barrier.
1587		gcw.dispose()
1588	}
1589
1590	// Flush scanAlloc from each mcache since we're about to modify
1591	// heapScan directly. If we were to flush this later, then scanAlloc
1592	// might have incorrect information.
1593	//
1594	// Note that it's not important to retain this information; we know
1595	// exactly what heapScan is at this point via scanWork.
1596	for _, p := range allp {
1597		c := p.mcache
1598		if c == nil {
1599			continue
1600		}
1601		c.scanAlloc = 0
1602	}
1603
1604	// Reset controller state.
1605	gcController.resetLive(work.bytesMarked)
1606}
1607
1608// gcSweep must be called on the system stack because it acquires the heap
1609// lock. See mheap for details.
1610//
1611// Returns true if the heap was fully swept by this function.
1612//
1613// The world must be stopped.
1614//
1615//go:systemstack
1616func gcSweep(mode gcMode) bool {
1617	assertWorldStopped()
1618
1619	if gcphase != _GCoff {
1620		throw("gcSweep being done but phase is not GCoff")
1621	}
1622
1623	lock(&mheap_.lock)
1624	mheap_.sweepgen += 2
1625	sweep.active.reset()
1626	mheap_.pagesSwept.Store(0)
1627	mheap_.sweepArenas = mheap_.allArenas
1628	mheap_.reclaimIndex.Store(0)
1629	mheap_.reclaimCredit.Store(0)
1630	unlock(&mheap_.lock)
1631
1632	sweep.centralIndex.clear()
1633
1634	if !concurrentSweep || mode == gcForceBlockMode {
1635		// Special case synchronous sweep.
1636		// Record that no proportional sweeping has to happen.
1637		lock(&mheap_.lock)
1638		mheap_.sweepPagesPerByte = 0
1639		unlock(&mheap_.lock)
1640		// Flush all mcaches.
1641		for _, pp := range allp {
1642			pp.mcache.prepareForSweep()
1643		}
1644		// Sweep all spans eagerly.
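		// sweepone sweeps one span at a time and returns ^uintptr(0)
		// once there is nothing left to sweep.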
1645		for sweepone() != ^uintptr(0) {
1646		}
1647		// Free workbufs eagerly.
1648		prepareFreeWorkbufs()
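		// freeSomeWbufs frees a batch of workbufs at a time and
		// reports whether more remain.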
1649		for freeSomeWbufs(false) {
1650		}
1651		// All "free" events for this mark/sweep cycle have
1652		// now happened, so we can make this profile cycle
1653		// available immediately.
1654		mProf_NextCycle()
1655		mProf_Flush()
1656		return true
1657	}
1658
1659	// Background sweep.
1660	lock(&sweep.lock)
1661	if sweep.parked {
1662		sweep.parked = false
1663		ready(sweep.g, 0, true)
1664	}
1665	unlock(&sweep.lock)
1666	return false
1667}
1668
1669// gcResetMarkState resets global state prior to marking (concurrent
1670// or STW) and resets the stack scan state of all Gs.
1671//
1672// This is safe to do without the world stopped because any Gs created
1673// during or after this will start out in the reset state.
1674//
1675// gcResetMarkState must be called on the system stack because it acquires
1676// the heap lock. See mheap for details.
1677//
1678//go:systemstack
1679func gcResetMarkState() {
1680	// This may be called during a concurrent phase, so lock to make sure
1681	// allgs doesn't change.
1682	forEachG(func(gp *g) {
		gp.gcscandone = false // set to true once this G's stack has been scanned
1684		gp.gcAssistBytes = 0
1685	})
1686
1687	// Clear page marks. This is just 1MB per 64GB of heap, so the
1688	// time here is pretty trivial.
1689	lock(&mheap_.lock)
1690	arenas := mheap_.allArenas
1691	unlock(&mheap_.lock)
1692	for _, ai := range arenas {
1693		ha := mheap_.arenas[ai.l1()][ai.l2()]
1694		clear(ha.pageMarks[:])
1695	}
1696
1697	work.bytesMarked = 0
1698	work.initialHeapLive = gcController.heapLive.Load()
1699}
1700
1701// Hooks for other packages
1702
1703var poolcleanup func()
1704var boringCaches []unsafe.Pointer  // for crypto/internal/boring
1705var uniqueMapCleanup chan struct{} // for unique
1706
1707// sync_runtime_registerPoolCleanup should be an internal detail,
1708// but widely used packages access it using linkname.
1709// Notable members of the hall of shame include:
1710//   - github.com/bytedance/gopkg
1711//   - github.com/songzhibin97/gkit
1712//
1713// Do not remove or change the type signature.
1714// See go.dev/issue/67401.
1715//
1716//go:linkname sync_runtime_registerPoolCleanup sync.runtime_registerPoolCleanup
1717func sync_runtime_registerPoolCleanup(f func()) {
1718	poolcleanup = f
1719}
1720
1721//go:linkname boring_registerCache crypto/internal/boring/bcache.registerCache
1722func boring_registerCache(p unsafe.Pointer) {
1723	boringCaches = append(boringCaches, p)
1724}
1725
1726//go:linkname unique_runtime_registerUniqueMapCleanup unique.runtime_registerUniqueMapCleanup
1727func unique_runtime_registerUniqueMapCleanup(f func()) {
1728	// Start the goroutine in the runtime so it's counted as a system goroutine.
1729	uniqueMapCleanup = make(chan struct{}, 1)
1730	go func(cleanup func()) {
1731		for {
1732			<-uniqueMapCleanup
1733			cleanup()
1734		}
1735	}(f)
1736}
1737
1738func clearpools() {
1739	// clear sync.Pools
1740	if poolcleanup != nil {
1741		poolcleanup()
1742	}
1743
1744	// clear boringcrypto caches
1745	for _, p := range boringCaches {
1746		atomicstorep(p, nil)
1747	}
1748
1749	// clear unique maps
1750	if uniqueMapCleanup != nil {
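		// Non-blocking send: the channel has capacity 1, so if a
		// wake-up is already pending, that pending signal covers
		// this request as well.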
1751		select {
1752		case uniqueMapCleanup <- struct{}{}:
1753		default:
1754		}
1755	}
1756
1757	// Clear central sudog cache.
1758	// Leave per-P caches alone, they have strictly bounded size.
1759	// Disconnect cached list before dropping it on the floor,
1760	// so that a dangling ref to one entry does not pin all of them.
1761	lock(&sched.sudoglock)
1762	var sg, sgnext *sudog
1763	for sg = sched.sudogcache; sg != nil; sg = sgnext {
1764		sgnext = sg.next
1765		sg.next = nil
1766	}
1767	sched.sudogcache = nil
1768	unlock(&sched.sudoglock)
1769
1770	// Clear central defer pool.
1771	// Leave per-P pools alone, they have strictly bounded size.
1772	lock(&sched.deferlock)
1773	// disconnect cached list before dropping it on the floor,
1774	// so that a dangling ref to one entry does not pin all of them.
1775	var d, dlink *_defer
1776	for d = sched.deferpool; d != nil; d = dlink {
1777		dlink = d.link
1778		d.link = nil
1779	}
1780	sched.deferpool = nil
1781	unlock(&sched.deferlock)
1782}
1783
1784// Timing
1785
1786// itoaDiv formats val/(10**dec) into buf.
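// For example, itoaDiv(buf, 12345, 3) produces "12.345". buf must be
// large enough to hold the formatted result.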
1787func itoaDiv(buf []byte, val uint64, dec int) []byte {
1788	i := len(buf) - 1
1789	idec := i - dec
1790	for val >= 10 || i >= idec {
1791		buf[i] = byte(val%10 + '0')
1792		i--
1793		if i == idec {
1794			buf[i] = '.'
1795			i--
1796		}
1797		val /= 10
1798	}
1799	buf[i] = byte(val + '0')
1800	return buf[i:]
1801}
1802
1803// fmtNSAsMS nicely formats ns nanoseconds as milliseconds.
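// For example, 1500000ns formats as "1.5" and 12000000ns as "12".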
1804func fmtNSAsMS(buf []byte, ns uint64) []byte {
1805	if ns >= 10e6 {
1806		// Format as whole milliseconds.
1807		return itoaDiv(buf, ns/1e6, 0)
1808	}
1809	// Format two digits of precision, with at most three decimal places.
1810	x := ns / 1e3
1811	if x == 0 {
1812		buf[0] = '0'
1813		return buf[:1]
1814	}
1815	dec := 3
1816	for x >= 100 {
1817		x /= 10
1818		dec--
1819	}
1820	return itoaDiv(buf, x, dec)
1821}
1822
1823// Helpers for testing GC.
1824
1825// gcTestMoveStackOnNextCall causes the stack to be moved on a call
1826// immediately following the call to this. It may not work correctly
1827// if any other work appears after this call (such as returning).
1828// Typically the following call should be marked go:noinline so it
1829// performs a stack check.
1830//
1831// In rare cases this may not cause the stack to move, specifically if
1832// there's a preemption between this call and the next.
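//
// Typical use (sketch; checkAddr is a hypothetical //go:noinline callee):
//
//	gcTestMoveStackOnNextCall()
//	checkAddr() // the stack check in this call moves the stack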
1833func gcTestMoveStackOnNextCall() {
1834	gp := getg()
1835	gp.stackguard0 = stackForceMove
1836}
1837
1838// gcTestIsReachable performs a GC and returns a bit set where bit i
1839// is set if ptrs[i] is reachable.
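// For example, with hypothetical pointers x and y (illustrative only):
//
//	mask := gcTestIsReachable(unsafe.Pointer(x), unsafe.Pointer(y))
//	xReachable := mask&1 != 0 // bit 0 corresponds to x
//	yReachable := mask&2 != 0 // bit 1 corresponds to y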
1840func gcTestIsReachable(ptrs ...unsafe.Pointer) (mask uint64) {
1841	// This takes the pointers as unsafe.Pointers in order to keep
1842	// them live long enough for us to attach specials. After
1843	// that, we drop our references to them.
1844
1845	if len(ptrs) > 64 {
1846		panic("too many pointers for uint64 mask")
1847	}
1848
1849	// Block GC while we attach specials and drop our references
1850	// to ptrs. Otherwise, if a GC is in progress, it could mark
1851	// them reachable via this function before we have a chance to
1852	// drop them.
1853	semacquire(&gcsema)
1854
1855	// Create reachability specials for ptrs.
1856	specials := make([]*specialReachable, len(ptrs))
1857	for i, p := range ptrs {
1858		lock(&mheap_.speciallock)
1859		s := (*specialReachable)(mheap_.specialReachableAlloc.alloc())
1860		unlock(&mheap_.speciallock)
1861		s.special.kind = _KindSpecialReachable
1862		if !addspecial(p, &s.special) {
1863			throw("already have a reachable special (duplicate pointer?)")
1864		}
1865		specials[i] = s
1866		// Make sure we don't retain ptrs.
1867		ptrs[i] = nil
1868	}
1869
1870	semrelease(&gcsema)
1871
1872	// Force a full GC and sweep.
1873	GC()
1874
1875	// Process specials.
1876	for i, s := range specials {
1877		if !s.done {
1878			printlock()
1879			println("runtime: object", i, "was not swept")
1880			throw("IsReachable failed")
1881		}
1882		if s.reachable {
1883			mask |= 1 << i
1884		}
1885		lock(&mheap_.speciallock)
1886		mheap_.specialReachableAlloc.free(unsafe.Pointer(s))
1887		unlock(&mheap_.speciallock)
1888	}
1889
1890	return mask
1891}
1892
1893// gcTestPointerClass returns the category of what p points to, one of:
1894// "heap", "stack", "data", "bss", "other". This is useful for checking
1895// that a test is doing what it's intended to do.
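//
// For example (illustrative only):
//
//	var global int
//	gcTestPointerClass(unsafe.Pointer(&global)) // "data" or "bss"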
1896//
1897// This is nosplit simply to avoid extra pointer shuffling that may
1898// complicate a test.
1899//
1900//go:nosplit
1901func gcTestPointerClass(p unsafe.Pointer) string {
1902	p2 := uintptr(noescape(p))
1903	gp := getg()
1904	if gp.stack.lo <= p2 && p2 < gp.stack.hi {
1905		return "stack"
1906	}
1907	if base, _, _ := findObject(p2, 0, 0); base != 0 {
1908		return "heap"
1909	}
1910	for _, datap := range activeModules() {
1911		if datap.data <= p2 && p2 < datap.edata || datap.noptrdata <= p2 && p2 < datap.enoptrdata {
1912			return "data"
1913		}
		if datap.bss <= p2 && p2 < datap.ebss || datap.noptrbss <= p2 && p2 < datap.enoptrbss {
1915			return "bss"
1916		}
1917	}
1918	KeepAlive(p)
1919	return "other"
1920}
1921