1// Copyright 2022 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package runtime
6
7import "internal/runtime/atomic"
8
9// gcCPULimiter is a mechanism to limit GC CPU utilization in situations
10// where it might become excessive and inhibit application progress (e.g.
11// a death spiral).
12//
13// The core of the limiter is a leaky bucket mechanism that fills with GC
14// CPU time and drains with mutator time. Because the bucket fills and
15// drains with time directly (i.e. without any weighting), this effectively
16// sets a very conservative limit of 50%. This limit could be enforced directly,
17// however, but the purpose of the bucket is to accommodate spikes in GC CPU
18// utilization without hurting throughput.
19//
20// Note that the bucket in the leaky bucket mechanism can never go negative,
21// so the GC never gets credit for a lot of CPU time spent without the GC
22// running. This is intentional, as an application that stays idle for, say,
23// an entire day, could build up enough credit to fail to prevent a death
24// spiral the following day. The bucket's capacity is the GC's only leeway.
25//
26// The capacity thus also sets the window the limiter considers. For example,
27// if the capacity of the bucket is 1 cpu-second, then the limiter will not
28// kick in until at least 1 full cpu-second in the last 2 cpu-second window
29// is spent on GC CPU time.
30var gcCPULimiter gcCPULimiterState
31
32type gcCPULimiterState struct {
33	lock atomic.Uint32
34
35	enabled atomic.Bool
36
37	// gcEnabled is an internal copy of gcBlackenEnabled that determines
38	// whether the limiter tracks total assist time.
39	//
40	// gcBlackenEnabled isn't used directly so as to keep this structure
41	// unit-testable.
42	gcEnabled bool
43
44	// transitioning is true when the GC is in a STW and transitioning between
45	// the mark and sweep phases.
46	transitioning bool
47
48	// test indicates whether this instance of the struct was made for testing purposes.
49	test bool
50
51	bucket struct {
52		// Invariants:
53		// - fill >= 0
54		// - capacity >= 0
55		// - fill <= capacity
56		fill, capacity uint64
57	}
58	// overflow is the cumulative amount of GC CPU time that we tried to fill the
59	// bucket with but exceeded its capacity.
60	overflow uint64
61
62	// assistTimePool is the accumulated assist time since the last update.
63	assistTimePool atomic.Int64
64
65	// idleMarkTimePool is the accumulated idle mark time since the last update.
66	idleMarkTimePool atomic.Int64
67
68	// idleTimePool is the accumulated time Ps spent on the idle list since the last update.
69	idleTimePool atomic.Int64
70
71	// lastUpdate is the nanotime timestamp of the last time update was called.
72	//
73	// Updated under lock, but may be read concurrently.
74	lastUpdate atomic.Int64
75
76	// lastEnabledCycle is the GC cycle that last had the limiter enabled.
77	lastEnabledCycle atomic.Uint32
78
79	// nprocs is an internal copy of gomaxprocs, used to determine total available
80	// CPU time.
81	//
82	// gomaxprocs isn't used directly so as to keep this structure unit-testable.
83	nprocs int32
84}
85
86// limiting returns true if the CPU limiter is currently enabled, meaning the Go GC
87// should take action to limit CPU utilization.
88//
89// It is safe to call concurrently with other operations.
90func (l *gcCPULimiterState) limiting() bool {
91	return l.enabled.Load()
92}
93
94// startGCTransition notifies the limiter of a GC transition.
95//
96// This call takes ownership of the limiter and disables all other means of
97// updating the limiter. Release ownership by calling finishGCTransition.
98//
99// It is safe to call concurrently with other operations.
100func (l *gcCPULimiterState) startGCTransition(enableGC bool, now int64) {
101	if !l.tryLock() {
102		// This must happen during a STW, so we can't fail to acquire the lock.
103		// If we did, something went wrong. Throw.
104		throw("failed to acquire lock to start a GC transition")
105	}
106	if l.gcEnabled == enableGC {
107		throw("transitioning GC to the same state as before?")
108	}
109	// Flush whatever was left between the last update and now.
110	l.updateLocked(now)
111	l.gcEnabled = enableGC
112	l.transitioning = true
113	// N.B. finishGCTransition releases the lock.
114	//
115	// We don't release here to increase the chance that if there's a failure
116	// to finish the transition, that we throw on failing to acquire the lock.
117}
118
119// finishGCTransition notifies the limiter that the GC transition is complete
120// and releases ownership of it. It also accumulates STW time in the bucket.
121// now must be the timestamp from the end of the STW pause.
122func (l *gcCPULimiterState) finishGCTransition(now int64) {
123	if !l.transitioning {
124		throw("finishGCTransition called without starting one?")
125	}
126	// Count the full nprocs set of CPU time because the world is stopped
127	// between startGCTransition and finishGCTransition. Even though the GC
128	// isn't running on all CPUs, it is preventing user code from doing so,
129	// so it might as well be.
130	if lastUpdate := l.lastUpdate.Load(); now >= lastUpdate {
131		l.accumulate(0, (now-lastUpdate)*int64(l.nprocs))
132	}
133	l.lastUpdate.Store(now)
134	l.transitioning = false
135	l.unlock()
136}
137
138// gcCPULimiterUpdatePeriod dictates the maximum amount of wall-clock time
139// we can go before updating the limiter.
140const gcCPULimiterUpdatePeriod = 10e6 // 10ms
141
142// needUpdate returns true if the limiter's maximum update period has been
143// exceeded, and so would benefit from an update.
144func (l *gcCPULimiterState) needUpdate(now int64) bool {
145	return now-l.lastUpdate.Load() > gcCPULimiterUpdatePeriod
146}
147
148// addAssistTime notifies the limiter of additional assist time. It will be
149// included in the next update.
150func (l *gcCPULimiterState) addAssistTime(t int64) {
151	l.assistTimePool.Add(t)
152}
153
154// addIdleTime notifies the limiter of additional time a P spent on the idle list. It will be
155// subtracted from the total CPU time in the next update.
156func (l *gcCPULimiterState) addIdleTime(t int64) {
157	l.idleTimePool.Add(t)
158}
159
160// update updates the bucket given runtime-specific information. now is the
161// current monotonic time in nanoseconds.
162//
163// This is safe to call concurrently with other operations, except *GCTransition.
164func (l *gcCPULimiterState) update(now int64) {
165	if !l.tryLock() {
166		// We failed to acquire the lock, which means something else is currently
167		// updating. Just drop our update, the next one to update will include
168		// our total assist time.
169		return
170	}
171	if l.transitioning {
172		throw("update during transition")
173	}
174	l.updateLocked(now)
175	l.unlock()
176}
177
178// updateLocked is the implementation of update. l.lock must be held.
179func (l *gcCPULimiterState) updateLocked(now int64) {
180	lastUpdate := l.lastUpdate.Load()
181	if now < lastUpdate {
182		// Defensively avoid overflow. This isn't even the latest update anyway.
183		return
184	}
185	windowTotalTime := (now - lastUpdate) * int64(l.nprocs)
186	l.lastUpdate.Store(now)
187
188	// Drain the pool of assist time.
189	assistTime := l.assistTimePool.Load()
190	if assistTime != 0 {
191		l.assistTimePool.Add(-assistTime)
192	}
193
194	// Drain the pool of idle time.
195	idleTime := l.idleTimePool.Load()
196	if idleTime != 0 {
197		l.idleTimePool.Add(-idleTime)
198	}
199
200	if !l.test {
201		// Consume time from in-flight events. Make sure we're not preemptible so allp can't change.
202		//
203		// The reason we do this instead of just waiting for those events to finish and push updates
204		// is to ensure that all the time we're accounting for happened sometime between lastUpdate
205		// and now. This dramatically simplifies reasoning about the limiter because we're not at
206		// risk of extra time being accounted for in this window than actually happened in this window,
207		// leading to all sorts of weird transient behavior.
208		mp := acquirem()
209		for _, pp := range allp {
210			typ, duration := pp.limiterEvent.consume(now)
211			switch typ {
212			case limiterEventIdleMarkWork:
213				fallthrough
214			case limiterEventIdle:
215				idleTime += duration
216				sched.idleTime.Add(duration)
217			case limiterEventMarkAssist:
218				fallthrough
219			case limiterEventScavengeAssist:
220				assistTime += duration
221			case limiterEventNone:
222				break
223			default:
224				throw("invalid limiter event type found")
225			}
226		}
227		releasem(mp)
228	}
229
230	// Compute total GC time.
231	windowGCTime := assistTime
232	if l.gcEnabled {
233		windowGCTime += int64(float64(windowTotalTime) * gcBackgroundUtilization)
234	}
235
236	// Subtract out all idle time from the total time. Do this after computing
237	// GC time, because the background utilization is dependent on the *real*
238	// total time, not the total time after idle time is subtracted.
239	//
240	// Idle time is counted as any time that a P is on the P idle list plus idle mark
241	// time. Idle mark workers soak up time that the application spends idle.
242	//
243	// On a heavily undersubscribed system, any additional idle time can skew GC CPU
244	// utilization, because the GC might be executing continuously and thrashing,
245	// yet the CPU utilization with respect to GOMAXPROCS will be quite low, so
246	// the limiter fails to turn on. By subtracting idle time, we're removing time that
247	// we know the application was idle giving a more accurate picture of whether
248	// the GC is thrashing.
249	//
250	// Note that this can cause the limiter to turn on even if it's not needed. For
251	// instance, on a system with 32 Ps but only 1 running goroutine, each GC will have
252	// 8 dedicated GC workers. Assuming the GC cycle is half mark phase and half sweep
253	// phase, then the GC CPU utilization over that cycle, with idle time removed, will
254	// be 8/(8+2) = 80%. Even though the limiter turns on, though, assist should be
255	// unnecessary, as the GC has way more CPU time to outpace the 1 goroutine that's
256	// running.
257	windowTotalTime -= idleTime
258
259	l.accumulate(windowTotalTime-windowGCTime, windowGCTime)
260}
261
262// accumulate adds time to the bucket and signals whether the limiter is enabled.
263//
264// This is an internal function that deals just with the bucket. Prefer update.
265// l.lock must be held.
266func (l *gcCPULimiterState) accumulate(mutatorTime, gcTime int64) {
267	headroom := l.bucket.capacity - l.bucket.fill
268	enabled := headroom == 0
269
270	// Let's be careful about three things here:
271	// 1. The addition and subtraction, for the invariants.
272	// 2. Overflow.
273	// 3. Excessive mutation of l.enabled, which is accessed
274	//    by all assists, potentially more than once.
275	change := gcTime - mutatorTime
276
277	// Handle limiting case.
278	if change > 0 && headroom <= uint64(change) {
279		l.overflow += uint64(change) - headroom
280		l.bucket.fill = l.bucket.capacity
281		if !enabled {
282			l.enabled.Store(true)
283			l.lastEnabledCycle.Store(memstats.numgc + 1)
284		}
285		return
286	}
287
288	// Handle non-limiting cases.
289	if change < 0 && l.bucket.fill <= uint64(-change) {
290		// Bucket emptied.
291		l.bucket.fill = 0
292	} else {
293		// All other cases.
294		l.bucket.fill -= uint64(-change)
295	}
296	if change != 0 && enabled {
297		l.enabled.Store(false)
298	}
299}
300
301// tryLock attempts to lock l. Returns true on success.
302func (l *gcCPULimiterState) tryLock() bool {
303	return l.lock.CompareAndSwap(0, 1)
304}
305
306// unlock releases the lock on l. Must be called if tryLock returns true.
307func (l *gcCPULimiterState) unlock() {
308	old := l.lock.Swap(0)
309	if old != 1 {
310		throw("double unlock")
311	}
312}
313
314// capacityPerProc is the limiter's bucket capacity for each P in GOMAXPROCS.
315const capacityPerProc = 1e9 // 1 second in nanoseconds
316
317// resetCapacity updates the capacity based on GOMAXPROCS. Must not be called
318// while the GC is enabled.
319//
320// It is safe to call concurrently with other operations.
321func (l *gcCPULimiterState) resetCapacity(now int64, nprocs int32) {
322	if !l.tryLock() {
323		// This must happen during a STW, so we can't fail to acquire the lock.
324		// If we did, something went wrong. Throw.
325		throw("failed to acquire lock to reset capacity")
326	}
327	// Flush the rest of the time for this period.
328	l.updateLocked(now)
329	l.nprocs = nprocs
330
331	l.bucket.capacity = uint64(nprocs) * capacityPerProc
332	if l.bucket.fill > l.bucket.capacity {
333		l.bucket.fill = l.bucket.capacity
334		l.enabled.Store(true)
335		l.lastEnabledCycle.Store(memstats.numgc + 1)
336	} else if l.bucket.fill < l.bucket.capacity {
337		l.enabled.Store(false)
338	}
339	l.unlock()
340}
341
342// limiterEventType indicates the type of an event occurring on some P.
343//
344// These events represent the full set of events that the GC CPU limiter tracks
345// to execute its function.
346//
347// This type may use no more than limiterEventBits bits of information.
348type limiterEventType uint8
349
350const (
351	limiterEventNone           limiterEventType = iota // None of the following events.
352	limiterEventIdleMarkWork                           // Refers to an idle mark worker (see gcMarkWorkerMode).
353	limiterEventMarkAssist                             // Refers to mark assist (see gcAssistAlloc).
354	limiterEventScavengeAssist                         // Refers to a scavenge assist (see allocSpan).
355	limiterEventIdle                                   // Refers to time a P spent on the idle list.
356
357	limiterEventBits = 3
358)
359
360// limiterEventTypeMask is a mask for the bits in p.limiterEventStart that represent
361// the event type. The rest of the bits of that field represent a timestamp.
362const (
363	limiterEventTypeMask  = uint64((1<<limiterEventBits)-1) << (64 - limiterEventBits)
364	limiterEventStampNone = limiterEventStamp(0)
365)
366
367// limiterEventStamp is a nanotime timestamp packed with a limiterEventType.
368type limiterEventStamp uint64
369
370// makeLimiterEventStamp creates a new stamp from the event type and the current timestamp.
371func makeLimiterEventStamp(typ limiterEventType, now int64) limiterEventStamp {
372	return limiterEventStamp(uint64(typ)<<(64-limiterEventBits) | (uint64(now) &^ limiterEventTypeMask))
373}
374
375// duration computes the difference between now and the start time stored in the stamp.
376//
377// Returns 0 if the difference is negative, which may happen if now is stale or if the
378// before and after timestamps cross a 2^(64-limiterEventBits) boundary.
379func (s limiterEventStamp) duration(now int64) int64 {
380	// The top limiterEventBits bits of the timestamp are derived from the current time
381	// when computing a duration.
382	start := int64((uint64(now) & limiterEventTypeMask) | (uint64(s) &^ limiterEventTypeMask))
383	if now < start {
384		return 0
385	}
386	return now - start
387}
388
389// type extracts the event type from the stamp.
390func (s limiterEventStamp) typ() limiterEventType {
391	return limiterEventType(s >> (64 - limiterEventBits))
392}
393
394// limiterEvent represents tracking state for an event tracked by the GC CPU limiter.
395type limiterEvent struct {
396	stamp atomic.Uint64 // Stores a limiterEventStamp.
397}
398
399// start begins tracking a new limiter event of the current type. If an event
400// is already in flight, then a new event cannot begin because the current time is
401// already being attributed to that event. In this case, this function returns false.
402// Otherwise, it returns true.
403//
404// The caller must be non-preemptible until at least stop is called or this function
405// returns false. Because this is trying to measure "on-CPU" time of some event, getting
406// scheduled away during it can mean that whatever we're measuring isn't a reflection
407// of "on-CPU" time. The OS could deschedule us at any time, but we want to maintain as
408// close of an approximation as we can.
409func (e *limiterEvent) start(typ limiterEventType, now int64) bool {
410	if limiterEventStamp(e.stamp.Load()).typ() != limiterEventNone {
411		return false
412	}
413	e.stamp.Store(uint64(makeLimiterEventStamp(typ, now)))
414	return true
415}
416
417// consume acquires the partial event CPU time from any in-flight event.
418// It achieves this by storing the current time as the new event time.
419//
420// Returns the type of the in-flight event, as well as how long it's currently been
421// executing for. Returns limiterEventNone if no event is active.
422func (e *limiterEvent) consume(now int64) (typ limiterEventType, duration int64) {
423	// Read the limiter event timestamp and update it to now.
424	for {
425		old := limiterEventStamp(e.stamp.Load())
426		typ = old.typ()
427		if typ == limiterEventNone {
428			// There's no in-flight event, so just push that up.
429			return
430		}
431		duration = old.duration(now)
432		if duration == 0 {
433			// We might have a stale now value, or this crossed the
434			// 2^(64-limiterEventBits) boundary in the clock readings.
435			// Just ignore it.
436			return limiterEventNone, 0
437		}
438		new := makeLimiterEventStamp(typ, now)
439		if e.stamp.CompareAndSwap(uint64(old), uint64(new)) {
440			break
441		}
442	}
443	return
444}
445
446// stop stops the active limiter event. Throws if the
447//
448// The caller must be non-preemptible across the event. See start as to why.
449func (e *limiterEvent) stop(typ limiterEventType, now int64) {
450	var stamp limiterEventStamp
451	for {
452		stamp = limiterEventStamp(e.stamp.Load())
453		if stamp.typ() != typ {
454			print("runtime: want=", typ, " got=", stamp.typ(), "\n")
455			throw("limiterEvent.stop: found wrong event in p's limiter event slot")
456		}
457		if e.stamp.CompareAndSwap(uint64(stamp), uint64(limiterEventStampNone)) {
458			break
459		}
460	}
461	duration := stamp.duration(now)
462	if duration == 0 {
463		// It's possible that we're missing time because we crossed a
464		// 2^(64-limiterEventBits) boundary between the start and end.
465		// In this case, we're dropping that information. This is OK because
466		// at worst it'll cause a transient hiccup that will quickly resolve
467		// itself as all new timestamps begin on the other side of the boundary.
468		// Such a hiccup should be incredibly rare.
469		return
470	}
471	// Account for the event.
472	switch typ {
473	case limiterEventIdleMarkWork:
474		gcCPULimiter.addIdleTime(duration)
475	case limiterEventIdle:
476		gcCPULimiter.addIdleTime(duration)
477		sched.idleTime.Add(duration)
478	case limiterEventMarkAssist:
479		fallthrough
480	case limiterEventScavengeAssist:
481		gcCPULimiter.addAssistTime(duration)
482	default:
483		throw("limiterEvent.stop: invalid limiter event type found")
484	}
485}
486