1// Copyright 2014 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Memory allocator.
6//
7// This was originally based on tcmalloc, but has diverged quite a bit.
8// http://goog-perftools.sourceforge.net/doc/tcmalloc.html
9
10// The main allocator works in runs of pages.
11// Small allocation sizes (up to and including 32 kB) are
12// rounded to one of about 70 size classes, each of which
13// has its own free set of objects of exactly that size.
14// Any free page of memory can be split into a set of objects
15// of one size class, which are then managed using a free bitmap.
16//
17// The allocator's data structures are:
18//
19//	fixalloc: a free-list allocator for fixed-size off-heap objects,
20//		used to manage storage used by the allocator.
21//	mheap: the malloc heap, managed at page (8192-byte) granularity.
22//	mspan: a run of in-use pages managed by the mheap.
23//	mcentral: collects all spans of a given size class.
24//	mcache: a per-P cache of mspans with free space.
25//	mstats: allocation statistics.
26//
27// Allocating a small object proceeds up a hierarchy of caches:
28//
29//	1. Round the size up to one of the small size classes
30//	   and look in the corresponding mspan in this P's mcache.
31//	   Scan the mspan's free bitmap to find a free slot.
32//	   If there is a free slot, allocate it.
33//	   This can all be done without acquiring a lock.
34//
35//	2. If the mspan has no free slots, obtain a new mspan
36//	   from the mcentral's list of mspans of the required size
37//	   class that have free space.
38//	   Obtaining a whole span amortizes the cost of locking
39//	   the mcentral.
40//
41//	3. If the mcentral's mspan list is empty, obtain a run
42//	   of pages from the mheap to use for the mspan.
43//
44//	4. If the mheap is empty or has no page runs large enough,
45//	   allocate a new group of pages (at least 1MB) from the
46//	   operating system. Allocating a large run of pages
47//	   amortizes the cost of talking to the operating system.
48//
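// A hedged sketch of steps 1-3 above (sizeClassFor is a hypothetical stand-in
// for the size-class lookup; mcache.alloc, nextFreeFast, and mcache.nextFree
// appear later in this file):
//
//	func allocSmallSketch(c *mcache, size uintptr) gclinkptr {
//		spc := sizeClassFor(size) // 1. round up to a size class
//		s := c.alloc[spc]
//		v := nextFreeFast(s) // lock-free scan of the span's free bitmap
//		if v == 0 {
//			v, s, _ = c.nextFree(spc) // 2./3. refill from mcentral or mheap
//		}
//		return v
//	}
//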
49// Sweeping an mspan and freeing objects on it proceeds up a similar
50// hierarchy:
51//
52//	1. If the mspan is being swept in response to allocation, it
53//	   is returned to the mcache to satisfy the allocation.
54//
55//	2. Otherwise, if the mspan still has allocated objects in it,
56//	   it is placed on the mcentral free list for the mspan's size
57//	   class.
58//
59//	3. Otherwise, if all objects in the mspan are free, the mspan's
60//	   pages are returned to the mheap and the mspan is now dead.
61//
62// Allocating and freeing a large object uses the mheap
63// directly, bypassing the mcache and mcentral.
64//
65// If mspan.needzero is false, then free object slots in the mspan are
66// already zeroed. Otherwise if needzero is true, objects are zeroed as
67// they are allocated. There are various benefits to delaying zeroing
68// this way:
69//
70//	1. Stack frame allocation can avoid zeroing altogether.
71//
72//	2. It exhibits better temporal locality, since the program is
73//	   probably about to write to the memory.
74//
75//	3. We don't zero pages that never get reused.
76
77// Virtual memory layout
78//
79// The heap consists of a set of arenas, which are 64MB on 64-bit and
80// 4MB on 32-bit (heapArenaBytes). Each arena's start address is also
81// aligned to the arena size.
82//
83// Each arena has an associated heapArena object that stores the
84// metadata for that arena: the heap bitmap for all words in the arena
85// and the span map for all pages in the arena. heapArena objects are
86// themselves allocated off-heap.
87//
88// Since arenas are aligned, the address space can be viewed as a
89// series of arena frames. The arena map (mheap_.arenas) maps from
90// arena frame number to *heapArena, or nil for parts of the address
91// space not backed by the Go heap. The arena map is structured as a
// two-level array consisting of an "L1" arena map and many "L2" arena
93// maps; however, since arenas are large, on many architectures, the
94// arena map consists of a single, large L2 map.
95//
96// The arena map covers the entire possible address space, allowing
97// the Go heap to use any part of the address space. The allocator
98// attempts to keep arenas contiguous so that large spans (and hence
99// large objects) can cross arenas.
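//
// For example (a hedged sketch, assuming the single-L2-map layout used on
// most 64-bit platforms), looking up the metadata for an address looks
// roughly like:
//
//	ai := arenaIdx((addr + arenaBaseOffset) / heapArenaBytes)
//	ha := mheap_.arenas[ai.l1()][ai.l2()] // entries are nil for address space not backed by the Go heap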
100
101package runtime
102
103import (
104	"internal/goarch"
105	"internal/goos"
106	"internal/runtime/atomic"
107	"runtime/internal/math"
108	"runtime/internal/sys"
109	"unsafe"
110)
111
112const (
113	maxTinySize   = _TinySize
114	tinySizeClass = _TinySizeClass
115	maxSmallSize  = _MaxSmallSize
116
117	pageShift = _PageShift
118	pageSize  = _PageSize
119
120	_PageSize = 1 << _PageShift
121	_PageMask = _PageSize - 1
122
123	// _64bit = 1 on 64-bit systems, 0 on 32-bit systems
124	_64bit = 1 << (^uintptr(0) >> 63) / 2
125
126	// Tiny allocator parameters, see "Tiny allocator" comment in malloc.go.
127	_TinySize      = 16
128	_TinySizeClass = int8(2)
129
130	_FixAllocChunk = 16 << 10 // Chunk size for FixAlloc
131
132	// Per-P, per order stack segment cache size.
133	_StackCacheSize = 32 * 1024
134
135	// Number of orders that get caching. Order 0 is FixedStack
136	// and each successive order is twice as large.
137	// We want to cache 2KB, 4KB, 8KB, and 16KB stacks. Larger stacks
138	// will be allocated directly.
139	// Since FixedStack is different on different systems, we
140	// must vary NumStackOrders to keep the same maximum cached size.
141	//   OS               | FixedStack | NumStackOrders
142	//   -----------------+------------+---------------
143	//   linux/darwin/bsd | 2KB        | 4
144	//   windows/32       | 4KB        | 3
145	//   windows/64       | 8KB        | 2
146	//   plan9            | 4KB        | 3
147	_NumStackOrders = 4 - goarch.PtrSize/4*goos.IsWindows - 1*goos.IsPlan9
148
149	// heapAddrBits is the number of bits in a heap address. On
150	// amd64, addresses are sign-extended beyond heapAddrBits. On
151	// other arches, they are zero-extended.
152	//
153	// On most 64-bit platforms, we limit this to 48 bits based on a
154	// combination of hardware and OS limitations.
155	//
156	// amd64 hardware limits addresses to 48 bits, sign-extended
157	// to 64 bits. Addresses where the top 16 bits are not either
158	// all 0 or all 1 are "non-canonical" and invalid. Because of
159	// these "negative" addresses, we offset addresses by 1<<47
160	// (arenaBaseOffset) on amd64 before computing indexes into
161	// the heap arenas index. In 2017, amd64 hardware added
162	// support for 57 bit addresses; however, currently only Linux
163	// supports this extension and the kernel will never choose an
164	// address above 1<<47 unless mmap is called with a hint
165	// address above 1<<47 (which we never do).
166	//
167	// arm64 hardware (as of ARMv8) limits user addresses to 48
168	// bits, in the range [0, 1<<48).
169	//
170	// ppc64, mips64, and s390x support arbitrary 64 bit addresses
171	// in hardware. On Linux, Go leans on stricter OS limits. Based
172	// on Linux's processor.h, the user address space is limited as
173	// follows on 64-bit architectures:
174	//
175	// Architecture  Name              Maximum Value (exclusive)
176	// ---------------------------------------------------------------------
177	// amd64         TASK_SIZE_MAX     0x007ffffffff000 (47 bit addresses)
178	// arm64         TASK_SIZE_64      0x01000000000000 (48 bit addresses)
179	// ppc64{,le}    TASK_SIZE_USER64  0x00400000000000 (46 bit addresses)
180	// mips64{,le}   TASK_SIZE64       0x00010000000000 (40 bit addresses)
181	// s390x         TASK_SIZE         1<<64 (64 bit addresses)
182	//
183	// These limits may increase over time, but are currently at
184	// most 48 bits except on s390x. On all architectures, Linux
185	// starts placing mmap'd regions at addresses that are
186	// significantly below 48 bits, so even if it's possible to
187	// exceed Go's 48 bit limit, it's extremely unlikely in
188	// practice.
189	//
190	// On 32-bit platforms, we accept the full 32-bit address
191	// space because doing so is cheap.
192	// mips32 only has access to the low 2GB of virtual memory, so
193	// we further limit it to 31 bits.
194	//
195	// On ios/arm64, although 64-bit pointers are presumably
196	// available, pointers are truncated to 33 bits in iOS <14.
197	// Furthermore, only the top 4 GiB of the address space are
198	// actually available to the application. In iOS >=14, more
199	// of the address space is available, and the OS can now
200	// provide addresses outside of those 33 bits. Pick 40 bits
201	// as a reasonable balance between address space usage by the
202	// page allocator, and flexibility for what mmap'd regions
203	// we'll accept for the heap. We can't just move to the full
204	// 48 bits because this uses too much address space for older
205	// iOS versions.
206	// TODO(mknyszek): Once iOS <14 is deprecated, promote ios/arm64
207	// to a 48-bit address space like every other arm64 platform.
208	//
209	// WebAssembly currently has a limit of 4GB linear memory.
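//
// As a worked example, the expression below evaluates to:
//
//	linux/amd64, linux/arm64 (and most other 64-bit targets)  48
//	ios/arm64                                                 40
//	js/wasm, wasi/wasm                                         32
//	32-bit targets other than mips(le)                         32
//	linux/mips, linux/mipsle                                   31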
210	heapAddrBits = (_64bit*(1-goarch.IsWasm)*(1-goos.IsIos*goarch.IsArm64))*48 + (1-_64bit+goarch.IsWasm)*(32-(goarch.IsMips+goarch.IsMipsle)) + 40*goos.IsIos*goarch.IsArm64
211
212	// maxAlloc is the maximum size of an allocation. On 64-bit,
213	// it's theoretically possible to allocate 1<<heapAddrBits bytes. On
214	// 32-bit, however, this is one less than 1<<32 because the
215	// number of bytes in the address space doesn't actually fit
216	// in a uintptr.
217	maxAlloc = (1 << heapAddrBits) - (1-_64bit)*1
218
219	// The number of bits in a heap address, the size of heap
220	// arenas, and the L1 and L2 arena map sizes are related by
221	//
222	//   (1 << addr bits) = arena size * L1 entries * L2 entries
223	//
224	// Currently, we balance these as follows:
225	//
226	//       Platform  Addr bits  Arena size  L1 entries   L2 entries
227	// --------------  ---------  ----------  ----------  -----------
228	//       */64-bit         48        64MB           1    4M (32MB)
229	// windows/64-bit         48         4MB          64    1M  (8MB)
230	//      ios/arm64         33         4MB           1  2048  (8KB)
231	//       */32-bit         32         4MB           1  1024  (4KB)
232	//     */mips(le)         31         4MB           1   512  (2KB)
233
234	// heapArenaBytes is the size of a heap arena. The heap
235	// consists of mappings of size heapArenaBytes, aligned to
236	// heapArenaBytes. The initial heap mapping is one arena.
237	//
238	// This is currently 64MB on 64-bit non-Windows and 4MB on
239	// 32-bit and on Windows. We use smaller arenas on Windows
240	// because all committed memory is charged to the process,
241	// even if it's not touched. Hence, for processes with small
242	// heaps, the mapped arena space needs to be commensurate.
243	// This is particularly important with the race detector,
244	// since it significantly amplifies the cost of committed
245	// memory.
246	heapArenaBytes = 1 << logHeapArenaBytes
247
248	heapArenaWords = heapArenaBytes / goarch.PtrSize
249
250	// logHeapArenaBytes is log_2 of heapArenaBytes. For clarity,
251	// prefer using heapArenaBytes where possible (we need the
252	// constant to compute some other constants).
253	logHeapArenaBytes = (6+20)*(_64bit*(1-goos.IsWindows)*(1-goarch.IsWasm)*(1-goos.IsIos*goarch.IsArm64)) + (2+20)*(_64bit*goos.IsWindows) + (2+20)*(1-_64bit) + (2+20)*goarch.IsWasm + (2+20)*goos.IsIos*goarch.IsArm64
254
255	// heapArenaBitmapWords is the size of each heap arena's bitmap in uintptrs.
256	heapArenaBitmapWords = heapArenaWords / (8 * goarch.PtrSize)
257
258	pagesPerArena = heapArenaBytes / pageSize
259
260	// arenaL1Bits is the number of bits of the arena number
261	// covered by the first level arena map.
262	//
263	// This number should be small, since the first level arena
264	// map requires PtrSize*(1<<arenaL1Bits) of space in the
265	// binary's BSS. It can be zero, in which case the first level
266	// index is effectively unused. There is a performance benefit
267	// to this, since the generated code can be more efficient,
// but it comes at the cost of having a large L2 mapping.
269	//
270	// We use the L1 map on 64-bit Windows because the arena size
271	// is small, but the address space is still 48 bits, and
272	// there's a high cost to having a large L2.
273	arenaL1Bits = 6 * (_64bit * goos.IsWindows)
274
275	// arenaL2Bits is the number of bits of the arena number
276	// covered by the second level arena index.
277	//
278	// The size of each arena map allocation is proportional to
279	// 1<<arenaL2Bits, so it's important that this not be too
280	// large. 48 bits leads to 32MB arena index allocations, which
281	// is about the practical threshold.
282	arenaL2Bits = heapAddrBits - logHeapArenaBytes - arenaL1Bits
283
284	// arenaL1Shift is the number of bits to shift an arena frame
285	// number by to compute an index into the first level arena map.
286	arenaL1Shift = arenaL2Bits
287
288	// arenaBits is the total bits in a combined arena map index.
289	// This is split between the index into the L1 arena map and
290	// the L2 arena map.
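//
// For example, working from the constants above: on linux/amd64,
// arenaL1Bits = 0 and arenaL2Bits = 48 - 26 = 22, so the single L2 map
// has 4M entries (32MB of pointers); on windows/amd64, arenaL1Bits = 6
// and arenaL2Bits = 48 - 22 - 6 = 20, so there are 64 L1 entries and
// each L2 map has 1M entries (8MB), matching the table above.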
291	arenaBits = arenaL1Bits + arenaL2Bits
292
293	// arenaBaseOffset is the pointer value that corresponds to
294	// index 0 in the heap arena map.
295	//
296	// On amd64, the address space is 48 bits, sign extended to 64
297	// bits. This offset lets us handle "negative" addresses (or
298	// high addresses if viewed as unsigned).
299	//
// On aix/ppc64, this offset allows us to keep heapAddrBits at 48.
// Otherwise it would have to be 60 in order to handle mmap addresses
// (in the range 0x0a00000000000000 - 0x0afffffffffffff), but in that
// case the memory reserved in (s *pageAlloc).init for chunks causes
// significant slowdowns.
305	//
306	// On other platforms, the user address space is contiguous
307	// and starts at 0, so no offset is necessary.
308	arenaBaseOffset = 0xffff800000000000*goarch.IsAmd64 + 0x0a00000000000000*goos.IsAix
309	// A typed version of this constant that will make it into DWARF (for viewcore).
310	arenaBaseOffsetUintptr = uintptr(arenaBaseOffset)
311
312	// Max number of threads to run garbage collection.
313	// 2, 3, and 4 are all plausible maximums depending
314	// on the hardware details of the machine. The garbage
315	// collector scales well to 32 cpus.
316	_MaxGcproc = 32
317
318	// minLegalPointer is the smallest possible legal pointer.
319	// This is the smallest possible architectural page size,
320	// since we assume that the first page is never mapped.
321	//
322	// This should agree with minZeroPage in the compiler.
323	minLegalPointer uintptr = 4096
324
325	// minHeapForMetadataHugePages sets a threshold on when certain kinds of
326	// heap metadata, currently the arenas map L2 entries and page alloc bitmap
327	// mappings, are allowed to be backed by huge pages. If the heap goal ever
328	// exceeds this threshold, then huge pages are enabled.
329	//
330	// These numbers are chosen with the assumption that huge pages are on the
331	// order of a few MiB in size.
332	//
// The kind of metadata this applies to has a very low overhead when compared
// to the address space used, but its constant overhead for small heaps would
// be very high if it were backed by huge pages (e.g. a few MiB makes
// a huge difference for an 8 MiB heap, but barely any difference for a 1 GiB
// heap). The benefit of huge pages is also not worth it for small heaps,
// because only a very, very small part of the metadata is used for small heaps.
339	//
// N.B. If the heap goal exceeds the threshold and then shrinks to a very small size
// again, huge pages will still be enabled for this mapping. The reason is that
// there's no point in disabling them unless we're also returning the physical memory
// for these metadata mappings back to the OS. That would be quite complex to do
// in general as the heap is likely fragmented after a reduction in heap size.
345	minHeapForMetadataHugePages = 1 << 30
346)
347
348// physPageSize is the size in bytes of the OS's physical pages.
349// Mapping and unmapping operations must be done at multiples of
350// physPageSize.
351//
352// This must be set by the OS init code (typically in osinit) before
353// mallocinit.
354var physPageSize uintptr
355
356// physHugePageSize is the size in bytes of the OS's default physical huge
// page, whose allocation is opaque to the application. It is assumed
358// and verified to be a power of two.
359//
360// If set, this must be set by the OS init code (typically in osinit) before
361// mallocinit. However, setting it at all is optional, and leaving the default
362// value is always safe (though potentially less efficient).
363//
364// Since physHugePageSize is always assumed to be a power of two,
365// physHugePageShift is defined as physHugePageSize == 1 << physHugePageShift.
366// The purpose of physHugePageShift is to avoid doing divisions in
367// performance critical functions.
368var (
369	physHugePageSize  uintptr
370	physHugePageShift uint
371)
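
// For example, a division by physHugePageSize can be replaced by a shift
// (an illustrative sketch, not a helper defined in this file):
//
//	// number of whole huge pages contained in [start, end)
//	n := (end - start) >> physHugePageShift // == (end - start) / physHugePageSize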
372
373func mallocinit() {
374	if class_to_size[_TinySizeClass] != _TinySize {
375		throw("bad TinySizeClass")
376	}
377
378	if heapArenaBitmapWords&(heapArenaBitmapWords-1) != 0 {
379		// heapBits expects modular arithmetic on bitmap
380		// addresses to work.
381		throw("heapArenaBitmapWords not a power of 2")
382	}
383
384	// Check physPageSize.
385	if physPageSize == 0 {
386		// The OS init code failed to fetch the physical page size.
387		throw("failed to get system page size")
388	}
389	if physPageSize > maxPhysPageSize {
390		print("system page size (", physPageSize, ") is larger than maximum page size (", maxPhysPageSize, ")\n")
391		throw("bad system page size")
392	}
393	if physPageSize < minPhysPageSize {
394		print("system page size (", physPageSize, ") is smaller than minimum page size (", minPhysPageSize, ")\n")
395		throw("bad system page size")
396	}
397	if physPageSize&(physPageSize-1) != 0 {
398		print("system page size (", physPageSize, ") must be a power of 2\n")
399		throw("bad system page size")
400	}
401	if physHugePageSize&(physHugePageSize-1) != 0 {
402		print("system huge page size (", physHugePageSize, ") must be a power of 2\n")
403		throw("bad system huge page size")
404	}
405	if physHugePageSize > maxPhysHugePageSize {
406		// physHugePageSize is greater than the maximum supported huge page size.
		// Don't throw here, as in the other cases, since a system configured
		// this way isn't wrong; we just don't have the code to support it.
409		// Instead, silently set the huge page size to zero.
410		physHugePageSize = 0
411	}
412	if physHugePageSize != 0 {
413		// Since physHugePageSize is a power of 2, it suffices to increase
414		// physHugePageShift until 1<<physHugePageShift == physHugePageSize.
415		for 1<<physHugePageShift != physHugePageSize {
416			physHugePageShift++
417		}
418	}
419	if pagesPerArena%pagesPerSpanRoot != 0 {
420		print("pagesPerArena (", pagesPerArena, ") is not divisible by pagesPerSpanRoot (", pagesPerSpanRoot, ")\n")
421		throw("bad pagesPerSpanRoot")
422	}
423	if pagesPerArena%pagesPerReclaimerChunk != 0 {
424		print("pagesPerArena (", pagesPerArena, ") is not divisible by pagesPerReclaimerChunk (", pagesPerReclaimerChunk, ")\n")
425		throw("bad pagesPerReclaimerChunk")
426	}
427	// Check that the minimum size (exclusive) for a malloc header is also
	// a size class boundary. This is important for making sure checks align
429	// across different parts of the runtime.
430	minSizeForMallocHeaderIsSizeClass := false
431	for i := 0; i < len(class_to_size); i++ {
432		if minSizeForMallocHeader == uintptr(class_to_size[i]) {
433			minSizeForMallocHeaderIsSizeClass = true
434			break
435		}
436	}
437	if !minSizeForMallocHeaderIsSizeClass {
438		throw("min size of malloc header is not a size class boundary")
439	}
440	// Check that the pointer bitmap for all small sizes without a malloc header
441	// fits in a word.
442	if minSizeForMallocHeader/goarch.PtrSize > 8*goarch.PtrSize {
443		throw("max pointer/scan bitmap size for headerless objects is too large")
444	}
445
446	if minTagBits > taggedPointerBits {
447		throw("taggedPointerbits too small")
448	}
449
450	// Initialize the heap.
451	mheap_.init()
452	mcache0 = allocmcache()
453	lockInit(&gcBitsArenas.lock, lockRankGcBitsArenas)
454	lockInit(&profInsertLock, lockRankProfInsert)
455	lockInit(&profBlockLock, lockRankProfBlock)
456	lockInit(&profMemActiveLock, lockRankProfMemActive)
457	for i := range profMemFutureLock {
458		lockInit(&profMemFutureLock[i], lockRankProfMemFuture)
459	}
460	lockInit(&globalAlloc.mutex, lockRankGlobalAlloc)
461
462	// Create initial arena growth hints.
463	if goarch.PtrSize == 8 {
464		// On a 64-bit machine, we pick the following hints
465		// because:
466		//
467		// 1. Starting from the middle of the address space
468		// makes it easier to grow out a contiguous range
469		// without running in to some other mapping.
470		//
471		// 2. This makes Go heap addresses more easily
472		// recognizable when debugging.
473		//
474		// 3. Stack scanning in gccgo is still conservative,
475		// so it's important that addresses be distinguishable
476		// from other data.
477		//
478		// Starting at 0x00c0 means that the valid memory addresses
479		// will begin 0x00c0, 0x00c1, ...
480		// In little-endian, that's c0 00, c1 00, ... None of those are valid
481		// UTF-8 sequences, and they are otherwise as far away from
482		// ff (likely a common byte) as possible. If that fails, we try other 0xXXc0
483		// addresses. An earlier attempt to use 0x11f8 caused out of memory errors
484		// on OS X during thread allocations.  0x00c0 causes conflicts with
485		// AddressSanitizer which reserves all memory up to 0x0100.
486		// These choices reduce the odds of a conservative garbage collector
487		// not collecting memory because some non-pointer block of memory
488		// had a bit pattern that matched a memory address.
489		//
490		// However, on arm64, we ignore all this advice above and slam the
491		// allocation at 0x40 << 32 because when using 4k pages with 3-level
		// translation buffers, the user address space is limited to 39 bits.
		// On ios/arm64, the address space is even smaller.
		//
		// On AIX, mmap starts at 0x0A00000000000000 for 64-bit
		// processes.
497		//
498		// Space mapped for user arenas comes immediately after the range
499		// originally reserved for the regular heap when race mode is not
500		// enabled because user arena chunks can never be used for regular heap
501		// allocations and we want to avoid fragmenting the address space.
502		//
503		// In race mode we have no choice but to just use the same hints because
504		// the race detector requires that the heap be mapped contiguously.
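		//
		// For example, in the default case below the hints work out to
		// 0x7fc000000000 (i = 0x7f), 0x7ec000000000, ..., 0x00c000000000
		// (i = 0), i.e. they descend in 1<<40 byte steps.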
505		for i := 0x7f; i >= 0; i-- {
506			var p uintptr
507			switch {
508			case raceenabled:
509				// The TSAN runtime requires the heap
510				// to be in the range [0x00c000000000,
511				// 0x00e000000000).
512				p = uintptr(i)<<32 | uintptrMask&(0x00c0<<32)
513				if p >= uintptrMask&0x00e000000000 {
514					continue
515				}
516			case GOARCH == "arm64" && GOOS == "ios":
517				p = uintptr(i)<<40 | uintptrMask&(0x0013<<28)
518			case GOARCH == "arm64":
519				p = uintptr(i)<<40 | uintptrMask&(0x0040<<32)
520			case GOOS == "aix":
521				if i == 0 {
					// We don't use addresses directly after 0x0A00000000000000
					// to avoid collisions with other mmaps done by non-Go programs.
524					continue
525				}
526				p = uintptr(i)<<40 | uintptrMask&(0xa0<<52)
527			default:
528				p = uintptr(i)<<40 | uintptrMask&(0x00c0<<32)
529			}
530			// Switch to generating hints for user arenas if we've gone
531			// through about half the hints. In race mode, take only about
532			// a quarter; we don't have very much space to work with.
533			hintList := &mheap_.arenaHints
534			if (!raceenabled && i > 0x3f) || (raceenabled && i > 0x5f) {
535				hintList = &mheap_.userArena.arenaHints
536			}
537			hint := (*arenaHint)(mheap_.arenaHintAlloc.alloc())
538			hint.addr = p
539			hint.next, *hintList = *hintList, hint
540		}
541	} else {
542		// On a 32-bit machine, we're much more concerned
543		// about keeping the usable heap contiguous.
544		// Hence:
545		//
546		// 1. We reserve space for all heapArenas up front so
547		// they don't get interleaved with the heap. They're
548		// ~258MB, so this isn't too bad. (We could reserve a
549		// smaller amount of space up front if this is a
550		// problem.)
551		//
552		// 2. We hint the heap to start right above the end of
553		// the binary so we have the best chance of keeping it
554		// contiguous.
555		//
556		// 3. We try to stake out a reasonably large initial
557		// heap reservation.
558
559		const arenaMetaSize = (1 << arenaBits) * unsafe.Sizeof(heapArena{})
560		meta := uintptr(sysReserve(nil, arenaMetaSize))
561		if meta != 0 {
562			mheap_.heapArenaAlloc.init(meta, arenaMetaSize, true)
563		}
564
565		// We want to start the arena low, but if we're linked
566		// against C code, it's possible global constructors
567		// have called malloc and adjusted the process' brk.
568		// Query the brk so we can avoid trying to map the
569		// region over it (which will cause the kernel to put
570		// the region somewhere else, likely at a high
571		// address).
572		procBrk := sbrk0()
573
574		// If we ask for the end of the data segment but the
575		// operating system requires a little more space
576		// before we can start allocating, it will give out a
577		// slightly higher pointer. Except QEMU, which is
578		// buggy, as usual: it won't adjust the pointer
579		// upward. So adjust it upward a little bit ourselves:
580		// 1/4 MB to get away from the running binary image.
581		p := firstmoduledata.end
582		if p < procBrk {
583			p = procBrk
584		}
585		if mheap_.heapArenaAlloc.next <= p && p < mheap_.heapArenaAlloc.end {
586			p = mheap_.heapArenaAlloc.end
587		}
588		p = alignUp(p+(256<<10), heapArenaBytes)
589		// Because we're worried about fragmentation on
590		// 32-bit, we try to make a large initial reservation.
591		arenaSizes := []uintptr{
592			512 << 20,
593			256 << 20,
594			128 << 20,
595		}
596		for _, arenaSize := range arenaSizes {
597			a, size := sysReserveAligned(unsafe.Pointer(p), arenaSize, heapArenaBytes)
598			if a != nil {
599				mheap_.arena.init(uintptr(a), size, false)
600				p = mheap_.arena.end // For hint below
601				break
602			}
603		}
604		hint := (*arenaHint)(mheap_.arenaHintAlloc.alloc())
605		hint.addr = p
606		hint.next, mheap_.arenaHints = mheap_.arenaHints, hint
607
608		// Place the hint for user arenas just after the large reservation.
609		//
610		// While this potentially competes with the hint above, in practice we probably
611		// aren't going to be getting this far anyway on 32-bit platforms.
612		userArenaHint := (*arenaHint)(mheap_.arenaHintAlloc.alloc())
613		userArenaHint.addr = p
614		userArenaHint.next, mheap_.userArena.arenaHints = mheap_.userArena.arenaHints, userArenaHint
615	}
616	// Initialize the memory limit here because the allocator is going to look at it
617	// but we haven't called gcinit yet and we're definitely going to allocate memory before then.
618	gcController.memoryLimit.Store(maxInt64)
619}
620
621// sysAlloc allocates heap arena space for at least n bytes. The
622// returned pointer is always heapArenaBytes-aligned and backed by
623// h.arenas metadata. The returned size is always a multiple of
624// heapArenaBytes. sysAlloc returns nil on failure.
625// There is no corresponding free function.
626//
627// hintList is a list of hint addresses for where to allocate new
628// heap arenas. It must be non-nil.
629//
630// register indicates whether the heap arena should be registered
631// in allArenas.
632//
633// sysAlloc returns a memory region in the Reserved state. This region must
634// be transitioned to Prepared and then Ready before use.
635//
636// h must be locked.
637func (h *mheap) sysAlloc(n uintptr, hintList **arenaHint, register bool) (v unsafe.Pointer, size uintptr) {
638	assertLockHeld(&h.lock)
639
640	n = alignUp(n, heapArenaBytes)
641
642	if hintList == &h.arenaHints {
643		// First, try the arena pre-reservation.
644		// Newly-used mappings are considered released.
645		//
646		// Only do this if we're using the regular heap arena hints.
647		// This behavior is only for the heap.
648		v = h.arena.alloc(n, heapArenaBytes, &gcController.heapReleased)
649		if v != nil {
650			size = n
651			goto mapped
652		}
653	}
654
655	// Try to grow the heap at a hint address.
656	for *hintList != nil {
657		hint := *hintList
658		p := hint.addr
659		if hint.down {
660			p -= n
661		}
662		if p+n < p {
663			// We can't use this, so don't ask.
664			v = nil
665		} else if arenaIndex(p+n-1) >= 1<<arenaBits {
666			// Outside addressable heap. Can't use.
667			v = nil
668		} else {
669			v = sysReserve(unsafe.Pointer(p), n)
670		}
671		if p == uintptr(v) {
672			// Success. Update the hint.
673			if !hint.down {
674				p += n
675			}
676			hint.addr = p
677			size = n
678			break
679		}
680		// Failed. Discard this hint and try the next.
681		//
682		// TODO: This would be cleaner if sysReserve could be
683		// told to only return the requested address. In
684		// particular, this is already how Windows behaves, so
685		// it would simplify things there.
686		if v != nil {
687			sysFreeOS(v, n)
688		}
689		*hintList = hint.next
690		h.arenaHintAlloc.free(unsafe.Pointer(hint))
691	}
692
693	if size == 0 {
694		if raceenabled {
695			// The race detector assumes the heap lives in
696			// [0x00c000000000, 0x00e000000000), but we
697			// just ran out of hints in this region. Give
698			// a nice failure.
699			throw("too many address space collisions for -race mode")
700		}
701
702		// All of the hints failed, so we'll take any
703		// (sufficiently aligned) address the kernel will give
704		// us.
705		v, size = sysReserveAligned(nil, n, heapArenaBytes)
706		if v == nil {
707			return nil, 0
708		}
709
710		// Create new hints for extending this region.
711		hint := (*arenaHint)(h.arenaHintAlloc.alloc())
712		hint.addr, hint.down = uintptr(v), true
713		hint.next, mheap_.arenaHints = mheap_.arenaHints, hint
714		hint = (*arenaHint)(h.arenaHintAlloc.alloc())
715		hint.addr = uintptr(v) + size
716		hint.next, mheap_.arenaHints = mheap_.arenaHints, hint
717	}
718
719	// Check for bad pointers or pointers we can't use.
720	{
721		var bad string
722		p := uintptr(v)
723		if p+size < p {
724			bad = "region exceeds uintptr range"
725		} else if arenaIndex(p) >= 1<<arenaBits {
726			bad = "base outside usable address space"
727		} else if arenaIndex(p+size-1) >= 1<<arenaBits {
728			bad = "end outside usable address space"
729		}
730		if bad != "" {
731			// This should be impossible on most architectures,
732			// but it would be really confusing to debug.
733			print("runtime: memory allocated by OS [", hex(p), ", ", hex(p+size), ") not in usable address space: ", bad, "\n")
734			throw("memory reservation exceeds address space limit")
735		}
736	}
737
738	if uintptr(v)&(heapArenaBytes-1) != 0 {
739		throw("misrounded allocation in sysAlloc")
740	}
741
742mapped:
743	// Create arena metadata.
744	for ri := arenaIndex(uintptr(v)); ri <= arenaIndex(uintptr(v)+size-1); ri++ {
745		l2 := h.arenas[ri.l1()]
746		if l2 == nil {
747			// Allocate an L2 arena map.
748			//
749			// Use sysAllocOS instead of sysAlloc or persistentalloc because there's no
750			// statistic we can comfortably account for this space in. With this structure,
751			// we rely on demand paging to avoid large overheads, but tracking which memory
752			// is paged in is too expensive. Trying to account for the whole region means
753			// that it will appear like an enormous memory overhead in statistics, even though
754			// it is not.
755			l2 = (*[1 << arenaL2Bits]*heapArena)(sysAllocOS(unsafe.Sizeof(*l2)))
756			if l2 == nil {
757				throw("out of memory allocating heap arena map")
758			}
759			if h.arenasHugePages {
760				sysHugePage(unsafe.Pointer(l2), unsafe.Sizeof(*l2))
761			} else {
762				sysNoHugePage(unsafe.Pointer(l2), unsafe.Sizeof(*l2))
763			}
764			atomic.StorepNoWB(unsafe.Pointer(&h.arenas[ri.l1()]), unsafe.Pointer(l2))
765		}
766
767		if l2[ri.l2()] != nil {
768			throw("arena already initialized")
769		}
770		var r *heapArena
771		r = (*heapArena)(h.heapArenaAlloc.alloc(unsafe.Sizeof(*r), goarch.PtrSize, &memstats.gcMiscSys))
772		if r == nil {
773			r = (*heapArena)(persistentalloc(unsafe.Sizeof(*r), goarch.PtrSize, &memstats.gcMiscSys))
774			if r == nil {
775				throw("out of memory allocating heap arena metadata")
776			}
777		}
778
779		// Register the arena in allArenas if requested.
780		if register {
781			if len(h.allArenas) == cap(h.allArenas) {
782				size := 2 * uintptr(cap(h.allArenas)) * goarch.PtrSize
783				if size == 0 {
784					size = physPageSize
785				}
786				newArray := (*notInHeap)(persistentalloc(size, goarch.PtrSize, &memstats.gcMiscSys))
787				if newArray == nil {
788					throw("out of memory allocating allArenas")
789				}
790				oldSlice := h.allArenas
791				*(*notInHeapSlice)(unsafe.Pointer(&h.allArenas)) = notInHeapSlice{newArray, len(h.allArenas), int(size / goarch.PtrSize)}
792				copy(h.allArenas, oldSlice)
793				// Do not free the old backing array because
794				// there may be concurrent readers. Since we
795				// double the array each time, this can lead
796				// to at most 2x waste.
797			}
798			h.allArenas = h.allArenas[:len(h.allArenas)+1]
799			h.allArenas[len(h.allArenas)-1] = ri
800		}
801
802		// Store atomically just in case an object from the
803		// new heap arena becomes visible before the heap lock
804		// is released (which shouldn't happen, but there's
805		// little downside to this).
806		atomic.StorepNoWB(unsafe.Pointer(&l2[ri.l2()]), unsafe.Pointer(r))
807	}
808
809	// Tell the race detector about the new heap memory.
810	if raceenabled {
811		racemapshadow(v, size)
812	}
813
814	return
815}
816
817// sysReserveAligned is like sysReserve, but the returned pointer is
818// aligned to align bytes. It may reserve either n or n+align bytes,
819// so it returns the size that was reserved.
820func sysReserveAligned(v unsafe.Pointer, size, align uintptr) (unsafe.Pointer, uintptr) {
821	// Since the alignment is rather large in uses of this
822	// function, we're not likely to get it by chance, so we ask
823	// for a larger region and remove the parts we don't need.
824	retries := 0
825retry:
826	p := uintptr(sysReserve(v, size+align))
827	switch {
828	case p == 0:
829		return nil, 0
830	case p&(align-1) == 0:
831		return unsafe.Pointer(p), size + align
832	case GOOS == "windows":
833		// On Windows we can't release pieces of a
834		// reservation, so we release the whole thing and
835		// re-reserve the aligned sub-region. This may race,
836		// so we may have to try again.
837		sysFreeOS(unsafe.Pointer(p), size+align)
838		p = alignUp(p, align)
839		p2 := sysReserve(unsafe.Pointer(p), size)
840		if p != uintptr(p2) {
841			// Must have raced. Try again.
842			sysFreeOS(p2, size)
843			if retries++; retries == 100 {
844				throw("failed to allocate aligned heap memory; too many retries")
845			}
846			goto retry
847		}
848		// Success.
849		return p2, size
850	default:
851		// Trim off the unaligned parts.
852		pAligned := alignUp(p, align)
853		sysFreeOS(unsafe.Pointer(p), pAligned-p)
854		end := pAligned + size
855		endLen := (p + size + align) - end
856		if endLen > 0 {
857			sysFreeOS(unsafe.Pointer(end), endLen)
858		}
859		return unsafe.Pointer(pAligned), size
860	}
861}
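
// As a worked example of the default (non-Windows) case above: with
// size = 256MB, align = 64MB, and a reservation that happens to start at
// p = 0x7f01000000, pAligned = 0x7f04000000; the 48MB prefix [p, pAligned)
// and the 16MB suffix [pAligned+size, p+size+align) are freed, leaving an
// aligned 256MB reservation.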
862
863// enableMetadataHugePages enables huge pages for various sources of heap metadata.
864//
865// A note on latency: for sufficiently small heaps (<10s of GiB) this function will take constant
866// time, but may take time proportional to the size of the mapped heap beyond that.
867//
868// This function is idempotent.
869//
870// The heap lock must not be held over this operation, since it will briefly acquire
871// the heap lock.
872//
873// Must be called on the system stack because it acquires the heap lock.
874//
875//go:systemstack
876func (h *mheap) enableMetadataHugePages() {
877	// Enable huge pages for page structure.
878	h.pages.enableChunkHugePages()
879
880	// Grab the lock and set arenasHugePages if it's not.
881	//
882	// Once arenasHugePages is set, all new L2 entries will be eligible for
883	// huge pages. We'll set all the old entries after we release the lock.
884	lock(&h.lock)
885	if h.arenasHugePages {
886		unlock(&h.lock)
887		return
888	}
889	h.arenasHugePages = true
890	unlock(&h.lock)
891
892	// N.B. The arenas L1 map is quite small on all platforms, so it's fine to
893	// just iterate over the whole thing.
894	for i := range h.arenas {
895		l2 := (*[1 << arenaL2Bits]*heapArena)(atomic.Loadp(unsafe.Pointer(&h.arenas[i])))
896		if l2 == nil {
897			continue
898		}
899		sysHugePage(unsafe.Pointer(l2), unsafe.Sizeof(*l2))
900	}
901}
902
903// base address for all 0-byte allocations
904var zerobase uintptr
905
906// nextFreeFast returns the next free object if one is quickly available.
907// Otherwise it returns 0.
908func nextFreeFast(s *mspan) gclinkptr {
909	theBit := sys.TrailingZeros64(s.allocCache) // Is there a free object in the allocCache?
910	if theBit < 64 {
911		result := s.freeindex + uint16(theBit)
912		if result < s.nelems {
913			freeidx := result + 1
914			if freeidx%64 == 0 && freeidx != s.nelems {
915				return 0
916			}
917			s.allocCache >>= uint(theBit + 1)
918			s.freeindex = freeidx
919			s.allocCount++
920			return gclinkptr(uintptr(result)*s.elemsize + s.base())
921		}
922	}
923	return 0
924}
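
// As an illustrative, self-contained sketch of the bitmap scan above (using
// math/bits, the standard-library equivalent of sys.TrailingZeros64; not
// runtime code):
//
//	cache := uint64(0b10110000)           // slots 4, 5, and 7 are free
//	theBit := bits.TrailingZeros64(cache) // 4: index of the first free slot
//	cache >>= uint(theBit + 1)            // consume it; the next scan starts past slot 4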
925
926// nextFree returns the next free object from the cached span if one is available.
927// Otherwise it refills the cache with a span with an available object and
// returns that object along with a flag indicating that this was a heavyweight
// allocation. If it was a heavyweight allocation, the caller must determine
// whether a new GC cycle needs to be started or, if the GC is active, whether
// this goroutine needs to assist the GC.
932//
933// Must run in a non-preemptible context since otherwise the owner of
934// c could change.
935func (c *mcache) nextFree(spc spanClass) (v gclinkptr, s *mspan, shouldhelpgc bool) {
936	s = c.alloc[spc]
937	shouldhelpgc = false
938	freeIndex := s.nextFreeIndex()
939	if freeIndex == s.nelems {
940		// The span is full.
941		if s.allocCount != s.nelems {
942			println("runtime: s.allocCount=", s.allocCount, "s.nelems=", s.nelems)
943			throw("s.allocCount != s.nelems && freeIndex == s.nelems")
944		}
945		c.refill(spc)
946		shouldhelpgc = true
947		s = c.alloc[spc]
948
949		freeIndex = s.nextFreeIndex()
950	}
951
952	if freeIndex >= s.nelems {
953		throw("freeIndex is not valid")
954	}
955
956	v = gclinkptr(uintptr(freeIndex)*s.elemsize + s.base())
957	s.allocCount++
958	if s.allocCount > s.nelems {
959		println("s.allocCount=", s.allocCount, "s.nelems=", s.nelems)
960		throw("s.allocCount > s.nelems")
961	}
962	return
963}
964
965// Allocate an object of size bytes.
966// Small objects are allocated from the per-P cache's free lists.
967// Large objects (> 32 kB) are allocated straight from the heap.
968//
969// mallocgc should be an internal detail,
970// but widely used packages access it using linkname.
971// Notable members of the hall of shame include:
972//   - github.com/bytedance/gopkg
973//   - github.com/bytedance/sonic
974//   - github.com/cloudwego/frugal
975//   - github.com/cockroachdb/cockroach
976//   - github.com/cockroachdb/pebble
977//   - github.com/ugorji/go/codec
978//
979// Do not remove or change the type signature.
980// See go.dev/issue/67401.
981//
982//go:linkname mallocgc
983func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
984	if gcphase == _GCmarktermination {
985		throw("mallocgc called with gcphase == _GCmarktermination")
986	}
987
988	if size == 0 {
989		return unsafe.Pointer(&zerobase)
990	}
991
992	// It's possible for any malloc to trigger sweeping, which may in
993	// turn queue finalizers. Record this dynamic lock edge.
994	lockRankMayQueueFinalizer()
995
996	userSize := size
997	if asanenabled {
		// As in the ASAN runtime library, where malloc() allocates extra memory
		// (the redzone) around the user-requested memory region and marks the
		// redzone as unaddressable, we perform the same operation in Go to
		// detect overflows and underflows.
1002		size += computeRZlog(size)
1003	}
1004
1005	if debug.malloc {
1006		if debug.sbrk != 0 {
1007			align := uintptr(16)
1008			if typ != nil {
1009				// TODO(austin): This should be just
1010				//   align = uintptr(typ.align)
1011				// but that's only 4 on 32-bit platforms,
1012				// even if there's a uint64 field in typ (see #599).
1013				// This causes 64-bit atomic accesses to panic.
1014				// Hence, we use stricter alignment that matches
1015				// the normal allocator better.
1016				if size&7 == 0 {
1017					align = 8
1018				} else if size&3 == 0 {
1019					align = 4
1020				} else if size&1 == 0 {
1021					align = 2
1022				} else {
1023					align = 1
1024				}
1025			}
1026			return persistentalloc(size, align, &memstats.other_sys)
1027		}
1028
1029		if inittrace.active && inittrace.id == getg().goid {
1030			// Init functions are executed sequentially in a single goroutine.
1031			inittrace.allocs += 1
1032		}
1033	}
1034
1035	// assistG is the G to charge for this allocation, or nil if
1036	// GC is not currently active.
1037	assistG := deductAssistCredit(size)
1038
1039	// Set mp.mallocing to keep from being preempted by GC.
1040	mp := acquirem()
1041	if mp.mallocing != 0 {
1042		throw("malloc deadlock")
1043	}
1044	if mp.gsignal == getg() {
1045		throw("malloc during signal")
1046	}
1047	mp.mallocing = 1
1048
1049	shouldhelpgc := false
1050	dataSize := userSize
1051	c := getMCache(mp)
1052	if c == nil {
1053		throw("mallocgc called without a P or outside bootstrapping")
1054	}
1055	var span *mspan
1056	var header **_type
1057	var x unsafe.Pointer
1058	noscan := typ == nil || !typ.Pointers()
1059	// In some cases block zeroing can profitably (for latency reduction purposes)
1060	// be delayed till preemption is possible; delayedZeroing tracks that state.
1061	delayedZeroing := false
1062	// Determine if it's a 'small' object that goes into a size-classed span.
1063	//
1064	// Note: This comparison looks a little strange, but it exists to smooth out
1065	// the crossover between the largest size class and large objects that have
1066	// their own spans. The small window of object sizes between maxSmallSize-mallocHeaderSize
1067	// and maxSmallSize will be considered large, even though they might fit in
1068	// a size class. In practice this is completely fine, since the largest small
1069	// size class has a single object in it already, precisely to make the transition
1070	// to large objects smooth.
1071	if size <= maxSmallSize-mallocHeaderSize {
1072		if noscan && size < maxTinySize {
1073			// Tiny allocator.
1074			//
			// The tiny allocator combines several tiny allocation requests
			// into a single memory block. The resulting memory block
			// is freed when all subobjects are unreachable. The subobjects
			// must be noscan (have no pointers); this ensures that
			// the amount of potentially wasted memory is bounded.
			//
			// The size of the memory block used for combining (maxTinySize) is tunable.
			// The current setting is 16 bytes, which implies 2x worst-case memory
			// wastage (when all but one of the subobjects are unreachable).
			// 8 bytes would result in no wastage at all, but would provide fewer
			// opportunities for combining.
			// 32 bytes provides more opportunities for combining,
			// but can lead to 4x worst-case wastage.
			// The best-case saving is 8x regardless of block size.
			//
			// Objects obtained from the tiny allocator must not be freed explicitly.
			// So when an object will be freed explicitly, we ensure that
			// its size >= maxTinySize.
			//
			// SetFinalizer has a special case for objects potentially coming
			// from the tiny allocator; in that case it allows setting finalizers
			// for an inner byte of a memory block.
			//
			// The main targets of the tiny allocator are small strings and
			// standalone escaping variables. On a json benchmark
			// the allocator reduces the number of allocations by ~12% and
			// reduces the heap size by ~20%.
1102			off := c.tinyoffset
1103			// Align tiny pointer for required (conservative) alignment.
1104			if size&7 == 0 {
1105				off = alignUp(off, 8)
1106			} else if goarch.PtrSize == 4 && size == 12 {
				// Conservatively align 12-byte objects to 8 bytes on 32-bit
				// systems so that an object whose first field is a 64-bit
				// value is aligned to 8 bytes and does not cause a fault on
				// atomic access. See issue 37262.
1111				// TODO(mknyszek): Remove this workaround if/when issue 36606
1112				// is resolved.
1113				off = alignUp(off, 8)
1114			} else if size&3 == 0 {
1115				off = alignUp(off, 4)
1116			} else if size&1 == 0 {
1117				off = alignUp(off, 2)
1118			}
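			// Worked example (illustrative): on 64-bit, successive noscan
			// allocations of 4, 4, and 8 bytes pack into one 16-byte block at
			// offsets 0, 4, and 8; a further 4-byte request no longer fits
			// (off+size = 20 > maxTinySize) and starts a new block.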
1119			if off+size <= maxTinySize && c.tiny != 0 {
1120				// The object fits into existing tiny block.
1121				x = unsafe.Pointer(c.tiny + off)
1122				c.tinyoffset = off + size
1123				c.tinyAllocs++
1124				mp.mallocing = 0
1125				releasem(mp)
1126				return x
1127			}
1128			// Allocate a new maxTinySize block.
1129			span = c.alloc[tinySpanClass]
1130			v := nextFreeFast(span)
1131			if v == 0 {
1132				v, span, shouldhelpgc = c.nextFree(tinySpanClass)
1133			}
1134			x = unsafe.Pointer(v)
1135			(*[2]uint64)(x)[0] = 0
1136			(*[2]uint64)(x)[1] = 0
1137			// See if we need to replace the existing tiny block with the new one
1138			// based on amount of remaining free space.
1139			if !raceenabled && (size < c.tinyoffset || c.tiny == 0) {
1140				// Note: disabled when race detector is on, see comment near end of this function.
1141				c.tiny = uintptr(x)
1142				c.tinyoffset = size
1143			}
1144			size = maxTinySize
1145		} else {
1146			hasHeader := !noscan && !heapBitsInSpan(size)
1147			if hasHeader {
1148				size += mallocHeaderSize
1149			}
1150			var sizeclass uint8
1151			if size <= smallSizeMax-8 {
1152				sizeclass = size_to_class8[divRoundUp(size, smallSizeDiv)]
1153			} else {
1154				sizeclass = size_to_class128[divRoundUp(size-smallSizeMax, largeSizeDiv)]
1155			}
1156			size = uintptr(class_to_size[sizeclass])
1157			spc := makeSpanClass(sizeclass, noscan)
1158			span = c.alloc[spc]
1159			v := nextFreeFast(span)
1160			if v == 0 {
1161				v, span, shouldhelpgc = c.nextFree(spc)
1162			}
1163			x = unsafe.Pointer(v)
1164			if needzero && span.needzero != 0 {
1165				memclrNoHeapPointers(x, size)
1166			}
1167			if hasHeader {
1168				header = (**_type)(x)
1169				x = add(x, mallocHeaderSize)
1170				size -= mallocHeaderSize
1171			}
1172		}
1173	} else {
1174		shouldhelpgc = true
1175		// For large allocations, keep track of zeroed state so that
		// bulk zeroing can happen later in a preemptible context.
1177		span = c.allocLarge(size, noscan)
1178		span.freeindex = 1
1179		span.allocCount = 1
1180		size = span.elemsize
1181		x = unsafe.Pointer(span.base())
1182		if needzero && span.needzero != 0 {
1183			delayedZeroing = true
1184		}
1185		if !noscan {
1186			// Tell the GC not to look at this yet.
1187			span.largeType = nil
1188			header = &span.largeType
1189		}
1190	}
1191	if !noscan && !delayedZeroing {
1192		c.scanAlloc += heapSetType(uintptr(x), dataSize, typ, header, span)
1193	}
1194
1195	// Ensure that the stores above that initialize x to
1196	// type-safe memory and set the heap bits occur before
1197	// the caller can make x observable to the garbage
1198	// collector. Otherwise, on weakly ordered machines,
1199	// the garbage collector could follow a pointer to x,
1200	// but see uninitialized memory or stale heap bits.
1201	publicationBarrier()
1202	// As x and the heap bits are initialized, update
1203	// freeIndexForScan now so x is seen by the GC
1204	// (including conservative scan) as an allocated object.
1205	// While this pointer can't escape into user code as a
1206	// _live_ pointer until we return, conservative scanning
1207	// may find a dead pointer that happens to point into this
1208	// object. Delaying this update until now ensures that
1209	// conservative scanning considers this pointer dead until
1210	// this point.
1211	span.freeIndexForScan = span.freeindex
1212
1213	// Allocate black during GC.
1214	// All slots hold nil so no scanning is needed.
1215	// This may be racing with GC so do it atomically if there can be
1216	// a race marking the bit.
1217	if gcphase != _GCoff {
1218		gcmarknewobject(span, uintptr(x))
1219	}
1220
1221	if raceenabled {
1222		racemalloc(x, size)
1223	}
1224
1225	if msanenabled {
1226		msanmalloc(x, size)
1227	}
1228
1229	if asanenabled {
		// Only the userSize bytes requested by the user should be read or written.
		// The rest of the allocated memory should be poisoned, so that we can
		// report errors when the poisoned memory is accessed.
		// The allocated memory is larger than the requested userSize; it also
		// includes the redzone and some other padding bytes.
1235		rzBeg := unsafe.Add(x, userSize)
1236		asanpoison(rzBeg, size-userSize)
1237		asanunpoison(x, userSize)
1238	}
1239
1240	// TODO(mknyszek): We should really count the header as part
1241	// of gc_sys or something. The code below just pretends it is
1242	// internal fragmentation and matches the GC's accounting by
1243	// using the whole allocation slot.
1244	fullSize := span.elemsize
1245	if rate := MemProfileRate; rate > 0 {
1246		// Note cache c only valid while m acquired; see #47302
1247		//
1248		// N.B. Use the full size because that matches how the GC
1249		// will update the mem profile on the "free" side.
1250		if rate != 1 && fullSize < c.nextSample {
1251			c.nextSample -= fullSize
1252		} else {
1253			profilealloc(mp, x, fullSize)
1254		}
1255	}
1256	mp.mallocing = 0
1257	releasem(mp)
1258
1259	// Objects can be zeroed late in a context where preemption can occur.
1260	// If the object contains pointers, its pointer data must be cleared
1261	// or otherwise indicate that the GC shouldn't scan it.
1262	// x will keep the memory alive.
1263	if delayedZeroing {
1264		// N.B. size == fullSize always in this case.
1265		memclrNoHeapPointersChunked(size, x) // This is a possible preemption point: see #47302
1266
1267		// Finish storing the type information for this case.
1268		if !noscan {
1269			mp := acquirem()
1270			getMCache(mp).scanAlloc += heapSetType(uintptr(x), dataSize, typ, header, span)
1271
1272			// Publish the type information with the zeroed memory.
1273			publicationBarrier()
1274			releasem(mp)
1275		}
1276	}
1277
1278	if debug.malloc {
1279		if inittrace.active && inittrace.id == getg().goid {
1280			// Init functions are executed sequentially in a single goroutine.
1281			inittrace.bytes += uint64(fullSize)
1282		}
1283
1284		if traceAllocFreeEnabled() {
1285			trace := traceAcquire()
1286			if trace.ok() {
1287				trace.HeapObjectAlloc(uintptr(x), typ)
1288				traceRelease(trace)
1289			}
1290		}
1291	}
1292
1293	if assistG != nil {
1294		// Account for internal fragmentation in the assist
1295		// debt now that we know it.
1296		//
1297		// N.B. Use the full size because that's how the rest
1298		// of the GC accounts for bytes marked.
1299		assistG.gcAssistBytes -= int64(fullSize - dataSize)
1300	}
1301
1302	if shouldhelpgc {
1303		if t := (gcTrigger{kind: gcTriggerHeap}); t.test() {
1304			gcStart(t)
1305		}
1306	}
1307
1308	if raceenabled && noscan && dataSize < maxTinySize {
1309		// Pad tinysize allocations so they are aligned with the end
1310		// of the tinyalloc region. This ensures that any arithmetic
1311		// that goes off the top end of the object will be detectable
1312		// by checkptr (issue 38872).
1313		// Note that we disable tinyalloc when raceenabled for this to work.
1314		// TODO: This padding is only performed when the race detector
1315		// is enabled. It would be nice to enable it if any package
1316		// was compiled with checkptr, but there's no easy way to
1317		// detect that (especially at compile time).
1318		// TODO: enable this padding for all allocations, not just
1319		// tinyalloc ones. It's tricky because of pointer maps.
1320		// Maybe just all noscan objects?
1321		x = add(x, size-dataSize)
1322	}
1323
1324	return x
1325}
1326
1327// deductAssistCredit reduces the current G's assist credit
1328// by size bytes, and assists the GC if necessary.
1329//
1330// Caller must be preemptible.
1331//
1332// Returns the G for which the assist credit was accounted.
1333func deductAssistCredit(size uintptr) *g {
1334	var assistG *g
1335	if gcBlackenEnabled != 0 {
1336		// Charge the current user G for this allocation.
1337		assistG = getg()
1338		if assistG.m.curg != nil {
1339			assistG = assistG.m.curg
1340		}
1341		// Charge the allocation against the G. We'll account
1342		// for internal fragmentation at the end of mallocgc.
1343		assistG.gcAssistBytes -= int64(size)
1344
1345		if assistG.gcAssistBytes < 0 {
1346			// This G is in debt. Assist the GC to correct
1347			// this before allocating. This must happen
1348			// before disabling preemption.
1349			gcAssistAlloc(assistG)
1350		}
1351	}
1352	return assistG
1353}
1354
1355// memclrNoHeapPointersChunked repeatedly calls memclrNoHeapPointers
1356// on chunks of the buffer to be zeroed, with opportunities for preemption
1357// along the way.  memclrNoHeapPointers contains no safepoints and also
1358// cannot be preemptively scheduled, so this provides a still-efficient
1359// block copy that can also be preempted on a reasonable granularity.
1360//
1361// Use this with care; if the data being cleared is tagged to contain
1362// pointers, this allows the GC to run before it is all cleared.
1363func memclrNoHeapPointersChunked(size uintptr, x unsafe.Pointer) {
1364	v := uintptr(x)
1365	// got this from benchmarking. 128k is too small, 512k is too large.
1366	const chunkBytes = 256 * 1024
1367	vsize := v + size
1368	for voff := v; voff < vsize; voff = voff + chunkBytes {
1369		if getg().preempt {
1370			// may hold locks, e.g., profiling
1371			goschedguarded()
1372		}
1373		// clear min(avail, lump) bytes
1374		n := vsize - voff
1375		if n > chunkBytes {
1376			n = chunkBytes
1377		}
1378		memclrNoHeapPointers(unsafe.Pointer(voff), n)
1379	}
1380}
1381
// newobject is the implementation of the new builtin.
// The compiler (both frontend and SSA backend) knows the signature
// of this function.
1385func newobject(typ *_type) unsafe.Pointer {
1386	return mallocgc(typ.Size_, typ, true)
1387}
1388
1389// reflect_unsafe_New is meant for package reflect,
1390// but widely used packages access it using linkname.
1391// Notable members of the hall of shame include:
1392//   - gitee.com/quant1x/gox
1393//   - github.com/goccy/json
1394//   - github.com/modern-go/reflect2
1395//   - github.com/v2pro/plz
1396//
1397// Do not remove or change the type signature.
1398// See go.dev/issue/67401.
1399//
1400//go:linkname reflect_unsafe_New reflect.unsafe_New
1401func reflect_unsafe_New(typ *_type) unsafe.Pointer {
1402	return mallocgc(typ.Size_, typ, true)
1403}
1404
1405//go:linkname reflectlite_unsafe_New internal/reflectlite.unsafe_New
1406func reflectlite_unsafe_New(typ *_type) unsafe.Pointer {
1407	return mallocgc(typ.Size_, typ, true)
1408}
1409
1410// newarray allocates an array of n elements of type typ.
1411//
1412// newarray should be an internal detail,
1413// but widely used packages access it using linkname.
1414// Notable members of the hall of shame include:
1415//   - github.com/RomiChan/protobuf
1416//   - github.com/segmentio/encoding
1417//   - github.com/ugorji/go/codec
1418//
1419// Do not remove or change the type signature.
1420// See go.dev/issue/67401.
1421//
1422//go:linkname newarray
1423func newarray(typ *_type, n int) unsafe.Pointer {
1424	if n == 1 {
1425		return mallocgc(typ.Size_, typ, true)
1426	}
1427	mem, overflow := math.MulUintptr(typ.Size_, uintptr(n))
1428	if overflow || mem > maxAlloc || n < 0 {
1429		panic(plainError("runtime: allocation size out of range"))
1430	}
1431	return mallocgc(mem, typ, true)
1432}

// reflect_unsafe_NewArray is meant for package reflect,
// but widely used packages access it using linkname.
// Notable members of the hall of shame include:
//   - gitee.com/quant1x/gox
//   - github.com/bytedance/sonic
//   - github.com/goccy/json
//   - github.com/modern-go/reflect2
//   - github.com/segmentio/encoding
//   - github.com/segmentio/kafka-go
//   - github.com/v2pro/plz
//
// Do not remove or change the type signature.
// See go.dev/issue/67401.
//
//go:linkname reflect_unsafe_NewArray reflect.unsafe_NewArray
func reflect_unsafe_NewArray(typ *_type, n int) unsafe.Pointer {
	return newarray(typ, n)
}

func profilealloc(mp *m, x unsafe.Pointer, size uintptr) {
	c := getMCache(mp)
	if c == nil {
		throw("profilealloc called without a P or outside bootstrapping")
	}
	c.nextSample = nextSample()
	mProf_Malloc(mp, x, size)
}

// nextSample returns the next sampling point for heap profiling. The goal is
// to sample allocations on average every MemProfileRate bytes, but with a
// completely random distribution over the allocation timeline; this
// corresponds to a Poisson process with rate 1/MemProfileRate. In a Poisson
// process, the distance between two samples follows the exponential
// distribution with mean MemProfileRate, so the best return value is a random
// number taken from an exponential distribution whose mean is MemProfileRate.
func nextSample() uintptr {
	if MemProfileRate == 1 {
		// Callers assign our return value to
		// mcache.nextSample, but nextSample is not used
		// when the rate is 1. So avoid the math below and
		// just return something.
		return 0
	}
	if GOOS == "plan9" {
		// Plan 9 doesn't support floating point in note handler.
		if gp := getg(); gp == gp.m.gsignal {
			return nextSampleNoFP()
		}
	}

	return uintptr(fastexprand(MemProfileRate))
}
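
// By way of example: with the default MemProfileRate of 512 KiB, the sampling
// points returned here are 512 KiB of allocation apart on average, though any
// individual gap can be much shorter or longer because the gaps are drawn
// from an exponential distribution.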

// fastexprand returns a random number from an exponential distribution with
// the specified mean.
func fastexprand(mean int) int32 {
	// Avoid overflow. Maximum possible step is
	// -ln(1/(1<<randomBitCount)) * mean, approximately 20 * mean.
	switch {
	case mean > 0x7000000:
		mean = 0x7000000
	case mean == 0:
		return 0
	}

	// Take a random sample of an exponential distribution with the given mean.
	// Its probability density function is (1/mean)*exp(-x/mean), so the CDF is
	// p = 1 - exp(-x/mean). Writing q for the complement,
	// q = 1 - p == exp(-x/mean)
	// log_e(q) = -x/mean
	// x = -log_e(q) * mean
	// x = log_2(q) * (-log_e(2)) * mean    ; Using log_2 for efficiency
	const randomBitCount = 26
	q := cheaprandn(1<<randomBitCount) + 1
	qlog := fastlog2(float64(q)) - randomBitCount
	if qlog > 0 {
		qlog = 0
	}
	const minusLog2 = -0.6931471805599453 // -ln(2)
	return int32(qlog*(minusLog2*float64(mean))) + 1
}
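
// Worked example: with randomBitCount = 26, a draw where q = 1<<25 gives
// qlog = 25 - 26 = -1, so the step is (-1) * (-ln 2) * mean ≈ 0.69*mean.
// That draw corresponds to q/2^26 = 0.5, and the median of an exponential
// distribution with mean m is indeed m*ln 2, so the arithmetic above is
// consistent.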

// nextSampleNoFP is similar to nextSample, but uses older,
// simpler code to avoid floating point.
func nextSampleNoFP() uintptr {
	// Set first allocation sample size.
	rate := MemProfileRate
	if rate > 0x3fffffff { // make 2*rate not overflow
		rate = 0x3fffffff
	}
	if rate != 0 {
		return uintptr(cheaprandn(uint32(2 * rate)))
	}
	return 0
}

type persistentAlloc struct {
	base *notInHeap
	off  uintptr
}

var globalAlloc struct {
	mutex
	persistentAlloc
}

// persistentChunkSize is the number of bytes we allocate when we grow
// a persistentAlloc.
const persistentChunkSize = 256 << 10

// persistentChunks is a list of all the persistent chunks we have
// allocated. The list is maintained through the first word in the
// persistent chunk. This is updated atomically.
var persistentChunks *notInHeap

// persistentalloc is a wrapper around sysAlloc that can allocate small chunks.
// There is no associated free operation.
// Intended for things like function/type/debug-related persistent data.
// If align is 0, uses default align (currently 8).
// The returned memory will be zeroed.
// sysStat must be non-nil.
//
// Consider marking persistentalloc'd types not in heap by embedding
// runtime/internal/sys.NotInHeap.
func persistentalloc(size, align uintptr, sysStat *sysMemStat) unsafe.Pointer {
	var p *notInHeap
	systemstack(func() {
		p = persistentalloc1(size, align, sysStat)
	})
	return unsafe.Pointer(p)
}
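
// An illustrative (hypothetical) call site, assuming a type sideTable that
// embeds sys.NotInHeap as suggested above:
//
//	t := (*sideTable)(persistentalloc(unsafe.Sizeof(sideTable{}), goarch.PtrSize, &memstats.other_sys))
//
// The memory is never freed, so this is only appropriate for data that lives
// for the remainder of the process.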

// Must run on system stack because stack growth can (re)invoke it.
// See issue 9174.
//
//go:systemstack
func persistentalloc1(size, align uintptr, sysStat *sysMemStat) *notInHeap {
	const (
		maxBlock = 64 << 10 // VM reservation granularity is 64K on Windows
	)

	if size == 0 {
		throw("persistentalloc: size == 0")
	}
	if align != 0 {
		if align&(align-1) != 0 {
			throw("persistentalloc: align is not a power of 2")
		}
		if align > _PageSize {
			throw("persistentalloc: align is too large")
		}
	} else {
		align = 8
	}

	if size >= maxBlock {
		return (*notInHeap)(sysAlloc(size, sysStat))
	}

	mp := acquirem()
	var persistent *persistentAlloc
	if mp != nil && mp.p != 0 {
		persistent = &mp.p.ptr().palloc
	} else {
		lock(&globalAlloc.mutex)
		persistent = &globalAlloc.persistentAlloc
	}
	persistent.off = alignUp(persistent.off, align)
	if persistent.off+size > persistentChunkSize || persistent.base == nil {
		persistent.base = (*notInHeap)(sysAlloc(persistentChunkSize, &memstats.other_sys))
		if persistent.base == nil {
			if persistent == &globalAlloc.persistentAlloc {
				unlock(&globalAlloc.mutex)
			}
			throw("runtime: cannot allocate memory")
		}

		// Add the new chunk to the persistentChunks list.
		for {
			chunks := uintptr(unsafe.Pointer(persistentChunks))
			*(*uintptr)(unsafe.Pointer(persistent.base)) = chunks
			if atomic.Casuintptr((*uintptr)(unsafe.Pointer(&persistentChunks)), chunks, uintptr(unsafe.Pointer(persistent.base))) {
				break
			}
		}
		persistent.off = alignUp(goarch.PtrSize, align)
	}
	p := persistent.base.add(persistent.off)
	persistent.off += size
	releasem(mp)
	if persistent == &globalAlloc.persistentAlloc {
		unlock(&globalAlloc.mutex)
	}

	if sysStat != &memstats.other_sys {
		sysStat.add(int64(size))
		memstats.other_sys.add(-int64(size))
	}
	return p
}
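
// A note on the layout above: the first pointer-sized word of every
// persistent chunk holds the link to the previously allocated chunk (that is
// what the CAS loop publishes), which is why persistent.off is reset to
// alignUp(goarch.PtrSize, align) rather than to 0 when a fresh chunk is
// installed.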

// inPersistentAlloc reports whether p points to memory allocated by
// persistentalloc. This must be nosplit because it is called by the
// cgo checker code, which is called by the write barrier code.
//
//go:nosplit
func inPersistentAlloc(p uintptr) bool {
	chunk := atomic.Loaduintptr((*uintptr)(unsafe.Pointer(&persistentChunks)))
	for chunk != 0 {
		if p >= chunk && p < chunk+persistentChunkSize {
			return true
		}
		chunk = *(*uintptr)(unsafe.Pointer(chunk))
	}
	return false
}

// linearAlloc is a simple linear allocator that pre-reserves a region
// of memory and then optionally maps that region into the Ready state
// as needed.
//
// The caller is responsible for locking.
type linearAlloc struct {
	next   uintptr // next free byte
	mapped uintptr // one byte past end of mapped space
	end    uintptr // end of reserved space

	mapMemory bool // transition memory from Reserved to Ready if true
}

func (l *linearAlloc) init(base, size uintptr, mapMemory bool) {
	if base+size < base {
		// Chop off the last byte. The runtime isn't prepared
		// to deal with situations where the bounds could overflow.
		// Leave that memory reserved, though, so we don't map it
		// later.
		size -= 1
	}
	l.next, l.mapped = base, base
	l.end = base + size
	l.mapMemory = mapMemory
}

func (l *linearAlloc) alloc(size, align uintptr, sysStat *sysMemStat) unsafe.Pointer {
	p := alignUp(l.next, align)
	if p+size > l.end {
		return nil
	}
	l.next = p + size
	if pEnd := alignUp(l.next-1, physPageSize); pEnd > l.mapped {
		if l.mapMemory {
			// Transition from Reserved to Prepared to Ready.
			n := pEnd - l.mapped
			sysMap(unsafe.Pointer(l.mapped), n, sysStat)
			sysUsed(unsafe.Pointer(l.mapped), n, n)
		}
		l.mapped = pEnd
	}
	return unsafe.Pointer(p)
}
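
// For illustration, assuming the reserved region starts on a page boundary
// and physPageSize is 4 KiB: the first alloc(16, 8, ...) bumps l.next by 16
// bytes and, because alignUp(l.next-1, physPageSize) now exceeds l.mapped,
// maps (and marks Ready, when mapMemory is set) the first whole page; later
// small allocations on that page advance l.next without any further mapping
// work.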

// notInHeap is off-heap memory allocated by a lower-level allocator
// like sysAlloc or persistentAlloc.
//
// In general, it's better to use real types which embed
// runtime/internal/sys.NotInHeap, but this serves as a generic type
// for situations where that isn't possible (like in the allocators).
//
// TODO: Use this as the return type of sysAlloc, persistentAlloc, etc?
type notInHeap struct{ _ sys.NotInHeap }

func (p *notInHeap) add(bytes uintptr) *notInHeap {
	return (*notInHeap)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + bytes))
}

// computeRZlog computes the size of the redzone for an allocation of
// userSize bytes. The size-class thresholds mirror the redzone sizing in
// compiler-rt.
func computeRZlog(userSize uintptr) uintptr {
	switch {
	case userSize <= (64 - 16):
		return 16 << 0
	case userSize <= (128 - 32):
		return 16 << 1
	case userSize <= (512 - 64):
		return 16 << 2
	case userSize <= (4096 - 128):
		return 16 << 3
	case userSize <= (1<<14)-256:
		return 16 << 4
	case userSize <= (1<<15)-512:
		return 16 << 5
	case userSize <= (1<<16)-1024:
		return 16 << 6
	default:
		return 16 << 7
	}
}
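
// For example, a 100-byte object falls in the "userSize <= 512-64" class
// above and gets a 64-byte redzone, while anything larger than (1<<16)-1024
// bytes gets the maximum redzone of 2048 bytes.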