1 // Copyright 2018 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "partition_alloc/partition_bucket.h"
6
7 #include <algorithm>
8 #include <bit>
9 #include <cstdint>
10 #include <tuple>
11
12 #include "build/build_config.h"
13 #include "partition_alloc/address_pool_manager.h"
14 #include "partition_alloc/freeslot_bitmap.h"
15 #include "partition_alloc/freeslot_bitmap_constants.h"
16 #include "partition_alloc/oom.h"
17 #include "partition_alloc/page_allocator.h"
18 #include "partition_alloc/page_allocator_constants.h"
19 #include "partition_alloc/partition_address_space.h"
20 #include "partition_alloc/partition_alloc.h"
21 #include "partition_alloc/partition_alloc_base/bits.h"
22 #include "partition_alloc/partition_alloc_base/compiler_specific.h"
23 #include "partition_alloc/partition_alloc_base/component_export.h"
24 #include "partition_alloc/partition_alloc_base/debug/alias.h"
25 #include "partition_alloc/partition_alloc_base/debug/debugging_buildflags.h"
26 #include "partition_alloc/partition_alloc_base/immediate_crash.h"
27 #include "partition_alloc/partition_alloc_base/thread_annotations.h"
28 #include "partition_alloc/partition_alloc_buildflags.h"
29 #include "partition_alloc/partition_alloc_check.h"
30 #include "partition_alloc/partition_alloc_config.h"
31 #include "partition_alloc/partition_alloc_constants.h"
32 #include "partition_alloc/partition_alloc_forward.h"
33 #include "partition_alloc/partition_direct_map_extent.h"
34 #include "partition_alloc/partition_freelist_entry.h"
35 #include "partition_alloc/partition_oom.h"
36 #include "partition_alloc/partition_page.h"
37 #include "partition_alloc/partition_root.h"
38 #include "partition_alloc/reservation_offset_table.h"
39 #include "partition_alloc/tagging.h"
40
41 #if BUILDFLAG(USE_STARSCAN)
42 #include "partition_alloc/starscan/pcscan.h"
43 #endif
44
45 namespace partition_alloc::internal {
46
47 namespace {
48
49 #if PA_CONFIG(ENABLE_SHADOW_METADATA)
50 PA_ALWAYS_INLINE uintptr_t ShadowMetadataStart(uintptr_t super_page,
51 pool_handle pool) {
52 uintptr_t shadow_metadata_start =
53 super_page + SystemPageSize() + ShadowPoolOffset(pool);
54 PA_DCHECK(!PartitionAddressSpace::IsInRegularPool(shadow_metadata_start));
55 PA_DCHECK(!PartitionAddressSpace::IsInBRPPool(shadow_metadata_start));
56 return shadow_metadata_start;
57 }
58 #endif
59
60 [[noreturn]] PA_NOINLINE void PartitionOutOfMemoryMappingFailure(
61 PartitionRoot* root,
62 size_t size) PA_LOCKS_EXCLUDED(PartitionRootLock(root)) {
63 PA_NO_CODE_FOLDING();
64 root->OutOfMemory(size);
65 PA_IMMEDIATE_CRASH(); // Not required, kept as documentation.
66 }
67
68 [[noreturn]] PA_NOINLINE void PartitionOutOfMemoryCommitFailure(
69 PartitionRoot* root,
70 size_t size) PA_LOCKS_EXCLUDED(PartitionRootLock(root)) {
71 PA_NO_CODE_FOLDING();
72 root->OutOfMemory(size);
73 PA_IMMEDIATE_CRASH(); // Not required, kept as documentation.
74 }
75
76 #if !BUILDFLAG(HAS_64_BIT_POINTERS) && BUILDFLAG(ENABLE_BACKUP_REF_PTR_SUPPORT)
77 // |start| has to be aligned to kSuperPageSize, but |end| doesn't. This means
78 // that a partial super page is allowed at the end. Since the block list uses
79 // kSuperPageSize granularity, a partial super page is considered blocked if
80 // there is a raw_ptr<T> pointing anywhere in that super page, even if it
81 // doesn't point to that partially allocated region.
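// For illustration (a hedged example, assuming 2 MiB super pages): with
// |start| == 0x40000000 and |end| == 0x40500000, the loop below visits the
// super pages at 0x40000000, 0x40200000 and 0x40400000; the last one is only
// partially covered by [start, end) but is still checked in full.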
82 bool AreAllowedSuperPagesForBRPPool(uintptr_t start, uintptr_t end) {
83 PA_DCHECK(!(start % kSuperPageSize));
84 for (uintptr_t super_page = start; super_page < end;
85 super_page += kSuperPageSize) {
86 // If any blocked super page is found inside the given memory region,
87 // the memory region is blocked.
88 if (!AddressPoolManagerBitmap::IsAllowedSuperPageForBRPPool(super_page)) {
89 AddressPoolManagerBitmap::IncrementBlocklistHitCount();
90 return false;
91 }
92 }
93 return true;
94 }
95 #endif // !BUILDFLAG(HAS_64_BIT_POINTERS) &&
96 // BUILDFLAG(ENABLE_BACKUP_REF_PTR_SUPPORT)
97
98 // Reserves |requested_size| worth of super pages from the specified pool.
99 // If BRP pool is requested this function will honor BRP block list.
100 //
101 // The returned address will be aligned to kSuperPageSize, and so
102 // |requested_address| should be. |requested_size| doesn't have to be, however.
103 //
104 // |requested_address| is merely a hint, which will be attempted, but easily
105 // given up on if it doesn't work the first time.
106 //
107 // The function doesn't need to hold root->lock_ or any other locks, because:
108 // - It (1) reserves memory, (2) then consults AreAllowedSuperPagesForBRPPool
109 // for that memory, and (3) returns the memory if
110 // allowed, or unreserves and decommits if not allowed. So no other
111 // overlapping region can be allocated while executing
112 // AreAllowedSuperPagesForBRPPool.
113 // - IsAllowedSuperPageForBRPPool (used by AreAllowedSuperPagesForBRPPool) is
114 // designed to not need locking.
115 uintptr_t ReserveMemoryFromPool(pool_handle pool,
116 uintptr_t requested_address,
117 size_t requested_size) {
118 PA_DCHECK(!(requested_address % kSuperPageSize));
119
120 uintptr_t reserved_address = AddressPoolManager::GetInstance().Reserve(
121 pool, requested_address, requested_size);
122
123 // In 32-bit mode, when allocating from BRP pool, verify that the requested
124 // allocation honors the block list. Find a better address otherwise.
125 #if !BUILDFLAG(HAS_64_BIT_POINTERS) && BUILDFLAG(ENABLE_BACKUP_REF_PTR_SUPPORT)
126 if (pool == kBRPPoolHandle) {
127 constexpr int kMaxRandomAddressTries = 10;
128 for (int i = 0; i < kMaxRandomAddressTries; ++i) {
129 if (!reserved_address ||
130 AreAllowedSuperPagesForBRPPool(reserved_address,
131 reserved_address + requested_size)) {
132 break;
133 }
134 AddressPoolManager::GetInstance().UnreserveAndDecommit(
135 pool, reserved_address, requested_size);
136 // No longer try to honor |requested_address|, because it didn't work for
137 // us last time.
138 reserved_address =
139 AddressPoolManager::GetInstance().Reserve(pool, 0, requested_size);
140 }
141
142 // If the allocation attempt succeeds, we will break out of the following
143 // loop immediately.
144 //
145 // Last resort: sequentially scan the whole 32-bit address space. The number
146 // of blocked super-pages should be very small, so we expect to practically
147 // never need to run the following code. Note that it may fail to find an
148 // available super page, e.g., when it becomes available after the scan
149 // passes through it, but we accept the risk.
150 for (uintptr_t address_to_try = kSuperPageSize; address_to_try != 0;
151 address_to_try += kSuperPageSize) {
152 if (!reserved_address ||
153 AreAllowedSuperPagesForBRPPool(reserved_address,
154 reserved_address + requested_size)) {
155 break;
156 }
157 AddressPoolManager::GetInstance().UnreserveAndDecommit(
158 pool, reserved_address, requested_size);
159 // Reserve() can return a different pointer than attempted.
160 reserved_address = AddressPoolManager::GetInstance().Reserve(
161 pool, address_to_try, requested_size);
162 }
163
164 // If the loop ends naturally, the last allocated region hasn't been
165 // verified. Do it now.
166 if (reserved_address &&
167 !AreAllowedSuperPagesForBRPPool(reserved_address,
168 reserved_address + requested_size)) {
169 AddressPoolManager::GetInstance().UnreserveAndDecommit(
170 pool, reserved_address, requested_size);
171 reserved_address = 0;
172 }
173 }
174 #endif // !BUILDFLAG(HAS_64_BIT_POINTERS) &&
175 // BUILDFLAG(ENABLE_BACKUP_REF_PTR_SUPPORT)
176
177 #if !BUILDFLAG(HAS_64_BIT_POINTERS)
178 // Only mark the region as belonging to the pool after it has passed the
179 // blocklist check in order to avoid a potential race with destructing a
180 // raw_ptr<T> object that points to non-PA memory in another thread.
181 // If `MarkUsed` was called earlier, the other thread could incorrectly
182   // determine that the allocation had come from PartitionAlloc.
183 if (reserved_address) {
184 AddressPoolManager::GetInstance().MarkUsed(pool, reserved_address,
185 requested_size);
186 }
187 #endif
188
189 PA_DCHECK(!(reserved_address % kSuperPageSize));
190 return reserved_address;
191 }
192
193 SlotSpanMetadata* PartitionDirectMap(PartitionRoot* root,
194 AllocFlags flags,
195 size_t raw_size,
196 size_t slot_span_alignment) {
197 PA_DCHECK((slot_span_alignment >= PartitionPageSize()) &&
198 std::has_single_bit(slot_span_alignment));
199
200 // No static EXCLUSIVE_LOCKS_REQUIRED(), as the checker doesn't understand
201 // scoped unlocking.
202 PartitionRootLock(root).AssertAcquired();
203
204 const bool return_null = ContainsFlags(flags, AllocFlags::kReturnNull);
205 if (PA_UNLIKELY(raw_size > MaxDirectMapped())) {
206 if (return_null) {
207 return nullptr;
208 }
209
210 // The lock is here to protect PA from:
211 // 1. Concurrent calls
212 // 2. Reentrant calls
213 //
214 // This is fine here however, as:
215 // 1. Concurrency: |PartitionRoot::OutOfMemory()| never returns, so the lock
216 // will not be re-acquired, which would lead to acting on inconsistent
217 // data that could have been modified in-between releasing and acquiring
218 // it.
219 // 2. Reentrancy: This is why we release the lock. On some platforms,
220 // terminating the process may free() memory, or even possibly try to
221 // allocate some. Calling free() is fine, but will deadlock since
222 // |PartitionRoot::lock_| is not recursive.
223 //
224 // Supporting reentrant calls properly is hard, and not a requirement for
225 // PA. However up to that point, we've only *read* data, not *written* to
226 // any state. Reentrant calls are then fine, especially as we don't continue
227 // on this path. The only downside is possibly endless recursion if the OOM
228 // handler allocates and fails to use UncheckedMalloc() or equivalent, but
229 // that's violating the contract of base::TerminateBecauseOutOfMemory().
230 ScopedUnlockGuard unlock{PartitionRootLock(root)};
231 PartitionExcessiveAllocationSize(raw_size);
232 }
233
234 PartitionDirectMapExtent* map_extent = nullptr;
235 PartitionPageMetadata* page_metadata = nullptr;
236
237 {
238 // Getting memory for direct-mapped allocations doesn't interact with the
239 // rest of the allocator, but takes a long time, as it involves several
240 // system calls. Although no mmap() (or equivalent) calls are made on
241     // 64-bit systems, page permissions are changed with mprotect(), which is
242 // a syscall.
243 //
244     // These calls are almost always slow (at least a couple of microseconds per syscall on a
245 // desktop Linux machine), and they also have a very long latency tail,
246 // possibly from getting descheduled. As a consequence, we should not hold
247 // the lock when performing a syscall. This is not the only problematic
248 // location, but since this one doesn't interact with the rest of the
249 // allocator, we can safely drop and then re-acquire the lock.
250 //
251 // Note that this only affects allocations that are not served out of the
252 // thread cache, but as a simple example the buffer partition in blink is
253 // frequently used for large allocations (e.g. ArrayBuffer), and frequent,
254 // small ones (e.g. WTF::String), and does not have a thread cache.
255 ScopedUnlockGuard scoped_unlock{PartitionRootLock(root)};
256
257 const size_t slot_size = PartitionRoot::GetDirectMapSlotSize(raw_size);
258 // The super page starts with a partition page worth of metadata and guard
259     // pages, hence alignment requests == PartitionPageSize() will be
260 // automatically satisfied. Padding is needed for higher-order alignment
261 // requests. Note, |slot_span_alignment| is at least 1 partition page.
262 const size_t padding_for_alignment =
263 slot_span_alignment - PartitionPageSize();
264 const size_t reservation_size = PartitionRoot::GetDirectMapReservationSize(
265 raw_size + padding_for_alignment);
266 PA_DCHECK(reservation_size >= raw_size);
267 #if BUILDFLAG(PA_DCHECK_IS_ON)
268 const size_t available_reservation_size =
269 reservation_size - padding_for_alignment -
270 PartitionRoot::GetDirectMapMetadataAndGuardPagesSize();
271 PA_DCHECK(slot_size <= available_reservation_size);
272 #endif
273
274 pool_handle pool = root->ChoosePool();
275 uintptr_t reservation_start;
276 {
277       // Reserving memory from the pool is actually not a syscall on 64-bit
278 // platforms.
279 #if !BUILDFLAG(HAS_64_BIT_POINTERS)
280 ScopedSyscallTimer timer{root};
281 #endif
282 reservation_start = ReserveMemoryFromPool(pool, 0, reservation_size);
283 }
284 if (PA_UNLIKELY(!reservation_start)) {
285 if (return_null) {
286 return nullptr;
287 }
288
289 PartitionOutOfMemoryMappingFailure(root, reservation_size);
290 }
291
292 root->total_size_of_direct_mapped_pages.fetch_add(
293 reservation_size, std::memory_order_relaxed);
294
295 // Shift by 1 partition page (metadata + guard pages) and alignment padding.
296 const uintptr_t slot_start =
297 reservation_start + PartitionPageSize() + padding_for_alignment;
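    // For example (a sketch, assuming 16 KiB partition pages and a 64 KiB
    // |slot_span_alignment|): |padding_for_alignment| is 48 KiB, so
    // |slot_start| is |reservation_start| + 64 KiB, which is 64 KiB-aligned
    // because |reservation_start| is super-page aligned (and hence 64
    // KiB-aligned as well).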
298
299 {
300 ScopedSyscallTimer timer{root};
301 RecommitSystemPages(reservation_start + SystemPageSize(),
302 SystemPageSize(),
303 #if PA_CONFIG(ENABLE_SHADOW_METADATA)
304 root->PageAccessibilityWithThreadIsolationIfEnabled(
305 PageAccessibilityConfiguration::kRead),
306 #else
307 root->PageAccessibilityWithThreadIsolationIfEnabled(
308 PageAccessibilityConfiguration::kReadWrite),
309 #endif
310 PageAccessibilityDisposition::kRequireUpdate);
311 }
312
313 if (pool == kBRPPoolHandle) {
314 // Allocate a system page for BRP ref-count table (only one of its
315 // elements will be used).
316 ScopedSyscallTimer timer{root};
317 RecommitSystemPages(reservation_start + SystemPageSize() * 2,
318 SystemPageSize(),
319 root->PageAccessibilityWithThreadIsolationIfEnabled(
320 PageAccessibilityConfiguration::kReadWrite),
321 PageAccessibilityDisposition::kRequireUpdate);
322 }
323
324 #if PA_CONFIG(ENABLE_SHADOW_METADATA)
325 {
326 ScopedSyscallTimer timer{root};
327 RecommitSystemPages(ShadowMetadataStart(reservation_start, pool),
328 SystemPageSize(),
329 root->PageAccessibilityWithThreadIsolationIfEnabled(
330 PageAccessibilityConfiguration::kReadWrite),
331 PageAccessibilityDisposition::kRequireUpdate);
332 }
333 #endif
334
335 // No need to hold root->lock_. Now that memory is reserved, no other
336 // overlapping region can be allocated (because of how pools work),
337 // so no other thread can update the same offset table entries at the
338     // same time. Furthermore, nobody will be reading these offsets until this
339 // function returns.
340 auto* offset_ptr = ReservationOffsetPointer(reservation_start);
341 [[maybe_unused]] const auto* offset_ptr_end =
342 GetReservationOffsetTableEnd(reservation_start);
343
344 // |raw_size| > MaxBucketed(). So |reservation_size| > 0.
345 PA_DCHECK(reservation_size > 0);
346 const uint16_t offset_end = (reservation_size - 1) >> kSuperPageShift;
347 for (uint16_t offset = 0; offset <= offset_end; ++offset) {
348 PA_DCHECK(offset < kOffsetTagNormalBuckets);
349 PA_DCHECK(offset_ptr < offset_ptr_end);
350 *offset_ptr++ = offset;
351 }
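    // As a concrete illustration (assuming 2 MiB super pages, i.e.
    // kSuperPageShift == 21): a 6 MiB reservation covers three super pages,
    // |offset_end| == (6 MiB - 1) >> 21 == 2, and the loop above writes the
    // entries 0, 1 and 2 for those super pages.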
352
353 auto* super_page_extent = PartitionSuperPageToExtent(reservation_start);
354 super_page_extent->root = root;
355 // The new structures are all located inside a fresh system page so they
356 // will all be zeroed out. These DCHECKs are for documentation and to assert
357 // our expectations of the kernel.
358 PA_DCHECK(!super_page_extent->number_of_consecutive_super_pages);
359 PA_DCHECK(!super_page_extent->next);
360
361 PartitionPageMetadata* first_page_metadata =
362 reinterpret_cast<PartitionPageMetadata*>(super_page_extent) + 1;
363 page_metadata = PartitionPageMetadata::FromAddr(slot_start);
364 // |first_page_metadata| and |page_metadata| may be equal, if there is no
365 // alignment padding.
366 if (page_metadata != first_page_metadata) {
367 PA_DCHECK(page_metadata > first_page_metadata);
368 PA_DCHECK(page_metadata - first_page_metadata <=
369 PartitionPageMetadata::kMaxSlotSpanMetadataOffset);
370 PA_CHECK(!first_page_metadata->is_valid);
371 first_page_metadata->has_valid_span_after_this = true;
372 first_page_metadata->slot_span_metadata_offset =
373 page_metadata - first_page_metadata;
374 }
375 auto* direct_map_metadata =
376 reinterpret_cast<PartitionDirectMapMetadata*>(page_metadata);
377 // Since direct map metadata is larger than PartitionPageMetadata, make sure
378 // the first and the last bytes are on the same system page, i.e. within the
379 // super page metadata region.
380 PA_DCHECK(
381 base::bits::AlignDown(reinterpret_cast<uintptr_t>(direct_map_metadata),
382 SystemPageSize()) ==
383 base::bits::AlignDown(reinterpret_cast<uintptr_t>(direct_map_metadata) +
384 sizeof(PartitionDirectMapMetadata) - 1,
385 SystemPageSize()));
386 PA_DCHECK(page_metadata == &direct_map_metadata->page_metadata);
387 page_metadata->is_valid = true;
388 PA_DCHECK(!page_metadata->has_valid_span_after_this);
389 PA_DCHECK(!page_metadata->slot_span_metadata_offset);
390 PA_DCHECK(!page_metadata->slot_span_metadata.next_slot_span);
391 PA_DCHECK(!page_metadata->slot_span_metadata.marked_full);
392 PA_DCHECK(!page_metadata->slot_span_metadata.num_allocated_slots);
393 PA_DCHECK(!page_metadata->slot_span_metadata.num_unprovisioned_slots);
394 PA_DCHECK(!page_metadata->slot_span_metadata.in_empty_cache());
395
396 PA_DCHECK(!direct_map_metadata->second_page_metadata
397 .subsequent_page_metadata.raw_size);
398 // Raw size is set later, by the caller.
399 direct_map_metadata->second_page_metadata.slot_span_metadata_offset = 1;
400
401 PA_DCHECK(!direct_map_metadata->bucket.active_slot_spans_head);
402 PA_DCHECK(!direct_map_metadata->bucket.empty_slot_spans_head);
403 PA_DCHECK(!direct_map_metadata->bucket.decommitted_slot_spans_head);
404 PA_DCHECK(!direct_map_metadata->bucket.num_system_pages_per_slot_span);
405 PA_DCHECK(!direct_map_metadata->bucket.num_full_slot_spans);
406 direct_map_metadata->bucket.slot_size = slot_size;
407
408 new (&page_metadata->slot_span_metadata)
409 SlotSpanMetadata(&direct_map_metadata->bucket);
410
411 // It is typically possible to map a large range of inaccessible pages, and
412 // this is leveraged in multiple places, including the pools. However,
413 // this doesn't mean that we can commit all this memory. For the vast
414 // majority of allocations, this just means that we crash in a slightly
415 // different place, but for callers ready to handle failures, we have to
416 // return nullptr. See crbug.com/1187404.
417 //
418 // Note that we didn't check above, because if we cannot even commit a
419 // single page, then this is likely hopeless anyway, and we will crash very
420 // soon.
421 //
422     // Direct map never uses tagging, as size is always > kMaxMemoryTaggingSize.
423 PA_DCHECK(raw_size > kMaxMemoryTaggingSize);
424 const bool ok = root->TryRecommitSystemPagesForDataWithAcquiringLock(
425 slot_start, slot_size, PageAccessibilityDisposition::kRequireUpdate,
426 false);
427 if (!ok) {
428 if (!return_null) {
429 PartitionOutOfMemoryCommitFailure(root, slot_size);
430 }
431
432 {
433 ScopedSyscallTimer timer{root};
434 #if !BUILDFLAG(HAS_64_BIT_POINTERS)
435 AddressPoolManager::GetInstance().MarkUnused(pool, reservation_start,
436 reservation_size);
437 #endif
438 AddressPoolManager::GetInstance().UnreserveAndDecommit(
439 pool, reservation_start, reservation_size);
440 }
441
442 root->total_size_of_direct_mapped_pages.fetch_sub(
443 reservation_size, std::memory_order_relaxed);
444
445 return nullptr;
446 }
447
448 auto* next_entry =
449 root->get_freelist_dispatcher()->EmplaceAndInitNull(slot_start);
450
451 page_metadata->slot_span_metadata.SetFreelistHead(next_entry);
452
453 map_extent = &direct_map_metadata->direct_map_extent;
454 map_extent->reservation_size = reservation_size;
455 map_extent->padding_for_alignment = padding_for_alignment;
456 map_extent->bucket = &direct_map_metadata->bucket;
457 }
458
459 PartitionRootLock(root).AssertAcquired();
460
461 // Maintain the doubly-linked list of all direct mappings.
462 map_extent->next_extent = root->direct_map_list;
463 if (map_extent->next_extent) {
464 map_extent->next_extent->prev_extent = map_extent;
465 }
466 map_extent->prev_extent = nullptr;
467 root->direct_map_list = map_extent;
468
469 return &page_metadata->slot_span_metadata;
470 }
471
472 uint8_t ComputeSystemPagesPerSlotSpanPreferSmall(size_t slot_size) {
473 if (slot_size > MaxRegularSlotSpanSize()) {
474 // This is technically not needed, as for now all the larger slot sizes are
475 // multiples of the system page size.
476 return base::bits::AlignUp(slot_size, SystemPageSize()) / SystemPageSize();
477 }
478
479 // Smaller slot spans waste less address space, as well as potentially lower
480 // fragmentation:
481 // - Address space: This comes from fuller SuperPages (since the tail end of a
482   //   SuperPage is more likely to be used when the slot span is smaller). Also,
483 // if a slot span is partially used, a smaller slot span will use less
484 // address space.
485 // - In-slot fragmentation: Slot span management code will prioritize
486 // almost-full slot spans, as well as trying to keep empty slot spans
487 // empty. The more granular this logic can work, the better.
488 //
489 // Since metadata space overhead is constant per-PartitionPage, keeping
490 // smaller slot spans makes sense.
491 //
492 // Underlying memory allocation is done per-PartitionPage, but memory commit
493 // is done per system page. This means that we prefer to fill the entirety of
494 // a PartitionPage with a slot span, but we can tolerate some system pages
495 // being empty at the end, as these will not cost committed or dirty memory.
496 //
497 // The choice below is, for multi-slot slot spans:
498 // - If a full PartitionPage slot span is possible with less than 2% of a
499 // *single* system page wasted, use it. The smallest possible size wins.
500 // - Otherwise, select the size with the smallest virtual address space
501 // loss. Allow a SlotSpan to leave some slack in its PartitionPage, up to
502 // 1/4 of the total.
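  // Worked example (a sketch, assuming 4 KiB system pages and 16 KiB
  // partition pages): for slot_size == 96, a single partition page wastes
  // 16384 % 96 == 64 bytes, which is below the 2% threshold (about 82 bytes),
  // so the loop below returns 4 system pages.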
503 for (size_t partition_page_count = 1;
504 partition_page_count <= kMaxPartitionPagesPerRegularSlotSpan;
505 partition_page_count++) {
506 size_t candidate_size = partition_page_count * PartitionPageSize();
507 size_t waste = candidate_size % slot_size;
508 if (waste <= .02 * SystemPageSize()) {
509 return partition_page_count * NumSystemPagesPerPartitionPage();
510 }
511 }
512
513 size_t best_count = 0;
514 size_t best_waste = std::numeric_limits<size_t>::max();
515 for (size_t partition_page_count = 1;
516 partition_page_count <= kMaxPartitionPagesPerRegularSlotSpan;
517 partition_page_count++) {
518 // Prefer no slack.
519 for (size_t slack = 0; slack < partition_page_count; slack++) {
520 size_t system_page_count =
521 partition_page_count * NumSystemPagesPerPartitionPage() - slack;
522 size_t candidate_size = system_page_count * SystemPageSize();
523 size_t waste = candidate_size % slot_size;
524 if (waste < best_waste) {
525 best_waste = waste;
526 best_count = system_page_count;
527 }
528 }
529 }
530 return best_count;
531 }
532
533 uint8_t ComputeSystemPagesPerSlotSpanInternal(size_t slot_size) {
534 // This works out reasonably for the current bucket sizes of the generic
535 // allocator, and the current values of partition page size and constants.
536 // Specifically, we have enough room to always pack the slots perfectly into
537 // some number of system pages. The only waste is the waste associated with
538 // unfaulted pages (i.e. wasted address space).
539 // TODO: we end up using a lot of system pages for very small sizes. For
540 // example, we'll use 12 system pages for slot size 24. The slot size is so
541 // small that the waste would be tiny with just 4, or 1, system pages. Later,
542 // we can investigate whether there are anti-fragmentation benefits to using
543 // fewer system pages.
544 double best_waste_ratio = 1.0f;
545 uint16_t best_pages = 0;
546 if (slot_size > MaxRegularSlotSpanSize()) {
547 // TODO(ajwong): Why is there a DCHECK here for this?
548 // http://crbug.com/776537
549 PA_DCHECK(!(slot_size % SystemPageSize()));
550 best_pages = static_cast<uint16_t>(slot_size >> SystemPageShift());
551 PA_CHECK(best_pages <= std::numeric_limits<uint8_t>::max());
552 return static_cast<uint8_t>(best_pages);
553 }
554 PA_DCHECK(slot_size <= MaxRegularSlotSpanSize());
555 for (uint16_t i = NumSystemPagesPerPartitionPage() - 1;
556 i <= MaxSystemPagesPerRegularSlotSpan(); ++i) {
557 size_t page_size = i << SystemPageShift();
558 size_t num_slots = page_size / slot_size;
559 size_t waste = page_size - (num_slots * slot_size);
560 // Leaving a page unfaulted is not free; the page will occupy an empty page
561 // table entry. Make a simple attempt to account for that.
562 //
563 // TODO(ajwong): This looks wrong. PTEs are allocated for all pages
564 // regardless of whether or not they are wasted. Should it just
565 // be waste += i * sizeof(void*)?
566 // http://crbug.com/776537
567 size_t num_remainder_pages = i & (NumSystemPagesPerPartitionPage() - 1);
568 size_t num_unfaulted_pages =
569 num_remainder_pages
570 ? (NumSystemPagesPerPartitionPage() - num_remainder_pages)
571 : 0;
572 waste += sizeof(void*) * num_unfaulted_pages;
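    // For instance (a hedged example, assuming 4 KiB system pages, 4 system
    // pages per partition page and a 64-bit build): with slot_size == 96 and
    // i == 3, page_size is 12288, num_slots is 128 and waste starts at 0; one
    // trailing page of the partition page stays unfaulted, so waste becomes
    // sizeof(void*) * 1 == 8, for a waste ratio of roughly 0.07%.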
573 double waste_ratio =
574 static_cast<double>(waste) / static_cast<double>(page_size);
575 if (waste_ratio < best_waste_ratio) {
576 best_waste_ratio = waste_ratio;
577 best_pages = i;
578 }
579 }
580 PA_DCHECK(best_pages > 0);
581 PA_CHECK(best_pages <= MaxSystemPagesPerRegularSlotSpan());
582 return static_cast<uint8_t>(best_pages);
583 }
584
585 } // namespace
586
587 uint8_t ComputeSystemPagesPerSlotSpan(size_t slot_size,
588 bool prefer_smaller_slot_spans) {
589 if (prefer_smaller_slot_spans) {
590 size_t system_page_count =
591 ComputeSystemPagesPerSlotSpanPreferSmall(slot_size);
592 size_t waste = (system_page_count * SystemPageSize()) % slot_size;
593 // In case the waste is too large (more than 5% of a page), don't try to use
594 // the "small" slot span formula. This happens when we have a lot of
595 // buckets, in some cases the formula doesn't find a nice, small size.
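    // With 4 KiB system pages (an assumption for illustration), that
    // threshold is roughly 205 bytes of waste per slot span.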
596 if (waste <= .05 * SystemPageSize()) {
597 return system_page_count;
598 }
599 }
600
601 return ComputeSystemPagesPerSlotSpanInternal(slot_size);
602 }
603
604 void PartitionBucket::Init(uint32_t new_slot_size) {
605 slot_size = new_slot_size;
606 slot_size_reciprocal = kReciprocalMask / new_slot_size + 1;
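  // The reciprocal is precomputed so that offsets within a slot span can be
  // divided by |slot_size| without a hardware division. Roughly, offset /
  // slot_size becomes (offset * slot_size_reciprocal) >> kReciprocalShift (a
  // sketch of the intended use, assuming kReciprocalMask ==
  // (1 << kReciprocalShift) - 1).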
607 active_slot_spans_head = SlotSpanMetadata::get_sentinel_slot_span_non_const();
608 empty_slot_spans_head = nullptr;
609 decommitted_slot_spans_head = nullptr;
610 num_full_slot_spans = 0;
611 bool prefer_smaller_slot_spans =
612 #if PA_CONFIG(PREFER_SMALLER_SLOT_SPANS)
613 true
614 #else
615 false
616 #endif
617 ;
618 num_system_pages_per_slot_span =
619 ComputeSystemPagesPerSlotSpan(slot_size, prefer_smaller_slot_spans);
620 }
621
622 PA_ALWAYS_INLINE SlotSpanMetadata* PartitionBucket::AllocNewSlotSpan(
623 PartitionRoot* root,
624 AllocFlags flags,
625 size_t slot_span_alignment) {
626 PA_DCHECK(!(root->next_partition_page % PartitionPageSize()));
627 PA_DCHECK(!(root->next_partition_page_end % PartitionPageSize()));
628
629 size_t num_partition_pages = get_pages_per_slot_span();
630 size_t slot_span_reservation_size = num_partition_pages
631 << PartitionPageShift();
632 size_t slot_span_committed_size = get_bytes_per_span();
633 PA_DCHECK(num_partition_pages <= NumPartitionPagesPerSuperPage());
634 PA_DCHECK(slot_span_committed_size % SystemPageSize() == 0);
635 PA_DCHECK(slot_span_committed_size <= slot_span_reservation_size);
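  // As a rough example (assuming 4 KiB system pages and 16 KiB partition
  // pages): a bucket with slot_size == 4096 and 4 system pages per slot span
  // reserves one partition page (16 KiB) here and commits all 16 KiB of it.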
636
637 uintptr_t adjusted_next_partition_page =
638 base::bits::AlignUp(root->next_partition_page, slot_span_alignment);
639 if (PA_UNLIKELY(adjusted_next_partition_page + slot_span_reservation_size >
640 root->next_partition_page_end)) {
641 // AllocNewSuperPage() may crash (e.g. address space exhaustion), put data
642 // on stack.
643 PA_DEBUG_DATA_ON_STACK("slotsize", slot_size);
644 PA_DEBUG_DATA_ON_STACK("spansize", slot_span_reservation_size);
645
646 // In this case, we can no longer hand out pages from the current super page
647 // allocation. Get a new super page.
648 if (!AllocNewSuperPage(root, flags)) {
649 return nullptr;
650 }
651 // AllocNewSuperPage() updates root->next_partition_page, re-query.
652 adjusted_next_partition_page =
653 base::bits::AlignUp(root->next_partition_page, slot_span_alignment);
654 PA_CHECK(adjusted_next_partition_page + slot_span_reservation_size <=
655 root->next_partition_page_end);
656 }
657
658 auto* gap_start_page =
659 PartitionPageMetadata::FromAddr(root->next_partition_page);
660 auto* gap_end_page =
661 PartitionPageMetadata::FromAddr(adjusted_next_partition_page);
662 for (auto* page = gap_start_page; page < gap_end_page; ++page) {
663 PA_DCHECK(!page->is_valid);
664 page->has_valid_span_after_this = 1;
665 }
666 root->next_partition_page =
667 adjusted_next_partition_page + slot_span_reservation_size;
668
669 uintptr_t slot_span_start = adjusted_next_partition_page;
670 auto* slot_span = &gap_end_page->slot_span_metadata;
671 InitializeSlotSpan(slot_span);
672 // Now that slot span is initialized, it's safe to call FromSlotStart.
673 PA_DCHECK(slot_span == SlotSpanMetadata::FromSlotStart(slot_span_start));
674
675   // System pages in the super page come in a decommitted state. Commit them
676 // before vending them back.
677 // If lazy commit is enabled, pages will be committed when provisioning slots,
678 // in ProvisionMoreSlotsAndAllocOne(), not here.
679 if (!kUseLazyCommit) {
680 PA_DEBUG_DATA_ON_STACK("slotsize", slot_size);
681 PA_DEBUG_DATA_ON_STACK("spansize", slot_span_reservation_size);
682 PA_DEBUG_DATA_ON_STACK("spancmt", slot_span_committed_size);
683
684 root->RecommitSystemPagesForData(
685 slot_span_start, slot_span_committed_size,
686 PageAccessibilityDisposition::kRequireUpdate,
687 slot_size <= kMaxMemoryTaggingSize);
688 }
689
690 PA_CHECK(get_slots_per_span() <= kMaxSlotsPerSlotSpan);
691
692 // Double check that we had enough space in the super page for the new slot
693 // span.
694 PA_DCHECK(root->next_partition_page <= root->next_partition_page_end);
695
696 return slot_span;
697 }
698
699 uintptr_t PartitionBucket::AllocNewSuperPageSpan(PartitionRoot* root,
700 size_t super_page_count,
701 AllocFlags flags) {
702 PA_CHECK(super_page_count > 0);
703 PA_CHECK(super_page_count <=
704 std::numeric_limits<size_t>::max() / kSuperPageSize);
705 // Need a new super page. We want to allocate super pages in a contiguous
706 // address region as much as possible. This is important for not causing
707   // page table bloat and not fragmenting address spaces in 32-bit
708 // architectures.
709 uintptr_t requested_address = root->next_super_page;
710 pool_handle pool = root->ChoosePool();
711 uintptr_t super_page_span_start = ReserveMemoryFromPool(
712 pool, requested_address, super_page_count * kSuperPageSize);
713 if (PA_UNLIKELY(!super_page_span_start)) {
714 if (ContainsFlags(flags, AllocFlags::kReturnNull)) {
715 return 0;
716 }
717
718 // Didn't manage to get a new uncommitted super page -> address space issue.
719 ::partition_alloc::internal::ScopedUnlockGuard unlock{
720 PartitionRootLock(root)};
721 PartitionOutOfMemoryMappingFailure(root, kSuperPageSize);
722 }
723
724 uintptr_t super_page_span_end =
725 super_page_span_start + super_page_count * kSuperPageSize;
726 for (uintptr_t super_page = super_page_span_start;
727 super_page < super_page_span_end; super_page += kSuperPageSize) {
728 InitializeSuperPage(root, super_page, 0);
729 }
730 return super_page_span_start;
731 }
732
733 PA_ALWAYS_INLINE uintptr_t
734 PartitionBucket::AllocNewSuperPage(PartitionRoot* root, AllocFlags flags) {
735 auto super_page = AllocNewSuperPageSpan(root, 1, flags);
736 if (PA_UNLIKELY(!super_page)) {
737 // If the `kReturnNull` flag isn't set and the allocation attempt fails,
738 // `AllocNewSuperPageSpan` should've failed with an OOM crash.
739 PA_DCHECK(ContainsFlags(flags, AllocFlags::kReturnNull));
740 return 0;
741 }
742 return SuperPagePayloadBegin(super_page, root->IsQuarantineAllowed());
743 }
744
745 PA_ALWAYS_INLINE uintptr_t
746 PartitionBucket::InitializeSuperPage(PartitionRoot* root,
747 uintptr_t super_page,
748 uintptr_t requested_address) {
749 *ReservationOffsetPointer(super_page) = kOffsetTagNormalBuckets;
750
751 root->total_size_of_super_pages.fetch_add(kSuperPageSize,
752 std::memory_order_relaxed);
753
754 root->next_super_page = super_page + kSuperPageSize;
755 uintptr_t state_bitmap =
756 super_page + PartitionPageSize() +
757 (is_direct_mapped() ? 0 : ReservedFreeSlotBitmapSize());
758 #if BUILDFLAG(USE_STARSCAN)
759 PA_DCHECK(SuperPageStateBitmapAddr(super_page) == state_bitmap);
760 const size_t state_bitmap_reservation_size =
761 root->IsQuarantineAllowed() ? ReservedStateBitmapSize() : 0;
762 const size_t state_bitmap_size_to_commit =
763 root->IsQuarantineAllowed() ? CommittedStateBitmapSize() : 0;
764 PA_DCHECK(state_bitmap_reservation_size % PartitionPageSize() == 0);
765 PA_DCHECK(state_bitmap_size_to_commit % SystemPageSize() == 0);
766 PA_DCHECK(state_bitmap_size_to_commit <= state_bitmap_reservation_size);
767 uintptr_t payload = state_bitmap + state_bitmap_reservation_size;
768 #else
769 uintptr_t payload = state_bitmap;
770 #endif // BUILDFLAG(USE_STARSCAN)
771
772 root->next_partition_page = payload;
773 root->next_partition_page_end = root->next_super_page - PartitionPageSize();
774 PA_DCHECK(payload ==
775 SuperPagePayloadBegin(super_page, root->IsQuarantineAllowed()));
776 PA_DCHECK(root->next_partition_page_end == SuperPagePayloadEnd(super_page));
777
778 // Keep the first partition page in the super page inaccessible to serve as a
779 // guard page, except an "island" in the middle where we put page metadata and
780 // also a tiny amount of extent metadata.
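  // Roughly (assuming 2 MiB super pages and 4 KiB system pages), only the
  // second system page of that first partition page is made accessible here,
  // the metadata "island"; the rest of the partition page remains an
  // inaccessible guard region (a third page may follow for the BRP ref-count
  // table, handled just below).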
781 {
782 ScopedSyscallTimer timer{root};
783 RecommitSystemPages(super_page + SystemPageSize(), SystemPageSize(),
784 #if PA_CONFIG(ENABLE_SHADOW_METADATA)
785 root->PageAccessibilityWithThreadIsolationIfEnabled(
786 PageAccessibilityConfiguration::kRead),
787 #else
788 root->PageAccessibilityWithThreadIsolationIfEnabled(
789 PageAccessibilityConfiguration::kReadWrite),
790 #endif
791 PageAccessibilityDisposition::kRequireUpdate);
792 }
793
794 if (root->ChoosePool() == kBRPPoolHandle) {
795 // Allocate a system page for BRP ref-count table.
796 ScopedSyscallTimer timer{root};
797 RecommitSystemPages(super_page + SystemPageSize() * 2, SystemPageSize(),
798 root->PageAccessibilityWithThreadIsolationIfEnabled(
799 PageAccessibilityConfiguration::kReadWrite),
800 PageAccessibilityDisposition::kRequireUpdate);
801 }
802
803 #if PA_CONFIG(ENABLE_SHADOW_METADATA)
804 {
805 ScopedSyscallTimer timer{root};
806 RecommitSystemPages(ShadowMetadataStart(super_page, root->ChoosePool()),
807 SystemPageSize(),
808 root->PageAccessibilityWithThreadIsolationIfEnabled(
809 PageAccessibilityConfiguration::kReadWrite),
810 PageAccessibilityDisposition::kRequireUpdate);
811 }
812 #endif
813
814 // If we were after a specific address, but didn't get it, assume that
815   // the system chose a lousy address. Here most OSes have a default
816 // algorithm that isn't randomized. For example, most Linux
817 // distributions will allocate the mapping directly before the last
818 // successful mapping, which is far from random. So we just get fresh
819 // randomness for the next mapping attempt.
820 if (requested_address && requested_address != super_page) {
821 root->next_super_page = 0;
822 }
823
824 // We allocated a new super page so update super page metadata.
825 // First check if this is a new extent or not.
826 auto* latest_extent = PartitionSuperPageToExtent(super_page);
827 // By storing the root in every extent metadata object, we have a fast way
828 // to go from a pointer within the partition to the root object.
829 latest_extent->root = root;
830 // Most new extents will be part of a larger extent, and these two fields
831 // are unused, but we initialize them to 0 so that we get a clear signal
832 // in case they are accidentally used.
833 latest_extent->number_of_consecutive_super_pages = 0;
834 latest_extent->next = nullptr;
835 latest_extent->number_of_nonempty_slot_spans = 0;
836
837 PartitionSuperPageExtentEntry* current_extent = root->current_extent;
838 const bool is_new_extent = super_page != requested_address;
839 if (PA_UNLIKELY(is_new_extent)) {
840 if (PA_UNLIKELY(!current_extent)) {
841 PA_DCHECK(!root->first_extent);
842 root->first_extent = latest_extent;
843 } else {
844 PA_DCHECK(current_extent->number_of_consecutive_super_pages);
845 current_extent->next = latest_extent;
846 }
847 root->current_extent = latest_extent;
848 latest_extent->number_of_consecutive_super_pages = 1;
849 } else {
850 // We allocated next to an existing extent so just nudge the size up a
851 // little.
852 PA_DCHECK(current_extent->number_of_consecutive_super_pages);
853 ++current_extent->number_of_consecutive_super_pages;
854 PA_DCHECK(payload > SuperPagesBeginFromExtent(current_extent) &&
855 payload < SuperPagesEndFromExtent(current_extent));
856 }
857
858 // If PCScan is used, commit the state bitmap. Otherwise, leave it uncommitted
859 // and let PartitionRoot::RegisterScannableRoot() commit it when needed. Make
860 // sure to register the super-page after it has been fully initialized.
861 // Otherwise, the concurrent scanner may try to access |extent->root| which
862 // could be not initialized yet.
863 #if BUILDFLAG(USE_STARSCAN)
864 if (root->IsQuarantineEnabled()) {
865 {
866 ScopedSyscallTimer timer{root};
867 RecommitSystemPages(state_bitmap, state_bitmap_size_to_commit,
868 root->PageAccessibilityWithThreadIsolationIfEnabled(
869 PageAccessibilityConfiguration::kReadWrite),
870 PageAccessibilityDisposition::kRequireUpdate);
871 }
872 PCScan::RegisterNewSuperPage(root, super_page);
873 }
874 #endif // BUILDFLAG(USE_STARSCAN)
875
876 #if BUILDFLAG(USE_FREESLOT_BITMAP)
877 // Commit the pages for freeslot bitmap.
878 if (!is_direct_mapped()) {
879 uintptr_t freeslot_bitmap_addr = super_page + PartitionPageSize();
880 PA_DCHECK(SuperPageFreeSlotBitmapAddr(super_page) == freeslot_bitmap_addr);
881 ScopedSyscallTimer timer{root};
882 RecommitSystemPages(freeslot_bitmap_addr, CommittedFreeSlotBitmapSize(),
883 root->PageAccessibilityWithThreadIsolationIfEnabled(
884 PageAccessibilityConfiguration::kReadWrite),
885 PageAccessibilityDisposition::kRequireUpdate);
886 }
887 #endif
888
889 return payload;
890 }
891
892 PA_ALWAYS_INLINE void PartitionBucket::InitializeSlotSpan(
893 SlotSpanMetadata* slot_span) {
894 new (slot_span) SlotSpanMetadata(this);
895
896 slot_span->Reset();
897
898 uint16_t num_partition_pages = get_pages_per_slot_span();
899 auto* page_metadata = reinterpret_cast<PartitionPageMetadata*>(slot_span);
900 for (uint16_t i = 0; i < num_partition_pages; ++i, ++page_metadata) {
901 PA_DCHECK(i <= PartitionPageMetadata::kMaxSlotSpanMetadataOffset);
902 page_metadata->slot_span_metadata_offset = i;
903 page_metadata->is_valid = true;
904 }
905 }
906
907 PA_ALWAYS_INLINE uintptr_t
908 PartitionBucket::ProvisionMoreSlotsAndAllocOne(PartitionRoot* root,
909 AllocFlags flags,
910 SlotSpanMetadata* slot_span) {
911 PA_DCHECK(slot_span != SlotSpanMetadata::get_sentinel_slot_span());
912 size_t num_slots = slot_span->num_unprovisioned_slots;
913 PA_DCHECK(num_slots);
914 PA_DCHECK(num_slots <= get_slots_per_span());
915 // We should only get here when _every_ slot is either used or unprovisioned.
916 // (The third possible state is "on the freelist". If we have a non-empty
917 // freelist, we should not get here.)
918 PA_DCHECK(num_slots + slot_span->num_allocated_slots == get_slots_per_span());
919 // Similarly, make explicitly sure that the freelist is empty.
920 PA_DCHECK(!slot_span->get_freelist_head());
921 PA_DCHECK(!slot_span->is_full());
922
923 uintptr_t slot_span_start = SlotSpanMetadata::ToSlotSpanStart(slot_span);
924 // If we got here, the first unallocated slot is either partially or fully on
925 // an uncommitted page. If the latter, it must be at the start of that page.
926 uintptr_t return_slot =
927 slot_span_start + (slot_size * slot_span->num_allocated_slots);
928 uintptr_t next_slot = return_slot + slot_size;
929 uintptr_t commit_start = base::bits::AlignUp(return_slot, SystemPageSize());
930 PA_DCHECK(next_slot > commit_start);
931 uintptr_t commit_end = base::bits::AlignUp(next_slot, SystemPageSize());
932 // If the slot was partially committed, |return_slot| and |next_slot| fall
933 // in different pages. If the slot was fully uncommitted, |return_slot| points
934 // to the page start and |next_slot| doesn't, thus only the latter gets
935 // rounded up.
936 PA_DCHECK(commit_end > commit_start);
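  // Concretely (a sketch, assuming 4 KiB system pages): with slot_size == 320
  // and 12 slots already allocated, |return_slot| is at offset 3840 within
  // the span and |next_slot| at 4160, so |commit_start| lands at offset 4096
  // and |commit_end| at 8192; only the second system page of the span needs
  // to be committed.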
937
938 // If lazy commit is enabled, meaning system pages in the slot span come
939 // in an initially decommitted state, commit them here.
940 // Note, we can't use PageAccessibilityDisposition::kAllowKeepForPerf, because
  // we have no knowledge of which pages have been committed before (it doesn't
942 // matter on Windows anyway).
943 if (kUseLazyCommit) {
944 const bool ok = root->TryRecommitSystemPagesForDataLocked(
945 commit_start, commit_end - commit_start,
946 PageAccessibilityDisposition::kRequireUpdate,
947 slot_size <= kMaxMemoryTaggingSize);
948 if (!ok) {
949 if (!ContainsFlags(flags, AllocFlags::kReturnNull)) {
950 ScopedUnlockGuard unlock{PartitionRootLock(root)};
951 PartitionOutOfMemoryCommitFailure(root, slot_size);
952 }
953 return 0;
954 }
955 }
956
957 // The slot being returned is considered allocated.
958 slot_span->num_allocated_slots++;
959 // Round down, because a slot that doesn't fully fit in the new page(s) isn't
960 // provisioned.
961 size_t slots_to_provision = (commit_end - return_slot) / slot_size;
962 slot_span->num_unprovisioned_slots -= slots_to_provision;
963 PA_DCHECK(slot_span->num_allocated_slots +
964 slot_span->num_unprovisioned_slots <=
965 get_slots_per_span());
966
967 #if BUILDFLAG(HAS_MEMORY_TAGGING)
968 const bool use_tagging =
969 root->IsMemoryTaggingEnabled() && slot_size <= kMaxMemoryTaggingSize;
970 if (PA_LIKELY(use_tagging)) {
971 // Ensure the MTE-tag of the memory pointed by |return_slot| is unguessable.
972 TagMemoryRangeRandomly(return_slot, slot_size);
973 }
974 #endif // BUILDFLAG(HAS_MEMORY_TAGGING)
975 // Add all slots that fit within so far committed pages to the free list.
976 PartitionFreelistEntry* prev_entry = nullptr;
977 uintptr_t next_slot_end = next_slot + slot_size;
978 size_t free_list_entries_added = 0;
979
980 const auto* freelist_dispatcher = root->get_freelist_dispatcher();
981
982 while (next_slot_end <= commit_end) {
983 void* next_slot_ptr;
984 #if BUILDFLAG(HAS_MEMORY_TAGGING)
985 if (PA_LIKELY(use_tagging)) {
986 // Ensure the MTE-tag of the memory pointed by other provisioned slot is
987 // unguessable. They will be returned to the app as is, and the MTE-tag
988 // will only change upon calling Free().
989 next_slot_ptr = TagMemoryRangeRandomly(next_slot, slot_size);
990 } else {
991 // No MTE-tagging for larger slots, just cast.
992 next_slot_ptr = reinterpret_cast<void*>(next_slot);
993 }
994 #else // BUILDFLAG(HAS_MEMORY_TAGGING)
995 next_slot_ptr = reinterpret_cast<void*>(next_slot);
996 #endif
997
998 auto* entry = freelist_dispatcher->EmplaceAndInitNull(next_slot_ptr);
999
1000 if (!slot_span->get_freelist_head()) {
1001 PA_DCHECK(!prev_entry);
1002 PA_DCHECK(!free_list_entries_added);
1003 slot_span->SetFreelistHead(entry);
1004 } else {
1005 PA_DCHECK(free_list_entries_added);
1006 freelist_dispatcher->SetNext(prev_entry, entry);
1007 }
1008 #if BUILDFLAG(USE_FREESLOT_BITMAP)
1009 FreeSlotBitmapMarkSlotAsFree(next_slot);
1010 #endif
1011 next_slot = next_slot_end;
1012 next_slot_end = next_slot + slot_size;
1013 prev_entry = entry;
1014 #if BUILDFLAG(PA_DCHECK_IS_ON)
1015 free_list_entries_added++;
1016 #endif
1017 }
1018
1019 #if BUILDFLAG(USE_FREESLOT_BITMAP)
1020 FreeSlotBitmapMarkSlotAsFree(return_slot);
1021 #endif
1022
1023 #if BUILDFLAG(PA_DCHECK_IS_ON)
1024 // The only provisioned slot not added to the free list is the one being
1025 // returned.
1026 PA_DCHECK(slots_to_provision == free_list_entries_added + 1);
1027 // We didn't necessarily provision more than one slot (e.g. if |slot_size|
1028 // is large), meaning that |slot_span->freelist_head| can be nullptr.
1029 if (slot_span->get_freelist_head()) {
1030 PA_DCHECK(free_list_entries_added);
1031 freelist_dispatcher->CheckFreeList(slot_span->get_freelist_head(),
1032 slot_size);
1033 }
1034 #endif
1035
1036 // We had no free slots, and created some (potentially 0) in sorted order.
1037 slot_span->set_freelist_sorted();
1038
1039 return return_slot;
1040 }
1041
1042 bool PartitionBucket::SetNewActiveSlotSpan() {
1043 SlotSpanMetadata* slot_span = active_slot_spans_head;
1044 if (slot_span == SlotSpanMetadata::get_sentinel_slot_span()) {
1045 return false;
1046 }
1047
1048 SlotSpanMetadata* next_slot_span;
1049
1050 // The goal here is to find a suitable slot span in the active list. Suitable
1051 // slot spans are |is_active()|, i.e. they either have (a) freelist entries,
1052 // or (b) unprovisioned free space. The first case is preferable, since it
1053 // doesn't cost a system call, and doesn't cause new memory to become dirty.
1054 //
1055 // While looking for a new slot span, active list maintenance is performed,
1056 // that is:
1057 // - Empty and decommitted slot spans are moved to their respective lists.
1058 // - Full slot spans are removed from the active list but are not moved
1059 // anywhere. They could be tracked in a separate list, but this would
1060   //   increase cost non-trivially. Indeed, a full slot span is likely to become
1061 // non-full at some point (due to a free() hitting it). Since we only have
1062 // space in the metadata for a single linked list pointer, removing the
1063 // newly-non-full slot span from the "full" list would require walking it
1064 // (to know what's before it in the full list).
1065 //
1066 // Since we prefer slot spans with provisioned freelist entries, maintenance
1067 // happens in two stages:
1068   // 1. Walk the list to find candidates. Each of the skipped slot spans is moved
1069   //    to either:
1070   //    - one of the long-lived lists: empty, decommitted
1071   //    - the temporary "active slot spans with no freelist entry" list
1072   //    - nowhere, for full slot spans.
1073 // 2. Once we have a candidate:
1074 // - Set it as the new active list head
1075 // - Reattach the temporary list
1076 //
1077 // Note that in most cases, the whole list will not be walked and maintained
1078 // at this stage.
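  // Rough illustration of one pass (not exhaustive): given an active list
  // [full, empty, A (active, no freelist), B (active, with freelist), C],
  // "full" is only counted, "empty" moves to the empty list, A goes to the
  // temporary list, the walk stops at B, and the resulting active list is
  // [B, A, C].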
1079
1080 SlotSpanMetadata* to_provision_head = nullptr;
1081 SlotSpanMetadata* to_provision_tail = nullptr;
1082
1083 for (; slot_span; slot_span = next_slot_span) {
1084 next_slot_span = slot_span->next_slot_span;
1085 PA_DCHECK(slot_span->bucket == this);
1086 PA_DCHECK(slot_span != empty_slot_spans_head);
1087 PA_DCHECK(slot_span != decommitted_slot_spans_head);
1088
1089 if (slot_span->is_active()) {
1090 // Has provisioned slots.
1091 if (slot_span->get_freelist_head()) {
1092 // Will use this slot span, no need to go further.
1093 break;
1094 } else {
1095 // Keeping head and tail because we don't want to reverse the list.
1096 if (!to_provision_head) {
1097 to_provision_head = slot_span;
1098 }
1099 if (to_provision_tail) {
1100 to_provision_tail->next_slot_span = slot_span;
1101 }
1102 to_provision_tail = slot_span;
1103 slot_span->next_slot_span = nullptr;
1104 }
1105 } else if (slot_span->is_empty()) {
1106 slot_span->next_slot_span = empty_slot_spans_head;
1107 empty_slot_spans_head = slot_span;
1108 } else if (PA_LIKELY(slot_span->is_decommitted())) {
1109 slot_span->next_slot_span = decommitted_slot_spans_head;
1110 decommitted_slot_spans_head = slot_span;
1111 } else {
1112 PA_DCHECK(slot_span->is_full());
1113 // Move this slot span... nowhere, and also mark it as full. We need it
1114 // marked so that free'ing can tell, and move it back into the active
1115 // list.
1116 slot_span->marked_full = 1;
1117 ++num_full_slot_spans;
1118 // Overflow. Most likely a correctness issue in the code. It is in theory
1119 // possible that the number of full slot spans really reaches (1 << 24),
1120 // but this is very unlikely (and not possible with most pool settings).
1121 PA_CHECK(num_full_slot_spans);
1122 // Not necessary but might help stop accidents.
1123 slot_span->next_slot_span = nullptr;
1124 }
1125 }
1126
1127 bool usable_active_list_head = false;
1128 // Found an active slot span with provisioned entries on the freelist.
1129 if (slot_span) {
1130 usable_active_list_head = true;
1131 // We have active slot spans with unprovisioned entries. Re-attach them into
1132 // the active list, past the span with freelist entries.
1133 if (to_provision_head) {
1134 auto* next = slot_span->next_slot_span;
1135 slot_span->next_slot_span = to_provision_head;
1136 to_provision_tail->next_slot_span = next;
1137 }
1138 active_slot_spans_head = slot_span;
1139 } else if (to_provision_head) {
1140 usable_active_list_head = true;
1141 // Need to provision new slots.
1142 active_slot_spans_head = to_provision_head;
1143 } else {
1144 // Active list is now empty.
1145 active_slot_spans_head =
1146 SlotSpanMetadata::get_sentinel_slot_span_non_const();
1147 }
1148
1149 return usable_active_list_head;
1150 }
1151
1152 void PartitionBucket::MaintainActiveList() {
1153 SlotSpanMetadata* slot_span = active_slot_spans_head;
1154 if (slot_span == SlotSpanMetadata::get_sentinel_slot_span()) {
1155 return;
1156 }
1157
1158 SlotSpanMetadata* new_active_slot_spans_head = nullptr;
1159 SlotSpanMetadata* new_active_slot_spans_tail = nullptr;
1160
1161 SlotSpanMetadata* next_slot_span;
1162 for (; slot_span; slot_span = next_slot_span) {
1163 next_slot_span = slot_span->next_slot_span;
1164
1165 if (slot_span->is_active()) {
1166 // Ordering in the active slot span list matters, don't reverse it.
1167 if (!new_active_slot_spans_head) {
1168 new_active_slot_spans_head = slot_span;
1169 }
1170 if (new_active_slot_spans_tail) {
1171 new_active_slot_spans_tail->next_slot_span = slot_span;
1172 }
1173 new_active_slot_spans_tail = slot_span;
1174 slot_span->next_slot_span = nullptr;
1175 } else if (slot_span->is_empty()) {
1176 // For the empty and decommitted lists, LIFO ordering makes sense (since
1177 // it would lead to reusing memory which has been touched relatively
1178 // recently, which only matters for committed spans though).
1179 slot_span->next_slot_span = empty_slot_spans_head;
1180 empty_slot_spans_head = slot_span;
1181 } else if (slot_span->is_decommitted()) {
1182 slot_span->next_slot_span = decommitted_slot_spans_head;
1183 decommitted_slot_spans_head = slot_span;
1184 } else {
1185 // Full slot spans are not tracked, just accounted for.
1186 PA_DCHECK(slot_span->is_full());
1187 slot_span->marked_full = 1;
1188 ++num_full_slot_spans;
1189 PA_CHECK(num_full_slot_spans); // Overflow.
1190 slot_span->next_slot_span = nullptr;
1191 }
1192 }
1193
1194 if (!new_active_slot_spans_head) {
1195 new_active_slot_spans_head =
1196 SlotSpanMetadata::get_sentinel_slot_span_non_const();
1197 }
1198 active_slot_spans_head = new_active_slot_spans_head;
1199 }
1200
1201 void PartitionBucket::SortSmallerSlotSpanFreeLists() {
1202 for (auto* slot_span = active_slot_spans_head; slot_span;
1203 slot_span = slot_span->next_slot_span) {
1204 // No need to sort the freelist if it's already sorted. Note that if the
1205 // freelist is sorted, this means that it didn't change at all since the
1206 // last call. This may be a good signal to shrink it if possible (if an
1207 // entire OS page is free, we can decommit it).
1208 //
1209 // Besides saving CPU, this also avoids touching memory of fully idle slot
1210     // spans, which may require paging.
1211 if (slot_span->num_allocated_slots > 0 &&
1212 !slot_span->freelist_is_sorted()) {
1213 slot_span->SortFreelist();
1214 }
1215 }
1216 }
1217
1218 PA_COMPONENT_EXPORT(PARTITION_ALLOC)
1219 bool CompareSlotSpans(SlotSpanMetadata* a, SlotSpanMetadata* b) {
1220 auto criteria_tuple = [](SlotSpanMetadata const* a) {
1221 size_t freelist_length = a->GetFreelistLength();
1222 // The criteria are, in order (hence the lexicographic comparison below):
1223 // 1. Prefer slot spans with freelist entries. The ones without freelist
1224 // entries would be skipped in SetNewActiveSlotSpan() anyway.
1225 // 2. Then the ones with the fewest freelist entries. They are either close
1226 // to being full (for the provisioned memory), or close to being pushed
1227 // at the end of the list (since they would not have freelist entries
1228 // anymore, and would either fall into the first case, or be skipped by
1229 // SetNewActiveSlotSpan()).
1230     // 3. The ones with the fewest unprovisioned slots, meaning that they are
1231 // close to being completely full.
1232 //
1233 // Note that this sorting order is not necessarily the best one when slot
1234 // spans are partially provisioned. From local testing, in steady-state,
1235 // most slot spans are entirely provisioned (or decommitted), which may be a
1236 // consequence of the lack of partial slot span decommit, or of fairly
1237 // effective fragmentation avoidance heuristics. Make sure to evaluate
1238 // whether an alternative sorting order (sorting according to freelist size
1239 // + unprovisioned slots) makes more sense.
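    // For example, a span with 2 freelist entries and no unprovisioned slots
    // sorts before one with 5 freelist entries, and both sort before a span
    // whose freelist is empty, regardless of its unprovisioned slot count.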
1240 return std::tuple<bool, size_t, size_t>{
1241 freelist_length == 0, freelist_length, a->num_unprovisioned_slots};
1242 };
1243
1244 return criteria_tuple(a) < criteria_tuple(b);
1245 }
1246
1247 void PartitionBucket::SortActiveSlotSpans() {
1248 // Sorting up to |kMaxSlotSpansToSort| slot spans. This is capped for two
1249 // reasons:
1250 // - Limiting execution time
1251   // - The code below must not allocate (hence the fixed-size array).
1252 //
1253 // In practice though, it's rare to have that many active slot spans.
1254 SlotSpanMetadata* active_spans_array[kMaxSlotSpansToSort];
1255 size_t index = 0;
1256 SlotSpanMetadata* overflow_spans_start = nullptr;
1257
1258 for (auto* slot_span = active_slot_spans_head; slot_span;
1259 slot_span = slot_span->next_slot_span) {
1260 if (index < kMaxSlotSpansToSort) {
1261 active_spans_array[index++] = slot_span;
1262 } else {
1263 // Starting from this one, not sorting the slot spans.
1264 overflow_spans_start = slot_span;
1265 break;
1266 }
1267 }
1268
1269 // We sort the active slot spans so that allocations are preferably serviced
1270 // from the fullest ones. This way we hope to reduce fragmentation by keeping
1271 // as few slot spans as full as possible.
1272 //
1273 // With perfect information on allocation lifespan, we would be able to pack
1274 // allocations and get almost no fragmentation. This is obviously not the
1275 // case, so we have partially full SlotSpans. Nevertheless, as a heuristic we
1276 // want to:
1277 // - Keep almost-empty slot spans as empty as possible
1278 // - Keep mostly-full slot spans as full as possible
1279 //
1280 // The first part is done in the hope that future free()s will make these
1281 // slot spans completely empty, allowing us to reclaim them. To that end, sort
1282 // SlotSpans periodically so that the fullest ones are preferred.
1283 //
1284 // std::sort() is not completely guaranteed to never allocate memory. However,
1285 // it may not throw std::bad_alloc, which constrains the implementation. In
1286 // addition, this is protected by the reentrancy guard, so we would detect
1287 // such an allocation.
  std::sort(active_spans_array, active_spans_array + index, CompareSlotSpans);

  active_slot_spans_head = overflow_spans_start;

  // Reverse order, since we insert at the head of the list.
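  // For instance (hypothetical spans), if the sorted array is {A, B, C} with A
  // the fullest, inserting C, then B, then A at the head yields
  // A -> B -> C -> overflow spans, so allocations are serviced from the
  // fullest span first.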
  for (int i = index - 1; i >= 0; i--) {
    if (active_spans_array[i] == SlotSpanMetadata::get_sentinel_slot_span()) {
      // The sentinel is const, don't try to write to it.
      PA_DCHECK(active_slot_spans_head == nullptr);
    } else {
      active_spans_array[i]->next_slot_span = active_slot_spans_head;
    }
    active_slot_spans_head = active_spans_array[i];
  }
}

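// Slow-path allocation, reached when the bucket's freelist is empty or when a
// higher-order slot span alignment is requested. The fallback order below is:
//   1. Direct map, for the sentinel bucket (raw_size > kMaxBucketed).
//   2. An active slot span found by SetNewActiveSlotSpan().
//   3. An empty or decommitted slot span, recommitting it if needed.
//   4. A brand new slot span from AllocNewSlotSpan().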
uintptr_t PartitionBucket::SlowPathAlloc(PartitionRoot* root,
                                         AllocFlags flags,
                                         size_t raw_size,
                                         size_t slot_span_alignment,
                                         SlotSpanMetadata** slot_span,
                                         bool* is_already_zeroed) {
  PA_DCHECK((slot_span_alignment >= PartitionPageSize()) &&
            std::has_single_bit(slot_span_alignment));

  // The slow path is called when the freelist is empty. The only exception is
  // when a higher-order alignment is requested, in which case the freelist
  // logic is bypassed and we go directly for slot span allocation.
  bool allocate_aligned_slot_span = slot_span_alignment > PartitionPageSize();
  PA_DCHECK(!active_slot_spans_head->get_freelist_head() ||
            allocate_aligned_slot_span);

  SlotSpanMetadata* new_slot_span = nullptr;
  // |new_slot_span->bucket| will always be |this|, except when |this| is the
  // sentinel bucket, which is used to signal a direct mapped allocation. In
  // this case |new_bucket| will be set properly later. This avoids a read for
  // most allocations.
  PartitionBucket* new_bucket = this;
  *is_already_zeroed = false;

  // For the PartitionRoot::Alloc() API, we have a bunch of buckets
  // marked as special cases. We bounce them through to the slow path so that
  // we can still have a blazing fast hot path due to lack of corner-case
  // branches.
  //
  // Note: The ordering of the conditionals matters! In particular,
  // SetNewActiveSlotSpan() has a side effect even when it returns false: it
  // sweeps the active list and may move slot spans into the empty or
  // decommitted lists, which affects the subsequent conditional.
  if (PA_UNLIKELY(is_direct_mapped())) {
    PA_DCHECK(raw_size > kMaxBucketed);
    PA_DCHECK(this == &root->sentinel_bucket);
    PA_DCHECK(active_slot_spans_head ==
              SlotSpanMetadata::get_sentinel_slot_span());

    // No fast path for direct-mapped allocations.
    if (ContainsFlags(flags, AllocFlags::kFastPathOrReturnNull)) {
      return 0;
    }

    new_slot_span =
        PartitionDirectMap(root, flags, raw_size, slot_span_alignment);
    if (new_slot_span) {
      new_bucket = new_slot_span->bucket;
    }
    // Memory from PageAllocator is always zeroed.
    *is_already_zeroed = true;
  } else if (PA_LIKELY(!allocate_aligned_slot_span && SetNewActiveSlotSpan())) {
    // First, did we find an active slot span in the active list?
    new_slot_span = active_slot_spans_head;
    PA_DCHECK(new_slot_span->is_active());
  } else if (PA_LIKELY(!allocate_aligned_slot_span &&
                       (empty_slot_spans_head != nullptr ||
                        decommitted_slot_spans_head != nullptr))) {
    // Second, look in our lists of empty and decommitted slot spans.
    // Check empty slot spans first, which are preferred, but beware that an
    // empty slot span might have been decommitted.
    while (PA_LIKELY((new_slot_span = empty_slot_spans_head) != nullptr)) {
      PA_DCHECK(new_slot_span->bucket == this);
      PA_DCHECK(new_slot_span->is_empty() || new_slot_span->is_decommitted());
      empty_slot_spans_head = new_slot_span->next_slot_span;
      // Accept the empty slot span unless it got decommitted.
      if (new_slot_span->get_freelist_head()) {
        new_slot_span->next_slot_span = nullptr;
        new_slot_span->ToSuperPageExtent()
            ->IncrementNumberOfNonemptySlotSpans();

        // Re-activating an empty slot span, update accounting.
        size_t dirty_size = base::bits::AlignUp(
            new_slot_span->GetProvisionedSize(), SystemPageSize());
        PA_DCHECK(root->empty_slot_spans_dirty_bytes >= dirty_size);
        root->empty_slot_spans_dirty_bytes -= dirty_size;

        break;
      }
      PA_DCHECK(new_slot_span->is_decommitted());
      new_slot_span->next_slot_span = decommitted_slot_spans_head;
      decommitted_slot_spans_head = new_slot_span;
    }
    if (PA_UNLIKELY(!new_slot_span) &&
        PA_LIKELY(decommitted_slot_spans_head != nullptr)) {
      // Commit can be expensive, don't do it.
      if (ContainsFlags(flags, AllocFlags::kFastPathOrReturnNull)) {
        return 0;
      }

      new_slot_span = decommitted_slot_spans_head;
      PA_DCHECK(new_slot_span->bucket == this);
      PA_DCHECK(new_slot_span->is_decommitted());

      // If lazy commit is enabled, pages will be recommitted when provisioning
      // slots, in ProvisionMoreSlotsAndAllocOne(), not here.
      if (!kUseLazyCommit) {
        uintptr_t slot_span_start =
            SlotSpanMetadata::ToSlotSpanStart(new_slot_span);
        // Since lazy commit isn't used, we have a guarantee that all slot span
        // pages have been previously committed, and then decommitted using
        // PageAccessibilityDisposition::kAllowKeepForPerf, so use the
        // same option as an optimization.
        const bool ok = root->TryRecommitSystemPagesForDataLocked(
            slot_span_start, new_slot_span->bucket->get_bytes_per_span(),
            PageAccessibilityDisposition::kAllowKeepForPerf,
            slot_size <= kMaxMemoryTaggingSize);
        if (!ok) {
          if (!ContainsFlags(flags, AllocFlags::kReturnNull)) {
            ScopedUnlockGuard unlock{PartitionRootLock(root)};
            PartitionOutOfMemoryCommitFailure(
                root, new_slot_span->bucket->get_bytes_per_span());
          }
          return 0;
        }
      }

      decommitted_slot_spans_head = new_slot_span->next_slot_span;
      new_slot_span->Reset();
      *is_already_zeroed = DecommittedMemoryIsAlwaysZeroed();
    }
    PA_DCHECK(new_slot_span);
  } else {
    // Getting a new slot span is expensive, don't do it.
    if (ContainsFlags(flags, AllocFlags::kFastPathOrReturnNull)) {
      return 0;
    }

    // Third. If we get here, we need a brand new slot span.
    // TODO(bartekn): For single-slot slot spans, we can use rounded raw_size
    // as slot_span_committed_size.
    new_slot_span = AllocNewSlotSpan(root, flags, slot_span_alignment);
    // New memory from PageAllocator is always zeroed.
    *is_already_zeroed = true;
  }

  // Bail if we had a memory allocation failure.
  if (PA_UNLIKELY(!new_slot_span)) {
    PA_DCHECK(active_slot_spans_head ==
              SlotSpanMetadata::get_sentinel_slot_span());
    if (ContainsFlags(flags, AllocFlags::kReturnNull)) {
      return 0;
    }
    // See comment in PartitionDirectMap() for unlocking.
    ScopedUnlockGuard unlock{PartitionRootLock(root)};
    root->OutOfMemory(raw_size);
    PA_IMMEDIATE_CRASH();  // Not required, kept as documentation.
  }
  *slot_span = new_slot_span;

  PA_DCHECK(new_bucket != &root->sentinel_bucket);
  new_bucket->active_slot_spans_head = new_slot_span;
  if (new_slot_span->CanStoreRawSize()) {
    new_slot_span->SetRawSize(raw_size);
  }

  // If we found an active slot span with free slots, or an empty slot span, we
  // have a usable freelist head.
  if (PA_LIKELY(new_slot_span->get_freelist_head() != nullptr)) {
    const PartitionFreelistDispatcher* freelist_dispatcher =
        root->get_freelist_dispatcher();
    PartitionFreelistEntry* entry =
        new_slot_span->PopForAlloc(new_bucket->slot_size, freelist_dispatcher);
    // We may have set *is_already_zeroed to true above; make sure that the
    // freelist entry doesn't contain data. Either way, it wouldn't be a good
    // idea to let users see our internal data.
    uintptr_t slot_start = freelist_dispatcher->ClearForAllocation(entry);
    return slot_start;
  }

  // Otherwise, we need to provision more slots by committing more pages. Build
  // the free list for the newly provisioned slots.
  PA_DCHECK(new_slot_span->num_unprovisioned_slots);
  return ProvisionMoreSlotsAndAllocOne(root, flags, new_slot_span);
}

uintptr_t PartitionBucket::AllocNewSuperPageSpanForGwpAsan(
    PartitionRoot* root,
    size_t super_page_count,
    AllocFlags flags) {
  return AllocNewSuperPageSpan(root, super_page_count, flags);
}

void PartitionBucket::InitializeSlotSpanForGwpAsan(
    SlotSpanMetadata* slot_span) {
  InitializeSlotSpan(slot_span);
}

}  // namespace partition_alloc::internal