// Copyright 2020 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "partition_alloc/thread_cache.h"

#include <sys/types.h>

#include <algorithm>
#include <atomic>
#include <cstdint>

#include "build/build_config.h"
#include "partition_alloc/internal_allocator.h"
#include "partition_alloc/partition_alloc-inl.h"
#include "partition_alloc/partition_alloc_base/component_export.h"
#include "partition_alloc/partition_alloc_base/debug/debugging_buildflags.h"
#include "partition_alloc/partition_alloc_base/immediate_crash.h"
#include "partition_alloc/partition_alloc_base/time/time.h"
#include "partition_alloc/partition_alloc_buildflags.h"
#include "partition_alloc/partition_alloc_check.h"
#include "partition_alloc/partition_alloc_config.h"
#include "partition_alloc/partition_alloc_constants.h"
#include "partition_alloc/partition_freelist_entry.h"
#include "partition_alloc/partition_root.h"

namespace partition_alloc {

namespace {
ThreadCacheRegistry g_instance;
}  // namespace

namespace tools {
uintptr_t kThreadCacheNeedleArray[kThreadCacheNeedleArraySize] = {
    kNeedle1, reinterpret_cast<uintptr_t>(&g_instance),
#if BUILDFLAG(RECORD_ALLOC_INFO)
    reinterpret_cast<uintptr_t>(&internal::g_allocs),
#else
    0,
#endif
    kNeedle2};
}  // namespace tools

namespace internal {

PA_COMPONENT_EXPORT(PARTITION_ALLOC) PartitionTlsKey g_thread_cache_key;
#if PA_CONFIG(THREAD_CACHE_FAST_TLS)
PA_COMPONENT_EXPORT(PARTITION_ALLOC)
thread_local ThreadCache* g_thread_cache;
#endif

}  // namespace internal

namespace {
// Since |g_thread_cache_key| is shared, make sure that no more than one
// PartitionRoot can use it.
static std::atomic<PartitionRoot*> g_thread_cache_root;

#if BUILDFLAG(IS_WIN)
void OnDllProcessDetach() {
  // Very late allocations do occur (see crbug.com/1159411#c7 for instance),
  // including during CRT teardown. This is problematic for the thread cache,
  // which for instance relies on the CRT for TLS access. This cannot be
  // mitigated inside the thread cache (since getting to it requires querying
  // TLS), but the PartitionRoot associated with the thread cache can be made
  // to not use the thread cache anymore.
  g_thread_cache_root.load(std::memory_order_relaxed)
      ->settings.with_thread_cache = false;
}
#endif

static bool g_thread_cache_key_created = false;
}  // namespace

uint8_t ThreadCache::global_limits_[ThreadCache::kBucketCount];

// Start with the normal size, not the maximum one.
uint16_t ThreadCache::largest_active_bucket_index_ =
    internal::BucketIndexLookup::GetIndex(ThreadCache::kDefaultSizeThreshold);

// static
ThreadCacheRegistry& ThreadCacheRegistry::Instance() {
  return g_instance;
}

const internal::PartitionFreelistDispatcher*
ThreadCache::get_freelist_dispatcher_from_root() {
  return root_->get_freelist_dispatcher();
}

void ThreadCacheRegistry::RegisterThreadCache(ThreadCache* cache) {
  internal::ScopedGuard scoped_locker(GetLock());
  cache->next_ = nullptr;
  cache->prev_ = nullptr;

  ThreadCache* previous_head = list_head_;
  list_head_ = cache;
  cache->next_ = previous_head;
  if (previous_head) {
    previous_head->prev_ = cache;
  }
}

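// Unregistration unlinks from anywhere in the doubly-linked list in O(1);
// this is why each cache tracks both |next_| and |prev_|.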
void ThreadCacheRegistry::UnregisterThreadCache(ThreadCache* cache) {
  internal::ScopedGuard scoped_locker(GetLock());
  if (cache->prev_) {
    cache->prev_->next_ = cache->next_;
  }
  if (cache->next_) {
    cache->next_->prev_ = cache->prev_;
  }
  if (cache == list_head_) {
    list_head_ = cache->next_;
  }
}

void ThreadCacheRegistry::DumpStats(bool my_thread_only,
                                    ThreadCacheStats* stats) {
  ThreadCache::EnsureThreadSpecificDataInitialized();
  memset(reinterpret_cast<void*>(stats), 0, sizeof(ThreadCacheStats));

  internal::ScopedGuard scoped_locker(GetLock());
  if (my_thread_only) {
    auto* tcache = ThreadCache::Get();
    if (!ThreadCache::IsValid(tcache)) {
      return;
    }
    tcache->AccumulateStats(stats);
  } else {
    ThreadCache* tcache = list_head_;
    while (tcache) {
      // Racy, as other threads are still allocating. This is not an issue,
      // since we are only interested in statistics. However, this means that
      // count is not necessarily equal to hits + misses for the various types
      // of events.
      tcache->AccumulateStats(stats);
      tcache = tcache->next_;
    }
  }
}

void ThreadCacheRegistry::PurgeAll() {
  auto* current_thread_tcache = ThreadCache::Get();

  // May take a while, don't hold the lock while purging.
  //
  // In most cases, the current thread is more important than other ones. For
  // instance in renderers, it is the main thread. It is also the only thread
  // that we can synchronously purge.
  //
  // The reason why we trigger the purge for this one first is that assuming
  // that all threads are allocating memory, they will start purging
  // concurrently in the loop below. This will then make them all contend with
  // the main thread for the partition lock, since it is acquired/released once
  // per bucket. By purging the main thread first, we avoid these interferences
  // for this thread at least.
  if (ThreadCache::IsValid(current_thread_tcache)) {
    current_thread_tcache->Purge();
  }

  {
    internal::ScopedGuard scoped_locker(GetLock());
    ThreadCache* tcache = list_head_;
    while (tcache) {
      PA_DCHECK(ThreadCache::IsValid(tcache));
      // Cannot purge directly, need to ask the other thread to purge "at some
      // point".
      // Note that this will not work if the other thread is sleeping forever.
      // TODO(lizeb): Handle sleeping threads.
      if (tcache != current_thread_tcache) {
        tcache->SetShouldPurge();
      }
      tcache = tcache->next_;
    }
  }
}

void ThreadCacheRegistry::ForcePurgeAllThreadAfterForkUnsafe() {
  internal::ScopedGuard scoped_locker(GetLock());
  ThreadCache* tcache = list_head_;
  while (tcache) {
#if BUILDFLAG(PA_DCHECK_IS_ON)
    // Before fork(), locks are acquired in the parent process. This means that
    // a concurrent allocation in the parent which must be filled by the central
    // allocator (i.e. the thread cache bucket is empty) will block inside the
    // thread cache waiting for the lock to be released.
    //
    // In the child process, this allocation will never complete since this
    // thread will not be resumed. However, calling |Purge()| triggers the
    // reentrancy guard since the parent process thread was suspended from
    // within the thread cache.
    // Clear the guard to prevent this from crashing.
    tcache->is_in_thread_cache_ = false;
#endif
    // There is a PA_DCHECK() in code called from |Purge()| checking that thread
    // cache memory accounting is correct. Since we are after fork() and the
    // other threads got interrupted mid-flight, this guarantee does not hold,
    // and we get inconsistent results.  Rather than giving up on checking this
    // invariant in regular code, reset it here so that the PA_DCHECK()
    // passes. See crbug.com/1216964.
    tcache->cached_memory_ = tcache->CachedMemory();

    // At this point, we should call |TryPurge|. However, due to the thread
    // cache being possibly inconsistent at this point, this may crash. Rather
    // than crash, we'd prefer to simply not purge, even though this may leak
    // memory in some cases.
    //
    // see crbug.com/1289092 for details of the crashes.

    tcache = tcache->next_;
  }
}

void ThreadCacheRegistry::SetLargestActiveBucketIndex(
    uint16_t largest_active_bucket_index) {
  largest_active_bucket_index_ = largest_active_bucket_index;
}

void ThreadCacheRegistry::SetThreadCacheMultiplier(float multiplier) {
  // Two steps:
  // - Set the global limits, which will affect newly created threads.
  // - Enumerate all thread caches and set the limit to the global one.
  {
    internal::ScopedGuard scoped_locker(GetLock());
    ThreadCache* tcache = list_head_;

    // This can be called before *any* thread cache has serviced *any*
    // allocation, which happens in tests, and in theory could happen in
    // non-test code as well.
    if (!tcache) {
      return;
    }

    // Setting the global limit while locked, because we need |tcache->root_|.
    ThreadCache::SetGlobalLimits(tcache->root_, multiplier);

    while (tcache) {
      PA_DCHECK(ThreadCache::IsValid(tcache));
      for (int index = 0; index < ThreadCache::kBucketCount; index++) {
        // This is racy, but we don't care if the limit is enforced later, and
        // we really want to avoid atomic instructions on the fast path.
        tcache->buckets_[index].limit.store(ThreadCache::global_limits_[index],
                                            std::memory_order_relaxed);
      }

      tcache = tcache->next_;
    }
  }
}

void ThreadCacheRegistry::SetPurgingConfiguration(
    const internal::base::TimeDelta min_purge_interval,
    const internal::base::TimeDelta max_purge_interval,
    const internal::base::TimeDelta default_purge_interval,
    size_t min_cached_memory_for_purging_bytes) {
  PA_CHECK(min_purge_interval <= default_purge_interval);
  PA_CHECK(default_purge_interval <= max_purge_interval);
  min_purge_interval_ = min_purge_interval;
  max_purge_interval_ = max_purge_interval;
  default_purge_interval_ = default_purge_interval;
  min_cached_memory_for_purging_bytes_ = min_cached_memory_for_purging_bytes;
  is_purging_configured_ = true;
}

void ThreadCacheRegistry::RunPeriodicPurge() {
  if (!periodic_purge_is_initialized_) {
    ThreadCache::EnsureThreadSpecificDataInitialized();
    periodic_purge_is_initialized_ = true;
  }

  PA_CHECK(is_purging_configured_);

  // Summing across all threads can be slow, but is necessary. Otherwise we rely
  // on the assumption that the current thread is a good proxy for overall
  // allocation activity. This is not the case for all process types.
  //
  // Since there is no synchronization with other threads, the value is stale,
  // which is fine.
  size_t cached_memory_approx = 0;
  {
    internal::ScopedGuard scoped_locker(GetLock());
    ThreadCache* tcache = list_head_;
    // Can run when there is no thread cache, in which case there is nothing to
    // do, and the task should not be rescheduled. This would typically indicate
    // a case where the thread cache was never enabled, or got disabled.
    if (!tcache) {
      return;
    }

    while (tcache) {
      cached_memory_approx += tcache->cached_memory_;
      tcache = tcache->next_;
    }
  }

  // If cached memory is low, this means that either memory footprint is fine,
  // or the process is mostly idle, and not allocating much since the last
  // purge. In this case, back off. On the other hand, if there is a lot of
  // cached memory, make purge more frequent, but always within a set frequency
  // range.
  //
  // There is a potential drawback: a process that was idle for a long time and
  // suddenly becomes very active will take some time to go back to a regularly
  // scheduled purge with a small enough interval. This is the case, for
  // instance, of a renderer moving to the foreground. To mitigate that, if the
  // jump in cached memory is very large, make a greater leap towards faster
  // purging.
  if (cached_memory_approx > 10 * min_cached_memory_for_purging_bytes_) {
    periodic_purge_next_interval_ =
        std::min(default_purge_interval_, periodic_purge_next_interval_ / 2);
  } else if (cached_memory_approx > 2 * min_cached_memory_for_purging_bytes_) {
    periodic_purge_next_interval_ =
        std::max(min_purge_interval_, periodic_purge_next_interval_ / 2);
  } else if (cached_memory_approx < min_cached_memory_for_purging_bytes_) {
    periodic_purge_next_interval_ =
        std::min(max_purge_interval_, periodic_purge_next_interval_ * 2);
  }

  // Make sure that the next interval is in the right bounds. Even though the
  // logic above should eventually converge to a reasonable interval, if a
  // sleeping background thread holds onto a large amount of cached memory, then
  // |PurgeAll()| will not free any memory from it, and the first branch above
  // can be taken repeatedly until the interval gets very small, as the amount
  // of cached memory cannot change between calls (since we do not purge
  // background threads, but only ask them to purge their own cache at the next
  // allocation).
  periodic_purge_next_interval_ = std::clamp(
      periodic_purge_next_interval_, min_purge_interval_, max_purge_interval_);

  PurgeAll();
}
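
// Illustrative numbers for the adaptive schedule in RunPeriodicPurge() above
// (the real thresholds come from SetPurgingConfiguration()): with
// |min_cached_memory_for_purging_bytes_| at 500kiB, roughly 10MiB cached
// across threads takes the first branch and halves the interval (capped at
// |default_purge_interval_|), while 200kiB takes the last branch and doubles
// it; the final clamp keeps the result within
// [min_purge_interval_, max_purge_interval_] either way.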

int64_t ThreadCacheRegistry::GetPeriodicPurgeNextIntervalInMicroseconds()
    const {
  return periodic_purge_next_interval_.InMicroseconds();
}

void ThreadCacheRegistry::ResetForTesting() {
  periodic_purge_next_interval_ = default_purge_interval_;
}

// static
void ThreadCache::EnsureThreadSpecificDataInitialized() {
  // Using the registry lock to protect from concurrent initialization without
  // adding a special-purpose lock.
  internal::ScopedGuard scoped_locker(
      ThreadCacheRegistry::Instance().GetLock());
  if (g_thread_cache_key_created) {
    return;
  }

  bool ok = internal::PartitionTlsCreate(&internal::g_thread_cache_key, Delete);
  PA_CHECK(ok);
  g_thread_cache_key_created = true;
}

// static
void ThreadCache::DeleteForTesting(ThreadCache* tcache) {
  ThreadCache::Delete(tcache);
}

// static
void ThreadCache::SwapForTesting(PartitionRoot* root) {
  auto* old_tcache = ThreadCache::Get();
  g_thread_cache_root.store(nullptr, std::memory_order_relaxed);
  if (old_tcache) {
    ThreadCache::DeleteForTesting(old_tcache);
  }
  if (root) {
    Init(root);
    Create(root);
  } else {
#if BUILDFLAG(IS_WIN)
    // OnDllProcessDetach accesses g_thread_cache_root which is nullptr now.
    internal::PartitionTlsSetOnDllProcessDetach(nullptr);
#endif
  }
}

// static
void ThreadCache::RemoveTombstoneForTesting() {
  PA_CHECK(IsTombstone(Get()));
  internal::PartitionTlsSet(internal::g_thread_cache_key, nullptr);
}

// static
void ThreadCache::Init(PartitionRoot* root) {
#if BUILDFLAG(IS_NACL)
  static_assert(false, "PartitionAlloc isn't supported for NaCl");
#endif
  PA_CHECK(root->buckets[kBucketCount - 1].slot_size ==
           ThreadCache::kLargeSizeThreshold);
  PA_CHECK(root->buckets[largest_active_bucket_index_].slot_size ==
           ThreadCache::kDefaultSizeThreshold);

  EnsureThreadSpecificDataInitialized();

  // Make sure that only one PartitionRoot wants a thread cache.
  PartitionRoot* expected = nullptr;
  if (!g_thread_cache_root.compare_exchange_strong(expected, root,
                                                   std::memory_order_seq_cst,
                                                   std::memory_order_seq_cst)) {
    PA_CHECK(false)
        << "Only one PartitionRoot is allowed to have a thread cache";
  }

#if BUILDFLAG(IS_WIN)
  internal::PartitionTlsSetOnDllProcessDetach(OnDllProcessDetach);
#endif

  SetGlobalLimits(root, kDefaultMultiplier);
}
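
// The compare_exchange above is what enforces the rule that only one
// PartitionRoot per process may use the shared |g_thread_cache_key|: a
// second root calling Init() crashes instead of silently sharing the key.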

// static
void ThreadCache::SetGlobalLimits(PartitionRoot* root, float multiplier) {
  size_t initial_value =
      static_cast<size_t>(kSmallBucketBaseCount) * multiplier;

  for (int index = 0; index < kBucketCount; index++) {
    const auto& root_bucket = root->buckets[index];
    // Invalid bucket.
    if (!root_bucket.active_slot_spans_head) {
      global_limits_[index] = 0;
      continue;
    }

    // Smaller allocations are more frequent, and more performance-sensitive.
    // Cache more small objects, and fewer larger ones, to save memory.
    size_t slot_size = root_bucket.slot_size;
    size_t value;
    if (slot_size <= 128) {
      value = initial_value;
    } else if (slot_size <= 256) {
      value = initial_value / 2;
    } else if (slot_size <= 512) {
      value = initial_value / 4;
    } else {
      value = initial_value / 8;
    }

    // Bare minimum so that malloc() / free() in a loop will not hit the central
    // allocator each time.
    constexpr size_t kMinLimit = 1;
    // |PutInBucket()| is called on a full bucket, which should not overflow.
    constexpr size_t kMaxLimit = std::numeric_limits<uint8_t>::max() - 1;
    global_limits_[index] =
        static_cast<uint8_t>(std::clamp(value, kMinLimit, kMaxLimit));
    PA_DCHECK(global_limits_[index] >= kMinLimit);
    PA_DCHECK(global_limits_[index] <= kMaxLimit);
  }
}
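
// Worked example for the limits above (illustrative only; the real
// kSmallBucketBaseCount and kDefaultMultiplier live in thread_cache.h): with
// a base count of, say, 64 and a multiplier of 1.0, a 64-byte bucket gets a
// limit of 64 entries, a 256-byte bucket 32, a 512-byte bucket 16 and larger
// buckets 8, all clamped to [1, 254] so that |PutInBucket()| on a full bucket
// cannot overflow the uint8_t count.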

// static
void ThreadCache::SetLargestCachedSize(size_t size) {
  if (size > ThreadCache::kLargeSizeThreshold) {
    size = ThreadCache::kLargeSizeThreshold;
  }
  largest_active_bucket_index_ = PartitionRoot::SizeToBucketIndex(
      size, PartitionRoot::BucketDistribution::kNeutral);
  PA_CHECK(largest_active_bucket_index_ < kBucketCount);
  ThreadCacheRegistry::Instance().SetLargestActiveBucketIndex(
      largest_active_bucket_index_);
}
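
// For instance, SetLargestCachedSize(ThreadCache::kLargeSizeThreshold) enables
// caching for every bucket the thread cache covers (larger sizes are clamped
// to that threshold anyway), while ThreadCache::kDefaultSizeThreshold
// effectively restores the default coverage used at startup.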

// static
ThreadCache* ThreadCache::Create(PartitionRoot* root) {
  PA_CHECK(root);
  // See comment in thread_cache.h, this is used to make sure
  // kThreadCacheNeedleArray is kept in the final binary.
  PA_CHECK(tools::kThreadCacheNeedleArray[0] == tools::kNeedle1);

  // Operator new is overloaded to route to internal partition.
  // The internal partition does not use `ThreadCache`, so safe to depend on.
  ThreadCache* tcache = new ThreadCache(root);

  // This may allocate.
  internal::PartitionTlsSet(internal::g_thread_cache_key, tcache);
#if PA_CONFIG(THREAD_CACHE_FAST_TLS)
  // |thread_local| variables with destructors cause issues on some platforms.
  // Since we need a destructor (to empty the thread cache), we cannot use it
  // directly. However, TLS accesses with |thread_local| are typically faster,
  // as it can turn into a fixed offset load from a register (GS/FS on Linux
  // x86, for instance). On Windows, saving/restoring the last error increases
  // cost as well.
  //
  // To still get good performance, use |thread_local| to store a raw pointer,
  // and rely on the platform TLS to call the destructor.
  internal::g_thread_cache = tcache;
#endif  // PA_CONFIG(THREAD_CACHE_FAST_TLS)

  return tcache;
}

ThreadCache::ThreadCache(PartitionRoot* root)
    : should_purge_(false),
      root_(root),
      thread_id_(internal::base::PlatformThread::CurrentId()),
      next_(nullptr),
      prev_(nullptr) {
  ThreadCacheRegistry::Instance().RegisterThreadCache(this);

  memset(&stats_, 0, sizeof(stats_));

  for (int index = 0; index < kBucketCount; index++) {
    const auto& root_bucket = root->buckets[index];
    Bucket* tcache_bucket = &buckets_[index];
    tcache_bucket->freelist_head = nullptr;
    tcache_bucket->count = 0;
    tcache_bucket->limit.store(global_limits_[index],
                               std::memory_order_relaxed);

    tcache_bucket->slot_size = root_bucket.slot_size;
    // Invalid bucket.
    if (!root_bucket.is_valid()) {
      // Explicitly set this, as size computations iterate over all buckets.
      tcache_bucket->limit.store(0, std::memory_order_relaxed);
    }
  }

  // When enabled, initialize scheduler loop quarantine branch.
  // This branch is only used within this thread, so not `lock_required`.
  if (root_->settings.scheduler_loop_quarantine) {
    scheduler_loop_quarantine_branch_.emplace(
        root_->CreateSchedulerLoopQuarantineBranch(false));
  }
}

ThreadCache::~ThreadCache() {
  ThreadCacheRegistry::Instance().UnregisterThreadCache(this);
  Purge();
}

// static
void ThreadCache::Delete(void* tcache_ptr) {
  auto* tcache = static_cast<ThreadCache*>(tcache_ptr);

  if (!IsValid(tcache)) {
    return;
  }

#if PA_CONFIG(THREAD_CACHE_FAST_TLS)
  internal::g_thread_cache = nullptr;
#else
  internal::PartitionTlsSet(internal::g_thread_cache_key, nullptr);
#endif

  // Operator new is overloaded to route to internal partition.
  delete tcache;

#if BUILDFLAG(IS_WIN)
  // On Windows, allocations do occur during thread/process teardown; make sure
  // they don't resurrect the thread cache.
  //
  // Don't MTE-tag, as it'd mess with the sentinel value.
  //
  // TODO(lizeb): Investigate whether this is needed on POSIX as well.
  internal::PartitionTlsSet(internal::g_thread_cache_key,
                            reinterpret_cast<void*>(kTombstone));
#if PA_CONFIG(THREAD_CACHE_FAST_TLS)
  internal::g_thread_cache = reinterpret_cast<ThreadCache*>(kTombstone);
#endif

#endif  // BUILDFLAG(IS_WIN)
}
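
// The tombstone is recognized by IsValid() and IsTombstone() (declared in
// thread_cache.h), so a late allocation on this thread sees a non-valid
// cache and falls back to the root instead of re-creating a cache during
// teardown.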

// static
void* ThreadCache::operator new(size_t count) {
  return internal::InternalAllocatorRoot().Alloc<AllocFlags::kNoHooks>(count);
}
// static
void ThreadCache::operator delete(void* ptr) {
  internal::InternalAllocatorRoot().Free<FreeFlags::kNoHooks>(ptr);
}

ThreadCache::Bucket::Bucket() {
  limit.store(0, std::memory_order_relaxed);
}

void ThreadCache::FillBucket(size_t bucket_index) {
  // Filling multiple elements from the central allocator at a time has several
  // advantages:
  // - Amortize lock acquisition
  // - Increase hit rate
  // - Can improve locality, as consecutive allocations from the central
  //   allocator will likely return close addresses, especially early on.
  //
  // However, do not take too many items, to prevent memory bloat.
  //
  // Cache filling / purging policy:
  // We aim at keeping the buckets neither empty nor full, while minimizing
  // requests to the central allocator.
  //
  // For each bucket, there is a |limit| of how many cached objects there are in
  // the bucket, so |count| < |limit| at all times.
  // - Clearing: limit -> limit / 2
  // - Filling: 0 -> limit / kBatchFillRatio
  //
  // These thresholds are somewhat arbitrary, with these considerations:
  // (1) Batched filling should not completely fill the bucket
  // (2) Batched clearing should not completely clear the bucket
  // (3) Batched filling should not be too eager
  //
  // If (1) and (2) do not hold, we risk oscillations of bucket filling /
  // clearing which would greatly increase calls to the central allocator. (3)
  // tries to keep memory usage low. So clearing half of the bucket, and filling
  // a quarter of it are sensible defaults.
  PA_INCREMENT_COUNTER(stats_.batch_fill_count);

  Bucket& bucket = buckets_[bucket_index];
  // Some buckets may have a limit lower than |kBatchFillRatio|, but we still
  // want to allocate at least a single slot; otherwise we wrongly return
  // nullptr, which ends up deactivating the bucket.
  //
  // In these cases, we do not really batch bucket filling, but this is expected
  // to be used for the largest buckets, where over-allocating is not advised.
  int count = std::max(
      1, bucket.limit.load(std::memory_order_relaxed) / kBatchFillRatio);

  size_t usable_size;
  bool is_already_zeroed;

  PA_DCHECK(!root_->buckets[bucket_index].CanStoreRawSize());
  PA_DCHECK(!root_->buckets[bucket_index].is_direct_mapped());

  size_t allocated_slots = 0;
  // Same as calling RawAlloc() |count| times, but acquires the lock only once.
  internal::ScopedGuard guard(internal::PartitionRootLock(root_));
  for (int i = 0; i < count; i++) {
    // Thread cache fill should not trigger expensive operations, to not grab
    // the lock for a long time needlessly, but also to not inflate memory
    // usage. Indeed, without AllocFlags::kFastPathOrReturnNull, cache
    // fill may activate a new PartitionPage, or even a new SuperPage, which is
    // clearly not desirable.
    //
    // |raw_size| is set to the slot size, as we don't know it. However, it is
    // only used for direct-mapped allocations and single-slot ones anyway,
    // which are not handled here.
    size_t ret_slot_size;
    uintptr_t slot_start =
        root_->AllocFromBucket<AllocFlags::kFastPathOrReturnNull |
                               AllocFlags::kReturnNull>(
            &root_->buckets[bucket_index],
            root_->buckets[bucket_index].slot_size /* raw_size */,
            internal::PartitionPageSize(), &usable_size, &ret_slot_size,
            &is_already_zeroed);
    // Either the previous allocation would require a slow path allocation, or
    // the central allocator is out of memory. If the bucket was filled with
    // some objects, then the allocation will be handled normally. Otherwise,
    // this goes to the central allocator, which will service the allocation,
    // return nullptr or crash.
    if (!slot_start) {
      break;
    }
    PA_DCHECK(ret_slot_size == root_->buckets[bucket_index].slot_size);

    allocated_slots++;
    PutInBucket(bucket, slot_start);
  }

  cached_memory_ += allocated_slots * bucket.slot_size;
}
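
// Example: with a bucket limit of 64 and kBatchFillRatio at, say, 8 (the real
// constant lives in thread_cache.h), a miss on an empty bucket grabs 8 slots
// under a single lock acquisition, so the next few allocations of that size
// are pure thread-cache hits.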

void ThreadCache::ClearBucket(Bucket& bucket, size_t limit) {
  ClearBucketHelper<true>(bucket, limit);
}

template <bool crash_on_corruption>
void ThreadCache::ClearBucketHelper(Bucket& bucket, size_t limit) {
  // Avoids acquiring the lock needlessly.
  if (!bucket.count || bucket.count <= limit) {
    return;
  }

  // This serves two purposes: error checking and avoiding stalls when grabbing
  // the lock:
  // 1. Error checking: this is pretty clear. Since this path is taken
  //    infrequently, and is going to walk the entire freelist anyway, its
  //    incremental cost should be very small. Indeed, we free from the tail of
  //    the list, so all calls here will end up walking the entire freelist, and
  //    incurring the same amount of cache misses.
  // 2. Avoiding stalls: If one of the freelist accesses in |FreeAfter()|
  //    triggers a major page fault, and we are running on a low-priority
  //    thread, we don't want the thread to be blocked while holding the lock,
  //    causing a priority inversion.
  const internal::PartitionFreelistDispatcher* freelist_dispatcher =
      root_->get_freelist_dispatcher();

  if constexpr (crash_on_corruption) {
    freelist_dispatcher->CheckFreeListForThreadCache(bucket.freelist_head,
                                                     bucket.slot_size);
  }
  uint8_t count_before = bucket.count;
  if (limit == 0) {
    FreeAfter<crash_on_corruption>(bucket.freelist_head, bucket.slot_size);
    bucket.freelist_head = nullptr;
  } else {
    // Free the *end* of the list, not the head, since the head contains the
    // most recently touched memory.
    auto* head = bucket.freelist_head;
    size_t items = 1;  // Cannot free the freelist head.
    while (items < limit) {
#if BUILDFLAG(USE_FREELIST_POOL_OFFSETS)
      head = freelist_dispatcher->GetNextForThreadCacheBool(
          head, crash_on_corruption, bucket.slot_size);
#else
      head = freelist_dispatcher->GetNextForThreadCache<crash_on_corruption>(
          head, bucket.slot_size);
#endif  // USE_FREELIST_POOL_OFFSETS
      items++;
    }

#if BUILDFLAG(USE_FREELIST_POOL_OFFSETS)
    FreeAfter<crash_on_corruption>(
        freelist_dispatcher->GetNextForThreadCacheBool(
            head, crash_on_corruption, bucket.slot_size),
        bucket.slot_size);
#else
    FreeAfter<crash_on_corruption>(
        freelist_dispatcher->GetNextForThreadCache<crash_on_corruption>(
            head, bucket.slot_size),
        bucket.slot_size);
#endif  // USE_FREELIST_POOL_OFFSETS
    freelist_dispatcher->SetNext(head, nullptr);
  }
  bucket.count = limit;
  uint8_t count_after = bucket.count;
  size_t freed_memory = (count_before - count_after) * bucket.slot_size;
  PA_DCHECK(cached_memory_ >= freed_memory);
  cached_memory_ -= freed_memory;

  PA_DCHECK(cached_memory_ == CachedMemory());
}
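
// Example: with count == 64 and limit == 32, the 32 entries at the head of
// the freelist (the most recently touched memory) are kept, and the tail is
// handed to FreeAfter(), which returns it to the central allocator under a
// single lock acquisition.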

template <bool crash_on_corruption>
void ThreadCache::FreeAfter(internal::PartitionFreelistEntry* head,
                            size_t slot_size) {
  // Acquire the lock once. Deallocations from the same bucket are likely to
  // hit the same cache lines in the central allocator, and lock acquisitions
  // can be expensive.
  internal::ScopedGuard guard(internal::PartitionRootLock(root_));
  while (head) {
    uintptr_t slot_start = internal::SlotStartPtr2Addr(head);
    const internal::PartitionFreelistDispatcher* freelist_dispatcher =
        root_->get_freelist_dispatcher();
#if BUILDFLAG(USE_FREELIST_POOL_OFFSETS)
    head = freelist_dispatcher->GetNextForThreadCacheBool(
        head, crash_on_corruption, slot_size);
#else
    head = freelist_dispatcher->GetNextForThreadCache<crash_on_corruption>(
        head, slot_size);
#endif  // USE_FREELIST_POOL_OFFSETS
    root_->RawFreeLocked(slot_start);
  }
}

void ThreadCache::ResetForTesting() {
  stats_.alloc_count = 0;
  stats_.alloc_hits = 0;
  stats_.alloc_misses = 0;

  stats_.alloc_miss_empty = 0;
  stats_.alloc_miss_too_large = 0;

  stats_.cache_fill_count = 0;
  stats_.cache_fill_hits = 0;
  stats_.cache_fill_misses = 0;

  stats_.batch_fill_count = 0;

  stats_.bucket_total_memory = 0;
  stats_.metadata_overhead = 0;

  Purge();
  PA_CHECK(cached_memory_ == 0u);
  should_purge_.store(false, std::memory_order_relaxed);
}

size_t ThreadCache::CachedMemory() const {
  size_t total = 0;
  for (const Bucket& bucket : buckets_) {
    total += bucket.count * static_cast<size_t>(bucket.slot_size);
  }

  return total;
}

void ThreadCache::AccumulateStats(ThreadCacheStats* stats) const {
  stats->alloc_count += stats_.alloc_count;
  stats->alloc_hits += stats_.alloc_hits;
  stats->alloc_misses += stats_.alloc_misses;

  stats->alloc_miss_empty += stats_.alloc_miss_empty;
  stats->alloc_miss_too_large += stats_.alloc_miss_too_large;

  stats->cache_fill_count += stats_.cache_fill_count;
  stats->cache_fill_hits += stats_.cache_fill_hits;
  stats->cache_fill_misses += stats_.cache_fill_misses;

  stats->batch_fill_count += stats_.batch_fill_count;

#if PA_CONFIG(THREAD_CACHE_ALLOC_STATS)
  for (size_t i = 0; i < internal::kNumBuckets + 1; i++) {
    stats->allocs_per_bucket_[i] += stats_.allocs_per_bucket_[i];
  }
#endif  // PA_CONFIG(THREAD_CACHE_ALLOC_STATS)

  // cached_memory_ is not necessarily equal to |CachedMemory()| here, since
  // this function can be called racily from another thread, to collect
  // statistics. Hence no DCHECK_EQ(CachedMemory(), cached_memory_).
  stats->bucket_total_memory += cached_memory_;

  stats->metadata_overhead += sizeof(*this);
}

void ThreadCache::SetShouldPurge() {
  should_purge_.store(true, std::memory_order_relaxed);
}
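
// The flag is honored lazily: the owning thread notices |should_purge_| on a
// later allocation (see the comments in PurgeAll() and RunPeriodicPurge()) and
// purges itself then, which is why memory held by a thread that never
// allocates again cannot be reclaimed this way.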

void ThreadCache::Purge() {
  PA_REENTRANCY_GUARD(is_in_thread_cache_);
  PurgeInternal();
}

void ThreadCache::TryPurge() {
  PA_REENTRANCY_GUARD(is_in_thread_cache_);
  PurgeInternalHelper<false>();
}

// static
void ThreadCache::PurgeCurrentThread() {
  auto* tcache = Get();
  if (IsValid(tcache)) {
    tcache->Purge();
  }
}

void ThreadCache::PurgeInternal() {
  PurgeInternalHelper<true>();
}

void ThreadCache::ResetPerThreadAllocationStatsForTesting() {
  thread_alloc_stats_ = {};
}

template <bool crash_on_corruption>
void ThreadCache::PurgeInternalHelper() {
  should_purge_.store(false, std::memory_order_relaxed);
  // TODO(lizeb): Investigate whether lock acquisition should be less
  // frequent.
  //
  // Note: iterate over all buckets, even the inactive ones. Since
  // |largest_active_bucket_index_| can be lowered at runtime, there may be
  // memory already cached in the inactive buckets. They should still be
  // purged.
  for (auto& bucket : buckets_) {
    ClearBucketHelper<crash_on_corruption>(bucket, 0);
  }
}

}  // namespace partition_alloc