xref: /aosp_15_r20/external/cronet/base/allocator/partition_allocator/src/partition_alloc/thread_cache.h (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright 2020 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef PARTITION_ALLOC_THREAD_CACHE_H_
6 #define PARTITION_ALLOC_THREAD_CACHE_H_
7 
8 #include <atomic>
9 #include <cstdint>
10 #include <limits>
11 #include <memory>
12 #include <optional>
13 
14 #include "build/build_config.h"
15 #include "partition_alloc/lightweight_quarantine.h"
16 #include "partition_alloc/partition_alloc-inl.h"
17 #include "partition_alloc/partition_alloc_base/compiler_specific.h"
18 #include "partition_alloc/partition_alloc_base/component_export.h"
19 #include "partition_alloc/partition_alloc_base/debug/debugging_buildflags.h"
20 #include "partition_alloc/partition_alloc_base/thread_annotations.h"
21 #include "partition_alloc/partition_alloc_base/time/time.h"
22 #include "partition_alloc/partition_alloc_buildflags.h"
23 #include "partition_alloc/partition_alloc_config.h"
24 #include "partition_alloc/partition_alloc_forward.h"
25 #include "partition_alloc/partition_bucket_lookup.h"
26 #include "partition_alloc/partition_freelist_entry.h"
27 #include "partition_alloc/partition_lock.h"
28 #include "partition_alloc/partition_stats.h"
29 #include "partition_alloc/partition_tls.h"
30 
31 #if defined(ARCH_CPU_X86_64) && BUILDFLAG(HAS_64_BIT_POINTERS)
32 #include <algorithm>
33 #endif
34 
35 namespace partition_alloc {
36 
37 class ThreadCache;
38 
39 namespace tools {
40 
41 // This is used from ThreadCacheInspector, which runs in a different process. It
42 // scans the process memory looking for the two needles, to locate the thread
43 // cache registry instance.
44 //
45 // These two values were chosen randomly, and in particular neither is a valid
46 // pointer on most 64-bit architectures.
47 #if BUILDFLAG(HAS_64_BIT_POINTERS)
48 constexpr uintptr_t kNeedle1 = 0xe69e32f3ad9ea63;
49 constexpr uintptr_t kNeedle2 = 0x9615ee1c5eb14caf;
50 #else
51 constexpr uintptr_t kNeedle1 = 0xe69e32f3;
52 constexpr uintptr_t kNeedle2 = 0x9615ee1c;
53 #endif  // BUILDFLAG(HAS_64_BIT_POINTERS)
54 
55 // This array contains, in order:
56 // - kNeedle1
57 // - &ThreadCacheRegistry::Instance()
58 // - kNeedle2
59 //
60 // It is referenced in the thread cache constructor to make sure it is not
61 // removed by the compiler. It is also not const to make sure it ends up in
62 // .data.
63 constexpr size_t kThreadCacheNeedleArraySize = 4;
64 extern uintptr_t kThreadCacheNeedleArray[kThreadCacheNeedleArraySize];
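//
// Illustrative sketch (not part of PartitionAlloc): an out-of-process
// inspector could locate the registry roughly as follows. |words| stands for a
// word-aligned snapshot of the scanned region, read with a platform-specific
// API; |FindRegistryAddress| is hypothetical.
//
//   uintptr_t FindRegistryAddress(const uintptr_t* words, size_t num_words) {
//     for (size_t i = 0; i + kThreadCacheNeedleArraySize <= num_words; i++) {
//       // Per the layout described above, kNeedle1 starts the array,
//       // kNeedle2 ends it, and the registry address follows kNeedle1.
//       if (words[i] == kNeedle1 &&
//           words[i + kThreadCacheNeedleArraySize - 1] == kNeedle2) {
//         return words[i + 1];
//       }
//     }
//     return 0;
//   }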
65 
66 class HeapDumper;
67 class ThreadCacheInspector;
68 
69 }  // namespace tools
70 
71 namespace internal {
72 
73 extern PA_COMPONENT_EXPORT(PARTITION_ALLOC) PartitionTlsKey g_thread_cache_key;
74 
75 #if PA_CONFIG(THREAD_CACHE_FAST_TLS)
76 extern PA_COMPONENT_EXPORT(
77     PARTITION_ALLOC) thread_local ThreadCache* g_thread_cache;
78 #endif
79 
80 }  // namespace internal
81 
82 struct ThreadCacheLimits {
83   // When trying to conserve memory, set the thread cache limit to this.
84   static constexpr size_t kDefaultSizeThreshold = 512;
85   // 32kiB is chosen here because, in local experiments, "zone" allocation in
86   // V8 is performance-sensitive, and zones can (and do) grow up to 32kiB for
87   // each individual allocation.
88   static constexpr size_t kLargeSizeThreshold = 1 << 15;
89   static_assert(kLargeSizeThreshold <= std::numeric_limits<uint16_t>::max(),
90                 "");
91 };
92 
93 constexpr internal::base::TimeDelta kMinPurgeInterval =
94     internal::base::Seconds(1);
95 constexpr internal::base::TimeDelta kMaxPurgeInterval =
96     internal::base::Minutes(1);
97 constexpr internal::base::TimeDelta kDefaultPurgeInterval =
98     2 * kMinPurgeInterval;
99 constexpr size_t kMinCachedMemoryForPurgingBytes = 500 * 1024;
100 
101 // Global registry of all ThreadCache instances.
102 //
103 // This class cannot allocate in the (Un)registerThreadCache() functions, as
104 // they are called from the ThreadCache constructor, which runs from within
105 // the allocator. However, the other members can allocate.
106 class PA_COMPONENT_EXPORT(PARTITION_ALLOC) ThreadCacheRegistry {
107  public:
108   static ThreadCacheRegistry& Instance();
109   // Do not instantiate.
110   //
111   // Several things are surprising here:
112   // - The constructor is public even though this is intended to be a singleton:
113   //   we cannot use a "static local" variable in |Instance()| as this is
114   //   reached too early during CRT initialization on Windows, meaning that
115   //   static local variables don't work (as they call into the uninitialized
116   //   runtime). To sidestep that, we use a regular global variable in the .cc,
117   //   which is fine as this object's constructor is constexpr.
118   // - Marked inline so that the chromium style plugin doesn't complain that a
119   //   "complex constructor" has an inline body. This warning is disabled when
120   //   the constructor is explicitly marked "inline". Note that this is a false
121   //   positive of the plugin, since constexpr implies inline.
122   inline constexpr ThreadCacheRegistry();
123 
124   void RegisterThreadCache(ThreadCache* cache);
125   void UnregisterThreadCache(ThreadCache* cache);
126   // Prints statistics for all thread caches, or only this thread's.
127   void DumpStats(bool my_thread_only, ThreadCacheStats* stats);
128   // Purges this thread's cache, and asks the other threads' caches to trigger
129   // Purge() at a later point (during a deallocation).
130   void PurgeAll();
131 
132   // Runs `PurgeAll` and updates the next interval which
133   // `GetPeriodicPurgeNextIntervalInMicroseconds` returns.
134   //
135   // Note that it is the caller's responsibility to invoke this member function
136   // periodically with an appropriate interval. This function does not schedule
137   // any task or timer.
138   void RunPeriodicPurge();
139   // Returns the interval at which `RunPeriodicPurge` should next be invoked.
140   int64_t GetPeriodicPurgeNextIntervalInMicroseconds() const;
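  //
  // Illustrative sketch only (|PeriodicPurgeTask| and |PostDelayedTaskMicros|
  // are hypothetical; PartitionAlloc provides no scheduling): an embedder
  // would typically re-post a delayed task along these lines.
  //
  //   void PeriodicPurgeTask() {
  //     auto& registry = ::partition_alloc::ThreadCacheRegistry::Instance();
  //     registry.RunPeriodicPurge();
  //     PostDelayedTaskMicros(
  //         &PeriodicPurgeTask,
  //         registry.GetPeriodicPurgeNextIntervalInMicroseconds());
  //   }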
141 
142   // Controls the thread cache size, by setting the multiplier to a value above
143   // or below |ThreadCache::kDefaultMultiplier|.
144   void SetThreadCacheMultiplier(float multiplier);
145   void SetLargestActiveBucketIndex(uint16_t largest_active_bucket_index);
146 
147   // Controls the thread cache purging configuration.
148   void SetPurgingConfiguration(
149       const internal::base::TimeDelta min_purge_interval,
150       const internal::base::TimeDelta max_purge_interval,
151       const internal::base::TimeDelta default_purge_interval,
152       size_t min_cached_memory_for_purging_bytes);
153   internal::base::TimeDelta min_purge_interval() const {
154     return min_purge_interval_;
155   }
156   internal::base::TimeDelta max_purge_interval() const {
157     return max_purge_interval_;
158   }
159   internal::base::TimeDelta default_purge_interval() const {
160     return default_purge_interval_;
161   }
162   size_t min_cached_memory_for_purging_bytes() const {
163     return min_cached_memory_for_purging_bytes_;
164   }
165   bool is_purging_configured() const { return is_purging_configured_; }
166 
167   static internal::Lock& GetLock() { return Instance().lock_; }
168   // Purges all thread caches *now*. This is completely thread-unsafe, and
169   // should only be called in a post-fork() handler.
170   void ForcePurgeAllThreadAfterForkUnsafe();
171 
172   void ResetForTesting();
173 
174  private:
175   friend class tools::ThreadCacheInspector;
176   friend class tools::HeapDumper;
177 
178   // Not using base::Lock as the object's constructor must be constexpr.
179   internal::Lock lock_;
180   ThreadCache* list_head_ PA_GUARDED_BY(GetLock()) = nullptr;
181   bool periodic_purge_is_initialized_ = false;
182   internal::base::TimeDelta min_purge_interval_;
183   internal::base::TimeDelta max_purge_interval_;
184   internal::base::TimeDelta default_purge_interval_;
185   size_t min_cached_memory_for_purging_bytes_ = 0u;
186   internal::base::TimeDelta periodic_purge_next_interval_;
187   bool is_purging_configured_ = false;
188 
189   uint16_t largest_active_bucket_index_ = internal::BucketIndexLookup::GetIndex(
190       ThreadCacheLimits::kDefaultSizeThreshold);
191 };
192 
193 constexpr ThreadCacheRegistry::ThreadCacheRegistry() = default;
194 
195 #if PA_CONFIG(THREAD_CACHE_ENABLE_STATISTICS)
196 #define PA_INCREMENT_COUNTER(counter) ++counter
197 #else
198 #define PA_INCREMENT_COUNTER(counter) \
199   do {                                \
200   } while (0)
201 #endif  // PA_CONFIG(THREAD_CACHE_ENABLE_STATISTICS)
202 
203 #if BUILDFLAG(PA_DCHECK_IS_ON)
204 
205 namespace internal {
206 
207 class ReentrancyGuard {
208  public:
209   explicit ReentrancyGuard(bool& flag) : flag_(flag) {
210     PA_CHECK(!flag_);
211     flag_ = true;
212   }
213 
214   ~ReentrancyGuard() { flag_ = false; }
215 
216  private:
217   bool& flag_;
218 };
219 
220 }  // namespace internal
221 
222 #define PA_REENTRANCY_GUARD(x)      \
223   internal::ReentrancyGuard guard { \
224     x                               \
225   }
226 
227 #else  // BUILDFLAG(PA_DCHECK_IS_ON)
228 
229 #define PA_REENTRANCY_GUARD(x) \
230   do {                         \
231   } while (0)
232 
233 #endif  // BUILDFLAG(PA_DCHECK_IS_ON)
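// Usage sketch (illustrative only; |DoSomethingWithCache| is a hypothetical
// member function): placing the guard at the top of a ThreadCache method makes
// DCHECK builds crash on unexpected re-entrancy, and compiles to nothing
// otherwise.
//
//   void ThreadCache::DoSomethingWithCache() {
//     PA_REENTRANCY_GUARD(is_in_thread_cache_);
//     // A nested call that also takes the guard would fail the PA_CHECK()
//     // in ReentrancyGuard's constructor.
//   }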
234 
235 // Per-thread cache. *Not* threadsafe, must only be accessed from a single
236 // thread.
237 //
238 // In practice, this is easily enforced as long as only |instance| is
239 // manipulated, as it is a thread_local member. As such, any
240 // |ThreadCache::instance->*()| call will necessarily be done from a single
241 // thread.
242 class PA_COMPONENT_EXPORT(PARTITION_ALLOC) ThreadCache {
243  public:
244   struct Bucket {
245     internal::PartitionFreelistEntry* freelist_head = nullptr;
246     // Want to keep sizeof(Bucket) small, using small types.
247     uint8_t count = 0;
248     std::atomic<uint8_t> limit{};  // Can be changed from another thread.
249     uint16_t slot_size = 0;
250 
251     Bucket();
252   };
253 
254   // Initializes the thread cache for |root|. May allocate, so should be called
255   // with the thread cache disabled on the partition side, and without the
256   // partition lock held.
257   //
258   // May only be called by a single PartitionRoot.
259   static void Init(PartitionRoot* root);
260 
261   static void DeleteForTesting(ThreadCache* tcache);
262 
263   // Deletes existing thread cache and creates a new one for |root|.
264   static void SwapForTesting(PartitionRoot* root);
265 
266   // Removes the tombstone marker that would be returned by Get() otherwise.
267   static void RemoveTombstoneForTesting();
268 
269   // Can be called several times; must be called before any ThreadCache
270   // interactions.
271   static void EnsureThreadSpecificDataInitialized();
272 
273   static ThreadCache* Get() {
274 #if PA_CONFIG(THREAD_CACHE_FAST_TLS)
275     return internal::g_thread_cache;
276 #else
277     // This region isn't MTE-tagged.
278     return reinterpret_cast<ThreadCache*>(
279         internal::PartitionTlsGet(internal::g_thread_cache_key));
280 #endif
281   }
282 
283   static bool IsValid(ThreadCache* tcache) {
284     // Do not MTE-untag, as it'd mess up the sentinel value.
285     return reinterpret_cast<uintptr_t>(tcache) & kTombstoneMask;
286   }
287 
288   static bool IsTombstone(ThreadCache* tcache) {
289     // Do not MTE-untag, as it'd mess up the sentinel value.
290     return reinterpret_cast<uintptr_t>(tcache) == kTombstone;
291   }
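  //
  // Typical call pattern for the accessors above (illustrative sketch;
  // |TakeSlowPath| is hypothetical):
  //
  //   ThreadCache* tcache = ThreadCache::Get();
  //   if (PA_LIKELY(ThreadCache::IsValid(tcache))) {
  //     // |tcache| is neither nullptr nor the tombstone, safe to use.
  //   } else {
  //     TakeSlowPath();  // Not created yet, or thread is being terminated.
  //   }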
292 
293   // Creates a new ThreadCache associated with |root|.
294   // Must be called without the partition lock held, as this may allocate.
295   static ThreadCache* Create(PartitionRoot* root);
296 
297   const internal::PartitionFreelistDispatcher*
298   get_freelist_dispatcher_from_root();
299 
300   ~ThreadCache();
301 
302   // Disallow copy and move.
303   ThreadCache(const ThreadCache&) = delete;
304   ThreadCache(const ThreadCache&&) = delete;
305   ThreadCache& operator=(const ThreadCache&) = delete;
306 
307   // Tries to put a slot at |slot_start| into the cache.
308   // The slot comes from the bucket at index |bucket_index| from the partition
309   // this cache is for.
310   //
311   // Returns true if the slot was put in the cache, and false otherwise. This
312   // can happen either because the cache is full or the allocation was too
313   // large.
314   PA_ALWAYS_INLINE bool MaybePutInCache(uintptr_t slot_start,
315                                         size_t bucket_index,
316                                         size_t* slot_size);
317 
318   // Tries to allocate a memory slot from the cache.
319   // Returns 0 on failure.
320   //
321   // Has the same behavior as RawAlloc(), that is: no cookie or ref-count
322   // handling. Sets |slot_size| to the allocated size upon success.
323   PA_ALWAYS_INLINE uintptr_t GetFromCache(size_t bucket_index,
324                                           size_t* slot_size);
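  //
  // Illustrative sketch of how the two entry points above combine on the hot
  // paths (everything except the two methods is hypothetical):
  //
  //   // Allocation path:
  //   size_t slot_size;
  //   uintptr_t slot_start = tcache->GetFromCache(bucket_index, &slot_size);
  //   if (!slot_start)
  //     slot_start = CentralAllocatorSlowPath(bucket_index, &slot_size);
  //
  //   // Deallocation path:
  //   if (!tcache->MaybePutInCache(slot_start, bucket_index, &slot_size))
  //     FreeToCentralAllocator(slot_start, bucket_index);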
325 
326   // Asks this cache to trigger |Purge()| at a later point. Can be called from
327   // any thread.
328   void SetShouldPurge();
329   // Empties the cache.
330   // The Partition lock must *not* be held when calling this.
331   // Must be called from the thread this cache is for.
332   void Purge();
333   // |TryPurge| is the same as |Purge|, except that |TryPurge| will
334   // not crash if the thread cache is inconsistent. Normally inconsistency
335   // is a sign of a bug somewhere, so |Purge| should be preferred in most cases.
336   void TryPurge();
337   // Amount of cached memory for this thread's cache, in bytes.
338   size_t CachedMemory() const;
339   void AccumulateStats(ThreadCacheStats* stats) const;
340 
341   // Purge the thread cache of the current thread, if one exists.
342   static void PurgeCurrentThread();
343 
344   const ThreadAllocStats& thread_alloc_stats() const {
345     return thread_alloc_stats_;
346   }
347   size_t bucket_count_for_testing(size_t index) const {
348     return buckets_[index].count;
349   }
350 
351   internal::base::PlatformThreadId thread_id() const { return thread_id_; }
352 
353   // Sets the maximum size of allocations that may be cached by the thread
354   // cache. This applies to all threads. However, the maximum size is bounded by
355   // |kLargeSizeThreshold|.
356   static void SetLargestCachedSize(size_t size);
357 
358   // Cumulative stats about *all* allocations made on the `root_` partition on
359   // this thread, that is not only the allocations serviced by the thread cache,
360   // but all allocations, including large and direct-mapped ones. This should in
361   // theory be split into a separate PerThread data structure, but the thread
362   // cache is the only per-thread data we have as of now.
363   //
364   // TODO(lizeb): Investigate adding a proper per-thread data structure.
365   PA_ALWAYS_INLINE void RecordAllocation(size_t size);
366   PA_ALWAYS_INLINE void RecordDeallocation(size_t size);
367   void ResetPerThreadAllocationStatsForTesting();
368 
369   // Fill 1 / kBatchFillRatio * bucket.limit slots at a time.
370   static constexpr uint16_t kBatchFillRatio = 8;
371 
372   // Limit for the smallest bucket will be kDefaultMultiplier *
373   // kSmallBucketBaseCount by default.
374   static constexpr float kDefaultMultiplier = 2.;
375   static constexpr uint8_t kSmallBucketBaseCount = 64;
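  // For example, with the defaults above the smallest bucket's limit is
  // kDefaultMultiplier * kSmallBucketBaseCount = 2 * 64 = 128 cached slots.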
376 
377   static constexpr size_t kDefaultSizeThreshold =
378       ThreadCacheLimits::kDefaultSizeThreshold;
379   static constexpr size_t kLargeSizeThreshold =
380       ThreadCacheLimits::kLargeSizeThreshold;
381   static constexpr uint16_t kBucketCount =
382       internal::BucketIndexLookup::GetIndex(ThreadCache::kLargeSizeThreshold) +
383       1;
384   static_assert(
385       kBucketCount < internal::kNumBuckets,
386       "Cannot have more cached buckets than what the allocator supports");
387 
388   const ThreadCache* prev_for_testing() const
389       PA_EXCLUSIVE_LOCKS_REQUIRED(ThreadCacheRegistry::GetLock()) {
390     return prev_;
391   }
392   const ThreadCache* next_for_testing() const
393       PA_EXCLUSIVE_LOCKS_REQUIRED(ThreadCacheRegistry::GetLock()) {
394     return next_;
395   }
396 
397   ThreadCacheStats& stats_for_testing() { return stats_; }
398 
399   Bucket& bucket_for_testing(size_t index) { return buckets_[index]; }
400   void ClearBucketForTesting(Bucket& bucket, size_t limit) {
401     ClearBucket(bucket, limit);
402   }
403 
404   internal::LightweightQuarantineBranch& GetSchedulerLoopQuarantineBranch() {
405     PA_DCHECK(scheduler_loop_quarantine_branch_.has_value());
406     return *scheduler_loop_quarantine_branch_;
407   }
408 
409  private:
410   friend class tools::HeapDumper;
411   friend class tools::ThreadCacheInspector;
412 
413   static_assert(sizeof(Bucket) <= 2 * sizeof(void*), "Keep Bucket small.");
414 
415   explicit ThreadCache(PartitionRoot* root);
416   static void Delete(void* thread_cache_ptr);
417 
418   static void* operator new(size_t count);
419   static void operator delete(void* ptr);
420 
421   void PurgeInternal();
422   template <bool crash_on_corruption>
423   void PurgeInternalHelper();
424 
425   // Fills a bucket from the central allocator.
426   void FillBucket(size_t bucket_index);
427   // Empties the |bucket| until there are at most |limit| objects in it.
428   template <bool crash_on_corruption>
429   void ClearBucketHelper(Bucket& bucket, size_t limit);
430   void ClearBucket(Bucket& bucket, size_t limit);
431   PA_ALWAYS_INLINE void PutInBucket(Bucket& bucket, uintptr_t slot_start);
432   void ResetForTesting();
433   // Releases the entire freelist starting at |head| to the root.
434   template <bool crash_on_corruption>
435   void FreeAfter(internal::PartitionFreelistEntry* head, size_t slot_size);
436   static void SetGlobalLimits(PartitionRoot* root, float multiplier);
437 
438   // On some architectures, ThreadCache::Get() can be called and return
439   // something after the thread cache has been destroyed. In this case, we set
440   // it to this value, to signal that the thread is being terminated, and the
441   // thread cache should not be used.
442   //
443   // This happens in particular on Windows, during program termination.
444   //
445   // We choose 0x1 as the value because it is an invalid pointer: it is not
446   // aligned, and it is too low to be a valid address. Also, checking
447   // !(ptr & kTombstoneMask) checks for nullptr and kTombstone at the same time.
448   static constexpr uintptr_t kTombstone = 0x1;
449   static constexpr uintptr_t kTombstoneMask = ~kTombstone;
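  // For instance, with 64-bit pointers kTombstoneMask is 0xfffffffffffffffe,
  // so |ptr & kTombstoneMask| is zero exactly for nullptr and kTombstone,
  // which is what IsValid() relies on.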
450 
451   static uint8_t global_limits_[kBucketCount];
452   // Index of the largest active bucket. Not all processes/platforms will use
453   // all buckets, as using larger buckets increases the memory footprint.
454   //
455   // TODO(lizeb): Investigate making this per-thread rather than static, to
456   // improve locality, and open the door to per-thread settings.
457   static uint16_t largest_active_bucket_index_;
458 
459   // These are at the beginning as they're accessed for each allocation.
460   uint32_t cached_memory_ = 0;
461   std::atomic<bool> should_purge_;
462   ThreadCacheStats stats_;
463   ThreadAllocStats thread_alloc_stats_;
464 
465   // The buckets_ array is large, even though each Bucket is only 2 pointers.
466   Bucket buckets_[kBucketCount];
467 
468   // Cold data below.
469   PartitionRoot* const root_;
470 
471   const internal::base::PlatformThreadId thread_id_;
472 #if BUILDFLAG(PA_DCHECK_IS_ON)
473   bool is_in_thread_cache_ = false;
474 #endif
475 
476   // Intrusive list since ThreadCacheRegistry::RegisterThreadCache() cannot
477   // allocate.
478   ThreadCache* next_ PA_GUARDED_BY(ThreadCacheRegistry::GetLock());
479   ThreadCache* prev_ PA_GUARDED_BY(ThreadCacheRegistry::GetLock());
480 
481   std::optional<internal::LightweightQuarantineBranch>
482       scheduler_loop_quarantine_branch_;
483 
484   friend class ThreadCacheRegistry;
485   friend class PartitionAllocThreadCacheTest;
486   friend class tools::ThreadCacheInspector;
487 };
488 
489 PA_ALWAYS_INLINE bool ThreadCache::MaybePutInCache(uintptr_t slot_start,
490                                                    size_t bucket_index,
491                                                    size_t* slot_size) {
492   PA_REENTRANCY_GUARD(is_in_thread_cache_);
493   PA_INCREMENT_COUNTER(stats_.cache_fill_count);
494 
495   if (PA_UNLIKELY(bucket_index > largest_active_bucket_index_)) {
496     PA_INCREMENT_COUNTER(stats_.cache_fill_misses);
497     return false;
498   }
499 
500   auto& bucket = buckets_[bucket_index];
501 
502   PA_DCHECK(bucket.count != 0 || bucket.freelist_head == nullptr);
503 
504   PutInBucket(bucket, slot_start);
505   cached_memory_ += bucket.slot_size;
506   PA_INCREMENT_COUNTER(stats_.cache_fill_hits);
507 
508   // Relaxed ordering: we don't need an up-to-date or consistent value, we just
509   // want it to not change while we are using it, hence the relaxed load into a
510   // local variable. Without the local copy, we would be gambling that the
511   // compiler does not issue multiple loads.
512   uint8_t limit = bucket.limit.load(std::memory_order_relaxed);
513   // Batched deallocation, amortizing lock acquisitions.
514   if (PA_UNLIKELY(bucket.count > limit)) {
515     ClearBucket(bucket, limit / 2);
516   }
517 
518   if (PA_UNLIKELY(should_purge_.load(std::memory_order_relaxed))) {
519     PurgeInternal();
520   }
521 
522   *slot_size = bucket.slot_size;
523   return true;
524 }
525 
526 PA_ALWAYS_INLINE uintptr_t ThreadCache::GetFromCache(size_t bucket_index,
527                                                      size_t* slot_size) {
528 #if PA_CONFIG(THREAD_CACHE_ALLOC_STATS)
529   stats_.allocs_per_bucket_[bucket_index]++;
530 #endif
531 
532   PA_REENTRANCY_GUARD(is_in_thread_cache_);
533   PA_INCREMENT_COUNTER(stats_.alloc_count);
534   // Only handle "small" allocations.
535   if (PA_UNLIKELY(bucket_index > largest_active_bucket_index_)) {
536     PA_INCREMENT_COUNTER(stats_.alloc_miss_too_large);
537     PA_INCREMENT_COUNTER(stats_.alloc_misses);
538     return 0;
539   }
540 
541   auto& bucket = buckets_[bucket_index];
542   if (PA_LIKELY(bucket.freelist_head)) {
543     PA_INCREMENT_COUNTER(stats_.alloc_hits);
544   } else {
545     PA_DCHECK(bucket.count == 0);
546     PA_INCREMENT_COUNTER(stats_.alloc_miss_empty);
547     PA_INCREMENT_COUNTER(stats_.alloc_misses);
548 
549     FillBucket(bucket_index);
550 
551     // Very unlikely; this means that the central allocator is out of memory.
552     // Let it deal with it (may return 0, may crash).
553     if (PA_UNLIKELY(!bucket.freelist_head)) {
554       return 0;
555     }
556   }
557 
558   PA_DCHECK(bucket.count != 0);
559   internal::PartitionFreelistEntry* entry = bucket.freelist_head;
560   // TODO(lizeb): Consider removing once crbug.com/1382658 is fixed.
561 #if BUILDFLAG(IS_CHROMEOS) && defined(ARCH_CPU_X86_64) && \
562     BUILDFLAG(HAS_64_BIT_POINTERS)
563   // The x86_64 architecture now supports 57 bits of address space, as of Ice
564   // Lake for Intel. However, Chrome OS systems do not ship with kernel support
565   // for it, only for 48 bits, so all canonical addresses have the upper 16 bits
566   // zeroed (17 in practice, since the upper half of the address space is
567   // reserved by the kernel).
568   constexpr uintptr_t kCanonicalPointerMask = (1ULL << 48) - 1;
569   PA_CHECK(!(reinterpret_cast<uintptr_t>(entry) & ~kCanonicalPointerMask));
570 #endif  // BUILDFLAG(IS_CHROMEOS) && defined(ARCH_CPU_X86_64) &&
571         // BUILDFLAG(HAS_64_BIT_POINTERS)
572 
573   // Passes the bucket size to |GetNext()|, so that in case of freelist
574   // corruption, we know the bucket size that led to the crash, helping to
575   // narrow down the search for the culprit. |bucket| was touched just now, so
576   // this does not introduce another cache miss.
577   const internal::PartitionFreelistDispatcher* freelist_dispatcher =
578       get_freelist_dispatcher_from_root();
579 #if BUILDFLAG(USE_FREELIST_POOL_OFFSETS)
580   internal::PartitionFreelistEntry* next =
581       freelist_dispatcher->GetNextForThreadCacheTrue(entry, bucket.slot_size);
582 #else
583   internal::PartitionFreelistEntry* next =
584       freelist_dispatcher->GetNextForThreadCache<true>(entry, bucket.slot_size);
585 #endif  // BUILDFLAG(USE_FREELIST_POOL_OFFSETS)
586 
587   PA_DCHECK(entry != next);
588   bucket.count--;
589   PA_DCHECK(bucket.count != 0 || !next);
590   bucket.freelist_head = next;
591   *slot_size = bucket.slot_size;
592 
593   PA_DCHECK(cached_memory_ >= bucket.slot_size);
594   cached_memory_ -= bucket.slot_size;
595 
596   return internal::SlotStartPtr2Addr(entry);
597 }
598 
599 PA_ALWAYS_INLINE void ThreadCache::PutInBucket(Bucket& bucket,
600                                                uintptr_t slot_start) {
601 #if PA_CONFIG(HAS_FREELIST_SHADOW_ENTRY) && defined(ARCH_CPU_X86_64) && \
602     BUILDFLAG(HAS_64_BIT_POINTERS)
603   // We see freelist corruption crashes happening in the wild.  These are likely
604   // due to out-of-bounds accesses in the previous slot, or to a Use-After-Free
605   // somewhere in the code.
606   //
607   // The issue is that we detect the UaF far away from the place where it
608   // happens. As a consequence, we should try to make incorrect code crash as
609   // early as possible. Poisoning memory at free() time works for UaF, but it
610   // was seen in the past to incur a high performance cost.
611   //
612   // Here, only poison the current cacheline, which we are touching anyway.
613   // TODO(lizeb): Make sure this does not hurt performance.
614 
615   // Everything below requires this alignment.
616   static_assert(internal::kAlignment == 16, "");
617 
618   // The pointer is always 16-byte aligned, so its start address is always 0
619   // modulo 16. Its distance to the next cacheline is
620   //   `64 - ((slot_start & 63) / 16) * 16`
621   static_assert(
622       internal::kPartitionCachelineSize == 64,
623       "The computation below assumes that cache lines are 64 bytes long.");
624   int distance_to_next_cacheline_in_16_bytes = 4 - ((slot_start >> 4) & 3);
625   int slot_size_remaining_in_16_bytes = bucket.slot_size / 16;
626   slot_size_remaining_in_16_bytes = std::min(
627       slot_size_remaining_in_16_bytes, distance_to_next_cacheline_in_16_bytes);
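  // Worked example, assuming the 64-byte cachelines asserted above: for a
  // slot_start whose low bits are 0x30, ((slot_start >> 4) & 3) == 3, so the
  // distance is 4 - 3 == 1 chunk of 16 bytes, i.e. only the 16 bytes up to the
  // next cacheline boundary are poisoned (further capped by the slot size).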
628 
629   static const uint32_t poison_16_bytes[4] = {0xbadbad00, 0xbadbad00,
630                                               0xbadbad00, 0xbadbad00};
631 
632 #if !(BUILDFLAG(IS_WIN) && defined(COMPONENT_BUILD))
633   void* slot_start_tagged = std::assume_aligned<internal::kAlignment>(
634       internal::SlotStartAddr2Ptr(slot_start));
635 #else
636   // TODO(crbug.com/1429450): std::assume_aligned introduces an additional
637   // dependency: _libcpp_verbose_abort(const char*, ...). It causes an
638   // "undefined symbol" error when linking allocator_shim.dll.
639   void* slot_start_tagged = internal::SlotStartAddr2Ptr(slot_start);
640 #endif
641 
642   uint32_t* address_aligned = static_cast<uint32_t*>(slot_start_tagged);
643   for (int i = 0; i < slot_size_remaining_in_16_bytes; i++) {
644     // Clang will expand the memcpy to a 16-byte write (movups on x86).
645     memcpy(address_aligned, poison_16_bytes, sizeof(poison_16_bytes));
646     address_aligned += 4;
647   }
648 #endif  // PA_CONFIG(HAS_FREELIST_SHADOW_ENTRY) && defined(ARCH_CPU_X86_64) &&
649         // BUILDFLAG(HAS_64_BIT_POINTERS)
650 
651   auto* entry =
652       get_freelist_dispatcher_from_root()->EmplaceAndInitForThreadCache(
653           slot_start, bucket.freelist_head);
654   bucket.freelist_head = entry;
655   bucket.count++;
656 }
657 
658 PA_ALWAYS_INLINE void ThreadCache::RecordAllocation(size_t size) {
659   thread_alloc_stats_.alloc_count++;
660   thread_alloc_stats_.alloc_total_size += size;
661 }
662 
663 PA_ALWAYS_INLINE void ThreadCache::RecordDeallocation(size_t size) {
664   thread_alloc_stats_.dealloc_count++;
665   thread_alloc_stats_.dealloc_total_size += size;
666 }
667 
668 }  // namespace partition_alloc
669 
670 #endif  // PARTITION_ALLOC_THREAD_CACHE_H_
671