// Copyright 2018 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "base/sampling_heap_profiler/poisson_allocation_sampler.h"

#include <atomic>
#include <cmath>
#include <memory>
#include <utility>

#include "base/allocator/dispatcher/reentry_guard.h"
#include "base/allocator/dispatcher/tls.h"
#include "base/check.h"
#include "base/compiler_specific.h"
#include "base/memory/raw_ptr.h"
#include "base/no_destructor.h"
#include "base/rand_util.h"
#include "base/ranges/algorithm.h"
#include "build/build_config.h"
#include "third_party/abseil-cpp/absl/base/attributes.h"

namespace base {

namespace {

using ::base::allocator::dispatcher::ReentryGuard;

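// Default mean number of bytes between samples (128 KiB), used until
// SetSamplingInterval() overrides it via g_sampling_interval below.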
const size_t kDefaultSamplingIntervalBytes = 128 * 1024;

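// Large fixed amount subtracted from a thread's accumulated byte count while
// its samples are muted, so that muted allocations are very unlikely to cross
// the sampling threshold. See ScopedMuteThreadSamples.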
const intptr_t kAccumulatedBytesOffset = 1 << 29;

// When set, sample intervals are not randomized. Used for testing.
bool g_deterministic = false;

// Pointer to the current |LockFreeAddressHashSet|.
ABSL_CONST_INIT std::atomic<LockFreeAddressHashSet*> g_sampled_addresses_set{
    nullptr};

// Sampling interval parameter, the mean value for intervals between samples.
ABSL_CONST_INIT std::atomic_size_t g_sampling_interval{
    kDefaultSamplingIntervalBytes};

struct ThreadLocalData {
  // Accumulated bytes towards the next sample.
  intptr_t accumulated_bytes = 0;
  // Used as a workaround to avoid bias from muted samples. See
  // ScopedMuteThreadSamples for more details.
  intptr_t accumulated_bytes_snapshot = 0;
  // PoissonAllocationSampler performs allocations while handling a
  // notification. The guard protects against recursion originating from these.
  bool internal_reentry_guard = false;
  // A boolean used to distinguish the first allocation on a thread:
  //   false - first allocation on the thread;
  //   true  - otherwise.
  // Since accumulated_bytes is initialized to zero, the very first allocation
  // on a thread would always trigger a sample, skewing the profile towards
  // such allocations. To mitigate that, the flag is used to ensure the first
  // allocation is properly accounted for.
  bool sampling_interval_initialized = false;
};

ThreadLocalData* GetThreadLocalData() {
#if USE_LOCAL_TLS_EMULATION()
  // If available, use ThreadLocalStorage to bypass dependencies introduced by
  // Clang's implementation of thread_local.
  static base::NoDestructor<
      base::allocator::dispatcher::ThreadLocalStorage<ThreadLocalData>>
      thread_local_data("poisson_allocation_sampler");
  return thread_local_data->GetThreadLocalData();
#else
  // Notes on TLS usage:
  //
  // * There's no safe way to use TLS in malloc() as neither C++ thread_local
  //   nor pthread TLS gives any guarantee about whether it allocates.
  // * We think that we can safely use thread_local w/o a re-entrancy guard
  //   because the compiler will use the "tls static access model" for static
  //   builds of Chrome [https://www.uclibc.org/docs/tls.pdf].
  //   But there's no guarantee that this will stay true, and in practice
  //   it seems to have problems on macOS/Android. These platforms do allocate
  //   on the very first access to a thread_local on each thread.
  // * Directly using/warming-up platform TLS seems to work on all platforms,
  //   but is also not guaranteed to stay true. We make use of it for reentrancy
  //   guards on macOS/Android.
  // * We cannot use the Windows Tls[GS]etValue API as it modifies the result
  //   of GetLastError.
  //
  // Android thread_local seems to be using __emutls_get_address from libgcc:
  // https://github.com/gcc-mirror/gcc/blob/master/libgcc/emutls.c
  // The macOS version is based on _tlv_get_addr from dyld:
  // https://opensource.apple.com/source/dyld/dyld-635.2/src/threadLocalHelpers.s.auto.html
  thread_local ThreadLocalData thread_local_data;
  return &thread_local_data;
#endif
}

}  // namespace

PoissonAllocationSampler::ScopedMuteThreadSamples::ScopedMuteThreadSamples() {
  ThreadLocalData* const thread_local_data = GetThreadLocalData();

  DCHECK(!thread_local_data->internal_reentry_guard);
  thread_local_data->internal_reentry_guard = true;

  // We mute thread samples immediately after taking a sample, which is when we
  // reset accumulated_bytes. This breaks the random sampling requirement of
  // the poisson process, and causes us to systematically overcount all other
  // allocations. That's because muted allocations rarely trigger a sample
  // [which would cause them to be ignored] since they occur right after
  // accumulated_bytes is reset.
  //
  // To counteract this, we drop accumulated_bytes by a large, fixed amount so
  // that the probability of taking a sample while muted is close to 0. The
  // previous value is restored once we're done muting thread samples.
  thread_local_data->accumulated_bytes_snapshot =
      thread_local_data->accumulated_bytes;
  thread_local_data->accumulated_bytes -= kAccumulatedBytesOffset;
}

PoissonAllocationSampler::ScopedMuteThreadSamples::~ScopedMuteThreadSamples() {
  ThreadLocalData* const thread_local_data = GetThreadLocalData();
  DCHECK(thread_local_data->internal_reentry_guard);
  thread_local_data->internal_reentry_guard = false;
  thread_local_data->accumulated_bytes =
      thread_local_data->accumulated_bytes_snapshot;
}

// static
bool PoissonAllocationSampler::ScopedMuteThreadSamples::IsMuted() {
  ThreadLocalData* const thread_local_data = GetThreadLocalData();
  return thread_local_data->internal_reentry_guard;
}

PoissonAllocationSampler::ScopedSuppressRandomnessForTesting::
    ScopedSuppressRandomnessForTesting() {
  DCHECK(!g_deterministic);
  g_deterministic = true;
  // The accumulated_bytes may contain a random value from previous
  // test runs, which would make the behaviour of the next call to
  // RecordAlloc unpredictable.
  ThreadLocalData* const thread_local_data = GetThreadLocalData();
  thread_local_data->accumulated_bytes = 0;
}

PoissonAllocationSampler::ScopedSuppressRandomnessForTesting::
    ~ScopedSuppressRandomnessForTesting() {
  DCHECK(g_deterministic);
  g_deterministic = false;
}

// static
bool PoissonAllocationSampler::ScopedSuppressRandomnessForTesting::
    IsSuppressed() {
  return g_deterministic;
}

PoissonAllocationSampler::ScopedMuteHookedSamplesForTesting::
    ScopedMuteHookedSamplesForTesting() {
  SetProfilingStateFlag(ProfilingStateFlag::kHookedSamplesMutedForTesting);

  // Reset the accumulated bytes to 0 on this thread.
  ThreadLocalData* const thread_local_data = GetThreadLocalData();
  accumulated_bytes_snapshot_ = thread_local_data->accumulated_bytes;
  thread_local_data->accumulated_bytes = 0;
}

PoissonAllocationSampler::ScopedMuteHookedSamplesForTesting::
    ~ScopedMuteHookedSamplesForTesting() {
  // Restore the accumulated bytes.
  ThreadLocalData* const thread_local_data = GetThreadLocalData();
  thread_local_data->accumulated_bytes = accumulated_bytes_snapshot_;
  ResetProfilingStateFlag(ProfilingStateFlag::kHookedSamplesMutedForTesting);
}

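// Bit mask of ProfilingStateFlag values: records whether the sampler is
// currently running, whether it has ever been started, and whether hooked
// samples are muted for testing.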
// static
ABSL_CONST_INIT std::atomic<PoissonAllocationSampler::ProfilingStateFlagMask>
    PoissonAllocationSampler::profiling_state_{0};

PoissonAllocationSampler::PoissonAllocationSampler() {
  Init();
  auto* sampled_addresses = new LockFreeAddressHashSet(64);
  g_sampled_addresses_set.store(sampled_addresses, std::memory_order_release);
}

// static
void PoissonAllocationSampler::Init() {
  [[maybe_unused]] static bool init_once = []() {
    // Touch thread local data on initialization to enforce proper setup of
    // the underlying storage system.
    GetThreadLocalData();
    ReentryGuard::InitTLSSlot();
    return true;
  }();
}

void PoissonAllocationSampler::SetSamplingInterval(
    size_t sampling_interval_bytes) {
  // TODO(alph): Reset the sample being collected if running.
  g_sampling_interval.store(sampling_interval_bytes, std::memory_order_relaxed);
}

size_t PoissonAllocationSampler::SamplingInterval() const {
  return g_sampling_interval.load(std::memory_order_relaxed);
}

// static
size_t PoissonAllocationSampler::GetNextSampleInterval(size_t interval) {
  if (UNLIKELY(g_deterministic)) {
    return interval;
  }

  // We sample with a Poisson process, with a constant average sampling
  // interval. The intervals between samples therefore follow the exponential
  // distribution with parameter λ = 1/interval, where |interval| is the
  // average number of bytes between samples.
  // Let u be a uniformly distributed random number in (0,1], then
  // next_sample = -ln(u) / λ.
  // RandDouble returns numbers in [0,1). We use 1-RandDouble to map it to
  // (0,1] and avoid a possible floating point exception from taking the log
  // of 0.
  // The allocator shim uses the PoissonAllocationSampler, so we avoid
  // allocating here to prevent infinite recursion.
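  // Worked example (illustrative numbers only): with the default 128 KiB mean
  // interval and u = 0.5, next_sample = -ln(0.5) * 131072 ≈ 0.693 * 131072
  // ≈ 90852 bytes.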
  double uniform = internal::RandDoubleAvoidAllocation();
  double value = -log(1 - uniform) * interval;
  size_t min_value = sizeof(intptr_t);
  // We limit the upper bound of a sample interval to make sure we don't have
  // huge gaps in the sampling stream. The probability of hitting the upper
  // bound is exp(-20) ~ 2e-9, so it should not skew the distribution.
  size_t max_value = interval * 20;
  if (UNLIKELY(value < min_value)) {
    return min_value;
  }
  if (UNLIKELY(value > max_value)) {
    return max_value;
  }
  return static_cast<size_t>(value);
}

void PoissonAllocationSampler::DoRecordAllocation(
    const ProfilingStateFlagMask state,
    void* address,
    size_t size,
    base::allocator::dispatcher::AllocationSubsystem type,
    const char* context) {
  ThreadLocalData* const thread_local_data = GetThreadLocalData();

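  // Between samples, accumulated_bytes is negative: it holds minus the number
  // of bytes remaining until the next sample. Once an allocation pushes it to
  // zero or above, it is time to take one or more samples.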
  thread_local_data->accumulated_bytes += size;
  intptr_t accumulated_bytes = thread_local_data->accumulated_bytes;
  if (LIKELY(accumulated_bytes < 0)) {
    return;
  }

  if (UNLIKELY(!(state & ProfilingStateFlag::kIsRunning))) {
    // Sampling was in fact disabled when the hook was called. Reset the state
    // of the sampler. We do this check off the fast-path, because it's quite a
    // rare state when the sampler is stopped after it's started. (The most
    // common caller of PoissonAllocationSampler starts it and leaves it running
    // for the rest of the Chrome session.)
    thread_local_data->sampling_interval_initialized = false;
    thread_local_data->accumulated_bytes = 0;
    return;
  }

  // Failed allocation? Skip the sample.
  if (UNLIKELY(!address)) {
    return;
  }

  size_t mean_interval = g_sampling_interval.load(std::memory_order_relaxed);
  if (UNLIKELY(!thread_local_data->sampling_interval_initialized)) {
    thread_local_data->sampling_interval_initialized = true;
    // This is the very first allocation on the thread. It always passes the
    // check in |RecordAlloc|, because accumulated_bytes is zero-initialized
    // due to TLS semantics.
    // Generate a proper sampling interval and make sure the allocation has
    // indeed crossed the threshold before counting it as a sample.
    accumulated_bytes -= GetNextSampleInterval(mean_interval);
    if (accumulated_bytes < 0) {
      thread_local_data->accumulated_bytes = accumulated_bytes;
      return;
    }
  }

  // This cast is safe because the early return above guarantees that
  // `accumulated_bytes` is non-negative at this point.
  size_t samples = static_cast<size_t>(accumulated_bytes) / mean_interval;
  accumulated_bytes %= mean_interval;

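  // A single allocation can span more than one sampling interval. Keep drawing
  // fresh random intervals until the remaining byte count goes negative again;
  // each iteration accounts for one more sample.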
  do {
    accumulated_bytes -= GetNextSampleInterval(mean_interval);
    ++samples;
  } while (accumulated_bytes >= 0);

  thread_local_data->accumulated_bytes = accumulated_bytes;

  if (UNLIKELY(ScopedMuteThreadSamples::IsMuted())) {
    return;
  }

  ScopedMuteThreadSamples no_reentrancy_scope;
  std::vector<raw_ptr<SamplesObserver, VectorExperimental>> observers_copy;
  {
    AutoLock lock(mutex_);

    // TODO(alph): Sometimes RecordAlloc is called twice in a row without
    // a RecordFree in between. Investigate it.
    if (sampled_addresses_set().Contains(address)) {
      return;
    }
    sampled_addresses_set().Insert(address);
    BalanceAddressesHashSet();
    observers_copy = observers_;
  }

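  // Each recorded sample stands for roughly `samples` sampling intervals'
  // worth of allocation, so report mean_interval * samples as the estimated
  // number of bytes this sample represents.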
  size_t total_allocated = mean_interval * samples;
  for (base::PoissonAllocationSampler::SamplesObserver* observer :
       observers_copy) {
    observer->SampleAdded(address, size, total_allocated, type, context);
  }
}

void PoissonAllocationSampler::DoRecordFree(void* address) {
  // There is a rare case on macOS and Android when the very first thread_local
  // access in the ScopedMuteThreadSamples constructor may allocate and thus
  // reenter DoRecordAllocation. However, the call chain won't build up further
  // as RecordAlloc accesses are guarded with the pthread-TLS-based
  // ReentryGuard.
  ScopedMuteThreadSamples no_reentrancy_scope;
  std::vector<raw_ptr<SamplesObserver, VectorExperimental>> observers_copy;
  {
    AutoLock lock(mutex_);
    observers_copy = observers_;
    sampled_addresses_set().Remove(address);
  }
  for (base::PoissonAllocationSampler::SamplesObserver* observer :
       observers_copy) {
    observer->SampleRemoved(address);
  }
}

void PoissonAllocationSampler::BalanceAddressesHashSet() {
  // If the load factor of the current addresses hash set has grown above 1,
  // allocate a new set that is twice as large, copy all the data over, and
  // switch to using it.
  // During the copy no writes are made to either set, as the whole operation
  // happens under the lock.
  // Readers keep using the old set until the atomic switch below takes place.
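  // For example, the initial 64-bucket set created in the constructor grows to
  // 128, 256, ... buckets as more sampled addresses are inserted.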
  LockFreeAddressHashSet& current_set = sampled_addresses_set();
  if (current_set.load_factor() < 1) {
    return;
  }
  auto new_set =
      std::make_unique<LockFreeAddressHashSet>(current_set.buckets_count() * 2);
  new_set->Copy(current_set);
  // Atomically switch all the new readers to the new set.
  g_sampled_addresses_set.store(new_set.release(), std::memory_order_release);
  // We leak the older set because we still have to keep all the old maps alive
  // as there might be reader threads that have already obtained the map,
  // but haven't yet managed to access it.
}

// static
LockFreeAddressHashSet& PoissonAllocationSampler::sampled_addresses_set() {
  return *g_sampled_addresses_set.load(std::memory_order_acquire);
}

// static
PoissonAllocationSampler* PoissonAllocationSampler::Get() {
  static NoDestructor<PoissonAllocationSampler> instance;
  return instance.get();
}

// static
void PoissonAllocationSampler::SetProfilingStateFlag(ProfilingStateFlag flag) {
  ProfilingStateFlagMask flags = flag;
  if (flag == ProfilingStateFlag::kIsRunning) {
    flags |= ProfilingStateFlag::kWasStarted;
  }
  ProfilingStateFlagMask old_state =
      profiling_state_.fetch_or(flags, std::memory_order_relaxed);
  DCHECK(!(old_state & flag));
}

// static
void PoissonAllocationSampler::ResetProfilingStateFlag(
    ProfilingStateFlag flag) {
  DCHECK_NE(flag, kWasStarted);
  ProfilingStateFlagMask old_state =
      profiling_state_.fetch_and(~flag, std::memory_order_relaxed);
  DCHECK(old_state & flag);
}

void PoissonAllocationSampler::AddSamplesObserver(SamplesObserver* observer) {
  // The following implementation (including ScopedMuteThreadSamples) will use
  // `thread_local`, which may cause a reentrancy issue. So, temporarily
  // disable sampling on this thread by holding a ReentryGuard.
  ReentryGuard guard;

  ScopedMuteThreadSamples no_reentrancy_scope;
  AutoLock lock(mutex_);
  DCHECK(ranges::find(observers_, observer) == observers_.end());
  bool profiler_was_stopped = observers_.empty();
  observers_.push_back(observer);

  // Adding the observer will enable profiling. This will use
  // `g_sampled_addresses_set`, so it had better be initialized.
  DCHECK(g_sampled_addresses_set.load(std::memory_order_relaxed));

  // Start the profiler if this was the first observer. Setting/resetting
  // kIsRunning isn't racy because it's performed based on `observers_.empty()`
  // while holding `mutex_`.
  if (profiler_was_stopped) {
    SetProfilingStateFlag(ProfilingStateFlag::kIsRunning);
  }
  DCHECK(profiling_state_.load(std::memory_order_relaxed) &
         ProfilingStateFlag::kIsRunning);
}

void PoissonAllocationSampler::RemoveSamplesObserver(
    SamplesObserver* observer) {
  // The following implementation (including ScopedMuteThreadSamples) will use
  // `thread_local`, which may cause a reentrancy issue. So, temporarily
  // disable sampling on this thread by holding a ReentryGuard.
  ReentryGuard guard;

  ScopedMuteThreadSamples no_reentrancy_scope;
  AutoLock lock(mutex_);
  auto it = ranges::find(observers_, observer);
  CHECK(it != observers_.end(), base::NotFatalUntil::M125);
  observers_.erase(it);

  // Stop the profiler if there are no more observers. Setting/resetting
  // kIsRunning isn't racy because it's performed based on `observers_.empty()`
  // while holding `mutex_`.
  DCHECK(profiling_state_.load(std::memory_order_relaxed) &
         ProfilingStateFlag::kIsRunning);
  if (observers_.empty()) {
    ResetProfilingStateFlag(ProfilingStateFlag::kIsRunning);
  }
}
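
// Illustrative usage from a hypothetical client (not part of this file):
//
//   class MyObserver : public PoissonAllocationSampler::SamplesObserver {...};
//   MyObserver observer;
//   PoissonAllocationSampler::Get()->AddSamplesObserver(&observer);
//   ...  // observer.SampleAdded()/SampleRemoved() are invoked as samples
//        // are recorded and freed.
//   PoissonAllocationSampler::Get()->RemoveSamplesObserver(&observer);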

}  // namespace base