/* Copyright 2019 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "ruy/allocator.h"

#include "ruy/opt_set.h"
#include "ruy/size_util.h"
#include "ruy/system_aligned_alloc.h"

namespace ruy {

Allocator::~Allocator() {
  FreeAll();
  detail::SystemAlignedFree(ptr_);
}

void* Allocator::AllocateFast(std::ptrdiff_t num_bytes) {
  if (current_ + num_bytes > size_) {
    return nullptr;
  }
  void* ret = static_cast<char*>(ptr_) + current_;
  current_ += num_bytes;
  return ret;
}

void* Allocator::AllocateSlow(std::ptrdiff_t num_bytes) {
  void* p = detail::SystemAlignedAlloc(num_bytes);
  fallback_blocks_total_size_ += num_bytes;
  fallback_blocks_.push_back(p);
  return p;
}

void* Allocator::AllocateBytes(std::ptrdiff_t num_bytes) {
  if (num_bytes == 0) {
    return nullptr;
  }
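  // Illustrative note (the actual kMinimumBlockAlignment value is defined in
  // system_aligned_alloc.h; 64 is used here only as a hypothetical value):
  // with a 64-byte alignment, a 100-byte request would be rounded up to 128
  // bytes below. Rounding every request up to a multiple of the alignment
  // keeps current_ a multiple of kMinimumBlockAlignment, so pointers handed
  // out by AllocateFast stay aligned given that ptr_ itself comes from
  // SystemAlignedAlloc.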
  const std::ptrdiff_t rounded_num_bytes =
      round_up_pot(num_bytes, detail::kMinimumBlockAlignment);
  if (void* p = AllocateFast(rounded_num_bytes)) {
    return p;
  }
  return AllocateSlow(rounded_num_bytes);
}

void* Allocator::AllocateBytesAvoidingAliasingWith(std::ptrdiff_t num_bytes,
                                                   const void* to_avoid) {
#if RUY_OPT(AVOID_ALIASING)
  if (num_bytes == 0) {
    return nullptr;
  }
  // The minimum L1D cache aliasing periodicity in bytes that we expect to
  // encounter on any device. This does not seem to be documented, but
  // empirically we observe the following:
  //   Cortex-A53:   1024
  //   Cortex-A55r1: 2048
  //   Cortex-A76:   not as easily observable.
  // Over-estimating this makes the AVOID_ALIASING optimization useless on
  // devices with lower periodicity.
  // Under-estimating this by 2x should be harmless.
  // Under-estimating this by a larger factor should gradually degrade
  // performance due to cache aliasing causing mutual eviction between
  // the packed matrix data, and the source matrix data being prefetched by the
  // CPU ahead of the packing code execution.
  static constexpr std::uint32_t kMinPeriod = 1024;
  static_assert(is_pot(kMinPeriod), "");
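  // The extra kMinPeriod bytes requested just below leave room for the
  // offset of up to kMinPeriod / 2 that may be added to the returned
  // pointer, so the caller's num_bytes region always stays within the
  // allocated block.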
  void* p = AllocateBytes(num_bytes + kMinPeriod);
  auto unsigned_low_bits = [](const void* p) {
    return static_cast<std::uint32_t>(reinterpret_cast<std::uintptr_t>(p));
  };
  // This relies on unsigned integer overflow wrapping around.
  std::uint32_t diff_modulus =
      (unsigned_low_bits(p) - unsigned_low_bits(to_avoid)) % kMinPeriod;
  // diff_modulus is in [0, kMinPeriod).
  // We want it as close as possible to the middle of that interval,
  // kMinPeriod/2. The bad 'aliasing' case, which we are working to avoid,
  // is when diff_modulus is close to the ends of that interval, 0 or
  // kMinPeriod. So we want to add an offset of kMinPeriod/2 if it is in the
  // first or the last quarter of that interval.
  bool need_offset =
      diff_modulus < kMinPeriod / 4 || diff_modulus > 3 * kMinPeriod / 4;
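  // Worked example (hypothetical addresses): if the low bits of p are 0x2040
  // and those of to_avoid are 0x2000, then diff_modulus = 64, which falls in
  // the first quarter [0, 256), so need_offset is true and the pointer
  // returned below is shifted by kMinPeriod / 2 = 512 bytes. The difference
  // modulo kMinPeriod then becomes 576, close to the middle of the interval
  // rather than to its aliasing-prone ends.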
  return static_cast<char*>(p) + (need_offset ? (kMinPeriod / 2) : 0);
#else
  (void)to_avoid;
  return AllocateBytes(num_bytes);
#endif
}

void Allocator::FreeAll() {
  current_ = 0;
  if (fallback_blocks_.empty()) {
    return;
  }

  // Free all memory before reallocating `ptr_`.
  // This minimizes the memory high-water mark.
  detail::SystemAlignedFree(ptr_);
  for (void* p : fallback_blocks_) {
    detail::SystemAlignedFree(p);
  }

  // We reallocate to the exact new size, rather than growing
  // exponentially like std::vector. This means a linear instead of a
  // logarithmic bound on the number of allocations in some worst-case
  // calling patterns. This is considered worth it because minimizing memory
  // usage is important and actual calling patterns in applications that we
  // care about still reach the no-further-allocations steady state in a
  // small finite number of iterations.
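  // For illustration: if a first round of calls overflows the buffer by
  // 8 KiB of fallback blocks, the buffer is grown here by exactly those
  // 8 KiB, so an identical second round is served entirely by AllocateFast
  // and performs no further system allocations.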
  std::ptrdiff_t new_size = size_ + fallback_blocks_total_size_;
  ptr_ = detail::SystemAlignedAlloc(new_size);
  size_ = new_size;

  fallback_blocks_.clear();
  fallback_blocks_total_size_ = 0;
}

}  // namespace ruy