// Copyright 2021 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef PARTITION_ALLOC_STARSCAN_SCAN_LOOP_H_
#define PARTITION_ALLOC_STARSCAN_SCAN_LOOP_H_

#include <cstddef>
#include <cstdint>

#include "build/build_config.h"
#include "partition_alloc/partition_alloc_base/compiler_specific.h"
#include "partition_alloc/partition_alloc_buildflags.h"
#include "partition_alloc/partition_alloc_check.h"
#include "partition_alloc/partition_alloc_config.h"
#include "partition_alloc/starscan/starscan_fwd.h"
#include "partition_alloc/tagging.h"

#if defined(ARCH_CPU_X86_64)
// Include order is important, so we disable formatting.
// clang-format off
// Including these headers directly should generally be avoided. For the
// scanning loop, we check at runtime which SIMD extension we can use. Since
// Chrome is compiled with -msse3 (the minimal requirement), we include the
// headers directly to make the intrinsics available. Another option could be to
// use inline assembly, but that would hinder compiler optimization for
// vectorized instructions.
#include <immintrin.h>
#include <smmintrin.h>
#include <avxintrin.h>
#include <avx2intrin.h>
// clang-format on
#endif

#if PA_CONFIG(STARSCAN_NEON_SUPPORTED)
#include <arm_neon.h>
#endif

namespace partition_alloc::internal {

// Iterates over a range of memory using the best available SIMD extension.
// Assumes that 64-bit platforms have pool support and that the begin pointers
// of incoming ranges are properly aligned. The class is designed around the
// CRTP version of the "template method" (in GoF terms). CRTP is needed for
// fast static dispatch.
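//
// A Derived class is expected to provide RegularPoolBase(), RegularPoolMask()
// and CheckPointer(uintptr_t), which the loops below reach through the CRTP
// interface. As an illustration only (MyScanner and its members are
// hypothetical, not part of PartitionAlloc), a scanner built on top of this
// class could look roughly like:
//
//   class MyScanner : public ScanLoop<MyScanner> {
//     friend class ScanLoop<MyScanner>;
//
//    public:
//     explicit MyScanner(SimdSupport simd) : ScanLoop(simd) {}
//
//    private:
//     // Base address and membership mask of the regular pool.
//     static uintptr_t RegularPoolBase();
//     static uintptr_t RegularPoolMask();
//     // Called for every word that looks like an in-pool pointer.
//     void CheckPointer(uintptr_t maybe_ptr);
//   };
//
//   MyScanner scanner(SimdSupport::kSSE41);
//   scanner.Run(range_begin, range_end);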
template <typename Derived>
class ScanLoop {
 public:
  explicit ScanLoop(SimdSupport simd_type) : simd_type_(simd_type) {}

  ScanLoop(const ScanLoop&) = delete;
  ScanLoop& operator=(const ScanLoop&) = delete;

  // Scans the input range. Assumes the range is properly aligned. Please note
  // that the function doesn't MTE-tag the input range as it assumes that MTE
  // is disabled when the function is called. See DisableMTEScope for details.
  void Run(uintptr_t begin, uintptr_t end);

 private:
  const Derived& derived() const { return static_cast<const Derived&>(*this); }
  Derived& derived() { return static_cast<Derived&>(*this); }

#if defined(ARCH_CPU_X86_64)
  __attribute__((target("avx2"))) void RunAVX2(uintptr_t, uintptr_t);
  __attribute__((target("sse4.1"))) void RunSSE4(uintptr_t, uintptr_t);
#endif
#if PA_CONFIG(STARSCAN_NEON_SUPPORTED)
  void RunNEON(uintptr_t, uintptr_t);
#endif

  void RunUnvectorized(uintptr_t, uintptr_t);

  SimdSupport simd_type_;
};

template <typename Derived>
void ScanLoop<Derived>::Run(uintptr_t begin, uintptr_t end) {
// We allow vectorization only for 64-bit platforms, since vectorized scanning
// requires support of the 64-bit regular pool, and only for x86 because a
// special instruction set is required.
#if defined(ARCH_CPU_X86_64)
  if (simd_type_ == SimdSupport::kAVX2) {
    return RunAVX2(begin, end);
  }
  if (simd_type_ == SimdSupport::kSSE41) {
    return RunSSE4(begin, end);
  }
#elif PA_CONFIG(STARSCAN_NEON_SUPPORTED)
  if (simd_type_ == SimdSupport::kNEON) {
    return RunNEON(begin, end);
  }
#endif  // PA_CONFIG(STARSCAN_NEON_SUPPORTED)
  return RunUnvectorized(begin, end);
}

template <typename Derived>
void ScanLoop<Derived>::RunUnvectorized(uintptr_t begin, uintptr_t end) {
  PA_SCAN_DCHECK(!(begin % sizeof(uintptr_t)));
  PA_SCAN_DCHECK(!(end % sizeof(uintptr_t)));
#if BUILDFLAG(HAS_64_BIT_POINTERS)
  // If the read value is a pointer into the PA region, it's likely
  // MTE-tagged. Piggyback on |mask| to untag, for efficiency.
  const uintptr_t mask = Derived::RegularPoolMask() & kPtrUntagMask;
  const uintptr_t base = Derived::RegularPoolBase();
#endif  // BUILDFLAG(HAS_64_BIT_POINTERS)
  for (; begin < end; begin += sizeof(uintptr_t)) {
    // Read the region word-by-word. Everything that we read is a potential
    // pointer to or inside an object on the heap. Such an object should be
    // quarantined if there is an attempt to free it.
    //
    // Keep it MTE-untagged. See DisableMTEScope for details.
    const uintptr_t maybe_ptr = *reinterpret_cast<uintptr_t*>(begin);
#if BUILDFLAG(HAS_64_BIT_POINTERS)
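    // An untagged pointer lies in the regular pool iff its pool-selecting high
    // bits equal the pool base, so anything with (maybe_ptr & mask) != base
    // can be skipped cheaply.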
    if (PA_LIKELY((maybe_ptr & mask) != base)) {
      continue;
    }
#else
    if (!maybe_ptr) {
      continue;
    }
#endif  // BUILDFLAG(HAS_64_BIT_POINTERS)
    derived().CheckPointer(maybe_ptr);
  }
}

#if defined(ARCH_CPU_X86_64)
template <typename Derived>
__attribute__((target("avx2"))) void ScanLoop<Derived>::RunAVX2(uintptr_t begin,
                                                                 uintptr_t end) {
  static constexpr size_t kAlignmentRequirement = 32;
  static constexpr size_t kWordsInVector = 4;
  static constexpr size_t kBytesInVector = kWordsInVector * sizeof(uintptr_t);
  PA_SCAN_DCHECK(!(begin % kAlignmentRequirement));
  // Stick to integer instructions. This brings slightly better throughput. For
  // example, according to the Intel docs, on Broadwell and Haswell the CPI of
  // vmovdqa (_mm256_load_si256) is half (0.25) that of vmovapd
  // (_mm256_load_pd).
  const __m256i vbase = _mm256_set1_epi64x(derived().RegularPoolBase());
  // If the read value is a pointer into the PA region, it's likely
  // MTE-tagged. Piggyback on |regular_pool_mask| to untag, for efficiency.
  const __m256i regular_pool_mask =
      _mm256_set1_epi64x(derived().RegularPoolMask() & kPtrUntagMask);

  static_assert(sizeof(__m256i) == kBytesInVector);
  for (; begin <= (end - kBytesInVector); begin += kBytesInVector) {
    // Keep it MTE-untagged. See DisableMTEScope for details.
    const __m256i maybe_ptrs =
        _mm256_load_si256(reinterpret_cast<__m256i*>(begin));
    const __m256i vand = _mm256_and_si256(maybe_ptrs, regular_pool_mask);
    const __m256i vcmp = _mm256_cmpeq_epi64(vand, vbase);
    const int mask = _mm256_movemask_pd(_mm256_castsi256_pd(vcmp));
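    // Each bit of |mask| is the sign bit of one 64-bit lane of |vcmp|, i.e. it
    // is set iff the corresponding word looked like an in-pool pointer.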
    if (PA_LIKELY(!mask)) {
      continue;
    }
    // It's important to extract pointers from the already loaded vector.
    // Otherwise, new loads can break the in-pool assumption checked above.
    if (mask & 0b0001) {
      derived().CheckPointer(_mm256_extract_epi64(maybe_ptrs, 0));
    }
    if (mask & 0b0010) {
      derived().CheckPointer(_mm256_extract_epi64(maybe_ptrs, 1));
    }
    if (mask & 0b0100) {
      derived().CheckPointer(_mm256_extract_epi64(maybe_ptrs, 2));
    }
    if (mask & 0b1000) {
      derived().CheckPointer(_mm256_extract_epi64(maybe_ptrs, 3));
    }
  }
  // Run unvectorized on the remainder of the region.
  RunUnvectorized(begin, end);
}

template <typename Derived>
__attribute__((target("sse4.1"))) void ScanLoop<Derived>::RunSSE4(
    uintptr_t begin,
    uintptr_t end) {
  static constexpr size_t kAlignmentRequirement = 16;
  static constexpr size_t kWordsInVector = 2;
  static constexpr size_t kBytesInVector = kWordsInVector * sizeof(uintptr_t);
  PA_SCAN_DCHECK(!(begin % kAlignmentRequirement));
  const __m128i vbase = _mm_set1_epi64x(derived().RegularPoolBase());
  // If the read value is a pointer into the PA region, it's likely
  // MTE-tagged. Piggyback on |regular_pool_mask| to untag, for efficiency.
  const __m128i regular_pool_mask =
      _mm_set1_epi64x(derived().RegularPoolMask() & kPtrUntagMask);

  static_assert(sizeof(__m128i) == kBytesInVector);
  for (; begin <= (end - kBytesInVector); begin += kBytesInVector) {
    // Keep it MTE-untagged. See DisableMTEScope for details.
    const __m128i maybe_ptrs =
        _mm_loadu_si128(reinterpret_cast<__m128i*>(begin));
    const __m128i vand = _mm_and_si128(maybe_ptrs, regular_pool_mask);
    const __m128i vcmp = _mm_cmpeq_epi64(vand, vbase);
    const int mask = _mm_movemask_pd(_mm_castsi128_pd(vcmp));
    if (PA_LIKELY(!mask)) {
      continue;
    }
    // It's important to extract pointers from the already loaded vector.
    // Otherwise, new loads can break the in-pool assumption checked above.
    if (mask & 0b01) {
      derived().CheckPointer(_mm_cvtsi128_si64(maybe_ptrs));
    }
    if (mask & 0b10) {
      // The mask is used to move the 4th and 3rd dwords into the second and
      // first position.
      static constexpr int kSecondWordMask = (3 << 2) | (2 << 0);
      const __m128i shuffled = _mm_shuffle_epi32(maybe_ptrs, kSecondWordMask);
      derived().CheckPointer(_mm_cvtsi128_si64(shuffled));
    }
  }
  // Run unvectorized on the remainder of the region.
  RunUnvectorized(begin, end);
}
#endif  // defined(ARCH_CPU_X86_64)

#if PA_CONFIG(STARSCAN_NEON_SUPPORTED)
template <typename Derived>
void ScanLoop<Derived>::RunNEON(uintptr_t begin, uintptr_t end) {
  static constexpr size_t kAlignmentRequirement = 16;
  static constexpr size_t kWordsInVector = 2;
  static constexpr size_t kBytesInVector = kWordsInVector * sizeof(uintptr_t);
  PA_SCAN_DCHECK(!(begin % kAlignmentRequirement));
  const uint64x2_t vbase = vdupq_n_u64(derived().RegularPoolBase());
  // If the read value is a pointer into the PA region, it's likely
  // MTE-tagged. Piggyback on |regular_pool_mask| to untag, for efficiency.
  const uint64x2_t regular_pool_mask =
      vdupq_n_u64(derived().RegularPoolMask() & kPtrUntagMask);

  for (; begin <= (end - kBytesInVector); begin += kBytesInVector) {
    // Keep it MTE-untagged. See DisableMTEScope for details.
    const uint64x2_t maybe_ptrs = vld1q_u64(reinterpret_cast<uint64_t*>(begin));
    const uint64x2_t vand = vandq_u64(maybe_ptrs, regular_pool_mask);
    const uint64x2_t vcmp = vceqq_u64(vand, vbase);
    const uint32_t max = vmaxvq_u32(vreinterpretq_u32_u64(vcmp));
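    // vmaxvq_u32 horizontally reduces the comparison result: a non-zero
    // maximum means at least one lane compared equal, i.e. at least one word
    // may point into the regular pool.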
    if (PA_LIKELY(!max)) {
      continue;
    }
    // It's important to extract pointers from the already loaded vector.
    // Otherwise, new loads can break the in-pool assumption checked above.
    if (vgetq_lane_u64(vcmp, 0)) {
      derived().CheckPointer(vgetq_lane_u64(maybe_ptrs, 0));
    }
    if (vgetq_lane_u64(vcmp, 1)) {
      derived().CheckPointer(vgetq_lane_u64(maybe_ptrs, 1));
    }
  }
  // Run unvectorized on the remainder of the region.
  RunUnvectorized(begin, end);
}
#endif  // PA_CONFIG(STARSCAN_NEON_SUPPORTED)

}  // namespace partition_alloc::internal

#endif  // PARTITION_ALLOC_STARSCAN_SCAN_LOOP_H_