1 // Copyright 2023 The Abseil Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // -----------------------------------------------------------------------------
16 // File: prefetch.h
17 // -----------------------------------------------------------------------------
18 //
19 // This header file defines prefetch functions to prefetch memory contents
20 // into the first level cache (L1) for the current CPU. The prefetch logic
21 // offered in this header is limited to prefetching first level cachelines
22 // only, and is aimed at relatively 'simple' prefetching logic.
23 //
24 #ifndef ABSL_BASE_PREFETCH_H_
25 #define ABSL_BASE_PREFETCH_H_
26
27 #include "absl/base/attributes.h"
28 #include "absl/base/config.h"
29
30 #if defined(ABSL_INTERNAL_HAVE_SSE)
31 #include <xmmintrin.h>
32 #endif
33
34 #if defined(_MSC_VER)
35 #include <intrin.h>
36 #if defined(ABSL_INTERNAL_HAVE_SSE)
37 #pragma intrinsic(_mm_prefetch)
38 #endif
39 #endif
40
41 namespace absl {
42 ABSL_NAMESPACE_BEGIN
43
// Moves data into the L1 cache before it is read, or "prefetches" it.
//
// The value of `addr` is the address of the memory to prefetch. If
// the target and compiler support it, data prefetch instructions are
// generated. If the prefetch is done some time before the memory is
// read, it may be in the cache by the time the read occurs. When this header
// finds no supported prefetch mechanism, the function is defined with an
// empty body and does nothing.
//
// This method prefetches data with the highest degree of temporal locality;
// data is prefetched where possible into all levels of the cache.
//
// Incorrect or gratuitous use of this function can degrade performance.
// Use this function only when representative benchmarks show an improvement.
//
// Example:
//
//  // Computes incremental checksum for `data`.
//  int ComputeChecksum(int sum, absl::string_view data);
//
//  // Computes cumulative checksum for all values in `data`
//  int ComputeChecksum(absl::Span<const std::string> data) {
//    int sum = 0;
//    auto it = data.begin();
//    auto pit = data.begin();
//    auto end = data.end();
//    // Warm-up: issue prefetches for up to the first 8 elements.
//    for (int dist = 8; dist > 0 && pit != end; --dist, ++pit) {
//      absl::PrefetchToLocalCache(pit->data());
//    }
//    // Steady state: consume `*it` while prefetching ahead at `pit`.
//    for (; pit != end; ++pit, ++it) {
//      sum = ComputeChecksum(sum, *it);
//      absl::PrefetchToLocalCache(pit->data());
//    }
//    // Drain: consume the elements that were already prefetched.
//    for (; it != end; ++it) {
//      sum = ComputeChecksum(sum, *it);
//    }
//    return sum;
//  }
//
void PrefetchToLocalCache(const void* addr);
82
// Moves data into the L1 cache before it is read, or "prefetches" it.
//
// This function is identical to `PrefetchToLocalCache()` except that it has
// non-temporal locality: the fetched data should not be left in any of the
// cache tiers. This is useful for cases where the data is used only once /
// short term, for example, invoking a destructor on an object.
//
// Incorrect or gratuitous use of this function can degrade performance.
// Use this function only when representative benchmarks show an improvement.
//
// Example:
//
//  template <typename Iterator>
//  void DestroyPointers(Iterator begin, Iterator end) {
//    int dist = 8;
//    auto prefetch_it = begin;
//    // Warm-up: issue prefetches for up to the first 8 pointees.
//    while (prefetch_it != end && dist-- > 0) {
//      absl::PrefetchToLocalCacheNta(*prefetch_it++);
//    }
//    // Steady state: destroy one object while prefetching one further ahead.
//    while (prefetch_it != end) {
//      delete *begin++;
//      absl::PrefetchToLocalCacheNta(*prefetch_it++);
//    }
//    // Drain: destroy the objects that were already prefetched.
//    while (begin != end) {
//      delete *begin++;
//    }
//  }
//
void PrefetchToLocalCacheNta(const void* addr);
114
// Moves data into the L1 cache with the intent to modify it.
//
// This function is similar to `PrefetchToLocalCache()` except that it
// prefetches cachelines with an 'intent to modify'. This typically includes
// invalidating cache entries for this address in all other cache tiers, and an
// exclusive access intent.
//
// Incorrect or gratuitous use of this function can degrade performance. As this
// function can invalidate cached cachelines on other caches and computer cores,
// incorrect usage of this function can have an even greater negative impact
// than incorrect regular prefetches.
// Use this function only when representative benchmarks show an improvement.
//
// Example:
//
//  void* Arena::Allocate(size_t size) {
//    void* ptr = AllocateBlock(size);
//    absl::PrefetchToLocalCacheForWrite(ptr);
//    return ptr;
//  }
//
void PrefetchToLocalCacheForWrite(const void* addr);
137
138 #if ABSL_HAVE_BUILTIN(__builtin_prefetch) || defined(__GNUC__)
139
140 #define ABSL_HAVE_PREFETCH 1
141
142 // See __builtin_prefetch:
143 // https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html.
144 //
PrefetchToLocalCache(const void * addr)145 ABSL_ATTRIBUTE_ALWAYS_INLINE inline void PrefetchToLocalCache(
146 const void* addr) {
147 __builtin_prefetch(addr, 0, 3);
148 }
149
PrefetchToLocalCacheNta(const void * addr)150 ABSL_ATTRIBUTE_ALWAYS_INLINE inline void PrefetchToLocalCacheNta(
151 const void* addr) {
152 __builtin_prefetch(addr, 0, 0);
153 }
154
PrefetchToLocalCacheForWrite(const void * addr)155 ABSL_ATTRIBUTE_ALWAYS_INLINE inline void PrefetchToLocalCacheForWrite(
156 const void* addr) {
157 // [x86] gcc/clang don't generate PREFETCHW for __builtin_prefetch(.., 1)
158 // unless -march=broadwell or newer; this is not generally the default, so we
159 // manually emit prefetchw. PREFETCHW is recognized as a no-op on older Intel
160 // processors and has been present on AMD processors since the K6-2.
161 #if defined(__x86_64__) && !defined(__PRFCHW__)
162 asm("prefetchw %0" : : "m"(*reinterpret_cast<const char*>(addr)));
163 #else
164 __builtin_prefetch(addr, 1, 3);
165 #endif
166 }
167
168 #elif defined(ABSL_INTERNAL_HAVE_SSE)
169
170 #define ABSL_HAVE_PREFETCH 1
171
PrefetchToLocalCache(const void * addr)172 ABSL_ATTRIBUTE_ALWAYS_INLINE inline void PrefetchToLocalCache(
173 const void* addr) {
174 _mm_prefetch(reinterpret_cast<const char*>(addr), _MM_HINT_T0);
175 }
176
PrefetchToLocalCacheNta(const void * addr)177 ABSL_ATTRIBUTE_ALWAYS_INLINE inline void PrefetchToLocalCacheNta(
178 const void* addr) {
179 _mm_prefetch(reinterpret_cast<const char*>(addr), _MM_HINT_NTA);
180 }
181
PrefetchToLocalCacheForWrite(const void * addr)182 ABSL_ATTRIBUTE_ALWAYS_INLINE inline void PrefetchToLocalCacheForWrite(
183 const void* addr) {
184 #if defined(_MM_HINT_ET0)
185 _mm_prefetch(reinterpret_cast<const char*>(addr), _MM_HINT_ET0);
186 #elif !defined(_MSC_VER) && defined(__x86_64__)
187 // _MM_HINT_ET0 is not universally supported. As we commented further
188 // up, PREFETCHW is recognized as a no-op on older Intel processors
189 // and has been present on AMD processors since the K6-2. We have this
190 // disabled for MSVC compilers as this miscompiles on older MSVC compilers.
191 asm("prefetchw %0" : : "m"(*reinterpret_cast<const char*>(addr)));
192 #endif
193 }
194
195 #else
196
PrefetchToLocalCache(const void * addr)197 ABSL_ATTRIBUTE_ALWAYS_INLINE inline void PrefetchToLocalCache(
198 const void* addr) {}
PrefetchToLocalCacheNta(const void * addr)199 ABSL_ATTRIBUTE_ALWAYS_INLINE inline void PrefetchToLocalCacheNta(
200 const void* addr) {}
PrefetchToLocalCacheForWrite(const void * addr)201 ABSL_ATTRIBUTE_ALWAYS_INLINE inline void PrefetchToLocalCacheForWrite(
202 const void* addr) {}
203
204 #endif
205
206 ABSL_NAMESPACE_END
207 } // namespace absl
208
209 #endif // ABSL_BASE_PREFETCH_H_
210