1 // Copyright 2023 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #![allow(missing_docs, unused_results, clippy::unwrap_used)]
16
17 use criterion::{black_box, criterion_group, criterion_main, Criterion};
18 use handle_map::*;
19 use std::sync::Arc;
20 use std::thread;
21 use std::time::{Duration, Instant};
22
/// Effectively-unbounded handle budget: benchmarks decide for themselves
/// how many handles they actually allocate, so the map should never be
/// the limiting factor.
const MAX_ACTIVE_HANDLES: u32 = u32::MAX - 1;

/// The default number of threads+shards to use
/// in multi-threaded benchmarks, chosen
/// to represent a typical multi-threaded use-case.
const DEFAULT_SHARDS_AND_THREADS: u8 = 8;

/// We're using u8's for the wrapped object type to deliberately ignore
/// costs related to accessing the underlying data - we just care about
/// benchmarking handle-map operations.
type BenchHandleMap = HandleMap<u8>;
37
build_handle_map(num_shards: u8) -> BenchHandleMap38 fn build_handle_map(num_shards: u8) -> BenchHandleMap {
39 let dimensions = HandleMapDimensions {
40 num_shards,
41 max_active_handles: MAX_ACTIVE_HANDLES,
42 };
43 HandleMap::with_dimensions(dimensions)
44 }
45
46 /// Benchmark for repeated reads on a single thread
single_threaded_read_benchmark(c: &mut Criterion)47 fn single_threaded_read_benchmark(c: &mut Criterion) {
48 // The number of shards should not matter much here.
49 let handle_map = build_handle_map(8);
50 // Pre-populate the map with a single handle.
51 let handle = handle_map.allocate(|| 0xFF).unwrap();
52 let handle_map_ref = &handle_map;
53 // Perform repeated reads
54 let _ = c.bench_function("single-threaded reads", |b| {
55 b.iter(|| {
56 let guard = handle_map_ref.get(black_box(handle)).unwrap();
57 black_box(*guard)
58 })
59 });
60 }
61
62 /// Benchmark for repeated writes on a single thread
single_threaded_write_benchmark(c: &mut Criterion)63 fn single_threaded_write_benchmark(c: &mut Criterion) {
64 // The number of shards should not matter much here.
65 let handle_map = build_handle_map(8);
66 // Pre-populate the map with a single handle.
67 let handle = handle_map.allocate(|| 0xFF).unwrap();
68 let handle_map_ref = &handle_map;
69 // Perform repeated writes
70 c.bench_function("single-threaded writes", |b| {
71 b.iter(|| {
72 let mut guard = handle_map_ref.get_mut(black_box(handle)).unwrap();
73 *guard *= black_box(0);
74 })
75 });
76 }
77
78 /// Benchmark for repeated allocations on a single thread
79 #[allow(unused_must_use)]
single_threaded_allocate_benchmark(c: &mut Criterion)80 fn single_threaded_allocate_benchmark(c: &mut Criterion) {
81 // The number of shards here is chosen to be quasi-reasonable.
82 // Realistically, performance will fluctuate somewhat with this
83 // due to the difference in the number of internal hash-map collisions,
84 // but ultimately we only care about the asymptotic behavior.
85 let handle_map = build_handle_map(8);
86 let handle_map_ref = &handle_map;
87 // Perform repeated allocations
88 c.bench_function("single-threaded allocations", |b| {
89 b.iter(|| {
90 handle_map_ref.allocate(|| black_box(0xEE));
91 })
92 });
93 }
94
95 /// Benchmark for repeated allocation->deallocation on a single thread
96 /// starting from an empty map.
97 #[allow(unused_must_use)]
single_threaded_allocate_deallocate_near_empty_benchmark(c: &mut Criterion)98 fn single_threaded_allocate_deallocate_near_empty_benchmark(c: &mut Criterion) {
99 // The number of shards should not matter much here.
100 let handle_map = build_handle_map(8);
101 let handle_map_ref = &handle_map;
102 // Perform repeated allocation/deallocation pairs
103 c.bench_function(
104 "single-threaded allocate/deallocate pairs (empty init state)",
105 |b| {
106 b.iter(|| {
107 let handle = handle_map_ref.allocate(|| black_box(0xDD)).unwrap();
108 handle_map_ref.deallocate(handle);
109 })
110 },
111 );
112 }
113
114 /// Benchmark for repeated allocation->deallocation starting
115 /// from a map with a thousand elements per shard.
116 #[allow(unused_must_use)]
single_threaded_allocate_deallocate_one_thousand_benchmark(c: &mut Criterion)117 fn single_threaded_allocate_deallocate_one_thousand_benchmark(c: &mut Criterion) {
118 let handle_map = build_handle_map(8);
119 for _ in 0..(8 * 1000) {
120 handle_map.allocate(|| black_box(0xFF)).unwrap();
121 }
122 let handle_map_ref = &handle_map;
123
124 // Perform repeated allocation/deallocation pairs
125 c.bench_function(
126 "single-threaded allocate/deallocate pairs (init one thousand elements per shard)",
127 |b| {
128 b.iter(|| {
129 let handle = handle_map_ref.allocate(|| black_box(0xDD)).unwrap();
130 handle_map_ref.deallocate(handle);
131 })
132 },
133 );
134 }
135
136 /// Benchmark for repeated allocation->deallocation starting
137 /// from a map with a million elements per shard.
138 #[allow(unused_must_use)]
single_threaded_allocate_deallocate_one_million_benchmark(c: &mut Criterion)139 fn single_threaded_allocate_deallocate_one_million_benchmark(c: &mut Criterion) {
140 let handle_map = build_handle_map(8);
141 for _ in 0..(8 * 1000000) {
142 handle_map.allocate(|| black_box(0xFF)).unwrap();
143 }
144 let handle_map_ref = &handle_map;
145
146 // Perform repeated allocation/deallocation pairs
147 c.bench_function(
148 "single-threaded allocate/deallocate pairs (init one million elements per shard)",
149 |b| {
150 b.iter(|| {
151 let handle = handle_map_ref.allocate(|| black_box(0xDD)).unwrap();
152 handle_map_ref.deallocate(handle);
153 })
154 },
155 );
156 }
157
158 /// Helper function for creating a multi-threaded benchmark
159 /// with the set number of threads. Untimed Initialization of state
160 /// should occur prior to this call. This method will repeatedly
161 /// execute the benchmark code on all threads
bench_multithreaded<F>(name: &str, num_threads: u8, benchable_fn_ref: Arc<F>, c: &mut Criterion) where F: Fn() + Send + Sync + 'static,162 fn bench_multithreaded<F>(name: &str, num_threads: u8, benchable_fn_ref: Arc<F>, c: &mut Criterion)
163 where
164 F: Fn() + Send + Sync + 'static,
165 {
166 c.bench_function(name, move |b| {
167 b.iter_custom(|iters| {
168 // The returned results from the threads will be their timings
169 let mut join_handles = Vec::new();
170 for _ in 0..num_threads {
171 let benchable_fn_ref_clone = benchable_fn_ref.clone();
172 let join_handle = thread::spawn(move || {
173 let start = Instant::now();
174 for _ in 0..iters {
175 benchable_fn_ref_clone();
176 }
177 start.elapsed()
178 });
179 join_handles.push(join_handle);
180 }
181 //Threads spawned, now collect the results
182 let mut total_duration = Duration::from_nanos(0);
183 for join_handle in join_handles {
184 let thread_duration = join_handle.join().unwrap();
185 total_duration += thread_duration;
186 }
187 total_duration /= num_threads as u32;
188 total_duration
189 })
190 });
191 }
192
/// Defines a number of reads/writes to perform [relative
/// to other operations performed in a given test]
#[derive(Debug, Clone, Copy)]
struct ReadWriteCount {
    // How many reads to perform per object in one benchmark cycle.
    num_reads: usize,
    // How many writes to perform per object in one benchmark cycle.
    num_writes: usize,
}
200
// Read/write mixes spanning the spectrum from read-only to write-only.
// Each mix performs four total operations per object so the different
// mixes are directly comparable.
const READS_ONLY: ReadWriteCount = ReadWriteCount {
    num_reads: 4,
    num_writes: 0,
};

const READ_LEANING: ReadWriteCount = ReadWriteCount {
    num_reads: 3,
    num_writes: 1,
};

const BALANCED: ReadWriteCount = ReadWriteCount {
    num_reads: 2,
    num_writes: 2,
};

const WRITE_LEANING: ReadWriteCount = ReadWriteCount {
    num_reads: 1,
    num_writes: 3,
};

const WRITES_ONLY: ReadWriteCount = ReadWriteCount {
    num_reads: 0,
    num_writes: 4,
};

// All mixes, in read-heaviest to write-heaviest order, for benchmarks
// that sweep the full read/write spectrum.
const READ_WRITE_COUNTS: [ReadWriteCount; 5] = [
    READS_ONLY,
    READ_LEANING,
    BALANCED,
    WRITE_LEANING,
    WRITES_ONLY,
];
233
234 /// Benchmarks a repeated allocate/[X writes]/[Y reads]/deallocate workflow across
235 /// the default number of threads and shards.
lifecycle_read_and_write_multithreaded(per_object_rw_count: ReadWriteCount, c: &mut Criterion)236 fn lifecycle_read_and_write_multithreaded(per_object_rw_count: ReadWriteCount, c: &mut Criterion) {
237 let name = format!(
238 "lifecycle_read_and_write_multithreaded(reads/object: {}, writes/object:{})",
239 per_object_rw_count.num_reads, per_object_rw_count.num_writes
240 );
241 // We need an Arc to be able to pass this off to `bench_multithreaded`.
242 let handle_map = Arc::new(build_handle_map(DEFAULT_SHARDS_AND_THREADS));
243 bench_multithreaded(
244 &name,
245 DEFAULT_SHARDS_AND_THREADS,
246 Arc::new(move || {
247 let handle = handle_map.allocate(|| black_box(0xFF)).unwrap();
248 for _ in 0..per_object_rw_count.num_writes {
249 let mut write_guard = handle_map.get_mut(handle).unwrap();
250 *write_guard *= black_box(1);
251 }
252 for _ in 0..per_object_rw_count.num_reads {
253 let read_guard = handle_map.get_mut(handle).unwrap();
254 black_box(*read_guard);
255 }
256 handle_map.deallocate(handle).unwrap();
257 }),
258 c,
259 );
260 }
261
lifecycle_benchmarks(c: &mut Criterion)262 fn lifecycle_benchmarks(c: &mut Criterion) {
263 for read_write_count in READ_WRITE_COUNTS {
264 // Using 8 separate shards to roughly match the number of active threads.
265 lifecycle_read_and_write_multithreaded(read_write_count, c);
266 }
267 }
268
269 /// Benchmarks repeated cycles of accessing the given number of distinct objects, each
270 /// with the given number of reads and writes across the given number of threads.
read_and_write_contention_multithreaded( num_threads: u8, num_distinct_objects: u32, per_object_rw_count: ReadWriteCount, c: &mut Criterion, )271 fn read_and_write_contention_multithreaded(
272 num_threads: u8,
273 num_distinct_objects: u32,
274 per_object_rw_count: ReadWriteCount,
275 c: &mut Criterion,
276 ) {
277 let name = format!("read_and_write_contention_multithreaded(threads: {}, objects: {}, reads/object: {}, writes/object:{})",
278 num_threads, num_distinct_objects, per_object_rw_count.num_reads, per_object_rw_count.num_writes);
279 // Default to 8 shards, ultimately doesn't matter much since we only ever access a few shards.
280 // This needs to be `'static` in order to pass a ref off to `bench_multithreaded`.
281 let handle_map = Arc::new(build_handle_map(8));
282 let mut handles = Vec::new();
283 for i in 0..num_distinct_objects {
284 let handle = handle_map.allocate(|| i as u8).unwrap();
285 handles.push(handle);
286 }
287
288 bench_multithreaded(
289 &name,
290 num_threads,
291 Arc::new(move || {
292 // Deliberately performing all writes first, with
293 // the goal of getting staggered timings from
294 // all of the threads for subsequent iterations.
295 // We also perform reads and writes across all objects
296 // in batches to get something resembling uniform access.
297 for _ in 0..per_object_rw_count.num_writes {
298 for handle in handles.iter() {
299 let mut write_guard = handle_map.get_mut(*handle).unwrap();
300 *write_guard *= black_box(1);
301 }
302 }
303 for _ in 0..per_object_rw_count.num_reads {
304 for handle in handles.iter() {
305 let read_guard = handle_map.get(*handle).unwrap();
306 black_box(*read_guard);
307 }
308 }
309 }),
310 c,
311 );
312 }
313
read_write_contention_benchmarks(c: &mut Criterion)314 fn read_write_contention_benchmarks(c: &mut Criterion) {
315 for num_threads in [2, 4, 8].iter() {
316 for num_distinct_objects in [1u32, 2u32, 4u32].iter() {
317 if num_distinct_objects >= &(*num_threads as u32) {
318 continue;
319 }
320 for read_write_count in READ_WRITE_COUNTS {
321 read_and_write_contention_multithreaded(
322 *num_threads,
323 *num_distinct_objects,
324 read_write_count,
325 c,
326 );
327 }
328 }
329 }
330 }
331
332 /// Benchmarks allocation (starting from empty) for the default number of shards and threads
allocator_contention_benchmark(c: &mut Criterion)333 fn allocator_contention_benchmark(c: &mut Criterion) {
334 let name = "allocator_contention_multithreaded";
335 // We need an Arc to pass this off to `bench_multithreaded`
336 let handle_map = Arc::new(build_handle_map(DEFAULT_SHARDS_AND_THREADS));
337 bench_multithreaded(
338 name,
339 DEFAULT_SHARDS_AND_THREADS,
340 Arc::new(move || {
341 handle_map.allocate(|| black_box(0xFF)).unwrap();
342 }),
343 c,
344 );
345 }
346
// Register every benchmark with criterion; `criterion_main!` generates the
// binary's `main` entry point that runs the whole group.
criterion_group!(
    benches,
    single_threaded_read_benchmark,
    single_threaded_write_benchmark,
    single_threaded_allocate_benchmark,
    single_threaded_allocate_deallocate_near_empty_benchmark,
    single_threaded_allocate_deallocate_one_thousand_benchmark,
    single_threaded_allocate_deallocate_one_million_benchmark,
    lifecycle_benchmarks,
    read_write_contention_benchmarks,
    allocator_contention_benchmark,
);
criterion_main!(benches);
360