// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14 
15 #![allow(missing_docs, unused_results, clippy::unwrap_used)]
16 
17 use criterion::{black_box, criterion_group, criterion_main, Criterion};
18 use handle_map::*;
19 use std::sync::Arc;
20 use std::thread;
21 use std::time::{Duration, Instant};
22 
23 // For the sake of these benchmarks, we just want
24 // to be able to allocate an almost-arbitrary number of handles
25 // if needed by the particular benchmark.
26 const MAX_ACTIVE_HANDLES: u32 = u32::MAX - 1;
27 
28 // The default number of threads+shards to use
29 // in multi-threaded benchmarks, chosen
30 // to represent a typical multi-threaded use-case.
31 const DEFAULT_SHARDS_AND_THREADS: u8 = 8;
32 
33 /// We're using u8's for the wrapped object type to deliberately ignore
34 /// costs related to accessing the underlying data - we just care about
35 /// benchmarking handle-map operations.
36 type BenchHandleMap = HandleMap<u8>;
37 
build_handle_map(num_shards: u8) -> BenchHandleMap38 fn build_handle_map(num_shards: u8) -> BenchHandleMap {
39     let dimensions = HandleMapDimensions {
40         num_shards,
41         max_active_handles: MAX_ACTIVE_HANDLES,
42     };
43     HandleMap::with_dimensions(dimensions)
44 }
45 
46 /// Benchmark for repeated reads on a single thread
single_threaded_read_benchmark(c: &mut Criterion)47 fn single_threaded_read_benchmark(c: &mut Criterion) {
48     // The number of shards should not matter much here.
49     let handle_map = build_handle_map(8);
50     // Pre-populate the map with a single handle.
51     let handle = handle_map.allocate(|| 0xFF).unwrap();
52     let handle_map_ref = &handle_map;
53     // Perform repeated reads
54     let _ = c.bench_function("single-threaded reads", |b| {
55         b.iter(|| {
56             let guard = handle_map_ref.get(black_box(handle)).unwrap();
57             black_box(*guard)
58         })
59     });
60 }
61 
62 /// Benchmark for repeated writes on a single thread
single_threaded_write_benchmark(c: &mut Criterion)63 fn single_threaded_write_benchmark(c: &mut Criterion) {
64     // The number of shards should not matter much here.
65     let handle_map = build_handle_map(8);
66     // Pre-populate the map with a single handle.
67     let handle = handle_map.allocate(|| 0xFF).unwrap();
68     let handle_map_ref = &handle_map;
69     // Perform repeated writes
70     c.bench_function("single-threaded writes", |b| {
71         b.iter(|| {
72             let mut guard = handle_map_ref.get_mut(black_box(handle)).unwrap();
73             *guard *= black_box(0);
74         })
75     });
76 }
77 
78 /// Benchmark for repeated allocations on a single thread
79 #[allow(unused_must_use)]
single_threaded_allocate_benchmark(c: &mut Criterion)80 fn single_threaded_allocate_benchmark(c: &mut Criterion) {
81     // The number of shards here is chosen to be quasi-reasonable.
82     // Realistically, performance will fluctuate somewhat with this
83     // due to the difference in the number of internal hash-map collisions,
84     // but ultimately we only care about the asymptotic behavior.
85     let handle_map = build_handle_map(8);
86     let handle_map_ref = &handle_map;
87     // Perform repeated allocations
88     c.bench_function("single-threaded allocations", |b| {
89         b.iter(|| {
90             handle_map_ref.allocate(|| black_box(0xEE));
91         })
92     });
93 }
94 
95 /// Benchmark for repeated allocation->deallocation on a single thread
96 /// starting from an empty map.
97 #[allow(unused_must_use)]
single_threaded_allocate_deallocate_near_empty_benchmark(c: &mut Criterion)98 fn single_threaded_allocate_deallocate_near_empty_benchmark(c: &mut Criterion) {
99     // The number of shards should not matter much here.
100     let handle_map = build_handle_map(8);
101     let handle_map_ref = &handle_map;
102     // Perform repeated allocation/deallocation pairs
103     c.bench_function(
104         "single-threaded allocate/deallocate pairs (empty init state)",
105         |b| {
106             b.iter(|| {
107                 let handle = handle_map_ref.allocate(|| black_box(0xDD)).unwrap();
108                 handle_map_ref.deallocate(handle);
109             })
110         },
111     );
112 }
113 
114 /// Benchmark for repeated allocation->deallocation starting
115 /// from a map with a thousand elements per shard.
116 #[allow(unused_must_use)]
single_threaded_allocate_deallocate_one_thousand_benchmark(c: &mut Criterion)117 fn single_threaded_allocate_deallocate_one_thousand_benchmark(c: &mut Criterion) {
118     let handle_map = build_handle_map(8);
119     for _ in 0..(8 * 1000) {
120         handle_map.allocate(|| black_box(0xFF)).unwrap();
121     }
122     let handle_map_ref = &handle_map;
123 
124     // Perform repeated allocation/deallocation pairs
125     c.bench_function(
126         "single-threaded allocate/deallocate pairs (init one thousand elements per shard)",
127         |b| {
128             b.iter(|| {
129                 let handle = handle_map_ref.allocate(|| black_box(0xDD)).unwrap();
130                 handle_map_ref.deallocate(handle);
131             })
132         },
133     );
134 }
135 
136 /// Benchmark for repeated allocation->deallocation starting
137 /// from a map with a million elements per shard.
138 #[allow(unused_must_use)]
single_threaded_allocate_deallocate_one_million_benchmark(c: &mut Criterion)139 fn single_threaded_allocate_deallocate_one_million_benchmark(c: &mut Criterion) {
140     let handle_map = build_handle_map(8);
141     for _ in 0..(8 * 1000000) {
142         handle_map.allocate(|| black_box(0xFF)).unwrap();
143     }
144     let handle_map_ref = &handle_map;
145 
146     // Perform repeated allocation/deallocation pairs
147     c.bench_function(
148         "single-threaded allocate/deallocate pairs (init one million elements per shard)",
149         |b| {
150             b.iter(|| {
151                 let handle = handle_map_ref.allocate(|| black_box(0xDD)).unwrap();
152                 handle_map_ref.deallocate(handle);
153             })
154         },
155     );
156 }
157 
158 /// Helper function for creating a multi-threaded benchmark
159 /// with the set number of threads. Untimed Initialization of state
160 /// should occur prior to this call. This method will repeatedly
161 /// execute the benchmark code on all threads
bench_multithreaded<F>(name: &str, num_threads: u8, benchable_fn_ref: Arc<F>, c: &mut Criterion) where F: Fn() + Send + Sync + 'static,162 fn bench_multithreaded<F>(name: &str, num_threads: u8, benchable_fn_ref: Arc<F>, c: &mut Criterion)
163 where
164     F: Fn() + Send + Sync + 'static,
165 {
166     c.bench_function(name, move |b| {
167         b.iter_custom(|iters| {
168             // The returned results from the threads will be their timings
169             let mut join_handles = Vec::new();
170             for _ in 0..num_threads {
171                 let benchable_fn_ref_clone = benchable_fn_ref.clone();
172                 let join_handle = thread::spawn(move || {
173                     let start = Instant::now();
174                     for _ in 0..iters {
175                         benchable_fn_ref_clone();
176                     }
177                     start.elapsed()
178                 });
179                 join_handles.push(join_handle);
180             }
181             //Threads spawned, now collect the results
182             let mut total_duration = Duration::from_nanos(0);
183             for join_handle in join_handles {
184                 let thread_duration = join_handle.join().unwrap();
185                 total_duration += thread_duration;
186             }
187             total_duration /= num_threads as u32;
188             total_duration
189         })
190     });
191 }
192 
/// Defines a number of reads/writes to perform [relative
/// to other operations performed in a given test]
#[derive(Debug, Clone, Copy)]
struct ReadWriteCount {
    // Number of reads to issue per object per cycle.
    num_reads: usize,
    // Number of writes to issue per object per cycle.
    num_writes: usize,
}
200 
/// 4 reads, no writes: a read-only workload.
const READS_ONLY: ReadWriteCount = ReadWriteCount {
    num_reads: 4,
    num_writes: 0,
};

/// 3 reads to every 1 write: a read-heavy workload.
const READ_LEANING: ReadWriteCount = ReadWriteCount {
    num_reads: 3,
    num_writes: 1,
};

/// Equal numbers of reads and writes.
const BALANCED: ReadWriteCount = ReadWriteCount {
    num_reads: 2,
    num_writes: 2,
};

/// 1 read to every 3 writes: a write-heavy workload.
const WRITE_LEANING: ReadWriteCount = ReadWriteCount {
    num_reads: 1,
    num_writes: 3,
};

/// No reads, 4 writes: a write-only workload.
const WRITES_ONLY: ReadWriteCount = ReadWriteCount {
    num_reads: 0,
    num_writes: 4,
};

/// All read/write mixes, ordered from read-only through write-only.
const READ_WRITE_COUNTS: [ReadWriteCount; 5] = [
    READS_ONLY,
    READ_LEANING,
    BALANCED,
    WRITE_LEANING,
    WRITES_ONLY,
];
233 
234 /// Benchmarks a repeated allocate/[X writes]/[Y reads]/deallocate workflow across
235 /// the default number of threads and shards.
lifecycle_read_and_write_multithreaded(per_object_rw_count: ReadWriteCount, c: &mut Criterion)236 fn lifecycle_read_and_write_multithreaded(per_object_rw_count: ReadWriteCount, c: &mut Criterion) {
237     let name = format!(
238         "lifecycle_read_and_write_multithreaded(reads/object: {}, writes/object:{})",
239         per_object_rw_count.num_reads, per_object_rw_count.num_writes
240     );
241     // We need an Arc to be able to pass this off to `bench_multithreaded`.
242     let handle_map = Arc::new(build_handle_map(DEFAULT_SHARDS_AND_THREADS));
243     bench_multithreaded(
244         &name,
245         DEFAULT_SHARDS_AND_THREADS,
246         Arc::new(move || {
247             let handle = handle_map.allocate(|| black_box(0xFF)).unwrap();
248             for _ in 0..per_object_rw_count.num_writes {
249                 let mut write_guard = handle_map.get_mut(handle).unwrap();
250                 *write_guard *= black_box(1);
251             }
252             for _ in 0..per_object_rw_count.num_reads {
253                 let read_guard = handle_map.get_mut(handle).unwrap();
254                 black_box(*read_guard);
255             }
256             handle_map.deallocate(handle).unwrap();
257         }),
258         c,
259     );
260 }
261 
lifecycle_benchmarks(c: &mut Criterion)262 fn lifecycle_benchmarks(c: &mut Criterion) {
263     for read_write_count in READ_WRITE_COUNTS {
264         // Using 8 separate shards to roughly match the number of active threads.
265         lifecycle_read_and_write_multithreaded(read_write_count, c);
266     }
267 }
268 
269 /// Benchmarks repeated cycles of accessing the given number of distinct objects, each
270 /// with the given number of reads and writes across the given number of threads.
read_and_write_contention_multithreaded( num_threads: u8, num_distinct_objects: u32, per_object_rw_count: ReadWriteCount, c: &mut Criterion, )271 fn read_and_write_contention_multithreaded(
272     num_threads: u8,
273     num_distinct_objects: u32,
274     per_object_rw_count: ReadWriteCount,
275     c: &mut Criterion,
276 ) {
277     let name = format!("read_and_write_contention_multithreaded(threads: {}, objects: {}, reads/object: {}, writes/object:{})",
278                        num_threads, num_distinct_objects, per_object_rw_count.num_reads, per_object_rw_count.num_writes);
279     // Default to 8 shards, ultimately doesn't matter much since we only ever access a few shards.
280     // This needs to be `'static` in order to pass a ref off to `bench_multithreaded`.
281     let handle_map = Arc::new(build_handle_map(8));
282     let mut handles = Vec::new();
283     for i in 0..num_distinct_objects {
284         let handle = handle_map.allocate(|| i as u8).unwrap();
285         handles.push(handle);
286     }
287 
288     bench_multithreaded(
289         &name,
290         num_threads,
291         Arc::new(move || {
292             // Deliberately performing all writes first, with
293             // the goal of getting staggered timings from
294             // all of the threads for subsequent iterations.
295             // We also perform reads and writes across all objects
296             // in batches to get something resembling uniform access.
297             for _ in 0..per_object_rw_count.num_writes {
298                 for handle in handles.iter() {
299                     let mut write_guard = handle_map.get_mut(*handle).unwrap();
300                     *write_guard *= black_box(1);
301                 }
302             }
303             for _ in 0..per_object_rw_count.num_reads {
304                 for handle in handles.iter() {
305                     let read_guard = handle_map.get(*handle).unwrap();
306                     black_box(*read_guard);
307                 }
308             }
309         }),
310         c,
311     );
312 }
313 
read_write_contention_benchmarks(c: &mut Criterion)314 fn read_write_contention_benchmarks(c: &mut Criterion) {
315     for num_threads in [2, 4, 8].iter() {
316         for num_distinct_objects in [1u32, 2u32, 4u32].iter() {
317             if num_distinct_objects >= &(*num_threads as u32) {
318                 continue;
319             }
320             for read_write_count in READ_WRITE_COUNTS {
321                 read_and_write_contention_multithreaded(
322                     *num_threads,
323                     *num_distinct_objects,
324                     read_write_count,
325                     c,
326                 );
327             }
328         }
329     }
330 }
331 
332 /// Benchmarks allocation (starting from empty) for the default number of shards and threads
allocator_contention_benchmark(c: &mut Criterion)333 fn allocator_contention_benchmark(c: &mut Criterion) {
334     let name = "allocator_contention_multithreaded";
335     // We need an Arc to pass this off to `bench_multithreaded`
336     let handle_map = Arc::new(build_handle_map(DEFAULT_SHARDS_AND_THREADS));
337     bench_multithreaded(
338         name,
339         DEFAULT_SHARDS_AND_THREADS,
340         Arc::new(move || {
341             handle_map.allocate(|| black_box(0xFF)).unwrap();
342         }),
343         c,
344     );
345 }
346 
// Register every benchmark above with Criterion and emit the harness
// entry point (`main`) that runs the whole group.
criterion_group!(
    benches,
    single_threaded_read_benchmark,
    single_threaded_write_benchmark,
    single_threaded_allocate_benchmark,
    single_threaded_allocate_deallocate_near_empty_benchmark,
    single_threaded_allocate_deallocate_one_thousand_benchmark,
    single_threaded_allocate_deallocate_one_million_benchmark,
    lifecycle_benchmarks,
    read_write_contention_benchmarks,
    allocator_contention_benchmark,
);
criterion_main!(benches);
360