// xref: /aosp_15_r20/external/XNNPACK/src/cache.c (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Copyright 2022 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include "xnnpack/cache.h"

#include <assert.h> // For assert.
#include <stdbool.h> // For bool.
#include <stddef.h> // For size_t.
#include <stdint.h> // For uint32_t.
#include <string.h> // For memcmp and memset.

#include "xnnpack.h"
#include "xnnpack/allocator.h"
#include "xnnpack/log.h"
#include "xnnpack/math.h"
#include "xnnpack/mutex.h"

#define XNN_CACHE_HASH_SEED 7
#define XNN_CACHE_INITIAL_BUCKETS 32
#define XNN_CACHE_MAX_LOAD 0.75
// The cache grows when the load factor (num_entries / num_buckets) would exceed
// 0.75 (3/4); the check uses the two integer multipliers below to avoid
// floating-point arithmetic.
#define XNN_CACHE_MAX_LOAD_ENTRIES_MULTIPLIER 4
#define XNN_CACHE_MAX_LOAD_BUCKETS_MULTIPLIER 3
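// For example, with the initial 32 buckets the table is doubled when an insert
// finds 25 entries already present, since 25 * 4 > 32 * 3 (load above 3/4).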
#define XNN_CACHE_GROWTH_FACTOR 2

// MurmurHash3 implementation, copied from smhasher, with minor modifications in
// style and main loop.

static inline uint32_t fmix32(uint32_t h)
{
  h ^= h >> 16;
  h *= UINT32_C(0x85EBCA6B);
  h ^= h >> 13;
  h *= UINT32_C(0xC2B2AE35);
  h ^= h >> 16;

  return h;
}

static uint32_t murmur_hash3(const void* key, size_t len, uint32_t seed)
{
  const uint8_t* data = (const uint8_t*) key;

  uint32_t h1 = seed;

  const uint32_t c1 = UINT32_C(0xCC9E2D51);
  const uint32_t c2 = UINT32_C(0x1B873593);

  const uint32_t* blocks = (const uint32_t*) data;
  for (; len >= sizeof(uint32_t); len -= sizeof(uint32_t)) {
    uint32_t k1 = *blocks++;

    k1 *= c1;
    k1 = math_rotl_u32(k1, 15);
    k1 *= c2;

    h1 ^= k1;
    h1 = math_rotl_u32(h1, 13);
    h1 = h1 * 5 + UINT32_C(0xE6546B64);
  }

  const uint8_t* tail = (const uint8_t*) blocks;

  uint32_t k1 = 0;

  switch (len & 3) {
    case 3:
      k1 ^= tail[2] << 16;
      // Fall through.
    case 2:
      k1 ^= tail[1] << 8;
      // Fall through.
    case 1:
      k1 ^= tail[0];
      k1 *= c1;
      k1 = math_rotl_u32(k1, 15);
      k1 *= c2;
      h1 ^= k1;
  }

  // Note: at this point len holds the remaining tail length (0-3) rather than
  // the total key length as in canonical MurmurHash3; this is one of the "main
  // loop" modifications noted above and is harmless for cache-internal hashing.
  h1 ^= len;

  return fmix32(h1);
}
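
// Within this file the hash is always taken over the cached bytes themselves
// with a fixed seed, e.g. murmur_hash3(ptr, size, XNN_CACHE_HASH_SEED); the
// 32-bit result selects the starting bucket (hash & mask) and is also stored in
// the bucket so that lookups can reject mismatches before comparing bytes.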

#ifndef NDEBUG
// This function is only used by an assert, so do not include it in non-debug
// builds.
static inline size_t cache_size(struct xnn_cache* cache) {
  switch (cache->type) {
    case xnn_cache_type_code:
      return cache->code.size;
    case xnn_cache_type_weights:
      return cache->weights.size;
    default:
      XNN_UNREACHABLE;
  }
  return SIZE_MAX;
}
#endif

static inline void* cache_start(struct xnn_cache* cache) {
  switch (cache->type) {
    case xnn_cache_type_code:
      return cache->code.start;
    case xnn_cache_type_weights:
      return cache->weights.start;
    default:
      XNN_UNREACHABLE;
  }
  return NULL;
}

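// Note: num_buckets is expected to be a power of two (the initial size is 32
// and growth always doubles it); lookup() relies on this to turn the hash into
// a bucket index with a simple mask.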
enum xnn_status xnn_init_cache_with_size(struct xnn_cache* cache, size_t num_buckets, enum xnn_cache_type cache_type)
{
  memset(cache, 0, sizeof(struct xnn_cache));
  cache->buckets = (struct xnn_cache_bucket*) xnn_allocate_zero_memory(num_buckets * sizeof(struct xnn_cache_bucket));
  if (cache->buckets == NULL) {
    xnn_log_error("failed to allocate memory for cache buckets");
    return xnn_status_out_of_memory;
  }

  cache->type = cache_type;
  cache->num_buckets = num_buckets;
  return xnn_status_success;
}

enum xnn_status xnn_init_code_cache_with_size(struct xnn_code_cache* cache, size_t num_buckets)
{
  memset(cache, 0, sizeof(struct xnn_code_cache));
  enum xnn_status status = xnn_status_success;
  status = xnn_init_cache_with_size(&cache->cache, num_buckets, xnn_cache_type_code);
  if (status != xnn_status_success) {
    goto error;
  }

  status = xnn_allocate_code_memory(&cache->cache.code, XNN_DEFAULT_CODE_BUFFER_SIZE);
  if (status != xnn_status_success) {
    goto error;
  }

  return xnn_status_success;

error:
  xnn_release_code_cache(cache);
  return status;
}

enum xnn_status xnn_init_code_cache(struct xnn_code_cache* cache)
{
  return xnn_init_code_cache_with_size(cache, XNN_CACHE_INITIAL_BUCKETS);
}

static bool cache_buckets_grow(struct xnn_cache* cache)
{
  const size_t new_num_buckets = cache->num_buckets * XNN_CACHE_GROWTH_FACTOR;
  assert(is_po2(new_num_buckets));
  struct xnn_cache tmp_cache;
  if (xnn_init_cache_with_size(&tmp_cache, new_num_buckets, cache->type) != xnn_status_success) {
    // Allocation of the larger bucket array failed; keep the old table intact.
    return false;
  }

  for (size_t i = 0; i < cache->num_buckets; i++) {
    struct xnn_cache_bucket b = cache->buckets[i];
    if (b.size == 0) {
      continue;
    }

    // Find the first empty slot by linear probing to insert. No need to check
    // hashes since we are not looking up anything, just moving things around
    // into a bigger hash table.
    const size_t mask = tmp_cache.num_buckets - 1;
    size_t idx = b.hash & mask;
    while (tmp_cache.buckets[idx].size != 0) {
      idx = (idx + 1) & mask;
    }
    tmp_cache.buckets[idx].hash = b.hash;
    tmp_cache.buckets[idx].size = b.size;
    tmp_cache.buckets[idx].offset = b.offset;
  }

  xnn_release_memory(cache->buckets);

  cache->buckets = tmp_cache.buckets;
  cache->num_buckets = tmp_cache.num_buckets;
  return true;
}

static inline bool bytes_equal(struct xnn_cache* cache, void* ptr, size_t size, size_t offset)
{
  return memcmp(ptr, (void*) ((uintptr_t) cache_start(cache) + offset), size) == 0;
}

static bool lookup(struct xnn_cache* cache, void* ptr, size_t size, uint32_t hash, size_t* index)
{
  assert(is_po2(cache->num_buckets));
  const size_t mask = cache->num_buckets - 1;
  size_t idx = hash & mask;
  const struct xnn_cache_bucket* buckets = cache->buckets;

  // Linear probing. The load factor is capped below 1, so an empty bucket is
  // always reached and the loop terminates.
  while (buckets[idx].size != 0 &&
         !(buckets[idx].hash == hash &&
           size == buckets[idx].size &&
           bytes_equal(cache, ptr, buckets[idx].size, buckets[idx].offset))) {
    idx = (idx + 1) & mask;
  }
  *index = idx;
  // A non-empty bucket at this point is a match; an empty one means not found.
  return buckets[idx].size != 0;
}

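// Inserts the entry described by ptr/size into the hash table. ptr must already
// point into the cache's backing buffer (see the asserts below): only the
// bucket metadata (hash, size, offset) is recorded here, the bytes themselves
// are not copied. Returns false if an identical entry is already cached or if
// the table needs to grow and cannot.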
static bool insert(struct xnn_cache* cache, void* ptr, size_t size)
{
  const uint32_t hash = murmur_hash3(ptr, size, /*seed=*/XNN_CACHE_HASH_SEED);
  size_t idx;
  const bool found = lookup(cache, ptr, size, hash, &idx);
  if (found) {
    return false;
  }

  // Ensure we have enough buckets to keep under our load limit.
  if (cache->num_entries * XNN_CACHE_MAX_LOAD_ENTRIES_MULTIPLIER >
      cache->num_buckets * XNN_CACHE_MAX_LOAD_BUCKETS_MULTIPLIER) {
    if (!cache_buckets_grow(cache)) {
      // Can't grow the hash table anymore.
      xnn_log_error("failed to grow cache buckets");
      return false;
    }
    xnn_log_debug("successfully grew cache buckets");

    // If the cache grew, idx is stale, since it was computed against the old
    // cache's num_buckets.
    const bool found_in_grown_cache = lookup(cache, ptr, size, hash, &idx);
    assert(!found_in_grown_cache);
    (void) found_in_grown_cache;  // Silence unused variable warnings.
  }

  // Check that ptr points into the cache's buffer.
  assert((uintptr_t) ptr >= (uintptr_t) cache_start(cache));
  if (cache->type == xnn_cache_type_code) {
    assert((uintptr_t) ptr < (uintptr_t) cache_start(cache) + cache_size(cache));
  }

  const size_t offset = (uintptr_t) ptr - (uintptr_t) cache_start(cache);

  // Insert the entry.
  cache->buckets[idx].size = size;
  cache->buckets[idx].hash = hash;
  cache->buckets[idx].offset = offset;
  cache->num_entries++;
  return true;
}

// Checks if the given bytes (generated microkernel or packed weights) are
// already in the cache; returns the offset if found, XNN_CACHE_NOT_FOUND
// otherwise.
static size_t lookup_cache(struct xnn_cache* cache, void* ptr, size_t size)
{
  const uint32_t hash = murmur_hash3(ptr, size, /*seed=*/XNN_CACHE_HASH_SEED);
  size_t bucket_idx;
  if (lookup(cache, ptr, size, hash, &bucket_idx)) {
    cache->hits++;
    return cache->buckets[bucket_idx].offset;
  } else {
    cache->misses++;
    return XNN_CACHE_NOT_FOUND;
  }
}

size_t xnn_get_or_insert_cache(struct xnn_cache* cache, void* ptr, size_t size)
{
  const size_t found_offset = lookup_cache(cache, ptr, size);
  if (found_offset != XNN_CACHE_NOT_FOUND) {
    if (cache->type == xnn_cache_type_code) {
      // Found in the cache; rewind the buffer, because code generators have
      // already bumped the buffer size for the (duplicate) generated code.
      cache->code.size -= size;
    }
    return found_offset;
  }

  if (cache->type == xnn_cache_type_weights) {
    // Cache miss: weights packing functions don't update the buffer size, so
    // update it here.
    cache->weights.size += size;
  }

  const size_t offset = (uintptr_t) ptr - (uintptr_t) cache_start(cache);
  if (!insert(cache, ptr, size)) {
    return XNN_CACHE_NOT_FOUND;
  }
  return offset;
}

size_t xnn_get_or_insert_code_cache(struct xnn_code_cache* cache, void* ptr, size_t size)
{
  return xnn_get_or_insert_cache(&cache->cache, ptr, size);
}

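// Informal usage sketch (not part of the public contract): a caller typically
// initializes the cache once with xnn_init_code_cache(), emits a generated
// microkernel directly into cache->cache.code (which bumps code.size), and then
// calls xnn_get_or_insert_code_cache() with a pointer to the freshly emitted
// code. On a hit the code buffer is rewound so duplicate kernels do not
// accumulate; either way the returned offset identifies the canonical copy.
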
enum xnn_status xnn_release_code_cache(struct xnn_code_cache* cache)
{
  if XNN_LIKELY(cache != NULL) {
    assert(cache->cache.type == xnn_cache_type_code);
    xnn_release_code_memory(&cache->cache.code);
    xnn_release_memory(cache->cache.buckets);
  }
  return xnn_status_success;
}

enum xnn_status xnn_internal_init_weights_cache(
  struct xnn_weights_cache* cache,
  size_t num_buckets,
  size_t buffer_size)
{
  memset(cache, 0, sizeof(struct xnn_weights_cache));

  enum xnn_status status = xnn_status_success;
  status = xnn_init_cache_with_size(&cache->cache, num_buckets, xnn_cache_type_weights);
  if (status != xnn_status_success) {
    goto error;
  }

  status = xnn_allocate_weights_memory(&cache->cache.weights, buffer_size);
  if (status != xnn_status_success) {
    goto error;
  }

  status = xnn_mutex_init(&cache->mutex);
  if (status != xnn_status_success) {
    goto error;
  }

  return xnn_status_success;

error:
  xnn_release_weights_cache(cache);
  return status;
}

enum xnn_status xnn_init_weights_cache_with_size(struct xnn_weights_cache* cache, size_t size)
{
  return xnn_internal_init_weights_cache(cache, XNN_CACHE_INITIAL_BUCKETS, size);
}

enum xnn_status xnn_init_weights_cache(struct xnn_weights_cache* cache)
{
  return xnn_init_weights_cache_with_size(cache, XNN_DEFAULT_WEIGHTS_BUFFER_SIZE);
}

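// Finalization comes in two flavors. Hard finalization releases the hash table
// buckets and finalizes the weights memory, after which no further inserts or
// cache probes are possible. Soft finalization keeps the hash table and
// reserves headroom equal to the largest cached weights, so that later cache
// probes can still write candidate packed weights into the buffer without
// growing or moving it.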
enum xnn_status xnn_finalize_weights_cache(
  struct xnn_weights_cache* cache,
  enum xnn_weights_cache_finalization_kind finalization_kind)
{
  switch (cache->finalization_state) {
    case xnn_cache_state_hard_finalized:
    case xnn_cache_state_soft_finalized:
      xnn_log_error("failed to finalize an already finalized weights cache");
      return xnn_status_invalid_state;
    case xnn_cache_state_not_finalized: {
      enum xnn_status status;
      enum xnn_cache_state finalized_state;

      if (finalization_kind == xnn_weights_cache_finalization_kind_hard) {
        xnn_log_debug("hard finalizing weights cache");
        status = xnn_finalize_weights_memory(&cache->cache.weights);
        // Also release the memory used by the hash table (but not the weights memory).
        xnn_release_memory(cache->cache.buckets);
        cache->cache.buckets = NULL;
        finalized_state = xnn_cache_state_hard_finalized;
      } else {
        xnn_log_debug("soft finalizing weights cache");
        assert(finalization_kind == xnn_weights_cache_finalization_kind_soft);
        // Finalize the weights cache by reserving sufficient space for the insertion of the largest cached weights.
        // This ensures that we have space to write packed weights to check for cache hits without growing and moving
        // the memory. This has some memory overhead, which can be as large as the size of the largest cached weights,
        // rounded up to page size.
        status = xnn_reserve_weights_memory(&cache->cache.weights, cache->max_weights_size);
        finalized_state = xnn_cache_state_soft_finalized;
      }
      if (status != xnn_status_success) {
        xnn_log_error("failed to finalize weights cache memory");
        return xnn_status_invalid_state;
      }

      cache->finalization_state = finalized_state;
      return xnn_status_success;
    }
  }
}

enum xnn_status xnn_release_weights_cache(struct xnn_weights_cache* cache)
{
  if XNN_LIKELY(cache != NULL) {
    assert(cache->cache.type == xnn_cache_type_weights);
    xnn_release_weights_memory(&cache->cache.weights);
    if (cache->cache.buckets != NULL) {
      xnn_release_memory(cache->cache.buckets);
    }
    const enum xnn_status status = xnn_mutex_destroy(&cache->mutex);
    if (status != xnn_status_success) {
      return status;
    }
  }
  return xnn_status_success;
}

static inline bool cache_has_space(struct xnn_weights_cache* cache, size_t n)
{
  const struct xnn_weights_buffer buf = cache->cache.weights;
  return buf.size + n <= buf.capacity;
}

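// Reserves space for n bytes of packed weights and returns a pointer to the end
// of the currently used buffer where they can be written. On success the cache
// mutex is left locked; the matching unlock happens in
// xnn_get_or_insert_weights_cache, which callers are expected to invoke
// afterwards (see the comments there). Returns NULL on failure, in which case
// the mutex is not held.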
void* xnn_reserve_space_in_weights_cache(struct xnn_weights_cache* cache, size_t n) {
  switch (cache->finalization_state) {
    case xnn_cache_state_hard_finalized:
      xnn_log_error("cannot reserve additional space in a finalized compact weights cache");
      return NULL;
    case xnn_cache_state_soft_finalized:
      if (!cache_has_space(cache, n)) {
        xnn_log_error("cannot reserve additional space in a finalized weights cache");
        return NULL;
      }
      // If the cache is finalized, and has space for `n` bytes, we still want to lock the mutex, because we can have
      // multiple writers attempting to write to this space.
      break;
    default:
      break;
  }

  enum xnn_status status = xnn_mutex_lock(&cache->mutex);
  if (status != xnn_status_success) {
    return NULL;
  }

  struct xnn_weights_buffer* buffer = &cache->cache.weights;
  status = xnn_reserve_weights_memory(buffer, n);
  if (status != xnn_status_success) {
    xnn_mutex_unlock(&cache->mutex);
    return NULL;
  }

  return (void*) ((uintptr_t) buffer->start + buffer->size);
}

size_t xnn_get_or_insert_weights_cache(struct xnn_weights_cache* cache, void* ptr, size_t size)
{
  size_t offset = XNN_CACHE_NOT_FOUND;

  switch (cache->finalization_state) {
    case xnn_cache_state_hard_finalized: {
      xnn_log_error("cannot insert into a finalized compact weights cache");
      return XNN_CACHE_NOT_FOUND;
    }
    case xnn_cache_state_soft_finalized: {
      // Inserting into a finalized weights cache is okay as long as:
      // 1. there is sufficient space in the memory (to write the incoming packed weights), or
      // 2. the incoming packed weights are already in the cache.
      if (!cache_has_space(cache, size)) {
        xnn_log_error("insufficient extra space in finalized weights cache buffer");
        return XNN_CACHE_NOT_FOUND;
      }

      // From this point onwards we need to release the mutex, because xnn_reserve_space_in_weights_cache would have
      // returned non-NULL (which means that it locked the mutex).
      const size_t found_offset = lookup_cache(&cache->cache, ptr, size);
      if (found_offset == XNN_CACHE_NOT_FOUND) {
        xnn_log_error("packed weights not found in finalized weights cache");
      }

      offset = found_offset;
      break;
    }
    case xnn_cache_state_not_finalized: {
      offset = xnn_get_or_insert_cache(&cache->cache, ptr, size);
      if (offset != XNN_CACHE_NOT_FOUND) {
        // Found or inserted packed weights; update the largest size seen so far. This will be used when finalizing the
        // weights cache, to ensure there is extra space at the end for future cache checks.
        cache->max_weights_size = max(size, cache->max_weights_size);
      }
      break;
    }
  }

  // The mutex was locked in xnn_reserve_space_in_weights_cache when it returned non-NULL, i.e. when the cache is not
  // finalized, or when it is xnn_cache_state_soft_finalized and has sufficient space.
  const enum xnn_status status = xnn_mutex_unlock(&cache->mutex);
  (void) status;
  assert(status == xnn_status_success);
  return offset;
}

bool xnn_weights_cache_is_finalized(struct xnn_weights_cache* cache) {
  return cache->finalization_state != xnn_cache_state_not_finalized;
}
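
// Informal end-to-end sketch of the weights-cache API in this file, assuming a
// hypothetical packing helper pack_weights() that writes `size` bytes into the
// reserved space (not part of this file):
//
//   struct xnn_weights_cache cache;
//   if (xnn_init_weights_cache(&cache) != xnn_status_success) { /* handle */ }
//
//   // Per operator: reserve space, pack into it, then deduplicate.
//   void* reserved = xnn_reserve_space_in_weights_cache(&cache, size);
//   if (reserved != NULL) {  // The cache mutex is held from here...
//     pack_weights(reserved, size);  // hypothetical packing step
//     size_t offset = xnn_get_or_insert_weights_cache(&cache, reserved, size);
//     // ...and released by xnn_get_or_insert_weights_cache(). `offset` locates
//     // the canonical packed weights inside cache.cache.weights.start.
//   }
//
//   // Once all operators are created, finalize (soft keeps the table usable).
//   xnn_finalize_weights_cache(&cache, xnn_weights_cache_finalization_kind_soft);
//   ...
//   xnn_release_weights_cache(&cache);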
491