xref: /aosp_15_r20/external/setfilters/java/com/google/setfilters/cuckoofilter/CuckooFilterConfig.java (revision f53908e1379d754ac0a89dcba1e5f85b29df130f)
1 // Copyright 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 package com.google.setfilters.cuckoofilter;
16 
17 import static com.google.common.base.Preconditions.checkArgument;
18 
19 import com.google.common.collect.ImmutableMap;
20 import com.google.common.hash.Funnel;
21 import com.google.common.hash.HashCode;
22 import com.google.errorprone.annotations.CanIgnoreReturnValue;
23 import java.util.Map;
24 
25 /**
26  * Specification for the cuckoo filter.
27  *
28  * <p>This class is immutable.
29  */
30 // TODO: Handle serialization.
31 public final class CuckooFilterConfig {
32   private final Size size;
33   private final HashFunction hashFunction;
34   private final Strategy strategy;
35   private final boolean useSpaceOptimization;
36 
CuckooFilterConfig( Size size, HashFunction hashFunction, Strategy strategy, boolean useSpaceOptimization)37   private CuckooFilterConfig(
38       Size size, HashFunction hashFunction, Strategy strategy, boolean useSpaceOptimization) {
39     this.size = size;
40     this.hashFunction = hashFunction;
41     this.strategy = strategy;
42     this.useSpaceOptimization = useSpaceOptimization;
43   }
44 
size()45   public Size size() {
46     return size;
47   }
48 
hashFunction()49   public HashFunction hashFunction() {
50     return hashFunction;
51   }
52 
strategy()53   public Strategy strategy() {
54     return strategy;
55   }
56 
useSpaceOptimization()57   public boolean useSpaceOptimization() {
58     return useSpaceOptimization;
59   }
60 
newBuilder()61   public static Builder newBuilder() {
62     return new Builder();
63   }
64 
65   /** Builder for the {@link CuckooFilterConfig}. */
66   public static class Builder {
67     private Size size;
68     private HashFunction hashFunction;
69     private Strategy strategy;
70     private boolean useSpaceOptimization;
71 
Builder()72     private Builder() {}
73 
74     @CanIgnoreReturnValue
setSize(Size size)75     public Builder setSize(Size size) {
76       this.size = size;
77       return this;
78     }
79 
80     @CanIgnoreReturnValue
setHashFunction(HashFunction hashFunction)81     public Builder setHashFunction(HashFunction hashFunction) {
82       this.hashFunction = hashFunction;
83       return this;
84     }
85 
86     @CanIgnoreReturnValue
setStrategy(Strategy strategy)87     public Builder setStrategy(Strategy strategy) {
88       this.strategy = strategy;
89       return this;
90     }
91 
92     /**
93      * Whether to use space optimized filter representation (if possible).
94      *
95      * <p>Setting this field to {@code true} does not guarantee the optimization algorithm to always
96      * apply - it is best effort.
97      *
98      * <p>In general, using this may result in slower filter operations, and incurs an additional
99      * fixed space overhead. Thus, it is possible for the "optimized" version of the filter to
100      * actually take more space than the non optimized one.
101      */
102     @CanIgnoreReturnValue
setUseSpaceOptimization(boolean useSpaceOptimization)103     public Builder setUseSpaceOptimization(boolean useSpaceOptimization) {
104       this.useSpaceOptimization = useSpaceOptimization;
105       return this;
106     }
107 
108     /**
109      * Builds {@link CuckooFilterConfig}.
110      *
111      * @throws IllegalArgumentException if the required parameters are not set.
112      */
build()113     public CuckooFilterConfig build() {
114       checkArgument(size != null, "Size must be set.");
115       checkArgument(hashFunction != null, "Hash function must be set.");
116       checkArgument(strategy != null, "Strategy must be set.");
117 
118       return new CuckooFilterConfig(size, hashFunction, strategy, useSpaceOptimization);
119     }
120   }
121 
122   /**
123    * Specification of the cuckoo filter size.
124    *
125    * <p>A cuckoo filter's size can be defined as a tuple (bucketCount, bucketCapacity,
126    * fingeprintLength); this means that there are bucketCount number of buckets, where each bucket
127    * can store up to bucketCapacity fingerprints, and each fingerprint is of length
128    * fingerprintLength bits.
129    *
130    * <p>All fields are required and must be set explicitly.
131    *
132    * <p>This class is immutable.
133    */
134   public static class Size {
135     private static final int MAX_BUCKET_CAPACITY = 128;
136     private static final int MAX_FINGERPRINT_LENGTH = 64;
137     /** Empirical load by the bucket capacity. */
138     private static final ImmutableMap<Integer, Double> APPROX_LOAD_BY_BUCKET_CAPACITY =
139         ImmutableMap.<Integer, Double>builder()
140             .put(2, 0.85)
141             .put(3, 0.91)
142             .put(4, 0.95)
143             .put(5, 0.96)
144             .put(6, 0.97)
145             .put(7, 0.98)
146             .put(8, 0.98)
147             .buildOrThrow();
148 
149     private final int bucketCount;
150     private final int bucketCapacity;
151     private final int fingerprintLength;
152 
Size(int bucketCount, int bucketCapacity, int fingerprintLength)153     private Size(int bucketCount, int bucketCapacity, int fingerprintLength) {
154       this.bucketCount = bucketCount;
155       this.bucketCapacity = bucketCapacity;
156       this.fingerprintLength = fingerprintLength;
157     }
158 
159     /**
160      * Automatically computes a reasonably efficient cuckoo filter {@link Size} that ensures (with
161      * high probability) storing up to {@code elementsCountUpperBound} elements (with high
162      * probability) with the given {@code targetFalsePositiveRate}.
163      *
164      * @throws IllegalArgumentException if {@code targetFalsePositiveRate} is not in range [0, 1] or
165      *     {@code elementsCountUpperBound} is <= 0, or a suitable cuckoo filter size could not be
166      *     computed based on the given input.
167      */
computeEfficientSize( double targetFalsePositiveRate, long elementsCountUpperBound)168     public static Size computeEfficientSize(
169         double targetFalsePositiveRate, long elementsCountUpperBound) {
170       checkArgument(
171           0 < targetFalsePositiveRate && targetFalsePositiveRate < 1,
172           "targetFalsePositiveRate must be in range (0, 1): %s given.",
173           targetFalsePositiveRate);
174       checkArgument(
175           elementsCountUpperBound > 0,
176           "elementsCountUpperBound must be > 0: %s given.",
177           elementsCountUpperBound);
178 
179       long bestCuckooFilterSizeInBits = -1;
180       int bestBucketCount = 0;
181       int bestBucketCapacity = 0;
182       int bestFingerprintLength = 0;
183       for (Map.Entry<Integer, Double> entry : APPROX_LOAD_BY_BUCKET_CAPACITY.entrySet()) {
184         int bucketCapacity = entry.getKey();
185         double load = entry.getValue();
186 
187         int fingerprintLength =
188             (int) Math.ceil(-log2(targetFalsePositiveRate) + log2(bucketCapacity) + 1);
189         long bucketCount = (long) Math.ceil(elementsCountUpperBound / (bucketCapacity * load));
190 
191         // The computed size is invalid if fingerprint length is larger than max length or the
192         // bucket count that is larger than max integer.
193         if (fingerprintLength > MAX_FINGERPRINT_LENGTH || bucketCount >= Integer.MAX_VALUE) {
194           continue;
195         }
196 
197         long totalBits = bucketCount * bucketCapacity * fingerprintLength;
198         if (bestCuckooFilterSizeInBits == -1 || bestCuckooFilterSizeInBits > totalBits) {
199           bestCuckooFilterSizeInBits = totalBits;
200           bestBucketCount = (int) bucketCount;
201           bestBucketCapacity = bucketCapacity;
202           bestFingerprintLength = fingerprintLength;
203         }
204       }
205 
206       checkArgument(
207           bestCuckooFilterSizeInBits != -1,
208           "Could not compute suitable cuckoo filter size based on the given input. Either the"
209               + " target false positive rate is too low, or the computed size is too big.");
210 
211       return Size.newBuilder()
212           .setBucketCount(bestBucketCount)
213           .setBucketCapacity(bestBucketCapacity)
214           .setFingerprintLength(bestFingerprintLength)
215           .build();
216     }
217 
newBuilder()218     public static Builder newBuilder() {
219       return new Builder();
220     }
221 
222     /** Returns the total number of buckets in the cuckoo filter. */
bucketCount()223     public int bucketCount() {
224       return bucketCount;
225     }
226 
227     /** Returns the maximum number of fingerprints each bucket can hold. */
bucketCapacity()228     public int bucketCapacity() {
229       return bucketCapacity;
230     }
231 
232     /** Returns the length of the fingerprint in bits. */
fingerprintLength()233     public int fingerprintLength() {
234       return fingerprintLength;
235     }
236 
237     /** Builder for the {@link Size}. */
238     public static class Builder {
239       private int bucketCount;
240       private int bucketCapacity;
241       private int fingerprintLength;
242 
Builder()243       private Builder() {}
244 
245       /**
246        * Sets the number of buckets in the cuckoo filter.
247        *
248        * <p>{@code bucketCount} must be > 0.
249        */
250       @CanIgnoreReturnValue
setBucketCount(int bucketCount)251       public Builder setBucketCount(int bucketCount) {
252         this.bucketCount = bucketCount;
253         return this;
254       }
255 
256       /**
257        * Sets the maximum number of fingerprints each bucket can hold.
258        *
259        * <p>{@code bucketCapacity} must be in range (0, {@value #MAX_BUCKET_CAPACITY}].
260        */
261       @CanIgnoreReturnValue
setBucketCapacity(int bucketCapacity)262       public Builder setBucketCapacity(int bucketCapacity) {
263         this.bucketCapacity = bucketCapacity;
264         return this;
265       }
266 
267       /**
268        * Sets the length of each fingerprint in bits.
269        *
270        * <p>{@code fingerprintLength} must be in range (0, {@value #MAX_FINGERPRINT_LENGTH}].
271        */
272       @CanIgnoreReturnValue
setFingerprintLength(int fingerprintLength)273       public Builder setFingerprintLength(int fingerprintLength) {
274         this.fingerprintLength = fingerprintLength;
275         return this;
276       }
277 
278       /**
279        * Builds {@link Size}.
280        *
281        * @throws IllegalArgumentException if the configured parameters are invalid.
282        */
build()283       public Size build() {
284         checkArgument(bucketCount > 0, "bucketCount must be > 0: %s given instead.", bucketCount);
285         checkArgument(
286             0 < bucketCapacity && bucketCapacity <= MAX_BUCKET_CAPACITY,
287             "bucketCapacity must be in range (0, %s]: %s given instead.",
288             MAX_BUCKET_CAPACITY,
289             bucketCapacity);
290         checkArgument(
291             0 < fingerprintLength && fingerprintLength <= MAX_FINGERPRINT_LENGTH,
292             "fingerprintLength must be in range (0, %s]: %s given instead.",
293             MAX_FINGERPRINT_LENGTH,
294             fingerprintLength);
295 
296         return new Size(bucketCount, bucketCapacity, fingerprintLength);
297       }
298     }
299 
log2(double x)300     private static double log2(double x) {
301       return Math.log(x) / Math.log(2);
302     }
303   }
304 
305   /** Hash function for transforming an arbitrary type element to a {@link HashCode}. */
306   public interface HashFunction {
307     /** Hashes given {@code element} to a {@link HashCode}, using the given {@code funnel}. */
hash(T element, Funnel<? super T> funnel)308     <T> HashCode hash(T element, Funnel<? super T> funnel);
309   }
310 
311   /**
312    * Strategy for computing fingerprints and where these fingerprints belong in the cuckoo filter
313    * table.
314    */
315   public interface Strategy {
316 
317     /**
318      * Computes the fingerprint value given the element's {@code hash} output from {@link
319      * HashFunction}.
320      *
321      * <p>The returned value should be in range (0, 2^{@code fingerprintLength}). Otherwise, the
322      * behavior of the cuckoo filter is undefined. Note that the interval is an open interval, so 0
323      * and 2^{@code fingerprintLength} are not included.
324      */
computeFingerprint(HashCode hash, int fingerprintLength)325     long computeFingerprint(HashCode hash, int fingerprintLength);
326 
327     /**
328      * Computes one of the bucket indices given the element's {@code hash} output from {@link
329      * HashFunction} and {@code bucketCount} of the cuckoo filter.
330      *
331      * <p>The returned value should be in range [0, {@code bucketCount}). Otherwise, the behavior of
332      * the cuckoo filter is undefined.
333      */
computeBucketIndex(HashCode hash, int bucketCount)334     int computeBucketIndex(HashCode hash, int bucketCount);
335 
336     /**
337      * Computes the element's other bucket index given the element's {@code fingerprint} value and
338      * its initial {@code bucketIndex}.
339      *
340      * <p>{@code hashFunction} corresponds to the {@link HashFunction} that was supplied when the
341      * config was constructed. Depending on the implementation, {@code hashFunction} may or may not
342      * be used.
343      *
344      * <p>The returned value should be in range [0, {@code bucketCount}), and the method needs to be
345      * an involution with respect to {@code bucketIndex}. That is, with other parameters fixed, the
346      * method needs to satisfy <b>bucketIndex =
347      * computeOtherBucketIndex(computeOtherBucketIndex(bucketIndex))</b> for all valid
348      * <b>bucketIndex</b>. Note that other parameters are omitted for brevity. If these properties
349      * don't hold, the behavior of the cuckoo filter is undefined.
350      */
computeOtherBucketIndex( long fingerprint, int bucketIndex, int bucketCount, HashFunction hashFunction)351     int computeOtherBucketIndex(
352         long fingerprint, int bucketIndex, int bucketCount, HashFunction hashFunction);
353 
354     /**
355      * Maximum number of replacements to be made during insertion, before declaring that the
356      * insertion has failed.
357      *
358      * <p>If not overridden, set to 500 as a default.
359      */
maxReplacementCount()360     default int maxReplacementCount() {
361       return 500;
362     }
363   }
364 }
365