1 // Copyright 2022 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // https://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package com.google.setfilters.cuckoofilter; 16 17 import static com.google.common.base.Preconditions.checkArgument; 18 19 import com.google.common.collect.ImmutableMap; 20 import com.google.common.hash.Funnel; 21 import com.google.common.hash.HashCode; 22 import com.google.errorprone.annotations.CanIgnoreReturnValue; 23 import java.util.Map; 24 25 /** 26 * Specification for the cuckoo filter. 27 * 28 * <p>This class is immutable. 29 */ 30 // TODO: Handle serialization. 31 public final class CuckooFilterConfig { 32 private final Size size; 33 private final HashFunction hashFunction; 34 private final Strategy strategy; 35 private final boolean useSpaceOptimization; 36 CuckooFilterConfig( Size size, HashFunction hashFunction, Strategy strategy, boolean useSpaceOptimization)37 private CuckooFilterConfig( 38 Size size, HashFunction hashFunction, Strategy strategy, boolean useSpaceOptimization) { 39 this.size = size; 40 this.hashFunction = hashFunction; 41 this.strategy = strategy; 42 this.useSpaceOptimization = useSpaceOptimization; 43 } 44 size()45 public Size size() { 46 return size; 47 } 48 hashFunction()49 public HashFunction hashFunction() { 50 return hashFunction; 51 } 52 strategy()53 public Strategy strategy() { 54 return strategy; 55 } 56 useSpaceOptimization()57 public boolean useSpaceOptimization() { 58 return useSpaceOptimization; 59 } 60 newBuilder()61 public static Builder newBuilder() { 62 return new Builder(); 63 } 64 65 /** Builder for the {@link CuckooFilterConfig}. */ 66 public static class Builder { 67 private Size size; 68 private HashFunction hashFunction; 69 private Strategy strategy; 70 private boolean useSpaceOptimization; 71 Builder()72 private Builder() {} 73 74 @CanIgnoreReturnValue setSize(Size size)75 public Builder setSize(Size size) { 76 this.size = size; 77 return this; 78 } 79 80 @CanIgnoreReturnValue setHashFunction(HashFunction hashFunction)81 public Builder setHashFunction(HashFunction hashFunction) { 82 this.hashFunction = hashFunction; 83 return this; 84 } 85 86 @CanIgnoreReturnValue setStrategy(Strategy strategy)87 public Builder setStrategy(Strategy strategy) { 88 this.strategy = strategy; 89 return this; 90 } 91 92 /** 93 * Whether to use space optimized filter representation (if possible). 94 * 95 * <p>Setting this field to {@code true} does not guarantee the optimization algorithm to always 96 * apply - it is best effort. 97 * 98 * <p>In general, using this may result in slower filter operations, and incurs an additional 99 * fixed space overhead. Thus, it is possible for the "optimized" version of the filter to 100 * actually take more space than the non optimized one. 101 */ 102 @CanIgnoreReturnValue setUseSpaceOptimization(boolean useSpaceOptimization)103 public Builder setUseSpaceOptimization(boolean useSpaceOptimization) { 104 this.useSpaceOptimization = useSpaceOptimization; 105 return this; 106 } 107 108 /** 109 * Builds {@link CuckooFilterConfig}. 110 * 111 * @throws IllegalArgumentException if the required parameters are not set. 112 */ build()113 public CuckooFilterConfig build() { 114 checkArgument(size != null, "Size must be set."); 115 checkArgument(hashFunction != null, "Hash function must be set."); 116 checkArgument(strategy != null, "Strategy must be set."); 117 118 return new CuckooFilterConfig(size, hashFunction, strategy, useSpaceOptimization); 119 } 120 } 121 122 /** 123 * Specification of the cuckoo filter size. 124 * 125 * <p>A cuckoo filter's size can be defined as a tuple (bucketCount, bucketCapacity, 126 * fingeprintLength); this means that there are bucketCount number of buckets, where each bucket 127 * can store up to bucketCapacity fingerprints, and each fingerprint is of length 128 * fingerprintLength bits. 129 * 130 * <p>All fields are required and must be set explicitly. 131 * 132 * <p>This class is immutable. 133 */ 134 public static class Size { 135 private static final int MAX_BUCKET_CAPACITY = 128; 136 private static final int MAX_FINGERPRINT_LENGTH = 64; 137 /** Empirical load by the bucket capacity. */ 138 private static final ImmutableMap<Integer, Double> APPROX_LOAD_BY_BUCKET_CAPACITY = 139 ImmutableMap.<Integer, Double>builder() 140 .put(2, 0.85) 141 .put(3, 0.91) 142 .put(4, 0.95) 143 .put(5, 0.96) 144 .put(6, 0.97) 145 .put(7, 0.98) 146 .put(8, 0.98) 147 .buildOrThrow(); 148 149 private final int bucketCount; 150 private final int bucketCapacity; 151 private final int fingerprintLength; 152 Size(int bucketCount, int bucketCapacity, int fingerprintLength)153 private Size(int bucketCount, int bucketCapacity, int fingerprintLength) { 154 this.bucketCount = bucketCount; 155 this.bucketCapacity = bucketCapacity; 156 this.fingerprintLength = fingerprintLength; 157 } 158 159 /** 160 * Automatically computes a reasonably efficient cuckoo filter {@link Size} that ensures (with 161 * high probability) storing up to {@code elementsCountUpperBound} elements (with high 162 * probability) with the given {@code targetFalsePositiveRate}. 163 * 164 * @throws IllegalArgumentException if {@code targetFalsePositiveRate} is not in range [0, 1] or 165 * {@code elementsCountUpperBound} is <= 0, or a suitable cuckoo filter size could not be 166 * computed based on the given input. 167 */ computeEfficientSize( double targetFalsePositiveRate, long elementsCountUpperBound)168 public static Size computeEfficientSize( 169 double targetFalsePositiveRate, long elementsCountUpperBound) { 170 checkArgument( 171 0 < targetFalsePositiveRate && targetFalsePositiveRate < 1, 172 "targetFalsePositiveRate must be in range (0, 1): %s given.", 173 targetFalsePositiveRate); 174 checkArgument( 175 elementsCountUpperBound > 0, 176 "elementsCountUpperBound must be > 0: %s given.", 177 elementsCountUpperBound); 178 179 long bestCuckooFilterSizeInBits = -1; 180 int bestBucketCount = 0; 181 int bestBucketCapacity = 0; 182 int bestFingerprintLength = 0; 183 for (Map.Entry<Integer, Double> entry : APPROX_LOAD_BY_BUCKET_CAPACITY.entrySet()) { 184 int bucketCapacity = entry.getKey(); 185 double load = entry.getValue(); 186 187 int fingerprintLength = 188 (int) Math.ceil(-log2(targetFalsePositiveRate) + log2(bucketCapacity) + 1); 189 long bucketCount = (long) Math.ceil(elementsCountUpperBound / (bucketCapacity * load)); 190 191 // The computed size is invalid if fingerprint length is larger than max length or the 192 // bucket count that is larger than max integer. 193 if (fingerprintLength > MAX_FINGERPRINT_LENGTH || bucketCount >= Integer.MAX_VALUE) { 194 continue; 195 } 196 197 long totalBits = bucketCount * bucketCapacity * fingerprintLength; 198 if (bestCuckooFilterSizeInBits == -1 || bestCuckooFilterSizeInBits > totalBits) { 199 bestCuckooFilterSizeInBits = totalBits; 200 bestBucketCount = (int) bucketCount; 201 bestBucketCapacity = bucketCapacity; 202 bestFingerprintLength = fingerprintLength; 203 } 204 } 205 206 checkArgument( 207 bestCuckooFilterSizeInBits != -1, 208 "Could not compute suitable cuckoo filter size based on the given input. Either the" 209 + " target false positive rate is too low, or the computed size is too big."); 210 211 return Size.newBuilder() 212 .setBucketCount(bestBucketCount) 213 .setBucketCapacity(bestBucketCapacity) 214 .setFingerprintLength(bestFingerprintLength) 215 .build(); 216 } 217 newBuilder()218 public static Builder newBuilder() { 219 return new Builder(); 220 } 221 222 /** Returns the total number of buckets in the cuckoo filter. */ bucketCount()223 public int bucketCount() { 224 return bucketCount; 225 } 226 227 /** Returns the maximum number of fingerprints each bucket can hold. */ bucketCapacity()228 public int bucketCapacity() { 229 return bucketCapacity; 230 } 231 232 /** Returns the length of the fingerprint in bits. */ fingerprintLength()233 public int fingerprintLength() { 234 return fingerprintLength; 235 } 236 237 /** Builder for the {@link Size}. */ 238 public static class Builder { 239 private int bucketCount; 240 private int bucketCapacity; 241 private int fingerprintLength; 242 Builder()243 private Builder() {} 244 245 /** 246 * Sets the number of buckets in the cuckoo filter. 247 * 248 * <p>{@code bucketCount} must be > 0. 249 */ 250 @CanIgnoreReturnValue setBucketCount(int bucketCount)251 public Builder setBucketCount(int bucketCount) { 252 this.bucketCount = bucketCount; 253 return this; 254 } 255 256 /** 257 * Sets the maximum number of fingerprints each bucket can hold. 258 * 259 * <p>{@code bucketCapacity} must be in range (0, {@value #MAX_BUCKET_CAPACITY}]. 260 */ 261 @CanIgnoreReturnValue setBucketCapacity(int bucketCapacity)262 public Builder setBucketCapacity(int bucketCapacity) { 263 this.bucketCapacity = bucketCapacity; 264 return this; 265 } 266 267 /** 268 * Sets the length of each fingerprint in bits. 269 * 270 * <p>{@code fingerprintLength} must be in range (0, {@value #MAX_FINGERPRINT_LENGTH}]. 271 */ 272 @CanIgnoreReturnValue setFingerprintLength(int fingerprintLength)273 public Builder setFingerprintLength(int fingerprintLength) { 274 this.fingerprintLength = fingerprintLength; 275 return this; 276 } 277 278 /** 279 * Builds {@link Size}. 280 * 281 * @throws IllegalArgumentException if the configured parameters are invalid. 282 */ build()283 public Size build() { 284 checkArgument(bucketCount > 0, "bucketCount must be > 0: %s given instead.", bucketCount); 285 checkArgument( 286 0 < bucketCapacity && bucketCapacity <= MAX_BUCKET_CAPACITY, 287 "bucketCapacity must be in range (0, %s]: %s given instead.", 288 MAX_BUCKET_CAPACITY, 289 bucketCapacity); 290 checkArgument( 291 0 < fingerprintLength && fingerprintLength <= MAX_FINGERPRINT_LENGTH, 292 "fingerprintLength must be in range (0, %s]: %s given instead.", 293 MAX_FINGERPRINT_LENGTH, 294 fingerprintLength); 295 296 return new Size(bucketCount, bucketCapacity, fingerprintLength); 297 } 298 } 299 log2(double x)300 private static double log2(double x) { 301 return Math.log(x) / Math.log(2); 302 } 303 } 304 305 /** Hash function for transforming an arbitrary type element to a {@link HashCode}. */ 306 public interface HashFunction { 307 /** Hashes given {@code element} to a {@link HashCode}, using the given {@code funnel}. */ hash(T element, Funnel<? super T> funnel)308 <T> HashCode hash(T element, Funnel<? super T> funnel); 309 } 310 311 /** 312 * Strategy for computing fingerprints and where these fingerprints belong in the cuckoo filter 313 * table. 314 */ 315 public interface Strategy { 316 317 /** 318 * Computes the fingerprint value given the element's {@code hash} output from {@link 319 * HashFunction}. 320 * 321 * <p>The returned value should be in range (0, 2^{@code fingerprintLength}). Otherwise, the 322 * behavior of the cuckoo filter is undefined. Note that the interval is an open interval, so 0 323 * and 2^{@code fingerprintLength} are not included. 324 */ computeFingerprint(HashCode hash, int fingerprintLength)325 long computeFingerprint(HashCode hash, int fingerprintLength); 326 327 /** 328 * Computes one of the bucket indices given the element's {@code hash} output from {@link 329 * HashFunction} and {@code bucketCount} of the cuckoo filter. 330 * 331 * <p>The returned value should be in range [0, {@code bucketCount}). Otherwise, the behavior of 332 * the cuckoo filter is undefined. 333 */ computeBucketIndex(HashCode hash, int bucketCount)334 int computeBucketIndex(HashCode hash, int bucketCount); 335 336 /** 337 * Computes the element's other bucket index given the element's {@code fingerprint} value and 338 * its initial {@code bucketIndex}. 339 * 340 * <p>{@code hashFunction} corresponds to the {@link HashFunction} that was supplied when the 341 * config was constructed. Depending on the implementation, {@code hashFunction} may or may not 342 * be used. 343 * 344 * <p>The returned value should be in range [0, {@code bucketCount}), and the method needs to be 345 * an involution with respect to {@code bucketIndex}. That is, with other parameters fixed, the 346 * method needs to satisfy <b>bucketIndex = 347 * computeOtherBucketIndex(computeOtherBucketIndex(bucketIndex))</b> for all valid 348 * <b>bucketIndex</b>. Note that other parameters are omitted for brevity. If these properties 349 * don't hold, the behavior of the cuckoo filter is undefined. 350 */ computeOtherBucketIndex( long fingerprint, int bucketIndex, int bucketCount, HashFunction hashFunction)351 int computeOtherBucketIndex( 352 long fingerprint, int bucketIndex, int bucketCount, HashFunction hashFunction); 353 354 /** 355 * Maximum number of replacements to be made during insertion, before declaring that the 356 * insertion has failed. 357 * 358 * <p>If not overridden, set to 500 as a default. 359 */ maxReplacementCount()360 default int maxReplacementCount() { 361 return 500; 362 } 363 } 364 } 365