xref: /aosp_15_r20/external/zucchini/binary_data_histogram.h (revision a03ca8b91e029cd15055c20c78c2e087c84792e4)
1*a03ca8b9SKrzysztof Kosiński // Copyright 2017 The Chromium Authors. All rights reserved.
2*a03ca8b9SKrzysztof Kosiński // Use of this source code is governed by a BSD-style license that can be
3*a03ca8b9SKrzysztof Kosiński // found in the LICENSE file.
4*a03ca8b9SKrzysztof Kosiński 
5*a03ca8b9SKrzysztof Kosiński #ifndef COMPONENTS_ZUCCHINI_BINARY_DATA_HISTOGRAM_H_
6*a03ca8b9SKrzysztof Kosiński #define COMPONENTS_ZUCCHINI_BINARY_DATA_HISTOGRAM_H_
7*a03ca8b9SKrzysztof Kosiński 
8*a03ca8b9SKrzysztof Kosiński #include <stddef.h>
9*a03ca8b9SKrzysztof Kosiński #include <stdint.h>
10*a03ca8b9SKrzysztof Kosiński 
11*a03ca8b9SKrzysztof Kosiński #include <memory>
12*a03ca8b9SKrzysztof Kosiński #include <string>
13*a03ca8b9SKrzysztof Kosiński 
14*a03ca8b9SKrzysztof Kosiński #include "components/zucchini/buffer_view.h"
15*a03ca8b9SKrzysztof Kosiński 
16*a03ca8b9SKrzysztof Kosiński namespace zucchini {
17*a03ca8b9SKrzysztof Kosiński 
18*a03ca8b9SKrzysztof Kosiński // A class to detect outliers in a list of doubles using Chauvenet's criterion:
19*a03ca8b9SKrzysztof Kosiński // Compute mean and standard deviation of observations, then determine whether
20*a03ca8b9SKrzysztof Kosiński // a query value lies beyond a fixed number of standard deviations (sigmas) from
21*a03ca8b9SKrzysztof Kosiński // the mean. The purpose of this test is to reduce the chance of false-positive
22*a03ca8b9SKrzysztof Kosiński // ensemble matches.
23*a03ca8b9SKrzysztof Kosiński class OutlierDetector {
24*a03ca8b9SKrzysztof Kosiński  public:
25*a03ca8b9SKrzysztof Kosiński   OutlierDetector();
26*a03ca8b9SKrzysztof Kosiński   OutlierDetector(const OutlierDetector&) = delete;
27*a03ca8b9SKrzysztof Kosiński   const OutlierDetector& operator=(const OutlierDetector&) = delete;
28*a03ca8b9SKrzysztof Kosiński   ~OutlierDetector();
29*a03ca8b9SKrzysztof Kosiński 
30*a03ca8b9SKrzysztof Kosiński   // Incorporates |sample| into mean and standard deviation.
31*a03ca8b9SKrzysztof Kosiński   void Add(double sample);
32*a03ca8b9SKrzysztof Kosiński 
33*a03ca8b9SKrzysztof Kosiński   // Prepares basic statistics for DecideOutlier() calls. Should be called after
34*a03ca8b9SKrzysztof Kosiński   // all samples have been added.
35*a03ca8b9SKrzysztof Kosiński   void Prepare();
36*a03ca8b9SKrzysztof Kosiński 
37*a03ca8b9SKrzysztof Kosiński   // Renders current statistics as strings for logging.
38*a03ca8b9SKrzysztof Kosiński   std::string RenderStats();
39*a03ca8b9SKrzysztof Kosiński 
40*a03ca8b9SKrzysztof Kosiński   // Heuristically decides whether |sample| is an outlier. Returns 1 if |sample|
41*a03ca8b9SKrzysztof Kosiński   // is "too high", 0 if |sample| is "normal", and -1 if |sample| is "too low".
42*a03ca8b9SKrzysztof Kosiński   // Must be called after Prepare().
43*a03ca8b9SKrzysztof Kosiński   int DecideOutlier(double sample);
44*a03ca8b9SKrzysztof Kosiński 
45*a03ca8b9SKrzysztof Kosiński  private:
46*a03ca8b9SKrzysztof Kosiński   size_t n_ = 0;
47*a03ca8b9SKrzysztof Kosiński   double sum_ = 0;
48*a03ca8b9SKrzysztof Kosiński   double sum_of_squares_ = 0;
49*a03ca8b9SKrzysztof Kosiński   double mean_ = 0;
50*a03ca8b9SKrzysztof Kosiński   double standard_deviation_ = 0;
51*a03ca8b9SKrzysztof Kosiński };
52*a03ca8b9SKrzysztof Kosiński 
53*a03ca8b9SKrzysztof Kosiński // A class to compute similarity score between binary data. The heuristic here
54*a03ca8b9SKrzysztof Kosiński // preprocesses input data to a size-65536 histogram, counting the frequency of
55*a03ca8b9SKrzysztof Kosiński // consecutive 2-byte sequences. Therefore data with lengths < 2 are considered
56*a03ca8b9SKrzysztof Kosiński // invalid -- but this is okay for Zucchini's use case.
57*a03ca8b9SKrzysztof Kosiński class BinaryDataHistogram {
58*a03ca8b9SKrzysztof Kosiński  public:
59*a03ca8b9SKrzysztof Kosiński   BinaryDataHistogram();
60*a03ca8b9SKrzysztof Kosiński   BinaryDataHistogram(const BinaryDataHistogram&) = delete;
61*a03ca8b9SKrzysztof Kosiński   const BinaryDataHistogram& operator=(const BinaryDataHistogram&) = delete;
62*a03ca8b9SKrzysztof Kosiński   ~BinaryDataHistogram();
63*a03ca8b9SKrzysztof Kosiński 
64*a03ca8b9SKrzysztof Kosiński   // Attempts to compute the histogram, returns true iff successful.
65*a03ca8b9SKrzysztof Kosiński   bool Compute(ConstBufferView region);
66*a03ca8b9SKrzysztof Kosiński 
IsValid()67*a03ca8b9SKrzysztof Kosiński   bool IsValid() const { return static_cast<bool>(histogram_); }
68*a03ca8b9SKrzysztof Kosiński 
69*a03ca8b9SKrzysztof Kosiński   // Returns distance to another histogram (heuristics). If two binaries are
70*a03ca8b9SKrzysztof Kosiński   // identical then their histogram distance is 0. However, the converse is not
71*a03ca8b9SKrzysztof Kosiński   // true in general. For example, "aba" and "bab" are different, but their
72*a03ca8b9SKrzysztof Kosiński   // histogram distance is 0 (both histograms are {"ab": 1, "ba": 1}).
73*a03ca8b9SKrzysztof Kosiński   double Distance(const BinaryDataHistogram& other) const;
74*a03ca8b9SKrzysztof Kosiński 
75*a03ca8b9SKrzysztof Kosiński  private:
76*a03ca8b9SKrzysztof Kosiński   enum { kNumBins = 1 << (sizeof(uint16_t) * 8) };
77*a03ca8b9SKrzysztof Kosiński   static_assert(kNumBins == 65536, "Incorrect constant computation.");
78*a03ca8b9SKrzysztof Kosiński 
79*a03ca8b9SKrzysztof Kosiński   // Size, in bytes, of the data over which the histogram was computed.
80*a03ca8b9SKrzysztof Kosiński   size_t size_ = 0;
81*a03ca8b9SKrzysztof Kosiński 
82*a03ca8b9SKrzysztof Kosiński   // 2^16 buckets holding counts of all 2-byte sequences in the data. The counts
83*a03ca8b9SKrzysztof Kosiński   // are stored as signed values to simplify computing the distance between two
84*a03ca8b9SKrzysztof Kosiński   // histograms.
85*a03ca8b9SKrzysztof Kosiński   std::unique_ptr<int32_t[]> histogram_;
86*a03ca8b9SKrzysztof Kosiński };
87*a03ca8b9SKrzysztof Kosiński 
88*a03ca8b9SKrzysztof Kosiński }  // namespace zucchini
89*a03ca8b9SKrzysztof Kosiński 
90*a03ca8b9SKrzysztof Kosiński #endif  // COMPONENTS_ZUCCHINI_BINARY_DATA_HISTOGRAM_H_
91