1*a03ca8b9SKrzysztof Kosiński // Copyright 2017 The Chromium Authors. All rights reserved. 2*a03ca8b9SKrzysztof Kosiński // Use of this source code is governed by a BSD-style license that can be 3*a03ca8b9SKrzysztof Kosiński // found in the LICENSE file. 4*a03ca8b9SKrzysztof Kosiński 5*a03ca8b9SKrzysztof Kosiński #ifndef COMPONENTS_ZUCCHINI_BINARY_DATA_HISTOGRAM_H_ 6*a03ca8b9SKrzysztof Kosiński #define COMPONENTS_ZUCCHINI_BINARY_DATA_HISTOGRAM_H_ 7*a03ca8b9SKrzysztof Kosiński 8*a03ca8b9SKrzysztof Kosiński #include <stddef.h> 9*a03ca8b9SKrzysztof Kosiński #include <stdint.h> 10*a03ca8b9SKrzysztof Kosiński 11*a03ca8b9SKrzysztof Kosiński #include <memory> 12*a03ca8b9SKrzysztof Kosiński #include <string> 13*a03ca8b9SKrzysztof Kosiński 14*a03ca8b9SKrzysztof Kosiński #include "components/zucchini/buffer_view.h" 15*a03ca8b9SKrzysztof Kosiński 16*a03ca8b9SKrzysztof Kosiński namespace zucchini { 17*a03ca8b9SKrzysztof Kosiński 18*a03ca8b9SKrzysztof Kosiński // A class to detect outliers in a list of doubles using Chauvenet's criterion: 19*a03ca8b9SKrzysztof Kosiński // Compute mean and standard deviation of observations, then determine whether 20*a03ca8b9SKrzysztof Kosiński // a query value lies beyond a fixed number of standard deviations (sigmas) from 21*a03ca8b9SKrzysztof Kosiński // the mean. The purpose of this test is to reduce the chance of false-positive 22*a03ca8b9SKrzysztof Kosiński // ensemble matches. 23*a03ca8b9SKrzysztof Kosiński class OutlierDetector { 24*a03ca8b9SKrzysztof Kosiński public: 25*a03ca8b9SKrzysztof Kosiński OutlierDetector(); 26*a03ca8b9SKrzysztof Kosiński OutlierDetector(const OutlierDetector&) = delete; 27*a03ca8b9SKrzysztof Kosiński const OutlierDetector& operator=(const OutlierDetector&) = delete; 28*a03ca8b9SKrzysztof Kosiński ~OutlierDetector(); 29*a03ca8b9SKrzysztof Kosiński 30*a03ca8b9SKrzysztof Kosiński // Incorporates |sample| into mean and standard deviation. 31*a03ca8b9SKrzysztof Kosiński void Add(double sample); 32*a03ca8b9SKrzysztof Kosiński 33*a03ca8b9SKrzysztof Kosiński // Prepares basic statistics for DecideOutlier() calls. Should be called after 34*a03ca8b9SKrzysztof Kosiński // all samples have been added. 35*a03ca8b9SKrzysztof Kosiński void Prepare(); 36*a03ca8b9SKrzysztof Kosiński 37*a03ca8b9SKrzysztof Kosiński // Renders current statistics as strings for logging. 38*a03ca8b9SKrzysztof Kosiński std::string RenderStats(); 39*a03ca8b9SKrzysztof Kosiński 40*a03ca8b9SKrzysztof Kosiński // Heuristically decides whether |sample| is an outlier. Returns 1 if |sample| 41*a03ca8b9SKrzysztof Kosiński // is "too high", 0 if |sample| is "normal", and -1 if |sample| is "too low". 42*a03ca8b9SKrzysztof Kosiński // Must be called after Prepare(). 43*a03ca8b9SKrzysztof Kosiński int DecideOutlier(double sample); 44*a03ca8b9SKrzysztof Kosiński 45*a03ca8b9SKrzysztof Kosiński private: 46*a03ca8b9SKrzysztof Kosiński size_t n_ = 0; 47*a03ca8b9SKrzysztof Kosiński double sum_ = 0; 48*a03ca8b9SKrzysztof Kosiński double sum_of_squares_ = 0; 49*a03ca8b9SKrzysztof Kosiński double mean_ = 0; 50*a03ca8b9SKrzysztof Kosiński double standard_deviation_ = 0; 51*a03ca8b9SKrzysztof Kosiński }; 52*a03ca8b9SKrzysztof Kosiński 53*a03ca8b9SKrzysztof Kosiński // A class to compute similarity score between binary data. The heuristic here 54*a03ca8b9SKrzysztof Kosiński // preprocesses input data to a size-65536 histogram, counting the frequency of 55*a03ca8b9SKrzysztof Kosiński // consecutive 2-byte sequences. Therefore data with lengths < 2 are considered 56*a03ca8b9SKrzysztof Kosiński // invalid -- but this is okay for Zucchini's use case. 57*a03ca8b9SKrzysztof Kosiński class BinaryDataHistogram { 58*a03ca8b9SKrzysztof Kosiński public: 59*a03ca8b9SKrzysztof Kosiński BinaryDataHistogram(); 60*a03ca8b9SKrzysztof Kosiński BinaryDataHistogram(const BinaryDataHistogram&) = delete; 61*a03ca8b9SKrzysztof Kosiński const BinaryDataHistogram& operator=(const BinaryDataHistogram&) = delete; 62*a03ca8b9SKrzysztof Kosiński ~BinaryDataHistogram(); 63*a03ca8b9SKrzysztof Kosiński 64*a03ca8b9SKrzysztof Kosiński // Attempts to compute the histogram, returns true iff successful. 65*a03ca8b9SKrzysztof Kosiński bool Compute(ConstBufferView region); 66*a03ca8b9SKrzysztof Kosiński IsValid()67*a03ca8b9SKrzysztof Kosiński bool IsValid() const { return static_cast<bool>(histogram_); } 68*a03ca8b9SKrzysztof Kosiński 69*a03ca8b9SKrzysztof Kosiński // Returns distance to another histogram (heuristics). If two binaries are 70*a03ca8b9SKrzysztof Kosiński // identical then their histogram distance is 0. However, the converse is not 71*a03ca8b9SKrzysztof Kosiński // true in general. For example, "aba" and "bab" are different, but their 72*a03ca8b9SKrzysztof Kosiński // histogram distance is 0 (both histograms are {"ab": 1, "ba": 1}). 73*a03ca8b9SKrzysztof Kosiński double Distance(const BinaryDataHistogram& other) const; 74*a03ca8b9SKrzysztof Kosiński 75*a03ca8b9SKrzysztof Kosiński private: 76*a03ca8b9SKrzysztof Kosiński enum { kNumBins = 1 << (sizeof(uint16_t) * 8) }; 77*a03ca8b9SKrzysztof Kosiński static_assert(kNumBins == 65536, "Incorrect constant computation."); 78*a03ca8b9SKrzysztof Kosiński 79*a03ca8b9SKrzysztof Kosiński // Size, in bytes, of the data over which the histogram was computed. 80*a03ca8b9SKrzysztof Kosiński size_t size_ = 0; 81*a03ca8b9SKrzysztof Kosiński 82*a03ca8b9SKrzysztof Kosiński // 2^16 buckets holding counts of all 2-byte sequences in the data. The counts 83*a03ca8b9SKrzysztof Kosiński // are stored as signed values to simplify computing the distance between two 84*a03ca8b9SKrzysztof Kosiński // histograms. 85*a03ca8b9SKrzysztof Kosiński std::unique_ptr<int32_t[]> histogram_; 86*a03ca8b9SKrzysztof Kosiński }; 87*a03ca8b9SKrzysztof Kosiński 88*a03ca8b9SKrzysztof Kosiński } // namespace zucchini 89*a03ca8b9SKrzysztof Kosiński 90*a03ca8b9SKrzysztof Kosiński #endif // COMPONENTS_ZUCCHINI_BINARY_DATA_HISTOGRAM_H_ 91