xref: /aosp_15_r20/external/zucchini/binary_data_histogram.cc (revision a03ca8b91e029cd15055c20c78c2e087c84792e4)
1 // Copyright 2017 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "components/zucchini/binary_data_histogram.h"
6 
7 #include <algorithm>
8 #include <cmath>
9 #include <limits>
10 
11 #include "base/check_op.h"
12 #include "base/format_macros.h"
13 #include "base/strings/stringprintf.h"
14 
15 namespace zucchini {
16 
17 /******** OutlierDetector ********/
18 
19 OutlierDetector::OutlierDetector() = default;
20 
21 OutlierDetector::~OutlierDetector() = default;
22 
23 // For BinaryDataHistogram, |sample| is typically in interval [0, 1].
Add(double sample)24 void OutlierDetector::Add(double sample) {
25   ++n_;
26   sum_ += sample;
27   sum_of_squares_ += sample * sample;
28 }
29 
Prepare()30 void OutlierDetector::Prepare() {
31   if (n_ > 0) {
32     mean_ = sum_ / n_;
33     standard_deviation_ = ::sqrt((sum_of_squares_ - sum_ * mean_) /
34                                  std::max(static_cast<size_t>(1), n_ - 1));
35   }
36 }
37 
RenderStats()38 std::string OutlierDetector::RenderStats() {
39   return base::StringPrintf("Mean = %.5f, StdDev = %.5f over %" PRIuS
40                             " samples",
41                             mean_, standard_deviation_, n_);
42 }
43 
44 // Constants are chosen for BinaryDataHistogram, where |sample| is typically in
45 // [0, 1].
DecideOutlier(double sample)46 int OutlierDetector::DecideOutlier(double sample) {
47   // Lower bound to avoid divide-by-zero and penalizing tight clusters.
48   constexpr double kMinTolerance = 0.1;
49   // Number of standard deviations away from mean for value to become outlier.
50   constexpr double kSigmaBound = 1.9;
51   if (n_ <= 1)
52     return 0;
53   double tolerance = std::max(kMinTolerance, standard_deviation_);
54   double num_sigma = (sample - mean_) / tolerance;
55   return num_sigma > kSigmaBound ? 1 : num_sigma < -kSigmaBound ? -1 : 0;
56 }
57 
58 /******** BinaryDataHistogram ********/
59 
60 BinaryDataHistogram::BinaryDataHistogram() = default;
61 
62 BinaryDataHistogram::~BinaryDataHistogram() = default;
63 
Compute(ConstBufferView region)64 bool BinaryDataHistogram::Compute(ConstBufferView region) {
65   DCHECK(!histogram_);
66   // Binary data with size < 2 are invalid.
67   if (region.size() < sizeof(uint16_t))
68     return false;
69   DCHECK_LE(region.size(),
70             static_cast<size_t>(std::numeric_limits<int32_t>::max()));
71 
72   histogram_ = std::make_unique<int32_t[]>(kNumBins);
73   size_ = region.size();
74   // Number of 2-byte intervals fully contained in |region|.
75   size_t bound = size_ - sizeof(uint16_t) + 1;
76   for (size_t i = 0; i < bound; ++i)
77     ++histogram_[region.read<uint16_t>(i)];
78   return true;
79 }
80 
Distance(const BinaryDataHistogram & other) const81 double BinaryDataHistogram::Distance(const BinaryDataHistogram& other) const {
82   DCHECK(IsValid() && other.IsValid());
83   // Compute Manhattan (L1) distance between respective histograms.
84   double total_diff = 0;
85   for (int i = 0; i < kNumBins; ++i)
86     total_diff += std::abs(histogram_[i] - other.histogram_[i]);
87   // Normalize by total size, so result lies in [0, 1].
88   return total_diff / (size_ + other.size_);
89 }
90 
91 }  // namespace zucchini
92