1*a03ca8b9SKrzysztof Kosiński // Copyright 2017 The Chromium Authors. All rights reserved.
2*a03ca8b9SKrzysztof Kosiński // Use of this source code is governed by a BSD-style license that can be
3*a03ca8b9SKrzysztof Kosiński // found in the LICENSE file.
4*a03ca8b9SKrzysztof Kosiński
5*a03ca8b9SKrzysztof Kosiński #include "components/zucchini/binary_data_histogram.h"
6*a03ca8b9SKrzysztof Kosiński
7*a03ca8b9SKrzysztof Kosiński #include <stddef.h>
8*a03ca8b9SKrzysztof Kosiński
9*a03ca8b9SKrzysztof Kosiński #include <memory>
10*a03ca8b9SKrzysztof Kosiński #include <vector>
11*a03ca8b9SKrzysztof Kosiński
12*a03ca8b9SKrzysztof Kosiński #include "components/zucchini/buffer_view.h"
13*a03ca8b9SKrzysztof Kosiński #include "testing/gtest/include/gtest/gtest.h"
14*a03ca8b9SKrzysztof Kosiński
15*a03ca8b9SKrzysztof Kosiński namespace zucchini {
16*a03ca8b9SKrzysztof Kosiński
TEST(OutlierDetectorTest,Basic)17*a03ca8b9SKrzysztof Kosiński TEST(OutlierDetectorTest, Basic) {
18*a03ca8b9SKrzysztof Kosiński auto make_detector = [](const std::vector<double>& values) {
19*a03ca8b9SKrzysztof Kosiński auto detector = std::make_unique<OutlierDetector>();
20*a03ca8b9SKrzysztof Kosiński for (double v : values)
21*a03ca8b9SKrzysztof Kosiński detector->Add(v);
22*a03ca8b9SKrzysztof Kosiński detector->Prepare();
23*a03ca8b9SKrzysztof Kosiński return detector;
24*a03ca8b9SKrzysztof Kosiński };
25*a03ca8b9SKrzysztof Kosiński
26*a03ca8b9SKrzysztof Kosiński std::unique_ptr<OutlierDetector> detector;
27*a03ca8b9SKrzysztof Kosiński // No data: Should at least not cause error.
28*a03ca8b9SKrzysztof Kosiński detector = make_detector({});
29*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(0, detector->DecideOutlier(0.0));
30*a03ca8b9SKrzysztof Kosiński // Single point: Trivially inert.
31*a03ca8b9SKrzysztof Kosiński detector = make_detector({0.5});
32*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(0, detector->DecideOutlier(0.1));
33*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(0, detector->DecideOutlier(0.5));
34*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(0, detector->DecideOutlier(0.9));
35*a03ca8b9SKrzysztof Kosiński // Two identical points: StdDev is 0, so falls back to built-in tolerance.
36*a03ca8b9SKrzysztof Kosiński detector = make_detector({0.5, 0.5});
37*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(-1, detector->DecideOutlier(0.3));
38*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(0, detector->DecideOutlier(0.499));
39*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(0, detector->DecideOutlier(0.5));
40*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(0, detector->DecideOutlier(0.501));
41*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(1, detector->DecideOutlier(0.7));
42*a03ca8b9SKrzysztof Kosiński // Two separate points: Outliner test is pretty lax.
43*a03ca8b9SKrzysztof Kosiński detector = make_detector({0.4, 0.6});
44*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(-1, detector->DecideOutlier(0.2));
45*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(0, detector->DecideOutlier(0.3));
46*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(0, detector->DecideOutlier(0.5));
47*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(0, detector->DecideOutlier(0.7));
48*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(1, detector->DecideOutlier(0.8));
49*a03ca8b9SKrzysztof Kosiński // Sharpen distribution by clustering toward norm: Now test is stricter.
50*a03ca8b9SKrzysztof Kosiński detector = make_detector({0.4, 0.47, 0.48, 0.49, 0.50, 0.51, 0.52, 0.6});
51*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(-1, detector->DecideOutlier(0.3));
52*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(0, detector->DecideOutlier(0.4));
53*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(0, detector->DecideOutlier(0.5));
54*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(0, detector->DecideOutlier(0.6));
55*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(1, detector->DecideOutlier(0.7));
56*a03ca8b9SKrzysztof Kosiński // Shift numbers around: Mean is 0.3, and data order scrambled.
57*a03ca8b9SKrzysztof Kosiński detector = make_detector({0.28, 0.2, 0.31, 0.4, 0.29, 0.32, 0.27, 0.30});
58*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(-1, detector->DecideOutlier(0.0));
59*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(-1, detector->DecideOutlier(0.1));
60*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(0, detector->DecideOutlier(0.2));
61*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(0, detector->DecideOutlier(0.3));
62*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(0, detector->DecideOutlier(0.4));
63*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(1, detector->DecideOutlier(0.5));
64*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(1, detector->DecideOutlier(1.0));
65*a03ca8b9SKrzysztof Kosiński // Typical usage: Potential outlier would be part of original input data!
66*a03ca8b9SKrzysztof Kosiński detector = make_detector({0.3, 0.29, 0.31, 0.0, 0.3, 0.32, 0.3, 0.29, 0.6});
67*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(-1, detector->DecideOutlier(0.0));
68*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(0, detector->DecideOutlier(0.28));
69*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(0, detector->DecideOutlier(0.29));
70*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(0, detector->DecideOutlier(0.3));
71*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(0, detector->DecideOutlier(0.31));
72*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(0, detector->DecideOutlier(0.32));
73*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(1, detector->DecideOutlier(0.6));
74*a03ca8b9SKrzysztof Kosiński }
75*a03ca8b9SKrzysztof Kosiński
TEST(BinaryDataHistogramTest,Basic)76*a03ca8b9SKrzysztof Kosiński TEST(BinaryDataHistogramTest, Basic) {
77*a03ca8b9SKrzysztof Kosiński constexpr double kUninitScore = -1;
78*a03ca8b9SKrzysztof Kosiński
79*a03ca8b9SKrzysztof Kosiński constexpr uint8_t kTestData[] = {2, 137, 42, 0, 0, 0, 7, 11, 1, 11, 255};
80*a03ca8b9SKrzysztof Kosiński const size_t n = sizeof(kTestData);
81*a03ca8b9SKrzysztof Kosiński ConstBufferView region(kTestData, n);
82*a03ca8b9SKrzysztof Kosiński
83*a03ca8b9SKrzysztof Kosiński std::vector<BinaryDataHistogram> prefix_histograms(n + 1); // Short to long.
84*a03ca8b9SKrzysztof Kosiński std::vector<BinaryDataHistogram> suffix_histograms(n + 1); // Long to short.
85*a03ca8b9SKrzysztof Kosiński
86*a03ca8b9SKrzysztof Kosiński for (size_t i = 0; i <= n; ++i) {
87*a03ca8b9SKrzysztof Kosiński ConstBufferView prefix(region.begin(), i);
88*a03ca8b9SKrzysztof Kosiński ConstBufferView suffix(region.begin() + i, n - i);
89*a03ca8b9SKrzysztof Kosiński // If regions are smaller than 2 bytes then it is invalid. Else valid.
90*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(prefix.size() >= 2, prefix_histograms[i].Compute(prefix));
91*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(suffix.size() >= 2, suffix_histograms[i].Compute(suffix));
92*a03ca8b9SKrzysztof Kosiński // IsValid() returns the same results.
93*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(prefix.size() >= 2, prefix_histograms[i].IsValid());
94*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(suffix.size() >= 2, suffix_histograms[i].IsValid());
95*a03ca8b9SKrzysztof Kosiński }
96*a03ca8b9SKrzysztof Kosiński
97*a03ca8b9SKrzysztof Kosiński // Full-prefix = full-suffix = full data.
98*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(0.0, prefix_histograms[n].Distance(suffix_histograms[0]));
99*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(0.0, suffix_histograms[0].Distance(prefix_histograms[n]));
100*a03ca8b9SKrzysztof Kosiński
101*a03ca8b9SKrzysztof Kosiński // Testing heuristics without overreliance on implementation details.
102*a03ca8b9SKrzysztof Kosiński
103*a03ca8b9SKrzysztof Kosiński // Strict prefixes, in increasing size. Compare against full data.
104*a03ca8b9SKrzysztof Kosiński double prev_prefix_score = kUninitScore;
105*a03ca8b9SKrzysztof Kosiński for (size_t i = 2; i < n; ++i) {
106*a03ca8b9SKrzysztof Kosiński double score = prefix_histograms[i].Distance(prefix_histograms[n]);
107*a03ca8b9SKrzysztof Kosiński // Positivity.
108*a03ca8b9SKrzysztof Kosiński EXPECT_GT(score, 0.0);
109*a03ca8b9SKrzysztof Kosiński // Symmetry.
110*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(score, prefix_histograms[n].Distance(prefix_histograms[i]));
111*a03ca8b9SKrzysztof Kosiński // Distance should decrease as prefix gets nearer to full data.
112*a03ca8b9SKrzysztof Kosiński if (prev_prefix_score != kUninitScore)
113*a03ca8b9SKrzysztof Kosiński EXPECT_LT(score, prev_prefix_score);
114*a03ca8b9SKrzysztof Kosiński prev_prefix_score = score;
115*a03ca8b9SKrzysztof Kosiński }
116*a03ca8b9SKrzysztof Kosiński
117*a03ca8b9SKrzysztof Kosiński // Strict suffixes, in decreasing size. Compare against full data.
118*a03ca8b9SKrzysztof Kosiński double prev_suffix_score = -1;
119*a03ca8b9SKrzysztof Kosiński for (size_t i = 1; i <= n - 2; ++i) {
120*a03ca8b9SKrzysztof Kosiński double score = suffix_histograms[i].Distance(suffix_histograms[0]);
121*a03ca8b9SKrzysztof Kosiński // Positivity.
122*a03ca8b9SKrzysztof Kosiński EXPECT_GT(score, 0.0);
123*a03ca8b9SKrzysztof Kosiński // Symmetry.
124*a03ca8b9SKrzysztof Kosiński EXPECT_EQ(score, suffix_histograms[0].Distance(suffix_histograms[i]));
125*a03ca8b9SKrzysztof Kosiński // Distance should increase as suffix gets farther from full data.
126*a03ca8b9SKrzysztof Kosiński if (prev_suffix_score != kUninitScore)
127*a03ca8b9SKrzysztof Kosiński EXPECT_GT(score, prev_suffix_score);
128*a03ca8b9SKrzysztof Kosiński prev_suffix_score = score;
129*a03ca8b9SKrzysztof Kosiński }
130*a03ca8b9SKrzysztof Kosiński }
131*a03ca8b9SKrzysztof Kosiński
132*a03ca8b9SKrzysztof Kosiński } // namespace zucchini
133