1 //! Tukey's method
2 //!
3 //! The original method uses two "fences" to classify the data. All the observations "inside" the
4 //! fences are considered "normal", and the rest are considered outliers.
5 //!
6 //! The fences are computed from the quartiles of the sample, according to the following formula:
7 //!
8 //! ``` ignore
9 //! // q1, q3 are the first and third quartiles
10 //! let iqr = q3 - q1;  // The interquartile range
11 //! let (f1, f2) = (q1 - 1.5 * iqr, q3 + 1.5 * iqr);  // the "fences"
12 //!
13 //! let is_outlier = |x| if x > f1 && x < f2 { true } else { false };
14 //! ```
15 //!
16 //! The classifier provided here adds two extra outer fences:
17 //!
18 //! ``` ignore
19 //! let (f3, f4) = (q1 - 3 * iqr, q3 + 3 * iqr);  // the outer "fences"
20 //! ```
21 //!
22 //! The extra fences add a sense of "severity" to the classification. Data points outside of the
23 //! outer fences are considered "severe" outliers, whereas points outside the inner fences are just
24 //! "mild" outliers, and, as the original method, everything inside the inner fences is considered
25 //! "normal" data.
26 //!
27 //! Some ASCII art for the visually oriented people:
28 //!
29 //! ``` ignore
30 //!          LOW-ish                NORMAL-ish                 HIGH-ish
31 //!         x   |       +    |  o o  o    o   o o  o  |        +   |   x
32 //!             f3           f1                       f2           f4
33 //!
34 //! Legend:
35 //! o: "normal" data (not an outlier)
36 //! +: "mild" outlier
37 //! x: "severe" outlier
38 //! ```
39 
40 use std::iter::IntoIterator;
41 use std::ops::{Deref, Index};
42 use std::slice;
43 
44 use crate::stats::float::Float;
45 use crate::stats::univariate::Sample;
46 
47 use self::Label::*;
48 
49 /// A classified/labeled sample.
50 ///
51 /// The labeled data can be accessed using the indexing operator. The order of the data points is
52 /// retained.
53 ///
54 /// NOTE: Due to limitations in the indexing traits, only the label is returned. Once the
55 /// `IndexGet` trait lands in stdlib, the indexing operation will return a `(data_point, label)`
56 /// pair.
57 #[derive(Clone, Copy)]
58 pub struct LabeledSample<'a, A>
59 where
60     A: Float,
61 {
62     fences: (A, A, A, A),
63     sample: &'a Sample<A>,
64 }
65 
66 impl<'a, A> LabeledSample<'a, A>
67 where
68     A: Float,
69 {
70     /// Returns the number of data points per label
71     ///
72     /// - Time: `O(length)`
73     #[cfg_attr(feature = "cargo-clippy", allow(clippy::similar_names))]
count(&self) -> (usize, usize, usize, usize, usize)74     pub fn count(&self) -> (usize, usize, usize, usize, usize) {
75         let (mut los, mut lom, mut noa, mut him, mut his) = (0, 0, 0, 0, 0);
76 
77         for (_, label) in self {
78             match label {
79                 LowSevere => {
80                     los += 1;
81                 }
82                 LowMild => {
83                     lom += 1;
84                 }
85                 NotAnOutlier => {
86                     noa += 1;
87                 }
88                 HighMild => {
89                     him += 1;
90                 }
91                 HighSevere => {
92                     his += 1;
93                 }
94             }
95         }
96 
97         (los, lom, noa, him, his)
98     }
99 
100     /// Returns the fences used to classify the outliers
fences(&self) -> (A, A, A, A)101     pub fn fences(&self) -> (A, A, A, A) {
102         self.fences
103     }
104 
105     /// Returns an iterator over the labeled data
iter(&self) -> Iter<'a, A>106     pub fn iter(&self) -> Iter<'a, A> {
107         Iter {
108             fences: self.fences,
109             iter: self.sample.iter(),
110         }
111     }
112 }
113 
114 impl<'a, A> Deref for LabeledSample<'a, A>
115 where
116     A: Float,
117 {
118     type Target = Sample<A>;
119 
deref(&self) -> &Sample<A>120     fn deref(&self) -> &Sample<A> {
121         self.sample
122     }
123 }
124 
125 // FIXME Use the `IndexGet` trait
126 impl<'a, A> Index<usize> for LabeledSample<'a, A>
127 where
128     A: Float,
129 {
130     type Output = Label;
131 
132     #[cfg_attr(feature = "cargo-clippy", allow(clippy::similar_names))]
index(&self, i: usize) -> &Label133     fn index(&self, i: usize) -> &Label {
134         static LOW_SEVERE: Label = LowSevere;
135         static LOW_MILD: Label = LowMild;
136         static HIGH_MILD: Label = HighMild;
137         static HIGH_SEVERE: Label = HighSevere;
138         static NOT_AN_OUTLIER: Label = NotAnOutlier;
139 
140         let x = self.sample[i];
141         let (lost, lomt, himt, hist) = self.fences;
142 
143         if x < lost {
144             &LOW_SEVERE
145         } else if x > hist {
146             &HIGH_SEVERE
147         } else if x < lomt {
148             &LOW_MILD
149         } else if x > himt {
150             &HIGH_MILD
151         } else {
152             &NOT_AN_OUTLIER
153         }
154     }
155 }
156 
157 impl<'a, 'b, A> IntoIterator for &'b LabeledSample<'a, A>
158 where
159     A: Float,
160 {
161     type Item = (A, Label);
162     type IntoIter = Iter<'a, A>;
163 
into_iter(self) -> Iter<'a, A>164     fn into_iter(self) -> Iter<'a, A> {
165         self.iter()
166     }
167 }
168 
169 /// Iterator over the labeled data
170 pub struct Iter<'a, A>
171 where
172     A: Float,
173 {
174     fences: (A, A, A, A),
175     iter: slice::Iter<'a, A>,
176 }
177 
178 impl<'a, A> Iterator for Iter<'a, A>
179 where
180     A: Float,
181 {
182     type Item = (A, Label);
183 
184     #[cfg_attr(feature = "cargo-clippy", allow(clippy::similar_names))]
next(&mut self) -> Option<(A, Label)>185     fn next(&mut self) -> Option<(A, Label)> {
186         self.iter.next().map(|&x| {
187             let (lost, lomt, himt, hist) = self.fences;
188 
189             let label = if x < lost {
190                 LowSevere
191             } else if x > hist {
192                 HighSevere
193             } else if x < lomt {
194                 LowMild
195             } else if x > himt {
196                 HighMild
197             } else {
198                 NotAnOutlier
199             };
200 
201             (x, label)
202         })
203     }
204 
size_hint(&self) -> (usize, Option<usize>)205     fn size_hint(&self) -> (usize, Option<usize>) {
206         self.iter.size_hint()
207     }
208 }
209 
210 /// Labels used to classify outliers
211 pub enum Label {
212     /// A "mild" outlier in the "high" spectrum
213     HighMild,
214     /// A "severe" outlier in the "high" spectrum
215     HighSevere,
216     /// A "mild" outlier in the "low" spectrum
217     LowMild,
218     /// A "severe" outlier in the "low" spectrum
219     LowSevere,
220     /// A normal data point
221     NotAnOutlier,
222 }
223 
224 impl Label {
225     /// Checks if the data point has an "unusually" high value
is_high(&self) -> bool226     pub fn is_high(&self) -> bool {
227         matches!(*self, HighMild | HighSevere)
228     }
229 
230     /// Checks if the data point is labeled as a "mild" outlier
is_mild(&self) -> bool231     pub fn is_mild(&self) -> bool {
232         matches!(*self, HighMild | LowMild)
233     }
234 
235     /// Checks if the data point has an "unusually" low value
is_low(&self) -> bool236     pub fn is_low(&self) -> bool {
237         matches!(*self, LowMild | LowSevere)
238     }
239 
240     /// Checks if the data point is labeled as an outlier
is_outlier(&self) -> bool241     pub fn is_outlier(&self) -> bool {
242         matches!(*self, NotAnOutlier)
243     }
244 
245     /// Checks if the data point is labeled as a "severe" outlier
is_severe(&self) -> bool246     pub fn is_severe(&self) -> bool {
247         matches!(*self, HighSevere | LowSevere)
248     }
249 }
250 
251 /// Classifies the sample, and returns a labeled sample.
252 ///
253 /// - Time: `O(N log N) where N = length`
classify<A>(sample: &Sample<A>) -> LabeledSample<'_, A> where A: Float, usize: cast::From<A, Output = Result<usize, cast::Error>>,254 pub fn classify<A>(sample: &Sample<A>) -> LabeledSample<'_, A>
255 where
256     A: Float,
257     usize: cast::From<A, Output = Result<usize, cast::Error>>,
258 {
259     let (q1, _, q3) = sample.percentiles().quartiles();
260     let iqr = q3 - q1;
261 
262     // Mild
263     let k_m = A::cast(1.5_f32);
264     // Severe
265     let k_s = A::cast(3);
266 
267     LabeledSample {
268         fences: (
269             q1 - k_s * iqr,
270             q1 - k_m * iqr,
271             q3 + k_m * iqr,
272             q3 + k_s * iqr,
273         ),
274         sample,
275     }
276 }
277