tukey.rs - OpenGrok cross reference for /aosp_15_r20/external/rust/android-crates-io/crates/criterion/src/stats/univariate/outliers/tukey.rs

//! Tukey's method
//!
//! The original method uses two "fences" to classify the data. All the observations "inside" the
//! fences are considered "normal", and the rest are considered outliers.
//!
//! The fences are computed from the quartiles of the sample, according to the following formula:
//!
//! ``` ignore
//! // q1, q3 are the first and third quartiles
//! let iqr = q3 - q1;  // The interquartile range
//! let (f1, f2) = (q1 - 1.5 * iqr, q3 + 1.5 * iqr);  // the "fences"
//!
//! let is_outlier = |x| if x > f1 && x < f2 { true } else { false };
//! ```
//!
//! The classifier provided here adds two extra outer fences:
//!
//! ``` ignore
//! let (f3, f4) = (q1 - 3 * iqr, q3 + 3 * iqr);  // the outer "fences"
//! ```
//!
//! The extra fences add a sense of "severity" to the classification. Data points outside of the
//! outer fences are considered "severe" outliers, whereas points outside the inner fences are just
//! "mild" outliers, and, as the original method, everything inside the inner fences is considered
//! "normal" data.
//!
//! Some ASCII art for the visually oriented people:
//!
//! ``` ignore
//!          LOW-ish                NORMAL-ish                 HIGH-ish
//!         x   |       +    |  o o  o    o   o o  o  |        +   |   x
//!             f3           f1                       f2           f4
//!
//! Legend:
//! o: "normal" data (not an outlier)
//! +: "mild" outlier
//! x: "severe" outlier
//! ```

use std::iter::IntoIterator;
use std::ops::{Deref, Index};
use std::slice;

use crate::stats::float::Float;
use crate::stats::univariate::Sample;

use self::Label::*;

/// A classified/labeled sample.
///
/// The labeled data can be accessed using the indexing operator. The order of the data points is
/// retained.
///
/// NOTE: Due to limitations in the indexing traits, only the label is returned. Once the
/// `IndexGet` trait lands in stdlib, the indexing operation will return a `(data_point, label)`
/// pair.
#[derive(Clone, Copy)]
pub struct LabeledSample<'a, A>
where
    A: Float,
{
    fences: (A, A, A, A),
    sample: &'a Sample<A>,
}

impl<'a, A> LabeledSample<'a, A>
where
    A: Float,
{
    /// Returns the number of data points per label
    ///
    /// - Time: `O(length)`
    #[cfg_attr(feature = "cargo-clippy", allow(clippy::similar_names))]
    pub fn count(&self) -> (usize, usize, usize, usize, usize) {
        let (mut los, mut lom, mut noa, mut him, mut his) = (0, 0, 0, 0, 0);

        for (_, label) in self {
            match label {
                LowSevere => {
                    los += 1;
                }
                LowMild => {
                    lom += 1;
                }
                NotAnOutlier => {
                    noa += 1;
                }
                HighMild => {
                    him += 1;
                }
                HighSevere => {
                    his += 1;
                }
            }
        }

        (los, lom, noa, him, his)
    }

    /// Returns the fences used to classify the outliers
    pub fn fences(&self) -> (A, A, A, A) {
        self.fences
    }

    /// Returns an iterator over the labeled data
    pub fn iter(&self) -> Iter<'a, A> {
        Iter {
            fences: self.fences,
            iter: self.sample.iter(),
        }
    }
}

impl<'a, A> Deref for LabeledSample<'a, A>
where
    A: Float,
{
    type Target = Sample<A>;

    fn deref(&self) -> &Sample<A> {
        self.sample
    }
}

// FIXME Use the `IndexGet` trait
impl<'a, A> Index<usize> for LabeledSample<'a, A>
where
    A: Float,
{
    type Output = Label;

    #[cfg_attr(feature = "cargo-clippy", allow(clippy::similar_names))]
    fn index(&self, i: usize) -> &Label {
        static LOW_SEVERE: Label = LowSevere;
        static LOW_MILD: Label = LowMild;
        static HIGH_MILD: Label = HighMild;
        static HIGH_SEVERE: Label = HighSevere;
        static NOT_AN_OUTLIER: Label = NotAnOutlier;

        let x = self.sample[i];
        let (lost, lomt, himt, hist) = self.fences;

        if x < lost {
            &LOW_SEVERE
        } else if x > hist {
            &HIGH_SEVERE
        } else if x < lomt {
            &LOW_MILD
        } else if x > himt {
            &HIGH_MILD
        } else {
            &NOT_AN_OUTLIER
        }
    }
}

impl<'a, 'b, A> IntoIterator for &'b LabeledSample<'a, A>
where
    A: Float,
{
    type Item = (A, Label);
    type IntoIter = Iter<'a, A>;

    fn into_iter(self) -> Iter<'a, A> {
        self.iter()
    }
}

/// Iterator over the labeled data
pub struct Iter<'a, A>
where
    A: Float,
{
    fences: (A, A, A, A),
    iter: slice::Iter<'a, A>,
}

impl<'a, A> Iterator for Iter<'a, A>
where
    A: Float,
{
    type Item = (A, Label);

    #[cfg_attr(feature = "cargo-clippy", allow(clippy::similar_names))]
    fn next(&mut self) -> Option<(A, Label)> {
        self.iter.next().map(|&x| {
            let (lost, lomt, himt, hist) = self.fences;

            let label = if x < lost {
                LowSevere
            } else if x > hist {
                HighSevere
            } else if x < lomt {
                LowMild
            } else if x > himt {
                HighMild
            } else {
                NotAnOutlier
            };

            (x, label)
        })
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        self.iter.size_hint()
    }
}

/// Labels used to classify outliers
pub enum Label {
    /// A "mild" outlier in the "high" spectrum
    HighMild,
    /// A "severe" outlier in the "high" spectrum
    HighSevere,
    /// A "mild" outlier in the "low" spectrum
    LowMild,
    /// A "severe" outlier in the "low" spectrum
    LowSevere,
    /// A normal data point
    NotAnOutlier,
}

impl Label {
    /// Checks if the data point has an "unusually" high value
    pub fn is_high(&self) -> bool {
        matches!(*self, HighMild | HighSevere)
    }

    /// Checks if the data point is labeled as a "mild" outlier
    pub fn is_mild(&self) -> bool {
        matches!(*self, HighMild | LowMild)
    }

    /// Checks if the data point has an "unusually" low value
    pub fn is_low(&self) -> bool {
        matches!(*self, LowMild | LowSevere)
    }

    /// Checks if the data point is labeled as an outlier
    pub fn is_outlier(&self) -> bool {
        matches!(*self, NotAnOutlier)
    }

    /// Checks if the data point is labeled as a "severe" outlier
    pub fn is_severe(&self) -> bool {
        matches!(*self, HighSevere | LowSevere)
    }
}

/// Classifies the sample, and returns a labeled sample.
///
/// - Time: `O(N log N) where N = length`
pub fn classify<A>(sample: &Sample<A>) -> LabeledSample<'_, A>
where
    A: Float,
    usize: cast::From<A, Output = Result<usize, cast::Error>>,
{
    let (q1, _, q3) = sample.percentiles().quartiles();
    let iqr = q3 - q1;

    // Mild
    let k_m = A::cast(1.5_f32);
    // Severe
    let k_s = A::cast(3);

    LabeledSample {
        fences: (
            q1 - k_s * iqr,
            q1 - k_m * iqr,
            q3 + k_m * iqr,
            q3 + k_s * iqr,
        ),
        sample,
    }
}