From 8aa2911b4c978ad7131a430d07a580cedf6f8f65 Mon Sep 17 00:00:00 2001 From: Tavian Barnes Date: Sun, 19 Apr 2020 16:37:16 -0400 Subject: metric: Add some general interfaces for metric spaces --- Cargo.lock | 104 ++++++++++++ Cargo.toml | 4 + src/main.rs | 2 + src/metric.rs | 531 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 641 insertions(+) create mode 100644 src/metric.rs diff --git a/Cargo.lock b/Cargo.lock index a63d677..45602e1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,5 +1,109 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. +[[package]] +name = "autocfg" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8aac770f1885fd7e387acedd76065302551364496e46b3dd00860b2f8359b9d" + +[[package]] +name = "cfg-if" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" + +[[package]] +name = "getrandom" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7abc8dd8451921606d809ba32e95b6111925cd2906060d2dcc29c070220503eb" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + [[package]] name = "kd-forest" version = "2.0.0" +dependencies = [ + "ordered-float", + "rand", +] + +[[package]] +name = "libc" +version = "0.2.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99e85c08494b21a9054e7fe1374a732aeadaff3980b6990b94bfd3a70f690005" + +[[package]] +name = "num-traits" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c62be47e61d1842b9170f0fdeec8eba98e60e90e5446449a0545e5152acd7096" +dependencies = [ + "autocfg", +] + +[[package]] +name = "ordered-float" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18869315e81473c951eb56ad5558bbc56978562d3ecfb87abb7a1e944cea4518" +dependencies = [ + "num-traits", +] + +[[package]] +name = "ppv-lite86" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74490b50b9fbe561ac330df47c08f3f33073d2d00c150f719147d7c54522fa1b" + +[[package]] +name = "rand" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" +dependencies = [ + "getrandom", + "libc", + "rand_chacha", + "rand_core", + "rand_hc", +] + +[[package]] +name = "rand_chacha" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rand_hc" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" +dependencies = [ + "rand_core", +] + +[[package]] +name = "wasi" +version = "0.9.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" diff --git a/Cargo.toml b/Cargo.toml index 65ffec8..5a93cf5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,3 +3,7 @@ name = "kd-forest" version = "2.0.0" authors = ["Tavian Barnes "] edition = "2018" + +[dependencies] +ordered-float = "1.0.2" +rand = "0.7.3" diff --git a/src/main.rs b/src/main.rs index f328e4d..0d7989b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1 +1,3 @@ +pub mod metric; + fn main() {} diff --git a/src/metric.rs b/src/metric.rs new file mode 100644 index 0000000..e067771 --- /dev/null +++ b/src/metric.rs @@ -0,0 +1,531 @@ +//! [Metric spaces](https://en.wikipedia.org/wiki/Metric_space). + +use ordered_float::OrderedFloat; + +use std::cmp::Ordering; +use std::collections::BinaryHeap; +use std::iter::FromIterator; + +/// An [order embedding](https://en.wikipedia.org/wiki/Order_embedding) for distances. +/// +/// Implementations of this trait must satisfy, for all non-negative distances `x` and `y`: +/// +/// * `x == Self::from(x).into()` +/// * `x <= y` iff `Self::from(x) <= Self::from(y)` +/// +/// This trait exists to optimize the common case where distances can be compared more efficiently +/// than their exact values can be computed. For example, taking the square root can be avoided +/// when comparing Euclidean distances (see [SquaredDistance]). +pub trait Distance: Copy + From + Into + Ord {} + +/// A raw numerical distance. +#[derive(Debug, Clone, Copy, Eq, Ord, PartialEq, PartialOrd)] +pub struct RawDistance(OrderedFloat); + +impl From for RawDistance { + fn from(value: f64) -> Self { + Self(value.into()) + } +} + +impl From for f64 { + fn from(value: RawDistance) -> Self { + value.0.into_inner() + } +} + +impl Distance for RawDistance {} + +/// A squared distance, to avoid computing square roots unless absolutely necessary. +#[derive(Debug, Clone, Copy, Eq, Ord, PartialEq, PartialOrd)] +pub struct SquaredDistance(OrderedFloat); + +impl SquaredDistance { + /// Create a SquaredDistance from an already squared value. + pub fn from_squared(value: f64) -> Self { + Self(value.into()) + } +} + +impl From for SquaredDistance { + fn from(value: f64) -> Self { + Self::from_squared(value * value) + } +} + +impl From for f64 { + fn from(value: SquaredDistance) -> Self { + value.0.into_inner().sqrt() + } +} + +impl Distance for SquaredDistance {} + +/// A [metric space](https://en.wikipedia.org/wiki/Metric_space). +pub trait Metric { + /// The type used to represent distances. Use [RawDistance] to compare the actual values + /// directly, or another type if comparisons can be implemented more efficiently. + type Distance: Distance; + + /// Computes the distance between this point and another point. This function must satisfy + /// three conditions: + /// + /// * `x.distance(y) == 0` iff `x == y` (identity of indiscernibles) + /// * `x.distance(y) == y.distance(x)` (symmetry) + /// * `x.distance(z) <= x.distance(y) + y.distance(z)` (triangle inequality) + fn distance(&self, other: &T) -> Self::Distance; +} + +/// Blanket [Metric] implementation for references. +impl<'a, 'b, T, U: Metric> Metric<&'a T> for &'b U { + type Distance = U::Distance; + + fn distance(&self, other: &&'a T) -> Self::Distance { + (*self).distance(other) + } +} + +/// The standard [Euclidean distance](https://en.wikipedia.org/wiki/Euclidean_distance) metric. +impl Metric for [f64] { + type Distance = SquaredDistance; + + fn distance(&self, other: &Self) -> Self::Distance { + debug_assert!(self.len() == other.len()); + + let mut sum = 0.0; + for i in 0..self.len() { + let diff = self[i] - other[i]; + sum += diff * diff; + } + + Self::Distance::from_squared(sum) + } +} + +/// A nearest neighbor to a target. +#[derive(Clone, Copy, Debug, PartialEq)] +pub struct Neighbor { + /// The found item. + pub item: T, + /// The distance from the target. + pub distance: f64, +} + +impl Neighbor { + /// Create a new Neighbor. + pub fn new(item: T, distance: f64) -> Self { + Self { item, distance } + } +} + +/// A candidate nearest neighbor found during a search. +#[derive(Debug)] +struct Candidate { + item: T, + distance: D, +} + +impl Candidate { + fn new(target: U, item: T) -> Self + where + U: Metric, + { + let distance = target.distance(&item); + Self { item, distance } + } + + fn into_neighbor(self) -> Neighbor { + Neighbor::new(self.item, self.distance.into()) + } +} + +impl PartialOrd for Candidate { + fn partial_cmp(&self, other: &Self) -> Option { + self.distance.partial_cmp(&other.distance) + } +} + +impl Ord for Candidate { + fn cmp(&self, other: &Self) -> Ordering { + self.distance.cmp(&other.distance) + } +} + +impl PartialEq for Candidate { + fn eq(&self, other: &Self) -> bool { + self.distance.eq(&other.distance) + } +} + +impl Eq for Candidate {} + +/// Accumulates nearest neighbor search results. +pub trait Neighborhood> { + /// Returns the target of the nearest neighbor search. + fn target(&self) -> U; + + /// Check whether a distance is within this neighborhood. + fn contains(&self, distance: f64) -> bool { + distance < 0.0 || self.contains_distance(distance.into()) + } + + /// Check whether a distance is within this neighborhood. + fn contains_distance(&self, distance: U::Distance) -> bool; + + /// Consider a new candidate neighbor. + fn consider(&mut self, item: T) -> U::Distance; +} + +/// A [Neighborhood] with at most one result. +#[derive(Debug)] +struct SingletonNeighborhood> { + /// The target of the nearest neighbor search. + target: U, + /// The current threshold distance to the farthest result. + threshold: Option, + /// The current nearest neighbor, if any. + candidate: Option>, +} + +impl SingletonNeighborhood +where + U: Copy + Metric, +{ + /// Create a new single metric result tracker. + /// + /// * `target`: The target fo the nearest neighbor search. + /// * `threshold`: The maximum allowable distance. + fn new(target: U, threshold: Option) -> Self { + Self { + target, + threshold: threshold.map(U::Distance::from), + candidate: None, + } + } + + /// Consider a candidate. + fn push(&mut self, candidate: Candidate) -> U::Distance { + let distance = candidate.distance; + + if self.contains_distance(distance) { + self.threshold = Some(distance); + self.candidate = Some(candidate); + } + + distance + } + + /// Convert this result into an optional neighbor. + fn into_option(self) -> Option> { + self.candidate.map(Candidate::into_neighbor) + } +} + +impl Neighborhood for SingletonNeighborhood +where + U: Copy + Metric, +{ + fn target(&self) -> U { + self.target + } + + fn contains_distance(&self, distance: U::Distance) -> bool { + self.threshold.map(|t| distance <= t).unwrap_or(true) + } + + fn consider(&mut self, item: T) -> U::Distance { + self.push(Candidate::new(self.target, item)) + } +} + +/// A [Neighborhood] of up to `k` results, using a binary heap. +#[derive(Debug)] +struct HeapNeighborhood> { + /// The target of the nearest neighbor search. + target: U, + /// The number of nearest neighbors to find. + k: usize, + /// The current threshold distance to the farthest result. + threshold: Option, + /// A max-heap of the best candidates found so far. + heap: BinaryHeap>, +} + +impl HeapNeighborhood +where + U: Copy + Metric, +{ + /// Create a new metric result tracker. + /// + /// * `target`: The target fo the nearest neighbor search. + /// * `k`: The number of nearest neighbors to find. + /// * `threshold`: The maximum allowable distance. + fn new(target: U, k: usize, threshold: Option) -> Self { + Self { + target, + k, + threshold: threshold.map(U::Distance::from), + heap: BinaryHeap::with_capacity(k), + } + } + + /// Consider a candidate. + fn push(&mut self, candidate: Candidate) -> U::Distance { + let distance = candidate.distance; + + if self.contains_distance(distance) { + let heap = &mut self.heap; + + if heap.len() == self.k { + heap.pop(); + } + + heap.push(candidate); + + if heap.len() == self.k { + self.threshold = self.heap.peek().map(|c| c.distance) + } + } + + distance + } + + /// Convert these results into a vector of neighbors. + fn into_vec(self) -> Vec> { + self.heap + .into_sorted_vec() + .into_iter() + .map(Candidate::into_neighbor) + .collect() + } +} + +impl Neighborhood for HeapNeighborhood +where + U: Copy + Metric, +{ + fn target(&self) -> U { + self.target + } + + fn contains_distance(&self, distance: U::Distance) -> bool { + self.k > 0 && self.threshold.map(|t| distance <= t).unwrap_or(true) + } + + fn consider(&mut self, item: T) -> U::Distance { + self.push(Candidate::new(self.target, item)) + } +} + +/// A [nearest neighbor search](https://en.wikipedia.org/wiki/Nearest_neighbor_search) index. +/// +/// Type parameters: +/// * `T`: The search result type. +/// * `U`: The query type. +pub trait NearestNeighbors = T> { + /// Returns the nearest neighbor to `target` (or `None` if this index is empty). + fn nearest(&self, target: &U) -> Option> { + self.search(SingletonNeighborhood::new(target, None)) + .into_option() + } + + /// Returns the nearest neighbor to `target` within the distance `threshold`, if one exists. + fn nearest_within(&self, target: &U, threshold: f64) -> Option> { + self.search(SingletonNeighborhood::new(target, Some(threshold))) + .into_option() + } + + /// Returns the up to `k` nearest neighbors to `target`. + fn k_nearest(&self, target: &U, k: usize) -> Vec> { + self.search(HeapNeighborhood::new(target, k, None)) + .into_vec() + } + + /// Returns the up to `k` nearest neighbors to `target` within the distance `threshold`. + fn k_nearest_within(&self, target: &U, k: usize, threshold: f64) -> Vec> { + self.search(HeapNeighborhood::new(target, k, Some(threshold))) + .into_vec() + } + + /// Search for nearest neighbors and add them to a neighborhood. + fn search<'a, 'b, N>(&'a self, neighborhood: N) -> N + where + T: 'a, + U: 'b, + N: Neighborhood<&'a T, &'b U>; +} + +/// A [NearestNeighbors] implementation that does exhaustive search. +#[derive(Debug)] +pub struct ExhaustiveSearch(Vec); + +impl ExhaustiveSearch { + /// Create an empty ExhaustiveSearch index. + pub fn new() -> Self { + Self(Vec::new()) + } + + /// Add a new item to the index. + pub fn push(&mut self, item: T) { + self.0.push(item); + } +} + +impl FromIterator for ExhaustiveSearch { + fn from_iter>(items: I) -> Self { + Self(items.into_iter().collect()) + } +} + +impl IntoIterator for ExhaustiveSearch { + type Item = T; + type IntoIter = std::vec::IntoIter; + + fn into_iter(self) -> Self::IntoIter { + self.0.into_iter() + } +} + +impl Extend for ExhaustiveSearch { + fn extend>(&mut self, iter: I) { + for value in iter { + self.push(value); + } + } +} + +impl> NearestNeighbors for ExhaustiveSearch { + fn search<'a, 'b, N>(&'a self, mut neighborhood: N) -> N + where + T: 'a, + U: 'b, + N: Neighborhood<&'a T, &'b U>, + { + for e in &self.0 { + neighborhood.consider(e); + } + neighborhood + } +} + +#[cfg(test)] +pub mod tests { + use super::*; + + use rand::prelude::*; + + #[derive(Clone, Copy, Debug, PartialEq)] + pub struct Point(pub [f64; 3]); + + impl Metric for Point { + type Distance = SquaredDistance; + + fn distance(&self, other: &Self) -> Self::Distance { + self.0.distance(&other.0) + } + } + + /// Test a [NearestNeighbors] impl. + pub fn test_nearest_neighbors(from_iter: F) + where + T: NearestNeighbors, + F: Fn(Vec) -> T, + { + test_empty(&from_iter); + test_pythagorean(&from_iter); + test_random_points(&from_iter); + } + + fn test_empty(from_iter: &F) + where + T: NearestNeighbors, + F: Fn(Vec) -> T, + { + let points = Vec::new(); + let index = from_iter(points); + let target = Point([0.0, 0.0, 0.0]); + assert_eq!(index.nearest(&target), None); + assert_eq!(index.nearest_within(&target, 1.0), None); + assert!(index.k_nearest(&target, 0).is_empty()); + assert!(index.k_nearest(&target, 3).is_empty()); + assert!(index.k_nearest_within(&target, 0, 1.0).is_empty()); + assert!(index.k_nearest_within(&target, 3, 1.0).is_empty()); + } + + fn test_pythagorean(from_iter: &F) + where + T: NearestNeighbors, + F: Fn(Vec) -> T, + { + let points = vec![ + Point([3.0, 4.0, 0.0]), + Point([5.0, 0.0, 12.0]), + Point([0.0, 8.0, 15.0]), + Point([1.0, 2.0, 2.0]), + Point([2.0, 3.0, 6.0]), + Point([4.0, 4.0, 7.0]), + ]; + let index = from_iter(points); + let target = Point([0.0, 0.0, 0.0]); + + assert_eq!( + index.nearest(&target), + Some(Neighbor::new(&Point([1.0, 2.0, 2.0]), 3.0)) + ); + + assert_eq!(index.nearest_within(&target, 2.0), None); + assert_eq!( + index.nearest_within(&target, 4.0), + Some(Neighbor::new(&Point([1.0, 2.0, 2.0]), 3.0)) + ); + + assert!(index.k_nearest(&target, 0).is_empty()); + assert_eq!( + index.k_nearest(&target, 3), + vec![ + Neighbor::new(&Point([1.0, 2.0, 2.0]), 3.0), + Neighbor::new(&Point([3.0, 4.0, 0.0]), 5.0), + Neighbor::new(&Point([2.0, 3.0, 6.0]), 7.0), + ] + ); + + assert!(index.k_nearest(&target, 0).is_empty()); + assert_eq!( + index.k_nearest_within(&target, 3, 6.0), + vec![ + Neighbor::new(&Point([1.0, 2.0, 2.0]), 3.0), + Neighbor::new(&Point([3.0, 4.0, 0.0]), 5.0), + ] + ); + assert_eq!( + index.k_nearest_within(&target, 3, 8.0), + vec![ + Neighbor::new(&Point([1.0, 2.0, 2.0]), 3.0), + Neighbor::new(&Point([3.0, 4.0, 0.0]), 5.0), + Neighbor::new(&Point([2.0, 3.0, 6.0]), 7.0), + ] + ); + } + + fn test_random_points(from_iter: &F) + where + T: NearestNeighbors, + F: Fn(Vec) -> T, + { + let mut points = Vec::new(); + for _ in 0..255 { + points.push(Point([random(), random(), random()])); + } + let target = Point([random(), random(), random()]); + + let eindex = ExhaustiveSearch::from_iter(points.clone()); + let index = from_iter(points); + + assert_eq!(index.k_nearest(&target, 3), eindex.k_nearest(&target, 3)); + } + + #[test] + fn test_exhaustive_index() { + test_nearest_neighbors(ExhaustiveSearch::from_iter); + } +} -- cgit v1.2.3