1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204
//! Common metrics for clustering
use crate::dataset::{AsSingleTargets, DatasetBase, Label, Labels, Records};
use crate::error::Result;
use crate::Float;
use ndarray::{ArrayBase, ArrayView1, Data, Ix2};
use std::collections::HashMap;
use std::ops::Sub;
/// Evaluates the quality of a clustering using euclidean distance.
pub trait SilhouetteScore<F> {
/// Evaluates the quality of a clustering.
///
/// Given a clustered dataset,
/// the silhouette score for each sample is computed as
/// the relative difference between the average distance
/// of the sample to other samples in the same cluster and
/// the minimum average distance of the sample to samples in
/// another cluster. This value goes from -1 to +1 when the point
/// is respectively closer (in average) to points in another cluster and to points in its own cluster.
///
/// Finally, the silhouette score for the clustering is evaluated as the mean
/// silhouette score of each sample.
fn silhouette_score(&self) -> Result<F>;
}
struct DistanceCount<F> {
total_distance: F,
count: usize,
}
impl<F: Float> DistanceCount<F> {
/// Sets the total distance from the sample to this cluster to zero
pub fn reset(&mut self) {
self.total_distance = F::zero();
}
pub fn new(count: usize) -> DistanceCount<F> {
DistanceCount {
total_distance: F::zero(),
count,
}
}
/// Divides the total distance from the sample to this cluster by the number of samples in the cluster
pub fn mean_distance(&self) -> F {
self.total_distance / F::cast(self.count)
}
/// To be used in the cluster in which the sample is located. The distance from the sample to itself
/// is zero so it does not get added to the total distance. We can then just divide the total
/// distance by 1 - #samples in this cluster
pub fn same_label_mean_distance(&self) -> F {
if self.count == 1 {
return F::zero();
}
self.total_distance / F::cast(self.count - 1)
}
/// adds the distance of `other_sample` from `eval_sample` to the total distance of `eval_sample` from the current cluster
pub fn add_point(&mut self, eval_sample: ArrayView1<F>, other_sample: ArrayView1<F>) {
self.total_distance += eval_sample.sub(&other_sample).mapv(|x| x * x).sum().sqrt();
}
}
impl<
'a,
F: Float,
L: 'a + Label,
D: Data<Elem = F>,
T: AsSingleTargets<Elem = L> + Labels<Elem = L>,
> SilhouetteScore<F> for DatasetBase<ArrayBase<D, Ix2>, T>
{
fn silhouette_score(&self) -> Result<F> {
let mut labels: HashMap<L, DistanceCount<F>> = self
.label_count()
.remove(0)
.into_iter()
.map(|(label, count)| (label, DistanceCount::new(count)))
.collect();
// Single label dataset, all points are in the same cluster.
if labels.len() == 1 {
return Ok(F::one());
}
// Compute and sum silhouette score for each sample
let score = self
.sample_iter()
.map(|sample| {
// Loops through all samples in the dataset and adds
// the distance between them and `sample` to the cluster
// in which they belong
for other in self.sample_iter() {
labels
.get_mut(other.1.into_scalar())
.unwrap()
.add_point(sample.0, other.0);
}
// average distance from `sample` to points in its cluster
let mut a_x = F::zero();
// minimum average distance from `sample` to another cluster
// set to none so that it can be initialized as the first value
let mut b_x: Option<F> = None;
for (label, counter) in &mut labels {
if sample.1.into_scalar() == label {
// The cluster of `sample` averages by excluding `sample` from the counting
a_x = counter.same_label_mean_distance();
} else {
// Keep the minimum average distance
b_x = match b_x {
None => Some(counter.mean_distance()),
Some(v) => {
if counter.mean_distance() < v {
Some(counter.mean_distance())
} else {
Some(v)
}
}
}
}
counter.reset()
}
// Since the single label case was taken care of earlier, here there are at least
// two clusters so `b_x` can't be `None`
let b_x = b_x.unwrap();
// s(x) = (b(x) - a(x)) / max{a(x), b(x)}
if a_x >= b_x {
(b_x - a_x) / a_x
} else {
(b_x - a_x) / b_x
}
})
.sum::<F>();
let score = score / F::cast(self.records().nsamples());
Ok(score)
}
}
#[cfg(test)]
mod tests {
use crate::metrics_clustering::SilhouetteScore;
use crate::{Dataset, DatasetBase};
use approx::assert_abs_diff_eq;
use ndarray::{concatenate, Array, Array1, Axis, Ix1};
#[test]
fn test_silhouette_score() {
// Two very far apart clusters, each with its own label.
// This is a very good clustering for silhouette and should return a score very close to +1
let records = concatenate![
Axis(0),
Array::linspace(0f64, 1f64, 10),
Array::linspace(10000f64, 10001f64, 10)
]
.insert_axis(Axis(1));
let records = concatenate![Axis(1), records, records];
let targets = concatenate![Axis(0), Array1::from_elem(10, 0), Array1::from_elem(10, 1)];
let dataset: Dataset<_, _, Ix1> = (records, targets).into();
let score = dataset.silhouette_score().unwrap();
assert_abs_diff_eq!(score, 1f64, epsilon = 1e-3);
// Two clusters separated into halves very far from each other and each very near an half of the other cluster.
// Bad but not terrible for silhouette, should return a score slightly negative
let records = concatenate![
Axis(0),
Array::linspace(0f64, 1f64, 5),
Array::linspace(1f64, 2f64, 5),
Array::linspace(10000f64, 10001f64, 5),
Array::linspace(10001f64, 10002f64, 5)
]
.insert_axis(Axis(1));
let records = concatenate![Axis(1), records, records];
let targets = concatenate![
Axis(0),
Array1::from_elem(5, 0),
Array1::from_elem(5, 1),
Array1::from_elem(5, 0),
Array1::from_elem(5, 1)
];
let dataset: Dataset<_, _, Ix1> = (records, targets).into();
let score = dataset.silhouette_score().unwrap();
assert!(score < 0f64);
// Very bad clustering with a high number of clusters, I expect a very negative value
let records = Array::linspace(0f64, 10f64, 100).insert_axis(Axis(1));
let records = concatenate![Axis(1), records, records];
let targets = Array1::from_shape_fn(100, |i| (i + 3) % 48);
let dataset: Dataset<_, _, Ix1> = (records, targets).into();
let score = dataset.silhouette_score().unwrap();
assert!(score < -0.5f64)
}
#[test]
fn test_empty_labels_as_single_label() {
let records = Array::linspace(0f64, 1f64, 10).insert_axis(Axis(1));
let dataset: DatasetBase<_, _> = records.into();
let score = dataset.silhouette_score().unwrap();
assert_abs_diff_eq!(score, 1f64, epsilon = 1e-5);
}
}