linfa_datasets/
generate.rs

1//! Utility functions for randomly generating datasets
2
3use linfa::Dataset;
4use ndarray::{s, Array, Array2, ArrayBase, Data, Ix1, Ix2};
5use ndarray_rand::{
6    rand::Rng,
7    rand_distr::{Distribution, StandardNormal},
8    RandomExt,
9};
10
11/// Special case of `blobs_with_distribution` with a standard normal distribution.
12pub fn blobs(
13    blob_size: usize,
14    blob_centroids: &ArrayBase<impl Data<Elem = f64>, Ix2>,
15    rng: &mut impl Rng,
16) -> Array2<f64> {
17    blobs_with_distribution(blob_size, blob_centroids, StandardNormal, rng)
18}
19
20/// Given an input matrix `blob_centroids`, with shape `(n_blobs, n_features)`,
21/// generate `blob_size` data points (a "blob") around each of the blob centroids.
22///
23/// More specifically, each blob is formed by `blob_size` points sampled from a distribution
24/// centered in the blob centroid.
25///
26/// `blobs` can be used to quickly assemble a synthetic dataset to test or
27/// benchmark various clustering algorithms on a best-case scenario input.
28pub fn blobs_with_distribution(
29    blob_size: usize,
30    blob_centroids: &ArrayBase<impl Data<Elem = f64>, Ix2>,
31    distribution: impl Distribution<f64> + Clone,
32    rng: &mut impl Rng,
33) -> Array2<f64> {
34    let (n_centroids, n_features) = blob_centroids.dim();
35    let mut blobs: Array2<f64> = Array2::zeros((n_centroids * blob_size, n_features));
36
37    for (blob_index, blob_centroid) in blob_centroids.rows().into_iter().enumerate() {
38        let blob = make_blob(blob_size, &blob_centroid, distribution.clone(), rng);
39
40        let indexes = s![blob_index * blob_size..(blob_index + 1) * blob_size, ..];
41        blobs.slice_mut(indexes).assign(&blob);
42    }
43    blobs
44}
45
46/// Generate `blob_size` data points (a "blob") around `blob_centroid` using the given distribution.
47///
48/// `blob` can be used to quickly assemble a synthetic stereotypical cluster.
49fn make_blob(
50    blob_size: usize,
51    blob_centroid: &ArrayBase<impl Data<Elem = f64>, Ix1>,
52    distribution: impl Distribution<f64>,
53    rng: &mut impl Rng,
54) -> Array2<f64> {
55    let shape = (blob_size, blob_centroid.len());
56    let origin_blob: Array2<f64> = Array::random_using(shape, distribution, rng);
57    origin_blob + blob_centroid
58}
59
60/// Generates a random Linfa::Dataset (ds). The ds values are determined by the provided statistical distributions.
61///
62/// # Example
63/// ```
64/// use statrs::distribution::{DiscreteUniform, Laplace};
65/// use ndarray_rand::{RandomExt, rand_distr::Distribution};
66/// use linfa_datasets::generate::make_dataset;
67/// let feat_distr = Laplace::new(0.5, 5. ).unwrap();
68/// let target_distr = DiscreteUniform::new(0, 5).unwrap();
69/// make_dataset(5, 5, 2, feat_distr, target_distr);
70/// ```
71pub fn make_dataset<X, Y>(
72    num_rows: usize,
73    num_feats: usize,
74    num_targets: usize,
75    feat_distr: X,
76    target_distr: Y,
77) -> Dataset<f64, f64>
78where
79    X: Distribution<f64>,
80    Y: Distribution<f64>,
81{
82    let features = Array::random((num_rows, num_feats), feat_distr);
83    let targets = Array::random((num_rows, num_targets), target_distr);
84
85    Dataset::new(features, targets)
86}