1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
//! Correlation analysis for dataset features
//!
//! # Implementations
//!
//! * Pearsons's Correlation Coefficients - linear feature correlation
use std::fmt;

use ndarray::{Array1, ArrayBase, ArrayView2, Axis, Data, Ix2};
use rand::{rngs::SmallRng, seq::SliceRandom, SeedableRng};

use crate::dataset::DatasetBase;
use crate::Float;

/// Calculate the Pearson's Correlation Coefficient (or bivariate correlation)
///
/// The PCC describes the linear correlation between two variables. It is the covariance divided by
/// the product of the standard deviations, therefore essentially a normalised measurement of the
/// covariance and in range (-1, 1). A negative coefficient indicates a negative correlation
/// between both variables.
fn pearson_correlation<F: Float, D: Data<Elem = F>>(data: &ArrayBase<D, Ix2>) -> Array1<F> {
    // number of obserations and features
    let nobservations = data.nrows();
    let nfeatures = data.ncols();

    // center distribution by subtracting mean
    let mean = data.mean_axis(Axis(0)).unwrap();
    //let std_deviation = mean.clone();
    let denoised = data - &mean.insert_axis(Axis(1)).t();

    // calculate the covariance matrix
    let covariance = denoised.t().dot(&denoised) / F::cast(nobservations - 1);
    // calculate the standard deviation vector
    let std_deviation = denoised.var_axis(Axis(0), F::one()).mapv(|x| x.sqrt());

    // we will only save the upper triangular matrix as the diagonal is one and
    // the lower triangular is a mirror of the upper triangular part
    let mut pearson_coeffs = Array1::zeros(nfeatures * (nfeatures - 1) / 2);

    let mut k = 0;
    for i in 0..(nfeatures - 1) {
        for j in (i + 1)..nfeatures {
            // calculate pearson correlation coefficients by normalizing the covariance matrix
            pearson_coeffs[k] = covariance[(i, j)] / std_deviation[i] / std_deviation[j];

            k += 1;
        }
    }

    pearson_coeffs
}

/// Evidence of non-correlation with re-sampling test
///
/// The p-value supports or reject the null hypthesis that two variables are not correlated. A
/// small p-value indicates a strong evidence that two variables are correlated.
fn p_values<F: Float, D: Data<Elem = F>>(
    data: &ArrayBase<D, Ix2>,
    ground: &Array1<F>,
    num_iter: usize,
) -> Array1<F> {
    // transpose element matrix such that we can shuffle columns
    let (n, m) = (data.ncols(), data.nrows());
    let mut flattened = Vec::with_capacity(n * m);
    for i in 0..m {
        for j in 0..n {
            flattened.push(data[(i, j)]);
        }
    }

    let mut p_values = Array1::zeros(n * (n - 1) / 2);
    let mut rng = SmallRng::from_entropy();

    // calculate p-values by shuffling features `num_iter` times
    for _ in 0..num_iter {
        // shuffle all corresponding features
        for j in 0..n {
            flattened[j * m..(j + 1) * m].shuffle(&mut rng);
        }

        // create an ndarray and calculate the PCC for this distribution
        let arr_view = ArrayView2::from_shape((m, n), &flattened).unwrap();
        let correlation = pearson_correlation(&arr_view.t());

        // count the number of times that the re-shuffled distribution has a larger PCC than the
        // original distribution
        let greater = ground
            .iter()
            .zip(correlation.iter())
            .map(|(a, b)| {
                if a.abs() < b.abs() {
                    F::one()
                } else {
                    F::zero()
                }
            })
            .collect::<Array1<_>>();

        p_values += &greater;
    }

    // divide by the number of iterations to re-scale range
    p_values / F::cast(num_iter)
}

/// Pearson Correlation Coefficients (or Bivariate Coefficients)
///
/// The PCCs indicate the linear correlation between variables. This type also supports printing
/// the PCC as an upper triangle matrix together with the feature names.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PearsonCorrelation<F> {
    pearson_coeffs: Array1<F>,
    p_values: Array1<F>,
    feature_names: Vec<String>,
}

impl<F: Float> PearsonCorrelation<F> {
    /// Calculate the Pearson Correlation Coefficients and optionally p-values from dataset
    ///
    /// The PCC describes the linear correlation between two variables. It is the covariance divided by
    /// the product of the standard deviations, therefore essentially a normalised measurement of the
    /// covariance and in range (-1, 1). A negative coefficient indicates a negative correlation
    /// between both variables.
    ///
    /// The p-value supports or reject the null hypthesis that two variables are not correlated. A
    /// small p-value indicates a strong evidence that two variables are correlated.
    ///
    /// # Parameters
    ///
    /// * `dataset`: Data for the correlation analysis
    /// * `num_iter`: optionally number of iterations of the p-value test, if none then no p-value
    /// are calculate
    ///
    /// # Example
    ///
    /// ```
    /// let corr = linfa_datasets::diabetes()
    ///     .pearson_correlation_with_p_value(100);
    ///
    /// println!("{}", corr);
    /// ```
    ///
    /// The output looks like this (the p-value is in brackets behind the PCC):
    ///
    /// ```ignore
    /// age                        +0.17 (0.61) +0.18 (0.62) +0.33 (0.34) +0.26 (0.47) +0.22 (0.54) -0.07 (0.83) +0.20 (0.60) +0.27 (0.54) +0.30 (0.41)
    /// sex                                     +0.09 (0.74) +0.24 (0.59) +0.04 (0.91) +0.14 (0.74) -0.38 (0.28) +0.33 (0.30) +0.15 (0.74) +0.21 (0.58)
    /// body mass index                                      +0.39 (0.20) +0.25 (0.45) +0.26 (0.51) -0.37 (0.31) +0.41 (0.24) +0.45 (0.21) +0.39 (0.21)
    /// blood pressure                                                    +0.24 (0.54) +0.19 (0.56) -0.18 (0.61) +0.26 (0.45) +0.39 (0.20) +0.39 (0.16)
    /// t-cells                                                                        +0.90 (0.00) +0.05 (0.89) +0.54 (0.05) +0.52 (0.10) +0.33 (0.37)
    /// low-density lipoproteins                                                                    -0.20 (0.53) +0.66 (0.04) +0.32 (0.42) +0.29 (0.42)
    /// high-density lipoproteins                                                                                -0.74 (0.02) -0.40 (0.21) -0.27 (0.42)
    /// thyroid stimulating hormone                                                                                           +0.62 (0.04) +0.42 (0.21)
    /// lamotrigine                                                                                                                        +0.47 (0.14)
    /// blood sugar level
    /// ```

    pub fn from_dataset<D: Data<Elem = F>, T>(
        dataset: &DatasetBase<ArrayBase<D, Ix2>, T>,
        num_iter: Option<usize>,
    ) -> Self {
        // calculate pearson coefficients
        let pearson_coeffs = pearson_correlation(dataset.records());

        // calculate p values
        let p_values = match num_iter {
            Some(num_iter) => p_values(dataset.records(), &pearson_coeffs, num_iter),
            None => Array1::zeros(0),
        };

        PearsonCorrelation {
            pearson_coeffs,
            p_values,
            feature_names: dataset.feature_names(),
        }
    }

    /// Return the Pearson's Correlation Coefficients
    ///
    /// The coefficients are describing the linear correlation, normalized in range (-1, 1) between
    /// two variables. Because the correlation is commutative and PCC to the same variable is
    /// always perfectly correlated (i.e. 1), this function only returns the upper triangular
    /// matrix with (n-1)*n/2 elements.
    pub fn get_coeffs(&self) -> &Array1<F> {
        &self.pearson_coeffs
    }

    /// Return the p values supporting the null-hypothesis
    ///
    /// This implementation estimates the p value with the permutation test. As null-hypothesis
    /// the non-correlation between two variables is chosen such that the smaller the p-value the
    /// stronger we can reject the null-hypothesis and conclude that they are linearily correlated.
    pub fn get_p_values(&self) -> Option<&Array1<F>> {
        if self.p_values.is_empty() {
            None
        } else {
            Some(&self.p_values)
        }
    }
}

impl<F: Float, D: Data<Elem = F>, T> DatasetBase<ArrayBase<D, Ix2>, T> {
    /// Calculate the Pearson Correlation Coefficients from a dataset
    ///
    /// The PCC describes the linear correlation between two variables. It is the covariance divided by
    /// the product of the standard deviations, therefore essentially a normalised measurement of the
    /// covariance and in range (-1, 1). A negative coefficient indicates a negative correlation
    /// between both variables.
    ///
    /// # Example
    ///
    /// ```
    /// let corr = linfa_datasets::diabetes()
    ///     .pearson_correlation();
    ///
    /// println!("{}", corr);
    /// ```
    ///
    pub fn pearson_correlation(&self) -> PearsonCorrelation<F> {
        PearsonCorrelation::from_dataset(self, None)
    }

    /// Calculate the Pearson Correlation Coefficients and p-values from the dataset
    ///
    /// The PCC describes the linear correlation between two variables. It is the covariance divided by
    /// the product of the standard deviations, therefore essentially a normalised measurement of the
    /// covariance and in range (-1, 1). A negative coefficient indicates a negative correlation
    /// between both variables.
    ///
    /// The p-value supports or reject the null hypthesis that two variables are not correlated.
    /// The smaller the p-value the stronger is the evidence that two variables are correlated. A
    /// typical threshold is p < 0.05.
    ///
    /// # Parameters
    ///
    /// * `num_iter`: number of iterations of the permutation test to estimate the p-value
    ///
    /// # Example
    ///
    /// ```
    /// let corr = linfa_datasets::diabetes()
    ///     .pearson_correlation_with_p_value(100);
    ///
    /// println!("{}", corr);
    /// ```
    ///
    pub fn pearson_correlation_with_p_value(&self, num_iter: usize) -> PearsonCorrelation<F> {
        PearsonCorrelation::from_dataset(self, Some(num_iter))
    }
}

/// Display the Pearson's Correlation Coefficients as upper triangular matrix
///
/// This function prints the feature names for each row, the corresponding PCCs and optionally the
/// p-values in brackets after the PCCs.
impl<F: Float> fmt::Display for PearsonCorrelation<F> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let n = self.feature_names.len();
        let longest = self.feature_names.iter().map(|x| x.len()).max().unwrap();

        let mut k = 0;
        for i in 0..(n - 1) {
            write!(f, "{}", self.feature_names[i])?;
            for _ in 0..longest - self.feature_names[i].len() {
                write!(f, " ")?;
            }

            for _ in 0..i {
                write!(f, "             ")?;
            }

            for _ in (i + 1)..n {
                if !self.p_values.is_empty() {
                    write!(
                        f,
                        "{:+.2} ({:.2}) ",
                        self.pearson_coeffs[k], self.p_values[k]
                    )?;
                } else {
                    write!(f, "{:.2} ", self.pearson_coeffs[k])?;
                }

                k += 1;
            }
            writeln!(f,)?;
        }
        writeln!(f, "{}", self.feature_names[n - 1])?;

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use crate::DatasetBase;
    use ndarray::{concatenate, Array, Axis};
    use ndarray_rand::{rand_distr::Uniform, RandomExt};
    use rand::{rngs::SmallRng, SeedableRng};

    #[test]
    fn uniform_random() {
        // create random number generator and random matrix with uniform distribution
        let mut rng = SmallRng::seed_from_u64(42);
        let data = Array::random_using((1000, 4), Uniform::new(-1., 1.), &mut rng);

        // calculate PCCs and test that they are indeed near zero
        let pcc = DatasetBase::from(data).pearson_correlation();
        assert!(pcc.get_coeffs().mapv(|x: f32| x.abs()).sum() < 5e-2 * 6.0);
    }

    #[test]
    fn perfectly_correlated() {
        let mut rng = SmallRng::seed_from_u64(42);
        let v = Array::random_using((4, 1), Uniform::new(0., 1.), &mut rng);

        // project feature with matrix
        let data = Array::random_using((1000, 1), Uniform::new(-1., 1.), &mut rng);
        let data_proj = data.dot(&v.t());

        let corr = DatasetBase::from(concatenate![Axis(1), data, data_proj])
            .pearson_correlation_with_p_value(100);

        assert!(corr.get_coeffs().mapv(|x| 1. - x).sum() < 1e-2);
        assert!(corr.get_p_values().unwrap().sum() < 1e-2);
    }
}