linfa_preprocessing/
helpers.rs

/// An ordered sequence of words from which n-grams can be generated.
///
/// Given a sequence of words, the list can be iterated to obtain all the n-grams in the sequence,
/// starting from n-grams of length `min` up to n-grams of length `max`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NGramList<'a> {
    /// Length (in words) of the shortest n-gram to generate.
    min: usize,
    /// Length (in words) of the longest n-gram to generate.
    max: usize,
    /// The borrowed words, in their original order.
    list: Vec<&'a str>,
}
9
10#[derive(Debug, Clone, PartialEq)]
11pub struct NGramListIntoIterator<'a> {
12    list: NGramList<'a>,
13    index: usize,
14}
15
16impl Iterator for NGramListIntoIterator<'_> {
17    type Item = Vec<String>;
18    fn next(&mut self) -> Option<Self::Item> {
19        if self.index >= self.list.len() {
20            return None;
21        }
22        let res = self.list.ngram_items(self.index);
23        if res.is_some() {
24            self.index += 1;
25            res
26        } else {
27            None
28        }
29    }
30}
31
32impl<'a> IntoIterator for NGramList<'a> {
33    type Item = Vec<String>;
34    type IntoIter = NGramListIntoIterator<'a>;
35
36    fn into_iter(self) -> Self::IntoIter {
37        NGramListIntoIterator {
38            list: self,
39            index: 0,
40        }
41    }
42}
43
44impl<'a> NGramList<'a> {
45    pub fn new(vec: Vec<&'a str>, range: (usize, usize)) -> Self {
46        Self {
47            min: range.0,
48            max: range.1,
49            list: vec,
50        }
51    }
52
53    pub fn len(&self) -> usize {
54        self.list.len()
55    }
56
57    /// Constructs all n-grams obtainable from the word sequence starting from the word at `index`
58    pub fn ngram_items(&self, index: usize) -> Option<Vec<String>> {
59        if self.max == 1 {
60            return Some(vec![self.list[index].to_string()]);
61        }
62        let mut items = Vec::new();
63        let len = self.list.len();
64        let min_end = index + self.min;
65        if min_end > len {
66            return None;
67        }
68        let max_end = usize::min(index + self.max, len);
69        let mut item = self.list[index].to_string();
70        for j in (index + 1)..min_end {
71            item.push(' ');
72            item.push_str(self.list[j]);
73        }
74        items.push(item.clone());
75        for j in min_end..max_end {
76            item.push(' ');
77            item.push_str(self.list[j]);
78            items.push(item.clone())
79        }
80        Some(items)
81    }
82}
83
/// Returns the column of `$transf` associated with `$word`.
///
/// The column index is the position of the first entry of `$voc` that compares equal to
/// `$word`; that index is then passed to `$transf.column(..)`.
///
/// # Panics
///
/// Panics if no entry of `$voc` is equal to `$word`.
#[macro_export]
macro_rules! column_for_word {
    ($voc:expr, $transf:expr, $word:expr) => {
        $transf.column(
            $voc.iter()
                .position(|s| s == $word)
                .expect("column_for_word!: word not found in vocabulary"),
        )
    };
}
90
#[cfg(test)]
mod tests {
    use super::*;

    /// Both helper types must keep the usual auto traits.
    #[test]
    fn autotraits() {
        fn has_autotraits<T: Send + Sync + Sized + Unpin>() {}
        has_autotraits::<NGramList>();
        has_autotraits::<NGramListIntoIterator>();
    }

    /// Checks the n-grams produced for the ranges (1, 1), (2, 2) and (1, 2).
    ///
    /// The number of yielded groups is asserted explicitly so that an iterator that stops too
    /// early (or yields nothing at all) cannot pass unnoticed.
    #[test]
    fn test_ngram_queue() {
        let words = vec![
            "oNe",
            "oNe",
            "two",
            "three",
            "four",
            "TWO",
            "three",
            "four",
            "three;four",
            "four",
        ];

        // Unigrams only: one group per word, each holding exactly that word.
        let unigrams: Vec<_> = NGramList::new(words.clone(), (1, 1)).into_iter().collect();
        assert_eq!(unigrams.len(), words.len());
        for (i, items) in unigrams.iter().enumerate() {
            assert_eq!(items.len(), 1);
            assert_eq!(items[0], words[i]);
        }

        // Bigrams only: the last word cannot start a bigram, so one group less.
        let bigrams: Vec<_> = NGramList::new(words.clone(), (2, 2)).into_iter().collect();
        assert_eq!(bigrams.len(), words.len() - 1);
        for (i, items) in bigrams.iter().enumerate() {
            assert_eq!(items.len(), 1);
            assert_eq!(items[0], words[i].to_string() + " " + words[i + 1]);
        }

        // Unigrams and bigrams: every word starts a group; the last one only has its unigram.
        let mixed: Vec<_> = NGramList::new(words.clone(), (1, 2)).into_iter().collect();
        assert_eq!(mixed.len(), words.len());
        for (i, items) in mixed.iter().enumerate() {
            if i < words.len() - 1 {
                assert_eq!(items.len(), 2);
                assert_eq!(items[0], words[i]);
                assert_eq!(items[1], words[i].to_string() + " " + words[i + 1]);
            } else {
                assert_eq!(items.len(), 1);
                assert_eq!(items[0], words[i]);
            }
        }
    }
}
139}