// linfa_preprocessing/helpers.rs

/// An ordered list of borrowed words together with the inclusive range of
/// n-gram sizes (`min` to `max` words) that may be assembled from it.
///
/// The n-grams themselves are produced on demand by `ngram_items`, or by
/// iterating the list by value.
#[derive(Debug, Clone, PartialEq)]
pub struct NGramList<'a> {
    /// Smallest n-gram size, in words.
    min: usize,
    /// Largest n-gram size, in words.
    max: usize,
    /// The words, in their original order.
    list: Vec<&'a str>,
}

10#[derive(Debug, Clone, PartialEq)]
11pub struct NGramListIntoIterator<'a> {
12 list: NGramList<'a>,
13 index: usize,
14}
15
16impl Iterator for NGramListIntoIterator<'_> {
17 type Item = Vec<String>;
18 fn next(&mut self) -> Option<Self::Item> {
19 if self.index >= self.list.len() {
20 return None;
21 }
22 let res = self.list.ngram_items(self.index);
23 if res.is_some() {
24 self.index += 1;
25 res
26 } else {
27 None
28 }
29 }
30}
31
32impl<'a> IntoIterator for NGramList<'a> {
33 type Item = Vec<String>;
34 type IntoIter = NGramListIntoIterator<'a>;
35
36 fn into_iter(self) -> Self::IntoIter {
37 NGramListIntoIterator {
38 list: self,
39 index: 0,
40 }
41 }
42}
43
44impl<'a> NGramList<'a> {
45 pub fn new(vec: Vec<&'a str>, range: (usize, usize)) -> Self {
46 Self {
47 min: range.0,
48 max: range.1,
49 list: vec,
50 }
51 }
52
53 pub fn len(&self) -> usize {
54 self.list.len()
55 }
56
57 pub fn ngram_items(&self, index: usize) -> Option<Vec<String>> {
59 if self.max == 1 {
60 return Some(vec![self.list[index].to_string()]);
61 }
62 let mut items = Vec::new();
63 let len = self.list.len();
64 let min_end = index + self.min;
65 if min_end > len {
66 return None;
67 }
68 let max_end = usize::min(index + self.max, len);
69 let mut item = self.list[index].to_string();
70 for j in (index + 1)..min_end {
71 item.push(' ');
72 item.push_str(self.list[j]);
73 }
74 items.push(item.clone());
75 for j in min_end..max_end {
76 item.push(' ');
77 item.push_str(self.list[j]);
78 items.push(item.clone())
79 }
80 Some(items)
81 }
82}
83
/// Evaluates to `$transf.column(i)`, where `i` is the position of the first
/// element of `$voc` that compares equal to `$word`.
///
/// # Panics
///
/// Panics if no element of `$voc` equals `$word`.
#[macro_export]
macro_rules! column_for_word {
    ($voc:expr, $transf:expr, $word:expr) => {
        $transf.column(
            $voc.iter()
                .position(|s| s == $word)
                .expect("column_for_word!: word not found in vocabulary"),
        )
    };
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Both public types must be `Send + Sync + Sized + Unpin`.
    #[test]
    fn autotraits() {
        fn has_autotraits<T: Send + Sync + Sized + Unpin>() {}
        has_autotraits::<NGramList>();
        has_autotraits::<NGramListIntoIterator>();
    }

    /// Iterates the same word list with three size ranges and checks both the
    /// content of every batch and how many batches were produced, so a loop
    /// over an unexpectedly empty iterator cannot pass silently.
    #[test]
    fn test_ngram_queue() {
        let words = vec![
            "oNe",
            "oNe",
            "two",
            "three",
            "four",
            "TWO",
            "three",
            "four",
            "three;four",
            "four",
        ];

        // Unigrams only: one batch per word.
        let list = NGramList::new(words.clone(), (1, 1));
        let mut batches = 0;
        for (i, items) in list.into_iter().enumerate() {
            assert_eq!(items.len(), 1);
            assert_eq!(items[0], words[i]);
            batches += 1;
        }
        assert_eq!(batches, words.len());

        // Bigrams only: the last word cannot start a bigram.
        let list = NGramList::new(words.clone(), (2, 2));
        let mut batches = 0;
        for (i, items) in list.into_iter().enumerate() {
            assert_eq!(items.len(), 1);
            assert_eq!(items[0], words[i].to_string() + " " + words[i + 1]);
            batches += 1;
        }
        assert_eq!(batches, words.len() - 1);

        // Unigrams and bigrams: the last word only yields its unigram.
        let list = NGramList::new(words.clone(), (1, 2));
        let mut batches = 0;
        for (i, items) in list.into_iter().enumerate() {
            if i < words.len() - 1 {
                assert_eq!(items.len(), 2);
                assert_eq!(items[0], words[i]);
                assert_eq!(items[1], words[i].to_string() + " " + words[i + 1]);
            } else {
                assert_eq!(items.len(), 1);
                assert_eq!(items[0], words[i]);
            }
            batches += 1;
        }
        assert_eq!(batches, words.len());
    }
}