]>
Commit | Line | Data |
---|---|---|
5869c6ff XL |
1 | use std::fmt; |
2 | use std::iter::FusedIterator; | |
3 | ||
8bb4bdeb XL |
4 | /// Slot is a single saved capture location. Note that there are two slots for |
5 | /// every capture in a regular expression (one slot each for the start and end | |
6 | /// of the capture). | |
7 | pub type Slot = Option<usize>; | |
8 | ||
9 | /// Locations represents the offsets of each capturing group in a regex for | |
10 | /// a single match. | |
11 | /// | |
12 | /// Unlike `Captures`, a `Locations` value only stores offsets. | |
13 | #[doc(hidden)] | |
b7449926 | 14 | #[derive(Clone, Debug)] |
8bb4bdeb XL |
15 | pub struct Locations(Vec<Slot>); |
16 | ||
17 | impl Locations { | |
18 | /// Returns the start and end positions of the Nth capture group. Returns | |
19 | /// `None` if `i` is not a valid capture group or if the capture group did | |
20 | /// not match anything. The positions returned are *always* byte indices | |
21 | /// with respect to the original string matched. | |
22 | pub fn pos(&self, i: usize) -> Option<(usize, usize)> { | |
23 | let (s, e) = (i * 2, i * 2 + 1); | |
24 | match (self.0.get(s), self.0.get(e)) { | |
25 | (Some(&Some(s)), Some(&Some(e))) => Some((s, e)), | |
26 | _ => None, | |
27 | } | |
28 | } | |
29 | ||
30 | /// Creates an iterator of all the capture group positions in order of | |
31 | /// appearance in the regular expression. Positions are byte indices | |
32 | /// in terms of the original string matched. | |
33 | pub fn iter(&self) -> SubCapturesPosIter { | |
ff7c6d11 | 34 | SubCapturesPosIter { idx: 0, locs: self } |
8bb4bdeb XL |
35 | } |
36 | ||
37 | /// Returns the total number of capturing groups. | |
38 | /// | |
39 | /// This is always at least `1` since every regex has at least `1` | |
40 | /// capturing group that corresponds to the entire match. | |
41 | pub fn len(&self) -> usize { | |
42 | self.0.len() / 2 | |
43 | } | |
8bb4bdeb | 44 | |
b7449926 XL |
45 | /// Return the individual slots as a slice. |
46 | pub(crate) fn as_slots(&mut self) -> &mut [Slot] { | |
47 | &mut self.0 | |
48 | } | |
8bb4bdeb XL |
49 | } |
50 | ||
51 | /// An iterator over capture group positions for a particular match of a | |
52 | /// regular expression. | |
53 | /// | |
54 | /// Positions are byte indices in terms of the original string matched. | |
55 | /// | |
56 | /// `'c` is the lifetime of the captures. | |
5869c6ff | 57 | #[derive(Clone, Debug)] |
8bb4bdeb XL |
58 | pub struct SubCapturesPosIter<'c> { |
59 | idx: usize, | |
60 | locs: &'c Locations, | |
61 | } | |
62 | ||
63 | impl<'c> Iterator for SubCapturesPosIter<'c> { | |
64 | type Item = Option<(usize, usize)>; | |
65 | ||
66 | fn next(&mut self) -> Option<Option<(usize, usize)>> { | |
67 | if self.idx >= self.locs.len() { | |
68 | return None; | |
69 | } | |
70 | let x = match self.locs.pos(self.idx) { | |
71 | None => Some(None), | |
f9f354fc | 72 | Some((s, e)) => Some(Some((s, e))), |
8bb4bdeb XL |
73 | }; |
74 | self.idx += 1; | |
75 | x | |
76 | } | |
77 | } | |
78 | ||
5869c6ff XL |
79 | impl<'c> FusedIterator for SubCapturesPosIter<'c> {} |
80 | ||
ff7c6d11 | 81 | /// `RegularExpression` describes types that can implement regex searching. |
8bb4bdeb XL |
82 | /// |
83 | /// This trait is my attempt at reducing code duplication and to standardize | |
84 | /// the internal API. Specific duplication that is avoided are the `find` | |
85 | /// and `capture` iterators, which are slightly tricky. | |
86 | /// | |
87 | /// It's not clear whether this trait is worth it, and it also isn't | |
88 | /// clear whether it's useful as a public trait or not. Methods like | |
89 | /// `next_after_empty` reak of bad design, but the rest of the methods seem | |
90 | /// somewhat reasonable. One particular thing this trait would expose would be | |
91 | /// the ability to start the search of a regex anywhere in a haystack, which | |
92 | /// isn't possible in the current public API. | |
5869c6ff | 93 | pub trait RegularExpression: Sized + fmt::Debug { |
8bb4bdeb | 94 | /// The type of the haystack. |
5869c6ff | 95 | type Text: ?Sized + fmt::Debug; |
8bb4bdeb XL |
96 | |
97 | /// The number of capture slots in the compiled regular expression. This is | |
98 | /// always two times the number of capture groups (two slots per group). | |
99 | fn slots_len(&self) -> usize; | |
100 | ||
101 | /// Allocates fresh space for all capturing groups in this regex. | |
102 | fn locations(&self) -> Locations { | |
103 | Locations(vec![None; self.slots_len()]) | |
104 | } | |
105 | ||
106 | /// Returns the position of the next character after `i`. | |
107 | /// | |
108 | /// For example, a haystack with type `&[u8]` probably returns `i+1`, | |
109 | /// whereas a haystack with type `&str` probably returns `i` plus the | |
110 | /// length of the next UTF-8 sequence. | |
111 | fn next_after_empty(&self, text: &Self::Text, i: usize) -> usize; | |
112 | ||
113 | /// Returns the location of the shortest match. | |
114 | fn shortest_match_at( | |
115 | &self, | |
116 | text: &Self::Text, | |
117 | start: usize, | |
118 | ) -> Option<usize>; | |
119 | ||
120 | /// Returns whether the regex matches the text given. | |
f9f354fc | 121 | fn is_match_at(&self, text: &Self::Text, start: usize) -> bool; |
8bb4bdeb XL |
122 | |
123 | /// Returns the leftmost-first match location if one exists. | |
124 | fn find_at( | |
125 | &self, | |
126 | text: &Self::Text, | |
127 | start: usize, | |
128 | ) -> Option<(usize, usize)>; | |
129 | ||
130 | /// Returns the leftmost-first match location if one exists, and also | |
131 | /// fills in any matching capture slot locations. | |
b7449926 | 132 | fn captures_read_at( |
8bb4bdeb XL |
133 | &self, |
134 | locs: &mut Locations, | |
135 | text: &Self::Text, | |
136 | start: usize, | |
137 | ) -> Option<(usize, usize)>; | |
138 | ||
139 | /// Returns an iterator over all non-overlapping successive leftmost-first | |
140 | /// matches. | |
f9f354fc XL |
141 | fn find_iter(self, text: &Self::Text) -> Matches<Self> { |
142 | Matches { re: self, text: text, last_end: 0, last_match: None } | |
8bb4bdeb XL |
143 | } |
144 | ||
145 | /// Returns an iterator over all non-overlapping successive leftmost-first | |
146 | /// matches with captures. | |
f9f354fc | 147 | fn captures_iter(self, text: &Self::Text) -> CaptureMatches<Self> { |
8bb4bdeb XL |
148 | CaptureMatches(self.find_iter(text)) |
149 | } | |
150 | } | |
151 | ||
152 | /// An iterator over all non-overlapping successive leftmost-first matches. | |
5869c6ff | 153 | #[derive(Debug)] |
f9f354fc XL |
154 | pub struct Matches<'t, R> |
155 | where | |
156 | R: RegularExpression, | |
157 | R::Text: 't, | |
158 | { | |
8bb4bdeb XL |
159 | re: R, |
160 | text: &'t R::Text, | |
161 | last_end: usize, | |
162 | last_match: Option<usize>, | |
163 | } | |
164 | ||
f9f354fc XL |
165 | impl<'t, R> Matches<'t, R> |
166 | where | |
167 | R: RegularExpression, | |
168 | R::Text: 't, | |
169 | { | |
8bb4bdeb XL |
170 | /// Return the text being searched. |
171 | pub fn text(&self) -> &'t R::Text { | |
172 | self.text | |
173 | } | |
174 | ||
175 | /// Return the underlying regex. | |
176 | pub fn regex(&self) -> &R { | |
177 | &self.re | |
178 | } | |
179 | } | |
180 | ||
181 | impl<'t, R> Iterator for Matches<'t, R> | |
f9f354fc XL |
182 | where |
183 | R: RegularExpression, | |
184 | R::Text: 't + AsRef<[u8]>, | |
185 | { | |
8bb4bdeb XL |
186 | type Item = (usize, usize); |
187 | ||
188 | fn next(&mut self) -> Option<(usize, usize)> { | |
189 | if self.last_end > self.text.as_ref().len() { | |
190 | return None; | |
191 | } | |
192 | let (s, e) = match self.re.find_at(self.text, self.last_end) { | |
193 | None => return None, | |
194 | Some((s, e)) => (s, e), | |
195 | }; | |
196 | if s == e { | |
197 | // This is an empty match. To ensure we make progress, start | |
198 | // the next search at the smallest possible starting position | |
199 | // of the next match following this one. | |
ff7c6d11 | 200 | self.last_end = self.re.next_after_empty(self.text, e); |
8bb4bdeb XL |
201 | // Don't accept empty matches immediately following a match. |
202 | // Just move on to the next match. | |
203 | if Some(e) == self.last_match { | |
204 | return self.next(); | |
205 | } | |
206 | } else { | |
207 | self.last_end = e; | |
208 | } | |
209 | self.last_match = Some(e); | |
210 | Some((s, e)) | |
211 | } | |
212 | } | |
213 | ||
5869c6ff XL |
214 | impl<'t, R> FusedIterator for Matches<'t, R> |
215 | where | |
216 | R: RegularExpression, | |
217 | R::Text: 't + AsRef<[u8]>, | |
218 | { | |
219 | } | |
220 | ||
8bb4bdeb XL |
221 | /// An iterator over all non-overlapping successive leftmost-first matches with |
222 | /// captures. | |
5869c6ff | 223 | #[derive(Debug)] |
8bb4bdeb | 224 | pub struct CaptureMatches<'t, R>(Matches<'t, R>) |
f9f354fc XL |
225 | where |
226 | R: RegularExpression, | |
227 | R::Text: 't; | |
8bb4bdeb | 228 | |
f9f354fc XL |
229 | impl<'t, R> CaptureMatches<'t, R> |
230 | where | |
231 | R: RegularExpression, | |
232 | R::Text: 't, | |
233 | { | |
8bb4bdeb XL |
234 | /// Return the text being searched. |
235 | pub fn text(&self) -> &'t R::Text { | |
236 | self.0.text() | |
237 | } | |
238 | ||
239 | /// Return the underlying regex. | |
240 | pub fn regex(&self) -> &R { | |
241 | self.0.regex() | |
242 | } | |
243 | } | |
244 | ||
245 | impl<'t, R> Iterator for CaptureMatches<'t, R> | |
f9f354fc XL |
246 | where |
247 | R: RegularExpression, | |
248 | R::Text: 't + AsRef<[u8]>, | |
249 | { | |
8bb4bdeb XL |
250 | type Item = Locations; |
251 | ||
252 | fn next(&mut self) -> Option<Locations> { | |
253 | if self.0.last_end > self.0.text.as_ref().len() { | |
f9f354fc | 254 | return None; |
8bb4bdeb XL |
255 | } |
256 | let mut locs = self.0.re.locations(); | |
b7449926 | 257 | let (s, e) = match self.0.re.captures_read_at( |
8bb4bdeb XL |
258 | &mut locs, |
259 | self.0.text, | |
260 | self.0.last_end, | |
261 | ) { | |
262 | None => return None, | |
263 | Some((s, e)) => (s, e), | |
264 | }; | |
265 | if s == e { | |
ff7c6d11 | 266 | self.0.last_end = self.0.re.next_after_empty(self.0.text, e); |
8bb4bdeb XL |
267 | if Some(e) == self.0.last_match { |
268 | return self.next(); | |
269 | } | |
270 | } else { | |
271 | self.0.last_end = e; | |
272 | } | |
273 | self.0.last_match = Some(e); | |
274 | Some(locs) | |
275 | } | |
276 | } | |
5869c6ff XL |
277 | |
278 | impl<'t, R> FusedIterator for CaptureMatches<'t, R> | |
279 | where | |
280 | R: RegularExpression, | |
281 | R::Text: 't + AsRef<[u8]>, | |
282 | { | |
283 | } |