]>
Commit | Line | Data |
---|---|---|
8bb4bdeb XL |
1 | macro_rules! define_set { |
2 | ($name:ident, $builder_mod:ident, $text_ty:ty, $as_bytes:expr, | |
3 | $(#[$doc_regexset_example:meta])* ) => { | |
4 | pub mod $name { | |
5 | use std::fmt; | |
6 | use std::iter; | |
7 | use std::slice; | |
8 | use std::vec; | |
9 | ||
10 | use error::Error; | |
11 | use exec::Exec; | |
12 | use re_builder::$builder_mod::RegexSetBuilder; | |
13 | use re_trait::RegularExpression; | |
14 | ||
/// Match multiple (possibly overlapping) regular expressions in a single scan.
///
/// A regex set corresponds to the union of zero or more regular expressions.
/// That is, a regex set will match text where at least one of its
/// constituent regular expressions matches. A regex set as it's formulated here
/// provides a touch more power: it will also report *which* regular
/// expressions in the set match. Indeed, this is the key difference between
/// regex sets and a single `Regex` with many alternates, since only one
/// alternate can match at a time.
///
/// For example, consider regular expressions to match email addresses and
/// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a
/// regex set is constructed from those regexes, then searching the text
/// `foo@example.com` will report both regexes as matching. Of course, one
/// could accomplish this by compiling each regex on its own and doing two
/// searches over the text. The key advantage of using a regex set is that it
/// will report the matching regexes using a *single pass through the text*.
/// If one has hundreds or thousands of regexes to match repeatedly (like a URL
/// router for a complex web application or a user agent matcher), then a regex
/// set can realize huge performance gains.
///
/// # Example
///
/// This shows how the above two regexes (for matching email addresses and
/// domains) might work:
///
$(#[$doc_regexset_example])*
///
/// Note that it would be possible to adapt the above example to using `Regex`
/// with an expression like:
///
/// ```text
/// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net))
/// ```
///
/// After a match, one could then inspect the capture groups to figure out
/// which alternates matched. The problem is that it is hard to make this
/// approach scale when there are many regexes since the overlap between each
/// alternate isn't always obvious to reason about.
///
/// # Limitations
///
/// Regex sets are limited to answering the following two questions:
///
/// 1. Does any regex in the set match?
/// 2. If so, which regexes in the set match?
///
/// As with the main `Regex` type, it is cheaper to ask (1) instead of (2)
/// since the matching engines can stop after the first match is found.
///
/// Other features like finding the location of successive matches or their
/// sub-captures aren't supported. If you need this functionality, the
/// recommended approach is to compile each regex in the set independently and
/// selectively match them based on which regexes in the set matched.
///
/// # Performance
///
/// A `RegexSet` has the same performance characteristics as `Regex`. Namely,
/// search takes `O(mn)` time, where `m` is proportional to the size of the
/// regex set and `n` is proportional to the length of the search text.
#[derive(Clone)]
pub struct RegexSet(Exec);
77 | ||
78 | impl RegexSet { | |
79 | /// Create a new regex set with the given regular expressions. | |
80 | /// | |
81 | /// This takes an iterator of `S`, where `S` is something that can produce | |
82 | /// a `&str`. If any of the strings in the iterator are not valid regular | |
83 | /// expressions, then an error is returned. | |
84 | /// | |
85 | /// # Example | |
86 | /// | |
87 | /// Create a new regex set from an iterator of strings: | |
88 | /// | |
89 | /// ```rust | |
90 | /// # use regex::RegexSet; | |
91 | /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap(); | |
92 | /// assert!(set.is_match("foo")); | |
93 | /// ``` | |
94 | pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error> | |
95 | where S: AsRef<str>, I: IntoIterator<Item=S> { | |
96 | RegexSetBuilder::new(exprs).build() | |
97 | } | |
98 | ||
5869c6ff XL |
99 | /// Create a new empty regex set. |
100 | /// | |
101 | /// # Example | |
102 | /// | |
103 | /// ```rust | |
104 | /// # use regex::RegexSet; | |
105 | /// let set = RegexSet::empty(); | |
106 | /// assert!(set.is_empty()); | |
107 | /// ``` | |
108 | pub fn empty() -> RegexSet { | |
109 | RegexSetBuilder::new(&[""; 0]).build().unwrap() | |
110 | } | |
111 | ||
8bb4bdeb XL |
112 | /// Returns true if and only if one of the regexes in this set matches |
113 | /// the text given. | |
114 | /// | |
115 | /// This method should be preferred if you only need to test whether any | |
116 | /// of the regexes in the set should match, but don't care about *which* | |
117 | /// regexes matched. This is because the underlying matching engine will | |
118 | /// quit immediately after seeing the first match instead of continuing to | |
119 | /// find all matches. | |
120 | /// | |
121 | /// Note that as with searches using `Regex`, the expression is unanchored | |
122 | /// by default. That is, if the regex does not start with `^` or `\A`, or | |
123 | /// end with `$` or `\z`, then it is permitted to match anywhere in the | |
124 | /// text. | |
125 | /// | |
126 | /// # Example | |
127 | /// | |
128 | /// Tests whether a set matches some text: | |
129 | /// | |
130 | /// ```rust | |
131 | /// # use regex::RegexSet; | |
132 | /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap(); | |
133 | /// assert!(set.is_match("foo")); | |
134 | /// assert!(!set.is_match("☃")); | |
135 | /// ``` | |
136 | pub fn is_match(&self, text: $text_ty) -> bool { | |
137 | self.is_match_at(text, 0) | |
138 | } | |
139 | ||
140 | /// Returns the same as is_match, but starts the search at the given | |
141 | /// offset. | |
142 | /// | |
143 | /// The significance of the starting point is that it takes the surrounding | |
144 | /// context into consideration. For example, the `\A` anchor can only | |
145 | /// match when `start == 0`. | |
146 | #[doc(hidden)] | |
147 | pub fn is_match_at(&self, text: $text_ty, start: usize) -> bool { | |
148 | self.0.searcher().is_match_at($as_bytes(text), start) | |
149 | } | |
150 | ||
151 | /// Returns the set of regular expressions that match in the given text. | |
152 | /// | |
153 | /// The set returned contains the index of each regular expression that | |
154 | /// matches in the given text. The index is in correspondence with the | |
155 | /// order of regular expressions given to `RegexSet`'s constructor. | |
156 | /// | |
157 | /// The set can also be used to iterate over the matched indices. | |
158 | /// | |
159 | /// Note that as with searches using `Regex`, the expression is unanchored | |
160 | /// by default. That is, if the regex does not start with `^` or `\A`, or | |
161 | /// end with `$` or `\z`, then it is permitted to match anywhere in the | |
162 | /// text. | |
163 | /// | |
164 | /// # Example | |
165 | /// | |
166 | /// Tests which regular expressions match the given text: | |
167 | /// | |
168 | /// ```rust | |
169 | /// # use regex::RegexSet; | |
170 | /// let set = RegexSet::new(&[ | |
171 | /// r"\w+", | |
172 | /// r"\d+", | |
173 | /// r"\pL+", | |
174 | /// r"foo", | |
175 | /// r"bar", | |
176 | /// r"barfoo", | |
177 | /// r"foobar", | |
178 | /// ]).unwrap(); | |
179 | /// let matches: Vec<_> = set.matches("foobar").into_iter().collect(); | |
180 | /// assert_eq!(matches, vec![0, 2, 3, 4, 6]); | |
181 | /// | |
182 | /// // You can also test whether a particular regex matched: | |
183 | /// let matches = set.matches("foobar"); | |
184 | /// assert!(!matches.matched(5)); | |
185 | /// assert!(matches.matched(6)); | |
186 | /// ``` | |
187 | pub fn matches(&self, text: $text_ty) -> SetMatches { | |
188 | let mut matches = vec![false; self.0.regex_strings().len()]; | |
189 | let any = self.read_matches_at(&mut matches, text, 0); | |
190 | SetMatches { | |
191 | matched_any: any, | |
192 | matches: matches, | |
193 | } | |
194 | } | |
195 | ||
196 | /// Returns the same as matches, but starts the search at the given | |
197 | /// offset and stores the matches into the slice given. | |
198 | /// | |
199 | /// The significance of the starting point is that it takes the surrounding | |
200 | /// context into consideration. For example, the `\A` anchor can only | |
201 | /// match when `start == 0`. | |
202 | /// | |
203 | /// `matches` must have a length that is at least the number of regexes | |
204 | /// in this set. | |
205 | /// | |
206 | /// This method returns true if and only if at least one member of | |
207 | /// `matches` is true after executing the set against `text`. | |
208 | #[doc(hidden)] | |
209 | pub fn read_matches_at( | |
210 | &self, | |
211 | matches: &mut [bool], | |
212 | text: $text_ty, | |
213 | start: usize, | |
214 | ) -> bool { | |
215 | self.0.searcher().many_matches_at(matches, $as_bytes(text), start) | |
216 | } | |
217 | ||
218 | /// Returns the total number of regular expressions in this set. | |
219 | pub fn len(&self) -> usize { | |
220 | self.0.regex_strings().len() | |
221 | } | |
0731742a | 222 | |
5869c6ff XL |
223 | /// Returns `true` if this set contains no regular expressions. |
224 | pub fn is_empty(&self) -> bool { | |
225 | self.0.regex_strings().is_empty() | |
226 | } | |
227 | ||
0731742a XL |
228 | /// Returns the patterns that this set will match on. |
229 | /// | |
f9f354fc XL |
230 | /// This function can be used to determine the pattern for a match. The |
231 | /// slice returned has exactly as many patterns givens to this regex set, | |
232 | /// and the order of the slice is the same as the order of the patterns | |
0731742a XL |
233 | /// provided to the set. |
234 | /// | |
235 | /// # Example | |
236 | /// | |
237 | /// ```rust | |
238 | /// # use regex::RegexSet; | |
239 | /// let set = RegexSet::new(&[ | |
240 | /// r"\w+", | |
241 | /// r"\d+", | |
242 | /// r"\pL+", | |
243 | /// r"foo", | |
244 | /// r"bar", | |
245 | /// r"barfoo", | |
246 | /// r"foobar", | |
247 | /// ]).unwrap(); | |
248 | /// let matches: Vec<_> = set | |
249 | /// .matches("foobar") | |
250 | /// .into_iter() | |
251 | /// .map(|match_idx| &set.patterns()[match_idx]) | |
252 | /// .collect(); | |
253 | /// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]); | |
254 | /// ``` | |
255 | pub fn patterns(&self) -> &[String] { | |
256 | self.0.regex_strings() | |
257 | } | |
8bb4bdeb XL |
258 | } |
259 | ||
/// A set of matches returned by a regex set.
#[derive(Clone, Debug)]
pub struct SetMatches {
    // Whether at least one flag in `matches` is set.
    matched_any: bool,
    // One flag per regex in the originating set, in construction order.
    matches: Vec<bool>,
}

impl SetMatches {
    /// Whether this set contains any matches.
    pub fn matched_any(&self) -> bool {
        self.matched_any
    }

    /// Whether the regex at the given index matched.
    ///
    /// The index for a regex is determined by its insertion order upon the
    /// initial construction of a `RegexSet`, starting at `0`.
    ///
    /// # Panics
    ///
    /// If `regex_index` is greater than or equal to `self.len()`.
    pub fn matched(&self, regex_index: usize) -> bool {
        self.matches[regex_index]
    }

    /// The total number of regexes in the set that created these matches.
    pub fn len(&self) -> usize {
        self.matches.len()
    }

    /// Returns an iterator over the indexes of the regexes that matched.
    ///
    /// This will always produce matches in ascending order of index, where
    /// the index corresponds to the index of the regex that matched with
    /// respect to its position when initially building the set.
    pub fn iter(&self) -> SetMatchesIter {
        SetMatchesIter(self.matches.iter().enumerate())
    }
}

impl IntoIterator for SetMatches {
    type IntoIter = SetMatchesIntoIter;
    type Item = usize;

    fn into_iter(self) -> Self::IntoIter {
        SetMatchesIntoIter(self.matches.into_iter().enumerate())
    }
}

impl<'a> IntoIterator for &'a SetMatches {
    type IntoIter = SetMatchesIter<'a>;
    type Item = usize;

    fn into_iter(self) -> Self::IntoIter {
        self.iter()
    }
}

/// An owned iterator over the set of matches from a regex set.
///
/// This will always produce matches in ascending order of index, where the
/// index corresponds to the index of the regex that matched with respect to
/// its position when initially building the set.
#[derive(Debug)]
pub struct SetMatchesIntoIter(iter::Enumerate<vec::IntoIter<bool>>);

impl Iterator for SetMatchesIntoIter {
    type Item = usize;

    /// Advances to the next slot whose flag is set and yields its index.
    fn next(&mut self) -> Option<usize> {
        self.0.find(|&(_, matched)| matched).map(|(index, _)| index)
    }

    /// Returns `(0, upper)`, where `upper` is the number of slots not yet
    /// visited.
    ///
    /// Unmatched slots are skipped, so the number of remaining items can be
    /// anywhere between zero and the number of remaining slots. Forwarding
    /// the inner iterator's lower bound would over-report it.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (_, upper) = self.0.size_hint();
        (0, upper)
    }
}

impl DoubleEndedIterator for SetMatchesIntoIter {
    /// Walks backwards to the nearest slot whose flag is set and yields its
    /// index.
    fn next_back(&mut self) -> Option<usize> {
        self.0
            .by_ref()
            .rev()
            .find(|&(_, matched)| matched)
            .map(|(index, _)| index)
    }
}

impl iter::FusedIterator for SetMatchesIntoIter {}

/// A borrowed iterator over the set of matches from a regex set.
///
/// The lifetime `'a` refers to the lifetime of a `SetMatches` value.
///
/// This will always produce matches in ascending order of index, where the
/// index corresponds to the index of the regex that matched with respect to
/// its position when initially building the set.
#[derive(Clone, Debug)]
pub struct SetMatchesIter<'a>(iter::Enumerate<slice::Iter<'a, bool>>);

impl<'a> Iterator for SetMatchesIter<'a> {
    type Item = usize;

    /// Advances to the next slot whose flag is set and yields its index.
    fn next(&mut self) -> Option<usize> {
        self.0.find(|&(_, &matched)| matched).map(|(index, _)| index)
    }

    /// Returns `(0, upper)`, where `upper` is the number of slots not yet
    /// visited.
    ///
    /// Unmatched slots are skipped, so the number of remaining items can be
    /// anywhere between zero and the number of remaining slots. Forwarding
    /// the inner iterator's lower bound would over-report it.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (_, upper) = self.0.size_hint();
        (0, upper)
    }
}

impl<'a> DoubleEndedIterator for SetMatchesIter<'a> {
    /// Walks backwards to the nearest slot whose flag is set and yields its
    /// index.
    fn next_back(&mut self) -> Option<usize> {
        self.0
            .by_ref()
            .rev()
            .find(|&(_, &matched)| matched)
            .map(|(index, _)| index)
    }
}

impl<'a> iter::FusedIterator for SetMatchesIter<'a> {}
399 | ||
8bb4bdeb XL |
400 | #[doc(hidden)] |
401 | impl From<Exec> for RegexSet { | |
402 | fn from(exec: Exec) -> Self { | |
403 | RegexSet(exec) | |
404 | } | |
405 | } | |
406 | ||
407 | impl fmt::Debug for RegexSet { | |
408 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | |
409 | write!(f, "RegexSet({:?})", self.0.regex_strings()) | |
410 | } | |
411 | } | |
412 | ||
// Adapters supplied to the macro as `$as_bytes` to view the search text as
// raw bytes. Both are emitted into every generated module, but each
// instantiation names only one of them, hence the `dead_code` allowances.
#[allow(dead_code)]
fn as_bytes_str(text: &str) -> &[u8] {
    str::as_bytes(text)
}

#[allow(dead_code)]
fn as_bytes_bytes(text: &[u8]) -> &[u8] {
    text
}
415 | } | |
416 | } | |
417 | } | |
418 | ||
// Instantiates the set API for `&str` search text: generates module
// `unicode`, uses the builder from `re_builder::set_unicode`, and converts
// the text with `as_bytes_str`. The trailing doc lines are spliced into the
// `RegexSet` type documentation as its example.
define_set! {
    unicode,
    set_unicode,
    &str,
    as_bytes_str,
/// ```rust
/// # use regex::RegexSet;
/// let set = RegexSet::new(&[
///     r"[a-z]+@[a-z]+\.(com|org|net)",
///     r"[a-z]+\.(com|org|net)",
/// ]).unwrap();
///
/// // Ask whether any regexes in the set match.
/// assert!(set.is_match("foo@example.com"));
///
/// // Identify which regexes in the set match.
/// let matches: Vec<_> = set.matches("foo@example.com").into_iter().collect();
/// assert_eq!(vec![0, 1], matches);
///
/// // Try again, but with text that only matches one of the regexes.
/// let matches: Vec<_> = set.matches("example.com").into_iter().collect();
/// assert_eq!(vec![1], matches);
///
/// // Try again, but with text that doesn't match any regex in the set.
/// let matches: Vec<_> = set.matches("example").into_iter().collect();
/// assert!(matches.is_empty());
/// ```
}
447 | ||
// Instantiates the set API for `&[u8]` search text: generates module
// `bytes`, uses the builder from `re_builder::set_bytes`, and passes the
// text through `as_bytes_bytes` unchanged. The trailing doc lines are
// spliced into the `RegexSet` type documentation as its example.
define_set! {
    bytes,
    set_bytes,
    &[u8],
    as_bytes_bytes,
/// ```rust
/// # use regex::bytes::RegexSet;
/// let set = RegexSet::new(&[
///     r"[a-z]+@[a-z]+\.(com|org|net)",
///     r"[a-z]+\.(com|org|net)",
/// ]).unwrap();
///
/// // Ask whether any regexes in the set match.
/// assert!(set.is_match(b"foo@example.com"));
///
/// // Identify which regexes in the set match.
/// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect();
/// assert_eq!(vec![0, 1], matches);
///
/// // Try again, but with text that only matches one of the regexes.
/// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect();
/// assert_eq!(vec![1], matches);
///
/// // Try again, but with text that doesn't match any regex in the set.
/// let matches: Vec<_> = set.matches(b"example").into_iter().collect();
/// assert!(matches.is_empty());
/// ```
}