]> git.proxmox.com Git - rustc.git/blame - vendor/regex-1.4.6/src/re_set.rs
New upstream version 1.56.0+dfsg1
[rustc.git] / vendor / regex-1.4.6 / src / re_set.rs
CommitLineData
8bb4bdeb
XL
1macro_rules! define_set {
2 ($name:ident, $builder_mod:ident, $text_ty:ty, $as_bytes:expr,
3 $(#[$doc_regexset_example:meta])* ) => {
4 pub mod $name {
5 use std::fmt;
6 use std::iter;
7 use std::slice;
8 use std::vec;
9
10 use error::Error;
11 use exec::Exec;
12 use re_builder::$builder_mod::RegexSetBuilder;
13 use re_trait::RegularExpression;
14
15/// Match multiple (possibly overlapping) regular expressions in a single scan.
16///
17/// A regex set corresponds to the union of two or more regular expressions.
18/// That is, a regex set will match text where at least one of its
19/// constituent regular expressions matches. A regex set as its formulated here
20/// provides a touch more power: it will also report *which* regular
21/// expressions in the set match. Indeed, this is the key difference between
22/// regex sets and a single `Regex` with many alternates, since only one
23/// alternate can match at a time.
24///
25/// For example, consider regular expressions to match email addresses and
26/// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a
27/// regex set is constructed from those regexes, then searching the text
28/// `foo@example.com` will report both regexes as matching. Of course, one
29/// could accomplish this by compiling each regex on its own and doing two
30/// searches over the text. The key advantage of using a regex set is that it
31/// will report the matching regexes using a *single pass through the text*.
32/// If one has hundreds or thousands of regexes to match repeatedly (like a URL
33/// router for a complex web application or a user agent matcher), then a regex
34/// set can realize huge performance gains.
35///
36/// # Example
37///
38/// This shows how the above two regexes (for matching email addresses and
39/// domains) might work:
40///
41$(#[$doc_regexset_example])*
42///
43/// Note that it would be possible to adapt the above example to using `Regex`
44/// with an expression like:
45///
136023e0 46/// ```text
8bb4bdeb
XL
47/// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net))
48/// ```
49///
50/// After a match, one could then inspect the capture groups to figure out
51/// which alternates matched. The problem is that it is hard to make this
52/// approach scale when there are many regexes since the overlap between each
53/// alternate isn't always obvious to reason about.
54///
55/// # Limitations
56///
57/// Regex sets are limited to answering the following two questions:
58///
59/// 1. Does any regex in the set match?
60/// 2. If so, which regexes in the set match?
61///
62/// As with the main `Regex` type, it is cheaper to ask (1) instead of (2)
63/// since the matching engines can stop after the first match is found.
64///
65/// Other features like finding the location of successive matches or their
66/// sub-captures aren't supported. If you need this functionality, the
67/// recommended approach is to compile each regex in the set independently and
68/// selectively match them based on which regexes in the set matched.
69///
70/// # Performance
71///
72/// A `RegexSet` has the same performance characteristics as `Regex`. Namely,
73/// search takes `O(mn)` time, where `m` is proportional to the size of the
74/// regex set and `n` is proportional to the length of the search text.
75#[derive(Clone)]
76pub struct RegexSet(Exec);
77
78impl RegexSet {
79 /// Create a new regex set with the given regular expressions.
80 ///
81 /// This takes an iterator of `S`, where `S` is something that can produce
82 /// a `&str`. If any of the strings in the iterator are not valid regular
83 /// expressions, then an error is returned.
84 ///
85 /// # Example
86 ///
87 /// Create a new regex set from an iterator of strings:
88 ///
89 /// ```rust
90 /// # use regex::RegexSet;
91 /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
92 /// assert!(set.is_match("foo"));
93 /// ```
94 pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error>
95 where S: AsRef<str>, I: IntoIterator<Item=S> {
96 RegexSetBuilder::new(exprs).build()
97 }
98
5869c6ff
XL
99 /// Create a new empty regex set.
100 ///
101 /// # Example
102 ///
103 /// ```rust
104 /// # use regex::RegexSet;
105 /// let set = RegexSet::empty();
106 /// assert!(set.is_empty());
107 /// ```
108 pub fn empty() -> RegexSet {
109 RegexSetBuilder::new(&[""; 0]).build().unwrap()
110 }
111
8bb4bdeb
XL
112 /// Returns true if and only if one of the regexes in this set matches
113 /// the text given.
114 ///
115 /// This method should be preferred if you only need to test whether any
116 /// of the regexes in the set should match, but don't care about *which*
117 /// regexes matched. This is because the underlying matching engine will
118 /// quit immediately after seeing the first match instead of continuing to
119 /// find all matches.
120 ///
121 /// Note that as with searches using `Regex`, the expression is unanchored
122 /// by default. That is, if the regex does not start with `^` or `\A`, or
123 /// end with `$` or `\z`, then it is permitted to match anywhere in the
124 /// text.
125 ///
126 /// # Example
127 ///
128 /// Tests whether a set matches some text:
129 ///
130 /// ```rust
131 /// # use regex::RegexSet;
132 /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
133 /// assert!(set.is_match("foo"));
134 /// assert!(!set.is_match("☃"));
135 /// ```
136 pub fn is_match(&self, text: $text_ty) -> bool {
137 self.is_match_at(text, 0)
138 }
139
140 /// Returns the same as is_match, but starts the search at the given
141 /// offset.
142 ///
143 /// The significance of the starting point is that it takes the surrounding
144 /// context into consideration. For example, the `\A` anchor can only
145 /// match when `start == 0`.
146 #[doc(hidden)]
147 pub fn is_match_at(&self, text: $text_ty, start: usize) -> bool {
148 self.0.searcher().is_match_at($as_bytes(text), start)
149 }
150
151 /// Returns the set of regular expressions that match in the given text.
152 ///
153 /// The set returned contains the index of each regular expression that
154 /// matches in the given text. The index is in correspondence with the
155 /// order of regular expressions given to `RegexSet`'s constructor.
156 ///
157 /// The set can also be used to iterate over the matched indices.
158 ///
159 /// Note that as with searches using `Regex`, the expression is unanchored
160 /// by default. That is, if the regex does not start with `^` or `\A`, or
161 /// end with `$` or `\z`, then it is permitted to match anywhere in the
162 /// text.
163 ///
164 /// # Example
165 ///
166 /// Tests which regular expressions match the given text:
167 ///
168 /// ```rust
169 /// # use regex::RegexSet;
170 /// let set = RegexSet::new(&[
171 /// r"\w+",
172 /// r"\d+",
173 /// r"\pL+",
174 /// r"foo",
175 /// r"bar",
176 /// r"barfoo",
177 /// r"foobar",
178 /// ]).unwrap();
179 /// let matches: Vec<_> = set.matches("foobar").into_iter().collect();
180 /// assert_eq!(matches, vec![0, 2, 3, 4, 6]);
181 ///
182 /// // You can also test whether a particular regex matched:
183 /// let matches = set.matches("foobar");
184 /// assert!(!matches.matched(5));
185 /// assert!(matches.matched(6));
186 /// ```
187 pub fn matches(&self, text: $text_ty) -> SetMatches {
188 let mut matches = vec![false; self.0.regex_strings().len()];
189 let any = self.read_matches_at(&mut matches, text, 0);
190 SetMatches {
191 matched_any: any,
192 matches: matches,
193 }
194 }
195
196 /// Returns the same as matches, but starts the search at the given
197 /// offset and stores the matches into the slice given.
198 ///
199 /// The significance of the starting point is that it takes the surrounding
200 /// context into consideration. For example, the `\A` anchor can only
201 /// match when `start == 0`.
202 ///
203 /// `matches` must have a length that is at least the number of regexes
204 /// in this set.
205 ///
206 /// This method returns true if and only if at least one member of
207 /// `matches` is true after executing the set against `text`.
208 #[doc(hidden)]
209 pub fn read_matches_at(
210 &self,
211 matches: &mut [bool],
212 text: $text_ty,
213 start: usize,
214 ) -> bool {
215 self.0.searcher().many_matches_at(matches, $as_bytes(text), start)
216 }
217
218 /// Returns the total number of regular expressions in this set.
219 pub fn len(&self) -> usize {
220 self.0.regex_strings().len()
221 }
0731742a 222
5869c6ff
XL
223 /// Returns `true` if this set contains no regular expressions.
224 pub fn is_empty(&self) -> bool {
225 self.0.regex_strings().is_empty()
226 }
227
0731742a
XL
228 /// Returns the patterns that this set will match on.
229 ///
f9f354fc
XL
230 /// This function can be used to determine the pattern for a match. The
231 /// slice returned has exactly as many patterns givens to this regex set,
232 /// and the order of the slice is the same as the order of the patterns
0731742a
XL
233 /// provided to the set.
234 ///
235 /// # Example
236 ///
237 /// ```rust
238 /// # use regex::RegexSet;
239 /// let set = RegexSet::new(&[
240 /// r"\w+",
241 /// r"\d+",
242 /// r"\pL+",
243 /// r"foo",
244 /// r"bar",
245 /// r"barfoo",
246 /// r"foobar",
247 /// ]).unwrap();
248 /// let matches: Vec<_> = set
249 /// .matches("foobar")
250 /// .into_iter()
251 /// .map(|match_idx| &set.patterns()[match_idx])
252 /// .collect();
253 /// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]);
254 /// ```
255 pub fn patterns(&self) -> &[String] {
256 self.0.regex_strings()
257 }
8bb4bdeb
XL
258}
259
260/// A set of matches returned by a regex set.
261#[derive(Clone, Debug)]
262pub struct SetMatches {
263 matched_any: bool,
264 matches: Vec<bool>,
265}
266
267impl SetMatches {
268 /// Whether this set contains any matches.
269 pub fn matched_any(&self) -> bool {
270 self.matched_any
271 }
272
273 /// Whether the regex at the given index matched.
274 ///
275 /// The index for a regex is determined by its insertion order upon the
276 /// initial construction of a `RegexSet`, starting at `0`.
277 ///
278 /// # Panics
279 ///
280 /// If `regex_index` is greater than or equal to `self.len()`.
281 pub fn matched(&self, regex_index: usize) -> bool {
282 self.matches[regex_index]
283 }
284
285 /// The total number of regexes in the set that created these matches.
286 pub fn len(&self) -> usize {
287 self.matches.len()
288 }
289
290 /// Returns an iterator over indexes in the regex that matched.
94b46f34
XL
291 ///
292 /// This will always produces matches in ascending order of index, where
293 /// the index corresponds to the index of the regex that matched with
294 /// respect to its position when initially building the set.
8bb4bdeb
XL
295 pub fn iter(&self) -> SetMatchesIter {
296 SetMatchesIter((&*self.matches).into_iter().enumerate())
297 }
298}
299
300impl IntoIterator for SetMatches {
301 type IntoIter = SetMatchesIntoIter;
302 type Item = usize;
303
304 fn into_iter(self) -> Self::IntoIter {
305 SetMatchesIntoIter(self.matches.into_iter().enumerate())
306 }
307}
308
309impl<'a> IntoIterator for &'a SetMatches {
310 type IntoIter = SetMatchesIter<'a>;
311 type Item = usize;
312
313 fn into_iter(self) -> Self::IntoIter {
314 self.iter()
315 }
316}
317
/// An owned iterator over the set of matches from a regex set.
///
/// This will always produce matches in ascending order of index, where the
/// index corresponds to the index of the regex that matched with respect to
/// its position when initially building the set.
#[derive(Debug)]
pub struct SetMatchesIntoIter(iter::Enumerate<vec::IntoIter<bool>>);

impl Iterator for SetMatchesIntoIter {
    type Item = usize;

    /// Advances to the next slot whose flag is `true` and returns its index,
    /// or `None` once every remaining slot has been examined.
    fn next(&mut self) -> Option<usize> {
        self.0.find(|&(_, matched)| matched).map(|(index, _)| index)
    }

    /// Returns `(0, upper)`, where `upper` is the inner iterator's upper
    /// bound.
    ///
    /// The inner iterator counts every remaining slot, matched or not, while
    /// this iterator skips slots that did not match. The inner lower bound
    /// would therefore overstate how many items can still be produced, so
    /// only the upper bound is forwarded.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (_, upper) = self.0.size_hint();
        (0, upper)
    }
}

impl DoubleEndedIterator for SetMatchesIntoIter {
    /// Walks backwards to the closest slot whose flag is `true` and returns
    /// its index, or `None` once every remaining slot has been examined.
    fn next_back(&mut self) -> Option<usize> {
        self.0.rfind(|&(_, matched)| matched).map(|(index, _)| index)
    }
}

impl iter::FusedIterator for SetMatchesIntoIter {}
357
8bb4bdeb
XL
/// A borrowed iterator over the set of matches from a regex set.
///
/// The lifetime `'a` refers to the lifetime of a `SetMatches` value.
///
/// This will always produce matches in ascending order of index, where the
/// index corresponds to the index of the regex that matched with respect to
/// its position when initially building the set.
#[derive(Clone, Debug)]
pub struct SetMatchesIter<'a>(iter::Enumerate<slice::Iter<'a, bool>>);

impl<'a> Iterator for SetMatchesIter<'a> {
    type Item = usize;

    /// Advances to the next slot whose flag is `true` and returns its index,
    /// or `None` once every remaining slot has been examined.
    fn next(&mut self) -> Option<usize> {
        self.0.find(|&(_, &matched)| matched).map(|(index, _)| index)
    }

    /// Returns `(0, upper)`, where `upper` is the inner iterator's upper
    /// bound.
    ///
    /// The inner iterator counts every remaining slot, matched or not, while
    /// this iterator skips slots that did not match. The inner lower bound
    /// would therefore overstate how many items can still be produced, so
    /// only the upper bound is forwarded.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (_, upper) = self.0.size_hint();
        (0, upper)
    }
}

impl<'a> DoubleEndedIterator for SetMatchesIter<'a> {
    /// Walks backwards to the closest slot whose flag is `true` and returns
    /// its index, or `None` once every remaining slot has been examined.
    fn next_back(&mut self) -> Option<usize> {
        self.0.rfind(|&(_, &matched)| matched).map(|(index, _)| index)
    }
}

impl<'a> iter::FusedIterator for SetMatchesIter<'a> {}
399
8bb4bdeb
XL
400#[doc(hidden)]
401impl From<Exec> for RegexSet {
402 fn from(exec: Exec) -> Self {
403 RegexSet(exec)
404 }
405}
406
407impl fmt::Debug for RegexSet {
408 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
409 write!(f, "RegexSet({:?})", self.0.regex_strings())
410 }
411}
412
/// Views a string slice as its underlying bytes.
// Both conversion helpers are emitted by every `define_set!` expansion, but
// each expansion is handed only one of them, hence `dead_code` is allowed.
#[allow(dead_code)]
fn as_bytes_str(text: &str) -> &[u8] {
    str::as_bytes(text)
}
/// Identity conversion: the text is already a byte slice, so it is handed
/// back unchanged.
// Both conversion helpers are emitted by every `define_set!` expansion, but
// each expansion is handed only one of them, hence `dead_code` is allowed.
#[allow(dead_code)]
fn as_bytes_bytes(text: &[u8]) -> &[u8] {
    text
}
415 }
416 }
417}
418
419define_set! {
420 unicode,
421 set_unicode,
422 &str,
423 as_bytes_str,
424/// ```rust
425/// # use regex::RegexSet;
426/// let set = RegexSet::new(&[
427/// r"[a-z]+@[a-z]+\.(com|org|net)",
428/// r"[a-z]+\.(com|org|net)",
429/// ]).unwrap();
430///
431/// // Ask whether any regexes in the set match.
432/// assert!(set.is_match("foo@example.com"));
433///
434/// // Identify which regexes in the set match.
435/// let matches: Vec<_> = set.matches("foo@example.com").into_iter().collect();
436/// assert_eq!(vec![0, 1], matches);
437///
438/// // Try again, but with text that only matches one of the regexes.
439/// let matches: Vec<_> = set.matches("example.com").into_iter().collect();
440/// assert_eq!(vec![1], matches);
441///
442/// // Try again, but with text that doesn't match any regex in the set.
443/// let matches: Vec<_> = set.matches("example").into_iter().collect();
444/// assert!(matches.is_empty());
445/// ```
446}
447
448define_set! {
449 bytes,
450 set_bytes,
451 &[u8],
452 as_bytes_bytes,
453/// ```rust
454/// # use regex::bytes::RegexSet;
455/// let set = RegexSet::new(&[
456/// r"[a-z]+@[a-z]+\.(com|org|net)",
457/// r"[a-z]+\.(com|org|net)",
458/// ]).unwrap();
459///
460/// // Ask whether any regexes in the set match.
461/// assert!(set.is_match(b"foo@example.com"));
462///
463/// // Identify which regexes in the set match.
464/// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect();
465/// assert_eq!(vec![0, 1], matches);
466///
467/// // Try again, but with text that only matches one of the regexes.
468/// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect();
469/// assert_eq!(vec![1], matches);
470///
471/// // Try again, but with text that doesn't match any regex in the set.
472/// let matches: Vec<_> = set.matches(b"example").into_iter().collect();
473/// assert!(matches.is_empty());
474/// ```
475}