]>
git.proxmox.com Git - rustc.git/blob - vendor/regex/src/re_set.rs
1 macro_rules
! define_set
{
2 ($name
:ident
, $builder_mod
:ident
, $text_ty
:ty
, $as_bytes
:expr
,
3 $
(#[$doc_regexset_example:meta])* ) => {
10 use crate::error
::Error
;
11 use crate::exec
::Exec
;
12 use crate::re_builder
::$builder_mod
::RegexSetBuilder
;
13 use crate::re_trait
::RegularExpression
;
15 /// Match multiple (possibly overlapping) regular expressions in a single scan.
17 /// A regex set corresponds to the union of two or more regular expressions.
18 /// That is, a regex set will match text where at least one of its
19 /// constituent regular expressions matches. A regex set as it is formulated here
20 /// provides a touch more power: it will also report *which* regular
21 /// expressions in the set match. Indeed, this is the key difference between
22 /// regex sets and a single `Regex` with many alternates, since only one
23 /// alternate can match at a time.
25 /// For example, consider regular expressions to match email addresses and
26 /// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a
27 /// regex set is constructed from those regexes, then searching the text
28 /// `foo@example.com` will report both regexes as matching. Of course, one
29 /// could accomplish this by compiling each regex on its own and doing two
30 /// searches over the text. The key advantage of using a regex set is that it
31 /// will report the matching regexes using a *single pass through the text*.
32 /// If one has hundreds or thousands of regexes to match repeatedly (like a URL
33 /// router for a complex web application or a user agent matcher), then a regex
34 /// set can realize huge performance gains.
38 /// This shows how the above two regexes (for matching email addresses and
39 /// domains) might work:
41 $
(#[$doc_regexset_example])*
43 /// Note that it would be possible to adapt the above example to using `Regex`
44 /// with an expression like:
47 /// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net))
50 /// After a match, one could then inspect the capture groups to figure out
51 /// which alternates matched. The problem is that it is hard to make this
52 /// approach scale when there are many regexes since the overlap between each
53 /// alternate isn't always obvious to reason about.
57 /// Regex sets are limited to answering the following two questions:
59 /// 1. Does any regex in the set match?
60 /// 2. If so, which regexes in the set match?
62 /// As with the main `Regex` type, it is cheaper to ask (1) instead of (2)
63 /// since the matching engines can stop after the first match is found.
65 /// Other features like finding the location of successive matches or their
66 /// sub-captures aren't supported. If you need this functionality, the
67 /// recommended approach is to compile each regex in the set independently and
68 /// selectively match them based on which regexes in the set matched.
72 /// A `RegexSet` has the same performance characteristics as `Regex`. Namely,
73 /// search takes `O(mn)` time, where `m` is proportional to the size of the
74 /// regex set and `n` is proportional to the length of the search text.
76 pub struct RegexSet(Exec
);
79 /// Create a new regex set with the given regular expressions.
81 /// This takes an iterator of `S`, where `S` is something that can produce
82 /// a `&str`. If any of the strings in the iterator are not valid regular
83 /// expressions, then an error is returned.
87 /// Create a new regex set from an iterator of strings:
90 /// # use regex::RegexSet;
91 /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
92 /// assert!(set.is_match("foo"));
94 pub fn new
<I
, S
>(exprs
: I
) -> Result
<RegexSet
, Error
>
95 where S
: AsRef
<str>, I
: IntoIterator
<Item
=S
> {
96 RegexSetBuilder
::new(exprs
).build()
99 /// Create a new empty regex set.
104 /// # use regex::RegexSet;
105 /// let set = RegexSet::empty();
106 /// assert!(set.is_empty());
108 pub fn empty() -> RegexSet
{
109 RegexSetBuilder
::new(&[""; 0]).build().unwrap()
112 /// Returns true if and only if one of the regexes in this set matches the text given.
115 /// This method should be preferred if you only need to test whether any
116 /// of the regexes in the set should match, but don't care about *which*
117 /// regexes matched. This is because the underlying matching engine will
118 /// quit immediately after seeing the first match instead of continuing to
119 /// find all matches.
121 /// Note that as with searches using `Regex`, the expression is unanchored
122 /// by default. That is, if the regex does not start with `^` or `\A`, or
123 /// end with `$` or `\z`, then it is permitted to match anywhere in the text.
128 /// Tests whether a set matches some text:
131 /// # use regex::RegexSet;
132 /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
133 /// assert!(set.is_match("foo"));
134 /// assert!(!set.is_match("☃"));
136 pub fn is_match(&self, text
: $text_ty
) -> bool
{
137 self.is_match_at(text
, 0)
140 /// Returns the same as is_match, but starts the search at the given offset.
143 /// The significance of the starting point is that it takes the surrounding
144 /// context into consideration. For example, the `\A` anchor can only
145 /// match when `start == 0`.
147 pub fn is_match_at(&self, text
: $text_ty
, start
: usize) -> bool
{
148 self.0.searcher().is_match_at($
as_bytes(text
), start
)
151 /// Returns the set of regular expressions that match in the given text.
153 /// The set returned contains the index of each regular expression that
154 /// matches in the given text. The index is in correspondence with the
155 /// order of regular expressions given to `RegexSet`'s constructor.
157 /// The set can also be used to iterate over the matched indices.
159 /// Note that as with searches using `Regex`, the expression is unanchored
160 /// by default. That is, if the regex does not start with `^` or `\A`, or
161 /// end with `$` or `\z`, then it is permitted to match anywhere in the text.
166 /// Tests which regular expressions match the given text:
169 /// # use regex::RegexSet;
170 /// let set = RegexSet::new(&[
179 /// let matches: Vec<_> = set.matches("foobar").into_iter().collect();
180 /// assert_eq!(matches, vec![0, 2, 3, 4, 6]);
182 /// // You can also test whether a particular regex matched:
183 /// let matches = set.matches("foobar");
184 /// assert!(!matches.matched(5));
185 /// assert!(matches.matched(6));
187 pub fn matches(&self, text
: $text_ty
) -> SetMatches
{
188 let mut matches
= vec
![false; self.0.regex_strings().len()];
189 let any
= self.read_matches_at(&mut matches
, text
, 0);
196 /// Returns the same as matches, but starts the search at the given
197 /// offset and stores the matches into the slice given.
199 /// The significance of the starting point is that it takes the surrounding
200 /// context into consideration. For example, the `\A` anchor can only
201 /// match when `start == 0`.
203 /// `matches` must have a length that is at least the number of regexes in this set.
206 /// This method returns true if and only if at least one member of
207 /// `matches` is true after executing the set against `text`.
209 pub fn read_matches_at(
211 matches
: &mut [bool
],
215 self.0.searcher().many_matches_at(matches
, $
as_bytes(text
), start
)
218 /// Returns the total number of regular expressions in this set.
219 pub fn len(&self) -> usize {
220 self.0.regex_strings().len()
223 /// Returns `true` if this set contains no regular expressions.
224 pub fn is_empty(&self) -> bool
{
225 self.0.regex_strings().is_empty()
228 /// Returns the patterns that this set will match on.
230 /// This function can be used to determine the pattern for a match. The
231 /// slice returned has exactly as many patterns as were given to this regex set,
232 /// and the order of the slice is the same as the order of the patterns
233 /// provided to the set.
238 /// # use regex::RegexSet;
239 /// let set = RegexSet::new(&[
248 /// let matches: Vec<_> = set
249 /// .matches("foobar")
251 /// .map(|match_idx| &set.patterns()[match_idx])
253 /// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]);
255 pub fn patterns(&self) -> &[String
] {
256 self.0.regex_strings()
260 /// A set of matches returned by a regex set.
261 #[derive(Clone, Debug)]
262 pub struct SetMatches
{
268 /// Whether this set contains any matches.
269 pub fn matched_any(&self) -> bool
{
273 /// Whether the regex at the given index matched.
275 /// The index for a regex is determined by its insertion order upon the
276 /// initial construction of a `RegexSet`, starting at `0`.
280 /// Panics if `regex_index` is greater than or equal to `self.len()`.
281 pub fn matched(&self, regex_index
: usize) -> bool
{
282 self.matches
[regex_index
]
285 /// The total number of regexes in the set that created these matches.
286 pub fn len(&self) -> usize {
290 /// Returns an iterator over indexes in the regex that matched.
292 /// This will always produce matches in ascending order of index, where
293 /// the index corresponds to the index of the regex that matched with
294 /// respect to its position when initially building the set.
295 pub fn iter(&self) -> SetMatchesIter
<'_
> {
296 SetMatchesIter((&*self.matches
).into_iter().enumerate())
300 impl IntoIterator
for SetMatches
{
301 type IntoIter
= SetMatchesIntoIter
;
304 fn into_iter(self) -> Self::IntoIter
{
305 SetMatchesIntoIter(self.matches
.into_iter().enumerate())
309 impl<'a
> IntoIterator
for &'a SetMatches
{
310 type IntoIter
= SetMatchesIter
<'a
>;
313 fn into_iter(self) -> Self::IntoIter
{
318 /// An owned iterator over the set of matches from a regex set.
320 /// This will always produce matches in ascending order of index, where the
321 /// index corresponds to the index of the regex that matched with respect to
322 /// its position when initially building the set.
324 pub struct SetMatchesIntoIter(iter
::Enumerate
<vec
::IntoIter
<bool
>>);
326 impl Iterator
for SetMatchesIntoIter
{
329 fn next(&mut self) -> Option
<usize> {
331 match self.0.next() {
333 Some((_
, false)) => {}
334 Some((i
, true)) => return Some(i
),
339 fn size_hint(&self) -> (usize, Option
<usize>) {
344 impl DoubleEndedIterator
for SetMatchesIntoIter
{
345 fn next_back(&mut self) -> Option
<usize> {
347 match self.0.next_back() {
349 Some((_
, false)) => {}
350 Some((i
, true)) => return Some(i
),
356 impl iter
::FusedIterator
for SetMatchesIntoIter {}
358 /// A borrowed iterator over the set of matches from a regex set.
360 /// The lifetime `'a` refers to the lifetime of a `SetMatches` value.
362 /// This will always produce matches in ascending order of index, where the
363 /// index corresponds to the index of the regex that matched with respect to
364 /// its position when initially building the set.
365 #[derive(Clone, Debug)]
366 pub struct SetMatchesIter
<'a
>(iter
::Enumerate
<slice
::Iter
<'a
, bool
>>);
368 impl<'a
> Iterator
for SetMatchesIter
<'a
> {
371 fn next(&mut self) -> Option
<usize> {
373 match self.0.next() {
375 Some((_
, &false)) => {}
376 Some((i
, &true)) => return Some(i
),
381 fn size_hint(&self) -> (usize, Option
<usize>) {
386 impl<'a
> DoubleEndedIterator
for SetMatchesIter
<'a
> {
387 fn next_back(&mut self) -> Option
<usize> {
389 match self.0.next_back() {
391 Some((_
, &false)) => {}
392 Some((i
, &true)) => return Some(i
),
398 impl<'a
> iter
::FusedIterator
for SetMatchesIter
<'a
> {}
401 impl From
<Exec
> for RegexSet
{
402 fn from(exec
: Exec
) -> Self {
407 impl fmt
::Debug
for RegexSet
{
408 fn fmt(&self, f
: &mut fmt
::Formatter
<'_
>) -> fmt
::Result
{
409 write
!(f
, "RegexSet({:?})", self.0.regex_strings())
/// Returns the UTF-8 bytes backing `text`.
///
/// The returned slice borrows from `text`; nothing is copied.
///
/// NOTE(review): presumably supplied as the `$as_bytes` argument of
/// `define_set!` for the `&str`-based set, where it is applied to the search
/// text before it is handed to the searcher — confirm at the invocation site.
// NOTE(review): `dead_code` is presumably allowed because this helper is only
// referenced through a macro argument — confirm before removing the attribute.
#[allow(dead_code)]
fn as_bytes_str(text: &str) -> &[u8] {
    text.as_bytes()
}
/// Returns `text` unchanged.
///
/// This is the identity conversion for input that is already a byte slice;
/// the returned slice is the same borrow that was passed in.
///
/// NOTE(review): presumably supplied as the `$as_bytes` argument of
/// `define_set!` for the `&[u8]`-based set, mirroring `as_bytes_str` —
/// confirm at the invocation site.
// NOTE(review): `dead_code` is presumably allowed because this helper is only
// referenced through a macro argument — confirm before removing the attribute.
#[allow(dead_code)]
fn as_bytes_bytes(text: &[u8]) -> &[u8] {
    text
}
425 /// # use regex::RegexSet;
426 /// let set = RegexSet::new(&[
427 /// r"[a-z]+@[a-z]+\.(com|org|net)",
428 /// r"[a-z]+\.(com|org|net)",
431 /// // Ask whether any regexes in the set match.
432 /// assert!(set.is_match("foo@example.com"));
434 /// // Identify which regexes in the set match.
435 /// let matches: Vec<_> = set.matches("foo@example.com").into_iter().collect();
436 /// assert_eq!(vec![0, 1], matches);
438 /// // Try again, but with text that only matches one of the regexes.
439 /// let matches: Vec<_> = set.matches("example.com").into_iter().collect();
440 /// assert_eq!(vec![1], matches);
442 /// // Try again, but with text that doesn't match any regex in the set.
443 /// let matches: Vec<_> = set.matches("example").into_iter().collect();
444 /// assert!(matches.is_empty());
454 /// # use regex::bytes::RegexSet;
455 /// let set = RegexSet::new(&[
456 /// r"[a-z]+@[a-z]+\.(com|org|net)",
457 /// r"[a-z]+\.(com|org|net)",
460 /// // Ask whether any regexes in the set match.
461 /// assert!(set.is_match(b"foo@example.com"));
463 /// // Identify which regexes in the set match.
464 /// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect();
465 /// assert_eq!(vec![0, 1], matches);
467 /// // Try again, but with text that only matches one of the regexes.
468 /// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect();
469 /// assert_eq!(vec![1], matches);
471 /// // Try again, but with text that doesn't match any regex in the set.
472 /// let matches: Vec<_> = set.matches(b"example").into_iter().collect();
473 /// assert!(matches.is_empty());