]> git.proxmox.com Git - rustc.git/blob - vendor/regex-syntax/src/hir/mod.rs
New upstream version 1.45.0+dfsg1
[rustc.git] / vendor / regex-syntax / src / hir / mod.rs
1 /*!
2 Defines a high-level intermediate representation for regular expressions.
3 */
4 use std::char;
5 use std::cmp;
6 use std::error;
7 use std::fmt;
8 use std::result;
9 use std::u8;
10
11 use ast::Span;
12 use hir::interval::{Interval, IntervalSet, IntervalSetIter};
13 use unicode;
14
15 pub use hir::visitor::{visit, Visitor};
16 pub use unicode::CaseFoldError;
17
18 mod interval;
19 pub mod literal;
20 pub mod print;
21 pub mod translate;
22 mod visitor;
23
24 /// An error that can occur while translating an `Ast` to a `Hir`.
25 #[derive(Clone, Debug, Eq, PartialEq)]
26 pub struct Error {
27 /// The kind of error.
28 kind: ErrorKind,
29 /// The original pattern that the translator's Ast was parsed from. Every
30 /// span in an error is a valid range into this string.
31 pattern: String,
32 /// The span of this error, derived from the Ast given to the translator.
33 span: Span,
34 }
35
36 impl Error {
37 /// Return the type of this error.
38 pub fn kind(&self) -> &ErrorKind {
39 &self.kind
40 }
41
42 /// The original pattern string in which this error occurred.
43 ///
44 /// Every span reported by this error is reported in terms of this string.
45 pub fn pattern(&self) -> &str {
46 &self.pattern
47 }
48
49 /// Return the span at which this error occurred.
50 pub fn span(&self) -> &Span {
51 &self.span
52 }
53 }
54
55 /// The type of an error that occurred while building an `Hir`.
56 #[derive(Clone, Debug, Eq, PartialEq)]
57 pub enum ErrorKind {
58 /// This error occurs when a Unicode feature is used when Unicode
59 /// support is disabled. For example `(?-u:\pL)` would trigger this error.
60 UnicodeNotAllowed,
61 /// This error occurs when translating a pattern that could match a byte
62 /// sequence that isn't UTF-8 and `allow_invalid_utf8` was disabled.
63 InvalidUtf8,
64 /// This occurs when an unrecognized Unicode property name could not
65 /// be found.
66 UnicodePropertyNotFound,
67 /// This occurs when an unrecognized Unicode property value could not
68 /// be found.
69 UnicodePropertyValueNotFound,
70 /// This occurs when a Unicode-aware Perl character class (`\w`, `\s` or
71 /// `\d`) could not be found. This can occur when the `unicode-perl`
72 /// crate feature is not enabled.
73 UnicodePerlClassNotFound,
74 /// This occurs when the Unicode simple case mapping tables are not
75 /// available, and the regular expression required Unicode aware case
76 /// insensitivity.
77 UnicodeCaseUnavailable,
78 /// This occurs when the translator attempts to construct a character class
79 /// that is empty.
80 ///
81 /// Note that this restriction in the translator may be removed in the
82 /// future.
83 EmptyClassNotAllowed,
84 /// Hints that destructuring should not be exhaustive.
85 ///
86 /// This enum may grow additional variants, so this makes sure clients
87 /// don't count on exhaustive matching. (Otherwise, adding a new variant
88 /// could break existing code.)
89 #[doc(hidden)]
90 __Nonexhaustive,
91 }
92
93 impl ErrorKind {
94 // TODO: Remove this method entirely on the next breaking semver release.
95 #[allow(deprecated)]
96 fn description(&self) -> &str {
97 use self::ErrorKind::*;
98 match *self {
99 UnicodeNotAllowed => "Unicode not allowed here",
100 InvalidUtf8 => "pattern can match invalid UTF-8",
101 UnicodePropertyNotFound => "Unicode property not found",
102 UnicodePropertyValueNotFound => "Unicode property value not found",
103 UnicodePerlClassNotFound => {
104 "Unicode-aware Perl class not found \
105 (make sure the unicode-perl feature is enabled)"
106 }
107 UnicodeCaseUnavailable => {
108 "Unicode-aware case insensitivity matching is not available \
109 (make sure the unicode-case feature is enabled)"
110 }
111 EmptyClassNotAllowed => "empty character classes are not allowed",
112 __Nonexhaustive => unreachable!(),
113 }
114 }
115 }
116
117 impl error::Error for Error {
118 // TODO: Remove this method entirely on the next breaking semver release.
119 #[allow(deprecated)]
120 fn description(&self) -> &str {
121 self.kind.description()
122 }
123 }
124
125 impl fmt::Display for Error {
126 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
127 ::error::Formatter::from(self).fmt(f)
128 }
129 }
130
131 impl fmt::Display for ErrorKind {
132 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
133 // TODO: Remove this on the next breaking semver release.
134 #[allow(deprecated)]
135 f.write_str(self.description())
136 }
137 }
138
139 /// A high-level intermediate representation (HIR) for a regular expression.
140 ///
141 /// The HIR of a regular expression represents an intermediate step between its
142 /// abstract syntax (a structured description of the concrete syntax) and
143 /// compiled byte codes. The purpose of HIR is to make regular expressions
144 /// easier to analyze. In particular, the AST is much more complex than the
145 /// HIR. For example, while an AST supports arbitrarily nested character
146 /// classes, the HIR will flatten all nested classes into a single set. The HIR
147 /// will also "compile away" every flag present in the concrete syntax. For
148 /// example, users of HIR expressions never need to worry about case folding;
149 /// it is handled automatically by the translator (e.g., by translating `(?i)A`
150 /// to `[aA]`).
151 ///
152 /// If the HIR was produced by a translator that disallows invalid UTF-8, then
153 /// the HIR is guaranteed to match UTF-8 exclusively.
154 ///
155 /// This type defines its own destructor that uses constant stack space and
156 /// heap space proportional to the size of the HIR.
157 ///
158 /// The specific type of an HIR expression can be accessed via its `kind`
159 /// or `into_kind` methods. This extra level of indirection exists for two
160 /// reasons:
161 ///
162 /// 1. Construction of an HIR expression *must* use the constructor methods
163 /// on this `Hir` type instead of building the `HirKind` values directly.
164 /// This permits construction to enforce invariants like "concatenations
165 /// always consist of two or more sub-expressions."
166 /// 2. Every HIR expression contains attributes that are defined inductively,
167 /// and can be computed cheaply during the construction process. For
168 /// example, one such attribute is whether the expression must match at the
169 /// beginning of the text.
170 ///
171 /// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular
172 /// expression pattern string, and uses constant stack space and heap space
173 /// proportional to the size of the `Hir`.
174 #[derive(Clone, Debug, Eq, PartialEq)]
175 pub struct Hir {
176 /// The underlying HIR kind.
177 kind: HirKind,
178 /// Analysis info about this HIR, computed during construction.
179 info: HirInfo,
180 }
181
182 /// The kind of an arbitrary `Hir` expression.
183 #[derive(Clone, Debug, Eq, PartialEq)]
184 pub enum HirKind {
185 /// The empty regular expression, which matches everything, including the
186 /// empty string.
187 Empty,
188 /// A single literal character that matches exactly this character.
189 Literal(Literal),
190 /// A single character class that matches any of the characters in the
191 /// class. A class can either consist of Unicode scalar values as
192 /// characters, or it can use bytes.
193 Class(Class),
194 /// An anchor assertion. An anchor assertion match always has zero length.
195 Anchor(Anchor),
196 /// A word boundary assertion, which may or may not be Unicode aware. A
197 /// word boundary assertion match always has zero length.
198 WordBoundary(WordBoundary),
199 /// A repetition operation applied to a child expression.
200 Repetition(Repetition),
201 /// A possibly capturing group, which contains a child expression.
202 Group(Group),
203 /// A concatenation of expressions. A concatenation always has at least two
204 /// child expressions.
205 ///
206 /// A concatenation matches only if each of its child expression matches
207 /// one after the other.
208 Concat(Vec<Hir>),
209 /// An alternation of expressions. An alternation always has at least two
210 /// child expressions.
211 ///
212 /// An alternation matches only if at least one of its child expression
213 /// matches. If multiple expressions match, then the leftmost is preferred.
214 Alternation(Vec<Hir>),
215 }
216
217 impl Hir {
218 /// Returns a reference to the underlying HIR kind.
219 pub fn kind(&self) -> &HirKind {
220 &self.kind
221 }
222
223 /// Consumes ownership of this HIR expression and returns its underlying
224 /// `HirKind`.
225 pub fn into_kind(mut self) -> HirKind {
226 use std::mem;
227 mem::replace(&mut self.kind, HirKind::Empty)
228 }
229
230 /// Returns an empty HIR expression.
231 ///
232 /// An empty HIR expression always matches, including the empty string.
233 pub fn empty() -> Hir {
234 let mut info = HirInfo::new();
235 info.set_always_utf8(true);
236 info.set_all_assertions(true);
237 info.set_anchored_start(false);
238 info.set_anchored_end(false);
239 info.set_line_anchored_start(false);
240 info.set_line_anchored_end(false);
241 info.set_any_anchored_start(false);
242 info.set_any_anchored_end(false);
243 info.set_match_empty(true);
244 info.set_literal(true);
245 info.set_alternation_literal(true);
246 Hir { kind: HirKind::Empty, info: info }
247 }
248
249 /// Creates a literal HIR expression.
250 ///
251 /// If the given literal has a `Byte` variant with an ASCII byte, then this
252 /// method panics. This enforces the invariant that `Byte` variants are
253 /// only used to express matching of invalid UTF-8.
254 pub fn literal(lit: Literal) -> Hir {
255 if let Literal::Byte(b) = lit {
256 assert!(b > 0x7F);
257 }
258
259 let mut info = HirInfo::new();
260 info.set_always_utf8(lit.is_unicode());
261 info.set_all_assertions(false);
262 info.set_anchored_start(false);
263 info.set_anchored_end(false);
264 info.set_line_anchored_start(false);
265 info.set_line_anchored_end(false);
266 info.set_any_anchored_start(false);
267 info.set_any_anchored_end(false);
268 info.set_match_empty(false);
269 info.set_literal(true);
270 info.set_alternation_literal(true);
271 Hir { kind: HirKind::Literal(lit), info: info }
272 }
273
274 /// Creates a class HIR expression.
275 pub fn class(class: Class) -> Hir {
276 let mut info = HirInfo::new();
277 info.set_always_utf8(class.is_always_utf8());
278 info.set_all_assertions(false);
279 info.set_anchored_start(false);
280 info.set_anchored_end(false);
281 info.set_line_anchored_start(false);
282 info.set_line_anchored_end(false);
283 info.set_any_anchored_start(false);
284 info.set_any_anchored_end(false);
285 info.set_match_empty(false);
286 info.set_literal(false);
287 info.set_alternation_literal(false);
288 Hir { kind: HirKind::Class(class), info: info }
289 }
290
291 /// Creates an anchor assertion HIR expression.
292 pub fn anchor(anchor: Anchor) -> Hir {
293 let mut info = HirInfo::new();
294 info.set_always_utf8(true);
295 info.set_all_assertions(true);
296 info.set_anchored_start(false);
297 info.set_anchored_end(false);
298 info.set_line_anchored_start(false);
299 info.set_line_anchored_end(false);
300 info.set_any_anchored_start(false);
301 info.set_any_anchored_end(false);
302 info.set_match_empty(true);
303 info.set_literal(false);
304 info.set_alternation_literal(false);
305 if let Anchor::StartText = anchor {
306 info.set_anchored_start(true);
307 info.set_line_anchored_start(true);
308 info.set_any_anchored_start(true);
309 }
310 if let Anchor::EndText = anchor {
311 info.set_anchored_end(true);
312 info.set_line_anchored_end(true);
313 info.set_any_anchored_end(true);
314 }
315 if let Anchor::StartLine = anchor {
316 info.set_line_anchored_start(true);
317 }
318 if let Anchor::EndLine = anchor {
319 info.set_line_anchored_end(true);
320 }
321 Hir { kind: HirKind::Anchor(anchor), info: info }
322 }
323
324 /// Creates a word boundary assertion HIR expression.
325 pub fn word_boundary(word_boundary: WordBoundary) -> Hir {
326 let mut info = HirInfo::new();
327 info.set_always_utf8(true);
328 info.set_all_assertions(true);
329 info.set_anchored_start(false);
330 info.set_anchored_end(false);
331 info.set_line_anchored_start(false);
332 info.set_line_anchored_end(false);
333 info.set_any_anchored_start(false);
334 info.set_any_anchored_end(false);
335 info.set_literal(false);
336 info.set_alternation_literal(false);
337 // A negated word boundary matches the empty string, but a normal
338 // word boundary does not!
339 info.set_match_empty(word_boundary.is_negated());
340 // Negated ASCII word boundaries can match invalid UTF-8.
341 if let WordBoundary::AsciiNegate = word_boundary {
342 info.set_always_utf8(false);
343 }
344 Hir { kind: HirKind::WordBoundary(word_boundary), info: info }
345 }
346
347 /// Creates a repetition HIR expression.
348 pub fn repetition(rep: Repetition) -> Hir {
349 let mut info = HirInfo::new();
350 info.set_always_utf8(rep.hir.is_always_utf8());
351 info.set_all_assertions(rep.hir.is_all_assertions());
352 // If this operator can match the empty string, then it can never
353 // be anchored.
354 info.set_anchored_start(
355 !rep.is_match_empty() && rep.hir.is_anchored_start(),
356 );
357 info.set_anchored_end(
358 !rep.is_match_empty() && rep.hir.is_anchored_end(),
359 );
360 info.set_line_anchored_start(
361 !rep.is_match_empty() && rep.hir.is_anchored_start(),
362 );
363 info.set_line_anchored_end(
364 !rep.is_match_empty() && rep.hir.is_anchored_end(),
365 );
366 info.set_any_anchored_start(rep.hir.is_any_anchored_start());
367 info.set_any_anchored_end(rep.hir.is_any_anchored_end());
368 info.set_match_empty(rep.is_match_empty() || rep.hir.is_match_empty());
369 info.set_literal(false);
370 info.set_alternation_literal(false);
371 Hir { kind: HirKind::Repetition(rep), info: info }
372 }
373
374 /// Creates a group HIR expression.
375 pub fn group(group: Group) -> Hir {
376 let mut info = HirInfo::new();
377 info.set_always_utf8(group.hir.is_always_utf8());
378 info.set_all_assertions(group.hir.is_all_assertions());
379 info.set_anchored_start(group.hir.is_anchored_start());
380 info.set_anchored_end(group.hir.is_anchored_end());
381 info.set_line_anchored_start(group.hir.is_line_anchored_start());
382 info.set_line_anchored_end(group.hir.is_line_anchored_end());
383 info.set_any_anchored_start(group.hir.is_any_anchored_start());
384 info.set_any_anchored_end(group.hir.is_any_anchored_end());
385 info.set_match_empty(group.hir.is_match_empty());
386 info.set_literal(false);
387 info.set_alternation_literal(false);
388 Hir { kind: HirKind::Group(group), info: info }
389 }
390
391 /// Returns the concatenation of the given expressions.
392 ///
393 /// This flattens the concatenation as appropriate.
394 pub fn concat(mut exprs: Vec<Hir>) -> Hir {
395 match exprs.len() {
396 0 => Hir::empty(),
397 1 => exprs.pop().unwrap(),
398 _ => {
399 let mut info = HirInfo::new();
400 info.set_always_utf8(true);
401 info.set_all_assertions(true);
402 info.set_any_anchored_start(false);
403 info.set_any_anchored_end(false);
404 info.set_match_empty(true);
405 info.set_literal(true);
406 info.set_alternation_literal(true);
407
408 // Some attributes require analyzing all sub-expressions.
409 for e in &exprs {
410 let x = info.is_always_utf8() && e.is_always_utf8();
411 info.set_always_utf8(x);
412
413 let x = info.is_all_assertions() && e.is_all_assertions();
414 info.set_all_assertions(x);
415
416 let x = info.is_any_anchored_start()
417 || e.is_any_anchored_start();
418 info.set_any_anchored_start(x);
419
420 let x =
421 info.is_any_anchored_end() || e.is_any_anchored_end();
422 info.set_any_anchored_end(x);
423
424 let x = info.is_match_empty() && e.is_match_empty();
425 info.set_match_empty(x);
426
427 let x = info.is_literal() && e.is_literal();
428 info.set_literal(x);
429
430 let x = info.is_alternation_literal()
431 && e.is_alternation_literal();
432 info.set_alternation_literal(x);
433 }
434 // Anchored attributes require something slightly more
435 // sophisticated. Normally, WLOG, to determine whether an
436 // expression is anchored to the start, we'd only need to check
437 // the first expression of a concatenation. However,
438 // expressions like `$\b^` are still anchored to the start,
439 // but the first expression in the concatenation *isn't*
440 // anchored to the start. So the "first" expression to look at
441 // is actually one that is either not an assertion or is
442 // specifically the StartText assertion.
443 info.set_anchored_start(
444 exprs
445 .iter()
446 .take_while(|e| {
447 e.is_anchored_start() || e.is_all_assertions()
448 })
449 .any(|e| e.is_anchored_start()),
450 );
451 // Similarly for the end anchor, but in reverse.
452 info.set_anchored_end(
453 exprs
454 .iter()
455 .rev()
456 .take_while(|e| {
457 e.is_anchored_end() || e.is_all_assertions()
458 })
459 .any(|e| e.is_anchored_end()),
460 );
461 // Repeat the process for line anchors.
462 info.set_line_anchored_start(
463 exprs
464 .iter()
465 .take_while(|e| {
466 e.is_line_anchored_start() || e.is_all_assertions()
467 })
468 .any(|e| e.is_line_anchored_start()),
469 );
470 info.set_line_anchored_end(
471 exprs
472 .iter()
473 .rev()
474 .take_while(|e| {
475 e.is_line_anchored_end() || e.is_all_assertions()
476 })
477 .any(|e| e.is_line_anchored_end()),
478 );
479 Hir { kind: HirKind::Concat(exprs), info: info }
480 }
481 }
482 }
483
484 /// Returns the alternation of the given expressions.
485 ///
486 /// This flattens the alternation as appropriate.
487 pub fn alternation(mut exprs: Vec<Hir>) -> Hir {
488 match exprs.len() {
489 0 => Hir::empty(),
490 1 => exprs.pop().unwrap(),
491 _ => {
492 let mut info = HirInfo::new();
493 info.set_always_utf8(true);
494 info.set_all_assertions(true);
495 info.set_anchored_start(true);
496 info.set_anchored_end(true);
497 info.set_line_anchored_start(true);
498 info.set_line_anchored_end(true);
499 info.set_any_anchored_start(false);
500 info.set_any_anchored_end(false);
501 info.set_match_empty(false);
502 info.set_literal(false);
503 info.set_alternation_literal(true);
504
505 // Some attributes require analyzing all sub-expressions.
506 for e in &exprs {
507 let x = info.is_always_utf8() && e.is_always_utf8();
508 info.set_always_utf8(x);
509
510 let x = info.is_all_assertions() && e.is_all_assertions();
511 info.set_all_assertions(x);
512
513 let x = info.is_anchored_start() && e.is_anchored_start();
514 info.set_anchored_start(x);
515
516 let x = info.is_anchored_end() && e.is_anchored_end();
517 info.set_anchored_end(x);
518
519 let x = info.is_line_anchored_start()
520 && e.is_line_anchored_start();
521 info.set_line_anchored_start(x);
522
523 let x = info.is_line_anchored_end()
524 && e.is_line_anchored_end();
525 info.set_line_anchored_end(x);
526
527 let x = info.is_any_anchored_start()
528 || e.is_any_anchored_start();
529 info.set_any_anchored_start(x);
530
531 let x =
532 info.is_any_anchored_end() || e.is_any_anchored_end();
533 info.set_any_anchored_end(x);
534
535 let x = info.is_match_empty() || e.is_match_empty();
536 info.set_match_empty(x);
537
538 let x = info.is_alternation_literal() && e.is_literal();
539 info.set_alternation_literal(x);
540 }
541 Hir { kind: HirKind::Alternation(exprs), info: info }
542 }
543 }
544 }
545
546 /// Build an HIR expression for `.`.
547 ///
548 /// A `.` expression matches any character except for `\n`. To build an
549 /// expression that matches any character, including `\n`, use the `any`
550 /// method.
551 ///
552 /// If `bytes` is `true`, then this assumes characters are limited to a
553 /// single byte.
554 pub fn dot(bytes: bool) -> Hir {
555 if bytes {
556 let mut cls = ClassBytes::empty();
557 cls.push(ClassBytesRange::new(b'\0', b'\x09'));
558 cls.push(ClassBytesRange::new(b'\x0B', b'\xFF'));
559 Hir::class(Class::Bytes(cls))
560 } else {
561 let mut cls = ClassUnicode::empty();
562 cls.push(ClassUnicodeRange::new('\0', '\x09'));
563 cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}'));
564 Hir::class(Class::Unicode(cls))
565 }
566 }
567
568 /// Build an HIR expression for `(?s).`.
569 ///
570 /// A `(?s).` expression matches any character, including `\n`. To build an
571 /// expression that matches any character except for `\n`, then use the
572 /// `dot` method.
573 ///
574 /// If `bytes` is `true`, then this assumes characters are limited to a
575 /// single byte.
576 pub fn any(bytes: bool) -> Hir {
577 if bytes {
578 let mut cls = ClassBytes::empty();
579 cls.push(ClassBytesRange::new(b'\0', b'\xFF'));
580 Hir::class(Class::Bytes(cls))
581 } else {
582 let mut cls = ClassUnicode::empty();
583 cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}'));
584 Hir::class(Class::Unicode(cls))
585 }
586 }
587
588 /// Return true if and only if this HIR will always match valid UTF-8.
589 ///
590 /// When this returns false, then it is possible for this HIR expression
591 /// to match invalid UTF-8.
592 pub fn is_always_utf8(&self) -> bool {
593 self.info.is_always_utf8()
594 }
595
596 /// Returns true if and only if this entire HIR expression is made up of
597 /// zero-width assertions.
598 ///
599 /// This includes expressions like `^$\b\A\z` and even `((\b)+())*^`, but
600 /// not `^a`.
601 pub fn is_all_assertions(&self) -> bool {
602 self.info.is_all_assertions()
603 }
604
605 /// Return true if and only if this HIR is required to match from the
606 /// beginning of text. This includes expressions like `^foo`, `^(foo|bar)`,
607 /// `^foo|^bar` but not `^foo|bar`.
608 pub fn is_anchored_start(&self) -> bool {
609 self.info.is_anchored_start()
610 }
611
612 /// Return true if and only if this HIR is required to match at the end
613 /// of text. This includes expressions like `foo$`, `(foo|bar)$`,
614 /// `foo$|bar$` but not `foo$|bar`.
615 pub fn is_anchored_end(&self) -> bool {
616 self.info.is_anchored_end()
617 }
618
619 /// Return true if and only if this HIR is required to match from the
620 /// beginning of text or the beginning of a line. This includes expressions
621 /// like `^foo`, `(?m)^foo`, `^(foo|bar)`, `^(foo|bar)`, `(?m)^foo|^bar`
622 /// but not `^foo|bar` or `(?m)^foo|bar`.
623 ///
624 /// Note that if `is_anchored_start` is `true`, then
625 /// `is_line_anchored_start` will also be `true`. The reverse implication
626 /// is not true. For example, `(?m)^foo` is line anchored, but not
627 /// `is_anchored_start`.
628 pub fn is_line_anchored_start(&self) -> bool {
629 self.info.is_line_anchored_start()
630 }
631
632 /// Return true if and only if this HIR is required to match at the
633 /// end of text or the end of a line. This includes expressions like
634 /// `foo$`, `(?m)foo$`, `(foo|bar)$`, `(?m)(foo|bar)$`, `foo$|bar$`,
635 /// `(?m)(foo|bar)$`, but not `foo$|bar` or `(?m)foo$|bar`.
636 ///
637 /// Note that if `is_anchored_end` is `true`, then
638 /// `is_line_anchored_end` will also be `true`. The reverse implication
639 /// is not true. For example, `(?m)foo$` is line anchored, but not
640 /// `is_anchored_end`.
641 pub fn is_line_anchored_end(&self) -> bool {
642 self.info.is_line_anchored_end()
643 }
644
645 /// Return true if and only if this HIR contains any sub-expression that
646 /// is required to match at the beginning of text. Specifically, this
647 /// returns true if the `^` symbol (when multiline mode is disabled) or the
648 /// `\A` escape appear anywhere in the regex.
649 pub fn is_any_anchored_start(&self) -> bool {
650 self.info.is_any_anchored_start()
651 }
652
653 /// Return true if and only if this HIR contains any sub-expression that is
654 /// required to match at the end of text. Specifically, this returns true
655 /// if the `$` symbol (when multiline mode is disabled) or the `\z` escape
656 /// appear anywhere in the regex.
657 pub fn is_any_anchored_end(&self) -> bool {
658 self.info.is_any_anchored_end()
659 }
660
661 /// Return true if and only if the empty string is part of the language
662 /// matched by this regular expression.
663 ///
664 /// This includes `a*`, `a?b*`, `a{0}`, `()`, `()+`, `^$`, `a|b?`, `\B`,
665 /// but not `a`, `a+` or `\b`.
666 pub fn is_match_empty(&self) -> bool {
667 self.info.is_match_empty()
668 }
669
670 /// Return true if and only if this HIR is a simple literal. This is only
671 /// true when this HIR expression is either itself a `Literal` or a
672 /// concatenation of only `Literal`s.
673 ///
674 /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()`
675 /// are not (even though that contain sub-expressions that are literals).
676 pub fn is_literal(&self) -> bool {
677 self.info.is_literal()
678 }
679
680 /// Return true if and only if this HIR is either a simple literal or an
681 /// alternation of simple literals. This is only
682 /// true when this HIR expression is either itself a `Literal` or a
683 /// concatenation of only `Literal`s or an alternation of only `Literal`s.
684 ///
685 /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternaiton
686 /// literals, but `f+`, `(foo)`, `foo()`
687 /// are not (even though that contain sub-expressions that are literals).
688 pub fn is_alternation_literal(&self) -> bool {
689 self.info.is_alternation_literal()
690 }
691 }
692
693 impl HirKind {
694 /// Return true if and only if this HIR is the empty regular expression.
695 ///
696 /// Note that this is not defined inductively. That is, it only tests if
697 /// this kind is the `Empty` variant. To get the inductive definition,
698 /// use the `is_match_empty` method on [`Hir`](struct.Hir.html).
699 pub fn is_empty(&self) -> bool {
700 match *self {
701 HirKind::Empty => true,
702 _ => false,
703 }
704 }
705
706 /// Returns true if and only if this kind has any (including possibly
707 /// empty) subexpressions.
708 pub fn has_subexprs(&self) -> bool {
709 match *self {
710 HirKind::Empty
711 | HirKind::Literal(_)
712 | HirKind::Class(_)
713 | HirKind::Anchor(_)
714 | HirKind::WordBoundary(_) => false,
715 HirKind::Group(_)
716 | HirKind::Repetition(_)
717 | HirKind::Concat(_)
718 | HirKind::Alternation(_) => true,
719 }
720 }
721 }
722
723 /// Print a display representation of this Hir.
724 ///
725 /// The result of this is a valid regular expression pattern string.
726 ///
727 /// This implementation uses constant stack space and heap space proportional
728 /// to the size of the `Hir`.
729 impl fmt::Display for Hir {
730 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
731 use hir::print::Printer;
732 Printer::new().print(self, f)
733 }
734 }
735
736 /// The high-level intermediate representation of a literal.
737 ///
738 /// A literal corresponds to a single character, where a character is either
739 /// defined by a Unicode scalar value or an arbitrary byte. Unicode characters
740 /// are preferred whenever possible. In particular, a `Byte` variant is only
741 /// ever produced when it could match invalid UTF-8.
742 #[derive(Clone, Debug, Eq, PartialEq)]
743 pub enum Literal {
744 /// A single character represented by a Unicode scalar value.
745 Unicode(char),
746 /// A single character represented by an arbitrary byte.
747 Byte(u8),
748 }
749
750 impl Literal {
751 /// Returns true if and only if this literal corresponds to a Unicode
752 /// scalar value.
753 pub fn is_unicode(&self) -> bool {
754 match *self {
755 Literal::Unicode(_) => true,
756 Literal::Byte(b) if b <= 0x7F => true,
757 Literal::Byte(_) => false,
758 }
759 }
760 }
761
762 /// The high-level intermediate representation of a character class.
763 ///
764 /// A character class corresponds to a set of characters. A character is either
765 /// defined by a Unicode scalar value or a byte. Unicode characters are used
766 /// by default, while bytes are used when Unicode mode (via the `u` flag) is
767 /// disabled.
768 ///
769 /// A character class, regardless of its character type, is represented by a
770 /// sequence of non-overlapping non-adjacent ranges of characters.
771 ///
772 /// Note that unlike [`Literal`](enum.Literal.html), a `Bytes` variant may
773 /// be produced even when it exclusively matches valid UTF-8. This is because
774 /// a `Bytes` variant represents an intention by the author of the regular
775 /// expression to disable Unicode mode, which in turn impacts the semantics of
776 /// case insensitive matching. For example, `(?i)k` and `(?i-u)k` will not
777 /// match the same set of strings.
778 #[derive(Clone, Debug, Eq, PartialEq)]
779 pub enum Class {
780 /// A set of characters represented by Unicode scalar values.
781 Unicode(ClassUnicode),
782 /// A set of characters represented by arbitrary bytes (one byte per
783 /// character).
784 Bytes(ClassBytes),
785 }
786
787 impl Class {
788 /// Apply Unicode simple case folding to this character class, in place.
789 /// The character class will be expanded to include all simple case folded
790 /// character variants.
791 ///
792 /// If this is a byte oriented character class, then this will be limited
793 /// to the ASCII ranges `A-Z` and `a-z`.
794 pub fn case_fold_simple(&mut self) {
795 match *self {
796 Class::Unicode(ref mut x) => x.case_fold_simple(),
797 Class::Bytes(ref mut x) => x.case_fold_simple(),
798 }
799 }
800
801 /// Negate this character class in place.
802 ///
803 /// After completion, this character class will contain precisely the
804 /// characters that weren't previously in the class.
805 pub fn negate(&mut self) {
806 match *self {
807 Class::Unicode(ref mut x) => x.negate(),
808 Class::Bytes(ref mut x) => x.negate(),
809 }
810 }
811
812 /// Returns true if and only if this character class will only ever match
813 /// valid UTF-8.
814 ///
815 /// A character class can match invalid UTF-8 only when the following
816 /// conditions are met:
817 ///
818 /// 1. The translator was configured to permit generating an expression
819 /// that can match invalid UTF-8. (By default, this is disabled.)
820 /// 2. Unicode mode (via the `u` flag) was disabled either in the concrete
821 /// syntax or in the parser builder. By default, Unicode mode is
822 /// enabled.
823 pub fn is_always_utf8(&self) -> bool {
824 match *self {
825 Class::Unicode(_) => true,
826 Class::Bytes(ref x) => x.is_all_ascii(),
827 }
828 }
829 }
830
831 /// A set of characters represented by Unicode scalar values.
832 #[derive(Clone, Debug, Eq, PartialEq)]
833 pub struct ClassUnicode {
834 set: IntervalSet<ClassUnicodeRange>,
835 }
836
837 impl ClassUnicode {
838 /// Create a new class from a sequence of ranges.
839 ///
840 /// The given ranges do not need to be in any specific order, and ranges
841 /// may overlap.
842 pub fn new<I>(ranges: I) -> ClassUnicode
843 where
844 I: IntoIterator<Item = ClassUnicodeRange>,
845 {
846 ClassUnicode { set: IntervalSet::new(ranges) }
847 }
848
849 /// Create a new class with no ranges.
850 pub fn empty() -> ClassUnicode {
851 ClassUnicode::new(vec![])
852 }
853
854 /// Add a new range to this set.
855 pub fn push(&mut self, range: ClassUnicodeRange) {
856 self.set.push(range);
857 }
858
859 /// Return an iterator over all ranges in this class.
860 ///
861 /// The iterator yields ranges in ascending order.
862 pub fn iter(&self) -> ClassUnicodeIter {
863 ClassUnicodeIter(self.set.iter())
864 }
865
866 /// Return the underlying ranges as a slice.
867 pub fn ranges(&self) -> &[ClassUnicodeRange] {
868 self.set.intervals()
869 }
870
871 /// Expand this character class such that it contains all case folded
872 /// characters, according to Unicode's "simple" mapping. For example, if
873 /// this class consists of the range `a-z`, then applying case folding will
874 /// result in the class containing both the ranges `a-z` and `A-Z`.
875 ///
876 /// # Panics
877 ///
878 /// This routine panics when the case mapping data necessary for this
879 /// routine to complete is unavailable. This occurs when the `unicode-case`
880 /// feature is not enabled.
881 ///
882 /// Callers should prefer using `try_case_fold_simple` instead, which will
883 /// return an error instead of panicking.
884 pub fn case_fold_simple(&mut self) {
885 self.set
886 .case_fold_simple()
887 .expect("unicode-case feature must be enabled");
888 }
889
890 /// Expand this character class such that it contains all case folded
891 /// characters, according to Unicode's "simple" mapping. For example, if
892 /// this class consists of the range `a-z`, then applying case folding will
893 /// result in the class containing both the ranges `a-z` and `A-Z`.
894 ///
895 /// # Error
896 ///
897 /// This routine returns an error when the case mapping data necessary
898 /// for this routine to complete is unavailable. This occurs when the
899 /// `unicode-case` feature is not enabled.
900 pub fn try_case_fold_simple(
901 &mut self,
902 ) -> result::Result<(), CaseFoldError> {
903 self.set.case_fold_simple()
904 }
905
906 /// Negate this character class.
907 ///
908 /// For all `c` where `c` is a Unicode scalar value, if `c` was in this
909 /// set, then it will not be in this set after negation.
910 pub fn negate(&mut self) {
911 self.set.negate();
912 }
913
914 /// Union this character class with the given character class, in place.
915 pub fn union(&mut self, other: &ClassUnicode) {
916 self.set.union(&other.set);
917 }
918
919 /// Intersect this character class with the given character class, in
920 /// place.
921 pub fn intersect(&mut self, other: &ClassUnicode) {
922 self.set.intersect(&other.set);
923 }
924
925 /// Subtract the given character class from this character class, in place.
926 pub fn difference(&mut self, other: &ClassUnicode) {
927 self.set.difference(&other.set);
928 }
929
930 /// Compute the symmetric difference of the given character classes, in
931 /// place.
932 ///
933 /// This computes the symmetric difference of two character classes. This
934 /// removes all elements in this class that are also in the given class,
935 /// but all adds all elements from the given class that aren't in this
936 /// class. That is, the class will contain all elements in either class,
937 /// but will not contain any elements that are in both classes.
938 pub fn symmetric_difference(&mut self, other: &ClassUnicode) {
939 self.set.symmetric_difference(&other.set);
940 }
941
942 /// Returns true if and only if this character class will either match
943 /// nothing or only ASCII bytes. Stated differently, this returns false
944 /// if and only if this class contains a non-ASCII codepoint.
945 pub fn is_all_ascii(&self) -> bool {
946 self.set.intervals().last().map_or(true, |r| r.end <= '\x7F')
947 }
948 }
949
950 /// An iterator over all ranges in a Unicode character class.
951 ///
952 /// The lifetime `'a` refers to the lifetime of the underlying class.
953 #[derive(Debug)]
954 pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassUnicodeRange>);
955
956 impl<'a> Iterator for ClassUnicodeIter<'a> {
957 type Item = &'a ClassUnicodeRange;
958
959 fn next(&mut self) -> Option<&'a ClassUnicodeRange> {
960 self.0.next()
961 }
962 }
963
964 /// A single range of characters represented by Unicode scalar values.
965 ///
966 /// The range is closed. That is, the start and end of the range are included
967 /// in the range.
968 #[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
969 pub struct ClassUnicodeRange {
970 start: char,
971 end: char,
972 }
973
974 impl fmt::Debug for ClassUnicodeRange {
975 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
976 let start = if !self.start.is_whitespace() && !self.start.is_control()
977 {
978 self.start.to_string()
979 } else {
980 format!("0x{:X}", self.start as u32)
981 };
982 let end = if !self.end.is_whitespace() && !self.end.is_control() {
983 self.end.to_string()
984 } else {
985 format!("0x{:X}", self.end as u32)
986 };
987 f.debug_struct("ClassUnicodeRange")
988 .field("start", &start)
989 .field("end", &end)
990 .finish()
991 }
992 }
993
994 impl Interval for ClassUnicodeRange {
995 type Bound = char;
996
997 #[inline]
998 fn lower(&self) -> char {
999 self.start
1000 }
1001 #[inline]
1002 fn upper(&self) -> char {
1003 self.end
1004 }
1005 #[inline]
1006 fn set_lower(&mut self, bound: char) {
1007 self.start = bound;
1008 }
1009 #[inline]
1010 fn set_upper(&mut self, bound: char) {
1011 self.end = bound;
1012 }
1013
1014 /// Apply simple case folding to this Unicode scalar value range.
1015 ///
1016 /// Additional ranges are appended to the given vector. Canonical ordering
1017 /// is *not* maintained in the given vector.
1018 fn case_fold_simple(
1019 &self,
1020 ranges: &mut Vec<ClassUnicodeRange>,
1021 ) -> Result<(), unicode::CaseFoldError> {
1022 if !unicode::contains_simple_case_mapping(self.start, self.end)? {
1023 return Ok(());
1024 }
1025 let start = self.start as u32;
1026 let end = (self.end as u32).saturating_add(1);
1027 let mut next_simple_cp = None;
1028 for cp in (start..end).filter_map(char::from_u32) {
1029 if next_simple_cp.map_or(false, |next| cp < next) {
1030 continue;
1031 }
1032 let it = match unicode::simple_fold(cp)? {
1033 Ok(it) => it,
1034 Err(next) => {
1035 next_simple_cp = next;
1036 continue;
1037 }
1038 };
1039 for cp_folded in it {
1040 ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded));
1041 }
1042 }
1043 Ok(())
1044 }
1045 }
1046
1047 impl ClassUnicodeRange {
1048 /// Create a new Unicode scalar value range for a character class.
1049 ///
1050 /// The returned range is always in a canonical form. That is, the range
1051 /// returned always satisfies the invariant that `start <= end`.
1052 pub fn new(start: char, end: char) -> ClassUnicodeRange {
1053 ClassUnicodeRange::create(start, end)
1054 }
1055
1056 /// Return the start of this range.
1057 ///
1058 /// The start of a range is always less than or equal to the end of the
1059 /// range.
1060 pub fn start(&self) -> char {
1061 self.start
1062 }
1063
1064 /// Return the end of this range.
1065 ///
1066 /// The end of a range is always greater than or equal to the start of the
1067 /// range.
1068 pub fn end(&self) -> char {
1069 self.end
1070 }
1071 }
1072
1073 /// A set of characters represented by arbitrary bytes (where one byte
1074 /// corresponds to one character).
1075 #[derive(Clone, Debug, Eq, PartialEq)]
1076 pub struct ClassBytes {
1077 set: IntervalSet<ClassBytesRange>,
1078 }
1079
1080 impl ClassBytes {
1081 /// Create a new class from a sequence of ranges.
1082 ///
1083 /// The given ranges do not need to be in any specific order, and ranges
1084 /// may overlap.
1085 pub fn new<I>(ranges: I) -> ClassBytes
1086 where
1087 I: IntoIterator<Item = ClassBytesRange>,
1088 {
1089 ClassBytes { set: IntervalSet::new(ranges) }
1090 }
1091
1092 /// Create a new class with no ranges.
1093 pub fn empty() -> ClassBytes {
1094 ClassBytes::new(vec![])
1095 }
1096
1097 /// Add a new range to this set.
1098 pub fn push(&mut self, range: ClassBytesRange) {
1099 self.set.push(range);
1100 }
1101
1102 /// Return an iterator over all ranges in this class.
1103 ///
1104 /// The iterator yields ranges in ascending order.
1105 pub fn iter(&self) -> ClassBytesIter {
1106 ClassBytesIter(self.set.iter())
1107 }
1108
1109 /// Return the underlying ranges as a slice.
1110 pub fn ranges(&self) -> &[ClassBytesRange] {
1111 self.set.intervals()
1112 }
1113
1114 /// Expand this character class such that it contains all case folded
1115 /// characters. For example, if this class consists of the range `a-z`,
1116 /// then applying case folding will result in the class containing both the
1117 /// ranges `a-z` and `A-Z`.
1118 ///
1119 /// Note that this only applies ASCII case folding, which is limited to the
1120 /// characters `a-z` and `A-Z`.
1121 pub fn case_fold_simple(&mut self) {
1122 self.set.case_fold_simple().expect("ASCII case folding never fails");
1123 }
1124
1125 /// Negate this byte class.
1126 ///
1127 /// For all `b` where `b` is a any byte, if `b` was in this set, then it
1128 /// will not be in this set after negation.
1129 pub fn negate(&mut self) {
1130 self.set.negate();
1131 }
1132
1133 /// Union this byte class with the given byte class, in place.
1134 pub fn union(&mut self, other: &ClassBytes) {
1135 self.set.union(&other.set);
1136 }
1137
1138 /// Intersect this byte class with the given byte class, in place.
1139 pub fn intersect(&mut self, other: &ClassBytes) {
1140 self.set.intersect(&other.set);
1141 }
1142
1143 /// Subtract the given byte class from this byte class, in place.
1144 pub fn difference(&mut self, other: &ClassBytes) {
1145 self.set.difference(&other.set);
1146 }
1147
1148 /// Compute the symmetric difference of the given byte classes, in place.
1149 ///
1150 /// This computes the symmetric difference of two byte classes. This
1151 /// removes all elements in this class that are also in the given class,
1152 /// but all adds all elements from the given class that aren't in this
1153 /// class. That is, the class will contain all elements in either class,
1154 /// but will not contain any elements that are in both classes.
1155 pub fn symmetric_difference(&mut self, other: &ClassBytes) {
1156 self.set.symmetric_difference(&other.set);
1157 }
1158
1159 /// Returns true if and only if this character class will either match
1160 /// nothing or only ASCII bytes. Stated differently, this returns false
1161 /// if and only if this class contains a non-ASCII byte.
1162 pub fn is_all_ascii(&self) -> bool {
1163 self.set.intervals().last().map_or(true, |r| r.end <= 0x7F)
1164 }
1165 }
1166
1167 /// An iterator over all ranges in a byte character class.
1168 ///
1169 /// The lifetime `'a` refers to the lifetime of the underlying class.
1170 #[derive(Debug)]
1171 pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>);
1172
1173 impl<'a> Iterator for ClassBytesIter<'a> {
1174 type Item = &'a ClassBytesRange;
1175
1176 fn next(&mut self) -> Option<&'a ClassBytesRange> {
1177 self.0.next()
1178 }
1179 }
1180
1181 /// A single range of characters represented by arbitrary bytes.
1182 ///
1183 /// The range is closed. That is, the start and end of the range are included
1184 /// in the range.
1185 #[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
1186 pub struct ClassBytesRange {
1187 start: u8,
1188 end: u8,
1189 }
1190
1191 impl Interval for ClassBytesRange {
1192 type Bound = u8;
1193
1194 #[inline]
1195 fn lower(&self) -> u8 {
1196 self.start
1197 }
1198 #[inline]
1199 fn upper(&self) -> u8 {
1200 self.end
1201 }
1202 #[inline]
1203 fn set_lower(&mut self, bound: u8) {
1204 self.start = bound;
1205 }
1206 #[inline]
1207 fn set_upper(&mut self, bound: u8) {
1208 self.end = bound;
1209 }
1210
1211 /// Apply simple case folding to this byte range. Only ASCII case mappings
1212 /// (for a-z) are applied.
1213 ///
1214 /// Additional ranges are appended to the given vector. Canonical ordering
1215 /// is *not* maintained in the given vector.
1216 fn case_fold_simple(
1217 &self,
1218 ranges: &mut Vec<ClassBytesRange>,
1219 ) -> Result<(), unicode::CaseFoldError> {
1220 if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) {
1221 let lower = cmp::max(self.start, b'a');
1222 let upper = cmp::min(self.end, b'z');
1223 ranges.push(ClassBytesRange::new(lower - 32, upper - 32));
1224 }
1225 if !ClassBytesRange::new(b'A', b'Z').is_intersection_empty(self) {
1226 let lower = cmp::max(self.start, b'A');
1227 let upper = cmp::min(self.end, b'Z');
1228 ranges.push(ClassBytesRange::new(lower + 32, upper + 32));
1229 }
1230 Ok(())
1231 }
1232 }
1233
1234 impl ClassBytesRange {
1235 /// Create a new byte range for a character class.
1236 ///
1237 /// The returned range is always in a canonical form. That is, the range
1238 /// returned always satisfies the invariant that `start <= end`.
1239 pub fn new(start: u8, end: u8) -> ClassBytesRange {
1240 ClassBytesRange::create(start, end)
1241 }
1242
1243 /// Return the start of this range.
1244 ///
1245 /// The start of a range is always less than or equal to the end of the
1246 /// range.
1247 pub fn start(&self) -> u8 {
1248 self.start
1249 }
1250
1251 /// Return the end of this range.
1252 ///
1253 /// The end of a range is always greater than or equal to the start of the
1254 /// range.
1255 pub fn end(&self) -> u8 {
1256 self.end
1257 }
1258 }
1259
1260 impl fmt::Debug for ClassBytesRange {
1261 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1262 let mut debug = f.debug_struct("ClassBytesRange");
1263 if self.start <= 0x7F {
1264 debug.field("start", &(self.start as char));
1265 } else {
1266 debug.field("start", &self.start);
1267 }
1268 if self.end <= 0x7F {
1269 debug.field("end", &(self.end as char));
1270 } else {
1271 debug.field("end", &self.end);
1272 }
1273 debug.finish()
1274 }
1275 }
1276
1277 /// The high-level intermediate representation for an anchor assertion.
1278 ///
1279 /// A matching anchor assertion is always zero-length.
1280 #[derive(Clone, Debug, Eq, PartialEq)]
1281 pub enum Anchor {
1282 /// Match the beginning of a line or the beginning of text. Specifically,
1283 /// this matches at the starting position of the input, or at the position
1284 /// immediately following a `\n` character.
1285 StartLine,
1286 /// Match the end of a line or the end of text. Specifically,
1287 /// this matches at the end position of the input, or at the position
1288 /// immediately preceding a `\n` character.
1289 EndLine,
1290 /// Match the beginning of text. Specifically, this matches at the starting
1291 /// position of the input.
1292 StartText,
1293 /// Match the end of text. Specifically, this matches at the ending
1294 /// position of the input.
1295 EndText,
1296 }
1297
1298 /// The high-level intermediate representation for a word-boundary assertion.
1299 ///
1300 /// A matching word boundary assertion is always zero-length.
1301 #[derive(Clone, Debug, Eq, PartialEq)]
1302 pub enum WordBoundary {
1303 /// Match a Unicode-aware word boundary. That is, this matches a position
1304 /// where the left adjacent character and right adjacent character
1305 /// correspond to a word and non-word or a non-word and word character.
1306 Unicode,
1307 /// Match a Unicode-aware negation of a word boundary.
1308 UnicodeNegate,
1309 /// Match an ASCII-only word boundary. That is, this matches a position
1310 /// where the left adjacent character and right adjacent character
1311 /// correspond to a word and non-word or a non-word and word character.
1312 Ascii,
1313 /// Match an ASCII-only negation of a word boundary.
1314 AsciiNegate,
1315 }
1316
1317 impl WordBoundary {
1318 /// Returns true if and only if this word boundary assertion is negated.
1319 pub fn is_negated(&self) -> bool {
1320 match *self {
1321 WordBoundary::Unicode | WordBoundary::Ascii => false,
1322 WordBoundary::UnicodeNegate | WordBoundary::AsciiNegate => true,
1323 }
1324 }
1325 }
1326
1327 /// The high-level intermediate representation for a group.
1328 ///
1329 /// This represents one of three possible group types:
1330 ///
1331 /// 1. A non-capturing group (e.g., `(?:expr)`).
1332 /// 2. A capturing group (e.g., `(expr)`).
1333 /// 3. A named capturing group (e.g., `(?P<name>expr)`).
1334 #[derive(Clone, Debug, Eq, PartialEq)]
1335 pub struct Group {
1336 /// The kind of this group. If it is a capturing group, then the kind
1337 /// contains the capture group index (and the name, if it is a named
1338 /// group).
1339 pub kind: GroupKind,
1340 /// The expression inside the capturing group, which may be empty.
1341 pub hir: Box<Hir>,
1342 }
1343
1344 /// The kind of group.
1345 #[derive(Clone, Debug, Eq, PartialEq)]
1346 pub enum GroupKind {
1347 /// A normal unnamed capturing group.
1348 ///
1349 /// The value is the capture index of the group.
1350 CaptureIndex(u32),
1351 /// A named capturing group.
1352 CaptureName {
1353 /// The name of the group.
1354 name: String,
1355 /// The capture index of the group.
1356 index: u32,
1357 },
1358 /// A non-capturing group.
1359 NonCapturing,
1360 }
1361
1362 /// The high-level intermediate representation of a repetition operator.
1363 ///
1364 /// A repetition operator permits the repetition of an arbitrary
1365 /// sub-expression.
1366 #[derive(Clone, Debug, Eq, PartialEq)]
1367 pub struct Repetition {
1368 /// The kind of this repetition operator.
1369 pub kind: RepetitionKind,
1370 /// Whether this repetition operator is greedy or not. A greedy operator
1371 /// will match as much as it can. A non-greedy operator will match as
1372 /// little as it can.
1373 ///
1374 /// Typically, operators are greedy by default and are only non-greedy when
1375 /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is
1376 /// not. However, this can be inverted via the `U` "ungreedy" flag.
1377 pub greedy: bool,
1378 /// The expression being repeated.
1379 pub hir: Box<Hir>,
1380 }
1381
1382 impl Repetition {
1383 /// Returns true if and only if this repetition operator makes it possible
1384 /// to match the empty string.
1385 ///
1386 /// Note that this is not defined inductively. For example, while `a*`
1387 /// will report `true`, `()+` will not, even though `()` matches the empty
1388 /// string and one or more occurrences of something that matches the empty
1389 /// string will always match the empty string. In order to get the
1390 /// inductive definition, see the corresponding method on
1391 /// [`Hir`](struct.Hir.html).
1392 pub fn is_match_empty(&self) -> bool {
1393 match self.kind {
1394 RepetitionKind::ZeroOrOne => true,
1395 RepetitionKind::ZeroOrMore => true,
1396 RepetitionKind::OneOrMore => false,
1397 RepetitionKind::Range(RepetitionRange::Exactly(m)) => m == 0,
1398 RepetitionKind::Range(RepetitionRange::AtLeast(m)) => m == 0,
1399 RepetitionKind::Range(RepetitionRange::Bounded(m, _)) => m == 0,
1400 }
1401 }
1402 }
1403
1404 /// The kind of a repetition operator.
1405 #[derive(Clone, Debug, Eq, PartialEq)]
1406 pub enum RepetitionKind {
1407 /// Matches a sub-expression zero or one times.
1408 ZeroOrOne,
1409 /// Matches a sub-expression zero or more times.
1410 ZeroOrMore,
1411 /// Matches a sub-expression one or more times.
1412 OneOrMore,
1413 /// Matches a sub-expression within a bounded range of times.
1414 Range(RepetitionRange),
1415 }
1416
1417 /// The kind of a counted repetition operator.
1418 #[derive(Clone, Debug, Eq, PartialEq)]
1419 pub enum RepetitionRange {
1420 /// Matches a sub-expression exactly this many times.
1421 Exactly(u32),
1422 /// Matches a sub-expression at least this many times.
1423 AtLeast(u32),
1424 /// Matches a sub-expression at least `m` times and at most `n` times.
1425 Bounded(u32, u32),
1426 }
1427
1428 /// A custom `Drop` impl is used for `HirKind` such that it uses constant stack
1429 /// space but heap space proportional to the depth of the total `Hir`.
1430 impl Drop for Hir {
1431 fn drop(&mut self) {
1432 use std::mem;
1433
1434 match *self.kind() {
1435 HirKind::Empty
1436 | HirKind::Literal(_)
1437 | HirKind::Class(_)
1438 | HirKind::Anchor(_)
1439 | HirKind::WordBoundary(_) => return,
1440 HirKind::Group(ref x) if !x.hir.kind.has_subexprs() => return,
1441 HirKind::Repetition(ref x) if !x.hir.kind.has_subexprs() => return,
1442 HirKind::Concat(ref x) if x.is_empty() => return,
1443 HirKind::Alternation(ref x) if x.is_empty() => return,
1444 _ => {}
1445 }
1446
1447 let mut stack = vec![mem::replace(self, Hir::empty())];
1448 while let Some(mut expr) = stack.pop() {
1449 match expr.kind {
1450 HirKind::Empty
1451 | HirKind::Literal(_)
1452 | HirKind::Class(_)
1453 | HirKind::Anchor(_)
1454 | HirKind::WordBoundary(_) => {}
1455 HirKind::Group(ref mut x) => {
1456 stack.push(mem::replace(&mut x.hir, Hir::empty()));
1457 }
1458 HirKind::Repetition(ref mut x) => {
1459 stack.push(mem::replace(&mut x.hir, Hir::empty()));
1460 }
1461 HirKind::Concat(ref mut x) => {
1462 stack.extend(x.drain(..));
1463 }
1464 HirKind::Alternation(ref mut x) => {
1465 stack.extend(x.drain(..));
1466 }
1467 }
1468 }
1469 }
1470 }
1471
1472 /// A type that documents various attributes of an HIR expression.
1473 ///
1474 /// These attributes are typically defined inductively on the HIR.
1475 #[derive(Clone, Debug, Eq, PartialEq)]
1476 struct HirInfo {
1477 /// Represent yes/no questions by a bitfield to conserve space, since
1478 /// this is included in every HIR expression.
1479 ///
1480 /// If more attributes need to be added, it is OK to increase the size of
1481 /// this as appropriate.
1482 bools: u16,
1483 }
1484
1485 // A simple macro for defining bitfield accessors/mutators.
1486 macro_rules! define_bool {
1487 ($bit:expr, $is_fn_name:ident, $set_fn_name:ident) => {
1488 fn $is_fn_name(&self) -> bool {
1489 self.bools & (0b1 << $bit) > 0
1490 }
1491
1492 fn $set_fn_name(&mut self, yes: bool) {
1493 if yes {
1494 self.bools |= 1 << $bit;
1495 } else {
1496 self.bools &= !(1 << $bit);
1497 }
1498 }
1499 }
1500 }
1501
1502 impl HirInfo {
1503 fn new() -> HirInfo {
1504 HirInfo { bools: 0 }
1505 }
1506
1507 define_bool!(0, is_always_utf8, set_always_utf8);
1508 define_bool!(1, is_all_assertions, set_all_assertions);
1509 define_bool!(2, is_anchored_start, set_anchored_start);
1510 define_bool!(3, is_anchored_end, set_anchored_end);
1511 define_bool!(4, is_line_anchored_start, set_line_anchored_start);
1512 define_bool!(5, is_line_anchored_end, set_line_anchored_end);
1513 define_bool!(6, is_any_anchored_start, set_any_anchored_start);
1514 define_bool!(7, is_any_anchored_end, set_any_anchored_end);
1515 define_bool!(8, is_match_empty, set_match_empty);
1516 define_bool!(9, is_literal, set_literal);
1517 define_bool!(10, is_alternation_literal, set_alternation_literal);
1518 }
1519
1520 #[cfg(test)]
1521 mod tests {
1522 use super::*;
1523
1524 fn uclass(ranges: &[(char, char)]) -> ClassUnicode {
1525 let ranges: Vec<ClassUnicodeRange> = ranges
1526 .iter()
1527 .map(|&(s, e)| ClassUnicodeRange::new(s, e))
1528 .collect();
1529 ClassUnicode::new(ranges)
1530 }
1531
1532 fn bclass(ranges: &[(u8, u8)]) -> ClassBytes {
1533 let ranges: Vec<ClassBytesRange> =
1534 ranges.iter().map(|&(s, e)| ClassBytesRange::new(s, e)).collect();
1535 ClassBytes::new(ranges)
1536 }
1537
1538 fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> {
1539 cls.iter().map(|x| (x.start(), x.end())).collect()
1540 }
1541
1542 #[cfg(feature = "unicode-case")]
1543 fn ucasefold(cls: &ClassUnicode) -> ClassUnicode {
1544 let mut cls_ = cls.clone();
1545 cls_.case_fold_simple();
1546 cls_
1547 }
1548
1549 fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
1550 let mut cls_ = cls1.clone();
1551 cls_.union(cls2);
1552 cls_
1553 }
1554
1555 fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
1556 let mut cls_ = cls1.clone();
1557 cls_.intersect(cls2);
1558 cls_
1559 }
1560
1561 fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
1562 let mut cls_ = cls1.clone();
1563 cls_.difference(cls2);
1564 cls_
1565 }
1566
1567 fn usymdifference(
1568 cls1: &ClassUnicode,
1569 cls2: &ClassUnicode,
1570 ) -> ClassUnicode {
1571 let mut cls_ = cls1.clone();
1572 cls_.symmetric_difference(cls2);
1573 cls_
1574 }
1575
1576 fn unegate(cls: &ClassUnicode) -> ClassUnicode {
1577 let mut cls_ = cls.clone();
1578 cls_.negate();
1579 cls_
1580 }
1581
1582 fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> {
1583 cls.iter().map(|x| (x.start(), x.end())).collect()
1584 }
1585
1586 fn bcasefold(cls: &ClassBytes) -> ClassBytes {
1587 let mut cls_ = cls.clone();
1588 cls_.case_fold_simple();
1589 cls_
1590 }
1591
1592 fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1593 let mut cls_ = cls1.clone();
1594 cls_.union(cls2);
1595 cls_
1596 }
1597
1598 fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1599 let mut cls_ = cls1.clone();
1600 cls_.intersect(cls2);
1601 cls_
1602 }
1603
1604 fn bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1605 let mut cls_ = cls1.clone();
1606 cls_.difference(cls2);
1607 cls_
1608 }
1609
1610 fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1611 let mut cls_ = cls1.clone();
1612 cls_.symmetric_difference(cls2);
1613 cls_
1614 }
1615
1616 fn bnegate(cls: &ClassBytes) -> ClassBytes {
1617 let mut cls_ = cls.clone();
1618 cls_.negate();
1619 cls_
1620 }
1621
1622 #[test]
1623 fn class_range_canonical_unicode() {
1624 let range = ClassUnicodeRange::new('\u{00FF}', '\0');
1625 assert_eq!('\0', range.start());
1626 assert_eq!('\u{00FF}', range.end());
1627 }
1628
1629 #[test]
1630 fn class_range_canonical_bytes() {
1631 let range = ClassBytesRange::new(b'\xFF', b'\0');
1632 assert_eq!(b'\0', range.start());
1633 assert_eq!(b'\xFF', range.end());
1634 }
1635
1636 #[test]
1637 fn class_canonicalize_unicode() {
1638 let cls = uclass(&[('a', 'c'), ('x', 'z')]);
1639 let expected = vec![('a', 'c'), ('x', 'z')];
1640 assert_eq!(expected, uranges(&cls));
1641
1642 let cls = uclass(&[('x', 'z'), ('a', 'c')]);
1643 let expected = vec![('a', 'c'), ('x', 'z')];
1644 assert_eq!(expected, uranges(&cls));
1645
1646 let cls = uclass(&[('x', 'z'), ('w', 'y')]);
1647 let expected = vec![('w', 'z')];
1648 assert_eq!(expected, uranges(&cls));
1649
1650 let cls = uclass(&[
1651 ('c', 'f'),
1652 ('a', 'g'),
1653 ('d', 'j'),
1654 ('a', 'c'),
1655 ('m', 'p'),
1656 ('l', 's'),
1657 ]);
1658 let expected = vec![('a', 'j'), ('l', 's')];
1659 assert_eq!(expected, uranges(&cls));
1660
1661 let cls = uclass(&[('x', 'z'), ('u', 'w')]);
1662 let expected = vec![('u', 'z')];
1663 assert_eq!(expected, uranges(&cls));
1664
1665 let cls = uclass(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]);
1666 let expected = vec![('\x00', '\u{10FFFF}')];
1667 assert_eq!(expected, uranges(&cls));
1668
1669 let cls = uclass(&[('a', 'a'), ('b', 'b')]);
1670 let expected = vec![('a', 'b')];
1671 assert_eq!(expected, uranges(&cls));
1672 }
1673
1674 #[test]
1675 fn class_canonicalize_bytes() {
1676 let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
1677 let expected = vec![(b'a', b'c'), (b'x', b'z')];
1678 assert_eq!(expected, branges(&cls));
1679
1680 let cls = bclass(&[(b'x', b'z'), (b'a', b'c')]);
1681 let expected = vec![(b'a', b'c'), (b'x', b'z')];
1682 assert_eq!(expected, branges(&cls));
1683
1684 let cls = bclass(&[(b'x', b'z'), (b'w', b'y')]);
1685 let expected = vec![(b'w', b'z')];
1686 assert_eq!(expected, branges(&cls));
1687
1688 let cls = bclass(&[
1689 (b'c', b'f'),
1690 (b'a', b'g'),
1691 (b'd', b'j'),
1692 (b'a', b'c'),
1693 (b'm', b'p'),
1694 (b'l', b's'),
1695 ]);
1696 let expected = vec![(b'a', b'j'), (b'l', b's')];
1697 assert_eq!(expected, branges(&cls));
1698
1699 let cls = bclass(&[(b'x', b'z'), (b'u', b'w')]);
1700 let expected = vec![(b'u', b'z')];
1701 assert_eq!(expected, branges(&cls));
1702
1703 let cls = bclass(&[(b'\x00', b'\xFF'), (b'\x00', b'\xFF')]);
1704 let expected = vec![(b'\x00', b'\xFF')];
1705 assert_eq!(expected, branges(&cls));
1706
1707 let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
1708 let expected = vec![(b'a', b'b')];
1709 assert_eq!(expected, branges(&cls));
1710 }
1711
1712 #[test]
1713 #[cfg(feature = "unicode-case")]
1714 fn class_case_fold_unicode() {
1715 let cls = uclass(&[
1716 ('C', 'F'),
1717 ('A', 'G'),
1718 ('D', 'J'),
1719 ('A', 'C'),
1720 ('M', 'P'),
1721 ('L', 'S'),
1722 ('c', 'f'),
1723 ]);
1724 let expected = uclass(&[
1725 ('A', 'J'),
1726 ('L', 'S'),
1727 ('a', 'j'),
1728 ('l', 's'),
1729 ('\u{17F}', '\u{17F}'),
1730 ]);
1731 assert_eq!(expected, ucasefold(&cls));
1732
1733 let cls = uclass(&[('A', 'Z')]);
1734 let expected = uclass(&[
1735 ('A', 'Z'),
1736 ('a', 'z'),
1737 ('\u{17F}', '\u{17F}'),
1738 ('\u{212A}', '\u{212A}'),
1739 ]);
1740 assert_eq!(expected, ucasefold(&cls));
1741
1742 let cls = uclass(&[('a', 'z')]);
1743 let expected = uclass(&[
1744 ('A', 'Z'),
1745 ('a', 'z'),
1746 ('\u{17F}', '\u{17F}'),
1747 ('\u{212A}', '\u{212A}'),
1748 ]);
1749 assert_eq!(expected, ucasefold(&cls));
1750
1751 let cls = uclass(&[('A', 'A'), ('_', '_')]);
1752 let expected = uclass(&[('A', 'A'), ('_', '_'), ('a', 'a')]);
1753 assert_eq!(expected, ucasefold(&cls));
1754
1755 let cls = uclass(&[('A', 'A'), ('=', '=')]);
1756 let expected = uclass(&[('=', '='), ('A', 'A'), ('a', 'a')]);
1757 assert_eq!(expected, ucasefold(&cls));
1758
1759 let cls = uclass(&[('\x00', '\x10')]);
1760 assert_eq!(cls, ucasefold(&cls));
1761
1762 let cls = uclass(&[('k', 'k')]);
1763 let expected =
1764 uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}')]);
1765 assert_eq!(expected, ucasefold(&cls));
1766
1767 let cls = uclass(&[('@', '@')]);
1768 assert_eq!(cls, ucasefold(&cls));
1769 }
1770
1771 #[test]
1772 #[cfg(not(feature = "unicode-case"))]
1773 fn class_case_fold_unicode_disabled() {
1774 let mut cls = uclass(&[
1775 ('C', 'F'),
1776 ('A', 'G'),
1777 ('D', 'J'),
1778 ('A', 'C'),
1779 ('M', 'P'),
1780 ('L', 'S'),
1781 ('c', 'f'),
1782 ]);
1783 assert!(cls.try_case_fold_simple().is_err());
1784 }
1785
1786 #[test]
1787 #[should_panic]
1788 #[cfg(not(feature = "unicode-case"))]
1789 fn class_case_fold_unicode_disabled_panics() {
1790 let mut cls = uclass(&[
1791 ('C', 'F'),
1792 ('A', 'G'),
1793 ('D', 'J'),
1794 ('A', 'C'),
1795 ('M', 'P'),
1796 ('L', 'S'),
1797 ('c', 'f'),
1798 ]);
1799 cls.case_fold_simple();
1800 }
1801
1802 #[test]
1803 fn class_case_fold_bytes() {
1804 let cls = bclass(&[
1805 (b'C', b'F'),
1806 (b'A', b'G'),
1807 (b'D', b'J'),
1808 (b'A', b'C'),
1809 (b'M', b'P'),
1810 (b'L', b'S'),
1811 (b'c', b'f'),
1812 ]);
1813 let expected =
1814 bclass(&[(b'A', b'J'), (b'L', b'S'), (b'a', b'j'), (b'l', b's')]);
1815 assert_eq!(expected, bcasefold(&cls));
1816
1817 let cls = bclass(&[(b'A', b'Z')]);
1818 let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
1819 assert_eq!(expected, bcasefold(&cls));
1820
1821 let cls = bclass(&[(b'a', b'z')]);
1822 let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
1823 assert_eq!(expected, bcasefold(&cls));
1824
1825 let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]);
1826 let expected = bclass(&[(b'A', b'A'), (b'_', b'_'), (b'a', b'a')]);
1827 assert_eq!(expected, bcasefold(&cls));
1828
1829 let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]);
1830 let expected = bclass(&[(b'=', b'='), (b'A', b'A'), (b'a', b'a')]);
1831 assert_eq!(expected, bcasefold(&cls));
1832
1833 let cls = bclass(&[(b'\x00', b'\x10')]);
1834 assert_eq!(cls, bcasefold(&cls));
1835
1836 let cls = bclass(&[(b'k', b'k')]);
1837 let expected = bclass(&[(b'K', b'K'), (b'k', b'k')]);
1838 assert_eq!(expected, bcasefold(&cls));
1839
1840 let cls = bclass(&[(b'@', b'@')]);
1841 assert_eq!(cls, bcasefold(&cls));
1842 }
1843
1844 #[test]
1845 fn class_negate_unicode() {
1846 let cls = uclass(&[('a', 'a')]);
1847 let expected = uclass(&[('\x00', '\x60'), ('\x62', '\u{10FFFF}')]);
1848 assert_eq!(expected, unegate(&cls));
1849
1850 let cls = uclass(&[('a', 'a'), ('b', 'b')]);
1851 let expected = uclass(&[('\x00', '\x60'), ('\x63', '\u{10FFFF}')]);
1852 assert_eq!(expected, unegate(&cls));
1853
1854 let cls = uclass(&[('a', 'c'), ('x', 'z')]);
1855 let expected = uclass(&[
1856 ('\x00', '\x60'),
1857 ('\x64', '\x77'),
1858 ('\x7B', '\u{10FFFF}'),
1859 ]);
1860 assert_eq!(expected, unegate(&cls));
1861
1862 let cls = uclass(&[('\x00', 'a')]);
1863 let expected = uclass(&[('\x62', '\u{10FFFF}')]);
1864 assert_eq!(expected, unegate(&cls));
1865
1866 let cls = uclass(&[('a', '\u{10FFFF}')]);
1867 let expected = uclass(&[('\x00', '\x60')]);
1868 assert_eq!(expected, unegate(&cls));
1869
1870 let cls = uclass(&[('\x00', '\u{10FFFF}')]);
1871 let expected = uclass(&[]);
1872 assert_eq!(expected, unegate(&cls));
1873
1874 let cls = uclass(&[]);
1875 let expected = uclass(&[('\x00', '\u{10FFFF}')]);
1876 assert_eq!(expected, unegate(&cls));
1877
1878 let cls =
1879 uclass(&[('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}')]);
1880 let expected = uclass(&[('\u{10FFFE}', '\u{10FFFE}')]);
1881 assert_eq!(expected, unegate(&cls));
1882
1883 let cls = uclass(&[('\x00', '\u{D7FF}')]);
1884 let expected = uclass(&[('\u{E000}', '\u{10FFFF}')]);
1885 assert_eq!(expected, unegate(&cls));
1886
1887 let cls = uclass(&[('\x00', '\u{D7FE}')]);
1888 let expected = uclass(&[('\u{D7FF}', '\u{10FFFF}')]);
1889 assert_eq!(expected, unegate(&cls));
1890
1891 let cls = uclass(&[('\u{E000}', '\u{10FFFF}')]);
1892 let expected = uclass(&[('\x00', '\u{D7FF}')]);
1893 assert_eq!(expected, unegate(&cls));
1894
1895 let cls = uclass(&[('\u{E001}', '\u{10FFFF}')]);
1896 let expected = uclass(&[('\x00', '\u{E000}')]);
1897 assert_eq!(expected, unegate(&cls));
1898 }
1899
1900 #[test]
1901 fn class_negate_bytes() {
1902 let cls = bclass(&[(b'a', b'a')]);
1903 let expected = bclass(&[(b'\x00', b'\x60'), (b'\x62', b'\xFF')]);
1904 assert_eq!(expected, bnegate(&cls));
1905
1906 let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
1907 let expected = bclass(&[(b'\x00', b'\x60'), (b'\x63', b'\xFF')]);
1908 assert_eq!(expected, bnegate(&cls));
1909
1910 let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
1911 let expected = bclass(&[
1912 (b'\x00', b'\x60'),
1913 (b'\x64', b'\x77'),
1914 (b'\x7B', b'\xFF'),
1915 ]);
1916 assert_eq!(expected, bnegate(&cls));
1917
1918 let cls = bclass(&[(b'\x00', b'a')]);
1919 let expected = bclass(&[(b'\x62', b'\xFF')]);
1920 assert_eq!(expected, bnegate(&cls));
1921
1922 let cls = bclass(&[(b'a', b'\xFF')]);
1923 let expected = bclass(&[(b'\x00', b'\x60')]);
1924 assert_eq!(expected, bnegate(&cls));
1925
1926 let cls = bclass(&[(b'\x00', b'\xFF')]);
1927 let expected = bclass(&[]);
1928 assert_eq!(expected, bnegate(&cls));
1929
1930 let cls = bclass(&[]);
1931 let expected = bclass(&[(b'\x00', b'\xFF')]);
1932 assert_eq!(expected, bnegate(&cls));
1933
1934 let cls = bclass(&[(b'\x00', b'\xFD'), (b'\xFF', b'\xFF')]);
1935 let expected = bclass(&[(b'\xFE', b'\xFE')]);
1936 assert_eq!(expected, bnegate(&cls));
1937 }
1938
1939 #[test]
1940 fn class_union_unicode() {
1941 let cls1 = uclass(&[('a', 'g'), ('m', 't'), ('A', 'C')]);
1942 let cls2 = uclass(&[('a', 'z')]);
1943 let expected = uclass(&[('a', 'z'), ('A', 'C')]);
1944 assert_eq!(expected, uunion(&cls1, &cls2));
1945 }
1946
1947 #[test]
1948 fn class_union_bytes() {
1949 let cls1 = bclass(&[(b'a', b'g'), (b'm', b't'), (b'A', b'C')]);
1950 let cls2 = bclass(&[(b'a', b'z')]);
1951 let expected = bclass(&[(b'a', b'z'), (b'A', b'C')]);
1952 assert_eq!(expected, bunion(&cls1, &cls2));
1953 }
1954
1955 #[test]
1956 fn class_intersect_unicode() {
1957 let cls1 = uclass(&[]);
1958 let cls2 = uclass(&[('a', 'a')]);
1959 let expected = uclass(&[]);
1960 assert_eq!(expected, uintersect(&cls1, &cls2));
1961
1962 let cls1 = uclass(&[('a', 'a')]);
1963 let cls2 = uclass(&[('a', 'a')]);
1964 let expected = uclass(&[('a', 'a')]);
1965 assert_eq!(expected, uintersect(&cls1, &cls2));
1966
1967 let cls1 = uclass(&[('a', 'a')]);
1968 let cls2 = uclass(&[('b', 'b')]);
1969 let expected = uclass(&[]);
1970 assert_eq!(expected, uintersect(&cls1, &cls2));
1971
1972 let cls1 = uclass(&[('a', 'a')]);
1973 let cls2 = uclass(&[('a', 'c')]);
1974 let expected = uclass(&[('a', 'a')]);
1975 assert_eq!(expected, uintersect(&cls1, &cls2));
1976
1977 let cls1 = uclass(&[('a', 'b')]);
1978 let cls2 = uclass(&[('a', 'c')]);
1979 let expected = uclass(&[('a', 'b')]);
1980 assert_eq!(expected, uintersect(&cls1, &cls2));
1981
1982 let cls1 = uclass(&[('a', 'b')]);
1983 let cls2 = uclass(&[('b', 'c')]);
1984 let expected = uclass(&[('b', 'b')]);
1985 assert_eq!(expected, uintersect(&cls1, &cls2));
1986
1987 let cls1 = uclass(&[('a', 'b')]);
1988 let cls2 = uclass(&[('c', 'd')]);
1989 let expected = uclass(&[]);
1990 assert_eq!(expected, uintersect(&cls1, &cls2));
1991
1992 let cls1 = uclass(&[('b', 'c')]);
1993 let cls2 = uclass(&[('a', 'd')]);
1994 let expected = uclass(&[('b', 'c')]);
1995 assert_eq!(expected, uintersect(&cls1, &cls2));
1996
1997 let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
1998 let cls2 = uclass(&[('a', 'h')]);
1999 let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2000 assert_eq!(expected, uintersect(&cls1, &cls2));
2001
2002 let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2003 let cls2 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2004 let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2005 assert_eq!(expected, uintersect(&cls1, &cls2));
2006
2007 let cls1 = uclass(&[('a', 'b'), ('g', 'h')]);
2008 let cls2 = uclass(&[('d', 'e'), ('k', 'l')]);
2009 let expected = uclass(&[]);
2010 assert_eq!(expected, uintersect(&cls1, &cls2));
2011
2012 let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2013 let cls2 = uclass(&[('h', 'h')]);
2014 let expected = uclass(&[('h', 'h')]);
2015 assert_eq!(expected, uintersect(&cls1, &cls2));
2016
2017 let cls1 = uclass(&[('a', 'b'), ('e', 'f'), ('i', 'j')]);
2018 let cls2 = uclass(&[('c', 'd'), ('g', 'h'), ('k', 'l')]);
2019 let expected = uclass(&[]);
2020 assert_eq!(expected, uintersect(&cls1, &cls2));
2021
2022 let cls1 = uclass(&[('a', 'b'), ('c', 'd'), ('e', 'f')]);
2023 let cls2 = uclass(&[('b', 'c'), ('d', 'e'), ('f', 'g')]);
2024 let expected = uclass(&[('b', 'f')]);
2025 assert_eq!(expected, uintersect(&cls1, &cls2));
2026 }
2027
2028 #[test]
2029 fn class_intersect_bytes() {
2030 let cls1 = bclass(&[]);
2031 let cls2 = bclass(&[(b'a', b'a')]);
2032 let expected = bclass(&[]);
2033 assert_eq!(expected, bintersect(&cls1, &cls2));
2034
2035 let cls1 = bclass(&[(b'a', b'a')]);
2036 let cls2 = bclass(&[(b'a', b'a')]);
2037 let expected = bclass(&[(b'a', b'a')]);
2038 assert_eq!(expected, bintersect(&cls1, &cls2));
2039
2040 let cls1 = bclass(&[(b'a', b'a')]);
2041 let cls2 = bclass(&[(b'b', b'b')]);
2042 let expected = bclass(&[]);
2043 assert_eq!(expected, bintersect(&cls1, &cls2));
2044
2045 let cls1 = bclass(&[(b'a', b'a')]);
2046 let cls2 = bclass(&[(b'a', b'c')]);
2047 let expected = bclass(&[(b'a', b'a')]);
2048 assert_eq!(expected, bintersect(&cls1, &cls2));
2049
2050 let cls1 = bclass(&[(b'a', b'b')]);
2051 let cls2 = bclass(&[(b'a', b'c')]);
2052 let expected = bclass(&[(b'a', b'b')]);
2053 assert_eq!(expected, bintersect(&cls1, &cls2));
2054
2055 let cls1 = bclass(&[(b'a', b'b')]);
2056 let cls2 = bclass(&[(b'b', b'c')]);
2057 let expected = bclass(&[(b'b', b'b')]);
2058 assert_eq!(expected, bintersect(&cls1, &cls2));
2059
2060 let cls1 = bclass(&[(b'a', b'b')]);
2061 let cls2 = bclass(&[(b'c', b'd')]);
2062 let expected = bclass(&[]);
2063 assert_eq!(expected, bintersect(&cls1, &cls2));
2064
2065 let cls1 = bclass(&[(b'b', b'c')]);
2066 let cls2 = bclass(&[(b'a', b'd')]);
2067 let expected = bclass(&[(b'b', b'c')]);
2068 assert_eq!(expected, bintersect(&cls1, &cls2));
2069
2070 let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2071 let cls2 = bclass(&[(b'a', b'h')]);
2072 let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2073 assert_eq!(expected, bintersect(&cls1, &cls2));
2074
2075 let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2076 let cls2 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2077 let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2078 assert_eq!(expected, bintersect(&cls1, &cls2));
2079
2080 let cls1 = bclass(&[(b'a', b'b'), (b'g', b'h')]);
2081 let cls2 = bclass(&[(b'd', b'e'), (b'k', b'l')]);
2082 let expected = bclass(&[]);
2083 assert_eq!(expected, bintersect(&cls1, &cls2));
2084
2085 let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2086 let cls2 = bclass(&[(b'h', b'h')]);
2087 let expected = bclass(&[(b'h', b'h')]);
2088 assert_eq!(expected, bintersect(&cls1, &cls2));
2089
2090 let cls1 = bclass(&[(b'a', b'b'), (b'e', b'f'), (b'i', b'j')]);
2091 let cls2 = bclass(&[(b'c', b'd'), (b'g', b'h'), (b'k', b'l')]);
2092 let expected = bclass(&[]);
2093 assert_eq!(expected, bintersect(&cls1, &cls2));
2094
2095 let cls1 = bclass(&[(b'a', b'b'), (b'c', b'd'), (b'e', b'f')]);
2096 let cls2 = bclass(&[(b'b', b'c'), (b'd', b'e'), (b'f', b'g')]);
2097 let expected = bclass(&[(b'b', b'f')]);
2098 assert_eq!(expected, bintersect(&cls1, &cls2));
2099 }
2100
2101 #[test]
2102 fn class_difference_unicode() {
2103 let cls1 = uclass(&[('a', 'a')]);
2104 let cls2 = uclass(&[('a', 'a')]);
2105 let expected = uclass(&[]);
2106 assert_eq!(expected, udifference(&cls1, &cls2));
2107
2108 let cls1 = uclass(&[('a', 'a')]);
2109 let cls2 = uclass(&[]);
2110 let expected = uclass(&[('a', 'a')]);
2111 assert_eq!(expected, udifference(&cls1, &cls2));
2112
2113 let cls1 = uclass(&[]);
2114 let cls2 = uclass(&[('a', 'a')]);
2115 let expected = uclass(&[]);
2116 assert_eq!(expected, udifference(&cls1, &cls2));
2117
2118 let cls1 = uclass(&[('a', 'z')]);
2119 let cls2 = uclass(&[('a', 'a')]);
2120 let expected = uclass(&[('b', 'z')]);
2121 assert_eq!(expected, udifference(&cls1, &cls2));
2122
2123 let cls1 = uclass(&[('a', 'z')]);
2124 let cls2 = uclass(&[('z', 'z')]);
2125 let expected = uclass(&[('a', 'y')]);
2126 assert_eq!(expected, udifference(&cls1, &cls2));
2127
2128 let cls1 = uclass(&[('a', 'z')]);
2129 let cls2 = uclass(&[('m', 'm')]);
2130 let expected = uclass(&[('a', 'l'), ('n', 'z')]);
2131 assert_eq!(expected, udifference(&cls1, &cls2));
2132
2133 let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
2134 let cls2 = uclass(&[('a', 'z')]);
2135 let expected = uclass(&[]);
2136 assert_eq!(expected, udifference(&cls1, &cls2));
2137
2138 let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
2139 let cls2 = uclass(&[('d', 'v')]);
2140 let expected = uclass(&[('a', 'c')]);
2141 assert_eq!(expected, udifference(&cls1, &cls2));
2142
2143 let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
2144 let cls2 = uclass(&[('b', 'g'), ('s', 'u')]);
2145 let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
2146 assert_eq!(expected, udifference(&cls1, &cls2));
2147
2148 let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
2149 let cls2 = uclass(&[('b', 'd'), ('e', 'g'), ('s', 'u')]);
2150 let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
2151 assert_eq!(expected, udifference(&cls1, &cls2));
2152
2153 let cls1 = uclass(&[('x', 'z')]);
2154 let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
2155 let expected = uclass(&[('x', 'z')]);
2156 assert_eq!(expected, udifference(&cls1, &cls2));
2157
2158 let cls1 = uclass(&[('a', 'z')]);
2159 let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
2160 let expected = uclass(&[('d', 'd'), ('h', 'r'), ('v', 'z')]);
2161 assert_eq!(expected, udifference(&cls1, &cls2));
2162 }
2163
2164 #[test]
2165 fn class_difference_bytes() {
2166 let cls1 = bclass(&[(b'a', b'a')]);
2167 let cls2 = bclass(&[(b'a', b'a')]);
2168 let expected = bclass(&[]);
2169 assert_eq!(expected, bdifference(&cls1, &cls2));
2170
2171 let cls1 = bclass(&[(b'a', b'a')]);
2172 let cls2 = bclass(&[]);
2173 let expected = bclass(&[(b'a', b'a')]);
2174 assert_eq!(expected, bdifference(&cls1, &cls2));
2175
2176 let cls1 = bclass(&[]);
2177 let cls2 = bclass(&[(b'a', b'a')]);
2178 let expected = bclass(&[]);
2179 assert_eq!(expected, bdifference(&cls1, &cls2));
2180
2181 let cls1 = bclass(&[(b'a', b'z')]);
2182 let cls2 = bclass(&[(b'a', b'a')]);
2183 let expected = bclass(&[(b'b', b'z')]);
2184 assert_eq!(expected, bdifference(&cls1, &cls2));
2185
2186 let cls1 = bclass(&[(b'a', b'z')]);
2187 let cls2 = bclass(&[(b'z', b'z')]);
2188 let expected = bclass(&[(b'a', b'y')]);
2189 assert_eq!(expected, bdifference(&cls1, &cls2));
2190
2191 let cls1 = bclass(&[(b'a', b'z')]);
2192 let cls2 = bclass(&[(b'm', b'm')]);
2193 let expected = bclass(&[(b'a', b'l'), (b'n', b'z')]);
2194 assert_eq!(expected, bdifference(&cls1, &cls2));
2195
2196 let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
2197 let cls2 = bclass(&[(b'a', b'z')]);
2198 let expected = bclass(&[]);
2199 assert_eq!(expected, bdifference(&cls1, &cls2));
2200
2201 let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
2202 let cls2 = bclass(&[(b'd', b'v')]);
2203 let expected = bclass(&[(b'a', b'c')]);
2204 assert_eq!(expected, bdifference(&cls1, &cls2));
2205
2206 let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
2207 let cls2 = bclass(&[(b'b', b'g'), (b's', b'u')]);
2208 let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
2209 assert_eq!(expected, bdifference(&cls1, &cls2));
2210
2211 let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
2212 let cls2 = bclass(&[(b'b', b'd'), (b'e', b'g'), (b's', b'u')]);
2213 let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
2214 assert_eq!(expected, bdifference(&cls1, &cls2));
2215
2216 let cls1 = bclass(&[(b'x', b'z')]);
2217 let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
2218 let expected = bclass(&[(b'x', b'z')]);
2219 assert_eq!(expected, bdifference(&cls1, &cls2));
2220
2221 let cls1 = bclass(&[(b'a', b'z')]);
2222 let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
2223 let expected = bclass(&[(b'd', b'd'), (b'h', b'r'), (b'v', b'z')]);
2224 assert_eq!(expected, bdifference(&cls1, &cls2));
2225 }
2226
2227 #[test]
2228 fn class_symmetric_difference_unicode() {
2229 let cls1 = uclass(&[('a', 'm')]);
2230 let cls2 = uclass(&[('g', 't')]);
2231 let expected = uclass(&[('a', 'f'), ('n', 't')]);
2232 assert_eq!(expected, usymdifference(&cls1, &cls2));
2233 }
2234
2235 #[test]
2236 fn class_symmetric_difference_bytes() {
2237 let cls1 = bclass(&[(b'a', b'm')]);
2238 let cls2 = bclass(&[(b'g', b't')]);
2239 let expected = bclass(&[(b'a', b'f'), (b'n', b't')]);
2240 assert_eq!(expected, bsymdifference(&cls1, &cls2));
2241 }
2242
2243 #[test]
2244 #[should_panic]
2245 fn hir_byte_literal_non_ascii() {
2246 Hir::literal(Literal::Byte(b'a'));
2247 }
2248
2249 // We use a thread with an explicit stack size to test that our destructor
2250 // for Hir can handle arbitrarily sized expressions in constant stack
2251 // space. In case we run on a platform without threads (WASM?), we limit
2252 // this test to Windows/Unix.
2253 #[test]
2254 #[cfg(any(unix, windows))]
2255 fn no_stack_overflow_on_drop() {
2256 use std::thread;
2257
2258 let run = || {
2259 let mut expr = Hir::empty();
2260 for _ in 0..100 {
2261 expr = Hir::group(Group {
2262 kind: GroupKind::NonCapturing,
2263 hir: Box::new(expr),
2264 });
2265 expr = Hir::repetition(Repetition {
2266 kind: RepetitionKind::ZeroOrOne,
2267 greedy: true,
2268 hir: Box::new(expr),
2269 });
2270
2271 expr = Hir {
2272 kind: HirKind::Concat(vec![expr]),
2273 info: HirInfo::new(),
2274 };
2275 expr = Hir {
2276 kind: HirKind::Alternation(vec![expr]),
2277 info: HirInfo::new(),
2278 };
2279 }
2280 assert!(!expr.kind.is_empty());
2281 };
2282
2283 // We run our test on a thread with a small stack size so we can
2284 // force the issue more easily.
2285 thread::Builder::new()
2286 .stack_size(1 << 10)
2287 .spawn(run)
2288 .unwrap()
2289 .join()
2290 .unwrap();
2291 }
2292 }