]> git.proxmox.com Git - rustc.git/blob - src/vendor/regex-syntax-0.5.5/src/hir/mod.rs
New upstream version 1.28.0~beta.14+dfsg1
[rustc.git] / src / vendor / regex-syntax-0.5.5 / src / hir / mod.rs
1 // Copyright 2018 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10
11 /*!
12 Defines a high-level intermediate representation for regular expressions.
13 */
14 use std::char;
15 use std::cmp;
16 use std::error;
17 use std::fmt;
18 use std::u8;
19
20 use ast::Span;
21 use hir::interval::{Interval, IntervalSet, IntervalSetIter};
22 use unicode;
23
24 pub use hir::visitor::{Visitor, visit};
25
26 mod interval;
27 pub mod literal;
28 pub mod print;
29 pub mod translate;
30 mod visitor;
31
32 /// An error that can occur while translating an `Ast` to a `Hir`.
33 #[derive(Clone, Debug, Eq, PartialEq)]
34 pub struct Error {
35 /// The kind of error.
36 kind: ErrorKind,
37 /// The original pattern that the translator's Ast was parsed from. Every
38 /// span in an error is a valid range into this string.
39 pattern: String,
40 /// The span of this error, derived from the Ast given to the translator.
41 span: Span,
42 }
43
44 impl Error {
45 /// Return the type of this error.
46 pub fn kind(&self) -> &ErrorKind {
47 &self.kind
48 }
49
50 /// The original pattern string in which this error occurred.
51 ///
52 /// Every span reported by this error is reported in terms of this string.
53 pub fn pattern(&self) -> &str {
54 &self.pattern
55 }
56
57 /// Return the span at which this error occurred.
58 pub fn span(&self) -> &Span {
59 &self.span
60 }
61 }
62
63 /// The type of an error that occurred while building an `Hir`.
64 #[derive(Clone, Debug, Eq, PartialEq)]
65 pub enum ErrorKind {
66 /// This error occurs when a Unicode feature is used when Unicode
67 /// support is disabled. For example `(?-u:\pL)` would trigger this error.
68 UnicodeNotAllowed,
69 /// This error occurs when translating a pattern that could match a byte
70 /// sequence that isn't UTF-8 and `allow_invalid_utf8` was disabled.
71 InvalidUtf8,
72 /// This occurs when an unrecognized Unicode property name could not
73 /// be found.
74 UnicodePropertyNotFound,
75 /// This occurs when an unrecognized Unicode property value could not
76 /// be found.
77 UnicodePropertyValueNotFound,
78 /// This occurs when the translator attempts to construct a character class
79 /// that is empty.
80 ///
81 /// Note that this restriction in the translator may be removed in the
82 /// future.
83 EmptyClassNotAllowed,
84 /// Hints that destructuring should not be exhaustive.
85 ///
86 /// This enum may grow additional variants, so this makes sure clients
87 /// don't count on exhaustive matching. (Otherwise, adding a new variant
88 /// could break existing code.)
89 #[doc(hidden)]
90 __Nonexhaustive,
91 }
92
93 impl ErrorKind {
94 fn description(&self) -> &str {
95 use self::ErrorKind::*;
96 match *self {
97 UnicodeNotAllowed => "Unicode not allowed here",
98 InvalidUtf8 => "pattern can match invalid UTF-8",
99 UnicodePropertyNotFound => "Unicode property not found",
100 UnicodePropertyValueNotFound => "Unicode property value not found",
101 EmptyClassNotAllowed => "empty character classes are not allowed",
102 _ => unreachable!(),
103 }
104 }
105 }
106
107 impl error::Error for Error {
108 fn description(&self) -> &str {
109 self.kind.description()
110 }
111 }
112
113 impl fmt::Display for Error {
114 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
115 ::error::Formatter::from(self).fmt(f)
116 }
117 }
118
119 impl fmt::Display for ErrorKind {
120 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
121 f.write_str(self.description())
122 }
123 }
124
125 /// A high-level intermediate representation (HIR) for a regular expression.
126 ///
127 /// The HIR of a regular expression represents an intermediate step between its
128 /// abstract syntax (a structured description of the concrete syntax) and
129 /// compiled byte codes. The purpose of HIR is to make regular expressions
130 /// easier to analyze. In particular, the AST is much more complex than the
131 /// HIR. For example, while an AST supports arbitrarily nested character
132 /// classes, the HIR will flatten all nested classes into a single set. The HIR
133 /// will also "compile away" every flag present in the concrete syntax. For
134 /// example, users of HIR expressions never need to worry about case folding;
135 /// it is handled automatically by the translator (e.g., by translating `(?i)A`
136 /// to `[aA]`).
137 ///
138 /// If the HIR was produced by a translator that disallows invalid UTF-8, then
139 /// the HIR is guaranteed to match UTF-8 exclusively.
140 ///
141 /// This type defines its own destructor that uses constant stack space and
142 /// heap space proportional to the size of the HIR.
143 ///
144 /// The specific type of an HIR expression can be accessed via its `kind`
145 /// or `into_kind` methods. This extra level of indirection exists for two
146 /// reasons:
147 ///
148 /// 1. Construction of an HIR expression *must* use the constructor methods
149 /// on this `Hir` type instead of building the `HirKind` values directly.
150 /// This permits construction to enforce invariants like "concatenations
151 /// always consist of two or more sub-expressions."
152 /// 2. Every HIR expression contains attributes that are defined inductively,
153 /// and can be computed cheaply during the construction process. For
154 /// example, one such attribute is whether the expression must match at the
155 /// beginning of the text.
156 ///
157 /// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular
158 /// expression pattern string, and uses constant stack space and heap space
159 /// proportional to the size of the `Hir`.
160 #[derive(Clone, Debug, Eq, PartialEq)]
161 pub struct Hir {
162 /// The underlying HIR kind.
163 kind: HirKind,
164 /// Analysis info about this HIR, computed during construction.
165 info: HirInfo,
166 }
167
168 /// The kind of an arbitrary `Hir` expression.
169 #[derive(Clone, Debug, Eq, PartialEq)]
170 pub enum HirKind {
171 /// The empty regular expression, which matches everything, including the
172 /// empty string.
173 Empty,
174 /// A single literal character that matches exactly this character.
175 Literal(Literal),
176 /// A single character class that matches any of the characters in the
177 /// class. A class can either consist of Unicode scalar values as
178 /// characters, or it can use bytes.
179 Class(Class),
180 /// An anchor assertion. An anchor assertion match always has zero length.
181 Anchor(Anchor),
182 /// A word boundary assertion, which may or may not be Unicode aware. A
183 /// word boundary assertion match always has zero length.
184 WordBoundary(WordBoundary),
185 /// A repetition operation applied to a child expression.
186 Repetition(Repetition),
187 /// A possibly capturing group, which contains a child expression.
188 Group(Group),
189 /// A concatenation of expressions. A concatenation always has at least two
190 /// child expressions.
191 ///
192 /// A concatenation matches only if each of its child expression matches
193 /// one after the other.
194 Concat(Vec<Hir>),
195 /// An alternation of expressions. An alternation always has at least two
196 /// child expressions.
197 ///
198 /// An alternation matches only if at least one of its child expression
199 /// matches. If multiple expressions match, then the leftmost is preferred.
200 Alternation(Vec<Hir>),
201 }
202
203 impl Hir {
204 /// Returns a reference to the underlying HIR kind.
205 pub fn kind(&self) -> &HirKind {
206 &self.kind
207 }
208
209 /// Consumes ownership of this HIR expression and returns its underlying
210 /// `HirKind`.
211 pub fn into_kind(mut self) -> HirKind {
212 use std::mem;
213 mem::replace(&mut self.kind, HirKind::Empty)
214 }
215
216 /// Returns an empty HIR expression.
217 ///
218 /// An empty HIR expression always matches, including the empty string.
219 pub fn empty() -> Hir {
220 let mut info = HirInfo::new();
221 info.set_always_utf8(true);
222 info.set_all_assertions(true);
223 info.set_anchored_start(false);
224 info.set_anchored_end(false);
225 info.set_any_anchored_start(false);
226 info.set_any_anchored_end(false);
227 info.set_match_empty(true);
228 Hir {
229 kind: HirKind::Empty,
230 info: info,
231 }
232 }
233
234 /// Creates a literal HIR expression.
235 ///
236 /// If the given literal has a `Byte` variant with an ASCII byte, then this
237 /// method panics. This enforces the invariant that `Byte` variants are
238 /// only used to express matching of invalid UTF-8.
239 pub fn literal(lit: Literal) -> Hir {
240 if let Literal::Byte(b) = lit {
241 assert!(b > 0x7F);
242 }
243
244 let mut info = HirInfo::new();
245 info.set_always_utf8(lit.is_unicode());
246 info.set_all_assertions(false);
247 info.set_anchored_start(false);
248 info.set_anchored_end(false);
249 info.set_any_anchored_start(false);
250 info.set_any_anchored_end(false);
251 info.set_match_empty(false);
252 Hir {
253 kind: HirKind::Literal(lit),
254 info: info,
255 }
256 }
257
258 /// Creates a class HIR expression.
259 pub fn class(class: Class) -> Hir {
260 let mut info = HirInfo::new();
261 info.set_always_utf8(class.is_always_utf8());
262 info.set_all_assertions(false);
263 info.set_anchored_start(false);
264 info.set_anchored_end(false);
265 info.set_any_anchored_start(false);
266 info.set_any_anchored_end(false);
267 info.set_match_empty(false);
268 Hir {
269 kind: HirKind::Class(class),
270 info: info,
271 }
272 }
273
274 /// Creates an anchor assertion HIR expression.
275 pub fn anchor(anchor: Anchor) -> Hir {
276 let mut info = HirInfo::new();
277 info.set_always_utf8(true);
278 info.set_all_assertions(true);
279 info.set_anchored_start(false);
280 info.set_anchored_end(false);
281 info.set_any_anchored_start(false);
282 info.set_any_anchored_end(false);
283 info.set_match_empty(true);
284 if let Anchor::StartText = anchor {
285 info.set_anchored_start(true);
286 info.set_any_anchored_start(true);
287 }
288 if let Anchor::EndText = anchor {
289 info.set_anchored_end(true);
290 info.set_any_anchored_end(true);
291 }
292 Hir {
293 kind: HirKind::Anchor(anchor),
294 info: info,
295 }
296 }
297
298 /// Creates a word boundary assertion HIR expression.
299 pub fn word_boundary(word_boundary: WordBoundary) -> Hir {
300 let mut info = HirInfo::new();
301 info.set_always_utf8(true);
302 info.set_all_assertions(true);
303 info.set_anchored_start(false);
304 info.set_anchored_end(false);
305 info.set_any_anchored_start(false);
306 info.set_any_anchored_end(false);
307 // A negated word boundary matches the empty string, but a normal
308 // word boundary does not!
309 info.set_match_empty(word_boundary.is_negated());
310 // Negated ASCII word boundaries can match invalid UTF-8.
311 if let WordBoundary::AsciiNegate = word_boundary {
312 info.set_always_utf8(false);
313 }
314 Hir {
315 kind: HirKind::WordBoundary(word_boundary),
316 info: info,
317 }
318 }
319
320 /// Creates a repetition HIR expression.
321 pub fn repetition(rep: Repetition) -> Hir {
322 let mut info = HirInfo::new();
323 info.set_always_utf8(rep.hir.is_always_utf8());
324 info.set_all_assertions(rep.hir.is_all_assertions());
325 // If this operator can match the empty string, then it can never
326 // be anchored.
327 info.set_anchored_start(
328 !rep.is_match_empty() && rep.hir.is_anchored_start()
329 );
330 info.set_anchored_end(
331 !rep.is_match_empty() && rep.hir.is_anchored_end()
332 );
333 info.set_any_anchored_start(rep.hir.is_any_anchored_start());
334 info.set_any_anchored_end(rep.hir.is_any_anchored_end());
335 info.set_match_empty(rep.is_match_empty() || rep.hir.is_match_empty());
336 Hir {
337 kind: HirKind::Repetition(rep),
338 info: info,
339 }
340 }
341
342 /// Creates a group HIR expression.
343 pub fn group(group: Group) -> Hir {
344 let mut info = HirInfo::new();
345 info.set_always_utf8(group.hir.is_always_utf8());
346 info.set_all_assertions(group.hir.is_all_assertions());
347 info.set_anchored_start(group.hir.is_anchored_start());
348 info.set_anchored_end(group.hir.is_anchored_end());
349 info.set_any_anchored_start(group.hir.is_any_anchored_start());
350 info.set_any_anchored_end(group.hir.is_any_anchored_end());
351 info.set_match_empty(group.hir.is_match_empty());
352 Hir {
353 kind: HirKind::Group(group),
354 info: info,
355 }
356 }
357
358 /// Returns the concatenation of the given expressions.
359 ///
360 /// This flattens the concatenation as appropriate.
361 pub fn concat(mut exprs: Vec<Hir>) -> Hir {
362 match exprs.len() {
363 0 => Hir::empty(),
364 1 => exprs.pop().unwrap(),
365 _ => {
366 let mut info = HirInfo::new();
367 info.set_always_utf8(true);
368 info.set_all_assertions(true);
369 info.set_any_anchored_start(false);
370 info.set_any_anchored_end(false);
371 info.set_match_empty(true);
372
373 // Some attributes require analyzing all sub-expressions.
374 for e in &exprs {
375 let x = info.is_always_utf8() && e.is_always_utf8();
376 info.set_always_utf8(x);
377
378 let x = info.is_all_assertions() && e.is_all_assertions();
379 info.set_all_assertions(x);
380
381 let x =
382 info.is_any_anchored_start()
383 || e.is_any_anchored_start();
384 info.set_any_anchored_start(x);
385
386 let x =
387 info.is_any_anchored_end()
388 || e.is_any_anchored_end();
389 info.set_any_anchored_end(x);
390
391 let x = info.is_match_empty() && e.is_match_empty();
392 info.set_match_empty(x);
393 }
394 // Anchored attributes require something slightly more
395 // sophisticated. Normally, WLOG, to determine whether an
396 // expression is anchored to the start, we'd only need to check
397 // the first expression of a concatenation. However,
398 // expressions like `$\b^` are still anchored to the start,
399 // but the first expression in the concatenation *isn't*
400 // anchored to the start. So the "first" expression to look at
401 // is actually one that is either not an assertion or is
402 // specifically the StartText assertion.
403 info.set_anchored_start(
404 exprs.iter()
405 .take_while(|e| {
406 e.is_anchored_start() || e.is_all_assertions()
407 })
408 .any(|e| {
409 e.is_anchored_start()
410 }));
411 // Similarly for the end anchor, but in reverse.
412 info.set_anchored_end(
413 exprs.iter()
414 .rev()
415 .take_while(|e| {
416 e.is_anchored_end() || e.is_all_assertions()
417 })
418 .any(|e| {
419 e.is_anchored_end()
420 }));
421 Hir {
422 kind: HirKind::Concat(exprs),
423 info: info,
424 }
425 }
426 }
427 }
428
429 /// Returns the alternation of the given expressions.
430 ///
431 /// This flattens the alternation as appropriate.
432 pub fn alternation(mut exprs: Vec<Hir>) -> Hir {
433 match exprs.len() {
434 0 => Hir::empty(),
435 1 => exprs.pop().unwrap(),
436 _ => {
437 let mut info = HirInfo::new();
438 info.set_always_utf8(true);
439 info.set_all_assertions(true);
440 info.set_anchored_start(true);
441 info.set_anchored_end(true);
442 info.set_any_anchored_start(false);
443 info.set_any_anchored_end(false);
444 info.set_match_empty(false);
445
446 // Some attributes require analyzing all sub-expressions.
447 for e in &exprs {
448 let x = info.is_always_utf8() && e.is_always_utf8();
449 info.set_always_utf8(x);
450
451 let x = info.is_all_assertions() && e.is_all_assertions();
452 info.set_all_assertions(x);
453
454 let x = info.is_anchored_start() && e.is_anchored_start();
455 info.set_anchored_start(x);
456
457 let x = info.is_anchored_end() && e.is_anchored_end();
458 info.set_anchored_end(x);
459
460 let x =
461 info.is_any_anchored_start()
462 || e.is_any_anchored_start();
463 info.set_any_anchored_start(x);
464
465 let x =
466 info.is_any_anchored_end()
467 || e.is_any_anchored_end();
468 info.set_any_anchored_end(x);
469
470 let x = info.is_match_empty() || e.is_match_empty();
471 info.set_match_empty(x);
472 }
473 Hir {
474 kind: HirKind::Alternation(exprs),
475 info: info,
476 }
477 }
478 }
479 }
480
481 /// Build an HIR expression for `.`.
482 ///
483 /// A `.` expression matches any character except for `\n`. To build an
484 /// expression that matches any character, including `\n`, use the `any`
485 /// method.
486 ///
487 /// If `bytes` is `true`, then this assumes characters are limited to a
488 /// single byte.
489 pub fn dot(bytes: bool) -> Hir {
490 if bytes {
491 let mut cls = ClassBytes::empty();
492 cls.push(ClassBytesRange::new(b'\0', b'\x09'));
493 cls.push(ClassBytesRange::new(b'\x0B', b'\xFF'));
494 Hir::class(Class::Bytes(cls))
495 } else {
496 let mut cls = ClassUnicode::empty();
497 cls.push(ClassUnicodeRange::new('\0', '\x09'));
498 cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}'));
499 Hir::class(Class::Unicode(cls))
500 }
501 }
502
503 /// Build an HIR expression for `(?s).`.
504 ///
505 /// A `(?s).` expression matches any character, including `\n`. To build an
506 /// expression that matches any character except for `\n`, then use the
507 /// `dot` method.
508 ///
509 /// If `bytes` is `true`, then this assumes characters are limited to a
510 /// single byte.
511 pub fn any(bytes: bool) -> Hir {
512 if bytes {
513 let mut cls = ClassBytes::empty();
514 cls.push(ClassBytesRange::new(b'\0', b'\xFF'));
515 Hir::class(Class::Bytes(cls))
516 } else {
517 let mut cls = ClassUnicode::empty();
518 cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}'));
519 Hir::class(Class::Unicode(cls))
520 }
521 }
522
523 /// Return true if and only if this HIR will always match valid UTF-8.
524 ///
525 /// When this returns false, then it is possible for this HIR expression
526 /// to match invalid UTF-8.
527 pub fn is_always_utf8(&self) -> bool {
528 self.info.is_always_utf8()
529 }
530
531 /// Returns true if and only if this entire HIR expression is made up of
532 /// zero-width assertions.
533 ///
534 /// This includes expressions like `^$\b\A\z` and even `((\b)+())*^`, but
535 /// not `^a`.
536 pub fn is_all_assertions(&self) -> bool {
537 self.info.is_all_assertions()
538 }
539
540 /// Return true if and only if this HIR is required to match from the
541 /// beginning of text. This includes expressions like `^foo`, `^(foo|bar)`,
542 /// `^foo|^bar` but not `^foo|bar`.
543 pub fn is_anchored_start(&self) -> bool {
544 self.info.is_anchored_start()
545 }
546
547 /// Return true if and only if this HIR is required to match at the end
548 /// of text. This includes expressions like `foo$`, `(foo|bar)$`,
549 /// `foo$|bar$` but not `foo$|bar`.
550 pub fn is_anchored_end(&self) -> bool {
551 self.info.is_anchored_end()
552 }
553
554 /// Return true if and only if this HIR contains any sub-expression that
555 /// is required to match at the beginning of text. Specifically, this
556 /// returns true if the `^` symbol (when multiline mode is disabled) or the
557 /// `\A` escape appear anywhere in the regex.
558 pub fn is_any_anchored_start(&self) -> bool {
559 self.info.is_any_anchored_start()
560 }
561
562 /// Return true if and only if this HIR contains any sub-expression that is
563 /// required to match at the end of text. Specifically, this returns true
564 /// if the `$` symbol (when multiline mode is disabled) or the `\z` escape
565 /// appear anywhere in the regex.
566 pub fn is_any_anchored_end(&self) -> bool {
567 self.info.is_any_anchored_end()
568 }
569
570 /// Return true if and only if the empty string is part of the language
571 /// matched by this regular expression.
572 ///
573 /// This includes `a*`, `a?b*`, `a{0}`, `()`, `()+`, `^$`, `a|b?`, `\B`,
574 /// but not `a`, `a+` or `\b`.
575 pub fn is_match_empty(&self) -> bool {
576 self.info.is_match_empty()
577 }
578 }
579
580 impl HirKind {
581 /// Return true if and only if this HIR is the empty regular expression.
582 ///
583 /// Note that this is not defined inductively. That is, it only tests if
584 /// this kind is the `Empty` variant. To get the inductive definition,
585 /// use the `is_match_empty` method on [`Hir`](struct.Hir.html).
586 pub fn is_empty(&self) -> bool {
587 match *self {
588 HirKind::Empty => true,
589 _ => false,
590 }
591 }
592
593 /// Returns true if and only if this kind has any (including possibly
594 /// empty) subexpressions.
595 pub fn has_subexprs(&self) -> bool {
596 match *self {
597 HirKind::Empty
598 | HirKind::Literal(_)
599 | HirKind::Class(_)
600 | HirKind::Anchor(_)
601 | HirKind::WordBoundary(_) => false,
602 HirKind::Group(_)
603 | HirKind::Repetition(_)
604 | HirKind::Concat(_)
605 | HirKind::Alternation(_) => true,
606 }
607 }
608 }
609
610 /// Print a display representation of this Hir.
611 ///
612 /// The result of this is a valid regular expression pattern string.
613 ///
614 /// This implementation uses constant stack space and heap space proportional
615 /// to the size of the `Hir`.
616 impl fmt::Display for Hir {
617 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
618 use hir::print::Printer;
619 Printer::new().print(self, f)
620 }
621 }
622
623 /// The high-level intermediate representation of a literal.
624 ///
625 /// A literal corresponds to a single character, where a character is either
626 /// defined by a Unicode scalar value or an arbitrary byte. Unicode characters
627 /// are preferred whenever possible. In particular, a `Byte` variant is only
628 /// ever produced when it could match invalid UTF-8.
629 #[derive(Clone, Debug, Eq, PartialEq)]
630 pub enum Literal {
631 /// A single character represented by a Unicode scalar value.
632 Unicode(char),
633 /// A single character represented by an arbitrary byte.
634 Byte(u8),
635 }
636
637 impl Literal {
638 /// Returns true if and only if this literal corresponds to a Unicode
639 /// scalar value.
640 pub fn is_unicode(&self) -> bool {
641 match *self {
642 Literal::Unicode(_) => true,
643 Literal::Byte(b) if b <= 0x7F => true,
644 Literal::Byte(_) => false,
645 }
646 }
647 }
648
649 /// The high-level intermediate representation of a character class.
650 ///
651 /// A character class corresponds to a set of characters. A character is either
652 /// defined by a Unicode scalar value or a byte. Unicode characters are used
653 /// by default, while bytes are used when Unicode mode (via the `u` flag) is
654 /// disabled.
655 ///
656 /// A character class, regardless of its character type, is represented by a
657 /// sequence of non-overlapping non-adjacent ranges of characters.
658 ///
659 /// Note that unlike [`Literal`](enum.Literal.html), a `Bytes` variant may
660 /// be produced even when it exclusively matches valid UTF-8. This is because
661 /// a `Bytes` variant represents an intention by the author of the regular
662 /// expression to disable Unicode mode, which in turn impacts the semantics of
663 /// case insensitive matching. For example, `(?i)k` and `(?i-u)k` will not
664 /// match the same set of strings.
665 #[derive(Clone, Debug, Eq, PartialEq)]
666 pub enum Class {
667 /// A set of characters represented by Unicode scalar values.
668 Unicode(ClassUnicode),
669 /// A set of characters represented by arbitrary bytes (one byte per
670 /// character).
671 Bytes(ClassBytes),
672 }
673
674 impl Class {
675 /// Apply Unicode simple case folding to this character class, in place.
676 /// The character class will be expanded to include all simple case folded
677 /// character variants.
678 ///
679 /// If this is a byte oriented character class, then this will be limited
680 /// to the ASCII ranges `A-Z` and `a-z`.
681 pub fn case_fold_simple(&mut self) {
682 match *self {
683 Class::Unicode(ref mut x) => x.case_fold_simple(),
684 Class::Bytes(ref mut x) => x.case_fold_simple(),
685 }
686 }
687
688 /// Negate this character class in place.
689 ///
690 /// After completion, this character class will contain precisely the
691 /// characters that weren't previously in the class.
692 pub fn negate(&mut self) {
693 match *self {
694 Class::Unicode(ref mut x) => x.negate(),
695 Class::Bytes(ref mut x) => x.negate(),
696 }
697 }
698
699 /// Returns true if and only if this character class will only ever match
700 /// valid UTF-8.
701 ///
702 /// A character class can match invalid UTF-8 only when the following
703 /// conditions are met:
704 ///
705 /// 1. The translator was configured to permit generating an expression
706 /// that can match invalid UTF-8. (By default, this is disabled.)
707 /// 2. Unicode mode (via the `u` flag) was disabled either in the concrete
708 /// syntax or in the parser builder. By default, Unicode mode is
709 /// enabled.
710 pub fn is_always_utf8(&self) -> bool {
711 match *self {
712 Class::Unicode(_) => true,
713 Class::Bytes(ref x) => x.is_all_ascii(),
714 }
715 }
716 }
717
718 /// A set of characters represented by Unicode scalar values.
719 #[derive(Clone, Debug, Eq, PartialEq)]
720 pub struct ClassUnicode {
721 set: IntervalSet<ClassUnicodeRange>,
722 }
723
724 impl ClassUnicode {
725 /// Create a new class from a sequence of ranges.
726 ///
727 /// The given ranges do not need to be in any specific order, and ranges
728 /// may overlap.
729 pub fn new<I>(ranges: I) -> ClassUnicode
730 where I: IntoIterator<Item=ClassUnicodeRange>
731 {
732 ClassUnicode { set: IntervalSet::new(ranges) }
733 }
734
735 /// Create a new class with no ranges.
736 pub fn empty() -> ClassUnicode {
737 ClassUnicode::new(vec![])
738 }
739
740 /// Add a new range to this set.
741 pub fn push(&mut self, range: ClassUnicodeRange) {
742 self.set.push(range);
743 }
744
745 /// Return an iterator over all ranges in this class.
746 ///
747 /// The iterator yields ranges in ascending order.
748 pub fn iter(&self) -> ClassUnicodeIter {
749 ClassUnicodeIter(self.set.iter())
750 }
751
752 /// Return the underlying ranges as a slice.
753 pub fn ranges(&self) -> &[ClassUnicodeRange] {
754 self.set.intervals()
755 }
756
757 /// Expand this character class such that it contains all case folded
758 /// characters, according to Unicode's "simple" mapping. For example, if
759 /// this class consists of the range `a-z`, then applying case folding will
760 /// result in the class containing both the ranges `a-z` and `A-Z`.
761 pub fn case_fold_simple(&mut self) {
762 self.set.case_fold_simple();
763 }
764
765 /// Negate this character class.
766 ///
767 /// For all `c` where `c` is a Unicode scalar value, if `c` was in this
768 /// set, then it will not be in this set after negation.
769 pub fn negate(&mut self) {
770 self.set.negate();
771 }
772
773 /// Union this character class with the given character class, in place.
774 pub fn union(&mut self, other: &ClassUnicode) {
775 self.set.union(&other.set);
776 }
777
778 /// Intersect this character class with the given character class, in
779 /// place.
780 pub fn intersect(&mut self, other: &ClassUnicode) {
781 self.set.intersect(&other.set);
782 }
783
784 /// Subtract the given character class from this character class, in place.
785 pub fn difference(&mut self, other: &ClassUnicode) {
786 self.set.difference(&other.set);
787 }
788
789 /// Compute the symmetric difference of the given character classes, in
790 /// place.
791 ///
792 /// This computes the symmetric difference of two character classes. This
793 /// removes all elements in this class that are also in the given class,
794 /// but all adds all elements from the given class that aren't in this
795 /// class. That is, the class will contain all elements in either class,
796 /// but will not contain any elements that are in both classes.
797 pub fn symmetric_difference(&mut self, other: &ClassUnicode) {
798 self.set.symmetric_difference(&other.set);
799 }
800 }
801
802 /// An iterator over all ranges in a Unicode character class.
803 ///
804 /// The lifetime `'a` refers to the lifetime of the underlying class.
805 #[derive(Debug)]
806 pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassUnicodeRange>);
807
808 impl<'a> Iterator for ClassUnicodeIter<'a> {
809 type Item = &'a ClassUnicodeRange;
810
811 fn next(&mut self) -> Option<&'a ClassUnicodeRange> {
812 self.0.next()
813 }
814 }
815
816 /// A single range of characters represented by Unicode scalar values.
817 ///
818 /// The range is closed. That is, the start and end of the range are included
819 /// in the range.
820 #[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
821 pub struct ClassUnicodeRange {
822 start: char,
823 end: char,
824 }
825
826 impl fmt::Debug for ClassUnicodeRange {
827 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
828 let start =
829 if !self.start.is_whitespace() && !self.start.is_control() {
830 self.start.to_string()
831 } else {
832 format!("0x{:X}", self.start as u32)
833 };
834 let end =
835 if !self.end.is_whitespace() && !self.end.is_control() {
836 self.end.to_string()
837 } else {
838 format!("0x{:X}", self.end as u32)
839 };
840 f.debug_struct("ClassUnicodeRange")
841 .field("start", &start)
842 .field("end", &end)
843 .finish()
844 }
845 }
846
847 impl Interval for ClassUnicodeRange {
848 type Bound = char;
849
850 #[inline] fn lower(&self) -> char { self.start }
851 #[inline] fn upper(&self) -> char { self.end }
852 #[inline] fn set_lower(&mut self, bound: char) { self.start = bound; }
853 #[inline] fn set_upper(&mut self, bound: char) { self.end = bound; }
854
855 /// Apply simple case folding to this Unicode scalar value range.
856 ///
857 /// Additional ranges are appended to the given vector. Canonical ordering
858 /// is *not* maintained in the given vector.
859 fn case_fold_simple(&self, ranges: &mut Vec<ClassUnicodeRange>) {
860 if !unicode::contains_simple_case_mapping(self.start, self.end) {
861 return;
862 }
863 let start = self.start as u32;
864 let end = (self.end as u32).saturating_add(1);
865 let mut next_simple_cp = None;
866 for cp in (start..end).filter_map(char::from_u32) {
867 if next_simple_cp.map_or(false, |next| cp < next) {
868 continue;
869 }
870 let it = match unicode::simple_fold(cp) {
871 Ok(it) => it,
872 Err(next) => {
873 next_simple_cp = next;
874 continue;
875 }
876 };
877 for cp_folded in it {
878 ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded));
879 }
880 }
881 }
882 }
883
884 impl ClassUnicodeRange {
885 /// Create a new Unicode scalar value range for a character class.
886 ///
887 /// The returned range is always in a canonical form. That is, the range
888 /// returned always satisfies the invariant that `start <= end`.
889 pub fn new(start: char, end: char) -> ClassUnicodeRange {
890 ClassUnicodeRange::create(start, end)
891 }
892
893 /// Return the start of this range.
894 ///
895 /// The start of a range is always less than or equal to the end of the
896 /// range.
897 pub fn start(&self) -> char {
898 self.start
899 }
900
901 /// Return the end of this range.
902 ///
903 /// The end of a range is always greater than or equal to the start of the
904 /// range.
905 pub fn end(&self) -> char {
906 self.end
907 }
908 }
909
910 /// A set of characters represented by arbitrary bytes (where one byte
911 /// corresponds to one character).
912 #[derive(Clone, Debug, Eq, PartialEq)]
913 pub struct ClassBytes {
914 set: IntervalSet<ClassBytesRange>,
915 }
916
917 impl ClassBytes {
918 /// Create a new class from a sequence of ranges.
919 ///
920 /// The given ranges do not need to be in any specific order, and ranges
921 /// may overlap.
922 pub fn new<I>(ranges: I) -> ClassBytes
923 where I: IntoIterator<Item=ClassBytesRange>
924 {
925 ClassBytes { set: IntervalSet::new(ranges) }
926 }
927
928 /// Create a new class with no ranges.
929 pub fn empty() -> ClassBytes {
930 ClassBytes::new(vec![])
931 }
932
933 /// Add a new range to this set.
934 pub fn push(&mut self, range: ClassBytesRange) {
935 self.set.push(range);
936 }
937
938 /// Return an iterator over all ranges in this class.
939 ///
940 /// The iterator yields ranges in ascending order.
941 pub fn iter(&self) -> ClassBytesIter {
942 ClassBytesIter(self.set.iter())
943 }
944
945 /// Return the underlying ranges as a slice.
946 pub fn ranges(&self) -> &[ClassBytesRange] {
947 self.set.intervals()
948 }
949
950 /// Expand this character class such that it contains all case folded
951 /// characters. For example, if this class consists of the range `a-z`,
952 /// then applying case folding will result in the class containing both the
953 /// ranges `a-z` and `A-Z`.
954 ///
955 /// Note that this only applies ASCII case folding, which is limited to the
956 /// characters `a-z` and `A-Z`.
957 pub fn case_fold_simple(&mut self) {
958 self.set.case_fold_simple();
959 }
960
961 /// Negate this byte class.
962 ///
963 /// For all `b` where `b` is a any byte, if `b` was in this set, then it
964 /// will not be in this set after negation.
965 pub fn negate(&mut self) {
966 self.set.negate();
967 }
968
969 /// Union this byte class with the given byte class, in place.
970 pub fn union(&mut self, other: &ClassBytes) {
971 self.set.union(&other.set);
972 }
973
974 /// Intersect this byte class with the given byte class, in place.
975 pub fn intersect(&mut self, other: &ClassBytes) {
976 self.set.intersect(&other.set);
977 }
978
979 /// Subtract the given byte class from this byte class, in place.
980 pub fn difference(&mut self, other: &ClassBytes) {
981 self.set.difference(&other.set);
982 }
983
984 /// Compute the symmetric difference of the given byte classes, in place.
985 ///
986 /// This computes the symmetric difference of two byte classes. This
987 /// removes all elements in this class that are also in the given class,
988 /// but all adds all elements from the given class that aren't in this
989 /// class. That is, the class will contain all elements in either class,
990 /// but will not contain any elements that are in both classes.
991 pub fn symmetric_difference(&mut self, other: &ClassBytes) {
992 self.set.symmetric_difference(&other.set);
993 }
994
995 /// Returns true if and only if this character class will either match
996 /// nothing or only ASCII bytes. Stated differently, this returns false
997 /// if and only if this class contains a non-ASCII byte.
998 pub fn is_all_ascii(&self) -> bool {
999 self.set.intervals().last().map_or(true, |r| r.end <= 0x7F)
1000 }
1001 }
1002
1003 /// An iterator over all ranges in a byte character class.
1004 ///
1005 /// The lifetime `'a` refers to the lifetime of the underlying class.
1006 #[derive(Debug)]
1007 pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>);
1008
1009 impl<'a> Iterator for ClassBytesIter<'a> {
1010 type Item = &'a ClassBytesRange;
1011
1012 fn next(&mut self) -> Option<&'a ClassBytesRange> {
1013 self.0.next()
1014 }
1015 }
1016
1017 /// A single range of characters represented by arbitrary bytes.
1018 ///
1019 /// The range is closed. That is, the start and end of the range are included
1020 /// in the range.
1021 #[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
1022 pub struct ClassBytesRange {
1023 start: u8,
1024 end: u8,
1025 }
1026
1027 impl Interval for ClassBytesRange {
1028 type Bound = u8;
1029
1030 #[inline] fn lower(&self) -> u8 { self.start }
1031 #[inline] fn upper(&self) -> u8 { self.end }
1032 #[inline] fn set_lower(&mut self, bound: u8) { self.start = bound; }
1033 #[inline] fn set_upper(&mut self, bound: u8) { self.end = bound; }
1034
1035 /// Apply simple case folding to this byte range. Only ASCII case mappings
1036 /// (for a-z) are applied.
1037 ///
1038 /// Additional ranges are appended to the given vector. Canonical ordering
1039 /// is *not* maintained in the given vector.
1040 fn case_fold_simple(&self, ranges: &mut Vec<ClassBytesRange>) {
1041 if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) {
1042 let lower = cmp::max(self.start, b'a');
1043 let upper = cmp::min(self.end, b'z');
1044 ranges.push(ClassBytesRange::new(lower - 32, upper - 32));
1045 }
1046 if !ClassBytesRange::new(b'A', b'Z').is_intersection_empty(self) {
1047 let lower = cmp::max(self.start, b'A');
1048 let upper = cmp::min(self.end, b'Z');
1049 ranges.push(ClassBytesRange::new(lower + 32, upper + 32));
1050 }
1051 }
1052 }
1053
1054 impl ClassBytesRange {
1055 /// Create a new byte range for a character class.
1056 ///
1057 /// The returned range is always in a canonical form. That is, the range
1058 /// returned always satisfies the invariant that `start <= end`.
1059 pub fn new(start: u8, end: u8) -> ClassBytesRange {
1060 ClassBytesRange::create(start, end)
1061 }
1062
1063 /// Return the start of this range.
1064 ///
1065 /// The start of a range is always less than or equal to the end of the
1066 /// range.
1067 pub fn start(&self) -> u8 {
1068 self.start
1069 }
1070
1071 /// Return the end of this range.
1072 ///
1073 /// The end of a range is always greater than or equal to the start of the
1074 /// range.
1075 pub fn end(&self) -> u8 {
1076 self.end
1077 }
1078 }
1079
1080 impl fmt::Debug for ClassBytesRange {
1081 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1082 let mut debug = f.debug_struct("ClassBytesRange");
1083 if self.start <= 0x7F {
1084 debug.field("start", &(self.start as char));
1085 } else {
1086 debug.field("start", &self.start);
1087 }
1088 if self.end <= 0x7F {
1089 debug.field("end", &(self.end as char));
1090 } else {
1091 debug.field("end", &self.end);
1092 }
1093 debug.finish()
1094 }
1095 }
1096
1097 /// The high-level intermediate representation for an anchor assertion.
1098 ///
1099 /// A matching anchor assertion is always zero-length.
1100 #[derive(Clone, Debug, Eq, PartialEq)]
1101 pub enum Anchor {
1102 /// Match the beginning of a line or the beginning of text. Specifically,
1103 /// this matches at the starting position of the input, or at the position
1104 /// immediately following a `\n` character.
1105 StartLine,
1106 /// Match the end of a line or the end of text. Specifically,
1107 /// this matches at the end position of the input, or at the position
1108 /// immediately preceding a `\n` character.
1109 EndLine,
1110 /// Match the beginning of text. Specifically, this matches at the starting
1111 /// position of the input.
1112 StartText,
1113 /// Match the end of text. Specifically, this matches at the ending
1114 /// position of the input.
1115 EndText,
1116 }
1117
1118 /// The high-level intermediate representation for a word-boundary assertion.
1119 ///
1120 /// A matching word boundary assertion is always zero-length.
1121 #[derive(Clone, Debug, Eq, PartialEq)]
1122 pub enum WordBoundary {
1123 /// Match a Unicode-aware word boundary. That is, this matches a position
1124 /// where the left adjacent character and right adjacent character
1125 /// correspond to a word and non-word or a non-word and word character.
1126 Unicode,
1127 /// Match a Unicode-aware negation of a word boundary.
1128 UnicodeNegate,
1129 /// Match an ASCII-only word boundary. That is, this matches a position
1130 /// where the left adjacent character and right adjacent character
1131 /// correspond to a word and non-word or a non-word and word character.
1132 Ascii,
1133 /// Match an ASCII-only negation of a word boundary.
1134 AsciiNegate,
1135 }
1136
1137 impl WordBoundary {
1138 /// Returns true if and only if this word boundary assertion is negated.
1139 pub fn is_negated(&self) -> bool {
1140 match *self {
1141 WordBoundary::Unicode | WordBoundary::Ascii => false,
1142 WordBoundary::UnicodeNegate | WordBoundary::AsciiNegate => true,
1143 }
1144 }
1145 }
1146
1147 /// The high-level intermediate representation for a group.
1148 ///
1149 /// This represents one of three possible group types:
1150 ///
1151 /// 1. A non-capturing group (e.g., `(?:expr)`).
1152 /// 2. A capturing group (e.g., `(expr)`).
1153 /// 3. A named capturing group (e.g., `(?P<name>expr)`).
1154 #[derive(Clone, Debug, Eq, PartialEq)]
1155 pub struct Group {
1156 /// The kind of this group. If it is a capturing group, then the kind
1157 /// contains the capture group index (and the name, if it is a named
1158 /// group).
1159 pub kind: GroupKind,
1160 /// The expression inside the capturing group, which may be empty.
1161 pub hir: Box<Hir>,
1162 }
1163
1164 /// The kind of group.
1165 #[derive(Clone, Debug, Eq, PartialEq)]
1166 pub enum GroupKind {
1167 /// A normal unnamed capturing group.
1168 ///
1169 /// The value is the capture index of the group.
1170 CaptureIndex(u32),
1171 /// A named capturing group.
1172 CaptureName {
1173 /// The name of the group.
1174 name: String,
1175 /// The capture index of the group.
1176 index: u32,
1177 },
1178 /// A non-capturing group.
1179 NonCapturing,
1180 }
1181
1182 /// The high-level intermediate representation of a repetition operator.
1183 ///
1184 /// A repetition operator permits the repetition of an arbitrary
1185 /// sub-expression.
1186 #[derive(Clone, Debug, Eq, PartialEq)]
1187 pub struct Repetition {
1188 /// The kind of this repetition operator.
1189 pub kind: RepetitionKind,
1190 /// Whether this repetition operator is greedy or not. A greedy operator
1191 /// will match as much as it can. A non-greedy operator will match as
1192 /// little as it can.
1193 ///
1194 /// Typically, operators are greedy by default and are only non-greedy when
1195 /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is
1196 /// not. However, this can be inverted via the `U` "ungreedy" flag.
1197 pub greedy: bool,
1198 /// The expression being repeated.
1199 pub hir: Box<Hir>,
1200 }
1201
1202 impl Repetition {
1203 /// Returns true if and only if this repetition operator makes it possible
1204 /// to match the empty string.
1205 ///
1206 /// Note that this is not defined inductively. For example, while `a*`
1207 /// will report `true`, `()+` will not, even though `()` matches the empty
1208 /// string and one or more occurrences of something that matches the empty
1209 /// string will always match the empty string. In order to get the
1210 /// inductive definition, see the corresponding method on
1211 /// [`Hir`](struct.Hir.html).
1212 pub fn is_match_empty(&self) -> bool {
1213 match self.kind {
1214 RepetitionKind::ZeroOrOne => true,
1215 RepetitionKind::ZeroOrMore => true,
1216 RepetitionKind::OneOrMore => false,
1217 RepetitionKind::Range(RepetitionRange::Exactly(m)) => m == 0,
1218 RepetitionKind::Range(RepetitionRange::AtLeast(m)) => m == 0,
1219 RepetitionKind::Range(RepetitionRange::Bounded(m, _)) => m == 0,
1220 }
1221 }
1222 }
1223
1224 /// The kind of a repetition operator.
1225 #[derive(Clone, Debug, Eq, PartialEq)]
1226 pub enum RepetitionKind {
1227 /// Matches a sub-expression zero or one times.
1228 ZeroOrOne,
1229 /// Matches a sub-expression zero or more times.
1230 ZeroOrMore,
1231 /// Matches a sub-expression one or more times.
1232 OneOrMore,
1233 /// Matches a sub-expression within a bounded range of times.
1234 Range(RepetitionRange),
1235 }
1236
1237 /// The kind of a counted repetition operator.
1238 #[derive(Clone, Debug, Eq, PartialEq)]
1239 pub enum RepetitionRange {
1240 /// Matches a sub-expression exactly this many times.
1241 Exactly(u32),
1242 /// Matches a sub-expression at least this many times.
1243 AtLeast(u32),
1244 /// Matches a sub-expression at least `m` times and at most `n` times.
1245 Bounded(u32, u32),
1246 }
1247
1248 /// A custom `Drop` impl is used for `HirKind` such that it uses constant stack
1249 /// space but heap space proportional to the depth of the total `Hir`.
1250 impl Drop for Hir {
1251 fn drop(&mut self) {
1252 use std::mem;
1253
1254 match *self.kind() {
1255 HirKind::Empty
1256 | HirKind::Literal(_)
1257 | HirKind::Class(_)
1258 | HirKind::Anchor(_)
1259 | HirKind::WordBoundary(_) => return,
1260 HirKind::Group(ref x) if !x.hir.kind.has_subexprs() => return,
1261 HirKind::Repetition(ref x) if !x.hir.kind.has_subexprs() => return,
1262 HirKind::Concat(ref x) if x.is_empty() => return,
1263 HirKind::Alternation(ref x) if x.is_empty() => return,
1264 _ => {}
1265 }
1266
1267 let mut stack = vec![mem::replace(self, Hir::empty())];
1268 while let Some(mut expr) = stack.pop() {
1269 match expr.kind {
1270 HirKind::Empty
1271 | HirKind::Literal(_)
1272 | HirKind::Class(_)
1273 | HirKind::Anchor(_)
1274 | HirKind::WordBoundary(_) => {}
1275 HirKind::Group(ref mut x) => {
1276 stack.push(mem::replace(&mut x.hir, Hir::empty()));
1277 }
1278 HirKind::Repetition(ref mut x) => {
1279 stack.push(mem::replace(&mut x.hir, Hir::empty()));
1280 }
1281 HirKind::Concat(ref mut x) => {
1282 stack.extend(x.drain(..));
1283 }
1284 HirKind::Alternation(ref mut x) => {
1285 stack.extend(x.drain(..));
1286 }
1287 }
1288 }
1289 }
1290 }
1291
1292 /// A type that documents various attributes of an HIR expression.
1293 ///
1294 /// These attributes are typically defined inductively on the HIR.
1295 #[derive(Clone, Debug, Eq, PartialEq)]
1296 struct HirInfo {
1297 /// Represent yes/no questions by a bitfield to conserve space, since
1298 /// this is included in every HIR expression.
1299 ///
1300 /// If more attributes need to be added, it is OK to increase the size of
1301 /// this as appropriate.
1302 bools: u8,
1303 }
1304
1305 // A simple macro for defining bitfield accessors/mutators.
1306 macro_rules! define_bool {
1307 ($bit:expr, $is_fn_name:ident, $set_fn_name:ident) => {
1308 fn $is_fn_name(&self) -> bool {
1309 self.bools & (0b1 << $bit) > 0
1310 }
1311
1312 fn $set_fn_name(&mut self, yes: bool) {
1313 if yes {
1314 self.bools |= 1 << $bit;
1315 } else {
1316 self.bools &= !(1 << $bit);
1317 }
1318 }
1319 }
1320 }
1321
1322 impl HirInfo {
1323 fn new() -> HirInfo {
1324 HirInfo {
1325 bools: 0,
1326 }
1327 }
1328
1329 define_bool!(0, is_always_utf8, set_always_utf8);
1330 define_bool!(1, is_all_assertions, set_all_assertions);
1331 define_bool!(2, is_anchored_start, set_anchored_start);
1332 define_bool!(3, is_anchored_end, set_anchored_end);
1333 define_bool!(4, is_any_anchored_start, set_any_anchored_start);
1334 define_bool!(5, is_any_anchored_end, set_any_anchored_end);
1335 define_bool!(6, is_match_empty, set_match_empty);
1336 }
1337
1338 #[cfg(test)]
1339 mod tests {
1340 use super::*;
1341
1342 fn uclass(ranges: &[(char, char)]) -> ClassUnicode {
1343 let ranges: Vec<ClassUnicodeRange> = ranges
1344 .iter()
1345 .map(|&(s, e)| ClassUnicodeRange::new(s, e))
1346 .collect();
1347 ClassUnicode::new(ranges)
1348 }
1349
1350 fn bclass(ranges: &[(u8, u8)]) -> ClassBytes {
1351 let ranges: Vec<ClassBytesRange> = ranges
1352 .iter()
1353 .map(|&(s, e)| ClassBytesRange::new(s, e))
1354 .collect();
1355 ClassBytes::new(ranges)
1356 }
1357
1358 fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> {
1359 cls.iter().map(|x| (x.start(), x.end())).collect()
1360 }
1361
1362 fn ucasefold(cls: &ClassUnicode) -> ClassUnicode {
1363 let mut cls_ = cls.clone();
1364 cls_.case_fold_simple();
1365 cls_
1366 }
1367
1368 fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
1369 let mut cls_ = cls1.clone();
1370 cls_.union(cls2);
1371 cls_
1372 }
1373
1374 fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
1375 let mut cls_ = cls1.clone();
1376 cls_.intersect(cls2);
1377 cls_
1378 }
1379
1380 fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
1381 let mut cls_ = cls1.clone();
1382 cls_.difference(cls2);
1383 cls_
1384 }
1385
1386 fn usymdifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
1387 let mut cls_ = cls1.clone();
1388 cls_.symmetric_difference(cls2);
1389 cls_
1390 }
1391
1392 fn unegate(cls: &ClassUnicode) -> ClassUnicode {
1393 let mut cls_ = cls.clone();
1394 cls_.negate();
1395 cls_
1396 }
1397
1398 fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> {
1399 cls.iter().map(|x| (x.start(), x.end())).collect()
1400 }
1401
1402 fn bcasefold(cls: &ClassBytes) -> ClassBytes {
1403 let mut cls_ = cls.clone();
1404 cls_.case_fold_simple();
1405 cls_
1406 }
1407
1408 fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1409 let mut cls_ = cls1.clone();
1410 cls_.union(cls2);
1411 cls_
1412 }
1413
1414 fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1415 let mut cls_ = cls1.clone();
1416 cls_.intersect(cls2);
1417 cls_
1418 }
1419
1420 fn bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1421 let mut cls_ = cls1.clone();
1422 cls_.difference(cls2);
1423 cls_
1424 }
1425
1426 fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1427 let mut cls_ = cls1.clone();
1428 cls_.symmetric_difference(cls2);
1429 cls_
1430 }
1431
1432 fn bnegate(cls: &ClassBytes) -> ClassBytes {
1433 let mut cls_ = cls.clone();
1434 cls_.negate();
1435 cls_
1436 }
1437
1438 #[test]
1439 fn class_range_canonical_unicode() {
1440 let range = ClassUnicodeRange::new('\u{00FF}', '\0');
1441 assert_eq!('\0', range.start());
1442 assert_eq!('\u{00FF}', range.end());
1443 }
1444
1445 #[test]
1446 fn class_range_canonical_bytes() {
1447 let range = ClassBytesRange::new(b'\xFF', b'\0');
1448 assert_eq!(b'\0', range.start());
1449 assert_eq!(b'\xFF', range.end());
1450 }
1451
1452 #[test]
1453 fn class_canonicalize_unicode() {
1454 let cls = uclass(&[('a', 'c'), ('x', 'z')]);
1455 let expected = vec![('a', 'c'), ('x', 'z')];
1456 assert_eq!(expected, uranges(&cls));
1457
1458 let cls = uclass(&[('x', 'z'), ('a', 'c')]);
1459 let expected = vec![('a', 'c'), ('x', 'z')];
1460 assert_eq!(expected, uranges(&cls));
1461
1462 let cls = uclass(&[('x', 'z'), ('w', 'y')]);
1463 let expected = vec![('w', 'z')];
1464 assert_eq!(expected, uranges(&cls));
1465
1466 let cls = uclass(&[
1467 ('c', 'f'), ('a', 'g'), ('d', 'j'), ('a', 'c'),
1468 ('m', 'p'), ('l', 's'),
1469 ]);
1470 let expected = vec![('a', 'j'), ('l', 's')];
1471 assert_eq!(expected, uranges(&cls));
1472
1473 let cls = uclass(&[('x', 'z'), ('u', 'w')]);
1474 let expected = vec![('u', 'z')];
1475 assert_eq!(expected, uranges(&cls));
1476
1477 let cls = uclass(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]);
1478 let expected = vec![('\x00', '\u{10FFFF}')];
1479 assert_eq!(expected, uranges(&cls));
1480
1481
1482 let cls = uclass(&[('a', 'a'), ('b', 'b')]);
1483 let expected = vec![('a', 'b')];
1484 assert_eq!(expected, uranges(&cls));
1485 }
1486
1487 #[test]
1488 fn class_canonicalize_bytes() {
1489 let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
1490 let expected = vec![(b'a', b'c'), (b'x', b'z')];
1491 assert_eq!(expected, branges(&cls));
1492
1493 let cls = bclass(&[(b'x', b'z'), (b'a', b'c')]);
1494 let expected = vec![(b'a', b'c'), (b'x', b'z')];
1495 assert_eq!(expected, branges(&cls));
1496
1497 let cls = bclass(&[(b'x', b'z'), (b'w', b'y')]);
1498 let expected = vec![(b'w', b'z')];
1499 assert_eq!(expected, branges(&cls));
1500
1501 let cls = bclass(&[
1502 (b'c', b'f'), (b'a', b'g'), (b'd', b'j'), (b'a', b'c'),
1503 (b'm', b'p'), (b'l', b's'),
1504 ]);
1505 let expected = vec![(b'a', b'j'), (b'l', b's')];
1506 assert_eq!(expected, branges(&cls));
1507
1508 let cls = bclass(&[(b'x', b'z'), (b'u', b'w')]);
1509 let expected = vec![(b'u', b'z')];
1510 assert_eq!(expected, branges(&cls));
1511
1512 let cls = bclass(&[(b'\x00', b'\xFF'), (b'\x00', b'\xFF')]);
1513 let expected = vec![(b'\x00', b'\xFF')];
1514 assert_eq!(expected, branges(&cls));
1515
1516 let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
1517 let expected = vec![(b'a', b'b')];
1518 assert_eq!(expected, branges(&cls));
1519 }
1520
1521 #[test]
1522 fn class_case_fold_unicode() {
1523 let cls = uclass(&[
1524 ('C', 'F'), ('A', 'G'), ('D', 'J'), ('A', 'C'),
1525 ('M', 'P'), ('L', 'S'), ('c', 'f'),
1526 ]);
1527 let expected = uclass(&[
1528 ('A', 'J'), ('L', 'S'),
1529 ('a', 'j'), ('l', 's'),
1530 ('\u{17F}', '\u{17F}'),
1531 ]);
1532 assert_eq!(expected, ucasefold(&cls));
1533
1534 let cls = uclass(&[('A', 'Z')]);
1535 let expected = uclass(&[
1536 ('A', 'Z'), ('a', 'z'),
1537 ('\u{17F}', '\u{17F}'),
1538 ('\u{212A}', '\u{212A}'),
1539 ]);
1540 assert_eq!(expected, ucasefold(&cls));
1541
1542 let cls = uclass(&[('a', 'z')]);
1543 let expected = uclass(&[
1544 ('A', 'Z'), ('a', 'z'),
1545 ('\u{17F}', '\u{17F}'),
1546 ('\u{212A}', '\u{212A}'),
1547 ]);
1548 assert_eq!(expected, ucasefold(&cls));
1549
1550 let cls = uclass(&[('A', 'A'), ('_', '_')]);
1551 let expected = uclass(&[('A', 'A'), ('_', '_'), ('a', 'a')]);
1552 assert_eq!(expected, ucasefold(&cls));
1553
1554 let cls = uclass(&[('A', 'A'), ('=', '=')]);
1555 let expected = uclass(&[('=', '='), ('A', 'A'), ('a', 'a')]);
1556 assert_eq!(expected, ucasefold(&cls));
1557
1558 let cls = uclass(&[('\x00', '\x10')]);
1559 assert_eq!(cls, ucasefold(&cls));
1560
1561 let cls = uclass(&[('k', 'k')]);
1562 let expected = uclass(&[
1563 ('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'),
1564 ]);
1565 assert_eq!(expected, ucasefold(&cls));
1566
1567 let cls = uclass(&[('@', '@')]);
1568 assert_eq!(cls, ucasefold(&cls));
1569 }
1570
1571 #[test]
1572 fn class_case_fold_bytes() {
1573 let cls = bclass(&[
1574 (b'C', b'F'), (b'A', b'G'), (b'D', b'J'), (b'A', b'C'),
1575 (b'M', b'P'), (b'L', b'S'), (b'c', b'f'),
1576 ]);
1577 let expected = bclass(&[
1578 (b'A', b'J'), (b'L', b'S'),
1579 (b'a', b'j'), (b'l', b's'),
1580 ]);
1581 assert_eq!(expected, bcasefold(&cls));
1582
1583 let cls = bclass(&[(b'A', b'Z')]);
1584 let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
1585 assert_eq!(expected, bcasefold(&cls));
1586
1587 let cls = bclass(&[(b'a', b'z')]);
1588 let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
1589 assert_eq!(expected, bcasefold(&cls));
1590
1591 let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]);
1592 let expected = bclass(&[(b'A', b'A'), (b'_', b'_'), (b'a', b'a')]);
1593 assert_eq!(expected, bcasefold(&cls));
1594
1595 let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]);
1596 let expected = bclass(&[(b'=', b'='), (b'A', b'A'), (b'a', b'a')]);
1597 assert_eq!(expected, bcasefold(&cls));
1598
1599 let cls = bclass(&[(b'\x00', b'\x10')]);
1600 assert_eq!(cls, bcasefold(&cls));
1601
1602 let cls = bclass(&[(b'k', b'k')]);
1603 let expected = bclass(&[(b'K', b'K'), (b'k', b'k')]);
1604 assert_eq!(expected, bcasefold(&cls));
1605
1606 let cls = bclass(&[(b'@', b'@')]);
1607 assert_eq!(cls, bcasefold(&cls));
1608 }
1609
1610 #[test]
1611 fn class_negate_unicode() {
1612 let cls = uclass(&[('a', 'a')]);
1613 let expected = uclass(&[('\x00', '\x60'), ('\x62', '\u{10FFFF}')]);
1614 assert_eq!(expected, unegate(&cls));
1615
1616 let cls = uclass(&[('a', 'a'), ('b', 'b')]);
1617 let expected = uclass(&[('\x00', '\x60'), ('\x63', '\u{10FFFF}')]);
1618 assert_eq!(expected, unegate(&cls));
1619
1620 let cls = uclass(&[('a', 'c'), ('x', 'z')]);
1621 let expected = uclass(&[
1622 ('\x00', '\x60'), ('\x64', '\x77'), ('\x7B', '\u{10FFFF}'),
1623 ]);
1624 assert_eq!(expected, unegate(&cls));
1625
1626 let cls = uclass(&[('\x00', 'a')]);
1627 let expected = uclass(&[('\x62', '\u{10FFFF}')]);
1628 assert_eq!(expected, unegate(&cls));
1629
1630 let cls = uclass(&[('a', '\u{10FFFF}')]);
1631 let expected = uclass(&[('\x00', '\x60')]);
1632 assert_eq!(expected, unegate(&cls));
1633
1634 let cls = uclass(&[('\x00', '\u{10FFFF}')]);
1635 let expected = uclass(&[]);
1636 assert_eq!(expected, unegate(&cls));
1637
1638 let cls = uclass(&[]);
1639 let expected = uclass(&[('\x00', '\u{10FFFF}')]);
1640 assert_eq!(expected, unegate(&cls));
1641
1642 let cls = uclass(&[
1643 ('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}'),
1644 ]);
1645 let expected = uclass(&[('\u{10FFFE}', '\u{10FFFE}')]);
1646 assert_eq!(expected, unegate(&cls));
1647
1648 let cls = uclass(&[('\x00', '\u{D7FF}')]);
1649 let expected = uclass(&[('\u{E000}', '\u{10FFFF}')]);
1650 assert_eq!(expected, unegate(&cls));
1651
1652 let cls = uclass(&[('\x00', '\u{D7FE}')]);
1653 let expected = uclass(&[('\u{D7FF}', '\u{10FFFF}')]);
1654 assert_eq!(expected, unegate(&cls));
1655
1656 let cls = uclass(&[('\u{E000}', '\u{10FFFF}')]);
1657 let expected = uclass(&[('\x00', '\u{D7FF}')]);
1658 assert_eq!(expected, unegate(&cls));
1659
1660 let cls = uclass(&[('\u{E001}', '\u{10FFFF}')]);
1661 let expected = uclass(&[('\x00', '\u{E000}')]);
1662 assert_eq!(expected, unegate(&cls));
1663 }
1664
1665 #[test]
1666 fn class_negate_bytes() {
1667 let cls = bclass(&[(b'a', b'a')]);
1668 let expected = bclass(&[(b'\x00', b'\x60'), (b'\x62', b'\xFF')]);
1669 assert_eq!(expected, bnegate(&cls));
1670
1671 let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
1672 let expected = bclass(&[(b'\x00', b'\x60'), (b'\x63', b'\xFF')]);
1673 assert_eq!(expected, bnegate(&cls));
1674
1675 let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
1676 let expected = bclass(&[
1677 (b'\x00', b'\x60'), (b'\x64', b'\x77'), (b'\x7B', b'\xFF'),
1678 ]);
1679 assert_eq!(expected, bnegate(&cls));
1680
1681 let cls = bclass(&[(b'\x00', b'a')]);
1682 let expected = bclass(&[(b'\x62', b'\xFF')]);
1683 assert_eq!(expected, bnegate(&cls));
1684
1685 let cls = bclass(&[(b'a', b'\xFF')]);
1686 let expected = bclass(&[(b'\x00', b'\x60')]);
1687 assert_eq!(expected, bnegate(&cls));
1688
1689 let cls = bclass(&[(b'\x00', b'\xFF')]);
1690 let expected = bclass(&[]);
1691 assert_eq!(expected, bnegate(&cls));
1692
1693 let cls = bclass(&[]);
1694 let expected = bclass(&[(b'\x00', b'\xFF')]);
1695 assert_eq!(expected, bnegate(&cls));
1696
1697 let cls = bclass(&[(b'\x00', b'\xFD'), (b'\xFF', b'\xFF')]);
1698 let expected = bclass(&[(b'\xFE', b'\xFE')]);
1699 assert_eq!(expected, bnegate(&cls));
1700 }
1701
1702 #[test]
1703 fn class_union_unicode() {
1704 let cls1 = uclass(&[('a', 'g'), ('m', 't'), ('A', 'C')]);
1705 let cls2 = uclass(&[('a', 'z')]);
1706 let expected = uclass(&[('a', 'z'), ('A', 'C')]);
1707 assert_eq!(expected, uunion(&cls1, &cls2));
1708 }
1709
1710 #[test]
1711 fn class_union_bytes() {
1712 let cls1 = bclass(&[(b'a', b'g'), (b'm', b't'), (b'A', b'C')]);
1713 let cls2 = bclass(&[(b'a', b'z')]);
1714 let expected = bclass(&[(b'a', b'z'), (b'A', b'C')]);
1715 assert_eq!(expected, bunion(&cls1, &cls2));
1716 }
1717
1718 #[test]
1719 fn class_intersect_unicode() {
1720 let cls1 = uclass(&[]);
1721 let cls2 = uclass(&[('a', 'a')]);
1722 let expected = uclass(&[]);
1723 assert_eq!(expected, uintersect(&cls1, &cls2));
1724
1725 let cls1 = uclass(&[('a', 'a')]);
1726 let cls2 = uclass(&[('a', 'a')]);
1727 let expected = uclass(&[('a', 'a')]);
1728 assert_eq!(expected, uintersect(&cls1, &cls2));
1729
1730 let cls1 = uclass(&[('a', 'a')]);
1731 let cls2 = uclass(&[('b', 'b')]);
1732 let expected = uclass(&[]);
1733 assert_eq!(expected, uintersect(&cls1, &cls2));
1734
1735 let cls1 = uclass(&[('a', 'a')]);
1736 let cls2 = uclass(&[('a', 'c')]);
1737 let expected = uclass(&[('a', 'a')]);
1738 assert_eq!(expected, uintersect(&cls1, &cls2));
1739
1740 let cls1 = uclass(&[('a', 'b')]);
1741 let cls2 = uclass(&[('a', 'c')]);
1742 let expected = uclass(&[('a', 'b')]);
1743 assert_eq!(expected, uintersect(&cls1, &cls2));
1744
1745 let cls1 = uclass(&[('a', 'b')]);
1746 let cls2 = uclass(&[('b', 'c')]);
1747 let expected = uclass(&[('b', 'b')]);
1748 assert_eq!(expected, uintersect(&cls1, &cls2));
1749
1750 let cls1 = uclass(&[('a', 'b')]);
1751 let cls2 = uclass(&[('c', 'd')]);
1752 let expected = uclass(&[]);
1753 assert_eq!(expected, uintersect(&cls1, &cls2));
1754
1755 let cls1 = uclass(&[('b', 'c')]);
1756 let cls2 = uclass(&[('a', 'd')]);
1757 let expected = uclass(&[('b', 'c')]);
1758 assert_eq!(expected, uintersect(&cls1, &cls2));
1759
1760 let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
1761 let cls2 = uclass(&[('a', 'h')]);
1762 let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
1763 assert_eq!(expected, uintersect(&cls1, &cls2));
1764
1765 let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
1766 let cls2 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
1767 let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
1768 assert_eq!(expected, uintersect(&cls1, &cls2));
1769
1770 let cls1 = uclass(&[('a', 'b'), ('g', 'h')]);
1771 let cls2 = uclass(&[('d', 'e'), ('k', 'l')]);
1772 let expected = uclass(&[]);
1773 assert_eq!(expected, uintersect(&cls1, &cls2));
1774
1775 let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
1776 let cls2 = uclass(&[('h', 'h')]);
1777 let expected = uclass(&[('h', 'h')]);
1778 assert_eq!(expected, uintersect(&cls1, &cls2));
1779
1780 let cls1 = uclass(&[('a', 'b'), ('e', 'f'), ('i', 'j')]);
1781 let cls2 = uclass(&[('c', 'd'), ('g', 'h'), ('k', 'l')]);
1782 let expected = uclass(&[]);
1783 assert_eq!(expected, uintersect(&cls1, &cls2));
1784
1785 let cls1 = uclass(&[('a', 'b'), ('c', 'd'), ('e', 'f')]);
1786 let cls2 = uclass(&[('b', 'c'), ('d', 'e'), ('f', 'g')]);
1787 let expected = uclass(&[('b', 'f')]);
1788 assert_eq!(expected, uintersect(&cls1, &cls2));
1789 }
1790
1791 #[test]
1792 fn class_intersect_bytes() {
1793 let cls1 = bclass(&[]);
1794 let cls2 = bclass(&[(b'a', b'a')]);
1795 let expected = bclass(&[]);
1796 assert_eq!(expected, bintersect(&cls1, &cls2));
1797
1798 let cls1 = bclass(&[(b'a', b'a')]);
1799 let cls2 = bclass(&[(b'a', b'a')]);
1800 let expected = bclass(&[(b'a', b'a')]);
1801 assert_eq!(expected, bintersect(&cls1, &cls2));
1802
1803 let cls1 = bclass(&[(b'a', b'a')]);
1804 let cls2 = bclass(&[(b'b', b'b')]);
1805 let expected = bclass(&[]);
1806 assert_eq!(expected, bintersect(&cls1, &cls2));
1807
1808 let cls1 = bclass(&[(b'a', b'a')]);
1809 let cls2 = bclass(&[(b'a', b'c')]);
1810 let expected = bclass(&[(b'a', b'a')]);
1811 assert_eq!(expected, bintersect(&cls1, &cls2));
1812
1813 let cls1 = bclass(&[(b'a', b'b')]);
1814 let cls2 = bclass(&[(b'a', b'c')]);
1815 let expected = bclass(&[(b'a', b'b')]);
1816 assert_eq!(expected, bintersect(&cls1, &cls2));
1817
1818 let cls1 = bclass(&[(b'a', b'b')]);
1819 let cls2 = bclass(&[(b'b', b'c')]);
1820 let expected = bclass(&[(b'b', b'b')]);
1821 assert_eq!(expected, bintersect(&cls1, &cls2));
1822
1823 let cls1 = bclass(&[(b'a', b'b')]);
1824 let cls2 = bclass(&[(b'c', b'd')]);
1825 let expected = bclass(&[]);
1826 assert_eq!(expected, bintersect(&cls1, &cls2));
1827
1828 let cls1 = bclass(&[(b'b', b'c')]);
1829 let cls2 = bclass(&[(b'a', b'd')]);
1830 let expected = bclass(&[(b'b', b'c')]);
1831 assert_eq!(expected, bintersect(&cls1, &cls2));
1832
1833 let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
1834 let cls2 = bclass(&[(b'a', b'h')]);
1835 let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
1836 assert_eq!(expected, bintersect(&cls1, &cls2));
1837
1838 let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
1839 let cls2 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
1840 let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
1841 assert_eq!(expected, bintersect(&cls1, &cls2));
1842
1843 let cls1 = bclass(&[(b'a', b'b'), (b'g', b'h')]);
1844 let cls2 = bclass(&[(b'd', b'e'), (b'k', b'l')]);
1845 let expected = bclass(&[]);
1846 assert_eq!(expected, bintersect(&cls1, &cls2));
1847
1848 let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
1849 let cls2 = bclass(&[(b'h', b'h')]);
1850 let expected = bclass(&[(b'h', b'h')]);
1851 assert_eq!(expected, bintersect(&cls1, &cls2));
1852
1853 let cls1 = bclass(&[(b'a', b'b'), (b'e', b'f'), (b'i', b'j')]);
1854 let cls2 = bclass(&[(b'c', b'd'), (b'g', b'h'), (b'k', b'l')]);
1855 let expected = bclass(&[]);
1856 assert_eq!(expected, bintersect(&cls1, &cls2));
1857
1858 let cls1 = bclass(&[(b'a', b'b'), (b'c', b'd'), (b'e', b'f')]);
1859 let cls2 = bclass(&[(b'b', b'c'), (b'd', b'e'), (b'f', b'g')]);
1860 let expected = bclass(&[(b'b', b'f')]);
1861 assert_eq!(expected, bintersect(&cls1, &cls2));
1862 }
1863
1864 #[test]
1865 fn class_difference_unicode() {
1866 let cls1 = uclass(&[('a', 'a')]);
1867 let cls2 = uclass(&[('a', 'a')]);
1868 let expected = uclass(&[]);
1869 assert_eq!(expected, udifference(&cls1, &cls2));
1870
1871 let cls1 = uclass(&[('a', 'a')]);
1872 let cls2 = uclass(&[]);
1873 let expected = uclass(&[('a', 'a')]);
1874 assert_eq!(expected, udifference(&cls1, &cls2));
1875
1876 let cls1 = uclass(&[]);
1877 let cls2 = uclass(&[('a', 'a')]);
1878 let expected = uclass(&[]);
1879 assert_eq!(expected, udifference(&cls1, &cls2));
1880
1881 let cls1 = uclass(&[('a', 'z')]);
1882 let cls2 = uclass(&[('a', 'a')]);
1883 let expected = uclass(&[('b', 'z')]);
1884 assert_eq!(expected, udifference(&cls1, &cls2));
1885
1886 let cls1 = uclass(&[('a', 'z')]);
1887 let cls2 = uclass(&[('z', 'z')]);
1888 let expected = uclass(&[('a', 'y')]);
1889 assert_eq!(expected, udifference(&cls1, &cls2));
1890
1891 let cls1 = uclass(&[('a', 'z')]);
1892 let cls2 = uclass(&[('m', 'm')]);
1893 let expected = uclass(&[('a', 'l'), ('n', 'z')]);
1894 assert_eq!(expected, udifference(&cls1, &cls2));
1895
1896 let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
1897 let cls2 = uclass(&[('a', 'z')]);
1898 let expected = uclass(&[]);
1899 assert_eq!(expected, udifference(&cls1, &cls2));
1900
1901 let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
1902 let cls2 = uclass(&[('d', 'v')]);
1903 let expected = uclass(&[('a', 'c')]);
1904 assert_eq!(expected, udifference(&cls1, &cls2));
1905
1906 let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
1907 let cls2 = uclass(&[('b', 'g'), ('s', 'u')]);
1908 let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
1909 assert_eq!(expected, udifference(&cls1, &cls2));
1910
1911 let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
1912 let cls2 = uclass(&[('b', 'd'), ('e', 'g'), ('s', 'u')]);
1913 let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
1914 assert_eq!(expected, udifference(&cls1, &cls2));
1915
1916 let cls1 = uclass(&[('x', 'z')]);
1917 let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
1918 let expected = uclass(&[('x', 'z')]);
1919 assert_eq!(expected, udifference(&cls1, &cls2));
1920
1921 let cls1 = uclass(&[('a', 'z')]);
1922 let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
1923 let expected = uclass(&[('d', 'd'), ('h', 'r'), ('v', 'z')]);
1924 assert_eq!(expected, udifference(&cls1, &cls2));
1925 }
1926
1927 #[test]
1928 fn class_difference_bytes() {
1929 let cls1 = bclass(&[(b'a', b'a')]);
1930 let cls2 = bclass(&[(b'a', b'a')]);
1931 let expected = bclass(&[]);
1932 assert_eq!(expected, bdifference(&cls1, &cls2));
1933
1934 let cls1 = bclass(&[(b'a', b'a')]);
1935 let cls2 = bclass(&[]);
1936 let expected = bclass(&[(b'a', b'a')]);
1937 assert_eq!(expected, bdifference(&cls1, &cls2));
1938
1939 let cls1 = bclass(&[]);
1940 let cls2 = bclass(&[(b'a', b'a')]);
1941 let expected = bclass(&[]);
1942 assert_eq!(expected, bdifference(&cls1, &cls2));
1943
1944 let cls1 = bclass(&[(b'a', b'z')]);
1945 let cls2 = bclass(&[(b'a', b'a')]);
1946 let expected = bclass(&[(b'b', b'z')]);
1947 assert_eq!(expected, bdifference(&cls1, &cls2));
1948
1949 let cls1 = bclass(&[(b'a', b'z')]);
1950 let cls2 = bclass(&[(b'z', b'z')]);
1951 let expected = bclass(&[(b'a', b'y')]);
1952 assert_eq!(expected, bdifference(&cls1, &cls2));
1953
1954 let cls1 = bclass(&[(b'a', b'z')]);
1955 let cls2 = bclass(&[(b'm', b'm')]);
1956 let expected = bclass(&[(b'a', b'l'), (b'n', b'z')]);
1957 assert_eq!(expected, bdifference(&cls1, &cls2));
1958
1959 let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
1960 let cls2 = bclass(&[(b'a', b'z')]);
1961 let expected = bclass(&[]);
1962 assert_eq!(expected, bdifference(&cls1, &cls2));
1963
1964 let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
1965 let cls2 = bclass(&[(b'd', b'v')]);
1966 let expected = bclass(&[(b'a', b'c')]);
1967 assert_eq!(expected, bdifference(&cls1, &cls2));
1968
1969 let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
1970 let cls2 = bclass(&[(b'b', b'g'), (b's', b'u')]);
1971 let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
1972 assert_eq!(expected, bdifference(&cls1, &cls2));
1973
1974 let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
1975 let cls2 = bclass(&[(b'b', b'd'), (b'e', b'g'), (b's', b'u')]);
1976 let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
1977 assert_eq!(expected, bdifference(&cls1, &cls2));
1978
1979 let cls1 = bclass(&[(b'x', b'z')]);
1980 let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
1981 let expected = bclass(&[(b'x', b'z')]);
1982 assert_eq!(expected, bdifference(&cls1, &cls2));
1983
1984 let cls1 = bclass(&[(b'a', b'z')]);
1985 let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
1986 let expected = bclass(&[(b'd', b'd'), (b'h', b'r'), (b'v', b'z')]);
1987 assert_eq!(expected, bdifference(&cls1, &cls2));
1988 }
1989
1990 #[test]
1991 fn class_symmetric_difference_unicode() {
1992 let cls1 = uclass(&[('a', 'm')]);
1993 let cls2 = uclass(&[('g', 't')]);
1994 let expected = uclass(&[('a', 'f'), ('n', 't')]);
1995 assert_eq!(expected, usymdifference(&cls1, &cls2));
1996 }
1997
1998 #[test]
1999 fn class_symmetric_difference_bytes() {
2000 let cls1 = bclass(&[(b'a', b'm')]);
2001 let cls2 = bclass(&[(b'g', b't')]);
2002 let expected = bclass(&[(b'a', b'f'), (b'n', b't')]);
2003 assert_eq!(expected, bsymdifference(&cls1, &cls2));
2004 }
2005
2006 #[test]
2007 #[should_panic]
2008 fn hir_byte_literal_non_ascii() {
2009 Hir::literal(Literal::Byte(b'a'));
2010 }
2011
2012 // We use a thread with an explicit stack size to test that our destructor
2013 // for Hir can handle arbitrarily sized expressions in constant stack
2014 // space. In case we run on a platform without threads (WASM?), we limit
2015 // this test to Windows/Unix.
2016 #[test]
2017 #[cfg(any(unix, windows))]
2018 fn no_stack_overflow_on_drop() {
2019 use std::thread;
2020
2021 let run = || {
2022 let mut expr = Hir::empty();
2023 for _ in 0..100 {
2024 expr = Hir::group(Group {
2025 kind: GroupKind::NonCapturing,
2026 hir: Box::new(expr),
2027 });
2028 expr = Hir::repetition(Repetition {
2029 kind: RepetitionKind::ZeroOrOne,
2030 greedy: true,
2031 hir: Box::new(expr),
2032 });
2033
2034 expr = Hir {
2035 kind: HirKind::Concat(vec![expr]),
2036 info: HirInfo::new(),
2037 };
2038 expr = Hir {
2039 kind: HirKind::Alternation(vec![expr]),
2040 info: HirInfo::new(),
2041 };
2042 }
2043 assert!(!expr.kind.is_empty());
2044 };
2045
2046 // We run our test on a thread with a small stack size so we can
2047 // force the issue more easily.
2048 thread::Builder::new()
2049 .stack_size(1<<10)
2050 .spawn(run)
2051 .unwrap()
2052 .join()
2053 .unwrap();
2054 }
2055 }