vendor/regex/tests/bytes.rs

   1 // These are tests specifically crafted for regexes that can match arbitrary
   2 // bytes.
   3
   4 // A silly wrapper to make it possible to write and match raw bytes.
   5 struct R<'a>(&'a [u8]);
   6 impl<'a> R<'a> {
   7     fn as_bytes(&self) -> &'a [u8] {
   8         self.0
   9     }
  10 }
  11
// Word-boundary assertions on a space followed by the non-ASCII letter `δ`.
// The `(?-u)` variants turn Unicode mode off inside the pattern; the variants
// without it are only compiled when the `unicode-perl` feature is enabled.
// NOTE(review): `mat!` is defined outside this file. Its trailing argument
// looks like the expected byte span of the match (`None` = no match) —
// confirm against the macro definition.
mat!(word_boundary, r"(?-u) \b", " δ", None);
#[cfg(feature = "unicode-perl")]
mat!(word_boundary_unicode, r" \b", " δ", Some((0, 1)));
mat!(word_not_boundary, r"(?-u) \B", " δ", Some((0, 1)));
#[cfg(feature = "unicode-perl")]
mat!(word_not_boundary_unicode, r" \B", " δ", None);
  18
// Perl character classes (`\w`, `\d`, `\s`) with Unicode mode off (`(?-u)`)
// and on. Each input starts with one ASCII character and continues with
// non-ASCII ones, so the two modes differ in how far the match can extend:
//   - "aδ" is 3 bytes (`δ` encodes as 2 bytes in UTF-8),
//   - "1२३9" is 8 bytes (each Devanagari digit encodes as 3 bytes),
//   - " \u{1680}" is 4 bytes (U+1680 encodes as 3 bytes).
// The Unicode variants are only compiled with the `unicode-perl` feature.
// NOTE(review): the trailing `Some((start, end))` is presumably the expected
// byte span of the match — confirm against the `mat!` definition.
mat!(perl_w_ascii, r"(?-u)\w+", "aδ", Some((0, 1)));
#[cfg(feature = "unicode-perl")]
mat!(perl_w_unicode, r"\w+", "aδ", Some((0, 3)));
mat!(perl_d_ascii, r"(?-u)\d+", "1२३9", Some((0, 1)));
#[cfg(feature = "unicode-perl")]
mat!(perl_d_unicode, r"\d+", "1२३9", Some((0, 8)));
mat!(perl_s_ascii, r"(?-u)\s+", " \u{1680}", Some((0, 1)));
#[cfg(feature = "unicode-perl")]
mat!(perl_s_unicode, r"\s+", " \u{1680}", Some((0, 4)));
  28
// The first `(.+)` matches two Unicode codepoints, but can't match the 5th
// byte, which isn't valid UTF-8. The second (byte based) `(.+)` takes over and
// matches.
//
// The input is `\xCE\x93` (U+0393), `\xCE\x94` (U+0394) and then the lone
// byte `\xFF`. NOTE(review): the three spans presumably correspond to the
// overall match followed by capture groups 1 and 2 — confirm against the
// `mat!` definition.
mat!(
    mixed1,
    r"(.+)(?-u)(.+)",
    R(b"\xCE\x93\xCE\x94\xFF"),
    Some((0, 5)),
    Some((0, 4)),
    Some((4, 5))
);
  40
// Case-insensitive matching with Unicode mode off (`(?i-u)`) and on (`(?i)`).
// U+212A (KELVIN SIGN) encodes as 3 bytes in UTF-8, so "aA\u{212A}aA" is
// 7 bytes long: the Unicode case expects the whole input, while the
// non-Unicode case expects only the leading two ASCII letters. The Unicode
// variant is only compiled with the `unicode-case` feature.
// NOTE(review): the trailing `Some((start, end))` is presumably the expected
// byte span of the match — confirm against the `mat!` definition.
mat!(case_ascii_one, r"(?i-u)a", "A", Some((0, 1)));
mat!(case_ascii_class, r"(?i-u)[a-z]+", "AaAaA", Some((0, 5)));
#[cfg(feature = "unicode-case")]
mat!(case_unicode, r"(?i)[a-z]+", "aA\u{212A}aA", Some((0, 7)));
mat!(case_not_unicode, r"(?i-u)[a-z]+", "aA\u{212A}aA", Some((0, 2)));
  46
// A negated class against `δ`, which encodes as 2 bytes in UTF-8: the
// Unicode-mode case expects a 2-byte span (the whole codepoint), the `(?-u)`
// case expects a 1-byte span (just its first byte).
mat!(negate_unicode, r"[^a]", "δ", Some((0, 2)));
mat!(negate_not_unicode, r"(?-u)[^a]", "δ", Some((0, 1)));
  49
// This doesn't match in a normal Unicode regex because the implicit preceding
// `.*?` is Unicode aware.
//
// The input is the invalid UTF-8 byte `\xFF` followed by `a`; both cases
// expect the span (1, 2), i.e. the `a` after the invalid byte.
// NOTE(review): "normal Unicode regex" presumably refers to the string-based
// regex API rather than the byte-based one exercised here — confirm.
mat!(dotstar_prefix_not_unicode1, r"(?-u)a", R(b"\xFFa"), Some((1, 2)));
mat!(dotstar_prefix_not_unicode2, r"a", R(b"\xFFa"), Some((1, 2)));
  54
// Have fun with null bytes.
//
// The pattern captures one or more non-NUL bytes into the named group `cstr`
// and then requires a terminating NUL byte. The input is the 4 bytes
// `foo\x00`. NOTE(review): the two spans presumably correspond to the overall
// match and the `cstr` group — confirm against the `mat!` definition.
mat!(
    null_bytes,
    r"(?-u)(?P<cstr>[^\x00]+)\x00",
    R(b"foo\x00"),
    Some((0, 4)),
    Some((0, 3))
);
  63
// Test that lookahead operators work properly in the face of invalid UTF-8.
// See: https://github.com/rust-lang/regex/issues/277
//
// All three cases share the same 22-byte haystack, which begins and ends with
// bytes that are not valid UTF-8, and use patterns containing the `^` / `$`
// anchors. An expected (0, 0) is the empty position at the very start of the
// haystack and (22, 22) is the empty position at its very end.
// NOTE(review): `matiter!` is defined outside this file; the trailing tuples
// look like the expected (start, end) byte offsets of each successive match
// — confirm against the macro definition.
matiter!(
    invalidutf8_anchor1,
    r"(?-u)\xcc?^",
    R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"),
    (0, 0)
);
matiter!(
    invalidutf8_anchor2,
    r"(?-u)^\xf7|4\xff\d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########[] d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########\[] #####\x80\S7|$",
    R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"),
    (22, 22)
);
matiter!(
    invalidutf8_anchor3,
    r"(?-u)^|ddp\xff\xffdddddlQd@\x80",
    R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"),
    (0, 0)
);
  84
  85 // See https://github.com/rust-lang/regex/issues/303
  86 #[test]
  87 fn negated_full_byte_range() {
  88     assert!(::regex::bytes::Regex::new(r#"(?-u)[^\x00-\xff]"#).is_err());
  89 }
  90
// ASCII-only "not a word boundary" assertions (`(?-u:\B)`) on inputs that
// contain multi-byte codepoints.
//
// `word_boundary_ascii1` lists no expected spans. NOTE(review): presumably
// that means no matches are expected — confirm against the `matiter!`
// definition.
//
// In `word_boundary_ascii2` the input is `0` followed by U+7EF5E, which
// encodes as 4 bytes in UTF-8 (5 bytes total). The expected empty spans at
// offsets 2, 3 and 4 therefore fall between the bytes of that one codepoint,
// and offset 5 is the end of the input.
matiter!(word_boundary_ascii1, r"(?-u:\B)x(?-u:\B)", "áxβ");
matiter!(
    word_boundary_ascii2,
    r"(?-u:\B)",
    "0\u{7EF5E}",
    (2, 2),
    (3, 3),
    (4, 4),
    (5, 5)
);
 101
// See: https://github.com/rust-lang/regex/issues/264
//
// ASCII-only `\B` against a single codepoint that encodes as 4 bytes in
// UTF-8, once bare and once wrapped in a capture group. Both cases expect the
// empty span at offset 0.
mat!(ascii_boundary_no_capture, r"(?-u)\B", "\u{28f3e}", Some((0, 0)));
mat!(ascii_boundary_capture, r"(?-u)(\B)", "\u{28f3e}", Some((0, 0)));
 105
// See: https://github.com/rust-lang/regex/issues/271
//
// `$` followed by an ASCII-only `\B`. The input is two codepoints that each
// encode as 4 bytes in UTF-8, so the expected span (8, 8) is the empty
// position at the very end of the input.
mat!(end_not_wb, r"$(?-u:\B)", "\u{5c124}\u{b576c}", Some((8, 8)));