// These are tests specifically crafted for regexes that can match arbitrary
// bytes.
// A deliberately tiny newtype around a borrowed byte slice. It lets a test
// spell its haystack as a byte-string literal (which may contain invalid
// UTF-8) instead of a `&str`.
struct R<'bytes>(&'bytes [u8]);
7 fn as_bytes(&self) -> &'a
[u8] {
12 mat
!(word_boundary
, r
"(?-u) \b", " δ", None
);
13 #[cfg(feature = "unicode-perl")]
14 mat
!(word_boundary_unicode
, r
" \b", " δ", Some((0, 1)));
15 mat
!(word_not_boundary
, r
"(?-u) \B", " δ", Some((0, 1)));
16 #[cfg(feature = "unicode-perl")]
17 mat
!(word_not_boundary_unicode
, r
" \B", " δ", None
);
19 mat
!(perl_w_ascii
, r
"(?-u)\w+", "aδ", Some((0, 1)));
20 #[cfg(feature = "unicode-perl")]
21 mat
!(perl_w_unicode
, r
"\w+", "aδ", Some((0, 3)));
22 mat
!(perl_d_ascii
, r
"(?-u)\d+", "1२३9", Some((0, 1)));
23 #[cfg(feature = "unicode-perl")]
24 mat
!(perl_d_unicode
, r
"\d+", "1२३9", Some((0, 8)));
25 mat
!(perl_s_ascii
, r
"(?-u)\s+", " \u{1680}", Some((0, 1)));
26 #[cfg(feature = "unicode-perl")]
27 mat
!(perl_s_unicode
, r
"\s+", " \u{1680}", Some((0, 4)));
// The first `(.+)` matches two Unicode codepoints, but can't match the 5th
// byte, which isn't valid UTF-8. The second (byte based) `(.+)` takes over and
// matches it.
35 R(b
"\xCE\x93\xCE\x94\xFF"),
41 mat
!(case_ascii_one
, r
"(?i-u)a", "A", Some((0, 1)));
42 mat
!(case_ascii_class
, r
"(?i-u)[a-z]+", "AaAaA", Some((0, 5)));
43 #[cfg(feature = "unicode-case")]
44 mat
!(case_unicode
, r
"(?i)[a-z]+", "aA\u{212A}aA", Some((0, 7)));
45 mat
!(case_not_unicode
, r
"(?i-u)[a-z]+", "aA\u{212A}aA", Some((0, 2)));
47 mat
!(negate_unicode
, r
"[^a]", "δ", Some((0, 2)));
48 mat
!(negate_not_unicode
, r
"(?-u)[^a]", "δ", Some((0, 1)));
50 // This doesn't match in a normal Unicode regex because the implicit preceding
51 // `.*?` is Unicode aware.
52 mat
!(dotstar_prefix_not_unicode1
, r
"(?-u)a", R(b
"\xFFa"), Some((1, 2)));
53 mat
!(dotstar_prefix_not_unicode2
, r
"a", R(b
"\xFFa"), Some((1, 2)));
55 // Have fun with null bytes.
58 r
"(?-u)(?P<cstr>[^\x00]+)\x00",
64 // Test that lookahead operators work properly in the face of invalid UTF-8.
65 // See: https://github.com/rust-lang/regex/issues/277
69 R(b
"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"),
74 r
"(?-u)^\xf7|4\xff\d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########[] d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########\[] #####\x80\S7|$",
75 R(b
"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"),
80 r
"(?-u)^|ddp\xff\xffdddddlQd@\x80",
81 R(b
"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"),
85 // See https://github.com/rust-lang/regex/issues/303
87 fn negated_full_byte_range() {
88 assert
!(::regex
::bytes
::Regex
::new(r
#"(?-u)[^\x00-\xff]"#).is_err());
91 matiter
!(word_boundary_ascii1
, r
"(?-u:\B)x(?-u:\B)", "áxβ");
102 // See: https://github.com/rust-lang/regex/issues/264
103 mat
!(ascii_boundary_no_capture
, r
"(?-u)\B", "\u{28f3e}", Some((0, 0)));
104 mat
!(ascii_boundary_capture
, r
"(?-u)(\B)", "\u{28f3e}", Some((0, 0)));
106 // See: https://github.com/rust-lang/regex/issues/271
107 mat
!(end_not_wb
, r
"$(?-u:\B)", "\u{5c124}\u{b576c}", Some((8, 8)));