1 mat
!(uni_literal
, r
"☃", "☃", Some((0, 3)));
2 mat
!(uni_literal_plus
, r
"☃+", "☃", Some((0, 3)));
3 mat
!(uni_literal_casei_plus
, r
"(?i)☃+", "☃", Some((0, 3)));
4 mat
!(uni_class_plus
, r
"[☃Ⅰ]+", "☃", Some((0, 3)));
5 mat
!(uni_one
, r
"\pN", "Ⅰ", Some((0, 3)));
6 mat
!(uni_mixed
, r
"\pN+", "Ⅰ1Ⅱ2", Some((0, 8)));
7 mat
!(uni_not
, r
"\PN+", "abⅠ", Some((0, 2)));
8 mat
!(uni_not_class
, r
"[\PN]+", "abⅠ", Some((0, 2)));
9 mat
!(uni_not_class_neg
, r
"[^\PN]+", "abⅠ", Some((2, 5)));
10 mat
!(uni_case
, r
"(?i)Δ", "δ", Some((0, 2)));
11 mat
!(uni_case_upper
, r
"\p{Lu}+", "ΛΘΓΔα", Some((0, 8)));
12 mat
!(uni_case_upper_nocase_flag
, r
"(?i)\p{Lu}+", "ΛΘΓΔα", Some((0, 10)));
13 mat
!(uni_case_upper_nocase
, r
"\p{L}+", "ΛΘΓΔα", Some((0, 10)));
14 mat
!(uni_case_lower
, r
"\p{Ll}+", "ΛΘΓΔα", Some((8, 10)));
16 // Test the Unicode friendliness of Perl character classes.
17 mat
!(uni_perl_w
, r
"\w+", "dδd", Some((0, 4)));
18 mat
!(uni_perl_w_not
, r
"\w+", "⥡", None
);
19 mat
!(uni_perl_w_neg
, r
"\W+", "⥡", Some((0, 3)));
20 mat
!(uni_perl_d
, r
"\d+", "1२३9", Some((0, 8)));
21 mat
!(uni_perl_d_not
, r
"\d+", "Ⅱ", None
);
22 mat
!(uni_perl_d_neg
, r
"\D+", "Ⅱ", Some((0, 3)));
23 mat
!(uni_perl_s
, r
"\s+", " ", Some((0, 3)));
24 mat
!(uni_perl_s_not
, r
"\s+", "☃", None
);
25 mat
!(uni_perl_s_neg
, r
"\S+", "☃", Some((0, 3)));
27 // And do the same for word boundaries.
28 mat
!(uni_boundary_none
, r
"\d\b", "6δ", None
);
29 mat
!(uni_boundary_ogham
, r
"\d\b", "6 ", Some((0, 1)));
30 mat
!(uni_not_boundary_none
, r
"\d\B", "6δ", Some((0, 1)));
31 mat
!(uni_not_boundary_ogham
, r
"\d\B", "6 ", None
);
33 // Test general categories.
// We should test more, but there are a lot. Write a script to generate more of
// these tests.
37 mat
!(uni_class_gencat_cased_letter
, r
"\p{Cased_Letter}", "A", Some((0, 3)));
39 uni_class_gencat_close_punctuation
,
40 r
"\p{Close_Punctuation}",
45 uni_class_gencat_connector_punctuation
,
46 r
"\p{Connector_Punctuation}",
50 mat
!(uni_class_gencat_control
, r
"\p{Control}", "\u{9f}", Some((0, 2)));
52 uni_class_gencat_currency_symbol
,
53 r
"\p{Currency_Symbol}",
58 uni_class_gencat_dash_punctuation
,
59 r
"\p{Dash_Punctuation}",
63 mat
!(uni_class_gencat_decimal_numer
, r
"\p{Decimal_Number}", "𑓙", Some((0, 4)));
65 uni_class_gencat_enclosing_mark
,
66 r
"\p{Enclosing_Mark}",
71 uni_class_gencat_final_punctuation
,
72 r
"\p{Final_Punctuation}",
76 mat
!(uni_class_gencat_format
, r
"\p{Format}", "\u{E007F}", Some((0, 4)));
77 // See: https://github.com/rust-lang/regex/issues/719
78 mat
!(uni_class_gencat_format_abbrev1
, r
"\p{cf}", "\u{E007F}", Some((0, 4)));
79 mat
!(uni_class_gencat_format_abbrev2
, r
"\p{gc=cf}", "\u{E007F}", Some((0, 4)));
81 uni_class_gencat_initial_punctuation
,
82 r
"\p{Initial_Punctuation}",
86 mat
!(uni_class_gencat_letter
, r
"\p{Letter}", "Έ", Some((0, 2)));
87 mat
!(uni_class_gencat_letter_number
, r
"\p{Letter_Number}", "ↂ", Some((0, 3)));
89 uni_class_gencat_line_separator
,
90 r
"\p{Line_Separator}",
95 uni_class_gencat_lowercase_letter
,
96 r
"\p{Lowercase_Letter}",
100 mat
!(uni_class_gencat_mark
, r
"\p{Mark}", "\u{E01EF}", Some((0, 4)));
101 mat
!(uni_class_gencat_math
, r
"\p{Math}", "⋿", Some((0, 3)));
103 uni_class_gencat_modifier_letter
,
104 r
"\p{Modifier_Letter}",
109 uni_class_gencat_modifier_symbol
,
110 r
"\p{Modifier_Symbol}",
115 uni_class_gencat_nonspacing_mark
,
116 r
"\p{Nonspacing_Mark}",
120 mat
!(uni_class_gencat_number
, r
"\p{Number}", "⓿", Some((0, 3)));
122 uni_class_gencat_open_punctuation
,
123 r
"\p{Open_Punctuation}",
127 mat
!(uni_class_gencat_other
, r
"\p{Other}", "\u{bc9}", Some((0, 3)));
128 mat
!(uni_class_gencat_other_letter
, r
"\p{Other_Letter}", "ꓷ", Some((0, 3)));
129 mat
!(uni_class_gencat_other_number
, r
"\p{Other_Number}", "㉏", Some((0, 3)));
131 uni_class_gencat_other_punctuation
,
132 r
"\p{Other_Punctuation}",
136 mat
!(uni_class_gencat_other_symbol
, r
"\p{Other_Symbol}", "⅌", Some((0, 3)));
138 uni_class_gencat_paragraph_separator
,
139 r
"\p{Paragraph_Separator}",
144 uni_class_gencat_private_use
,
149 mat
!(uni_class_gencat_punctuation
, r
"\p{Punctuation}", "𑁍", Some((0, 4)));
150 mat
!(uni_class_gencat_separator
, r
"\p{Separator}", "\u{3000}", Some((0, 3)));
152 uni_class_gencat_space_separator
,
153 r
"\p{Space_Separator}",
158 uni_class_gencat_spacing_mark
,
163 mat
!(uni_class_gencat_symbol
, r
"\p{Symbol}", "⯈", Some((0, 3)));
165 uni_class_gencat_titlecase_letter
,
166 r
"\p{Titlecase_Letter}",
171 uni_class_gencat_unassigned
,
177 uni_class_gencat_uppercase_letter
,
178 r
"\p{Uppercase_Letter}",
183 // Test a smattering of properties.
184 mat
!(uni_class_prop_emoji1
, r
"\p{Emoji}", "\u{23E9}", Some((0, 3)));
185 mat
!(uni_class_prop_emoji2
, r
"\p{emoji}", "\u{1F21A}", Some((0, 4)));
187 uni_class_prop_picto1
,
188 r
"\p{extendedpictographic}",
193 uni_class_prop_picto2
,
194 r
"\p{extendedpictographic}",
199 // grapheme_cluster_break
201 uni_class_gcb_prepend
,
202 r
"\p{grapheme_cluster_break=prepend}",
208 r
"\p{gcb=regional_indicator}",
212 mat
!(uni_class_gcb_ri2
, r
"\p{gcb=ri}", "\u{1F1E7}", Some((0, 4)));
215 r
"\p{gcb=regionalindicator}",
219 mat
!(uni_class_gcb_lvt
, r
"\p{gcb=lvt}", "\u{C989}", Some((0, 3)));
220 mat
!(uni_class_gcb_zwj
, r
"\p{gcb=zwj}", "\u{200D}", Some((0, 3)));
223 mat
!(uni_class_wb1
, r
"\p{word_break=Hebrew_Letter}", "\u{FB46}", Some((0, 3)));
224 mat
!(uni_class_wb2
, r
"\p{wb=hebrewletter}", "\u{FB46}", Some((0, 3)));
225 mat
!(uni_class_wb3
, r
"\p{wb=ExtendNumLet}", "\u{FF3F}", Some((0, 3)));
226 mat
!(uni_class_wb4
, r
"\p{wb=WSegSpace}", "\u{3000}", Some((0, 3)));
227 mat
!(uni_class_wb5
, r
"\p{wb=numeric}", "\u{1E950}", Some((0, 4)));
230 mat
!(uni_class_sb1
, r
"\p{sentence_break=Lower}", "\u{0469}", Some((0, 2)));
231 mat
!(uni_class_sb2
, r
"\p{sb=lower}", "\u{0469}", Some((0, 2)));
232 mat
!(uni_class_sb3
, r
"\p{sb=Close}", "\u{FF60}", Some((0, 3)));
233 mat
!(uni_class_sb4
, r
"\p{sb=Close}", "\u{1F677}", Some((0, 4)));
234 mat
!(uni_class_sb5
, r
"\p{sb=SContinue}", "\u{FF64}", Some((0, 3)));