]> git.proxmox.com Git - rustc.git/blame - vendor/regex-syntax/src/unicode.rs
New upstream version 1.45.0+dfsg1
[rustc.git] / vendor / regex-syntax / src / unicode.rs
CommitLineData
f9f354fc
XL
1use std::error;
2use std::fmt;
0531ce1d
XL
3use std::result;
4
0531ce1d 5use hir;
f9f354fc
XL
6
7/// A type alias for errors specific to Unicode handling of classes.
8pub type Result<T> = result::Result<T, Error>;
9
/// A sequence of inclusive codepoint ranges, each written `(start, end)`,
/// taken from a generated table (hence the static lifetime).
type Range = &'static [(char, char)];
0531ce1d
XL
13
/// An error that occurs when dealing with Unicode.
///
/// We don't impl the Error trait here because these always get converted
/// into other public errors. (This error type isn't exported.)
#[derive(Debug)]
pub enum Error {
    /// The requested property is unknown, or its data is not compiled in.
    PropertyNotFound,
    /// The requested property value is unknown, or its data is not compiled
    /// in.
    PropertyValueNotFound,
    /// The data for a Perl class (`\w`, `\s` or `\d`) is not compiled in.
    // Not used when unicode-perl is enabled.
    #[allow(dead_code)]
    PerlClassNotFound,
}
26
f9f354fc
XL
27/// A type alias for errors specific to Unicode case folding.
28pub type FoldResult<T> = result::Result<T, CaseFoldError>;
29
30/// An error that occurs when Unicode-aware simple case folding fails.
31///
32/// This error can occur when the case mapping tables necessary for Unicode
33/// aware case folding are unavailable. This only occurs when the
34/// `unicode-case` feature is disabled. (The feature is enabled by default.)
0531ce1d 35#[derive(Debug)]
f9f354fc 36pub struct CaseFoldError(());
0531ce1d 37
f9f354fc 38impl error::Error for CaseFoldError {}
0531ce1d 39
f9f354fc
XL
40impl fmt::Display for CaseFoldError {
41 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
42 write!(
43 f,
44 "Unicode-aware case folding is not available \
45 (probably because the unicode-case feature is not enabled)"
46 )
47 }
48}
49
/// An error that occurs when the Unicode-aware `\w` class is unavailable.
///
/// This error can occur when the data tables necessary for the Unicode aware
/// Perl character class `\w` are unavailable. This only occurs when the
/// `unicode-perl` feature is disabled. (The feature is enabled by default.)
#[derive(Debug)]
pub struct UnicodeWordError(());

impl error::Error for UnicodeWordError {}

impl fmt::Display for UnicodeWordError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // The message has no arguments, so it is written directly.
        f.write_str(
            "Unicode-aware \\w class is not available \
             (probably because the unicode-perl feature is not enabled)",
        )
    }
}
69
70/// Return an iterator over the equivalence class of simple case mappings
71/// for the given codepoint. The equivalence class does not include the
72/// given codepoint.
73///
74/// If the equivalence class is empty, then this returns the next scalar
75/// value that has a non-empty equivalence class, if it exists. If no such
76/// scalar value exists, then `None` is returned. The point of this behavior
77/// is to permit callers to avoid calling `simple_fold` more than they need
78/// to, since there is some cost to fetching the equivalence class.
f9f354fc
XL
79///
80/// This returns an error if the Unicode case folding tables are not available.
81pub fn simple_fold(
82 c: char,
83) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>> {
84 #[cfg(not(feature = "unicode-case"))]
85 fn imp(
86 _: char,
87 ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>>
88 {
89 use std::option::IntoIter;
90 Err::<result::Result<IntoIter<char>, _>, _>(CaseFoldError(()))
91 }
92
93 #[cfg(feature = "unicode-case")]
94 fn imp(
95 c: char,
96 ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>>
97 {
98 use unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE;
99
100 Ok(CASE_FOLDING_SIMPLE
101 .binary_search_by_key(&c, |&(c1, _)| c1)
102 .map(|i| CASE_FOLDING_SIMPLE[i].1.iter().map(|&c| c))
103 .map_err(|i| {
104 if i >= CASE_FOLDING_SIMPLE.len() {
105 None
106 } else {
107 Some(CASE_FOLDING_SIMPLE[i].0)
108 }
109 }))
110 }
111
112 imp(c)
0531ce1d
XL
113}
114
115/// Returns true if and only if the given (inclusive) range contains at least
116/// one Unicode scalar value that has a non-empty non-trivial simple case
117/// mapping.
118///
119/// This function panics if `end < start`.
f9f354fc
XL
120///
121/// This returns an error if the Unicode case folding tables are not available.
122pub fn contains_simple_case_mapping(
123 start: char,
124 end: char,
125) -> FoldResult<bool> {
126 #[cfg(not(feature = "unicode-case"))]
127 fn imp(_: char, _: char) -> FoldResult<bool> {
128 Err(CaseFoldError(()))
129 }
130
131 #[cfg(feature = "unicode-case")]
132 fn imp(start: char, end: char) -> FoldResult<bool> {
133 use std::cmp::Ordering;
134 use unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE;
135
136 assert!(start <= end);
137 Ok(CASE_FOLDING_SIMPLE
138 .binary_search_by(|&(c, _)| {
139 if start <= c && c <= end {
140 Ordering::Equal
141 } else if c > end {
142 Ordering::Greater
143 } else {
144 Ordering::Less
145 }
146 })
147 .is_ok())
148 }
149
150 imp(start, end)
0531ce1d
XL
151}
152
/// A query for finding a character class defined by Unicode. This supports
/// either use of a property name directly, or lookup by property value. The
/// former generally refers to Binary properties (see UTS#44, Table 8), but
/// as a special exception (see UTS#18, Section 1.2) both general categories
/// (an enumeration) and scripts (a catalog) are supported as if each of their
/// possible values were a binary property.
///
/// In all circumstances, property names and values are normalized and
/// canonicalized. That is, `GC == gc == GeneralCategory == general_category`.
///
/// The lifetime `'a` refers to the shorter of the lifetimes of property name
/// and property value.
#[derive(Debug)]
pub enum ClassQuery<'a> {
    /// Return a class named by a single letter.
    ///
    /// The letter is resolved exactly like a `Binary` name: it may name a
    /// binary property, a general category value or a script value.
    OneLetter(char),
    /// Return a class corresponding to a Unicode binary property.
    ///
    /// Note that, by special exception (see UTS#18, Section 1.2), both
    /// general category values and script values are permitted here as if
    /// they were a binary property.
    Binary(&'a str),
    /// Return a class corresponding to all codepoints whose property
    /// (identified by `property_name`) corresponds to the given value
    /// (identified by `property_value`).
    ByValue {
        /// A property name.
        property_name: &'a str,
        /// A property value.
        property_value: &'a str,
    },
}
186
187impl<'a> ClassQuery<'a> {
188 fn canonicalize(&self) -> Result<CanonicalClassQuery> {
189 match *self {
190 ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()),
191 ClassQuery::Binary(name) => self.canonical_binary(name),
192 ClassQuery::ByValue { property_name, property_value } => {
f9f354fc
XL
193 let property_name = symbolic_name_normalize(property_name);
194 let property_value = symbolic_name_normalize(property_value);
0531ce1d 195
f9f354fc 196 let canon_name = match canonical_prop(&property_name)? {
0531ce1d
XL
197 None => return Err(Error::PropertyNotFound),
198 Some(canon_name) => canon_name,
199 };
200 Ok(match canon_name {
201 "General_Category" => {
f9f354fc 202 let canon = match canonical_gencat(&property_value)? {
0531ce1d
XL
203 None => return Err(Error::PropertyValueNotFound),
204 Some(canon) => canon,
205 };
206 CanonicalClassQuery::GeneralCategory(canon)
207 }
208 "Script" => {
f9f354fc 209 let canon = match canonical_script(&property_value)? {
0531ce1d
XL
210 None => return Err(Error::PropertyValueNotFound),
211 Some(canon) => canon,
212 };
213 CanonicalClassQuery::Script(canon)
214 }
215 _ => {
f9f354fc 216 let vals = match property_values(canon_name)? {
0531ce1d
XL
217 None => return Err(Error::PropertyValueNotFound),
218 Some(vals) => vals,
219 };
f9f354fc
XL
220 let canon_val =
221 match canonical_value(vals, &property_value) {
222 None => {
223 return Err(Error::PropertyValueNotFound)
224 }
225 Some(canon_val) => canon_val,
226 };
0531ce1d
XL
227 CanonicalClassQuery::ByValue {
228 property_name: canon_name,
229 property_value: canon_val,
230 }
231 }
232 })
233 }
234 }
235 }
236
237 fn canonical_binary(&self, name: &str) -> Result<CanonicalClassQuery> {
f9f354fc 238 let norm = symbolic_name_normalize(name);
0531ce1d 239
f9f354fc 240 if let Some(canon) = canonical_prop(&norm)? {
0531ce1d
XL
241 return Ok(CanonicalClassQuery::Binary(canon));
242 }
f9f354fc 243 if let Some(canon) = canonical_gencat(&norm)? {
0531ce1d
XL
244 return Ok(CanonicalClassQuery::GeneralCategory(canon));
245 }
f9f354fc 246 if let Some(canon) = canonical_script(&norm)? {
0531ce1d
XL
247 return Ok(CanonicalClassQuery::Script(canon));
248 }
249 Err(Error::PropertyNotFound)
250 }
251}
252
/// Like ClassQuery, but its parameters have been canonicalized. This also
/// differentiates binary properties from flattened general categories and
/// scripts.
#[derive(Debug, Eq, PartialEq)]
enum CanonicalClassQuery {
    /// The canonical binary property name.
    Binary(&'static str),
    /// The canonical general category name.
    GeneralCategory(&'static str),
    /// The canonical script name.
    Script(&'static str),
    /// An arbitrary association between property and value, both of which
    /// have been canonicalized.
    ///
    /// Note that by construction, the property name of ByValue will never
    /// be General_Category or Script. Those two cases are subsumed by the
    /// eponymous variants.
    ByValue {
        /// The canonical property name.
        property_name: &'static str,
        /// The canonical property value.
        property_value: &'static str,
    },
}
277
278/// Looks up a Unicode class given a query. If one doesn't exist, then
279/// `None` is returned.
f9f354fc 280pub fn class(query: ClassQuery) -> Result<hir::ClassUnicode> {
0531ce1d
XL
281 use self::CanonicalClassQuery::*;
282
94b46f34 283 match query.canonicalize()? {
f9f354fc
XL
284 Binary(name) => bool_property(name),
285 GeneralCategory(name) => gencat(name),
286 Script(name) => script(name),
0531ce1d
XL
287 ByValue { property_name: "Age", property_value } => {
288 let mut class = hir::ClassUnicode::empty();
94b46f34 289 for set in ages(property_value)? {
0531ce1d
XL
290 class.union(&hir_class(set));
291 }
292 Ok(class)
293 }
294 ByValue { property_name: "Script_Extensions", property_value } => {
f9f354fc 295 script_extension(property_value)
0731742a 296 }
f9f354fc
XL
297 ByValue {
298 property_name: "Grapheme_Cluster_Break",
299 property_value,
300 } => gcb(property_value),
0731742a 301 ByValue { property_name: "Sentence_Break", property_value } => {
f9f354fc 302 sb(property_value)
0731742a
XL
303 }
304 ByValue { property_name: "Word_Break", property_value } => {
f9f354fc 305 wb(property_value)
0731742a 306 }
0531ce1d
XL
307 _ => {
308 // What else should we support?
309 Err(Error::PropertyNotFound)
310 }
311 }
312}
313
f9f354fc
XL
314/// Returns a Unicode aware class for \w.
315///
316/// This returns an error if the data is not available for \w.
317pub fn perl_word() -> Result<hir::ClassUnicode> {
318 #[cfg(not(feature = "unicode-perl"))]
319 fn imp() -> Result<hir::ClassUnicode> {
320 Err(Error::PerlClassNotFound)
321 }
322
323 #[cfg(feature = "unicode-perl")]
324 fn imp() -> Result<hir::ClassUnicode> {
325 use unicode_tables::perl_word::PERL_WORD;
326 Ok(hir_class(PERL_WORD))
327 }
328
329 imp()
330}
331
332/// Returns a Unicode aware class for \s.
333///
334/// This returns an error if the data is not available for \s.
335pub fn perl_space() -> Result<hir::ClassUnicode> {
336 #[cfg(not(any(feature = "unicode-perl", feature = "unicode-bool")))]
337 fn imp() -> Result<hir::ClassUnicode> {
338 Err(Error::PerlClassNotFound)
339 }
340
341 #[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))]
342 fn imp() -> Result<hir::ClassUnicode> {
343 use unicode_tables::perl_space::WHITE_SPACE;
344 Ok(hir_class(WHITE_SPACE))
345 }
346
347 #[cfg(feature = "unicode-bool")]
348 fn imp() -> Result<hir::ClassUnicode> {
349 use unicode_tables::property_bool::WHITE_SPACE;
350 Ok(hir_class(WHITE_SPACE))
351 }
352
353 imp()
354}
355
356/// Returns a Unicode aware class for \d.
357///
358/// This returns an error if the data is not available for \d.
359pub fn perl_digit() -> Result<hir::ClassUnicode> {
360 #[cfg(not(any(feature = "unicode-perl", feature = "unicode-gencat")))]
361 fn imp() -> Result<hir::ClassUnicode> {
362 Err(Error::PerlClassNotFound)
363 }
364
365 #[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))]
366 fn imp() -> Result<hir::ClassUnicode> {
367 use unicode_tables::perl_decimal::DECIMAL_NUMBER;
368 Ok(hir_class(DECIMAL_NUMBER))
369 }
370
371 #[cfg(feature = "unicode-gencat")]
372 fn imp() -> Result<hir::ClassUnicode> {
373 use unicode_tables::general_category::DECIMAL_NUMBER;
374 Ok(hir_class(DECIMAL_NUMBER))
375 }
376
377 imp()
378}
379
0531ce1d
XL
380/// Build a Unicode HIR class from a sequence of Unicode scalar value ranges.
381pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode {
382 let hir_ranges: Vec<hir::ClassUnicodeRange> = ranges
383 .iter()
384 .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e))
385 .collect();
386 hir::ClassUnicode::new(hir_ranges)
387}
388
f9f354fc
XL
389/// Returns true only if the given codepoint is in the `\w` character class.
390///
391/// If the `unicode-perl` feature is not enabled, then this returns an error.
392pub fn is_word_character(c: char) -> result::Result<bool, UnicodeWordError> {
393 #[cfg(not(feature = "unicode-perl"))]
394 fn imp(_: char) -> result::Result<bool, UnicodeWordError> {
395 Err(UnicodeWordError(()))
396 }
397
398 #[cfg(feature = "unicode-perl")]
399 fn imp(c: char) -> result::Result<bool, UnicodeWordError> {
400 use is_word_byte;
401 use std::cmp::Ordering;
402 use unicode_tables::perl_word::PERL_WORD;
403
404 if c <= 0x7F as char && is_word_byte(c as u8) {
405 return Ok(true);
406 }
407 Ok(PERL_WORD
408 .binary_search_by(|&(start, end)| {
409 if start <= c && c <= end {
410 Ordering::Equal
411 } else if start > c {
412 Ordering::Greater
413 } else {
414 Ordering::Less
415 }
416 })
417 .is_ok())
418 }
419
420 imp(c)
0531ce1d
XL
421}
422
f9f354fc
XL
/// A mapping of property values for a specific property.
///
/// The first element of each tuple is a normalized property value while the
/// second element of each tuple is the corresponding canonical property
/// value.
///
/// Lookups binary search on the first element, so the table must be sorted
/// by it.
type PropertyValues = &'static [(&'static str, &'static str)];
429
430fn canonical_gencat(normalized_value: &str) -> Result<Option<&'static str>> {
431 Ok(match normalized_value {
0531ce1d
XL
432 "any" => Some("Any"),
433 "assigned" => Some("Assigned"),
434 "ascii" => Some("ASCII"),
435 _ => {
f9f354fc 436 let gencats = property_values("General_Category")?.unwrap();
0531ce1d
XL
437 canonical_value(gencats, normalized_value)
438 }
f9f354fc
XL
439 })
440}
441
442fn canonical_script(normalized_value: &str) -> Result<Option<&'static str>> {
443 let scripts = property_values("Script")?.unwrap();
444 Ok(canonical_value(scripts, normalized_value))
0531ce1d
XL
445}
446
f9f354fc
XL
447/// Find the canonical property name for the given normalized property name.
448///
449/// If no such property exists, then `None` is returned.
450///
451/// The normalized property name must have been normalized according to
452/// UAX44 LM3, which can be done using `symbolic_name_normalize`.
453///
454/// If the property names data is not available, then an error is returned.
455fn canonical_prop(normalized_name: &str) -> Result<Option<&'static str>> {
456 #[cfg(not(any(
457 feature = "unicode-age",
458 feature = "unicode-bool",
459 feature = "unicode-gencat",
460 feature = "unicode-perl",
461 feature = "unicode-script",
462 feature = "unicode-segment",
463 )))]
464 fn imp(_: &str) -> Result<Option<&'static str>> {
465 Err(Error::PropertyNotFound)
466 }
467
468 #[cfg(any(
469 feature = "unicode-age",
470 feature = "unicode-bool",
471 feature = "unicode-gencat",
472 feature = "unicode-perl",
473 feature = "unicode-script",
474 feature = "unicode-segment",
475 ))]
476 fn imp(name: &str) -> Result<Option<&'static str>> {
477 use unicode_tables::property_names::PROPERTY_NAMES;
478
479 Ok(PROPERTY_NAMES
480 .binary_search_by_key(&name, |&(n, _)| n)
481 .ok()
482 .map(|i| PROPERTY_NAMES[i].1))
483 }
484
485 imp(normalized_name)
0531ce1d
XL
486}
487
f9f354fc
XL
488/// Find the canonical property value for the given normalized property
489/// value.
490///
491/// The given property values should correspond to the values for the property
492/// under question, which can be found using `property_values`.
493///
494/// If no such property value exists, then `None` is returned.
495///
496/// The normalized property value must have been normalized according to
497/// UAX44 LM3, which can be done using `symbolic_name_normalize`.
0531ce1d
XL
498fn canonical_value(
499 vals: PropertyValues,
500 normalized_value: &str,
501) -> Option<&'static str> {
f9f354fc
XL
502 vals.binary_search_by_key(&normalized_value, |&(n, _)| n)
503 .ok()
504 .map(|i| vals[i].1)
0531ce1d
XL
505}
506
f9f354fc
XL
507/// Return the table of property values for the given property name.
508///
509/// If the property values data is not available, then an error is returned.
0531ce1d
XL
510fn property_values(
511 canonical_property_name: &'static str,
f9f354fc
XL
512) -> Result<Option<PropertyValues>> {
513 #[cfg(not(any(
514 feature = "unicode-age",
515 feature = "unicode-bool",
516 feature = "unicode-gencat",
517 feature = "unicode-perl",
518 feature = "unicode-script",
519 feature = "unicode-segment",
520 )))]
521 fn imp(_: &'static str) -> Result<Option<PropertyValues>> {
522 Err(Error::PropertyValueNotFound)
523 }
524
525 #[cfg(any(
526 feature = "unicode-age",
527 feature = "unicode-bool",
528 feature = "unicode-gencat",
529 feature = "unicode-perl",
530 feature = "unicode-script",
531 feature = "unicode-segment",
532 ))]
533 fn imp(name: &'static str) -> Result<Option<PropertyValues>> {
534 use unicode_tables::property_values::PROPERTY_VALUES;
535
536 Ok(PROPERTY_VALUES
537 .binary_search_by_key(&name, |&(n, _)| n)
538 .ok()
539 .map(|i| PROPERTY_VALUES[i].1))
540 }
541
542 imp(canonical_property_name)
0531ce1d
XL
543}
544
f9f354fc
XL
545// This is only used in some cases, but small enough to just let it be dead
546// instead of figuring out (and maintaining) the right set of features.
547#[allow(dead_code)]
0531ce1d 548fn property_set(
f9f354fc 549 name_map: &'static [(&'static str, Range)],
0531ce1d 550 canonical: &'static str,
f9f354fc 551) -> Option<Range> {
0531ce1d
XL
552 name_map
553 .binary_search_by_key(&canonical, |x| x.0)
554 .ok()
555 .map(|i| name_map[i].1)
556}
557
f9f354fc
XL
558/// Returns an iterator over Unicode Age sets. Each item corresponds to a set
559/// of codepoints that were added in a particular revision of Unicode. The
0531ce1d 560/// iterator yields items in chronological order.
f9f354fc
XL
561///
562/// If the given age value isn't valid or if the data isn't available, then an
563/// error is returned instead.
564fn ages(canonical_age: &str) -> Result<impl Iterator<Item = Range>> {
565 #[cfg(not(feature = "unicode-age"))]
566 fn imp(_: &str) -> Result<impl Iterator<Item = Range>> {
567 use std::option::IntoIter;
568 Err::<IntoIter<Range>, _>(Error::PropertyNotFound)
569 }
570
571 #[cfg(feature = "unicode-age")]
572 fn imp(canonical_age: &str) -> Result<impl Iterator<Item = Range>> {
573 use unicode_tables::age;
574
575 const AGES: &'static [(&'static str, Range)] = &[
576 ("V1_1", age::V1_1),
577 ("V2_0", age::V2_0),
578 ("V2_1", age::V2_1),
579 ("V3_0", age::V3_0),
580 ("V3_1", age::V3_1),
581 ("V3_2", age::V3_2),
582 ("V4_0", age::V4_0),
583 ("V4_1", age::V4_1),
584 ("V5_0", age::V5_0),
585 ("V5_1", age::V5_1),
586 ("V5_2", age::V5_2),
587 ("V6_0", age::V6_0),
588 ("V6_1", age::V6_1),
589 ("V6_2", age::V6_2),
590 ("V6_3", age::V6_3),
591 ("V7_0", age::V7_0),
592 ("V8_0", age::V8_0),
593 ("V9_0", age::V9_0),
594 ("V10_0", age::V10_0),
595 ("V11_0", age::V11_0),
596 ("V12_0", age::V12_0),
597 ("V12_1", age::V12_1),
598 ("V13_0", age::V13_0),
599 ];
600 assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync");
601
602 let pos = AGES.iter().position(|&(age, _)| canonical_age == age);
603 match pos {
604 None => Err(Error::PropertyValueNotFound),
605 Some(i) => Ok(AGES[..i + 1].iter().map(|&(_, classes)| classes)),
606 }
607 }
608
609 imp(canonical_age)
610}
611
612/// Returns the Unicode HIR class corresponding to the given general category.
613///
614/// Name canonicalization is assumed to be performed by the caller.
615///
616/// If the given general category could not be found, or if the general
617/// category data is not available, then an error is returned.
618fn gencat(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
619 #[cfg(not(feature = "unicode-gencat"))]
620 fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
621 Err(Error::PropertyNotFound)
622 }
623
624 #[cfg(feature = "unicode-gencat")]
625 fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
626 use unicode_tables::general_category::BY_NAME;
627 match name {
628 "ASCII" => Ok(hir_class(&[('\0', '\x7F')])),
629 "Any" => Ok(hir_class(&[('\0', '\u{10FFFF}')])),
630 "Assigned" => {
631 let mut cls = gencat("Unassigned")?;
632 cls.negate();
633 Ok(cls)
634 }
635 name => property_set(BY_NAME, name)
636 .map(hir_class)
637 .ok_or(Error::PropertyValueNotFound),
638 }
639 }
640
641 match canonical_name {
642 "Decimal_Number" => perl_digit(),
643 name => imp(name),
644 }
645}
646
647/// Returns the Unicode HIR class corresponding to the given script.
648///
649/// Name canonicalization is assumed to be performed by the caller.
650///
651/// If the given script could not be found, or if the script data is not
652/// available, then an error is returned.
653fn script(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
654 #[cfg(not(feature = "unicode-script"))]
655 fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
656 Err(Error::PropertyNotFound)
657 }
658
659 #[cfg(feature = "unicode-script")]
660 fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
661 use unicode_tables::script::BY_NAME;
662 property_set(BY_NAME, name)
663 .map(hir_class)
664 .ok_or(Error::PropertyValueNotFound)
665 }
666
667 imp(canonical_name)
668}
669
670/// Returns the Unicode HIR class corresponding to the given script extension.
671///
672/// Name canonicalization is assumed to be performed by the caller.
673///
674/// If the given script extension could not be found, or if the script data is
675/// not available, then an error is returned.
676fn script_extension(
677 canonical_name: &'static str,
678) -> Result<hir::ClassUnicode> {
679 #[cfg(not(feature = "unicode-script"))]
680 fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
681 Err(Error::PropertyNotFound)
682 }
683
684 #[cfg(feature = "unicode-script")]
685 fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
686 use unicode_tables::script_extension::BY_NAME;
687 property_set(BY_NAME, name)
688 .map(hir_class)
689 .ok_or(Error::PropertyValueNotFound)
690 }
691
692 imp(canonical_name)
693}
694
695/// Returns the Unicode HIR class corresponding to the given Unicode boolean
696/// property.
697///
698/// Name canonicalization is assumed to be performed by the caller.
699///
700/// If the given boolean property could not be found, or if the boolean
701/// property data is not available, then an error is returned.
702fn bool_property(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
703 #[cfg(not(feature = "unicode-bool"))]
704 fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
705 Err(Error::PropertyNotFound)
706 }
707
708 #[cfg(feature = "unicode-bool")]
709 fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
710 use unicode_tables::property_bool::BY_NAME;
711 property_set(BY_NAME, name)
712 .map(hir_class)
713 .ok_or(Error::PropertyNotFound)
714 }
715
716 match canonical_name {
717 "Decimal_Number" => perl_digit(),
718 "White_Space" => perl_space(),
719 name => imp(name),
720 }
721}
722
723/// Returns the Unicode HIR class corresponding to the given grapheme cluster
724/// break property.
725///
726/// Name canonicalization is assumed to be performed by the caller.
727///
728/// If the given property could not be found, or if the corresponding data is
729/// not available, then an error is returned.
730fn gcb(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
731 #[cfg(not(feature = "unicode-segment"))]
732 fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
733 Err(Error::PropertyNotFound)
734 }
735
736 #[cfg(feature = "unicode-segment")]
737 fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
738 use unicode_tables::grapheme_cluster_break::BY_NAME;
739 property_set(BY_NAME, name)
740 .map(hir_class)
741 .ok_or(Error::PropertyValueNotFound)
742 }
743
744 imp(canonical_name)
745}
746
747/// Returns the Unicode HIR class corresponding to the given word break
748/// property.
749///
750/// Name canonicalization is assumed to be performed by the caller.
751///
752/// If the given property could not be found, or if the corresponding data is
753/// not available, then an error is returned.
754fn wb(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
755 #[cfg(not(feature = "unicode-segment"))]
756 fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
757 Err(Error::PropertyNotFound)
758 }
759
760 #[cfg(feature = "unicode-segment")]
761 fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
762 use unicode_tables::word_break::BY_NAME;
763 property_set(BY_NAME, name)
764 .map(hir_class)
765 .ok_or(Error::PropertyValueNotFound)
766 }
767
768 imp(canonical_name)
0531ce1d
XL
769}
770
f9f354fc
XL
771/// Returns the Unicode HIR class corresponding to the given sentence
772/// break property.
773///
774/// Name canonicalization is assumed to be performed by the caller.
775///
776/// If the given property could not be found, or if the corresponding data is
777/// not available, then an error is returned.
778fn sb(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
779 #[cfg(not(feature = "unicode-segment"))]
780 fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
781 Err(Error::PropertyNotFound)
782 }
783
784 #[cfg(feature = "unicode-segment")]
785 fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
786 use unicode_tables::sentence_break::BY_NAME;
787 property_set(BY_NAME, name)
788 .map(hir_class)
789 .ok_or(Error::PropertyValueNotFound)
0531ce1d 790 }
f9f354fc
XL
791
792 imp(canonical_name)
0531ce1d
XL
793}
794
f9f354fc
XL
795/// Like symbolic_name_normalize_bytes, but operates on a string.
796fn symbolic_name_normalize(x: &str) -> String {
797 let mut tmp = x.as_bytes().to_vec();
798 let len = symbolic_name_normalize_bytes(&mut tmp).len();
799 tmp.truncate(len);
800 // This should always succeed because `symbolic_name_normalize_bytes`
801 // guarantees that `&tmp[..len]` is always valid UTF-8.
802 //
803 // N.B. We could avoid the additional UTF-8 check here, but it's unlikely
804 // to be worth skipping the additional safety check. A benchmark must
805 // justify it first.
806 String::from_utf8(tmp).unwrap()
807}
0531ce1d 808
f9f354fc
XL
/// Normalize the given symbolic name in place according to UAX44-LM3.
///
/// A "symbolic name" typically corresponds to property names and property
/// value aliases. Note, though, that it should not be applied to property
/// string values.
///
/// Normalization drops an initial "is" prefix (in any case), drops spaces,
/// underscores, hyphens and every non-ASCII byte, and lowercases ASCII
/// letters. The result is written over the front of `slice` and returned as
/// a prefix of it.
///
/// The slice returned is guaranteed to be valid UTF-8 for all possible values
/// of `slice`.
///
/// See: http://unicode.org/reports/tr44/#UAX44-LM3
fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
    // I couldn't find a place in the standard that specified that property
    // names/aliases had a particular structure (unlike character names), but
    // we assume that it's ASCII only and drop anything that isn't ASCII.

    // Ignore any "is" prefix, regardless of case.
    let starts_with_is =
        slice.len() >= 2 && slice[..2].eq_ignore_ascii_case(b"is");
    let start = if starts_with_is { 2 } else { 0 };

    let mut next_write = 0;
    for i in start..slice.len() {
        // VALIDITY ARGUMENT: To guarantee that the resulting slice is valid
        // UTF-8, we ensure that the slice contains only ASCII bytes. In
        // particular, we drop every non-ASCII byte from the normalized string.
        let b = slice[i];
        if b == b' ' || b == b'_' || b == b'-' || !b.is_ascii() {
            continue;
        }
        // `next_write <= i` always holds, so this never clobbers unread
        // input.
        slice[next_write] = b.to_ascii_lowercase();
        next_write += 1;
    }
    // Special case: ISO_Comment has a 'isc' abbreviation. Since we generally
    // ignore 'is' prefixes, the 'isc' abbreviation gets caught in the cross
    // fire and ends up creating an alias for 'c' to 'ISO_Comment', but it
    // is actually an alias for the 'Other' general category.
    //
    // The slice holds at least three bytes here: the two-byte prefix plus
    // the byte that produced the 'c'.
    if starts_with_is && next_write == 1 && slice[0] == b'c' {
        slice[..3].copy_from_slice(b"isc");
        next_write = 3;
    }
    &mut slice[..next_write]
}
863
#[cfg(test)]
mod tests {
    use super::{
        contains_simple_case_mapping, simple_fold, symbolic_name_normalize,
        symbolic_name_normalize_bytes,
    };

    /// U+212A KELVIN SIGN. It renders like an ASCII 'K', so it is spelled
    /// with an escape to keep the case folding tests readable.
    #[cfg(feature = "unicode-case")]
    const KELVIN_SIGN: char = '\u{212A}';

    /// Unwraps both layers of `simple_fold`, i.e. expects a non-empty
    /// equivalence class.
    #[cfg(feature = "unicode-case")]
    fn simple_fold_ok(c: char) -> impl Iterator<Item = char> {
        simple_fold(c).unwrap().unwrap()
    }

    /// Expects an empty equivalence class and returns the "next" hint.
    #[cfg(feature = "unicode-case")]
    fn simple_fold_err(c: char) -> Option<char> {
        match simple_fold(c).unwrap() {
            Ok(_) => unreachable!("simple_fold returned Ok iterator"),
            Err(next) => next,
        }
    }

    #[cfg(feature = "unicode-case")]
    fn contains_case_map(start: char, end: char) -> bool {
        contains_simple_case_mapping(start, end).unwrap()
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_k() {
        let xs: Vec<char> = simple_fold_ok('k').collect();
        assert_eq!(xs, vec!['K', KELVIN_SIGN]);

        let xs: Vec<char> = simple_fold_ok('K').collect();
        assert_eq!(xs, vec!['k', KELVIN_SIGN]);

        let xs: Vec<char> = simple_fold_ok(KELVIN_SIGN).collect();
        assert_eq!(xs, vec!['K', 'k']);
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_a() {
        let xs: Vec<char> = simple_fold_ok('a').collect();
        assert_eq!(xs, vec!['A']);

        let xs: Vec<char> = simple_fold_ok('A').collect();
        assert_eq!(xs, vec!['a']);
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_empty() {
        assert_eq!(Some('A'), simple_fold_err('?'));
        assert_eq!(Some('A'), simple_fold_err('@'));
        assert_eq!(Some('a'), simple_fold_err('['));
        assert_eq!(Some('Ⰰ'), simple_fold_err('☃'));
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_max() {
        assert_eq!(None, simple_fold_err('\u{10FFFE}'));
        assert_eq!(None, simple_fold_err('\u{10FFFF}'));
    }

    #[test]
    #[cfg(not(feature = "unicode-case"))]
    fn simple_fold_disabled() {
        assert!(simple_fold('a').is_err());
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn range_contains() {
        assert!(contains_case_map('A', 'A'));
        assert!(contains_case_map('Z', 'Z'));
        assert!(contains_case_map('A', 'Z'));
        assert!(contains_case_map('@', 'A'));
        assert!(contains_case_map('Z', '['));
        assert!(contains_case_map('☃', 'Ⰰ'));

        assert!(!contains_case_map('[', '['));
        assert!(!contains_case_map('[', '`'));

        assert!(!contains_case_map('☃', '☃'));
    }

    #[test]
    #[cfg(not(feature = "unicode-case"))]
    fn range_contains_disabled() {
        assert!(contains_simple_case_mapping('a', 'a').is_err());
    }

    #[test]
    #[cfg(feature = "unicode-gencat")]
    fn regression_466() {
        use super::{CanonicalClassQuery, ClassQuery};

        let q = ClassQuery::OneLetter('C');
        assert_eq!(
            q.canonicalize().unwrap(),
            CanonicalClassQuery::GeneralCategory("Other")
        );
    }

    #[test]
    fn sym_normalize() {
        let sym_norm = symbolic_name_normalize;

        assert_eq!(sym_norm("Line_Break"), "linebreak");
        assert_eq!(sym_norm("Line-break"), "linebreak");
        assert_eq!(sym_norm("linebreak"), "linebreak");
        assert_eq!(sym_norm("BA"), "ba");
        assert_eq!(sym_norm("ba"), "ba");
        assert_eq!(sym_norm("Greek"), "greek");
        assert_eq!(sym_norm("isGreek"), "greek");
        assert_eq!(sym_norm("IS_Greek"), "greek");
        assert_eq!(sym_norm("isc"), "isc");
        assert_eq!(sym_norm("is c"), "isc");
        assert_eq!(sym_norm("is_c"), "isc");
    }

    #[test]
    fn valid_utf8_symbolic() {
        let mut x = b"abc\xFFxyz".to_vec();
        let y = symbolic_name_normalize_bytes(&mut x);
        assert_eq!(y, b"abcxyz");
    }
}