]>
Commit | Line | Data |
---|---|---|
f9f354fc XL |
1 | use std::error; |
2 | use std::fmt; | |
0531ce1d XL |
3 | use std::result; |
4 | ||
17df50a5 | 5 | use crate::hir; |
f9f354fc XL |
6 | |
7 | /// A type alias for errors specific to Unicode handling of classes. | |
8 | pub type Result<T> = result::Result<T, Error>; | |
9 | ||
/// A table of inclusive codepoint ranges, each stored as a `(start, end)`
/// pair. The tables live in generated files, which is why the slice has a
/// static lifetime.
type Range = &'static [(char, char)];
0531ce1d XL |
13 | |
/// The failure modes of a Unicode class lookup.
///
/// The `Error` trait is deliberately not implemented: values of this type are
/// always converted into other, public error types. (This type itself is not
/// exported.)
#[derive(Debug)]
pub enum Error {
    /// The requested property could not be resolved, or the data needed to
    /// resolve it is not compiled in.
    PropertyNotFound,
    /// The requested property value could not be resolved, or the data needed
    /// to resolve it is not compiled in.
    PropertyValueNotFound,
    /// The data backing a Perl character class is not compiled in.
    // Not used when unicode-perl is enabled.
    #[allow(dead_code)]
    PerlClassNotFound,
}
26 | ||
f9f354fc XL |
27 | /// A type alias for errors specific to Unicode case folding. |
28 | pub type FoldResult<T> = result::Result<T, CaseFoldError>; | |
29 | ||
/// The error reported when Unicode-aware simple case folding cannot be done.
///
/// Case folding needs the Unicode case mapping tables. Those tables are only
/// missing when the `unicode-case` feature is disabled. (The feature is
/// enabled by default.)
#[derive(Debug)]
pub struct CaseFoldError(());

impl error::Error for CaseFoldError {}

impl fmt::Display for CaseFoldError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // The message has no placeholders, so it is written out directly.
        f.write_str(
            "Unicode-aware case folding is not available \
             (probably because the unicode-case feature is not enabled)",
        )
    }
}
49 | ||
/// The error reported when the Unicode-aware `\w` class cannot be used.
///
/// The Perl character class `\w` needs its Unicode data tables. Those tables
/// are only missing when the `unicode-perl` feature is disabled. (The feature
/// is enabled by default.)
#[derive(Debug)]
pub struct UnicodeWordError(());

impl error::Error for UnicodeWordError {}

impl fmt::Display for UnicodeWordError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // The message has no placeholders, so it is written out directly.
        f.write_str(
            "Unicode-aware \\w class is not available \
             (probably because the unicode-perl feature is not enabled)",
        )
    }
}
69 | ||
70 | /// Return an iterator over the equivalence class of simple case mappings | |
71 | /// for the given codepoint. The equivalence class does not include the | |
72 | /// given codepoint. | |
73 | /// | |
74 | /// If the equivalence class is empty, then this returns the next scalar | |
75 | /// value that has a non-empty equivalence class, if it exists. If no such | |
76 | /// scalar value exists, then `None` is returned. The point of this behavior | |
77 | /// is to permit callers to avoid calling `simple_fold` more than they need | |
78 | /// to, since there is some cost to fetching the equivalence class. | |
f9f354fc XL |
79 | /// |
80 | /// This returns an error if the Unicode case folding tables are not available. | |
81 | pub fn simple_fold( | |
82 | c: char, | |
83 | ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>> { | |
84 | #[cfg(not(feature = "unicode-case"))] | |
85 | fn imp( | |
86 | _: char, | |
87 | ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>> | |
88 | { | |
89 | use std::option::IntoIter; | |
90 | Err::<result::Result<IntoIter<char>, _>, _>(CaseFoldError(())) | |
91 | } | |
92 | ||
93 | #[cfg(feature = "unicode-case")] | |
94 | fn imp( | |
95 | c: char, | |
96 | ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>> | |
97 | { | |
17df50a5 | 98 | use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; |
f9f354fc XL |
99 | |
100 | Ok(CASE_FOLDING_SIMPLE | |
101 | .binary_search_by_key(&c, |&(c1, _)| c1) | |
102 | .map(|i| CASE_FOLDING_SIMPLE[i].1.iter().map(|&c| c)) | |
103 | .map_err(|i| { | |
104 | if i >= CASE_FOLDING_SIMPLE.len() { | |
105 | None | |
106 | } else { | |
107 | Some(CASE_FOLDING_SIMPLE[i].0) | |
108 | } | |
109 | })) | |
110 | } | |
111 | ||
112 | imp(c) | |
0531ce1d XL |
113 | } |
114 | ||
115 | /// Returns true if and only if the given (inclusive) range contains at least | |
116 | /// one Unicode scalar value that has a non-empty non-trivial simple case | |
117 | /// mapping. | |
118 | /// | |
119 | /// This function panics if `end < start`. | |
f9f354fc XL |
120 | /// |
121 | /// This returns an error if the Unicode case folding tables are not available. | |
122 | pub fn contains_simple_case_mapping( | |
123 | start: char, | |
124 | end: char, | |
125 | ) -> FoldResult<bool> { | |
126 | #[cfg(not(feature = "unicode-case"))] | |
127 | fn imp(_: char, _: char) -> FoldResult<bool> { | |
128 | Err(CaseFoldError(())) | |
129 | } | |
130 | ||
131 | #[cfg(feature = "unicode-case")] | |
132 | fn imp(start: char, end: char) -> FoldResult<bool> { | |
17df50a5 | 133 | use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; |
f9f354fc | 134 | use std::cmp::Ordering; |
f9f354fc XL |
135 | |
136 | assert!(start <= end); | |
137 | Ok(CASE_FOLDING_SIMPLE | |
138 | .binary_search_by(|&(c, _)| { | |
139 | if start <= c && c <= end { | |
140 | Ordering::Equal | |
141 | } else if c > end { | |
142 | Ordering::Greater | |
143 | } else { | |
144 | Ordering::Less | |
145 | } | |
146 | }) | |
147 | .is_ok()) | |
148 | } | |
149 | ||
150 | imp(start, end) | |
0531ce1d XL |
151 | } |
152 | ||
/// Describes which Unicode character class a caller wants.
///
/// A class can be requested by property name alone or by a property name and
/// value pair. Name-only queries usually denote binary properties (see UTS#44,
/// Table 8). As a special exception (see UTS#18, Section 1.2), general
/// categories (an enumeration) and scripts (a catalog) may also be named
/// directly, as though each of their values were a binary property.
///
/// Property names and values are always normalized and canonicalized, so
/// `GC == gc == GeneralCategory == general_category`.
///
/// The lifetime `'a` is the shorter of the lifetimes of the property name and
/// the property value.
#[derive(Debug)]
pub enum ClassQuery<'a> {
    /// A Unicode binary property identified by a single letter.
    OneLetter(char),
    /// A Unicode binary property identified by name.
    ///
    /// By special exception (see UTS#18, Section 1.2), general category
    /// values and script values are also accepted here as though they were
    /// binary properties.
    Binary(&'a str),
    /// Every codepoint whose property `property_name` has the value
    /// `property_value`.
    ByValue {
        /// The name of the property.
        property_name: &'a str,
        /// The value the property must have.
        property_value: &'a str,
    },
}
186 | ||
187 | impl<'a> ClassQuery<'a> { | |
188 | fn canonicalize(&self) -> Result<CanonicalClassQuery> { | |
189 | match *self { | |
190 | ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()), | |
191 | ClassQuery::Binary(name) => self.canonical_binary(name), | |
192 | ClassQuery::ByValue { property_name, property_value } => { | |
f9f354fc XL |
193 | let property_name = symbolic_name_normalize(property_name); |
194 | let property_value = symbolic_name_normalize(property_value); | |
0531ce1d | 195 | |
f9f354fc | 196 | let canon_name = match canonical_prop(&property_name)? { |
0531ce1d XL |
197 | None => return Err(Error::PropertyNotFound), |
198 | Some(canon_name) => canon_name, | |
199 | }; | |
200 | Ok(match canon_name { | |
201 | "General_Category" => { | |
f9f354fc | 202 | let canon = match canonical_gencat(&property_value)? { |
0531ce1d XL |
203 | None => return Err(Error::PropertyValueNotFound), |
204 | Some(canon) => canon, | |
205 | }; | |
206 | CanonicalClassQuery::GeneralCategory(canon) | |
207 | } | |
208 | "Script" => { | |
f9f354fc | 209 | let canon = match canonical_script(&property_value)? { |
0531ce1d XL |
210 | None => return Err(Error::PropertyValueNotFound), |
211 | Some(canon) => canon, | |
212 | }; | |
213 | CanonicalClassQuery::Script(canon) | |
214 | } | |
215 | _ => { | |
f9f354fc | 216 | let vals = match property_values(canon_name)? { |
0531ce1d XL |
217 | None => return Err(Error::PropertyValueNotFound), |
218 | Some(vals) => vals, | |
219 | }; | |
f9f354fc XL |
220 | let canon_val = |
221 | match canonical_value(vals, &property_value) { | |
222 | None => { | |
223 | return Err(Error::PropertyValueNotFound) | |
224 | } | |
225 | Some(canon_val) => canon_val, | |
226 | }; | |
0531ce1d XL |
227 | CanonicalClassQuery::ByValue { |
228 | property_name: canon_name, | |
229 | property_value: canon_val, | |
230 | } | |
231 | } | |
232 | }) | |
233 | } | |
234 | } | |
235 | } | |
236 | ||
237 | fn canonical_binary(&self, name: &str) -> Result<CanonicalClassQuery> { | |
f9f354fc | 238 | let norm = symbolic_name_normalize(name); |
0531ce1d | 239 | |
5869c6ff XL |
240 | // This is a special case where 'cf' refers to the 'Format' general |
241 | // category, but where the 'cf' abbreviation is also an abbreviation | |
242 | // for the 'Case_Folding' property. But we want to treat it as | |
243 | // a general category. (Currently, we don't even support the | |
244 | // 'Case_Folding' property. But if we do in the future, users will be | |
245 | // required to spell it out.) | |
246 | if norm != "cf" { | |
247 | if let Some(canon) = canonical_prop(&norm)? { | |
248 | return Ok(CanonicalClassQuery::Binary(canon)); | |
249 | } | |
0531ce1d | 250 | } |
f9f354fc | 251 | if let Some(canon) = canonical_gencat(&norm)? { |
0531ce1d XL |
252 | return Ok(CanonicalClassQuery::GeneralCategory(canon)); |
253 | } | |
f9f354fc | 254 | if let Some(canon) = canonical_script(&norm)? { |
0531ce1d XL |
255 | return Ok(CanonicalClassQuery::Script(canon)); |
256 | } | |
257 | Err(Error::PropertyNotFound) | |
258 | } | |
259 | } | |
260 | ||
/// A `ClassQuery` whose names have all been canonicalized.
///
/// Unlike `ClassQuery`, this keeps binary properties apart from general
/// categories and scripts that were requested as if they were binary
/// properties.
#[derive(Debug, Eq, PartialEq)]
enum CanonicalClassQuery {
    /// A binary property, by canonical name.
    Binary(&'static str),
    /// A general category, by canonical name.
    GeneralCategory(&'static str),
    /// A script, by canonical name.
    Script(&'static str),
    /// Any other property/value pair, with both halves canonicalized.
    ///
    /// By construction, `property_name` is never General_Category or Script;
    /// those are represented by the dedicated variants above.
    ByValue {
        /// The canonical name of the property.
        property_name: &'static str,
        /// The canonical value of the property.
        property_value: &'static str,
    },
}
285 | ||
286 | /// Looks up a Unicode class given a query. If one doesn't exist, then | |
287 | /// `None` is returned. | |
17df50a5 | 288 | pub fn class(query: ClassQuery<'_>) -> Result<hir::ClassUnicode> { |
0531ce1d XL |
289 | use self::CanonicalClassQuery::*; |
290 | ||
94b46f34 | 291 | match query.canonicalize()? { |
f9f354fc XL |
292 | Binary(name) => bool_property(name), |
293 | GeneralCategory(name) => gencat(name), | |
294 | Script(name) => script(name), | |
0531ce1d XL |
295 | ByValue { property_name: "Age", property_value } => { |
296 | let mut class = hir::ClassUnicode::empty(); | |
94b46f34 | 297 | for set in ages(property_value)? { |
0531ce1d XL |
298 | class.union(&hir_class(set)); |
299 | } | |
300 | Ok(class) | |
301 | } | |
302 | ByValue { property_name: "Script_Extensions", property_value } => { | |
f9f354fc | 303 | script_extension(property_value) |
0731742a | 304 | } |
f9f354fc XL |
305 | ByValue { |
306 | property_name: "Grapheme_Cluster_Break", | |
307 | property_value, | |
308 | } => gcb(property_value), | |
0731742a | 309 | ByValue { property_name: "Sentence_Break", property_value } => { |
f9f354fc | 310 | sb(property_value) |
0731742a XL |
311 | } |
312 | ByValue { property_name: "Word_Break", property_value } => { | |
f9f354fc | 313 | wb(property_value) |
0731742a | 314 | } |
0531ce1d XL |
315 | _ => { |
316 | // What else should we support? | |
317 | Err(Error::PropertyNotFound) | |
318 | } | |
319 | } | |
320 | } | |
321 | ||
f9f354fc XL |
322 | /// Returns a Unicode aware class for \w. |
323 | /// | |
324 | /// This returns an error if the data is not available for \w. | |
325 | pub fn perl_word() -> Result<hir::ClassUnicode> { | |
326 | #[cfg(not(feature = "unicode-perl"))] | |
327 | fn imp() -> Result<hir::ClassUnicode> { | |
328 | Err(Error::PerlClassNotFound) | |
329 | } | |
330 | ||
331 | #[cfg(feature = "unicode-perl")] | |
332 | fn imp() -> Result<hir::ClassUnicode> { | |
17df50a5 | 333 | use crate::unicode_tables::perl_word::PERL_WORD; |
f9f354fc XL |
334 | Ok(hir_class(PERL_WORD)) |
335 | } | |
336 | ||
337 | imp() | |
338 | } | |
339 | ||
340 | /// Returns a Unicode aware class for \s. | |
341 | /// | |
342 | /// This returns an error if the data is not available for \s. | |
343 | pub fn perl_space() -> Result<hir::ClassUnicode> { | |
344 | #[cfg(not(any(feature = "unicode-perl", feature = "unicode-bool")))] | |
345 | fn imp() -> Result<hir::ClassUnicode> { | |
346 | Err(Error::PerlClassNotFound) | |
347 | } | |
348 | ||
349 | #[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))] | |
350 | fn imp() -> Result<hir::ClassUnicode> { | |
17df50a5 | 351 | use crate::unicode_tables::perl_space::WHITE_SPACE; |
f9f354fc XL |
352 | Ok(hir_class(WHITE_SPACE)) |
353 | } | |
354 | ||
355 | #[cfg(feature = "unicode-bool")] | |
356 | fn imp() -> Result<hir::ClassUnicode> { | |
17df50a5 | 357 | use crate::unicode_tables::property_bool::WHITE_SPACE; |
f9f354fc XL |
358 | Ok(hir_class(WHITE_SPACE)) |
359 | } | |
360 | ||
361 | imp() | |
362 | } | |
363 | ||
364 | /// Returns a Unicode aware class for \d. | |
365 | /// | |
366 | /// This returns an error if the data is not available for \d. | |
367 | pub fn perl_digit() -> Result<hir::ClassUnicode> { | |
368 | #[cfg(not(any(feature = "unicode-perl", feature = "unicode-gencat")))] | |
369 | fn imp() -> Result<hir::ClassUnicode> { | |
370 | Err(Error::PerlClassNotFound) | |
371 | } | |
372 | ||
373 | #[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))] | |
374 | fn imp() -> Result<hir::ClassUnicode> { | |
17df50a5 | 375 | use crate::unicode_tables::perl_decimal::DECIMAL_NUMBER; |
f9f354fc XL |
376 | Ok(hir_class(DECIMAL_NUMBER)) |
377 | } | |
378 | ||
379 | #[cfg(feature = "unicode-gencat")] | |
380 | fn imp() -> Result<hir::ClassUnicode> { | |
17df50a5 | 381 | use crate::unicode_tables::general_category::DECIMAL_NUMBER; |
f9f354fc XL |
382 | Ok(hir_class(DECIMAL_NUMBER)) |
383 | } | |
384 | ||
385 | imp() | |
386 | } | |
387 | ||
0531ce1d XL |
388 | /// Build a Unicode HIR class from a sequence of Unicode scalar value ranges. |
389 | pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode { | |
390 | let hir_ranges: Vec<hir::ClassUnicodeRange> = ranges | |
391 | .iter() | |
392 | .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)) | |
393 | .collect(); | |
394 | hir::ClassUnicode::new(hir_ranges) | |
395 | } | |
396 | ||
f9f354fc XL |
397 | /// Returns true only if the given codepoint is in the `\w` character class. |
398 | /// | |
399 | /// If the `unicode-perl` feature is not enabled, then this returns an error. | |
400 | pub fn is_word_character(c: char) -> result::Result<bool, UnicodeWordError> { | |
401 | #[cfg(not(feature = "unicode-perl"))] | |
402 | fn imp(_: char) -> result::Result<bool, UnicodeWordError> { | |
403 | Err(UnicodeWordError(())) | |
404 | } | |
405 | ||
406 | #[cfg(feature = "unicode-perl")] | |
407 | fn imp(c: char) -> result::Result<bool, UnicodeWordError> { | |
17df50a5 XL |
408 | use crate::is_word_byte; |
409 | use crate::unicode_tables::perl_word::PERL_WORD; | |
f9f354fc | 410 | use std::cmp::Ordering; |
f9f354fc XL |
411 | |
412 | if c <= 0x7F as char && is_word_byte(c as u8) { | |
413 | return Ok(true); | |
414 | } | |
415 | Ok(PERL_WORD | |
416 | .binary_search_by(|&(start, end)| { | |
417 | if start <= c && c <= end { | |
418 | Ordering::Equal | |
419 | } else if start > c { | |
420 | Ordering::Greater | |
421 | } else { | |
422 | Ordering::Less | |
423 | } | |
424 | }) | |
425 | .is_ok()) | |
426 | } | |
427 | ||
428 | imp(c) | |
0531ce1d XL |
429 | } |
430 | ||
f9f354fc XL |
/// The table of values belonging to one particular property.
///
/// Each entry is a pair: a normalized property value first, followed by the
/// canonical property value it stands for.
type PropertyValues = &'static [(&'static str, &'static str)];
437 | ||
438 | fn canonical_gencat(normalized_value: &str) -> Result<Option<&'static str>> { | |
439 | Ok(match normalized_value { | |
0531ce1d XL |
440 | "any" => Some("Any"), |
441 | "assigned" => Some("Assigned"), | |
442 | "ascii" => Some("ASCII"), | |
443 | _ => { | |
f9f354fc | 444 | let gencats = property_values("General_Category")?.unwrap(); |
0531ce1d XL |
445 | canonical_value(gencats, normalized_value) |
446 | } | |
f9f354fc XL |
447 | }) |
448 | } | |
449 | ||
450 | fn canonical_script(normalized_value: &str) -> Result<Option<&'static str>> { | |
451 | let scripts = property_values("Script")?.unwrap(); | |
452 | Ok(canonical_value(scripts, normalized_value)) | |
0531ce1d XL |
453 | } |
454 | ||
f9f354fc XL |
455 | /// Find the canonical property name for the given normalized property name. |
456 | /// | |
457 | /// If no such property exists, then `None` is returned. | |
458 | /// | |
459 | /// The normalized property name must have been normalized according to | |
460 | /// UAX44 LM3, which can be done using `symbolic_name_normalize`. | |
461 | /// | |
462 | /// If the property names data is not available, then an error is returned. | |
463 | fn canonical_prop(normalized_name: &str) -> Result<Option<&'static str>> { | |
464 | #[cfg(not(any( | |
465 | feature = "unicode-age", | |
466 | feature = "unicode-bool", | |
467 | feature = "unicode-gencat", | |
468 | feature = "unicode-perl", | |
469 | feature = "unicode-script", | |
470 | feature = "unicode-segment", | |
471 | )))] | |
472 | fn imp(_: &str) -> Result<Option<&'static str>> { | |
473 | Err(Error::PropertyNotFound) | |
474 | } | |
475 | ||
476 | #[cfg(any( | |
477 | feature = "unicode-age", | |
478 | feature = "unicode-bool", | |
479 | feature = "unicode-gencat", | |
480 | feature = "unicode-perl", | |
481 | feature = "unicode-script", | |
482 | feature = "unicode-segment", | |
483 | ))] | |
484 | fn imp(name: &str) -> Result<Option<&'static str>> { | |
17df50a5 | 485 | use crate::unicode_tables::property_names::PROPERTY_NAMES; |
f9f354fc XL |
486 | |
487 | Ok(PROPERTY_NAMES | |
488 | .binary_search_by_key(&name, |&(n, _)| n) | |
489 | .ok() | |
490 | .map(|i| PROPERTY_NAMES[i].1)) | |
491 | } | |
492 | ||
493 | imp(normalized_name) | |
0531ce1d XL |
494 | } |
495 | ||
f9f354fc XL |
496 | /// Find the canonical property value for the given normalized property |
497 | /// value. | |
498 | /// | |
499 | /// The given property values should correspond to the values for the property | |
500 | /// under question, which can be found using `property_values`. | |
501 | /// | |
502 | /// If no such property value exists, then `None` is returned. | |
503 | /// | |
504 | /// The normalized property value must have been normalized according to | |
505 | /// UAX44 LM3, which can be done using `symbolic_name_normalize`. | |
0531ce1d XL |
506 | fn canonical_value( |
507 | vals: PropertyValues, | |
508 | normalized_value: &str, | |
509 | ) -> Option<&'static str> { | |
f9f354fc XL |
510 | vals.binary_search_by_key(&normalized_value, |&(n, _)| n) |
511 | .ok() | |
512 | .map(|i| vals[i].1) | |
0531ce1d XL |
513 | } |
514 | ||
f9f354fc XL |
515 | /// Return the table of property values for the given property name. |
516 | /// | |
517 | /// If the property values data is not available, then an error is returned. | |
0531ce1d XL |
518 | fn property_values( |
519 | canonical_property_name: &'static str, | |
f9f354fc XL |
520 | ) -> Result<Option<PropertyValues>> { |
521 | #[cfg(not(any( | |
522 | feature = "unicode-age", | |
523 | feature = "unicode-bool", | |
524 | feature = "unicode-gencat", | |
525 | feature = "unicode-perl", | |
526 | feature = "unicode-script", | |
527 | feature = "unicode-segment", | |
528 | )))] | |
529 | fn imp(_: &'static str) -> Result<Option<PropertyValues>> { | |
530 | Err(Error::PropertyValueNotFound) | |
531 | } | |
532 | ||
533 | #[cfg(any( | |
534 | feature = "unicode-age", | |
535 | feature = "unicode-bool", | |
536 | feature = "unicode-gencat", | |
537 | feature = "unicode-perl", | |
538 | feature = "unicode-script", | |
539 | feature = "unicode-segment", | |
540 | ))] | |
541 | fn imp(name: &'static str) -> Result<Option<PropertyValues>> { | |
17df50a5 | 542 | use crate::unicode_tables::property_values::PROPERTY_VALUES; |
f9f354fc XL |
543 | |
544 | Ok(PROPERTY_VALUES | |
545 | .binary_search_by_key(&name, |&(n, _)| n) | |
546 | .ok() | |
547 | .map(|i| PROPERTY_VALUES[i].1)) | |
548 | } | |
549 | ||
550 | imp(canonical_property_name) | |
0531ce1d XL |
551 | } |
552 | ||
f9f354fc XL |
553 | // This is only used in some cases, but small enough to just let it be dead |
554 | // instead of figuring out (and maintaining) the right set of features. | |
555 | #[allow(dead_code)] | |
0531ce1d | 556 | fn property_set( |
f9f354fc | 557 | name_map: &'static [(&'static str, Range)], |
0531ce1d | 558 | canonical: &'static str, |
f9f354fc | 559 | ) -> Option<Range> { |
0531ce1d XL |
560 | name_map |
561 | .binary_search_by_key(&canonical, |x| x.0) | |
562 | .ok() | |
563 | .map(|i| name_map[i].1) | |
564 | } | |
565 | ||
f9f354fc XL |
566 | /// Returns an iterator over Unicode Age sets. Each item corresponds to a set |
567 | /// of codepoints that were added in a particular revision of Unicode. The | |
0531ce1d | 568 | /// iterator yields items in chronological order. |
f9f354fc XL |
569 | /// |
570 | /// If the given age value isn't valid or if the data isn't available, then an | |
571 | /// error is returned instead. | |
572 | fn ages(canonical_age: &str) -> Result<impl Iterator<Item = Range>> { | |
573 | #[cfg(not(feature = "unicode-age"))] | |
574 | fn imp(_: &str) -> Result<impl Iterator<Item = Range>> { | |
575 | use std::option::IntoIter; | |
576 | Err::<IntoIter<Range>, _>(Error::PropertyNotFound) | |
577 | } | |
578 | ||
579 | #[cfg(feature = "unicode-age")] | |
580 | fn imp(canonical_age: &str) -> Result<impl Iterator<Item = Range>> { | |
17df50a5 | 581 | use crate::unicode_tables::age; |
f9f354fc XL |
582 | |
583 | const AGES: &'static [(&'static str, Range)] = &[ | |
584 | ("V1_1", age::V1_1), | |
585 | ("V2_0", age::V2_0), | |
586 | ("V2_1", age::V2_1), | |
587 | ("V3_0", age::V3_0), | |
588 | ("V3_1", age::V3_1), | |
589 | ("V3_2", age::V3_2), | |
590 | ("V4_0", age::V4_0), | |
591 | ("V4_1", age::V4_1), | |
592 | ("V5_0", age::V5_0), | |
593 | ("V5_1", age::V5_1), | |
594 | ("V5_2", age::V5_2), | |
595 | ("V6_0", age::V6_0), | |
596 | ("V6_1", age::V6_1), | |
597 | ("V6_2", age::V6_2), | |
598 | ("V6_3", age::V6_3), | |
599 | ("V7_0", age::V7_0), | |
600 | ("V8_0", age::V8_0), | |
601 | ("V9_0", age::V9_0), | |
602 | ("V10_0", age::V10_0), | |
603 | ("V11_0", age::V11_0), | |
604 | ("V12_0", age::V12_0), | |
605 | ("V12_1", age::V12_1), | |
606 | ("V13_0", age::V13_0), | |
607 | ]; | |
608 | assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync"); | |
609 | ||
610 | let pos = AGES.iter().position(|&(age, _)| canonical_age == age); | |
611 | match pos { | |
612 | None => Err(Error::PropertyValueNotFound), | |
613 | Some(i) => Ok(AGES[..i + 1].iter().map(|&(_, classes)| classes)), | |
614 | } | |
615 | } | |
616 | ||
617 | imp(canonical_age) | |
618 | } | |
619 | ||
620 | /// Returns the Unicode HIR class corresponding to the given general category. | |
621 | /// | |
622 | /// Name canonicalization is assumed to be performed by the caller. | |
623 | /// | |
624 | /// If the given general category could not be found, or if the general | |
625 | /// category data is not available, then an error is returned. | |
626 | fn gencat(canonical_name: &'static str) -> Result<hir::ClassUnicode> { | |
627 | #[cfg(not(feature = "unicode-gencat"))] | |
628 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { | |
629 | Err(Error::PropertyNotFound) | |
630 | } | |
631 | ||
632 | #[cfg(feature = "unicode-gencat")] | |
633 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { | |
17df50a5 | 634 | use crate::unicode_tables::general_category::BY_NAME; |
f9f354fc XL |
635 | match name { |
636 | "ASCII" => Ok(hir_class(&[('\0', '\x7F')])), | |
637 | "Any" => Ok(hir_class(&[('\0', '\u{10FFFF}')])), | |
638 | "Assigned" => { | |
639 | let mut cls = gencat("Unassigned")?; | |
640 | cls.negate(); | |
641 | Ok(cls) | |
642 | } | |
643 | name => property_set(BY_NAME, name) | |
644 | .map(hir_class) | |
645 | .ok_or(Error::PropertyValueNotFound), | |
646 | } | |
647 | } | |
648 | ||
649 | match canonical_name { | |
650 | "Decimal_Number" => perl_digit(), | |
651 | name => imp(name), | |
652 | } | |
653 | } | |
654 | ||
655 | /// Returns the Unicode HIR class corresponding to the given script. | |
656 | /// | |
657 | /// Name canonicalization is assumed to be performed by the caller. | |
658 | /// | |
659 | /// If the given script could not be found, or if the script data is not | |
660 | /// available, then an error is returned. | |
661 | fn script(canonical_name: &'static str) -> Result<hir::ClassUnicode> { | |
662 | #[cfg(not(feature = "unicode-script"))] | |
663 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { | |
664 | Err(Error::PropertyNotFound) | |
665 | } | |
666 | ||
667 | #[cfg(feature = "unicode-script")] | |
668 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { | |
17df50a5 | 669 | use crate::unicode_tables::script::BY_NAME; |
f9f354fc XL |
670 | property_set(BY_NAME, name) |
671 | .map(hir_class) | |
672 | .ok_or(Error::PropertyValueNotFound) | |
673 | } | |
674 | ||
675 | imp(canonical_name) | |
676 | } | |
677 | ||
678 | /// Returns the Unicode HIR class corresponding to the given script extension. | |
679 | /// | |
680 | /// Name canonicalization is assumed to be performed by the caller. | |
681 | /// | |
682 | /// If the given script extension could not be found, or if the script data is | |
683 | /// not available, then an error is returned. | |
684 | fn script_extension( | |
685 | canonical_name: &'static str, | |
686 | ) -> Result<hir::ClassUnicode> { | |
687 | #[cfg(not(feature = "unicode-script"))] | |
688 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { | |
689 | Err(Error::PropertyNotFound) | |
690 | } | |
691 | ||
692 | #[cfg(feature = "unicode-script")] | |
693 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { | |
17df50a5 | 694 | use crate::unicode_tables::script_extension::BY_NAME; |
f9f354fc XL |
695 | property_set(BY_NAME, name) |
696 | .map(hir_class) | |
697 | .ok_or(Error::PropertyValueNotFound) | |
698 | } | |
699 | ||
700 | imp(canonical_name) | |
701 | } | |
702 | ||
703 | /// Returns the Unicode HIR class corresponding to the given Unicode boolean | |
704 | /// property. | |
705 | /// | |
706 | /// Name canonicalization is assumed to be performed by the caller. | |
707 | /// | |
708 | /// If the given boolean property could not be found, or if the boolean | |
709 | /// property data is not available, then an error is returned. | |
710 | fn bool_property(canonical_name: &'static str) -> Result<hir::ClassUnicode> { | |
711 | #[cfg(not(feature = "unicode-bool"))] | |
712 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { | |
713 | Err(Error::PropertyNotFound) | |
714 | } | |
715 | ||
716 | #[cfg(feature = "unicode-bool")] | |
717 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { | |
17df50a5 | 718 | use crate::unicode_tables::property_bool::BY_NAME; |
f9f354fc XL |
719 | property_set(BY_NAME, name) |
720 | .map(hir_class) | |
721 | .ok_or(Error::PropertyNotFound) | |
722 | } | |
723 | ||
724 | match canonical_name { | |
725 | "Decimal_Number" => perl_digit(), | |
726 | "White_Space" => perl_space(), | |
727 | name => imp(name), | |
728 | } | |
729 | } | |
730 | ||
731 | /// Returns the Unicode HIR class corresponding to the given grapheme cluster | |
732 | /// break property. | |
733 | /// | |
734 | /// Name canonicalization is assumed to be performed by the caller. | |
735 | /// | |
736 | /// If the given property could not be found, or if the corresponding data is | |
737 | /// not available, then an error is returned. | |
738 | fn gcb(canonical_name: &'static str) -> Result<hir::ClassUnicode> { | |
739 | #[cfg(not(feature = "unicode-segment"))] | |
740 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { | |
741 | Err(Error::PropertyNotFound) | |
742 | } | |
743 | ||
744 | #[cfg(feature = "unicode-segment")] | |
745 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { | |
17df50a5 | 746 | use crate::unicode_tables::grapheme_cluster_break::BY_NAME; |
f9f354fc XL |
747 | property_set(BY_NAME, name) |
748 | .map(hir_class) | |
749 | .ok_or(Error::PropertyValueNotFound) | |
750 | } | |
751 | ||
752 | imp(canonical_name) | |
753 | } | |
754 | ||
755 | /// Returns the Unicode HIR class corresponding to the given word break | |
756 | /// property. | |
757 | /// | |
758 | /// Name canonicalization is assumed to be performed by the caller. | |
759 | /// | |
760 | /// If the given property could not be found, or if the corresponding data is | |
761 | /// not available, then an error is returned. | |
762 | fn wb(canonical_name: &'static str) -> Result<hir::ClassUnicode> { | |
763 | #[cfg(not(feature = "unicode-segment"))] | |
764 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { | |
765 | Err(Error::PropertyNotFound) | |
766 | } | |
767 | ||
768 | #[cfg(feature = "unicode-segment")] | |
769 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { | |
17df50a5 | 770 | use crate::unicode_tables::word_break::BY_NAME; |
f9f354fc XL |
771 | property_set(BY_NAME, name) |
772 | .map(hir_class) | |
773 | .ok_or(Error::PropertyValueNotFound) | |
774 | } | |
775 | ||
776 | imp(canonical_name) | |
0531ce1d XL |
777 | } |
778 | ||
f9f354fc XL |
779 | /// Returns the Unicode HIR class corresponding to the given sentence |
780 | /// break property. | |
781 | /// | |
782 | /// Name canonicalization is assumed to be performed by the caller. | |
783 | /// | |
784 | /// If the given property could not be found, or if the corresponding data is | |
785 | /// not available, then an error is returned. | |
786 | fn sb(canonical_name: &'static str) -> Result<hir::ClassUnicode> { | |
787 | #[cfg(not(feature = "unicode-segment"))] | |
788 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { | |
789 | Err(Error::PropertyNotFound) | |
790 | } | |
791 | ||
792 | #[cfg(feature = "unicode-segment")] | |
793 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { | |
17df50a5 | 794 | use crate::unicode_tables::sentence_break::BY_NAME; |
f9f354fc XL |
795 | property_set(BY_NAME, name) |
796 | .map(hir_class) | |
797 | .ok_or(Error::PropertyValueNotFound) | |
0531ce1d | 798 | } |
f9f354fc XL |
799 | |
800 | imp(canonical_name) | |
0531ce1d XL |
801 | } |
802 | ||
f9f354fc XL |
803 | /// Like symbolic_name_normalize_bytes, but operates on a string. |
804 | fn symbolic_name_normalize(x: &str) -> String { | |
805 | let mut tmp = x.as_bytes().to_vec(); | |
806 | let len = symbolic_name_normalize_bytes(&mut tmp).len(); | |
807 | tmp.truncate(len); | |
808 | // This should always succeed because `symbolic_name_normalize_bytes` | |
809 | // guarantees that `&tmp[..len]` is always valid UTF-8. | |
810 | // | |
811 | // N.B. We could avoid the additional UTF-8 check here, but it's unlikely | |
812 | // to be worth skipping the additional safety check. A benchmark must | |
813 | // justify it first. | |
814 | String::from_utf8(tmp).unwrap() | |
815 | } | |
0531ce1d | 816 | |
f9f354fc XL |
/// Normalize the given symbolic name in place according to UAX44-LM3.
///
/// A "symbolic name" typically corresponds to property names and property
/// value aliases. Note, though, that it should not be applied to property
/// string values.
///
/// Normalization drops a leading "is" prefix (in any ASCII case), removes
/// spaces, underscores and hyphens, lowercases ASCII letters and discards
/// every non-ASCII byte. The result is written to the front of `slice` and
/// the returned sub-slice covers exactly the normalized bytes.
///
/// The slice returned is guaranteed to be valid UTF-8 for all possible values
/// of `slice`.
///
/// See: <https://unicode.org/reports/tr44/#UAX44-LM3>
fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
    // I couldn't find a place in the standard that specified that property
    // names/aliases had a particular structure (unlike character names), but
    // we assume that it's ASCII only and drop anything that isn't ASCII.

    // Ignore any "is" prefix, regardless of case.
    let starts_with_is =
        slice.len() >= 2 && slice[..2].eq_ignore_ascii_case(b"is");
    let start = if starts_with_is { 2 } else { 0 };

    // `next_write` never overtakes the read index `i`, so compacting in
    // place cannot clobber bytes that have not been read yet.
    let mut next_write = 0;
    for i in start..slice.len() {
        // VALIDITY ARGUMENT: To guarantee that the resulting slice is valid
        // UTF-8, we ensure that the slice contains only ASCII bytes. In
        // particular, we drop every non-ASCII byte from the normalized string.
        let b = slice[i];
        if b == b' ' || b == b'_' || b == b'-' || !b.is_ascii() {
            continue;
        }
        slice[next_write] = b.to_ascii_lowercase();
        next_write += 1;
    }
    // Special case: ISO_Comment has a 'isc' abbreviation. Since we generally
    // ignore 'is' prefixes, the 'isc' abbreviation gets caught in the cross
    // fire and ends up creating an alias for 'c' to 'ISO_Comment', but it
    // is actually an alias for the 'Other' general category.
    //
    // The slice is at least three bytes long here: the two-byte prefix was
    // skipped and one more byte was written.
    if starts_with_is && next_write == 1 && slice[0] == b'c' {
        slice[..3].copy_from_slice(b"isc");
        next_write = 3;
    }
    &mut slice[..next_write]
}
871 | ||
#[cfg(test)]
mod tests {
    use super::{
        contains_simple_case_mapping, simple_fold, symbolic_name_normalize,
        symbolic_name_normalize_bytes,
    };

    /// Unwraps `simple_fold` for a character that is expected to have a
    /// simple case folding, yielding the other members of its orbit.
    #[cfg(feature = "unicode-case")]
    fn simple_fold_ok(c: char) -> impl Iterator<Item = char> {
        simple_fold(c).unwrap().unwrap()
    }

    /// Unwraps `simple_fold` for a character that is expected to have no
    /// simple case folding, yielding the `Err` payload instead.
    #[cfg(feature = "unicode-case")]
    fn simple_fold_err(c: char) -> Option<char> {
        match simple_fold(c).unwrap() {
            Ok(_) => unreachable!("simple_fold returned Ok iterator"),
            Err(next) => next,
        }
    }

    /// Unwraps `contains_simple_case_mapping` when the tables are available.
    #[cfg(feature = "unicode-case")]
    fn contains_case_map(start: char, end: char) -> bool {
        contains_simple_case_mapping(start, end).unwrap()
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_k() {
        // The folding orbit of 'k' is {k, K, U+212A KELVIN SIGN}. The Kelvin
        // sign is written as an escape because it is visually
        // indistinguishable from an ASCII 'K' (and easily flattened to it).
        let xs: Vec<char> = simple_fold_ok('k').collect();
        assert_eq!(xs, vec!['K', '\u{212A}']);

        let xs: Vec<char> = simple_fold_ok('K').collect();
        assert_eq!(xs, vec!['k', '\u{212A}']);

        let xs: Vec<char> = simple_fold_ok('\u{212A}').collect();
        assert_eq!(xs, vec!['K', 'k']);
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_a() {
        let xs: Vec<char> = simple_fold_ok('a').collect();
        assert_eq!(xs, vec!['A']);

        let xs: Vec<char> = simple_fold_ok('A').collect();
        assert_eq!(xs, vec!['a']);
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_empty() {
        assert_eq!(Some('A'), simple_fold_err('?'));
        assert_eq!(Some('A'), simple_fold_err('@'));
        assert_eq!(Some('a'), simple_fold_err('['));
        assert_eq!(Some('Ⰰ'), simple_fold_err('☃'));
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_max() {
        assert_eq!(None, simple_fold_err('\u{10FFFE}'));
        assert_eq!(None, simple_fold_err('\u{10FFFF}'));
    }

    #[test]
    #[cfg(not(feature = "unicode-case"))]
    fn simple_fold_disabled() {
        assert!(simple_fold('a').is_err());
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn range_contains() {
        assert!(contains_case_map('A', 'A'));
        assert!(contains_case_map('Z', 'Z'));
        assert!(contains_case_map('A', 'Z'));
        assert!(contains_case_map('@', 'A'));
        assert!(contains_case_map('Z', '['));
        assert!(contains_case_map('☃', 'Ⰰ'));

        assert!(!contains_case_map('[', '['));
        assert!(!contains_case_map('[', '`'));

        assert!(!contains_case_map('☃', '☃'));
    }

    #[test]
    #[cfg(not(feature = "unicode-case"))]
    fn range_contains_disabled() {
        assert!(contains_simple_case_mapping('a', 'a').is_err());
    }

    #[test]
    #[cfg(feature = "unicode-gencat")]
    fn regression_466() {
        use super::{CanonicalClassQuery, ClassQuery};

        let q = ClassQuery::OneLetter('C');
        assert_eq!(
            q.canonicalize().unwrap(),
            CanonicalClassQuery::GeneralCategory("Other")
        );
    }

    #[test]
    fn sym_normalize() {
        let sym_norm = symbolic_name_normalize;

        assert_eq!(sym_norm("Line_Break"), "linebreak");
        assert_eq!(sym_norm("Line-break"), "linebreak");
        assert_eq!(sym_norm("linebreak"), "linebreak");
        assert_eq!(sym_norm("BA"), "ba");
        assert_eq!(sym_norm("ba"), "ba");
        assert_eq!(sym_norm("Greek"), "greek");
        assert_eq!(sym_norm("isGreek"), "greek");
        assert_eq!(sym_norm("IS_Greek"), "greek");
        assert_eq!(sym_norm("isc"), "isc");
        assert_eq!(sym_norm("is c"), "isc");
        assert_eq!(sym_norm("is_c"), "isc");
    }

    #[test]
    fn valid_utf8_symbolic() {
        // The invalid byte must be dropped so the result stays valid UTF-8.
        let mut x = b"abc\xFFxyz".to_vec();
        let y = symbolic_name_normalize_bytes(&mut x);
        assert_eq!(y, b"abcxyz");
    }
}