]>
Commit | Line | Data |
---|---|---|
f9f354fc XL |
1 | use std::error; |
2 | use std::fmt; | |
0531ce1d XL |
3 | use std::result; |
4 | ||
0531ce1d | 5 | use hir; |
f9f354fc XL |
6 | |
7 | /// A type alias for errors specific to Unicode handling of classes. | |
8 | pub type Result<T> = result::Result<T, Error>; | |
9 | ||
/// A table of inclusive codepoint ranges. The tables come from a generated
/// file, which is why the slice has a static lifetime.
type Range = &'static [(char, char)];
0531ce1d XL |
13 | |
/// The ways in which a Unicode class lookup can fail.
///
/// The `Error` trait is intentionally not implemented for this type: it isn't
/// exported, and values of it are always converted into other public errors.
#[derive(Debug)]
pub enum Error {
    /// A property could not be found.
    PropertyNotFound,
    /// A property value could not be found.
    PropertyValueNotFound,
    // This variant goes unused whenever unicode-perl is enabled.
    #[allow(dead_code)]
    PerlClassNotFound,
}
26 | ||
f9f354fc XL |
27 | /// A type alias for errors specific to Unicode case folding. |
28 | pub type FoldResult<T> = result::Result<T, CaseFoldError>; | |
29 | ||
/// The error reported when Unicode-aware simple case folding can't be done.
///
/// Case folding needs its case mapping tables. Those tables are missing only
/// when the `unicode-case` feature is disabled. (It is enabled by default.)
#[derive(Debug)]
pub struct CaseFoldError(());

impl error::Error for CaseFoldError {}

impl fmt::Display for CaseFoldError {
    /// Writes a fixed, human readable explanation of the failure.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // The message has no format arguments, so it is written verbatim.
        f.write_str(
            "Unicode-aware case folding is not available \
             (probably because the unicode-case feature is not enabled)",
        )
    }
}
49 | ||
/// The error reported when the Unicode-aware `\w` class can't be used.
///
/// The Unicode aware Perl class `\w` needs its data tables. Those tables are
/// missing only when the `unicode-perl` feature is disabled. (It is enabled
/// by default.)
#[derive(Debug)]
pub struct UnicodeWordError(());

impl error::Error for UnicodeWordError {}

impl fmt::Display for UnicodeWordError {
    /// Writes a fixed, human readable explanation of the failure.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // The message has no format arguments, so it is written verbatim.
        f.write_str(
            "Unicode-aware \\w class is not available \
             (probably because the unicode-perl feature is not enabled)",
        )
    }
}
69 | ||
70 | /// Return an iterator over the equivalence class of simple case mappings | |
71 | /// for the given codepoint. The equivalence class does not include the | |
72 | /// given codepoint. | |
73 | /// | |
74 | /// If the equivalence class is empty, then this returns the next scalar | |
75 | /// value that has a non-empty equivalence class, if it exists. If no such | |
76 | /// scalar value exists, then `None` is returned. The point of this behavior | |
77 | /// is to permit callers to avoid calling `simple_fold` more than they need | |
78 | /// to, since there is some cost to fetching the equivalence class. | |
f9f354fc XL |
79 | /// |
80 | /// This returns an error if the Unicode case folding tables are not available. | |
81 | pub fn simple_fold( | |
82 | c: char, | |
83 | ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>> { | |
84 | #[cfg(not(feature = "unicode-case"))] | |
85 | fn imp( | |
86 | _: char, | |
87 | ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>> | |
88 | { | |
89 | use std::option::IntoIter; | |
90 | Err::<result::Result<IntoIter<char>, _>, _>(CaseFoldError(())) | |
91 | } | |
92 | ||
93 | #[cfg(feature = "unicode-case")] | |
94 | fn imp( | |
95 | c: char, | |
96 | ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>> | |
97 | { | |
98 | use unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; | |
99 | ||
100 | Ok(CASE_FOLDING_SIMPLE | |
101 | .binary_search_by_key(&c, |&(c1, _)| c1) | |
102 | .map(|i| CASE_FOLDING_SIMPLE[i].1.iter().map(|&c| c)) | |
103 | .map_err(|i| { | |
104 | if i >= CASE_FOLDING_SIMPLE.len() { | |
105 | None | |
106 | } else { | |
107 | Some(CASE_FOLDING_SIMPLE[i].0) | |
108 | } | |
109 | })) | |
110 | } | |
111 | ||
112 | imp(c) | |
0531ce1d XL |
113 | } |
114 | ||
115 | /// Returns true if and only if the given (inclusive) range contains at least | |
116 | /// one Unicode scalar value that has a non-empty non-trivial simple case | |
117 | /// mapping. | |
118 | /// | |
119 | /// This function panics if `end < start`. | |
f9f354fc XL |
120 | /// |
121 | /// This returns an error if the Unicode case folding tables are not available. | |
122 | pub fn contains_simple_case_mapping( | |
123 | start: char, | |
124 | end: char, | |
125 | ) -> FoldResult<bool> { | |
126 | #[cfg(not(feature = "unicode-case"))] | |
127 | fn imp(_: char, _: char) -> FoldResult<bool> { | |
128 | Err(CaseFoldError(())) | |
129 | } | |
130 | ||
131 | #[cfg(feature = "unicode-case")] | |
132 | fn imp(start: char, end: char) -> FoldResult<bool> { | |
133 | use std::cmp::Ordering; | |
134 | use unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; | |
135 | ||
136 | assert!(start <= end); | |
137 | Ok(CASE_FOLDING_SIMPLE | |
138 | .binary_search_by(|&(c, _)| { | |
139 | if start <= c && c <= end { | |
140 | Ordering::Equal | |
141 | } else if c > end { | |
142 | Ordering::Greater | |
143 | } else { | |
144 | Ordering::Less | |
145 | } | |
146 | }) | |
147 | .is_ok()) | |
148 | } | |
149 | ||
150 | imp(start, end) | |
0531ce1d XL |
151 | } |
152 | ||
/// A request for a character class defined by Unicode.
///
/// A class can be requested either by a property name alone or by a property
/// name paired with a property value. The name-only form generally refers to
/// Binary properties (see UTS#44, Table 8). As a special exception (see
/// UTS#18, Section 1.2), general categories (an enumeration) and scripts (a
/// catalog) are also accepted in that form, as if each of their possible
/// values were a binary property.
///
/// Property names and values are always normalized and canonicalized. That
/// is, `GC == gc == GeneralCategory == general_category`.
///
/// The lifetime `'a` is the shorter of the lifetimes of the property name and
/// the property value.
#[derive(Debug)]
pub enum ClassQuery<'a> {
    /// Request the class for a Unicode binary property that is named by a
    /// single letter.
    OneLetter(char),
    /// Request the class for a Unicode binary property.
    ///
    /// By special exception (see UTS#18, Section 1.2), general category
    /// values and script values are also permitted here, as if they were
    /// binary properties.
    Binary(&'a str),
    /// Request the class of all codepoints whose property (given by
    /// `property_name`) has the given value (given by `property_value`).
    ByValue {
        /// The name of the property.
        property_name: &'a str,
        /// The value of the property.
        property_value: &'a str,
    },
}
186 | ||
187 | impl<'a> ClassQuery<'a> { | |
188 | fn canonicalize(&self) -> Result<CanonicalClassQuery> { | |
189 | match *self { | |
190 | ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()), | |
191 | ClassQuery::Binary(name) => self.canonical_binary(name), | |
192 | ClassQuery::ByValue { property_name, property_value } => { | |
f9f354fc XL |
193 | let property_name = symbolic_name_normalize(property_name); |
194 | let property_value = symbolic_name_normalize(property_value); | |
0531ce1d | 195 | |
f9f354fc | 196 | let canon_name = match canonical_prop(&property_name)? { |
0531ce1d XL |
197 | None => return Err(Error::PropertyNotFound), |
198 | Some(canon_name) => canon_name, | |
199 | }; | |
200 | Ok(match canon_name { | |
201 | "General_Category" => { | |
f9f354fc | 202 | let canon = match canonical_gencat(&property_value)? { |
0531ce1d XL |
203 | None => return Err(Error::PropertyValueNotFound), |
204 | Some(canon) => canon, | |
205 | }; | |
206 | CanonicalClassQuery::GeneralCategory(canon) | |
207 | } | |
208 | "Script" => { | |
f9f354fc | 209 | let canon = match canonical_script(&property_value)? { |
0531ce1d XL |
210 | None => return Err(Error::PropertyValueNotFound), |
211 | Some(canon) => canon, | |
212 | }; | |
213 | CanonicalClassQuery::Script(canon) | |
214 | } | |
215 | _ => { | |
f9f354fc | 216 | let vals = match property_values(canon_name)? { |
0531ce1d XL |
217 | None => return Err(Error::PropertyValueNotFound), |
218 | Some(vals) => vals, | |
219 | }; | |
f9f354fc XL |
220 | let canon_val = |
221 | match canonical_value(vals, &property_value) { | |
222 | None => { | |
223 | return Err(Error::PropertyValueNotFound) | |
224 | } | |
225 | Some(canon_val) => canon_val, | |
226 | }; | |
0531ce1d XL |
227 | CanonicalClassQuery::ByValue { |
228 | property_name: canon_name, | |
229 | property_value: canon_val, | |
230 | } | |
231 | } | |
232 | }) | |
233 | } | |
234 | } | |
235 | } | |
236 | ||
237 | fn canonical_binary(&self, name: &str) -> Result<CanonicalClassQuery> { | |
f9f354fc | 238 | let norm = symbolic_name_normalize(name); |
0531ce1d | 239 | |
f9f354fc | 240 | if let Some(canon) = canonical_prop(&norm)? { |
0531ce1d XL |
241 | return Ok(CanonicalClassQuery::Binary(canon)); |
242 | } | |
f9f354fc | 243 | if let Some(canon) = canonical_gencat(&norm)? { |
0531ce1d XL |
244 | return Ok(CanonicalClassQuery::GeneralCategory(canon)); |
245 | } | |
f9f354fc | 246 | if let Some(canon) = canonical_script(&norm)? { |
0531ce1d XL |
247 | return Ok(CanonicalClassQuery::Script(canon)); |
248 | } | |
249 | Err(Error::PropertyNotFound) | |
250 | } | |
251 | } | |
252 | ||
/// A `ClassQuery` after canonicalization of its parameters. Unlike
/// `ClassQuery`, this tells binary properties apart from flattened general
/// categories and scripts.
#[derive(Debug, Eq, PartialEq)]
enum CanonicalClassQuery {
    /// A binary property, given by its canonical name.
    Binary(&'static str),
    /// A general category, given by its canonical name.
    GeneralCategory(&'static str),
    /// A script, given by its canonical name.
    Script(&'static str),
    /// Some other pairing of a property with a value, where both have been
    /// canonicalized.
    ///
    /// By construction, the property name here is never General_Category or
    /// Script. Those two cases are covered by the variants of the same name.
    ByValue {
        /// The property name in canonical form.
        property_name: &'static str,
        /// The property value in canonical form.
        property_value: &'static str,
    },
}
277 | ||
278 | /// Looks up a Unicode class given a query. If one doesn't exist, then | |
279 | /// `None` is returned. | |
f9f354fc | 280 | pub fn class(query: ClassQuery) -> Result<hir::ClassUnicode> { |
0531ce1d XL |
281 | use self::CanonicalClassQuery::*; |
282 | ||
94b46f34 | 283 | match query.canonicalize()? { |
f9f354fc XL |
284 | Binary(name) => bool_property(name), |
285 | GeneralCategory(name) => gencat(name), | |
286 | Script(name) => script(name), | |
0531ce1d XL |
287 | ByValue { property_name: "Age", property_value } => { |
288 | let mut class = hir::ClassUnicode::empty(); | |
94b46f34 | 289 | for set in ages(property_value)? { |
0531ce1d XL |
290 | class.union(&hir_class(set)); |
291 | } | |
292 | Ok(class) | |
293 | } | |
294 | ByValue { property_name: "Script_Extensions", property_value } => { | |
f9f354fc | 295 | script_extension(property_value) |
0731742a | 296 | } |
f9f354fc XL |
297 | ByValue { |
298 | property_name: "Grapheme_Cluster_Break", | |
299 | property_value, | |
300 | } => gcb(property_value), | |
0731742a | 301 | ByValue { property_name: "Sentence_Break", property_value } => { |
f9f354fc | 302 | sb(property_value) |
0731742a XL |
303 | } |
304 | ByValue { property_name: "Word_Break", property_value } => { | |
f9f354fc | 305 | wb(property_value) |
0731742a | 306 | } |
0531ce1d XL |
307 | _ => { |
308 | // What else should we support? | |
309 | Err(Error::PropertyNotFound) | |
310 | } | |
311 | } | |
312 | } | |
313 | ||
f9f354fc XL |
314 | /// Returns a Unicode aware class for \w. |
315 | /// | |
316 | /// This returns an error if the data is not available for \w. | |
317 | pub fn perl_word() -> Result<hir::ClassUnicode> { | |
318 | #[cfg(not(feature = "unicode-perl"))] | |
319 | fn imp() -> Result<hir::ClassUnicode> { | |
320 | Err(Error::PerlClassNotFound) | |
321 | } | |
322 | ||
323 | #[cfg(feature = "unicode-perl")] | |
324 | fn imp() -> Result<hir::ClassUnicode> { | |
325 | use unicode_tables::perl_word::PERL_WORD; | |
326 | Ok(hir_class(PERL_WORD)) | |
327 | } | |
328 | ||
329 | imp() | |
330 | } | |
331 | ||
332 | /// Returns a Unicode aware class for \s. | |
333 | /// | |
334 | /// This returns an error if the data is not available for \s. | |
335 | pub fn perl_space() -> Result<hir::ClassUnicode> { | |
336 | #[cfg(not(any(feature = "unicode-perl", feature = "unicode-bool")))] | |
337 | fn imp() -> Result<hir::ClassUnicode> { | |
338 | Err(Error::PerlClassNotFound) | |
339 | } | |
340 | ||
341 | #[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))] | |
342 | fn imp() -> Result<hir::ClassUnicode> { | |
343 | use unicode_tables::perl_space::WHITE_SPACE; | |
344 | Ok(hir_class(WHITE_SPACE)) | |
345 | } | |
346 | ||
347 | #[cfg(feature = "unicode-bool")] | |
348 | fn imp() -> Result<hir::ClassUnicode> { | |
349 | use unicode_tables::property_bool::WHITE_SPACE; | |
350 | Ok(hir_class(WHITE_SPACE)) | |
351 | } | |
352 | ||
353 | imp() | |
354 | } | |
355 | ||
356 | /// Returns a Unicode aware class for \d. | |
357 | /// | |
358 | /// This returns an error if the data is not available for \d. | |
359 | pub fn perl_digit() -> Result<hir::ClassUnicode> { | |
360 | #[cfg(not(any(feature = "unicode-perl", feature = "unicode-gencat")))] | |
361 | fn imp() -> Result<hir::ClassUnicode> { | |
362 | Err(Error::PerlClassNotFound) | |
363 | } | |
364 | ||
365 | #[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))] | |
366 | fn imp() -> Result<hir::ClassUnicode> { | |
367 | use unicode_tables::perl_decimal::DECIMAL_NUMBER; | |
368 | Ok(hir_class(DECIMAL_NUMBER)) | |
369 | } | |
370 | ||
371 | #[cfg(feature = "unicode-gencat")] | |
372 | fn imp() -> Result<hir::ClassUnicode> { | |
373 | use unicode_tables::general_category::DECIMAL_NUMBER; | |
374 | Ok(hir_class(DECIMAL_NUMBER)) | |
375 | } | |
376 | ||
377 | imp() | |
378 | } | |
379 | ||
0531ce1d XL |
380 | /// Build a Unicode HIR class from a sequence of Unicode scalar value ranges. |
381 | pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode { | |
382 | let hir_ranges: Vec<hir::ClassUnicodeRange> = ranges | |
383 | .iter() | |
384 | .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)) | |
385 | .collect(); | |
386 | hir::ClassUnicode::new(hir_ranges) | |
387 | } | |
388 | ||
f9f354fc XL |
389 | /// Returns true only if the given codepoint is in the `\w` character class. |
390 | /// | |
391 | /// If the `unicode-perl` feature is not enabled, then this returns an error. | |
392 | pub fn is_word_character(c: char) -> result::Result<bool, UnicodeWordError> { | |
393 | #[cfg(not(feature = "unicode-perl"))] | |
394 | fn imp(_: char) -> result::Result<bool, UnicodeWordError> { | |
395 | Err(UnicodeWordError(())) | |
396 | } | |
397 | ||
398 | #[cfg(feature = "unicode-perl")] | |
399 | fn imp(c: char) -> result::Result<bool, UnicodeWordError> { | |
400 | use is_word_byte; | |
401 | use std::cmp::Ordering; | |
402 | use unicode_tables::perl_word::PERL_WORD; | |
403 | ||
404 | if c <= 0x7F as char && is_word_byte(c as u8) { | |
405 | return Ok(true); | |
406 | } | |
407 | Ok(PERL_WORD | |
408 | .binary_search_by(|&(start, end)| { | |
409 | if start <= c && c <= end { | |
410 | Ordering::Equal | |
411 | } else if start > c { | |
412 | Ordering::Greater | |
413 | } else { | |
414 | Ordering::Less | |
415 | } | |
416 | }) | |
417 | .is_ok()) | |
418 | } | |
419 | ||
420 | imp(c) | |
0531ce1d XL |
421 | } |
422 | ||
f9f354fc XL |
/// The table of values belonging to one particular property.
///
/// Each entry is a pair: first the normalized property value, and second the
/// canonical property value that it corresponds to.
type PropertyValues = &'static [(&'static str, &'static str)];
429 | ||
430 | fn canonical_gencat(normalized_value: &str) -> Result<Option<&'static str>> { | |
431 | Ok(match normalized_value { | |
0531ce1d XL |
432 | "any" => Some("Any"), |
433 | "assigned" => Some("Assigned"), | |
434 | "ascii" => Some("ASCII"), | |
435 | _ => { | |
f9f354fc | 436 | let gencats = property_values("General_Category")?.unwrap(); |
0531ce1d XL |
437 | canonical_value(gencats, normalized_value) |
438 | } | |
f9f354fc XL |
439 | }) |
440 | } | |
441 | ||
442 | fn canonical_script(normalized_value: &str) -> Result<Option<&'static str>> { | |
443 | let scripts = property_values("Script")?.unwrap(); | |
444 | Ok(canonical_value(scripts, normalized_value)) | |
0531ce1d XL |
445 | } |
446 | ||
f9f354fc XL |
447 | /// Find the canonical property name for the given normalized property name. |
448 | /// | |
449 | /// If no such property exists, then `None` is returned. | |
450 | /// | |
451 | /// The normalized property name must have been normalized according to | |
452 | /// UAX44 LM3, which can be done using `symbolic_name_normalize`. | |
453 | /// | |
454 | /// If the property names data is not available, then an error is returned. | |
455 | fn canonical_prop(normalized_name: &str) -> Result<Option<&'static str>> { | |
456 | #[cfg(not(any( | |
457 | feature = "unicode-age", | |
458 | feature = "unicode-bool", | |
459 | feature = "unicode-gencat", | |
460 | feature = "unicode-perl", | |
461 | feature = "unicode-script", | |
462 | feature = "unicode-segment", | |
463 | )))] | |
464 | fn imp(_: &str) -> Result<Option<&'static str>> { | |
465 | Err(Error::PropertyNotFound) | |
466 | } | |
467 | ||
468 | #[cfg(any( | |
469 | feature = "unicode-age", | |
470 | feature = "unicode-bool", | |
471 | feature = "unicode-gencat", | |
472 | feature = "unicode-perl", | |
473 | feature = "unicode-script", | |
474 | feature = "unicode-segment", | |
475 | ))] | |
476 | fn imp(name: &str) -> Result<Option<&'static str>> { | |
477 | use unicode_tables::property_names::PROPERTY_NAMES; | |
478 | ||
479 | Ok(PROPERTY_NAMES | |
480 | .binary_search_by_key(&name, |&(n, _)| n) | |
481 | .ok() | |
482 | .map(|i| PROPERTY_NAMES[i].1)) | |
483 | } | |
484 | ||
485 | imp(normalized_name) | |
0531ce1d XL |
486 | } |
487 | ||
f9f354fc XL |
488 | /// Find the canonical property value for the given normalized property |
489 | /// value. | |
490 | /// | |
491 | /// The given property values should correspond to the values for the property | |
492 | /// under question, which can be found using `property_values`. | |
493 | /// | |
494 | /// If no such property value exists, then `None` is returned. | |
495 | /// | |
496 | /// The normalized property value must have been normalized according to | |
497 | /// UAX44 LM3, which can be done using `symbolic_name_normalize`. | |
0531ce1d XL |
498 | fn canonical_value( |
499 | vals: PropertyValues, | |
500 | normalized_value: &str, | |
501 | ) -> Option<&'static str> { | |
f9f354fc XL |
502 | vals.binary_search_by_key(&normalized_value, |&(n, _)| n) |
503 | .ok() | |
504 | .map(|i| vals[i].1) | |
0531ce1d XL |
505 | } |
506 | ||
f9f354fc XL |
507 | /// Return the table of property values for the given property name. |
508 | /// | |
509 | /// If the property values data is not available, then an error is returned. | |
0531ce1d XL |
510 | fn property_values( |
511 | canonical_property_name: &'static str, | |
f9f354fc XL |
512 | ) -> Result<Option<PropertyValues>> { |
513 | #[cfg(not(any( | |
514 | feature = "unicode-age", | |
515 | feature = "unicode-bool", | |
516 | feature = "unicode-gencat", | |
517 | feature = "unicode-perl", | |
518 | feature = "unicode-script", | |
519 | feature = "unicode-segment", | |
520 | )))] | |
521 | fn imp(_: &'static str) -> Result<Option<PropertyValues>> { | |
522 | Err(Error::PropertyValueNotFound) | |
523 | } | |
524 | ||
525 | #[cfg(any( | |
526 | feature = "unicode-age", | |
527 | feature = "unicode-bool", | |
528 | feature = "unicode-gencat", | |
529 | feature = "unicode-perl", | |
530 | feature = "unicode-script", | |
531 | feature = "unicode-segment", | |
532 | ))] | |
533 | fn imp(name: &'static str) -> Result<Option<PropertyValues>> { | |
534 | use unicode_tables::property_values::PROPERTY_VALUES; | |
535 | ||
536 | Ok(PROPERTY_VALUES | |
537 | .binary_search_by_key(&name, |&(n, _)| n) | |
538 | .ok() | |
539 | .map(|i| PROPERTY_VALUES[i].1)) | |
540 | } | |
541 | ||
542 | imp(canonical_property_name) | |
0531ce1d XL |
543 | } |
544 | ||
f9f354fc XL |
545 | // This is only used in some cases, but small enough to just let it be dead |
546 | // instead of figuring out (and maintaining) the right set of features. | |
547 | #[allow(dead_code)] | |
0531ce1d | 548 | fn property_set( |
f9f354fc | 549 | name_map: &'static [(&'static str, Range)], |
0531ce1d | 550 | canonical: &'static str, |
f9f354fc | 551 | ) -> Option<Range> { |
0531ce1d XL |
552 | name_map |
553 | .binary_search_by_key(&canonical, |x| x.0) | |
554 | .ok() | |
555 | .map(|i| name_map[i].1) | |
556 | } | |
557 | ||
f9f354fc XL |
558 | /// Returns an iterator over Unicode Age sets. Each item corresponds to a set |
559 | /// of codepoints that were added in a particular revision of Unicode. The | |
0531ce1d | 560 | /// iterator yields items in chronological order. |
f9f354fc XL |
561 | /// |
562 | /// If the given age value isn't valid or if the data isn't available, then an | |
563 | /// error is returned instead. | |
564 | fn ages(canonical_age: &str) -> Result<impl Iterator<Item = Range>> { | |
565 | #[cfg(not(feature = "unicode-age"))] | |
566 | fn imp(_: &str) -> Result<impl Iterator<Item = Range>> { | |
567 | use std::option::IntoIter; | |
568 | Err::<IntoIter<Range>, _>(Error::PropertyNotFound) | |
569 | } | |
570 | ||
571 | #[cfg(feature = "unicode-age")] | |
572 | fn imp(canonical_age: &str) -> Result<impl Iterator<Item = Range>> { | |
573 | use unicode_tables::age; | |
574 | ||
575 | const AGES: &'static [(&'static str, Range)] = &[ | |
576 | ("V1_1", age::V1_1), | |
577 | ("V2_0", age::V2_0), | |
578 | ("V2_1", age::V2_1), | |
579 | ("V3_0", age::V3_0), | |
580 | ("V3_1", age::V3_1), | |
581 | ("V3_2", age::V3_2), | |
582 | ("V4_0", age::V4_0), | |
583 | ("V4_1", age::V4_1), | |
584 | ("V5_0", age::V5_0), | |
585 | ("V5_1", age::V5_1), | |
586 | ("V5_2", age::V5_2), | |
587 | ("V6_0", age::V6_0), | |
588 | ("V6_1", age::V6_1), | |
589 | ("V6_2", age::V6_2), | |
590 | ("V6_3", age::V6_3), | |
591 | ("V7_0", age::V7_0), | |
592 | ("V8_0", age::V8_0), | |
593 | ("V9_0", age::V9_0), | |
594 | ("V10_0", age::V10_0), | |
595 | ("V11_0", age::V11_0), | |
596 | ("V12_0", age::V12_0), | |
597 | ("V12_1", age::V12_1), | |
598 | ("V13_0", age::V13_0), | |
599 | ]; | |
600 | assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync"); | |
601 | ||
602 | let pos = AGES.iter().position(|&(age, _)| canonical_age == age); | |
603 | match pos { | |
604 | None => Err(Error::PropertyValueNotFound), | |
605 | Some(i) => Ok(AGES[..i + 1].iter().map(|&(_, classes)| classes)), | |
606 | } | |
607 | } | |
608 | ||
609 | imp(canonical_age) | |
610 | } | |
611 | ||
612 | /// Returns the Unicode HIR class corresponding to the given general category. | |
613 | /// | |
614 | /// Name canonicalization is assumed to be performed by the caller. | |
615 | /// | |
616 | /// If the given general category could not be found, or if the general | |
617 | /// category data is not available, then an error is returned. | |
618 | fn gencat(canonical_name: &'static str) -> Result<hir::ClassUnicode> { | |
619 | #[cfg(not(feature = "unicode-gencat"))] | |
620 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { | |
621 | Err(Error::PropertyNotFound) | |
622 | } | |
623 | ||
624 | #[cfg(feature = "unicode-gencat")] | |
625 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { | |
626 | use unicode_tables::general_category::BY_NAME; | |
627 | match name { | |
628 | "ASCII" => Ok(hir_class(&[('\0', '\x7F')])), | |
629 | "Any" => Ok(hir_class(&[('\0', '\u{10FFFF}')])), | |
630 | "Assigned" => { | |
631 | let mut cls = gencat("Unassigned")?; | |
632 | cls.negate(); | |
633 | Ok(cls) | |
634 | } | |
635 | name => property_set(BY_NAME, name) | |
636 | .map(hir_class) | |
637 | .ok_or(Error::PropertyValueNotFound), | |
638 | } | |
639 | } | |
640 | ||
641 | match canonical_name { | |
642 | "Decimal_Number" => perl_digit(), | |
643 | name => imp(name), | |
644 | } | |
645 | } | |
646 | ||
647 | /// Returns the Unicode HIR class corresponding to the given script. | |
648 | /// | |
649 | /// Name canonicalization is assumed to be performed by the caller. | |
650 | /// | |
651 | /// If the given script could not be found, or if the script data is not | |
652 | /// available, then an error is returned. | |
653 | fn script(canonical_name: &'static str) -> Result<hir::ClassUnicode> { | |
654 | #[cfg(not(feature = "unicode-script"))] | |
655 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { | |
656 | Err(Error::PropertyNotFound) | |
657 | } | |
658 | ||
659 | #[cfg(feature = "unicode-script")] | |
660 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { | |
661 | use unicode_tables::script::BY_NAME; | |
662 | property_set(BY_NAME, name) | |
663 | .map(hir_class) | |
664 | .ok_or(Error::PropertyValueNotFound) | |
665 | } | |
666 | ||
667 | imp(canonical_name) | |
668 | } | |
669 | ||
670 | /// Returns the Unicode HIR class corresponding to the given script extension. | |
671 | /// | |
672 | /// Name canonicalization is assumed to be performed by the caller. | |
673 | /// | |
674 | /// If the given script extension could not be found, or if the script data is | |
675 | /// not available, then an error is returned. | |
676 | fn script_extension( | |
677 | canonical_name: &'static str, | |
678 | ) -> Result<hir::ClassUnicode> { | |
679 | #[cfg(not(feature = "unicode-script"))] | |
680 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { | |
681 | Err(Error::PropertyNotFound) | |
682 | } | |
683 | ||
684 | #[cfg(feature = "unicode-script")] | |
685 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { | |
686 | use unicode_tables::script_extension::BY_NAME; | |
687 | property_set(BY_NAME, name) | |
688 | .map(hir_class) | |
689 | .ok_or(Error::PropertyValueNotFound) | |
690 | } | |
691 | ||
692 | imp(canonical_name) | |
693 | } | |
694 | ||
695 | /// Returns the Unicode HIR class corresponding to the given Unicode boolean | |
696 | /// property. | |
697 | /// | |
698 | /// Name canonicalization is assumed to be performed by the caller. | |
699 | /// | |
700 | /// If the given boolean property could not be found, or if the boolean | |
701 | /// property data is not available, then an error is returned. | |
702 | fn bool_property(canonical_name: &'static str) -> Result<hir::ClassUnicode> { | |
703 | #[cfg(not(feature = "unicode-bool"))] | |
704 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { | |
705 | Err(Error::PropertyNotFound) | |
706 | } | |
707 | ||
708 | #[cfg(feature = "unicode-bool")] | |
709 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { | |
710 | use unicode_tables::property_bool::BY_NAME; | |
711 | property_set(BY_NAME, name) | |
712 | .map(hir_class) | |
713 | .ok_or(Error::PropertyNotFound) | |
714 | } | |
715 | ||
716 | match canonical_name { | |
717 | "Decimal_Number" => perl_digit(), | |
718 | "White_Space" => perl_space(), | |
719 | name => imp(name), | |
720 | } | |
721 | } | |
722 | ||
723 | /// Returns the Unicode HIR class corresponding to the given grapheme cluster | |
724 | /// break property. | |
725 | /// | |
726 | /// Name canonicalization is assumed to be performed by the caller. | |
727 | /// | |
728 | /// If the given property could not be found, or if the corresponding data is | |
729 | /// not available, then an error is returned. | |
730 | fn gcb(canonical_name: &'static str) -> Result<hir::ClassUnicode> { | |
731 | #[cfg(not(feature = "unicode-segment"))] | |
732 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { | |
733 | Err(Error::PropertyNotFound) | |
734 | } | |
735 | ||
736 | #[cfg(feature = "unicode-segment")] | |
737 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { | |
738 | use unicode_tables::grapheme_cluster_break::BY_NAME; | |
739 | property_set(BY_NAME, name) | |
740 | .map(hir_class) | |
741 | .ok_or(Error::PropertyValueNotFound) | |
742 | } | |
743 | ||
744 | imp(canonical_name) | |
745 | } | |
746 | ||
747 | /// Returns the Unicode HIR class corresponding to the given word break | |
748 | /// property. | |
749 | /// | |
750 | /// Name canonicalization is assumed to be performed by the caller. | |
751 | /// | |
752 | /// If the given property could not be found, or if the corresponding data is | |
753 | /// not available, then an error is returned. | |
754 | fn wb(canonical_name: &'static str) -> Result<hir::ClassUnicode> { | |
755 | #[cfg(not(feature = "unicode-segment"))] | |
756 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { | |
757 | Err(Error::PropertyNotFound) | |
758 | } | |
759 | ||
760 | #[cfg(feature = "unicode-segment")] | |
761 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { | |
762 | use unicode_tables::word_break::BY_NAME; | |
763 | property_set(BY_NAME, name) | |
764 | .map(hir_class) | |
765 | .ok_or(Error::PropertyValueNotFound) | |
766 | } | |
767 | ||
768 | imp(canonical_name) | |
0531ce1d XL |
769 | } |
770 | ||
f9f354fc XL |
771 | /// Returns the Unicode HIR class corresponding to the given sentence |
772 | /// break property. | |
773 | /// | |
774 | /// Name canonicalization is assumed to be performed by the caller. | |
775 | /// | |
776 | /// If the given property could not be found, or if the corresponding data is | |
777 | /// not available, then an error is returned. | |
778 | fn sb(canonical_name: &'static str) -> Result<hir::ClassUnicode> { | |
779 | #[cfg(not(feature = "unicode-segment"))] | |
780 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { | |
781 | Err(Error::PropertyNotFound) | |
782 | } | |
783 | ||
784 | #[cfg(feature = "unicode-segment")] | |
785 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { | |
786 | use unicode_tables::sentence_break::BY_NAME; | |
787 | property_set(BY_NAME, name) | |
788 | .map(hir_class) | |
789 | .ok_or(Error::PropertyValueNotFound) | |
0531ce1d | 790 | } |
f9f354fc XL |
791 | |
792 | imp(canonical_name) | |
0531ce1d XL |
793 | } |
794 | ||
f9f354fc XL |
795 | /// Like symbolic_name_normalize_bytes, but operates on a string. |
796 | fn symbolic_name_normalize(x: &str) -> String { | |
797 | let mut tmp = x.as_bytes().to_vec(); | |
798 | let len = symbolic_name_normalize_bytes(&mut tmp).len(); | |
799 | tmp.truncate(len); | |
800 | // This should always succeed because `symbolic_name_normalize_bytes` | |
801 | // guarantees that `&tmp[..len]` is always valid UTF-8. | |
802 | // | |
803 | // N.B. We could avoid the additional UTF-8 check here, but it's unlikely | |
804 | // to be worth skipping the additional safety check. A benchmark must | |
805 | // justify it first. | |
806 | String::from_utf8(tmp).unwrap() | |
807 | } | |
0531ce1d | 808 | |
f9f354fc XL |
/// Normalize the given symbolic name in place according to UAX44-LM3.
///
/// A "symbolic name" typically corresponds to property names and property
/// value aliases. Note, though, that it should not be applied to property
/// string values.
///
/// Normalization performs the following steps:
///
/// * a leading `is` (in any ASCII case) is dropped,
/// * every space, underscore and hyphen is dropped,
/// * ASCII uppercase letters are lowercased,
/// * every non-ASCII byte is dropped.
///
/// The result is written to the front of `slice` and the returned sub-slice
/// covers exactly the normalized bytes. Bytes of `slice` past the returned
/// prefix are left in an unspecified state.
///
/// The slice returned is guaranteed to be valid UTF-8 for all possible values
/// of `slice`.
///
/// See: http://unicode.org/reports/tr44/#UAX44-LM3
fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
    // I couldn't find a place in the standard that specified that property
    // names/aliases had a particular structure (unlike character names), but
    // we assume that it's ASCII only and drop anything that isn't ASCII.

    // Ignore any "is" prefix, regardless of its case.
    let starts_with_is =
        slice.len() >= 2 && slice[..2].eq_ignore_ascii_case(b"is");
    let start = if starts_with_is { 2 } else { 0 };

    // `next_write` never exceeds the read index `i`, so compacting in place
    // cannot clobber a byte that has not been read yet.
    let mut next_write = 0;
    for i in start..slice.len() {
        // VALIDITY ARGUMENT: To guarantee that the resulting slice is valid
        // UTF-8, we ensure that the slice contains only ASCII bytes. In
        // particular, we drop every non-ASCII byte from the normalized string.
        let b = slice[i];
        if b == b' ' || b == b'_' || b == b'-' || !b.is_ascii() {
            continue;
        }
        slice[next_write] = b.to_ascii_lowercase();
        next_write += 1;
    }

    // Special case: ISO_Comment has a 'isc' abbreviation. Since we generally
    // ignore 'is' prefixes, the 'isc' abbreviation gets caught in the cross
    // fire and would be reduced to 'c', which is actually an alias for the
    // 'Other' general category. Restore the full 'isc' in that case.
    //
    // A stripped prefix plus one retained byte implies `slice.len() >= 3`,
    // so the three byte write below is always in bounds.
    if starts_with_is && next_write == 1 && slice[0] == b'c' {
        slice[..3].copy_from_slice(b"isc");
        next_write = 3;
    }
    &mut slice[..next_write]
}
863 | ||
#[cfg(test)]
mod tests {
    use super::{
        contains_simple_case_mapping, simple_fold, symbolic_name_normalize,
        symbolic_name_normalize_bytes,
    };

    /// Unwraps both layers of `simple_fold`'s result and returns the
    /// iterator of case folded characters. Panics if either layer is an
    /// error.
    #[cfg(feature = "unicode-case")]
    fn simple_fold_ok(c: char) -> impl Iterator<Item = char> {
        simple_fold(c).unwrap().unwrap()
    }

    /// Returns the payload of the inner `Err` produced by `simple_fold`.
    /// Panics if the case folding tables are unavailable or if an iterator
    /// was returned instead.
    #[cfg(feature = "unicode-case")]
    fn simple_fold_err(c: char) -> Option<char> {
        match simple_fold(c).unwrap() {
            Ok(_) => unreachable!("simple_fold returned Ok iterator"),
            Err(next) => next,
        }
    }

    /// Unwrapping shorthand for `contains_simple_case_mapping`.
    #[cfg(feature = "unicode-case")]
    fn contains_case_map(start: char, end: char) -> bool {
        contains_simple_case_mapping(start, end).unwrap()
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_k() {
        // U+212A KELVIN SIGN is spelled as an escape so that it can neither
        // be mistaken for, nor be normalized into, the ASCII letter 'K'.
        let xs: Vec<char> = simple_fold_ok('k').collect();
        assert_eq!(xs, vec!['K', '\u{212A}']);

        let xs: Vec<char> = simple_fold_ok('K').collect();
        assert_eq!(xs, vec!['k', '\u{212A}']);

        let xs: Vec<char> = simple_fold_ok('\u{212A}').collect();
        assert_eq!(xs, vec!['K', 'k']);
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_a() {
        let xs: Vec<char> = simple_fold_ok('a').collect();
        assert_eq!(xs, vec!['A']);

        let xs: Vec<char> = simple_fold_ok('A').collect();
        assert_eq!(xs, vec!['a']);
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_empty() {
        assert_eq!(Some('A'), simple_fold_err('?'));
        assert_eq!(Some('A'), simple_fold_err('@'));
        assert_eq!(Some('a'), simple_fold_err('['));
        assert_eq!(Some('Ⰰ'), simple_fold_err('☃'));
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_max() {
        assert_eq!(None, simple_fold_err('\u{10FFFE}'));
        assert_eq!(None, simple_fold_err('\u{10FFFF}'));
    }

    #[test]
    #[cfg(not(feature = "unicode-case"))]
    fn simple_fold_disabled() {
        assert!(simple_fold('a').is_err());
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn range_contains() {
        assert!(contains_case_map('A', 'A'));
        assert!(contains_case_map('Z', 'Z'));
        assert!(contains_case_map('A', 'Z'));
        assert!(contains_case_map('@', 'A'));
        assert!(contains_case_map('Z', '['));
        assert!(contains_case_map('☃', 'Ⰰ'));

        assert!(!contains_case_map('[', '['));
        assert!(!contains_case_map('[', '`'));

        assert!(!contains_case_map('☃', '☃'));
    }

    #[test]
    #[cfg(not(feature = "unicode-case"))]
    fn range_contains_disabled() {
        assert!(contains_simple_case_mapping('a', 'a').is_err());
    }

    #[test]
    #[cfg(feature = "unicode-gencat")]
    fn regression_466() {
        use super::{CanonicalClassQuery, ClassQuery};

        let q = ClassQuery::OneLetter('C');
        assert_eq!(
            q.canonicalize().unwrap(),
            CanonicalClassQuery::GeneralCategory("Other")
        );
    }

    #[test]
    fn sym_normalize() {
        let sym_norm = symbolic_name_normalize;

        assert_eq!(sym_norm("Line_Break"), "linebreak");
        assert_eq!(sym_norm("Line-break"), "linebreak");
        assert_eq!(sym_norm("linebreak"), "linebreak");
        assert_eq!(sym_norm("BA"), "ba");
        assert_eq!(sym_norm("ba"), "ba");
        assert_eq!(sym_norm("Greek"), "greek");
        assert_eq!(sym_norm("isGreek"), "greek");
        assert_eq!(sym_norm("IS_Greek"), "greek");
        assert_eq!(sym_norm("isc"), "isc");
        assert_eq!(sym_norm("is c"), "isc");
        assert_eq!(sym_norm("is_c"), "isc");
    }

    #[test]
    fn valid_utf8_symbolic() {
        // The non-ASCII byte must be dropped so the result stays valid UTF-8.
        let mut x = b"abc\xFFxyz".to_vec();
        let y = symbolic_name_normalize_bytes(&mut x);
        assert_eq!(y, b"abcxyz");
    }
}