]>
Commit | Line | Data |
---|---|---|
0531ce1d XL |
1 | use std::cmp::Ordering; |
2 | use std::result; | |
3 | ||
4 | use ucd_util::{self, PropertyValues}; | |
5 | ||
6 | use hir; | |
7 | use unicode_tables::age; | |
8 | use unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; | |
9 | use unicode_tables::general_category; | |
10 | use unicode_tables::property_bool; | |
11 | use unicode_tables::property_names::PROPERTY_NAMES; | |
12 | use unicode_tables::property_values::PROPERTY_VALUES; | |
13 | use unicode_tables::script; | |
14 | use unicode_tables::script_extension; | |
15 | ||
16 | type Result<T> = result::Result<T, Error>; | |
17 | ||
/// An error that occurs when resolving a Unicode class query.
///
/// We don't impl the Error trait here because these always get converted
/// into other public errors. (This error type isn't exported.)
#[derive(Debug)]
pub enum Error {
    /// The property name in the query could not be resolved, or the resolved
    /// property is not one for which a class can be built.
    PropertyNotFound,
    /// The property was resolved, but the requested value does not exist for
    /// that property.
    PropertyValueNotFound,
}
27 | ||
0531ce1d XL |
/// An iterator over a codepoint's simple case equivalence class.
///
/// This wraps a slice iterator over a static table entry and yields each
/// `char` by value.
#[derive(Debug)]
pub struct SimpleFoldIter(::std::slice::Iter<'static, char>);

impl Iterator for SimpleFoldIter {
    type Item = char;

    fn next(&mut self) -> Option<char> {
        // The underlying iterator yields `&'static char`; hand out copies.
        self.0.next().cloned()
    }
}
39 | ||
40 | /// Return an iterator over the equivalence class of simple case mappings | |
41 | /// for the given codepoint. The equivalence class does not include the | |
42 | /// given codepoint. | |
43 | /// | |
44 | /// If the equivalence class is empty, then this returns the next scalar | |
45 | /// value that has a non-empty equivalence class, if it exists. If no such | |
46 | /// scalar value exists, then `None` is returned. The point of this behavior | |
47 | /// is to permit callers to avoid calling `simple_fold` more than they need | |
48 | /// to, since there is some cost to fetching the equivalence class. | |
49 | pub fn simple_fold(c: char) -> result::Result<SimpleFoldIter, Option<char>> { | |
50 | CASE_FOLDING_SIMPLE | |
51 | .binary_search_by_key(&c, |&(c1, _)| c1) | |
52 | .map(|i| SimpleFoldIter(CASE_FOLDING_SIMPLE[i].1.iter())) | |
53 | .map_err(|i| { | |
54 | if i >= CASE_FOLDING_SIMPLE.len() { | |
55 | None | |
56 | } else { | |
57 | Some(CASE_FOLDING_SIMPLE[i].0) | |
58 | } | |
59 | }) | |
60 | } | |
61 | ||
62 | /// Returns true if and only if the given (inclusive) range contains at least | |
63 | /// one Unicode scalar value that has a non-empty non-trivial simple case | |
64 | /// mapping. | |
65 | /// | |
66 | /// This function panics if `end < start`. | |
67 | pub fn contains_simple_case_mapping(start: char, end: char) -> bool { | |
68 | assert!(start <= end); | |
69 | CASE_FOLDING_SIMPLE | |
70 | .binary_search_by(|&(c, _)| { | |
71 | if start <= c && c <= end { | |
72 | Ordering::Equal | |
73 | } else if c > end { | |
74 | Ordering::Greater | |
75 | } else { | |
76 | Ordering::Less | |
77 | } | |
78 | }).is_ok() | |
79 | } | |
80 | ||
/// A query for finding a character class defined by Unicode. This supports
/// either use of a property name directly, or lookup by property value. The
/// former generally refers to Binary properties (see UTS#44, Table 8), but
/// as a special exception (see UTS#18, Section 1.2) both general categories
/// (an enumeration) and scripts (a catalog) are supported as if each of their
/// possible values were a binary property.
///
/// In all circumstances, property names and values are normalized and
/// canonicalized. That is, `GC == gc == GeneralCategory == general_category`.
///
/// The lifetime `'a` refers to the shorter of the lifetimes of property name
/// and property value.
#[derive(Debug)]
pub enum ClassQuery<'a> {
    /// Return a class corresponding to a Unicode binary property, named by
    /// a single letter.
    OneLetter(char),
    /// Return a class corresponding to a Unicode binary property.
    ///
    /// Note that, by special exception (see UTS#18, Section 1.2), both
    /// general category values and script values are permitted here as if
    /// they were a binary property.
    Binary(&'a str),
    /// Return a class corresponding to all codepoints whose property
    /// (identified by `property_name`) corresponds to the given value
    /// (identified by `property_value`).
    ByValue {
        /// A property name.
        property_name: &'a str,
        /// A property value.
        property_value: &'a str,
    },
}
114 | ||
115 | impl<'a> ClassQuery<'a> { | |
116 | fn canonicalize(&self) -> Result<CanonicalClassQuery> { | |
117 | match *self { | |
118 | ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()), | |
119 | ClassQuery::Binary(name) => self.canonical_binary(name), | |
120 | ClassQuery::ByValue { property_name, property_value } => { | |
121 | let property_name = normalize(property_name); | |
122 | let property_value = normalize(property_value); | |
123 | ||
124 | let canon_name = match canonical_prop(&property_name) { | |
125 | None => return Err(Error::PropertyNotFound), | |
126 | Some(canon_name) => canon_name, | |
127 | }; | |
128 | Ok(match canon_name { | |
129 | "General_Category" => { | |
130 | let canon = match canonical_gencat(&property_value) { | |
131 | None => return Err(Error::PropertyValueNotFound), | |
132 | Some(canon) => canon, | |
133 | }; | |
134 | CanonicalClassQuery::GeneralCategory(canon) | |
135 | } | |
136 | "Script" => { | |
137 | let canon = match canonical_script(&property_value) { | |
138 | None => return Err(Error::PropertyValueNotFound), | |
139 | Some(canon) => canon, | |
140 | }; | |
141 | CanonicalClassQuery::Script(canon) | |
142 | } | |
143 | _ => { | |
144 | let vals = match property_values(canon_name) { | |
145 | None => return Err(Error::PropertyValueNotFound), | |
146 | Some(vals) => vals, | |
147 | }; | |
148 | let canon_val = match canonical_value( | |
149 | vals, | |
150 | &property_value, | |
151 | ) { | |
152 | None => return Err(Error::PropertyValueNotFound), | |
153 | Some(canon_val) => canon_val, | |
154 | }; | |
155 | CanonicalClassQuery::ByValue { | |
156 | property_name: canon_name, | |
157 | property_value: canon_val, | |
158 | } | |
159 | } | |
160 | }) | |
161 | } | |
162 | } | |
163 | } | |
164 | ||
165 | fn canonical_binary(&self, name: &str) -> Result<CanonicalClassQuery> { | |
166 | let norm = normalize(name); | |
167 | ||
168 | if let Some(canon) = canonical_prop(&norm) { | |
169 | return Ok(CanonicalClassQuery::Binary(canon)); | |
170 | } | |
171 | if let Some(canon) = canonical_gencat(&norm) { | |
172 | return Ok(CanonicalClassQuery::GeneralCategory(canon)); | |
173 | } | |
174 | if let Some(canon) = canonical_script(&norm) { | |
175 | return Ok(CanonicalClassQuery::Script(canon)); | |
176 | } | |
177 | Err(Error::PropertyNotFound) | |
178 | } | |
179 | } | |
180 | ||
/// Like ClassQuery, but its parameters have been canonicalized. This also
/// differentiates binary properties from flattened general categories and
/// scripts.
#[derive(Debug, Eq, PartialEq)]
enum CanonicalClassQuery {
    /// The canonical binary property name.
    Binary(&'static str),
    /// The canonical general category name.
    GeneralCategory(&'static str),
    /// The canonical script name.
    Script(&'static str),
    /// An arbitrary association between property and value, both of which
    /// have been canonicalized.
    ///
    /// Note that by construction, the property name of ByValue will never
    /// be General_Category or Script. Those two cases are subsumed by the
    /// eponymous variants.
    ByValue {
        /// The canonical property name.
        property_name: &'static str,
        /// The canonical property value.
        property_value: &'static str,
    },
}
205 | ||
206 | /// Looks up a Unicode class given a query. If one doesn't exist, then | |
207 | /// `None` is returned. | |
208 | pub fn class<'a>(query: ClassQuery<'a>) -> Result<hir::ClassUnicode> { | |
209 | use self::CanonicalClassQuery::*; | |
210 | ||
94b46f34 | 211 | match query.canonicalize()? { |
0531ce1d XL |
212 | Binary(name) => { |
213 | property_set(property_bool::BY_NAME, name) | |
214 | .map(hir_class) | |
215 | .ok_or(Error::PropertyNotFound) | |
216 | } | |
217 | GeneralCategory("Any") => { | |
218 | Ok(hir_class(&[('\0', '\u{10FFFF}')])) | |
219 | } | |
220 | GeneralCategory("Assigned") => { | |
221 | let mut cls = | |
94b46f34 | 222 | property_set(general_category::BY_NAME, "Unassigned") |
0531ce1d | 223 | .map(hir_class) |
94b46f34 | 224 | .ok_or(Error::PropertyNotFound)?; |
0531ce1d XL |
225 | cls.negate(); |
226 | Ok(cls) | |
227 | } | |
228 | GeneralCategory("ASCII") => { | |
229 | Ok(hir_class(&[('\0', '\x7F')])) | |
230 | } | |
231 | GeneralCategory(name) => { | |
232 | property_set(general_category::BY_NAME, name) | |
233 | .map(hir_class) | |
234 | .ok_or(Error::PropertyValueNotFound) | |
235 | } | |
236 | Script(name) => { | |
237 | property_set(script::BY_NAME, name) | |
238 | .map(hir_class) | |
239 | .ok_or(Error::PropertyValueNotFound) | |
240 | } | |
241 | ByValue { property_name: "Age", property_value } => { | |
242 | let mut class = hir::ClassUnicode::empty(); | |
94b46f34 | 243 | for set in ages(property_value)? { |
0531ce1d XL |
244 | class.union(&hir_class(set)); |
245 | } | |
246 | Ok(class) | |
247 | } | |
248 | ByValue { property_name: "Script_Extensions", property_value } => { | |
249 | property_set(script_extension::BY_NAME, property_value) | |
250 | .map(hir_class) | |
251 | .ok_or(Error::PropertyValueNotFound) | |
252 | } | |
253 | _ => { | |
254 | // What else should we support? | |
255 | Err(Error::PropertyNotFound) | |
256 | } | |
257 | } | |
258 | } | |
259 | ||
260 | /// Build a Unicode HIR class from a sequence of Unicode scalar value ranges. | |
261 | pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode { | |
262 | let hir_ranges: Vec<hir::ClassUnicodeRange> = ranges | |
263 | .iter() | |
264 | .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)) | |
265 | .collect(); | |
266 | hir::ClassUnicode::new(hir_ranges) | |
267 | } | |
268 | ||
269 | fn canonical_prop(normalized_name: &str) -> Option<&'static str> { | |
270 | ucd_util::canonical_property_name(PROPERTY_NAMES, normalized_name) | |
271 | } | |
272 | ||
273 | fn canonical_gencat(normalized_value: &str) -> Option<&'static str> { | |
274 | match normalized_value { | |
275 | "any" => Some("Any"), | |
276 | "assigned" => Some("Assigned"), | |
277 | "ascii" => Some("ASCII"), | |
278 | _ => { | |
279 | let gencats = property_values("General_Category").unwrap(); | |
280 | canonical_value(gencats, normalized_value) | |
281 | } | |
282 | } | |
283 | } | |
284 | ||
285 | fn canonical_script(normalized_value: &str) -> Option<&'static str> { | |
286 | let scripts = property_values("Script").unwrap(); | |
287 | canonical_value(scripts, normalized_value) | |
288 | } | |
289 | ||
290 | fn canonical_value( | |
291 | vals: PropertyValues, | |
292 | normalized_value: &str, | |
293 | ) -> Option<&'static str> { | |
294 | ucd_util::canonical_property_value(vals, normalized_value) | |
295 | } | |
296 | ||
297 | fn normalize(x: &str) -> String { | |
298 | let mut x = x.to_string(); | |
299 | ucd_util::symbolic_name_normalize(&mut x); | |
300 | x | |
301 | } | |
302 | ||
303 | fn property_values( | |
304 | canonical_property_name: &'static str, | |
305 | ) -> Option<PropertyValues> | |
306 | { | |
307 | ucd_util::property_values(PROPERTY_VALUES, canonical_property_name) | |
308 | } | |
309 | ||
/// Find the codepoint ranges associated with `canonical` in `name_map`.
///
/// The lookup is a binary search on the name, so `name_map` must be sorted
/// by name. Returns `None` if no entry has exactly that name.
fn property_set(
    name_map: &'static [(&'static str, &'static [(char, char)])],
    canonical: &'static str,
) -> Option<&'static [(char, char)]> {
    match name_map.binary_search_by_key(&canonical, |&(name, _)| name) {
        Ok(i) => Some(name_map[i].1),
        Err(_) => None,
    }
}
319 | ||
/// An iterator over Unicode Age sets. Each item corresponds to a set of
/// codepoints that were added in a particular revision of Unicode. The
/// iterator yields items in chronological order.
#[derive(Debug)]
struct AgeIter {
    /// The `(version name, codepoint ranges)` entries not yet yielded.
    ages: &'static [(&'static str, &'static [(char, char)])],
}
327 | ||
328 | fn ages(canonical_age: &str) -> Result<AgeIter> { | |
329 | const AGES: &'static [(&'static str, &'static [(char, char)])] = &[ | |
330 | ("V1_1", age::V1_1), | |
331 | ("V2_0", age::V2_0), | |
332 | ("V2_1", age::V2_1), | |
333 | ("V3_0", age::V3_0), | |
334 | ("V3_1", age::V3_1), | |
335 | ("V3_2", age::V3_2), | |
336 | ("V4_0", age::V4_0), | |
337 | ("V4_1", age::V4_1), | |
338 | ("V5_0", age::V5_0), | |
339 | ("V5_1", age::V5_1), | |
340 | ("V5_2", age::V5_2), | |
341 | ("V6_0", age::V6_0), | |
342 | ("V6_1", age::V6_1), | |
343 | ("V6_2", age::V6_2), | |
344 | ("V6_3", age::V6_3), | |
345 | ("V7_0", age::V7_0), | |
346 | ("V8_0", age::V8_0), | |
347 | ("V9_0", age::V9_0), | |
348 | ("V10_0", age::V10_0), | |
349 | ]; | |
350 | assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync"); | |
351 | ||
352 | let pos = AGES.iter().position(|&(age, _)| canonical_age == age); | |
353 | match pos { | |
354 | None => Err(Error::PropertyValueNotFound), | |
355 | Some(i) => Ok(AgeIter { ages: &AGES[..i+1] }), | |
356 | } | |
357 | } | |
358 | ||
359 | impl Iterator for AgeIter { | |
360 | type Item = &'static [(char, char)]; | |
361 | ||
362 | fn next(&mut self) -> Option<&'static [(char, char)]> { | |
363 | if self.ages.is_empty() { | |
364 | None | |
365 | } else { | |
366 | let set = self.ages[0]; | |
367 | self.ages = &self.ages[1..]; | |
368 | Some(set.1) | |
369 | } | |
370 | } | |
371 | } | |
372 | ||
#[cfg(test)]
mod tests {
    use super::{contains_simple_case_mapping, simple_fold};

    // The simple case folding class of 'k' has three members: 'k', 'K' and
    // U+212A KELVIN SIGN. The Kelvin sign is written as an escape because it
    // is visually identical to 'K' and is easily replaced by it when the
    // file passes through Unicode normalization.
    #[test]
    fn simple_fold_k() {
        let xs: Vec<char> = simple_fold('k').unwrap().collect();
        assert_eq!(xs, vec!['K', '\u{212A}']);

        let xs: Vec<char> = simple_fold('K').unwrap().collect();
        assert_eq!(xs, vec!['k', '\u{212A}']);

        let xs: Vec<char> = simple_fold('\u{212A}').unwrap().collect();
        assert_eq!(xs, vec!['K', 'k']);
    }

    #[test]
    fn simple_fold_a() {
        let xs: Vec<char> = simple_fold('a').unwrap().collect();
        assert_eq!(xs, vec!['A']);

        let xs: Vec<char> = simple_fold('A').unwrap().collect();
        assert_eq!(xs, vec!['a']);
    }

    // For a codepoint with no mapping, the error carries the next codepoint
    // that has one.
    #[test]
    fn simple_fold_empty() {
        assert_eq!(Some('A'), simple_fold('?').unwrap_err());
        assert_eq!(Some('A'), simple_fold('@').unwrap_err());
        assert_eq!(Some('a'), simple_fold('[').unwrap_err());
        assert_eq!(Some('Ⰰ'), simple_fold('☃').unwrap_err());
    }

    // Past the last table entry there is no "next" codepoint to report.
    #[test]
    fn simple_fold_max() {
        assert_eq!(None, simple_fold('\u{10FFFE}').unwrap_err());
        assert_eq!(None, simple_fold('\u{10FFFF}').unwrap_err());
    }

    #[test]
    fn range_contains() {
        assert!(contains_simple_case_mapping('A', 'A'));
        assert!(contains_simple_case_mapping('Z', 'Z'));
        assert!(contains_simple_case_mapping('A', 'Z'));
        assert!(contains_simple_case_mapping('@', 'A'));
        assert!(contains_simple_case_mapping('Z', '['));
        assert!(contains_simple_case_mapping('☃', 'Ⰰ'));

        assert!(!contains_simple_case_mapping('[', '['));
        assert!(!contains_simple_case_mapping('[', '`'));

        assert!(!contains_simple_case_mapping('☃', '☃'));
    }

    // The one-letter query `C` must resolve to the general category `Other`.
    #[test]
    fn regression_466() {
        use super::{CanonicalClassQuery, ClassQuery};

        let q = ClassQuery::OneLetter('C');
        assert_eq!(
            q.canonicalize().unwrap(),
            CanonicalClassQuery::GeneralCategory("Other")
        );
    }
}