// Source: git.proxmox.com Git - rustc.git/blob - src/vendor/regex-syntax-0.5.6/src/unicode.rs
// New upstream version 1.31.0+dfsg1
// [rustc.git] / src / vendor / regex-syntax-0.5.6 / src / unicode.rs
1 use std::cmp::Ordering;
2 use std::result;
3
4 use ucd_util::{self, PropertyValues};
5
6 use hir;
7 use unicode_tables::age;
8 use unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE;
9 use unicode_tables::general_category;
10 use unicode_tables::property_bool;
11 use unicode_tables::property_names::PROPERTY_NAMES;
12 use unicode_tables::property_values::PROPERTY_VALUES;
13 use unicode_tables::script;
14 use unicode_tables::script_extension;
15
/// Shorthand for results whose failure type is this module's `Error`.
type Result<T> = result::Result<T, Error>;

/// The ways a Unicode lookup in this module can fail.
///
/// The standard `Error` trait is deliberately not implemented: this type is
/// not exported, and its values always get converted into other public
/// errors.
#[derive(Debug)]
pub enum Error {
    /// The requested property could not be resolved.
    PropertyNotFound,
    /// The property was resolved, but the requested value for it was not.
    PropertyValueNotFound,
}
27
/// Encode the given Unicode character to `dst` as a single UTF-8 sequence.
///
/// On success the encoded bytes occupy the front of `dst` and the number of
/// bytes written (between 1 and 4) is returned. Bytes of `dst` past the
/// encoded sequence are left untouched.
///
/// If `dst` is too short to hold the whole sequence, `None` is returned and
/// `dst` is not modified.
pub fn encode_utf8(character: char, dst: &mut [u8]) -> Option<usize> {
    // `char::encode_utf8` panics when the buffer is too small, so the length
    // check must come first to keep the `None`-on-short-buffer contract.
    if dst.len() < character.len_utf8() {
        return None;
    }
    Some(character.encode_utf8(dst).len())
}
63
/// Iterator yielding, by value, the members of one codepoint's simple case
/// equivalence class.
///
/// It wraps a slice iterator over a `'static` slice of `char`s.
#[derive(Debug)]
pub struct SimpleFoldIter(::std::slice::Iter<'static, char>);

impl Iterator for SimpleFoldIter {
    type Item = char;

    /// Advances the wrapped slice iterator and copies out the next `char`.
    fn next(&mut self) -> Option<char> {
        match self.0.next() {
            Some(&ch) => Some(ch),
            None => None,
        }
    }
}
75
76 /// Return an iterator over the equivalence class of simple case mappings
77 /// for the given codepoint. The equivalence class does not include the
78 /// given codepoint.
79 ///
80 /// If the equivalence class is empty, then this returns the next scalar
81 /// value that has a non-empty equivalence class, if it exists. If no such
82 /// scalar value exists, then `None` is returned. The point of this behavior
83 /// is to permit callers to avoid calling `simple_fold` more than they need
84 /// to, since there is some cost to fetching the equivalence class.
85 pub fn simple_fold(c: char) -> result::Result<SimpleFoldIter, Option<char>> {
86 CASE_FOLDING_SIMPLE
87 .binary_search_by_key(&c, |&(c1, _)| c1)
88 .map(|i| SimpleFoldIter(CASE_FOLDING_SIMPLE[i].1.iter()))
89 .map_err(|i| {
90 if i >= CASE_FOLDING_SIMPLE.len() {
91 None
92 } else {
93 Some(CASE_FOLDING_SIMPLE[i].0)
94 }
95 })
96 }
97
98 /// Returns true if and only if the given (inclusive) range contains at least
99 /// one Unicode scalar value that has a non-empty non-trivial simple case
100 /// mapping.
101 ///
102 /// This function panics if `end < start`.
103 pub fn contains_simple_case_mapping(start: char, end: char) -> bool {
104 assert!(start <= end);
105 CASE_FOLDING_SIMPLE
106 .binary_search_by(|&(c, _)| {
107 if start <= c && c <= end {
108 Ordering::Equal
109 } else if c > end {
110 Ordering::Greater
111 } else {
112 Ordering::Less
113 }
114 }).is_ok()
115 }
116
/// A request for a character class defined by Unicode.
///
/// A class can be requested either by a bare name or by a property name
/// paired with a property value. Bare names generally denote binary
/// properties (see UTS#44, Table 8). As a special exception (see UTS#18,
/// Section 1.2), general category values (an enumeration) and script values
/// (a catalog) are also accepted as bare names, as though each value were a
/// binary property of its own.
///
/// Property names and values are always normalized and canonicalized, so
/// `GC == gc == GeneralCategory == general_category`.
///
/// The lifetime `'a` is the shorter of the lifetimes of the property name
/// and the property value.
#[derive(Debug)]
pub enum ClassQuery<'a> {
    /// A class requested by a one-letter name. It is resolved exactly like
    /// `Binary`, using the letter as the name.
    OneLetter(char),
    /// A class requested by a bare name: a Unicode binary property or, by
    /// special exception (see UTS#18, Section 1.2), a general category value
    /// or a script value.
    Binary(&'a str),
    /// The class of all codepoints whose property `property_name` has the
    /// value `property_value`.
    ByValue {
        /// A property name.
        property_name: &'a str,
        /// A property value.
        property_value: &'a str,
    },
}
150
151 impl<'a> ClassQuery<'a> {
152 fn canonicalize(&self) -> Result<CanonicalClassQuery> {
153 match *self {
154 ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()),
155 ClassQuery::Binary(name) => self.canonical_binary(name),
156 ClassQuery::ByValue { property_name, property_value } => {
157 let property_name = normalize(property_name);
158 let property_value = normalize(property_value);
159
160 let canon_name = match canonical_prop(&property_name) {
161 None => return Err(Error::PropertyNotFound),
162 Some(canon_name) => canon_name,
163 };
164 Ok(match canon_name {
165 "General_Category" => {
166 let canon = match canonical_gencat(&property_value) {
167 None => return Err(Error::PropertyValueNotFound),
168 Some(canon) => canon,
169 };
170 CanonicalClassQuery::GeneralCategory(canon)
171 }
172 "Script" => {
173 let canon = match canonical_script(&property_value) {
174 None => return Err(Error::PropertyValueNotFound),
175 Some(canon) => canon,
176 };
177 CanonicalClassQuery::Script(canon)
178 }
179 _ => {
180 let vals = match property_values(canon_name) {
181 None => return Err(Error::PropertyValueNotFound),
182 Some(vals) => vals,
183 };
184 let canon_val = match canonical_value(
185 vals,
186 &property_value,
187 ) {
188 None => return Err(Error::PropertyValueNotFound),
189 Some(canon_val) => canon_val,
190 };
191 CanonicalClassQuery::ByValue {
192 property_name: canon_name,
193 property_value: canon_val,
194 }
195 }
196 })
197 }
198 }
199 }
200
201 fn canonical_binary(&self, name: &str) -> Result<CanonicalClassQuery> {
202 let norm = normalize(name);
203
204 if let Some(canon) = canonical_prop(&norm) {
205 return Ok(CanonicalClassQuery::Binary(canon));
206 }
207 if let Some(canon) = canonical_gencat(&norm) {
208 return Ok(CanonicalClassQuery::GeneralCategory(canon));
209 }
210 if let Some(canon) = canonical_script(&norm) {
211 return Ok(CanonicalClassQuery::Script(canon));
212 }
213 Err(Error::PropertyNotFound)
214 }
215 }
216
/// A `ClassQuery` whose parameters have been canonicalized.
///
/// Unlike `ClassQuery`, this type distinguishes binary properties from
/// flattened general categories and scripts.
#[derive(Debug, Eq, PartialEq)]
enum CanonicalClassQuery {
    /// Canonical name of a binary property.
    Binary(&'static str),
    /// Canonical name of a general category.
    GeneralCategory(&'static str),
    /// Canonical name of a script.
    Script(&'static str),
    /// Any other pairing of a property with one of its values, both in
    /// canonical form.
    ///
    /// By construction the property name here is never `General_Category`
    /// or `Script`; those two are covered by the variants of the same name.
    ByValue {
        /// Canonical name of the property.
        property_name: &'static str,
        /// Canonical name of the property value.
        property_value: &'static str,
    },
}
241
242 /// Looks up a Unicode class given a query. If one doesn't exist, then
243 /// `None` is returned.
244 pub fn class<'a>(query: ClassQuery<'a>) -> Result<hir::ClassUnicode> {
245 use self::CanonicalClassQuery::*;
246
247 match try!(query.canonicalize()) {
248 Binary(name) => {
249 property_set(property_bool::BY_NAME, name)
250 .map(hir_class)
251 .ok_or(Error::PropertyNotFound)
252 }
253 GeneralCategory("Any") => {
254 Ok(hir_class(&[('\0', '\u{10FFFF}')]))
255 }
256 GeneralCategory("Assigned") => {
257 let mut cls =
258 try!(property_set(general_category::BY_NAME, "Unassigned")
259 .map(hir_class)
260 .ok_or(Error::PropertyNotFound));
261 cls.negate();
262 Ok(cls)
263 }
264 GeneralCategory("ASCII") => {
265 Ok(hir_class(&[('\0', '\x7F')]))
266 }
267 GeneralCategory(name) => {
268 property_set(general_category::BY_NAME, name)
269 .map(hir_class)
270 .ok_or(Error::PropertyValueNotFound)
271 }
272 Script(name) => {
273 property_set(script::BY_NAME, name)
274 .map(hir_class)
275 .ok_or(Error::PropertyValueNotFound)
276 }
277 ByValue { property_name: "Age", property_value } => {
278 let mut class = hir::ClassUnicode::empty();
279 for set in try!(ages(property_value)) {
280 class.union(&hir_class(set));
281 }
282 Ok(class)
283 }
284 ByValue { property_name: "Script_Extensions", property_value } => {
285 property_set(script_extension::BY_NAME, property_value)
286 .map(hir_class)
287 .ok_or(Error::PropertyValueNotFound)
288 }
289 _ => {
290 // What else should we support?
291 Err(Error::PropertyNotFound)
292 }
293 }
294 }
295
296 /// Build a Unicode HIR class from a sequence of Unicode scalar value ranges.
297 pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode {
298 let hir_ranges: Vec<hir::ClassUnicodeRange> = ranges
299 .iter()
300 .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e))
301 .collect();
302 hir::ClassUnicode::new(hir_ranges)
303 }
304
305 fn canonical_prop(normalized_name: &str) -> Option<&'static str> {
306 ucd_util::canonical_property_name(PROPERTY_NAMES, normalized_name)
307 }
308
309 fn canonical_gencat(normalized_value: &str) -> Option<&'static str> {
310 match normalized_value {
311 "any" => Some("Any"),
312 "assigned" => Some("Assigned"),
313 "ascii" => Some("ASCII"),
314 _ => {
315 let gencats = property_values("General_Category").unwrap();
316 canonical_value(gencats, normalized_value)
317 }
318 }
319 }
320
321 fn canonical_script(normalized_value: &str) -> Option<&'static str> {
322 let scripts = property_values("Script").unwrap();
323 canonical_value(scripts, normalized_value)
324 }
325
326 fn canonical_value(
327 vals: PropertyValues,
328 normalized_value: &str,
329 ) -> Option<&'static str> {
330 ucd_util::canonical_property_value(vals, normalized_value)
331 }
332
333 fn normalize(x: &str) -> String {
334 let mut x = x.to_string();
335 ucd_util::symbolic_name_normalize(&mut x);
336 x
337 }
338
339 fn property_values(
340 canonical_property_name: &'static str,
341 ) -> Option<PropertyValues>
342 {
343 ucd_util::property_values(PROPERTY_VALUES, canonical_property_name)
344 }
345
/// Finds the codepoint ranges stored under the name `canonical` in
/// `name_map`.
///
/// The lookup is a binary search on the name, so `name_map` is expected to
/// be sorted by name. Returns `None` when no entry has exactly that name.
fn property_set(
    name_map: &'static [(&'static str, &'static [(char, char)])],
    canonical: &'static str,
) -> Option<&'static [(char, char)]> {
    match name_map.binary_search_by_key(&canonical, |entry| entry.0) {
        Ok(found) => Some(name_map[found].1),
        Err(_) => None,
    }
}
355
/// Iterator over Unicode Age sets.
///
/// Every item is the set of codepoints added in one particular revision of
/// Unicode, and items are produced in chronological order.
#[derive(Debug)]
struct AgeIter {
    /// The `(age name, codepoint ranges)` entries still to be yielded.
    ages: &'static [(&'static str, &'static [(char, char)])],
}
363
364 fn ages(canonical_age: &str) -> Result<AgeIter> {
365 const AGES: &'static [(&'static str, &'static [(char, char)])] = &[
366 ("V1_1", age::V1_1),
367 ("V2_0", age::V2_0),
368 ("V2_1", age::V2_1),
369 ("V3_0", age::V3_0),
370 ("V3_1", age::V3_1),
371 ("V3_2", age::V3_2),
372 ("V4_0", age::V4_0),
373 ("V4_1", age::V4_1),
374 ("V5_0", age::V5_0),
375 ("V5_1", age::V5_1),
376 ("V5_2", age::V5_2),
377 ("V6_0", age::V6_0),
378 ("V6_1", age::V6_1),
379 ("V6_2", age::V6_2),
380 ("V6_3", age::V6_3),
381 ("V7_0", age::V7_0),
382 ("V8_0", age::V8_0),
383 ("V9_0", age::V9_0),
384 ("V10_0", age::V10_0),
385 ];
386 assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync");
387
388 let pos = AGES.iter().position(|&(age, _)| canonical_age == age);
389 match pos {
390 None => Err(Error::PropertyValueNotFound),
391 Some(i) => Ok(AgeIter { ages: &AGES[..i+1] }),
392 }
393 }
394
395 impl Iterator for AgeIter {
396 type Item = &'static [(char, char)];
397
398 fn next(&mut self) -> Option<&'static [(char, char)]> {
399 if self.ages.is_empty() {
400 None
401 } else {
402 let set = self.ages[0];
403 self.ages = &self.ages[1..];
404 Some(set.1)
405 }
406 }
407 }
408
#[cfg(test)]
mod tests {
    use super::{contains_simple_case_mapping, simple_fold};

    /// `k`, `K` and the KELVIN SIGN (U+212A) form one simple case folding
    /// class. The Kelvin sign is written as an escape because it looks the
    /// same as ASCII `K` and is easily lost when the file is re-encoded or
    /// normalized.
    #[test]
    fn simple_fold_k() {
        let xs: Vec<char> = simple_fold('k').unwrap().collect();
        assert_eq!(xs, vec!['K', '\u{212A}']);

        let xs: Vec<char> = simple_fold('K').unwrap().collect();
        assert_eq!(xs, vec!['k', '\u{212A}']);

        let xs: Vec<char> = simple_fold('\u{212A}').unwrap().collect();
        assert_eq!(xs, vec!['K', 'k']);
    }

    /// A plain two-member class: each letter folds to the other.
    #[test]
    fn simple_fold_a() {
        let xs: Vec<char> = simple_fold('a').unwrap().collect();
        assert_eq!(xs, vec!['A']);

        let xs: Vec<char> = simple_fold('A').unwrap().collect();
        assert_eq!(xs, vec!['a']);
    }

    /// Codepoints without a class report the next codepoint that has one.
    #[test]
    fn simple_fold_empty() {
        assert_eq!(Some('A'), simple_fold('?').unwrap_err());
        assert_eq!(Some('A'), simple_fold('@').unwrap_err());
        assert_eq!(Some('a'), simple_fold('[').unwrap_err());
        assert_eq!(Some('Ⰰ'), simple_fold('☃').unwrap_err());
    }

    /// Past the last table entry there is no next codepoint to report.
    #[test]
    fn simple_fold_max() {
        assert_eq!(None, simple_fold('\u{10FFFE}').unwrap_err());
        assert_eq!(None, simple_fold('\u{10FFFF}').unwrap_err());
    }

    #[test]
    fn range_contains() {
        assert!(contains_simple_case_mapping('A', 'A'));
        assert!(contains_simple_case_mapping('Z', 'Z'));
        assert!(contains_simple_case_mapping('A', 'Z'));
        assert!(contains_simple_case_mapping('@', 'A'));
        assert!(contains_simple_case_mapping('Z', '['));
        assert!(contains_simple_case_mapping('☃', 'Ⰰ'));

        assert!(!contains_simple_case_mapping('[', '['));
        assert!(!contains_simple_case_mapping('[', '`'));

        assert!(!contains_simple_case_mapping('☃', '☃'));
    }

    /// `\pC` must resolve to the `Other` general category.
    #[test]
    fn regression_466() {
        use super::{CanonicalClassQuery, ClassQuery};

        let q = ClassQuery::OneLetter('C');
        assert_eq!(
            q.canonicalize().unwrap(),
            CanonicalClassQuery::GeneralCategory("Other"));
    }
}