// Origin: rustc.git (git.proxmox.com), src/vendor/regex-syntax/src/unicode.rs
// Upstream version 1.28.0~beta.14+dfsg1
1use std::cmp::Ordering;
2use std::result;
3
4use ucd_util::{self, PropertyValues};
5
6use hir;
7use unicode_tables::age;
8use unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE;
9use unicode_tables::general_category;
10use unicode_tables::property_bool;
11use unicode_tables::property_names::PROPERTY_NAMES;
12use unicode_tables::property_values::PROPERTY_VALUES;
13use unicode_tables::script;
14use unicode_tables::script_extension;
15
16type Result<T> = result::Result<T, Error>;
17
/// An error that occurs when dealing with Unicode.
///
/// We don't impl the Error trait here because these always get converted
/// into other public errors. (This error type isn't exported.)
#[derive(Debug)]
pub enum Error {
    /// The requested name did not resolve to a known property (or, for a
    /// bare name, to a general category or script), or the property is not
    /// one that class lookup supports.
    PropertyNotFound,
    /// The property was resolved, but the requested value is not one of its
    /// known values.
    PropertyValueNotFound,
}

/// An iterator over a codepoint's simple case equivalence class.
///
/// This is a thin wrapper around a slice iterator over a `'static` table
/// row; it yields the characters of that row by value, in table order.
#[derive(Debug)]
pub struct SimpleFoldIter(::std::slice::Iter<'static, char>);

impl Iterator for SimpleFoldIter {
    type Item = char;

    fn next(&mut self) -> Option<char> {
        self.0.next().cloned()
    }

    // Forward the exact bounds of the underlying slice iterator so that
    // `collect` and friends can size their allocation up front.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.0.size_hint()
    }
}

40/// Return an iterator over the equivalence class of simple case mappings
41/// for the given codepoint. The equivalence class does not include the
42/// given codepoint.
43///
44/// If the equivalence class is empty, then this returns the next scalar
45/// value that has a non-empty equivalence class, if it exists. If no such
46/// scalar value exists, then `None` is returned. The point of this behavior
47/// is to permit callers to avoid calling `simple_fold` more than they need
48/// to, since there is some cost to fetching the equivalence class.
49pub fn simple_fold(c: char) -> result::Result<SimpleFoldIter, Option<char>> {
50 CASE_FOLDING_SIMPLE
51 .binary_search_by_key(&c, |&(c1, _)| c1)
52 .map(|i| SimpleFoldIter(CASE_FOLDING_SIMPLE[i].1.iter()))
53 .map_err(|i| {
54 if i >= CASE_FOLDING_SIMPLE.len() {
55 None
56 } else {
57 Some(CASE_FOLDING_SIMPLE[i].0)
58 }
59 })
60}
61
62/// Returns true if and only if the given (inclusive) range contains at least
63/// one Unicode scalar value that has a non-empty non-trivial simple case
64/// mapping.
65///
66/// This function panics if `end < start`.
67pub fn contains_simple_case_mapping(start: char, end: char) -> bool {
68 assert!(start <= end);
69 CASE_FOLDING_SIMPLE
70 .binary_search_by(|&(c, _)| {
71 if start <= c && c <= end {
72 Ordering::Equal
73 } else if c > end {
74 Ordering::Greater
75 } else {
76 Ordering::Less
77 }
78 }).is_ok()
79}
80
/// A query for finding a character class defined by Unicode. This supports
/// either use of a property name directly, or lookup by property value. The
/// former generally refers to Binary properties (see UTS#44, Table 8), but
/// as a special exception (see UTS#18, Section 1.2) both general categories
/// (an enumeration) and scripts (a catalog) are supported as if each of their
/// possible values were a binary property.
///
/// In all circumstances, property names and values are normalized and
/// canonicalized. That is, `GC == gc == GeneralCategory == general_category`.
///
/// The lifetime `'a` refers to the shorter of the lifetimes of property name
/// and property value.
#[derive(Debug)]
pub enum ClassQuery<'a> {
    /// Return a class corresponding to a Unicode binary property, named by
    /// a single letter.
    OneLetter(char),
    /// Return a class corresponding to a Unicode binary property.
    ///
    /// Note that, by special exception (see UTS#18, Section 1.2), both
    /// general category values and script values are permitted here as if
    /// they were a binary property.
    Binary(&'a str),
    /// Return a class corresponding to all codepoints whose property
    /// (identified by `property_name`) corresponds to the given value
    /// (identified by `property_value`).
    ByValue {
        /// A property name.
        property_name: &'a str,
        /// A property value.
        property_value: &'a str,
    },
}

115impl<'a> ClassQuery<'a> {
116 fn canonicalize(&self) -> Result<CanonicalClassQuery> {
117 match *self {
118 ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()),
119 ClassQuery::Binary(name) => self.canonical_binary(name),
120 ClassQuery::ByValue { property_name, property_value } => {
121 let property_name = normalize(property_name);
122 let property_value = normalize(property_value);
123
124 let canon_name = match canonical_prop(&property_name) {
125 None => return Err(Error::PropertyNotFound),
126 Some(canon_name) => canon_name,
127 };
128 Ok(match canon_name {
129 "General_Category" => {
130 let canon = match canonical_gencat(&property_value) {
131 None => return Err(Error::PropertyValueNotFound),
132 Some(canon) => canon,
133 };
134 CanonicalClassQuery::GeneralCategory(canon)
135 }
136 "Script" => {
137 let canon = match canonical_script(&property_value) {
138 None => return Err(Error::PropertyValueNotFound),
139 Some(canon) => canon,
140 };
141 CanonicalClassQuery::Script(canon)
142 }
143 _ => {
144 let vals = match property_values(canon_name) {
145 None => return Err(Error::PropertyValueNotFound),
146 Some(vals) => vals,
147 };
148 let canon_val = match canonical_value(
149 vals,
150 &property_value,
151 ) {
152 None => return Err(Error::PropertyValueNotFound),
153 Some(canon_val) => canon_val,
154 };
155 CanonicalClassQuery::ByValue {
156 property_name: canon_name,
157 property_value: canon_val,
158 }
159 }
160 })
161 }
162 }
163 }
164
165 fn canonical_binary(&self, name: &str) -> Result<CanonicalClassQuery> {
166 let norm = normalize(name);
167
168 if let Some(canon) = canonical_prop(&norm) {
169 return Ok(CanonicalClassQuery::Binary(canon));
170 }
171 if let Some(canon) = canonical_gencat(&norm) {
172 return Ok(CanonicalClassQuery::GeneralCategory(canon));
173 }
174 if let Some(canon) = canonical_script(&norm) {
175 return Ok(CanonicalClassQuery::Script(canon));
176 }
177 Err(Error::PropertyNotFound)
178 }
179}
180
/// Like ClassQuery, but its parameters have been canonicalized. This also
/// differentiates binary properties from flattened general categories and
/// scripts.
#[derive(Debug, Eq, PartialEq)]
enum CanonicalClassQuery {
    /// The canonical binary property name.
    Binary(&'static str),
    /// The canonical general category name.
    GeneralCategory(&'static str),
    /// The canonical script name.
    Script(&'static str),
    /// An arbitrary association between property and value, both of which
    /// have been canonicalized.
    ///
    /// Note that by construction, the property name of ByValue will never
    /// be General_Category or Script. Those two cases are subsumed by the
    /// eponymous variants.
    ByValue {
        /// The canonical property name.
        property_name: &'static str,
        /// The canonical property value.
        property_value: &'static str,
    },
}

206/// Looks up a Unicode class given a query. If one doesn't exist, then
207/// `None` is returned.
208pub fn class<'a>(query: ClassQuery<'a>) -> Result<hir::ClassUnicode> {
209 use self::CanonicalClassQuery::*;
210
94b46f34 211 match query.canonicalize()? {
0531ce1d
XL
212 Binary(name) => {
213 property_set(property_bool::BY_NAME, name)
214 .map(hir_class)
215 .ok_or(Error::PropertyNotFound)
216 }
217 GeneralCategory("Any") => {
218 Ok(hir_class(&[('\0', '\u{10FFFF}')]))
219 }
220 GeneralCategory("Assigned") => {
221 let mut cls =
94b46f34 222 property_set(general_category::BY_NAME, "Unassigned")
0531ce1d 223 .map(hir_class)
94b46f34 224 .ok_or(Error::PropertyNotFound)?;
0531ce1d
XL
225 cls.negate();
226 Ok(cls)
227 }
228 GeneralCategory("ASCII") => {
229 Ok(hir_class(&[('\0', '\x7F')]))
230 }
231 GeneralCategory(name) => {
232 property_set(general_category::BY_NAME, name)
233 .map(hir_class)
234 .ok_or(Error::PropertyValueNotFound)
235 }
236 Script(name) => {
237 property_set(script::BY_NAME, name)
238 .map(hir_class)
239 .ok_or(Error::PropertyValueNotFound)
240 }
241 ByValue { property_name: "Age", property_value } => {
242 let mut class = hir::ClassUnicode::empty();
94b46f34 243 for set in ages(property_value)? {
0531ce1d
XL
244 class.union(&hir_class(set));
245 }
246 Ok(class)
247 }
248 ByValue { property_name: "Script_Extensions", property_value } => {
249 property_set(script_extension::BY_NAME, property_value)
250 .map(hir_class)
251 .ok_or(Error::PropertyValueNotFound)
252 }
253 _ => {
254 // What else should we support?
255 Err(Error::PropertyNotFound)
256 }
257 }
258}
259
260/// Build a Unicode HIR class from a sequence of Unicode scalar value ranges.
261pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode {
262 let hir_ranges: Vec<hir::ClassUnicodeRange> = ranges
263 .iter()
264 .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e))
265 .collect();
266 hir::ClassUnicode::new(hir_ranges)
267}
268
269fn canonical_prop(normalized_name: &str) -> Option<&'static str> {
270 ucd_util::canonical_property_name(PROPERTY_NAMES, normalized_name)
271}
272
273fn canonical_gencat(normalized_value: &str) -> Option<&'static str> {
274 match normalized_value {
275 "any" => Some("Any"),
276 "assigned" => Some("Assigned"),
277 "ascii" => Some("ASCII"),
278 _ => {
279 let gencats = property_values("General_Category").unwrap();
280 canonical_value(gencats, normalized_value)
281 }
282 }
283}
284
285fn canonical_script(normalized_value: &str) -> Option<&'static str> {
286 let scripts = property_values("Script").unwrap();
287 canonical_value(scripts, normalized_value)
288}
289
290fn canonical_value(
291 vals: PropertyValues,
292 normalized_value: &str,
293) -> Option<&'static str> {
294 ucd_util::canonical_property_value(vals, normalized_value)
295}
296
297fn normalize(x: &str) -> String {
298 let mut x = x.to_string();
299 ucd_util::symbolic_name_normalize(&mut x);
300 x
301}
302
303fn property_values(
304 canonical_property_name: &'static str,
305) -> Option<PropertyValues>
306{
307 ucd_util::property_values(PROPERTY_VALUES, canonical_property_name)
308}
309
/// Find the codepoint ranges associated with `canonical` in `name_map`.
///
/// `name_map` is searched with a binary search on the name, so it must be
/// sorted by name for the lookup to be reliable. Returns `None` when no
/// entry has exactly the given name.
fn property_set(
    name_map: &'static [(&'static str, &'static [(char, char)])],
    canonical: &'static str,
) -> Option<&'static [(char, char)]> {
    name_map
        .binary_search_by_key(&canonical, |entry| entry.0)
        .ok()
        .map(|i| name_map[i].1)
}

/// An iterator over Unicode Age sets. Each item corresponds to a set of
/// codepoints that were added in a particular revision of Unicode. The
/// iterator yields items in chronological order.
#[derive(Debug)]
struct AgeIter {
    /// The `(name, ranges)` entries that have not been yielded yet.
    ages: &'static [(&'static str, &'static [(char, char)])],
}

328fn ages(canonical_age: &str) -> Result<AgeIter> {
329 const AGES: &'static [(&'static str, &'static [(char, char)])] = &[
330 ("V1_1", age::V1_1),
331 ("V2_0", age::V2_0),
332 ("V2_1", age::V2_1),
333 ("V3_0", age::V3_0),
334 ("V3_1", age::V3_1),
335 ("V3_2", age::V3_2),
336 ("V4_0", age::V4_0),
337 ("V4_1", age::V4_1),
338 ("V5_0", age::V5_0),
339 ("V5_1", age::V5_1),
340 ("V5_2", age::V5_2),
341 ("V6_0", age::V6_0),
342 ("V6_1", age::V6_1),
343 ("V6_2", age::V6_2),
344 ("V6_3", age::V6_3),
345 ("V7_0", age::V7_0),
346 ("V8_0", age::V8_0),
347 ("V9_0", age::V9_0),
348 ("V10_0", age::V10_0),
349 ];
350 assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync");
351
352 let pos = AGES.iter().position(|&(age, _)| canonical_age == age);
353 match pos {
354 None => Err(Error::PropertyValueNotFound),
355 Some(i) => Ok(AgeIter { ages: &AGES[..i+1] }),
356 }
357}
358
359impl Iterator for AgeIter {
360 type Item = &'static [(char, char)];
361
362 fn next(&mut self) -> Option<&'static [(char, char)]> {
363 if self.ages.is_empty() {
364 None
365 } else {
366 let set = self.ages[0];
367 self.ages = &self.ages[1..];
368 Some(set.1)
369 }
370 }
371}
372
#[cfg(test)]
mod tests {
    use super::{contains_simple_case_mapping, simple_fold};

    #[test]
    fn simple_fold_k() {
        // 'k', 'K' and U+212A KELVIN SIGN form one simple case folding
        // class. The Kelvin sign is written as an escape because it is
        // visually indistinguishable from ASCII 'K'.
        let xs: Vec<char> = simple_fold('k').unwrap().collect();
        assert_eq!(xs, vec!['K', '\u{212A}']);

        let xs: Vec<char> = simple_fold('K').unwrap().collect();
        assert_eq!(xs, vec!['k', '\u{212A}']);

        let xs: Vec<char> = simple_fold('\u{212A}').unwrap().collect();
        assert_eq!(xs, vec!['K', 'k']);
    }

    #[test]
    fn simple_fold_a() {
        let xs: Vec<char> = simple_fold('a').unwrap().collect();
        assert_eq!(xs, vec!['A']);

        let xs: Vec<char> = simple_fold('A').unwrap().collect();
        assert_eq!(xs, vec!['a']);
    }

    #[test]
    fn simple_fold_empty() {
        assert_eq!(Some('A'), simple_fold('?').unwrap_err());
        assert_eq!(Some('A'), simple_fold('@').unwrap_err());
        assert_eq!(Some('a'), simple_fold('[').unwrap_err());
        assert_eq!(Some('Ⰰ'), simple_fold('☃').unwrap_err());
    }

    #[test]
    fn simple_fold_max() {
        assert_eq!(None, simple_fold('\u{10FFFE}').unwrap_err());
        assert_eq!(None, simple_fold('\u{10FFFF}').unwrap_err());
    }

    #[test]
    fn range_contains() {
        assert!(contains_simple_case_mapping('A', 'A'));
        assert!(contains_simple_case_mapping('Z', 'Z'));
        assert!(contains_simple_case_mapping('A', 'Z'));
        assert!(contains_simple_case_mapping('@', 'A'));
        assert!(contains_simple_case_mapping('Z', '['));
        assert!(contains_simple_case_mapping('☃', 'Ⰰ'));

        assert!(!contains_simple_case_mapping('[', '['));
        assert!(!contains_simple_case_mapping('[', '`'));

        assert!(!contains_simple_case_mapping('☃', '☃'));
    }

    #[test]
    fn regression_466() {
        use super::{CanonicalClassQuery, ClassQuery};

        let q = ClassQuery::OneLetter('C');
        assert_eq!(
            q.canonicalize().unwrap(),
            CanonicalClassQuery::GeneralCategory("Other"));
    }
}