use std::cmp::Ordering;
use std::result;

use ucd_util::{self, PropertyValues};

use hir;
use unicode_tables::age;
use unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE;
use unicode_tables::general_category;
use unicode_tables::property_bool;
use unicode_tables::property_names::PROPERTY_NAMES;
use unicode_tables::property_values::PROPERTY_VALUES;
use unicode_tables::script;
use unicode_tables::script_extension;
16 type Result
<T
> = result
::Result
<T
, Error
>;
/// An error that occurs when dealing with Unicode.
///
/// We don't impl the Error trait here because these always get converted
/// into other public errors. (This error type isn't exported.)
#[derive(Debug)]
pub enum Error {
    /// The requested property name could not be resolved.
    PropertyNotFound,
    /// The property was resolved, but the requested value for it was not.
    PropertyValueNotFound,
}
/// Encode the given Unicode character to `dst` as a single UTF-8 sequence.
///
/// If `dst` is not long enough, then `None` is returned. Otherwise, the number
/// of bytes written is returned.
///
/// Only the first 1 to 4 bytes of `dst` are written; any remaining bytes are
/// left untouched.
pub fn encode_utf8(character: char, dst: &mut [u8]) -> Option<usize> {
    // TODO: Remove this function once we move to at least Rust 1.15, which
    // provides char::encode_utf8 for us.
    const TAG_CONT: u8 = 0b1000_0000;
    const TAG_TWO: u8 = 0b1100_0000;
    const TAG_THREE: u8 = 0b1110_0000;
    const TAG_FOUR: u8 = 0b1111_0000;

    let code = character as u32;
    // Each branch checks both the width of the scalar value and the room
    // available. A buffer that is too short for the required width can never
    // satisfy a later (wider) branch, so it ends up in the final `None`.
    if code <= 0x7F && !dst.is_empty() {
        dst[0] = code as u8;
        Some(1)
    } else if code <= 0x7FF && dst.len() >= 2 {
        dst[0] = ((code >> 6) & 0x1F) as u8 | TAG_TWO;
        dst[1] = (code & 0x3F) as u8 | TAG_CONT;
        Some(2)
    } else if code <= 0xFFFF && dst.len() >= 3 {
        dst[0] = ((code >> 12) & 0x0F) as u8 | TAG_THREE;
        dst[1] = ((code >> 6) & 0x3F) as u8 | TAG_CONT;
        dst[2] = (code & 0x3F) as u8 | TAG_CONT;
        Some(3)
    } else if dst.len() >= 4 {
        dst[0] = ((code >> 18) & 0x07) as u8 | TAG_FOUR;
        dst[1] = ((code >> 12) & 0x3F) as u8 | TAG_CONT;
        dst[2] = ((code >> 6) & 0x3F) as u8 | TAG_CONT;
        dst[3] = (code & 0x3F) as u8 | TAG_CONT;
        Some(4)
    } else {
        None
    }
}
/// An iterator over a codepoint's simple case equivalence class.
#[derive(Debug)]
pub struct SimpleFoldIter(::std::slice::Iter<'static, char>);

impl Iterator for SimpleFoldIter {
    type Item = char;

    /// Yields the next member of the equivalence class by value.
    fn next(&mut self) -> Option<char> {
        self.0.next().cloned()
    }
}
76 /// Return an iterator over the equivalence class of simple case mappings
77 /// for the given codepoint. The equivalence class does not include the
80 /// If the equivalence class is empty, then this returns the next scalar
81 /// value that has a non-empty equivalence class, if it exists. If no such
82 /// scalar value exists, then `None` is returned. The point of this behavior
83 /// is to permit callers to avoid calling `simple_fold` more than they need
84 /// to, since there is some cost to fetching the equivalence class.
85 pub fn simple_fold(c
: char) -> result
::Result
<SimpleFoldIter
, Option
<char>> {
87 .binary_search_by_key(&c
, |&(c1
, _
)| c1
)
88 .map(|i
| SimpleFoldIter(CASE_FOLDING_SIMPLE
[i
].1.iter
()))
90 if i
>= CASE_FOLDING_SIMPLE
.len() {
93 Some(CASE_FOLDING_SIMPLE
[i
].0)
98 /// Returns true if and only if the given (inclusive) range contains at least
99 /// one Unicode scalar value that has a non-empty non-trivial simple case
102 /// This function panics if `end < start`.
103 pub fn contains_simple_case_mapping(start
: char, end
: char) -> bool
{
104 assert
!(start
<= end
);
106 .binary_search_by(|&(c
, _
)| {
107 if start
<= c
&& c
<= end
{
/// A query for finding a character class defined by Unicode. This supports
/// either use of a property name directly, or lookup by property value. The
/// former generally refers to Binary properties (see UTS#44, Table 8), but
/// as a special exception (see UTS#18, Section 1.2) both general categories
/// (an enumeration) and scripts (a catalog) are supported as if each of their
/// possible values were a binary property.
///
/// In all circumstances, property names and values are normalized and
/// canonicalized. That is, `GC == gc == GeneralCategory == general_category`.
///
/// The lifetime `'a` refers to the shorter of the lifetimes of property name
/// and property value.
#[derive(Debug)]
pub enum ClassQuery<'a> {
    /// Return a class corresponding to a Unicode binary property, named by
    /// a single letter.
    OneLetter(char),
    /// Return a class corresponding to a Unicode binary property.
    ///
    /// Note that, by special exception (see UTS#18, Section 1.2), both
    /// general category values and script values are permitted here as if
    /// they were a binary property.
    Binary(&'a str),
    /// Return a class corresponding to all codepoints whose property
    /// (identified by `property_name`) corresponds to the given value
    /// (identified by `property_value`).
    ByValue {
        /// A property name.
        property_name: &'a str,
        /// A property value.
        property_value: &'a str,
    },
}
151 impl<'a
> ClassQuery
<'a
> {
152 fn canonicalize(&self) -> Result
<CanonicalClassQuery
> {
154 ClassQuery
::OneLetter(c
) => self.canonical_binary(&c
.to_string()),
155 ClassQuery
::Binary(name
) => self.canonical_binary(name
),
156 ClassQuery
::ByValue { property_name, property_value }
=> {
157 let property_name
= normalize(property_name
);
158 let property_value
= normalize(property_value
);
160 let canon_name
= match canonical_prop(&property_name
) {
161 None
=> return Err(Error
::PropertyNotFound
),
162 Some(canon_name
) => canon_name
,
164 Ok(match canon_name
{
165 "General_Category" => {
166 let canon
= match canonical_gencat(&property_value
) {
167 None
=> return Err(Error
::PropertyValueNotFound
),
168 Some(canon
) => canon
,
170 CanonicalClassQuery
::GeneralCategory(canon
)
173 let canon
= match canonical_script(&property_value
) {
174 None
=> return Err(Error
::PropertyValueNotFound
),
175 Some(canon
) => canon
,
177 CanonicalClassQuery
::Script(canon
)
180 let vals
= match property_values(canon_name
) {
181 None
=> return Err(Error
::PropertyValueNotFound
),
184 let canon_val
= match canonical_value(
188 None
=> return Err(Error
::PropertyValueNotFound
),
189 Some(canon_val
) => canon_val
,
191 CanonicalClassQuery
::ByValue
{
192 property_name
: canon_name
,
193 property_value
: canon_val
,
201 fn canonical_binary(&self, name
: &str) -> Result
<CanonicalClassQuery
> {
202 let norm
= normalize(name
);
204 if let Some(canon
) = canonical_prop(&norm
) {
205 return Ok(CanonicalClassQuery
::Binary(canon
));
207 if let Some(canon
) = canonical_gencat(&norm
) {
208 return Ok(CanonicalClassQuery
::GeneralCategory(canon
));
210 if let Some(canon
) = canonical_script(&norm
) {
211 return Ok(CanonicalClassQuery
::Script(canon
));
213 Err(Error
::PropertyNotFound
)
/// Like ClassQuery, but its parameters have been canonicalized. This also
/// differentiates binary properties from flattened general categories and
/// scripts.
#[derive(Debug, Eq, PartialEq)]
enum CanonicalClassQuery {
    /// The canonical binary property name.
    Binary(&'static str),
    /// The canonical general category name.
    GeneralCategory(&'static str),
    /// The canonical script name.
    Script(&'static str),
    /// An arbitrary association between property and value, both of which
    /// have been canonicalized.
    ///
    /// Note that by construction, the property name of ByValue will never
    /// be General_Category or Script. Those two cases are subsumed by the
    /// eponymous variants.
    ByValue {
        /// The canonical property name.
        property_name: &'static str,
        /// The canonical property value.
        property_value: &'static str,
    },
}
242 /// Looks up a Unicode class given a query. If one doesn't exist, then
243 /// `None` is returned.
244 pub fn class
<'a
>(query
: ClassQuery
<'a
>) -> Result
<hir
::ClassUnicode
> {
245 use self::CanonicalClassQuery
::*;
247 match try
!(query
.canonicalize()) {
249 property_set(property_bool
::BY_NAME
, name
)
251 .ok_or(Error
::PropertyNotFound
)
253 GeneralCategory("Any") => {
254 Ok(hir_class(&[('
\0'
, '
\u{10FFFF}'
)]))
256 GeneralCategory("Assigned") => {
258 try
!(property_set(general_category
::BY_NAME
, "Unassigned")
260 .ok_or(Error
::PropertyNotFound
));
264 GeneralCategory("ASCII") => {
265 Ok(hir_class(&[('
\0'
, '
\x7F'
)]))
267 GeneralCategory(name
) => {
268 property_set(general_category
::BY_NAME
, name
)
270 .ok_or(Error
::PropertyValueNotFound
)
273 property_set(script
::BY_NAME
, name
)
275 .ok_or(Error
::PropertyValueNotFound
)
277 ByValue { property_name: "Age", property_value }
=> {
278 let mut class
= hir
::ClassUnicode
::empty();
279 for set
in try
!(ages(property_value
)) {
280 class
.union(&hir_class(set
));
284 ByValue { property_name: "Script_Extensions", property_value }
=> {
285 property_set(script_extension
::BY_NAME
, property_value
)
287 .ok_or(Error
::PropertyValueNotFound
)
290 // What else should we support?
291 Err(Error
::PropertyNotFound
)
296 /// Build a Unicode HIR class from a sequence of Unicode scalar value ranges.
297 pub fn hir_class(ranges
: &[(char, char)]) -> hir
::ClassUnicode
{
298 let hir_ranges
: Vec
<hir
::ClassUnicodeRange
> = ranges
300 .map(|&(s
, e
)| hir
::ClassUnicodeRange
::new(s
, e
))
302 hir
::ClassUnicode
::new(hir_ranges
)
305 fn canonical_prop(normalized_name
: &str) -> Option
<&'
static str> {
306 ucd_util
::canonical_property_name(PROPERTY_NAMES
, normalized_name
)
309 fn canonical_gencat(normalized_value
: &str) -> Option
<&'
static str> {
310 match normalized_value
{
311 "any" => Some("Any"),
312 "assigned" => Some("Assigned"),
313 "ascii" => Some("ASCII"),
315 let gencats
= property_values("General_Category").unwrap();
316 canonical_value(gencats
, normalized_value
)
321 fn canonical_script(normalized_value
: &str) -> Option
<&'
static str> {
322 let scripts
= property_values("Script").unwrap();
323 canonical_value(scripts
, normalized_value
)
327 vals
: PropertyValues
,
328 normalized_value
: &str,
329 ) -> Option
<&'
static str> {
330 ucd_util
::canonical_property_value(vals
, normalized_value
)
333 fn normalize(x
: &str) -> String
{
334 let mut x
= x
.to_string();
335 ucd_util
::symbolic_name_normalize(&mut x
);
340 canonical_property_name
: &'
static str,
341 ) -> Option
<PropertyValues
>
343 ucd_util
::property_values(PROPERTY_VALUES
, canonical_property_name
)
/// Find the codepoint ranges registered under `canonical` in `name_map`.
///
/// `name_map` must be sorted by name, since the lookup is a binary search.
/// Returns `None` if no entry has that exact name.
fn property_set(
    name_map: &'static [(&'static str, &'static [(char, char)])],
    canonical: &'static str,
) -> Option<&'static [(char, char)]> {
    name_map
        .binary_search_by_key(&canonical, |x| x.0)
        .ok()
        .map(|i| name_map[i].1)
}
/// An iterator over Unicode Age sets. Each item corresponds to a set of
/// codepoints that were added in a particular revision of Unicode. The
/// iterator yields items in chronological order.
#[derive(Debug)]
struct AgeIter {
    /// The remaining `(age name, codepoint ranges)` entries, oldest first.
    ages: &'static [(&'static str, &'static [(char, char)])],
}
364 fn ages(canonical_age
: &str) -> Result
<AgeIter
> {
365 const AGES
: &'
static [(&'
static str, &'
static [(char, char)])] = &[
384 ("V10_0", age
::V10_0
),
386 assert_eq
!(AGES
.len(), age
::BY_NAME
.len(), "ages are out of sync");
388 let pos
= AGES
.iter().position(|&(age
, _
)| canonical_age
== age
);
390 None
=> Err(Error
::PropertyValueNotFound
),
391 Some(i
) => Ok(AgeIter { ages: &AGES[..i+1] }
),
395 impl Iterator
for AgeIter
{
396 type Item
= &'
static [(char, char)];
398 fn next(&mut self) -> Option
<&'
static [(char, char)]> {
399 if self.ages
.is_empty() {
402 let set
= self.ages
[0];
403 self.ages
= &self.ages
[1..];
411 use super::{contains_simple_case_mapping, simple_fold}
;
#[test]
fn simple_fold_k() {
    // U+212A (KELVIN SIGN) renders almost identically to ASCII 'K', so it is
    // written as an escape here to keep the three cases distinguishable.
    let xs: Vec<char> = simple_fold('k').unwrap().collect();
    assert_eq!(xs, vec!['K', '\u{212A}']);

    let xs: Vec<char> = simple_fold('K').unwrap().collect();
    assert_eq!(xs, vec!['k', '\u{212A}']);

    let xs: Vec<char> = simple_fold('\u{212A}').unwrap().collect();
    assert_eq!(xs, vec!['K', 'k']);
}
#[test]
fn simple_fold_a() {
    // The returned class never includes the queried codepoint itself.
    let xs: Vec<char> = simple_fold('a').unwrap().collect();
    assert_eq!(xs, vec!['A']);

    let xs: Vec<char> = simple_fold('A').unwrap().collect();
    assert_eq!(xs, vec!['a']);
}
#[test]
fn simple_fold_empty() {
    // Codepoints without a mapping report the next codepoint that has one.
    assert_eq!(Some('A'), simple_fold('?').unwrap_err());
    assert_eq!(Some('A'), simple_fold('@').unwrap_err());
    assert_eq!(Some('a'), simple_fold('[').unwrap_err());
    assert_eq!(Some('Ⰰ'), simple_fold('☃').unwrap_err());
}
#[test]
fn simple_fold_max() {
    // Nothing at or after these codepoints has a mapping, so there is no
    // "next" codepoint to report.
    assert_eq!(None, simple_fold('\u{10FFFE}').unwrap_err());
    assert_eq!(None, simple_fold('\u{10FFFF}').unwrap_err());
}
#[test]
fn range_contains() {
    // Ranges that include at least one cased codepoint.
    assert!(contains_simple_case_mapping('A', 'A'));
    assert!(contains_simple_case_mapping('Z', 'Z'));
    assert!(contains_simple_case_mapping('A', 'Z'));
    assert!(contains_simple_case_mapping('@', 'A'));
    assert!(contains_simple_case_mapping('Z', '['));
    assert!(contains_simple_case_mapping('☃', 'Ⰰ'));

    // Ranges that fall entirely between cased codepoints.
    assert!(!contains_simple_case_mapping('[', '['));
    assert!(!contains_simple_case_mapping('[', '`'));

    assert!(!contains_simple_case_mapping('☃', '☃'));
}
#[test]
fn regression_466() {
    use super::{CanonicalClassQuery, ClassQuery};

    // The one-letter query `C` must resolve to the general category
    // "Other".
    let q = ClassQuery::OneLetter('C');
    assert_eq!(
        q.canonicalize().unwrap(),
        CanonicalClassQuery::GeneralCategory("Other"));
}