]> git.proxmox.com Git - rustc.git/blob - vendor/utf8-ranges/src/lib.rs
New upstream version 1.44.1+dfsg1
[rustc.git] / vendor / utf8-ranges / src / lib.rs
1 /*!
2 Crate `utf8-ranges` converts ranges of Unicode scalar values to equivalent
3 ranges of UTF-8 bytes. This is useful for constructing byte based automatons
4 that need to embed UTF-8 decoding.
5
6 See the documentation on the `Utf8Sequences` iterator for more details and
7 an example.
8
9 # Wait, what is this?
10
11 This is simplest to explain with an example. Let's say you wanted to test
12 whether a particular byte sequence was a Cyrillic character. One possible
13 scalar value range is `[0400-04FF]`. The set of allowed bytes for this
14 range can be expressed as a sequence of byte ranges:
15
16 ```ignore
17 [D0-D3][80-BF]
18 ```
19
20 This is simple enough: simply encode the boundaries, `0400` encodes to
21 `D0 80` and `04FF` encodes to `D3 BF`, and create ranges from each
22 corresponding pair of bytes: `D0` to `D3` and `80` to `BF`.
23
24 However, what if you wanted to add the Cyrillic Supplementary characters to
25 your range? Your range might then become `[0400-052F]`. The same procedure
26 as above doesn't quite work because `052F` encodes to `D4 AF`. The byte ranges
27 you'd get from the previous transformation would be `[D0-D4][80-AF]`. However,
28 this isn't quite correct because this range doesn't capture many characters,
29 for example, `04FF` (because its last byte, `BF` isn't in the range `80-AF`).
30
31 Instead, you need multiple sequences of byte ranges:
32
33 ```ignore
34 [D0-D3][80-BF] # matches codepoints 0400-04FF
35 [D4][80-AF] # matches codepoints 0500-052F
36 ```
37
38 This gets even more complicated if you want bigger ranges, particularly if
39 they naively contain surrogate codepoints. For example, the sequence of byte
40 ranges for the basic multilingual plane (`[0000-FFFF]`) look like this:
41
42 ```ignore
43 [0-7F]
44 [C2-DF][80-BF]
45 [E0][A0-BF][80-BF]
46 [E1-EC][80-BF][80-BF]
47 [ED][80-9F][80-BF]
48 [EE-EF][80-BF][80-BF]
49 ```
50
51 Note that the byte ranges above will *not* match any erroneous encoding of
52 UTF-8, including encodings of surrogate codepoints.
53
54 And, of course, for all of Unicode (`[000000-10FFFF]`):
55
56 ```ignore
57 [0-7F]
58 [C2-DF][80-BF]
59 [E0][A0-BF][80-BF]
60 [E1-EC][80-BF][80-BF]
61 [ED][80-9F][80-BF]
62 [EE-EF][80-BF][80-BF]
63 [F0][90-BF][80-BF][80-BF]
64 [F1-F3][80-BF][80-BF][80-BF]
65 [F4][80-8F][80-BF][80-BF]
66 ```
67
68 This crate automates the process of creating these byte ranges from ranges of
69 Unicode scalar values.
70
71 # Why would I ever use this?
72
73 You probably won't ever need this. In 99% of cases, you just decode the byte
74 sequence into a Unicode scalar value and compare scalar values directly.
75 However, this explicit decoding step isn't always possible. For example, the
76 construction of some finite state machines may benefit from converting ranges
77 of scalar values into UTF-8 decoder automata (e.g., for character classes in
78 regular expressions).
79
80 # Lineage
81
82 I got the idea and general implementation strategy from Russ Cox in his
83 [article on regexps](https://web.archive.org/web/20160404141123/https://swtch.com/~rsc/regexp/regexp3.html) and RE2.
84 Russ Cox got it from Ken Thompson's `grep` (no source, folk lore?).
85 I also got the idea from
86 [Lucene](https://github.com/apache/lucene-solr/blob/ae93f4e7ac6a3908046391de35d4f50a0d3c59ca/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java),
87 which uses it for executing automata on their term index.
88 */
89
90 #![deny(missing_docs)]
91
92 #[cfg(test)] extern crate quickcheck;
93
94 use std::char;
95 use std::fmt;
96 use std::slice;
97
98 use char_utf8::encode_utf8;
99
100 const MAX_UTF8_BYTES: usize = 4;
101
102 mod char_utf8;
103
104 /// Utf8Sequence represents a sequence of byte ranges.
105 ///
106 /// To match a Utf8Sequence, a candidate byte sequence must match each
107 /// successive range.
108 ///
109 /// For example, if there are two ranges, `[C2-DF][80-BF]`, then the byte
110 /// sequence `\xDD\x61` would not match because `0x61 < 0x80`.
111 #[derive(Copy, Clone, Eq, PartialEq)]
112 pub enum Utf8Sequence {
113 /// One byte range.
114 One(Utf8Range),
115 /// Two successive byte ranges.
116 Two([Utf8Range; 2]),
117 /// Three successive byte ranges.
118 Three([Utf8Range; 3]),
119 /// Four successive byte ranges.
120 Four([Utf8Range; 4]),
121 }
122
123 impl Utf8Sequence {
124 /// Creates a new UTF-8 sequence from the encoded bytes of a scalar value
125 /// range.
126 ///
127 /// This assumes that `start` and `end` have the same length.
128 fn from_encoded_range(start: &[u8], end: &[u8]) -> Self {
129 assert_eq!(start.len(), end.len());
130 match start.len() {
131 2 => Utf8Sequence::Two([
132 Utf8Range::new(start[0], end[0]),
133 Utf8Range::new(start[1], end[1]),
134 ]),
135 3 => Utf8Sequence::Three([
136 Utf8Range::new(start[0], end[0]),
137 Utf8Range::new(start[1], end[1]),
138 Utf8Range::new(start[2], end[2]),
139 ]),
140 4 => Utf8Sequence::Four([
141 Utf8Range::new(start[0], end[0]),
142 Utf8Range::new(start[1], end[1]),
143 Utf8Range::new(start[2], end[2]),
144 Utf8Range::new(start[3], end[3]),
145 ]),
146 n => unreachable!("invalid encoded length: {}", n),
147 }
148 }
149
150 /// Returns the underlying sequence of byte ranges as a slice.
151 pub fn as_slice(&self) -> &[Utf8Range] {
152 use self::Utf8Sequence::*;
153 match *self {
154 One(ref r) => unsafe { slice::from_raw_parts(r, 1) },
155 Two(ref r) => &r[..],
156 Three(ref r) => &r[..],
157 Four(ref r) => &r[..],
158 }
159 }
160
161 /// Returns the number of byte ranges in this sequence.
162 ///
163 /// The length is guaranteed to be in the closed interval `[1, 4]`.
164 pub fn len(&self) -> usize {
165 self.as_slice().len()
166 }
167
168 /// Returns true if and only if a prefix of `bytes` matches this sequence
169 /// of byte ranges.
170 pub fn matches(&self, bytes: &[u8]) -> bool {
171 if bytes.len() < self.len() {
172 return false;
173 }
174 for (&b, r) in bytes.iter().zip(self) {
175 if !r.matches(b) {
176 return false;
177 }
178 }
179 true
180 }
181 }
182
183 impl<'a> IntoIterator for &'a Utf8Sequence {
184 type IntoIter = slice::Iter<'a, Utf8Range>;
185 type Item = &'a Utf8Range;
186
187 fn into_iter(self) -> Self::IntoIter {
188 self.as_slice().into_iter()
189 }
190 }
191
192 impl fmt::Debug for Utf8Sequence {
193 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
194 use self::Utf8Sequence::*;
195 match *self {
196 One(ref r) => write!(f, "{:?}", r),
197 Two(ref r) => write!(f, "{:?}{:?}", r[0], r[1]),
198 Three(ref r) => write!(f, "{:?}{:?}{:?}", r[0], r[1], r[2]),
199 Four(ref r) => write!(f, "{:?}{:?}{:?}{:?}",
200 r[0], r[1], r[2], r[3]),
201 }
202 }
203 }
204
205 /// A single inclusive range of UTF-8 bytes.
206 #[derive(Clone, Copy, PartialEq, Eq)]
207 pub struct Utf8Range {
208 /// Start of byte range (inclusive).
209 pub start: u8,
210 /// End of byte range (inclusive).
211 pub end: u8,
212 }
213
214 impl Utf8Range {
215 fn new(start: u8, end: u8) -> Self {
216 Utf8Range { start: start, end: end }
217 }
218
219 /// Returns true if and only if the given byte is in this range.
220 pub fn matches(&self, b: u8) -> bool {
221 self.start <= b && b <= self.end
222 }
223 }
224
225 impl fmt::Debug for Utf8Range {
226 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
227 if self.start == self.end {
228 write!(f, "[{:X}]", self.start)
229 } else {
230 write!(f, "[{:X}-{:X}]", self.start, self.end)
231 }
232 }
233 }
234
235 /// An iterator over ranges of matching UTF-8 byte sequences.
236 ///
237 /// The iteration represents an alternation of comprehensive byte sequences
238 /// that match precisely the set of UTF-8 encoded scalar values.
239 ///
240 /// A byte sequence corresponds to one of the scalar values in the range given
241 /// if and only if it completely matches exactly one of the sequences of byte
242 /// ranges produced by this iterator.
243 ///
244 /// Each sequence of byte ranges matches a unique set of bytes. That is, no two
245 /// sequences will match the same bytes.
246 ///
247 /// # Example
248 ///
249 /// This shows how to match an arbitrary byte sequence against a range of
250 /// scalar values.
251 ///
252 /// ```rust
253 /// use utf8_ranges::{Utf8Sequences, Utf8Sequence};
254 ///
255 /// fn matches(seqs: &[Utf8Sequence], bytes: &[u8]) -> bool {
256 /// for range in seqs {
257 /// if range.matches(bytes) {
258 /// return true;
259 /// }
260 /// }
261 /// false
262 /// }
263 ///
264 /// // Test the basic multilingual plane.
265 /// let seqs: Vec<_> = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect();
266 ///
267 /// // UTF-8 encoding of 'a'.
268 /// assert!(matches(&seqs, &[0x61]));
269 /// // UTF-8 encoding of '☃' (`\u{2603}`).
270 /// assert!(matches(&seqs, &[0xE2, 0x98, 0x83]));
271 /// // UTF-8 encoding of `\u{10348}` (outside the BMP).
272 /// assert!(!matches(&seqs, &[0xF0, 0x90, 0x8D, 0x88]));
273 /// // Tries to match against a UTF-8 encoding of a surrogate codepoint,
274 /// // which is invalid UTF-8, and therefore fails, despite the fact that
275 /// // the corresponding codepoint (0xD800) falls in the range given.
276 /// assert!(!matches(&seqs, &[0xED, 0xA0, 0x80]));
277 /// // And fails against plain old invalid UTF-8.
278 /// assert!(!matches(&seqs, &[0xFF, 0xFF]));
279 /// ```
280 ///
281 /// If this example seems circuitous, that's because it is! It's meant to be
282 /// illustrative. In practice, you could just try to decode your byte sequence
283 /// and compare it with the scalar value range directly. However, this is not
284 /// always possible (for example, in a byte based automaton).
285 pub struct Utf8Sequences {
286 range_stack: Vec<ScalarRange>,
287 }
288
289 impl Utf8Sequences {
290 /// Create a new iterator over UTF-8 byte ranges for the scalar value range
291 /// given.
292 pub fn new(start: char, end: char) -> Self {
293 let mut it = Utf8Sequences { range_stack: vec![] };
294 it.push(start as u32, end as u32);
295 it
296 }
297
298 /// reset resets the scalar value range.
299 /// Any existing state is cleared, but resources may be reused.
300 ///
301 /// N.B. Benchmarks say that this method is dubious.
302 #[doc(hidden)]
303 pub fn reset(&mut self, start: char, end: char) {
304 self.range_stack.clear();
305 self.push(start as u32, end as u32);
306 }
307
308 fn push(&mut self, start: u32, end: u32) {
309 self.range_stack.push(ScalarRange { start: start, end: end });
310 }
311 }
312
313 struct ScalarRange {
314 start: u32,
315 end: u32,
316 }
317
318 impl fmt::Debug for ScalarRange {
319 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
320 write!(f, "ScalarRange({:X}, {:X})", self.start, self.end)
321 }
322 }
323
324 impl Iterator for Utf8Sequences {
325 type Item = Utf8Sequence;
326
327 fn next(&mut self) -> Option<Self::Item> {
328 'TOP:
329 while let Some(mut r) = self.range_stack.pop() {
330 'INNER:
331 loop {
332 if let Some((r1, r2)) = r.split() {
333 self.push(r2.start, r2.end);
334 r.start = r1.start;
335 r.end = r1.end;
336 continue 'INNER;
337 }
338 if !r.is_valid() {
339 continue 'TOP;
340 }
341 for i in 1..MAX_UTF8_BYTES {
342 let max = max_scalar_value(i);
343 if r.start <= max && max < r.end {
344 self.push(max + 1, r.end);
345 r.end = max;
346 continue 'INNER;
347 }
348 }
349 if let Some(ascii_range) = r.as_ascii() {
350 return Some(Utf8Sequence::One(ascii_range));
351 }
352 for i in 1..MAX_UTF8_BYTES {
353 let m = (1 << (6 * i)) - 1;
354 if (r.start & !m) != (r.end & !m) {
355 if (r.start & m) != 0 {
356 self.push((r.start | m) + 1, r.end);
357 r.end = r.start | m;
358 continue 'INNER;
359 }
360 if (r.end & m) != m {
361 self.push(r.end & !m, r.end);
362 r.end = (r.end & !m) - 1;
363 continue 'INNER;
364 }
365 }
366 }
367 let mut start = [0; MAX_UTF8_BYTES];
368 let mut end = [0; MAX_UTF8_BYTES];
369 let n = r.encode(&mut start, &mut end);
370 return Some(Utf8Sequence::from_encoded_range(
371 &start[0..n], &end[0..n]));
372 }
373 }
374 None
375 }
376 }
377
378 impl ScalarRange {
379 /// split splits this range if it overlaps with a surrogate codepoint.
380 ///
381 /// Either or both ranges may be invalid.
382 fn split(&self) -> Option<(ScalarRange, ScalarRange)> {
383 if self.start < 0xE000 && self.end > 0xD7FF {
384 Some((ScalarRange {
385 start: self.start,
386 end: 0xD7FF,
387 }, ScalarRange {
388 start: 0xE000,
389 end: self.end,
390 }))
391 } else {
392 None
393 }
394 }
395
396 /// is_valid returns true if and only if start <= end.
397 fn is_valid(&self) -> bool {
398 self.start <= self.end
399 }
400
401 /// as_ascii returns this range as a Utf8Range if and only if all scalar
402 /// values in this range can be encoded as a single byte.
403 fn as_ascii(&self) -> Option<Utf8Range> {
404 if self.is_ascii() {
405 Some(Utf8Range::new(self.start as u8, self.end as u8))
406 } else {
407 None
408 }
409 }
410
411 /// is_ascii returns true if the range is ASCII only (i.e., takes a single
412 /// byte to encode any scalar value).
413 fn is_ascii(&self) -> bool {
414 self.is_valid() && self.end <= 0x7f
415 }
416
417 /// encode writes the UTF-8 encoding of the start and end of this range
418 /// to the corresponding destination slices.
419 ///
420 /// The slices should have room for at least `MAX_UTF8_BYTES`.
421 fn encode(&self, start: &mut [u8], end: &mut [u8]) -> usize {
422 let cs = char::from_u32(self.start).unwrap();
423 let ce = char::from_u32(self.end).unwrap();
424 let n = encode_utf8(cs, start).unwrap();
425 let m = encode_utf8(ce, end).unwrap();
426 assert_eq!(n, m);
427 n
428 }
429 }
430
431 fn max_scalar_value(nbytes: usize) -> u32 {
432 match nbytes {
433 1 => 0x007F,
434 2 => 0x07FF,
435 3 => 0xFFFF,
436 4 => 0x10FFFF,
437 _ => unreachable!("invalid UTF-8 byte sequence size"),
438 }
439 }
440
441 #[cfg(test)]
442 mod tests {
443 use std::char;
444
445 use quickcheck::{TestResult, quickcheck};
446
447 use char_utf8::encode_utf8;
448 use {MAX_UTF8_BYTES, Utf8Range, Utf8Sequences};
449
450 fn rutf8(s: u8, e: u8) -> Utf8Range {
451 Utf8Range::new(s, e)
452 }
453
454 fn never_accepts_surrogate_codepoints(start: char, end: char) {
455 let mut buf = [0; MAX_UTF8_BYTES];
456 for cp in 0xD800..0xE000 {
457 let c = unsafe { ::std::mem::transmute(cp) };
458 let n = encode_utf8(c, &mut buf).unwrap();
459 for r in Utf8Sequences::new(start, end) {
460 if r.matches(&buf[0..n]) {
461 panic!("Sequence ({:X}, {:X}) contains range {:?}, \
462 which matches surrogate code point {:X} \
463 with encoded bytes {:?}",
464 start as u32, end as u32, r, cp, &buf[0..n]);
465 }
466 }
467 }
468 }
469
470 #[test]
471 fn codepoints_no_surrogates() {
472 never_accepts_surrogate_codepoints('\u{0}', '\u{FFFF}');
473 never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFF}');
474 never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFE}');
475 never_accepts_surrogate_codepoints('\u{80}', '\u{10FFFF}');
476 never_accepts_surrogate_codepoints('\u{D7FF}', '\u{E000}');
477 }
478
479 #[test]
480 fn single_codepoint_one_sequence() {
481 // Tests that every range of scalar values that contains a single
482 // scalar value is recognized by one sequence of byte ranges.
483 for i in 0x0..(0x10FFFF + 1) {
484 let c = match char::from_u32(i) {
485 None => continue,
486 Some(c) => c,
487 };
488 let seqs: Vec<_> = Utf8Sequences::new(c, c).collect();
489 assert_eq!(seqs.len(), 1);
490 }
491 }
492
493 #[test]
494 fn qc_codepoints_no_surrogate() {
495 fn p(s: char, e: char) -> TestResult {
496 if s > e {
497 return TestResult::discard();
498 }
499 never_accepts_surrogate_codepoints(s, e);
500 TestResult::passed()
501 }
502 quickcheck(p as fn(char, char) -> TestResult);
503 }
504
505 #[test]
506 fn bmp() {
507 use Utf8Sequence::*;
508
509 let seqs = Utf8Sequences::new('\u{0}', '\u{FFFF}')
510 .collect::<Vec<_>>();
511 assert_eq!(seqs, vec![
512 One(rutf8(0x0, 0x7F)),
513 Two([rutf8(0xC2, 0xDF), rutf8(0x80, 0xBF)]),
514 Three([rutf8(0xE0, 0xE0), rutf8(0xA0, 0xBF), rutf8(0x80, 0xBF)]),
515 Three([rutf8(0xE1, 0xEC), rutf8(0x80, 0xBF), rutf8(0x80, 0xBF)]),
516 Three([rutf8(0xED, 0xED), rutf8(0x80, 0x9F), rutf8(0x80, 0xBF)]),
517 Three([rutf8(0xEE, 0xEF), rutf8(0x80, 0xBF), rutf8(0x80, 0xBF)]),
518 ]);
519 }
520
521 #[test]
522 fn scratch() {
523 for range in Utf8Sequences::new('\u{0}', '\u{FFFF}') {
524 println!("{:?}", range);
525 }
526 }
527 }