]>
Commit | Line | Data |
---|---|---|
83c7162d | 1 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
923072b8 FG |
2 | // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
3 | // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your | |
83c7162d XL |
4 | // option. This file may not be copied, modified, or distributed |
5 | // except according to those terms. | |
6 | ||
7 | //! Marker types for formats. | |
8 | //! | |
9 | //! This module defines the types and traits used to mark a `Tendril` | |
10 | //! with the format of data it contains. It includes those formats | |
11 | //! for which `Tendril` supports at least some operations without | |
12 | //! conversion. | |
13 | //! | |
14 | //! To convert a string tendril to/from a byte tendril in an arbitrary | |
15 | //! character encoding, see the `encode` and `decode` methods on | |
16 | //! `Tendril`. | |
17 | //! | |
18 | //! `Tendril` operations may become memory-unsafe if data invalid for | |
19 | //! the format sneaks in. For that reason, these traits require | |
20 | //! `unsafe impl`. | |
21 | ||
83c7162d | 22 | use std::default::Default; |
923072b8 | 23 | use std::{char, mem, str}; |
83c7162d XL |
24 | |
25 | use futf::{self, Codepoint, Meaning}; | |
26 | ||
83c7162d XL |
/// Implementation details.
///
/// You don't need these unless you are implementing
/// a new format.
pub mod imp {
    use std::default::Default;
    use std::{iter, slice};

    /// Describes how to fix up encodings when concatenating.
    ///
    /// We can drop characters on either side of the splice,
    /// and insert up to 4 bytes in the middle.
    pub struct Fixup {
        /// Amount to drop from the end of the left-hand buffer.
        // NOTE(review): the WTF-8 fixup in this file stores 3 here for one
        // three-byte surrogate, so this looks like a byte count -- confirm
        // against the code that consumes `Fixup`.
        pub drop_left: u32,
        /// Amount to drop from the start of the right-hand buffer
        /// (same unit as `drop_left`).
        pub drop_right: u32,
        /// Number of leading bytes of `insert_bytes` to insert at the splice.
        pub insert_len: u32,
        /// Storage for the bytes to insert; only the first `insert_len`
        /// bytes are meaningful.
        pub insert_bytes: [u8; 4],
    }

    impl Default for Fixup {
        /// The "do nothing" fixup: drop nothing, insert nothing.
        #[inline(always)]
        fn default() -> Fixup {
            Fixup {
                drop_left: 0,
                drop_right: 0,
                insert_len: 0,
                insert_bytes: [0; 4],
            }
        }
    }

    /// Iterator over `(byte index, char)` pairs of a buffer in which every
    /// byte stands for the Unicode code point of the same numeric value.
    pub struct SingleByteCharIndices<'a> {
        inner: iter::Enumerate<slice::Iter<'a, u8>>,
    }

    impl<'a> Iterator for SingleByteCharIndices<'a> {
        type Item = (usize, char);

        #[inline]
        fn next(&mut self) -> Option<(usize, char)> {
            // Every `u8` maps to a valid scalar value (U+0000..=U+00FF), so
            // the conversion is total and needs no `unsafe`.
            self.inner.next().map(|(i, &b)| (i, char::from(b)))
        }

        #[inline]
        fn size_hint(&self) -> (usize, Option<usize>) {
            // One item per remaining byte; lets `collect` preallocate.
            self.inner.size_hint()
        }
    }

    impl<'a> SingleByteCharIndices<'a> {
        /// Create an iterator over the bytes of `buf`, starting at index 0.
        #[inline]
        pub fn new(buf: &'a [u8]) -> SingleByteCharIndices<'a> {
            SingleByteCharIndices {
                inner: buf.iter().enumerate(),
            }
        }
    }
}
87 | ||
88 | /// Trait for format marker types. | |
89 | /// | |
90 | /// The type implementing this trait is usually not instantiated. | |
91 | /// It's used with a phantom type parameter of `Tendril`. | |
92 | pub unsafe trait Format { | |
93 | /// Check whether the buffer is valid for this format. | |
94 | fn validate(buf: &[u8]) -> bool; | |
95 | ||
96 | /// Check whether the buffer is valid for this format. | |
97 | /// | |
98 | /// You may assume the buffer is a prefix of a valid buffer. | |
99 | #[inline] | |
100 | fn validate_prefix(buf: &[u8]) -> bool { | |
101 | <Self as Format>::validate(buf) | |
102 | } | |
103 | ||
104 | /// Check whether the buffer is valid for this format. | |
105 | /// | |
106 | /// You may assume the buffer is a suffix of a valid buffer. | |
107 | #[inline] | |
108 | fn validate_suffix(buf: &[u8]) -> bool { | |
109 | <Self as Format>::validate(buf) | |
110 | } | |
111 | ||
112 | /// Check whether the buffer is valid for this format. | |
113 | /// | |
114 | /// You may assume the buffer is a contiguous subsequence | |
115 | /// of a valid buffer, but not necessarily a prefix or | |
116 | /// a suffix. | |
117 | #[inline] | |
118 | fn validate_subseq(buf: &[u8]) -> bool { | |
119 | <Self as Format>::validate(buf) | |
120 | } | |
121 | ||
122 | /// Compute any fixup needed when concatenating buffers. | |
123 | /// | |
124 | /// The default is to do nothing. | |
125 | /// | |
126 | /// The function is `unsafe` because it may assume the input | |
127 | /// buffers are already valid for the format. Also, no | |
128 | /// bounds-checking is performed on the return value! | |
129 | #[inline(always)] | |
130 | unsafe fn fixup(_lhs: &[u8], _rhs: &[u8]) -> imp::Fixup { | |
131 | Default::default() | |
132 | } | |
133 | } | |
134 | ||
135 | /// Indicates that one format is a subset of another. | |
136 | /// | |
137 | /// The subset format can be converted to the superset format | |
138 | /// for free. | |
139 | pub unsafe trait SubsetOf<Super>: Format | |
923072b8 FG |
140 | where |
141 | Super: Format, | |
83c7162d XL |
142 | { |
143 | /// Validate the *other* direction of conversion; check if | |
144 | /// this buffer from the superset format conforms to the | |
145 | /// subset format. | |
146 | /// | |
147 | /// The default calls `Self::validate`, but some conversions | |
148 | /// may implement a check which is cheaper than validating | |
149 | /// from scratch. | |
150 | fn revalidate_subset(x: &[u8]) -> bool { | |
151 | Self::validate(x) | |
152 | } | |
153 | } | |
154 | ||
155 | /// Indicates a format which corresponds to a Rust slice type, | |
156 | /// representing exactly the same invariants. | |
157 | pub unsafe trait SliceFormat: Format + Sized { | |
158 | type Slice: ?Sized + Slice; | |
159 | } | |
160 | ||
161 | /// Indicates a format which contains characters from Unicode | |
162 | /// (all of it, or some proper subset). | |
163 | pub unsafe trait CharFormat<'a>: Format { | |
164 | /// Iterator for characters and their byte indices. | |
923072b8 | 165 | type Iter: Iterator<Item = (usize, char)>; |
83c7162d XL |
166 | |
167 | /// Iterate over the characters of the string and their byte | |
168 | /// indices. | |
169 | /// | |
170 | /// You may assume the buffer is *already validated* for `Format`. | |
171 | unsafe fn char_indices(buf: &'a [u8]) -> Self::Iter; | |
172 | ||
173 | /// Encode the character as bytes and pass them to a continuation. | |
174 | /// | |
175 | /// Returns `Err(())` iff the character cannot be represented. | |
176 | fn encode_char<F>(ch: char, cont: F) -> Result<(), ()> | |
923072b8 FG |
177 | where |
178 | F: FnOnce(&[u8]); | |
83c7162d XL |
179 | } |
180 | ||
/// Indicates a Rust slice type that is represented in memory as bytes.
pub unsafe trait Slice {
    /// Access the raw bytes of the slice.
    fn as_bytes(&self) -> &[u8];

    /// Convert a byte slice to this kind of slice.
    ///
    /// # Safety
    ///
    /// You may assume the buffer is *already validated*
    /// for `Format`; callers must guarantee that.
    unsafe fn from_bytes(x: &[u8]) -> &Self;

    /// Convert a mutable byte slice to this kind of slice.
    ///
    /// # Safety
    ///
    /// You may assume the buffer is *already validated*
    /// for `Format`; callers must guarantee that.
    unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut Self;
}
198 | ||
/// Marker type for uninterpreted bytes.
///
/// Validation will never fail for this format.
#[derive(Copy, Clone, Default, Debug)]
pub struct Bytes;
204 | ||
205 | unsafe impl Format for Bytes { | |
206 | #[inline(always)] | |
207 | fn validate(_: &[u8]) -> bool { | |
208 | true | |
209 | } | |
210 | } | |
211 | ||
212 | unsafe impl SliceFormat for Bytes { | |
213 | type Slice = [u8]; | |
214 | } | |
215 | ||
216 | unsafe impl Slice for [u8] { | |
217 | #[inline(always)] | |
218 | fn as_bytes(&self) -> &[u8] { | |
219 | self | |
220 | } | |
221 | ||
222 | #[inline(always)] | |
223 | unsafe fn from_bytes(x: &[u8]) -> &[u8] { | |
224 | x | |
225 | } | |
226 | ||
227 | #[inline(always)] | |
228 | unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut [u8] { | |
229 | x | |
230 | } | |
231 | } | |
232 | ||
/// Marker type for ASCII text.
#[derive(Copy, Clone, Default, Debug)]
pub struct ASCII;
236 | ||
237 | unsafe impl Format for ASCII { | |
238 | #[inline] | |
239 | fn validate(buf: &[u8]) -> bool { | |
240 | buf.iter().all(|&n| n <= 127) | |
241 | } | |
242 | ||
243 | #[inline(always)] | |
244 | fn validate_prefix(_: &[u8]) -> bool { | |
245 | true | |
246 | } | |
247 | ||
248 | #[inline(always)] | |
249 | fn validate_suffix(_: &[u8]) -> bool { | |
250 | true | |
251 | } | |
252 | ||
253 | #[inline(always)] | |
254 | fn validate_subseq(_: &[u8]) -> bool { | |
255 | true | |
256 | } | |
257 | } | |
258 | ||
923072b8 FG |
259 | unsafe impl SubsetOf<UTF8> for ASCII {} |
260 | unsafe impl SubsetOf<Latin1> for ASCII {} | |
83c7162d XL |
261 | |
262 | unsafe impl<'a> CharFormat<'a> for ASCII { | |
263 | type Iter = imp::SingleByteCharIndices<'a>; | |
264 | ||
265 | #[inline] | |
266 | unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> { | |
267 | imp::SingleByteCharIndices::new(buf) | |
268 | } | |
269 | ||
270 | #[inline] | |
271 | fn encode_char<F>(ch: char, cont: F) -> Result<(), ()> | |
923072b8 FG |
272 | where |
273 | F: FnOnce(&[u8]), | |
83c7162d XL |
274 | { |
275 | let n = ch as u32; | |
923072b8 FG |
276 | if n > 0x7F { |
277 | return Err(()); | |
278 | } | |
83c7162d XL |
279 | cont(&[n as u8]); |
280 | Ok(()) | |
281 | } | |
282 | } | |
283 | ||
/// Marker type for UTF-8 text.
#[derive(Copy, Clone, Default, Debug)]
pub struct UTF8;
287 | ||
288 | unsafe impl Format for UTF8 { | |
289 | #[inline] | |
290 | fn validate(buf: &[u8]) -> bool { | |
291 | str::from_utf8(buf).is_ok() | |
292 | } | |
293 | ||
294 | #[inline] | |
295 | fn validate_prefix(buf: &[u8]) -> bool { | |
296 | if buf.len() == 0 { | |
297 | return true; | |
298 | } | |
299 | match futf::classify(buf, buf.len() - 1) { | |
923072b8 FG |
300 | Some(Codepoint { |
301 | meaning: Meaning::Whole(_), | |
302 | .. | |
303 | }) => true, | |
83c7162d XL |
304 | _ => false, |
305 | } | |
306 | } | |
307 | ||
308 | #[inline] | |
309 | fn validate_suffix(buf: &[u8]) -> bool { | |
310 | if buf.len() == 0 { | |
311 | return true; | |
312 | } | |
313 | match futf::classify(buf, 0) { | |
923072b8 FG |
314 | Some(Codepoint { |
315 | meaning: Meaning::Whole(_), | |
316 | .. | |
317 | }) => true, | |
83c7162d XL |
318 | _ => false, |
319 | } | |
320 | } | |
321 | ||
322 | #[inline] | |
323 | fn validate_subseq(buf: &[u8]) -> bool { | |
923072b8 | 324 | <Self as Format>::validate_prefix(buf) && <Self as Format>::validate_suffix(buf) |
83c7162d XL |
325 | } |
326 | } | |
327 | ||
923072b8 | 328 | unsafe impl SubsetOf<WTF8> for UTF8 {} |
83c7162d XL |
329 | |
330 | unsafe impl SliceFormat for UTF8 { | |
331 | type Slice = str; | |
332 | } | |
333 | ||
334 | unsafe impl Slice for str { | |
335 | #[inline(always)] | |
336 | fn as_bytes(&self) -> &[u8] { | |
337 | str::as_bytes(self) | |
338 | } | |
339 | ||
340 | #[inline(always)] | |
341 | unsafe fn from_bytes(x: &[u8]) -> &str { | |
342 | str::from_utf8_unchecked(x) | |
343 | } | |
344 | ||
345 | #[inline(always)] | |
346 | unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut str { | |
347 | mem::transmute(x) | |
348 | } | |
349 | } | |
350 | ||
351 | unsafe impl<'a> CharFormat<'a> for UTF8 { | |
352 | type Iter = str::CharIndices<'a>; | |
353 | ||
354 | #[inline] | |
355 | unsafe fn char_indices(buf: &'a [u8]) -> str::CharIndices<'a> { | |
356 | str::from_utf8_unchecked(buf).char_indices() | |
357 | } | |
358 | ||
359 | #[inline] | |
360 | fn encode_char<F>(ch: char, cont: F) -> Result<(), ()> | |
923072b8 FG |
361 | where |
362 | F: FnOnce(&[u8]), | |
83c7162d | 363 | { |
923072b8 FG |
364 | cont(ch.encode_utf8(&mut [0_u8; 4]).as_bytes()); |
365 | Ok(()) | |
83c7162d XL |
366 | } |
367 | } | |
368 | ||
/// Marker type for WTF-8 text.
///
/// See the [WTF-8 spec](https://simonsapin.github.io/wtf-8/).
#[derive(Copy, Clone, Default, Debug)]
pub struct WTF8;
374 | ||
375 | #[inline] | |
376 | fn wtf8_meaningful(m: Meaning) -> bool { | |
377 | match m { | |
923072b8 | 378 | Meaning::Whole(_) | Meaning::LeadSurrogate(_) | Meaning::TrailSurrogate(_) => true, |
83c7162d XL |
379 | _ => false, |
380 | } | |
381 | } | |
382 | ||
383 | unsafe impl Format for WTF8 { | |
384 | #[inline] | |
385 | fn validate(buf: &[u8]) -> bool { | |
386 | let mut i = 0; | |
387 | let mut prev_lead = false; | |
388 | while i < buf.len() { | |
389 | let codept = unwrap_or_return!(futf::classify(buf, i), false); | |
390 | if !wtf8_meaningful(codept.meaning) { | |
391 | return false; | |
392 | } | |
393 | i += codept.bytes.len(); | |
394 | prev_lead = match codept.meaning { | |
395 | Meaning::TrailSurrogate(_) if prev_lead => return false, | |
396 | Meaning::LeadSurrogate(_) => true, | |
397 | _ => false, | |
398 | }; | |
399 | } | |
400 | ||
401 | true | |
402 | } | |
403 | ||
404 | #[inline] | |
405 | fn validate_prefix(buf: &[u8]) -> bool { | |
406 | if buf.len() == 0 { | |
407 | return true; | |
408 | } | |
409 | match futf::classify(buf, buf.len() - 1) { | |
410 | Some(c) => wtf8_meaningful(c.meaning), | |
411 | _ => false, | |
412 | } | |
413 | } | |
414 | ||
415 | #[inline] | |
416 | fn validate_suffix(buf: &[u8]) -> bool { | |
417 | if buf.len() == 0 { | |
418 | return true; | |
419 | } | |
420 | match futf::classify(buf, 0) { | |
421 | Some(c) => wtf8_meaningful(c.meaning), | |
422 | _ => false, | |
423 | } | |
424 | } | |
425 | ||
426 | #[inline] | |
427 | fn validate_subseq(buf: &[u8]) -> bool { | |
923072b8 | 428 | <Self as Format>::validate_prefix(buf) && <Self as Format>::validate_suffix(buf) |
83c7162d XL |
429 | } |
430 | ||
431 | #[inline] | |
432 | unsafe fn fixup(lhs: &[u8], rhs: &[u8]) -> imp::Fixup { | |
433 | const ERR: &'static str = "WTF8: internal error"; | |
434 | ||
435 | if lhs.len() >= 3 && rhs.len() >= 3 { | |
923072b8 FG |
436 | if let ( |
437 | Some(Codepoint { | |
438 | meaning: Meaning::LeadSurrogate(hi), | |
439 | .. | |
440 | }), | |
441 | Some(Codepoint { | |
442 | meaning: Meaning::TrailSurrogate(lo), | |
443 | .. | |
444 | }), | |
445 | ) = (futf::classify(lhs, lhs.len() - 1), futf::classify(rhs, 0)) | |
83c7162d XL |
446 | { |
447 | let mut fixup = imp::Fixup { | |
448 | drop_left: 3, | |
449 | drop_right: 3, | |
450 | insert_len: 0, | |
923072b8 | 451 | insert_bytes: [0_u8; 4], |
83c7162d XL |
452 | }; |
453 | ||
454 | let n = 0x10000 + ((hi as u32) << 10) + (lo as u32); | |
455 | ||
923072b8 FG |
456 | let ch = char::from_u32(n).expect(ERR); |
457 | fixup.insert_len = ch.encode_utf8(&mut fixup.insert_bytes).len() as u32; | |
83c7162d XL |
458 | |
459 | return fixup; | |
460 | } | |
461 | } | |
462 | ||
463 | Default::default() | |
464 | } | |
465 | } | |
466 | ||
/// Marker type for the single-byte encoding of the first 256 Unicode codepoints.
///
/// This is IANA's "ISO-8859-1". It's ISO's "ISO 8859-1" with the addition of the
/// C0 and C1 control characters from ECMA-48 / ISO 6429.
///
/// Not to be confused with WHATWG's "latin1" or "iso8859-1" labels (or the
/// many other aliases), which actually stand for Windows-1252.
#[derive(Copy, Clone, Default, Debug)]
pub struct Latin1;
476 | ||
477 | unsafe impl Format for Latin1 { | |
478 | #[inline(always)] | |
479 | fn validate(_: &[u8]) -> bool { | |
480 | true | |
481 | } | |
482 | ||
483 | #[inline(always)] | |
484 | fn validate_prefix(_: &[u8]) -> bool { | |
485 | true | |
486 | } | |
487 | ||
488 | #[inline(always)] | |
489 | fn validate_suffix(_: &[u8]) -> bool { | |
490 | true | |
491 | } | |
492 | ||
493 | #[inline(always)] | |
494 | fn validate_subseq(_: &[u8]) -> bool { | |
495 | true | |
496 | } | |
497 | } | |
498 | ||
499 | unsafe impl<'a> CharFormat<'a> for Latin1 { | |
500 | type Iter = imp::SingleByteCharIndices<'a>; | |
501 | ||
502 | #[inline] | |
503 | unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> { | |
504 | imp::SingleByteCharIndices::new(buf) | |
505 | } | |
506 | ||
507 | #[inline] | |
508 | fn encode_char<F>(ch: char, cont: F) -> Result<(), ()> | |
923072b8 FG |
509 | where |
510 | F: FnOnce(&[u8]), | |
83c7162d XL |
511 | { |
512 | let n = ch as u32; | |
923072b8 FG |
513 | if n > 0xFF { |
514 | return Err(()); | |
515 | } | |
83c7162d XL |
516 | cont(&[n as u8]); |
517 | Ok(()) | |
518 | } | |
519 | } |