1 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
2 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
3 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
4 // option. This file may not be copied, modified, or distributed
5 // except according to those terms.
7 //! Marker types for formats.
9 //! This module defines the types and traits used to mark a `Tendril`
10 //! with the format of data it contains. It includes those formats
11 //! for which `Tendril` supports at least some operations without
14 //! To convert a string tendril to/from a byte tendril in an arbitrary
15 //! character encoding, see the `encode` and `decode` methods on
18 //! `Tendril` operations may become memory-unsafe if data invalid for
19 //! the format sneaks in. For that reason, these traits require
22 use std
::default::Default
;
23 use std
::{char, mem, str}
;
25 use futf
::{self, Codepoint, Meaning}
;
27 /// Implementation details.
29 /// You don't need these unless you are implementing
32 use std
::default::Default
;
33 use std
::{iter, mem, slice}
;
35 /// Describes how to fix up encodings when concatenating.
37 /// We can drop characters on either side of the splice,
38 /// and insert up to 4 bytes in the middle.
43 pub insert_bytes
: [u8; 4],
46 impl Default
for Fixup
{
48 fn default() -> Fixup
{
59 unsafe fn from_u32_unchecked(n
: u32) -> char {
63 pub struct SingleByteCharIndices
<'a
> {
64 inner
: iter
::Enumerate
<slice
::Iter
<'a
, u8>>,
67 impl<'a
> Iterator
for SingleByteCharIndices
<'a
> {
68 type Item
= (usize, char);
71 fn next(&mut self) -> Option
<(usize, char)> {
74 .map(|(i
, &b
)| unsafe { (i, from_u32_unchecked(b as u32)) }
)
78 impl<'a
> SingleByteCharIndices
<'a
> {
80 pub fn new(buf
: &'a
[u8]) -> SingleByteCharIndices
<'a
> {
81 SingleByteCharIndices
{
82 inner
: buf
.iter().enumerate(),
88 /// Trait for format marker types.
90 /// The type implementing this trait is usually not instantiated.
91 /// It's used with a phantom type parameter of `Tendril`.
92 pub unsafe trait Format
{
93 /// Check whether the buffer is valid for this format.
///
/// Receives the raw bytes in `buf` and returns `true` when they are
/// acceptable for the format, `false` otherwise. This method has no
/// default body, so every implementor must supply it; the default
/// `validate_prefix`, `validate_suffix` and `validate_subseq` methods
/// simply forward to it.
94 fn validate(buf
: &[u8]) -> bool
;
96 /// Check whether the buffer is valid for this format.
98 /// You may assume the buffer is a prefix of a valid buffer.
100 fn validate_prefix(buf
: &[u8]) -> bool
{
101 <Self as Format
>::validate(buf
)
104 /// Check whether the buffer is valid for this format.
106 /// You may assume the buffer is a suffix of a valid buffer.
108 fn validate_suffix(buf
: &[u8]) -> bool
{
109 <Self as Format
>::validate(buf
)
112 /// Check whether the buffer is valid for this format.
114 /// You may assume the buffer is a contiguous subsequence
115 /// of a valid buffer, but not necessarily a prefix or
118 fn validate_subseq(buf
: &[u8]) -> bool
{
119 <Self as Format
>::validate(buf
)
122 /// Compute any fixup needed when concatenating buffers.
124 /// The default is to do nothing.
126 /// The function is `unsafe` because it may assume the input
127 /// buffers are already valid for the format. Also, no
128 /// bounds-checking is performed on the return value!
130 unsafe fn fixup(_lhs
: &[u8], _rhs
: &[u8]) -> imp
::Fixup
{
135 /// Indicates that one format is a subset of another.
137 /// The subset format can be converted to the superset format
139 pub unsafe trait SubsetOf
<Super
>: Format
143 /// Validate the *other* direction of conversion; check if
144 /// this buffer from the superset format conforms to the
147 /// The default calls `Self::validate`, but some conversions
148 /// may implement a check which is cheaper than validating
150 fn revalidate_subset(x
: &[u8]) -> bool
{
155 /// Indicates a format which corresponds to a Rust slice type,
156 /// representing exactly the same invariants.
157 pub unsafe trait SliceFormat
: Format
+ Sized
{
158 type Slice
: ?Sized
+ Slice
;
161 /// Indicates a format which contains characters from Unicode
162 /// (all of it, or some proper subset).
163 pub unsafe trait CharFormat
<'a
>: Format
{
164 /// Iterator for characters and their byte indices.
165 type Iter
: Iterator
<Item
= (usize, char)>;
167 /// Iterate over the characters of the string and their byte
170 /// You may assume the buffer is *already validated* for `Format`.
///
/// Returns `Self::Iter`, an iterator whose items are `(usize, char)`
/// pairs. `buf` is borrowed for the trait's lifetime parameter `'a`.
///
/// # Safety
///
/// Implementations are permitted to skip validation, so the caller must
/// only pass a buffer that is already valid for this format.
171 unsafe fn char_indices(buf
: &'a
[u8]) -> Self::Iter
;
173 /// Encode the character as bytes and pass them to a continuation.
175 /// Returns `Err(())` iff the character cannot be represented.
176 fn encode_char
<F
>(ch
: char, cont
: F
) -> Result
<(), ()>
181 /// Indicates a Rust slice type that is represented in memory as bytes.
182 pub unsafe trait Slice
{
183 /// Access the raw bytes of the slice.
///
/// Borrows `self` and returns its contents as a plain `&[u8]`.
184 fn as_bytes(&self) -> &[u8];
186 /// Convert a byte slice to this kind of slice.
188 /// You may assume the buffer is *already validated*
///
/// # Safety
///
/// Implementations may reinterpret `x` without checking it, so the
/// caller must guarantee the bytes have already been validated.
190 unsafe fn from_bytes(x
: &[u8]) -> &Self;
192 /// Convert a mutable byte slice to a mutable slice of this kind.
194 /// You may assume the buffer is *already validated*
///
/// # Safety
///
/// Implementations may reinterpret `x` without checking it, so the
/// caller must guarantee the bytes have already been validated.
196 unsafe fn from_mut_bytes(x
: &mut [u8]) -> &mut Self;
199 /// Marker type for uninterpreted bytes.
201 /// Validation will never fail for this format.
202 #[derive(Copy, Clone, Default, Debug)]
205 unsafe impl Format
for Bytes
{
207 fn validate(_
: &[u8]) -> bool
{
212 unsafe impl SliceFormat
for Bytes
{
216 unsafe impl Slice
for [u8] {
218 fn as_bytes(&self) -> &[u8] {
223 unsafe fn from_bytes(x
: &[u8]) -> &[u8] {
228 unsafe fn from_mut_bytes(x
: &mut [u8]) -> &mut [u8] {
233 /// Marker type for ASCII text.
234 #[derive(Copy, Clone, Default, Debug)]
237 unsafe impl Format
for ASCII
{
239 fn validate(buf
: &[u8]) -> bool
{
240 buf
.iter().all(|&n
| n
<= 127)
244 fn validate_prefix(_
: &[u8]) -> bool
{
249 fn validate_suffix(_
: &[u8]) -> bool
{
254 fn validate_subseq(_
: &[u8]) -> bool
{
// SAFETY: a buffer accepted by `ASCII` contains only bytes <= 127, and
// every such byte sequence is also well-formed UTF-8. The empty impl body
// keeps the trait's default `revalidate_subset`.
259 unsafe impl SubsetOf
<UTF8
> for ASCII {}
// SAFETY: a buffer accepted by `ASCII` contains only bytes <= 127, which
// lie within the 256 single-byte codepoints that `Latin1` represents. The
// empty impl body keeps the trait's default `revalidate_subset`.
260 unsafe impl SubsetOf
<Latin1
> for ASCII {}
262 unsafe impl<'a
> CharFormat
<'a
> for ASCII
{
263 type Iter
= imp
::SingleByteCharIndices
<'a
>;
266 unsafe fn char_indices(buf
: &'a
[u8]) -> imp
::SingleByteCharIndices
<'a
> {
267 imp
::SingleByteCharIndices
::new(buf
)
271 fn encode_char
<F
>(ch
: char, cont
: F
) -> Result
<(), ()>
284 /// Marker type for UTF-8 text.
285 #[derive(Copy, Clone, Default, Debug)]
288 unsafe impl Format
for UTF8
{
290 fn validate(buf
: &[u8]) -> bool
{
291 str::from_utf8(buf
).is_ok()
295 fn validate_prefix(buf
: &[u8]) -> bool
{
299 match futf
::classify(buf
, buf
.len() - 1) {
301 meaning
: Meaning
::Whole(_
),
309 fn validate_suffix(buf
: &[u8]) -> bool
{
313 match futf
::classify(buf
, 0) {
315 meaning
: Meaning
::Whole(_
),
323 fn validate_subseq(buf
: &[u8]) -> bool
{
324 <Self as Format
>::validate_prefix(buf
) && <Self as Format
>::validate_suffix(buf
)
// SAFETY: WTF-8 accepts every whole codepoint that UTF-8 does and
// additionally admits lead/trail surrogates, so any valid UTF-8 buffer is
// also valid WTF-8. The empty impl body keeps the trait's default
// `revalidate_subset`.
328 unsafe impl SubsetOf
<WTF8
> for UTF8 {}
330 unsafe impl SliceFormat
for UTF8
{
334 unsafe impl Slice
for str {
336 fn as_bytes(&self) -> &[u8] {
341 unsafe fn from_bytes(x
: &[u8]) -> &str {
342 str::from_utf8_unchecked(x
)
346 unsafe fn from_mut_bytes(x
: &mut [u8]) -> &mut str {
351 unsafe impl<'a
> CharFormat
<'a
> for UTF8
{
352 type Iter
= str::CharIndices
<'a
>;
355 unsafe fn char_indices(buf
: &'a
[u8]) -> str::CharIndices
<'a
> {
356 str::from_utf8_unchecked(buf
).char_indices()
360 fn encode_char
<F
>(ch
: char, cont
: F
) -> Result
<(), ()>
364 cont(ch
.encode_utf8(&mut [0_u8; 4]).as_bytes());
369 /// Marker type for WTF-8 text.
371 /// See the [WTF-8 spec](https://simonsapin.github.io/wtf-8/).
372 #[derive(Copy, Clone, Default, Debug)]
376 fn wtf8_meaningful(m
: Meaning
) -> bool
{
378 Meaning
::Whole(_
) | Meaning
::LeadSurrogate(_
) | Meaning
::TrailSurrogate(_
) => true,
383 unsafe impl Format
for WTF8
{
385 fn validate(buf
: &[u8]) -> bool
{
387 let mut prev_lead
= false;
388 while i
< buf
.len() {
389 let codept
= unwrap_or_return
!(futf
::classify(buf
, i
), false);
390 if !wtf8_meaningful(codept
.meaning
) {
393 i
+= codept
.bytes
.len();
394 prev_lead
= match codept
.meaning
{
395 Meaning
::TrailSurrogate(_
) if prev_lead
=> return false,
396 Meaning
::LeadSurrogate(_
) => true,
405 fn validate_prefix(buf
: &[u8]) -> bool
{
409 match futf
::classify(buf
, buf
.len() - 1) {
410 Some(c
) => wtf8_meaningful(c
.meaning
),
416 fn validate_suffix(buf
: &[u8]) -> bool
{
420 match futf
::classify(buf
, 0) {
421 Some(c
) => wtf8_meaningful(c
.meaning
),
427 fn validate_subseq(buf
: &[u8]) -> bool
{
428 <Self as Format
>::validate_prefix(buf
) && <Self as Format
>::validate_suffix(buf
)
432 unsafe fn fixup(lhs
: &[u8], rhs
: &[u8]) -> imp
::Fixup
{
433 const ERR
: &'
static str = "WTF8: internal error";
435 if lhs
.len() >= 3 && rhs
.len() >= 3 {
438 meaning
: Meaning
::LeadSurrogate(hi
),
442 meaning
: Meaning
::TrailSurrogate(lo
),
445 ) = (futf
::classify(lhs
, lhs
.len() - 1), futf
::classify(rhs
, 0))
447 let mut fixup
= imp
::Fixup
{
451 insert_bytes
: [0_u8; 4],
454 let n
= 0x10000 + ((hi
as u32) << 10) + (lo
as u32);
456 let ch
= char::from_u32(n
).expect(ERR
);
457 fixup
.insert_len
= ch
.encode_utf8(&mut fixup
.insert_bytes
).len() as u32;
467 /// Marker type for the single-byte encoding of the first 256 Unicode codepoints.
469 /// This is IANA's "ISO-8859-1". It's ISO's "ISO 8859-1" with the addition of the
470 /// C0 and C1 control characters from ECMA-48 / ISO 6429.
472 /// Not to be confused with WHATWG's "latin1" or "iso8859-1" labels (or the
473 /// many other aliases), which actually stand for Windows-1252.
474 #[derive(Copy, Clone, Default, Debug)]
477 unsafe impl Format
for Latin1
{
479 fn validate(_
: &[u8]) -> bool
{
484 fn validate_prefix(_
: &[u8]) -> bool
{
489 fn validate_suffix(_
: &[u8]) -> bool
{
494 fn validate_subseq(_
: &[u8]) -> bool
{
499 unsafe impl<'a
> CharFormat
<'a
> for Latin1
{
500 type Iter
= imp
::SingleByteCharIndices
<'a
>;
503 unsafe fn char_indices(buf
: &'a
[u8]) -> imp
::SingleByteCharIndices
<'a
> {
504 imp
::SingleByteCharIndices
::new(buf
)
508 fn encode_char
<F
>(ch
: char, cont
: F
) -> Result
<(), ()>