1 // This file is part of ICU4X. For terms of use, please see the file
2 // called LICENSE at the top level of the ICU4X source tree
3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
8 use alloc
::string
::String
;
10 use core
::cmp
::Ordering
;
11 use core
::convert
::TryFrom
;
12 use core
::marker
::PhantomData
;
15 // Also used by owned.rs
16 pub(super) const LENGTH_WIDTH
: usize = 4;
17 pub(super) const METADATA_WIDTH
: usize = 0;
18 pub(super) const MAX_LENGTH
: usize = u32::MAX
as usize;
19 pub(super) const MAX_INDEX
: usize = u32::MAX
as usize;
21 /// This trait allows switching between different possible internal
22 /// representations of VarZeroVec.
24 /// Currently this crate supports two formats: [`Index16`] and [`Index32`],
25 /// with [`Index16`] being the default for all [`VarZeroVec`](super::VarZeroVec)
26 /// types unless explicitly specified otherwise.
28 /// Do not implement this trait, its internals may be changed in the future,
29 /// and all of its associated items are hidden from the docs.
30 #[allow(clippy::missing_safety_doc)] // no safety section for you, don't implement this trait period
31 pub unsafe trait VarZeroVecFormat
: '
static + Sized
{
33 const INDEX_WIDTH
: usize;
36 /// This is always `RawBytesULE<Self::INDEX_WIDTH>` however
37 /// Rust does not currently support using associated constants in const
42 // various conversions because RawBytes is an associated constant now
44 fn rawbytes_to_usize(raw
: Self::RawBytes
) -> usize;
46 fn usize_to_rawbytes(u
: usize) -> Self::RawBytes
;
49 fn rawbytes_from_byte_slice_unchecked_mut(bytes
: &mut [u8]) -> &mut [Self::RawBytes
];
52 /// This is a [`VarZeroVecFormat`] that stores u16s in the index array.
53 /// Will have a smaller data size, but it's more likely for larger arrays
54 /// to be unrepresentable (and error on construction)
56 /// This is the default index size used by all [`VarZeroVec`](super::VarZeroVec) types.
57 #[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
58 #[allow(clippy::exhaustive_structs)] // marker
61 /// This is a [`VarZeroVecFormat`] that stores u32s in the index array.
62 /// Will have a larger data size, but will support large arrays without
64 #[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
65 #[allow(clippy::exhaustive_structs)] // marker
68 unsafe impl VarZeroVecFormat
for Index16
{
69 const INDEX_WIDTH
: usize = 2;
70 const MAX_VALUE
: u32 = u16::MAX
as u32;
71 type RawBytes
= RawBytesULE
<2>;
73 fn rawbytes_to_usize(raw
: Self::RawBytes
) -> usize {
74 raw
.as_unsigned_int() as usize
77 fn usize_to_rawbytes(u
: usize) -> Self::RawBytes
{
78 (u
as u16).to_unaligned()
81 fn rawbytes_from_byte_slice_unchecked_mut(bytes
: &mut [u8]) -> &mut [Self::RawBytes
] {
82 Self::RawBytes
::from_byte_slice_unchecked_mut(bytes
)
86 unsafe impl VarZeroVecFormat
for Index32
{
87 const INDEX_WIDTH
: usize = 4;
88 const MAX_VALUE
: u32 = u32::MAX
;
89 type RawBytes
= RawBytesULE
<4>;
91 fn rawbytes_to_usize(raw
: Self::RawBytes
) -> usize {
92 raw
.as_unsigned_int() as usize
95 fn usize_to_rawbytes(u
: usize) -> Self::RawBytes
{
96 (u
as u32).to_unaligned()
99 fn rawbytes_from_byte_slice_unchecked_mut(bytes
: &mut [u8]) -> &mut [Self::RawBytes
] {
100 Self::RawBytes
::from_byte_slice_unchecked_mut(bytes
)
104 /// A more parsed version of `VarZeroSlice`. This type is where most of the VarZeroVec
105 /// internal representation code lies.
107 /// This is *basically* an `&'a [u8]` to a zero copy buffer, but split out into
108 /// the buffer components. Logically this is capable of behaving as
109 /// a `&'a [T::VarULE]`, but since `T::VarULE` is unsized that type does not actually
112 /// See [`VarZeroVecComponents::parse_byte_slice()`] for information on the internal invariants involved
114 pub struct VarZeroVecComponents
<'a
, T
: ?Sized
, F
> {
115 /// The number of elements
117 /// The list of indices into the `things` slice
119 /// The contiguous list of `T::VarULE`s
121 /// The original slice this was constructed from
122 entire_slice
: &'a
[u8],
123 marker
: PhantomData
<(&'a T
, F
)>,
126 // #[derive()] won't work here since we do not want it to be
128 impl<'a
, T
: ?Sized
, F
> Copy
for VarZeroVecComponents
<'a
, T
, F
> {}
129 impl<'a
, T
: ?Sized
, F
> Clone
for VarZeroVecComponents
<'a
, T
, F
> {
130 fn clone(&self) -> Self {
135 impl<'a
, T
: VarULE
+ ?Sized
, F
> Default
for VarZeroVecComponents
<'a
, T
, F
> {
137 fn default() -> Self {
142 impl<'a
, T
: VarULE
+ ?Sized
, F
> VarZeroVecComponents
<'a
, T
, F
> {
144 pub fn new() -> Self {
154 impl<'a
, T
: VarULE
+ ?Sized
, F
: VarZeroVecFormat
> VarZeroVecComponents
<'a
, T
, F
> {
155 /// Construct a new VarZeroVecComponents, checking invariants about the overall buffer size:
157 /// - There must be either zero or at least four bytes (if four, this is the "length" parsed as a usize)
158 /// - There must be at least `4*length + 4` bytes total, to form the array `indices` of indices
159 /// - `indices[i]..indices[i+1]` must index into a valid section of
160 /// `things`, such that it parses to a `T::VarULE`
161 /// - `indices[len - 1]..things.len()` must index into a valid section of
162 /// `things`, such that it parses to a `T::VarULE`
164 pub fn parse_byte_slice(slice
: &'a
[u8]) -> Result
<Self, ZeroVecError
> {
165 // The empty VZV is special-cased to the empty slice
166 if slice
.is_empty() {
167 return Ok(VarZeroVecComponents
{
175 let len_bytes
= slice
176 .get(0..LENGTH_WIDTH
)
177 .ok_or(ZeroVecError
::VarZeroVecFormatError
)?
;
178 let len_ule
= RawBytesULE
::<LENGTH_WIDTH
>::parse_byte_slice(len_bytes
)
179 .map_err(|_
| ZeroVecError
::VarZeroVecFormatError
)?
;
183 .ok_or(ZeroVecError
::VarZeroVecFormatError
)?
185 let indices_bytes
= slice
187 LENGTH_WIDTH
+ METADATA_WIDTH
188 ..LENGTH_WIDTH
+ METADATA_WIDTH
+ F
::INDEX_WIDTH
* (len
as usize),
190 .ok_or(ZeroVecError
::VarZeroVecFormatError
)?
;
192 .get(F
::INDEX_WIDTH
* (len
as usize) + LENGTH_WIDTH
+ METADATA_WIDTH
..)
193 .ok_or(ZeroVecError
::VarZeroVecFormatError
)?
;
195 let borrowed
= VarZeroVecComponents
{
197 indices
: indices_bytes
,
203 borrowed
.check_indices_and_things()?
;
208 /// Construct a [`VarZeroVecComponents`] from a byte slice that has previously
209 /// successfully returned a [`VarZeroVecComponents`] when passed to
210 /// [`VarZeroVecComponents::parse_byte_slice()`]. Will return the same
211 /// object as one would get from calling [`VarZeroVecComponents::parse_byte_slice()`].
214 /// The bytes must have previously successfully run through
215 /// [`VarZeroVecComponents::parse_byte_slice()`]
216 pub unsafe fn from_bytes_unchecked(slice
: &'a
[u8]) -> Self {
217 // The empty VZV is special-cased to the empty slice
218 if slice
.is_empty() {
219 return VarZeroVecComponents
{
227 let len_bytes
= slice
.get_unchecked(0..LENGTH_WIDTH
);
228 let len_ule
= RawBytesULE
::<LENGTH_WIDTH
>::from_byte_slice_unchecked(len_bytes
);
230 let len
= len_ule
.get_unchecked(0).as_unsigned_int();
231 let indices_bytes
= slice
.get_unchecked(
232 LENGTH_WIDTH
+ METADATA_WIDTH
233 ..LENGTH_WIDTH
+ METADATA_WIDTH
+ F
::INDEX_WIDTH
* (len
as usize),
236 slice
.get_unchecked(LENGTH_WIDTH
+ METADATA_WIDTH
+ F
::INDEX_WIDTH
* (len
as usize)..);
238 VarZeroVecComponents
{
240 indices
: indices_bytes
,
247 /// Get the number of elements in this vector
249 pub fn len(self) -> usize {
253 /// Returns `true` if the vector contains no elements.
255 pub fn is_empty(self) -> bool
{
256 self.indices
.is_empty()
259 /// Get the idx'th element out of this slice. Returns `None` if out of bounds.
261 pub fn get(self, idx
: usize) -> Option
<&'a T
> {
262 if idx
>= self.len() {
265 Some(unsafe { self.get_unchecked(idx) }
)
268 /// Get the idx'th element out of this slice. Does not bounds check.
271 /// - `idx` must be in bounds (`idx < self.len()`)
273 pub(crate) unsafe fn get_unchecked(self, idx
: usize) -> &'a T
{
274 let range
= self.get_things_range(idx
);
275 let things_slice
= self.things
.get_unchecked(range
);
276 T
::from_byte_slice_unchecked(things_slice
)
279 /// Get the range in `things` for the element at `idx`. Does not bounds check.
282 /// - `idx` must be in bounds (`idx < self.len()`)
284 unsafe fn get_things_range(self, idx
: usize) -> Range
<usize> {
285 let start
= F
::rawbytes_to_usize(*self.indices_slice().get_unchecked(idx
));
286 let end
= if idx
+ 1 == self.len() {
289 F
::rawbytes_to_usize(*self.indices_slice().get_unchecked(idx
+ 1))
291 debug_assert
!(start
<= end
);
295 /// Get the range in `entire_slice` for the element at `idx`. Does not bounds check.
298 /// - `idx` must be in bounds (`idx < self.len()`)
300 pub(crate) unsafe fn get_range(self, idx
: usize) -> Range
<usize> {
301 let range
= self.get_things_range(idx
);
302 let offset
= (self.things
as *const [u8] as *const u8)
303 .offset_from(self.entire_slice
as *const [u8] as *const u8)
305 range
.start
+ offset
..range
.end
+ offset
308 /// Check the internal invariants of VarZeroVecComponents:
310 /// - `indices[i]..indices[i+1]` must index into a valid section of
311 /// `things`, such that it parses to a `T::VarULE`
312 /// - `indices[len - 1]..things.len()` must index into a valid section of
313 /// `things`, such that it parses to a `T::VarULE`
314 /// - `indices` is monotonically increasing
316 /// This method is NOT allowed to call any other methods on VarZeroVecComponents since all other methods
317 /// assume that the slice has been passed through check_indices_and_things
319 #[allow(clippy::len_zero)] // more explicit to enforce safety invariants
320 fn check_indices_and_things(self) -> Result
<(), ZeroVecError
> {
321 assert_eq
!(self.len(), self.indices_slice().len());
323 if self.things
.len() > 0 {
324 return Err(ZeroVecError
::VarZeroVecFormatError
);
329 // Safety: i is in bounds (assertion above)
330 let mut start
= F
::rawbytes_to_usize(unsafe { *self.indices_slice().get_unchecked(0) }
);
332 return Err(ZeroVecError
::VarZeroVecFormatError
);
334 for i
in 0..self.len() {
335 let end
= if i
== self.len() - 1 {
338 // Safety: i+1 is in bounds (assertion above)
339 F
::rawbytes_to_usize(unsafe { *self.indices_slice().get_unchecked(i + 1) }
)
342 return Err(ZeroVecError
::VarZeroVecFormatError
);
344 if end
> self.things
.len() {
345 return Err(ZeroVecError
::VarZeroVecFormatError
);
347 // Safety: start..end is a valid range in self.things
348 let bytes
= unsafe { self.things.get_unchecked(start..end) }
;
349 T
::parse_byte_slice(bytes
)?
;
355 /// Create an iterator over the Ts contained in VarZeroVecComponents
357 pub fn iter(self) -> impl Iterator
<Item
= &'a T
> {
361 .map(F
::rawbytes_to_usize
)
366 .map(F
::rawbytes_to_usize
)
368 .chain([self.things
.len()]),
370 .map(move |(start
, end
)| unsafe { self.things.get_unchecked(start..end) }
)
371 .map(|bytes
| unsafe { T::from_byte_slice_unchecked(bytes) }
)
374 pub fn to_vec(self) -> Vec
<Box
<T
>> {
375 self.iter().map(T
::to_boxed
).collect()
379 fn indices_slice(&self) -> &'a
[F
::RawBytes
] {
380 unsafe { F::RawBytes::from_byte_slice_unchecked(self.indices) }
383 // Dump a debuggable representation of this type
384 #[allow(unused)] // useful for debugging
385 pub(crate) fn dump(&self) -> String
{
390 .map(F
::rawbytes_to_usize
)
391 .collect
::<Vec
<_
>>();
392 format
!("VarZeroVecComponents {{ indices: {indices:?} }}")
396 impl<'a
, T
, F
> VarZeroVecComponents
<'a
, T
, F
>
403 /// Binary searches a sorted `VarZeroVecComponents<T>` for the given element. For more information, see
404 /// the primitive function [`binary_search`](slice::binary_search).
405 pub fn binary_search(&self, needle
: &T
) -> Result
<usize, usize> {
406 self.binary_search_impl(|probe
| probe
.cmp(needle
), self.indices_slice())
409 pub fn binary_search_in_range(
413 ) -> Option
<Result
<usize, usize>> {
414 let indices_slice
= self.indices_slice().get(range
)?
;
415 Some(self.binary_search_impl(|probe
| probe
.cmp(needle
), indices_slice
))
419 impl<'a
, T
, F
> VarZeroVecComponents
<'a
, T
, F
>
425 /// Binary searches a sorted `VarZeroVecComponents<T>` for the given predicate. For more information, see
426 /// the primitive function [`binary_search_by`](slice::binary_search_by).
427 pub fn binary_search_by(&self, predicate
: impl FnMut(&T
) -> Ordering
) -> Result
<usize, usize> {
428 self.binary_search_impl(predicate
, self.indices_slice())
431 pub fn binary_search_in_range_by(
433 predicate
: impl FnMut(&T
) -> Ordering
,
435 ) -> Option
<Result
<usize, usize>> {
436 let indices_slice
= self.indices_slice().get(range
)?
;
437 Some(self.binary_search_impl(predicate
, indices_slice
))
440 /// Binary searches a sorted `VarZeroVecComponents<T>` with the given predicate. For more information, see
441 /// the primitive function [`binary_search`](slice::binary_search).
442 fn binary_search_impl(
444 mut predicate
: impl FnMut(&T
) -> Ordering
,
445 indices_slice
: &[F
::RawBytes
],
446 ) -> Result
<usize, usize> {
447 // This code is an absolute atrocity. This code is not a place of honor. This
448 // code is known to the State of California to cause cancer.
450 // Unfortunately, the stdlib's `binary_search*` functions can only operate on slices.
451 // We do not have a slice. We have something we can .get() and index on, but that is not
454 // The `binary_search*` functions also do not have a variant where they give you the element's
455 // index, which we could otherwise use to directly index `self`.
456 // We do have `self.indices`, but these are indices into a byte buffer, which cannot in
457 // isolation be used to recoup the logical index of the element they refer to.
459 // However, `binary_search_by()` provides references to the elements of the slice being iterated.
460 // Since the layout of Rust slices is well-defined, we can do pointer arithmetic on these references
461 // to obtain the index being used by the search.
463 // It's worth noting that the slice we choose to search is irrelevant, as long as it has the appropriate
464 // length. `self.indices` is defined to have length `self.len()`, so it is convenient to use
465 // here and does not require additional allocations.
467 // The alternative to doing this is to implement our own binary search. This is significantly less fun.
469 // Note: We always use zero_index relative to the whole indices array, even if we are
470 // only searching a subslice of it.
471 let zero_index
= self.indices
.as_ptr() as *const _
as usize;
472 indices_slice
.binary_search_by(|probe
: &_
| {
473 // `self.indices` is a vec of unaligned F::INDEX_WIDTH values, so we divide by F::INDEX_WIDTH
474 // to get the actual index
475 let index
= (probe
as *const _
as usize - zero_index
) / F
::INDEX_WIDTH
;
476 // safety: we know this is in bounds
477 let actual_probe
= unsafe { self.get_unchecked(index) }
;
478 predicate(actual_probe
)
483 /// Collects the bytes for a VarZeroSlice into a Vec.
484 pub fn get_serializable_bytes_non_empty
<T
, A
, F
>(elements
: &[A
]) -> Option
<Vec
<u8>>
487 A
: EncodeAsVarULE
<T
>,
490 debug_assert
!(!elements
.is_empty());
491 let len
= compute_serializable_len
::<T
, A
, F
>(elements
)?
;
492 debug_assert
!(len
>= LENGTH_WIDTH
as u32);
493 let mut output
: Vec
<u8> = alloc
::vec
![0; len
as usize];
494 write_serializable_bytes
::<T
, A
, F
>(elements
, &mut output
);
498 /// Writes the bytes for a VarZeroSlice into an output buffer.
500 /// Every byte in the buffer will be initialized after calling this function.
504 /// Panics if the buffer is not exactly the correct length.
505 pub fn write_serializable_bytes
<T
, A
, F
>(elements
: &[A
], output
: &mut [u8])
508 A
: EncodeAsVarULE
<T
>,
511 assert
!(elements
.len() <= MAX_LENGTH
);
512 let num_elements_bytes
= elements
.len().to_le_bytes();
513 #[allow(clippy::indexing_slicing)] // Function contract allows panicky behavior
514 output
[0..LENGTH_WIDTH
].copy_from_slice(&num_elements_bytes
[0..LENGTH_WIDTH
]);
516 // idx_offset = offset from the start of the buffer for the next index
517 let mut idx_offset
: usize = LENGTH_WIDTH
+ METADATA_WIDTH
;
518 // first_dat_offset = offset from the start of the buffer of the first data block
519 let first_dat_offset
: usize = idx_offset
+ elements
.len() * F
::INDEX_WIDTH
;
520 // dat_offset = offset from the start of the buffer of the next data block
521 let mut dat_offset
: usize = first_dat_offset
;
523 for element
in elements
.iter() {
524 let element_len
= element
.encode_var_ule_len();
526 let idx_limit
= idx_offset
+ F
::INDEX_WIDTH
;
527 #[allow(clippy::indexing_slicing)] // Function contract allows panicky behavior
528 let idx_slice
= &mut output
[idx_offset
..idx_limit
];
529 // VZV expects data offsets to be stored relative to the first data block
530 let idx
= dat_offset
- first_dat_offset
;
531 assert
!(idx
<= MAX_INDEX
);
532 #[allow(clippy::indexing_slicing)] // this function is explicitly panicky
533 idx_slice
.copy_from_slice(&idx
.to_le_bytes()[..F
::INDEX_WIDTH
]);
535 let dat_limit
= dat_offset
+ element_len
;
536 #[allow(clippy::indexing_slicing)] // Function contract allows panicky behavior
537 let dat_slice
= &mut output
[dat_offset
..dat_limit
];
538 element
.encode_var_ule_write(dat_slice
);
539 debug_assert_eq
!(T
::validate_byte_slice(dat_slice
), Ok(()));
541 idx_offset
= idx_limit
;
542 dat_offset
= dat_limit
;
547 LENGTH_WIDTH
+ METADATA_WIDTH
+ F
::INDEX_WIDTH
* elements
.len()
549 assert_eq
!(dat_offset
, output
.len());
552 pub fn compute_serializable_len
<T
, A
, F
>(elements
: &[A
]) -> Option
<u32>
555 A
: EncodeAsVarULE
<T
>,
558 let idx_len
: u32 = u32::try_from(elements
.len())
560 .checked_mul(F
::INDEX_WIDTH
as u32)?
561 .checked_add(LENGTH_WIDTH
as u32)?
562 .checked_add(METADATA_WIDTH
as u32)?
;
563 let data_len
: u32 = elements
565 .map(|v
| u32::try_from(v
.encode_var_ule_len()).ok())
566 .try_fold(0u32, |s
, v
| s
.checked_add(v?
))?
;
567 let ret
= idx_len
.checked_add(data_len
);
568 if let Some(r
) = ret
{
569 if r
>= F
::MAX_VALUE
{