]> git.proxmox.com Git - rustc.git/blame - vendor/semver/src/identifier.rs
New upstream version 1.63.0+dfsg1
[rustc.git] / vendor / semver / src / identifier.rs
CommitLineData
3c0e092e
XL
1// This module implements Identifier, a short-optimized string allowed to
2// contain only the ASCII characters hyphen, dot, 0-9, A-Z, a-z.
3//
4// As of mid-2021, the distribution of pre-release lengths on crates.io is:
5//
6// length count length count length count
7// 0 355929 11 81 24 2
8// 1 208 12 48 25 6
9// 2 236 13 55 26 10
10// 3 1909 14 25 27 4
11// 4 1284 15 15 28 1
12// 5 1742 16 35 30 1
13// 6 3440 17 9 31 5
14// 7 5624 18 6 32 1
15// 8 1321 19 12 36 2
16// 9 179 20 2 37 379
17// 10 65 23 11
18//
19// and the distribution of build metadata lengths is:
20//
21// length count length count length count
22// 0 364445 8 7725 18 1
23// 1 72 9 16 19 1
24// 2 7 10 85 20 1
25// 3 28 11 17 22 4
26// 4 9 12 10 26 1
27// 5 68 13 9 27 1
28// 6 73 14 10 40 5
29// 7 53 15 6
30//
31// Therefore it really behooves us to be able to use the entire 8 bytes of a
32// pointer for inline storage. For both pre-release and build metadata there are
33// vastly more strings with length exactly 8 bytes than the sum over all lengths
34// longer than 8 bytes.
35//
36// To differentiate the inline representation from the heap allocated long
37// representation, we'll allocate heap pointers with 2-byte alignment so that
38// they are guaranteed to have an unset least significant bit. Then in the repr
39// we store for pointers, we rotate a 1 into the most significant bit of the
40// most significant byte, which is never set for an ASCII byte.
41//
42// Inline repr:
43//
44// 0xxxxxxx 0xxxxxxx 0xxxxxxx 0xxxxxxx 0xxxxxxx 0xxxxxxx 0xxxxxxx 0xxxxxxx
45//
46// Heap allocated repr:
47//
48// 1ppppppp pppppppp pppppppp pppppppp pppppppp pppppppp pppppppp pppppppp 0
49// ^ most significant bit least significant bit of orig ptr, rotated out ^
50//
51// Since the most significant bit doubles as a sign bit for the similarly sized
52// signed integer type, the CPU has an efficient instruction for inspecting it,
53// meaning we can differentiate between an inline repr and a heap allocated repr
54// in one instruction. Effectively an inline repr always looks like a positive
55// i64 while a heap allocated repr always looks like a negative i64.
56//
57// For the inline repr, we store \0 padding on the end of the stored characters,
58// and thus the string length is readily determined efficiently by a cttz (count
59// trailing zeros) or bsf (bit scan forward) instruction.
60//
61// For the heap allocated repr, the length is encoded as a base-128 varint at
62// the head of the allocation.
63//
64// Empty strings are stored as an all-1 bit pattern, corresponding to -1i64.
65// Consequently the all-0 bit pattern is never a legal representation in any
66// repr, leaving it available as a niche for downstream code. For example this
67// allows size_of::<Version>() == size_of::<Option<Version>>().
68
69use crate::alloc::alloc::{alloc, dealloc, Layout};
3c0e092e
XL
70use core::mem;
71use core::num::{NonZeroU64, NonZeroUsize};
923072b8 72use core::ptr::{self, NonNull};
3c0e092e
XL
73use core::slice;
74use core::str;
75
923072b8
FG
76const PTR_BYTES: usize = mem::size_of::<NonNull<u8>>();
77
78// If pointers are already 8 bytes or bigger, then 0. If pointers are smaller
79// than 8 bytes, then Identifier will contain a byte array to raise its size up
80// to 8 bytes total.
81const TAIL_BYTES: usize = 8 * (PTR_BYTES < 8) as usize - PTR_BYTES * (PTR_BYTES < 8) as usize;
82
83#[repr(C, align(8))]
3c0e092e 84pub(crate) struct Identifier {
923072b8
FG
85 head: NonNull<u8>,
86 tail: [u8; TAIL_BYTES],
3c0e092e
XL
87}
88
89impl Identifier {
3c0e092e 90 pub(crate) const fn empty() -> Self {
923072b8
FG
91 // This is a separate constant because unsafe function calls are not
92 // allowed in a const fn body, only in a const, until later rustc than
93 // what we support.
94 const HEAD: NonNull<u8> = unsafe { NonNull::new_unchecked(!0 as *mut u8) };
95
3c0e092e 96 // `mov rax, -1`
923072b8
FG
97 Identifier {
98 head: HEAD,
99 tail: [!0; TAIL_BYTES],
100 }
3c0e092e
XL
101 }
102
103 // SAFETY: string must be ASCII and not contain \0 bytes.
104 pub(crate) unsafe fn new_unchecked(string: &str) -> Self {
105 let len = string.len();
923072b8
FG
106 match len as u64 {
107 0 => Self::empty(),
3c0e092e 108 1..=8 => {
923072b8 109 let mut bytes = [0u8; mem::size_of::<Identifier>()];
3c0e092e
XL
110 // SAFETY: string is big enough to read len bytes, bytes is big
111 // enough to write len bytes, and they do not overlap.
112 unsafe { ptr::copy_nonoverlapping(string.as_ptr(), bytes.as_mut_ptr(), len) };
923072b8
FG
113 // SAFETY: the head field is nonzero because the input string
114 // was at least 1 byte of ASCII and did not contain \0.
115 unsafe { mem::transmute::<[u8; mem::size_of::<Identifier>()], Identifier>(bytes) }
3c0e092e
XL
116 }
117 9..=0xff_ffff_ffff_ffff => {
118 // SAFETY: len is in a range that does not contain 0.
119 let size = bytes_for_varint(unsafe { NonZeroUsize::new_unchecked(len) }) + len;
120 let align = 2;
121 // SAFETY: align is not zero, align is a power of two, and
122 // rounding size up to align does not overflow usize::MAX.
123 let layout = unsafe { Layout::from_size_align_unchecked(size, align) };
124 // SAFETY: layout's size is nonzero.
125 let ptr = unsafe { alloc(layout) };
126 let mut write = ptr;
127 let mut varint_remaining = len;
128 while varint_remaining > 0 {
129 // SAFETY: size is bytes_for_varint(len) bytes + len bytes.
130 // This is writing the first bytes_for_varint(len) bytes.
131 unsafe { ptr::write(write, varint_remaining as u8 | 0x80) };
132 varint_remaining >>= 7;
133 // SAFETY: still in bounds of the same allocation.
134 write = unsafe { write.add(1) };
135 }
136 // SAFETY: size is bytes_for_varint(len) bytes + len bytes. This
137 // is writing to the last len bytes.
138 unsafe { ptr::copy_nonoverlapping(string.as_ptr(), write, len) };
923072b8
FG
139 Identifier {
140 head: ptr_to_repr(ptr),
141 tail: [0; TAIL_BYTES],
142 }
3c0e092e
XL
143 }
144 0x100_0000_0000_0000..=0xffff_ffff_ffff_ffff => {
145 unreachable!("please refrain from storing >64 petabytes of text in semver version");
146 }
147 #[cfg(no_exhaustive_int_match)] // rustc <1.33
148 _ => unreachable!(),
923072b8 149 }
3c0e092e
XL
150 }
151
152 pub(crate) fn is_empty(&self) -> bool {
153 // `cmp rdi, -1` -- basically: `repr as i64 == -1`
923072b8
FG
154 let empty = Self::empty();
155 let is_empty = self.head == empty.head && self.tail == empty.tail;
156 // The empty representation does nothing on Drop. We can't let this one
157 // drop normally because `impl Drop for Identifier` calls is_empty; that
158 // would be an infinite recursion.
159 mem::forget(empty);
160 is_empty
3c0e092e
XL
161 }
162
163 fn is_inline(&self) -> bool {
164 // `test rdi, rdi` -- basically: `repr as i64 >= 0`
923072b8 165 self.head.as_ptr() as usize >> (PTR_BYTES * 8 - 1) == 0
3c0e092e
XL
166 }
167
168 fn is_empty_or_inline(&self) -> bool {
169 // `cmp rdi, -2` -- basically: `repr as i64 > -2`
170 self.is_empty() || self.is_inline()
171 }
172
173 pub(crate) fn as_str(&self) -> &str {
174 if self.is_empty() {
175 ""
176 } else if self.is_inline() {
177 // SAFETY: repr is in the inline representation.
923072b8 178 unsafe { inline_as_str(self) }
3c0e092e
XL
179 } else {
180 // SAFETY: repr is in the heap allocated representation.
923072b8 181 unsafe { ptr_as_str(&self.head) }
3c0e092e
XL
182 }
183 }
184}
185
186impl Clone for Identifier {
187 fn clone(&self) -> Self {
923072b8
FG
188 if self.is_empty_or_inline() {
189 Identifier {
190 head: self.head,
191 tail: self.tail,
192 }
3c0e092e 193 } else {
923072b8 194 let ptr = repr_to_ptr(self.head);
3c0e092e
XL
195 // SAFETY: ptr is one of our own heap allocations.
196 let len = unsafe { decode_len(ptr) };
197 let size = bytes_for_varint(len) + len.get();
198 let align = 2;
199 // SAFETY: align is not zero, align is a power of two, and rounding
200 // size up to align does not overflow usize::MAX. This is just
201 // duplicating a previous allocation where all of these guarantees
202 // were already made.
203 let layout = unsafe { Layout::from_size_align_unchecked(size, align) };
204 // SAFETY: layout's size is nonzero.
205 let clone = unsafe { alloc(layout) };
206 // SAFETY: new allocation cannot overlap the previous one (this was
207 // not a realloc). The argument ptrs are readable/writeable
208 // respectively for size bytes.
209 unsafe { ptr::copy_nonoverlapping(ptr, clone, size) }
923072b8
FG
210 Identifier {
211 head: ptr_to_repr(clone),
212 tail: [0; TAIL_BYTES],
213 }
214 }
3c0e092e
XL
215 }
216}
217
218impl Drop for Identifier {
219 fn drop(&mut self) {
220 if self.is_empty_or_inline() {
221 return;
222 }
923072b8 223 let ptr = repr_to_ptr_mut(self.head);
3c0e092e
XL
224 // SAFETY: ptr is one of our own heap allocations.
225 let len = unsafe { decode_len(ptr) };
226 let size = bytes_for_varint(len) + len.get();
227 let align = 2;
228 // SAFETY: align is not zero, align is a power of two, and rounding
229 // size up to align does not overflow usize::MAX. These guarantees were
230 // made when originally allocating this memory.
231 let layout = unsafe { Layout::from_size_align_unchecked(size, align) };
232 // SAFETY: ptr was previously allocated by the same allocator with the
233 // same layout.
234 unsafe { dealloc(ptr, layout) }
235 }
236}
237
238impl PartialEq for Identifier {
239 fn eq(&self, rhs: &Self) -> bool {
240 if self.is_empty_or_inline() {
241 // Fast path (most common)
923072b8 242 self.head == rhs.head && self.tail == rhs.tail
3c0e092e
XL
243 } else if rhs.is_empty_or_inline() {
244 false
245 } else {
246 // SAFETY: both reprs are in the heap allocated representation.
923072b8 247 unsafe { ptr_as_str(&self.head) == ptr_as_str(&rhs.head) }
3c0e092e
XL
248 }
249 }
250}
251
923072b8
FG
252unsafe impl Send for Identifier {}
253unsafe impl Sync for Identifier {}
254
3c0e092e
XL
255// We use heap pointers that are 2-byte aligned, meaning they have an
256// insignificant 0 in the least significant bit. We take advantage of that
257// unneeded bit to rotate a 1 into the most significant bit to make the repr
258// distinguishable from ASCII bytes.
923072b8 259fn ptr_to_repr(original: *mut u8) -> NonNull<u8> {
3c0e092e
XL
260 // `mov eax, 1`
261 // `shld rax, rdi, 63`
923072b8
FG
262 let modified = (original as usize | 1).rotate_right(1);
263
264 // `original + (modified - original)`, but being mindful of provenance.
265 let diff = modified.wrapping_sub(original as usize);
266 let modified = original.wrapping_add(diff);
3c0e092e
XL
267
268 // SAFETY: the most significant bit of repr is known to be set, so the value
269 // is not zero.
923072b8 270 unsafe { NonNull::new_unchecked(modified) }
3c0e092e
XL
271}
272
273// Shift out the 1 previously placed into the most significant bit of the least
274// significant byte. Shift in a low 0 bit to reconstruct the original 2-byte
275// aligned pointer.
923072b8 276fn repr_to_ptr(modified: NonNull<u8>) -> *const u8 {
3c0e092e 277 // `lea rax, [rdi + rdi]`
923072b8
FG
278 let modified = modified.as_ptr();
279 let original = (modified as usize) << 1;
280
281 // `modified + (original - modified)`, but being mindful of provenance.
282 let diff = original.wrapping_sub(modified as usize);
283 modified.wrapping_add(diff)
3c0e092e
XL
284}
285
923072b8 286fn repr_to_ptr_mut(repr: NonNull<u8>) -> *mut u8 {
3c0e092e
XL
287 repr_to_ptr(repr) as *mut u8
288}
289
290// Compute the length of the inline string, assuming the argument is in short
291// string representation. Short strings are stored as 1 to 8 nonzero ASCII
292// bytes, followed by \0 padding for the remaining bytes.
923072b8
FG
293//
294// SAFETY: the identifier must indeed be in the inline representation.
295unsafe fn inline_len(repr: &Identifier) -> NonZeroUsize {
296 // SAFETY: Identifier's layout is align(8) and at least size 8. We're doing
297 // an aligned read of the first 8 bytes from it. The bytes are not all zero
298 // because inline strings are at least 1 byte long and cannot contain \0.
299 let repr = unsafe { ptr::read(repr as *const Identifier as *const NonZeroU64) };
300
3c0e092e
XL
301 // Rustc >=1.53 has intrinsics for counting zeros on a non-zeroable integer.
302 // On many architectures these are more efficient than counting on ordinary
303 // zeroable integers (bsf vs cttz). On rustc <1.53 without those intrinsics,
304 // we count zeros in the u64 rather than the NonZeroU64.
305 #[cfg(no_nonzero_bitscan)]
306 let repr = repr.get();
307
308 #[cfg(target_endian = "little")]
309 let zero_bits_on_string_end = repr.leading_zeros();
310 #[cfg(target_endian = "big")]
311 let zero_bits_on_string_end = repr.trailing_zeros();
312
313 let nonzero_bytes = 8 - zero_bits_on_string_end as usize / 8;
314
315 // SAFETY: repr is nonzero, so it has at most 63 zero bits on either end,
316 // thus at least one nonzero byte.
317 unsafe { NonZeroUsize::new_unchecked(nonzero_bytes) }
318}
319
320// SAFETY: repr must be in the inline representation, i.e. at least 1 and at
321// most 8 nonzero ASCII bytes padded on the end with \0 bytes.
923072b8
FG
322unsafe fn inline_as_str(repr: &Identifier) -> &str {
323 let ptr = repr as *const Identifier as *const u8;
324 let len = unsafe { inline_len(repr) }.get();
3c0e092e
XL
325 // SAFETY: we are viewing the nonzero ASCII prefix of the inline repr's
326 // contents as a slice of bytes. Input/output lifetimes are correctly
327 // associated.
328 let slice = unsafe { slice::from_raw_parts(ptr, len) };
329 // SAFETY: the string contents are known to be only ASCII bytes, which are
330 // always valid UTF-8.
331 unsafe { str::from_utf8_unchecked(slice) }
332}
333
334// Decode varint. Varints consist of between one and eight base-128 digits, each
335// of which is stored in a byte with most significant bit set. Adjacent to the
336// varint in memory there is guaranteed to be at least 9 ASCII bytes, each of
337// which has an unset most significant bit.
338//
339// SAFETY: ptr must be one of our own heap allocations, with the varint header
340// already written.
341unsafe fn decode_len(ptr: *const u8) -> NonZeroUsize {
342 // SAFETY: There is at least one byte of varint followed by at least 9 bytes
343 // of string content, which is at least 10 bytes total for the allocation,
344 // so reading the first two is no problem.
345 let [first, second] = unsafe { ptr::read(ptr as *const [u8; 2]) };
346 if second < 0x80 {
347 // SAFETY: the length of this heap allocated string has been encoded as
348 // one base-128 digit, so the length is at least 9 and at most 127. It
349 // cannot be zero.
350 unsafe { NonZeroUsize::new_unchecked((first & 0x7f) as usize) }
351 } else {
352 return unsafe { decode_len_cold(ptr) };
353
354 // Identifiers 128 bytes or longer. This is not exercised by any crate
355 // version currently published to crates.io.
356 #[cold]
357 #[inline(never)]
358 unsafe fn decode_len_cold(mut ptr: *const u8) -> NonZeroUsize {
359 let mut len = 0;
360 let mut shift = 0;
361 loop {
362 // SAFETY: varint continues while there are bytes having the
363 // most significant bit set, i.e. until we start hitting the
364 // ASCII string content with msb unset.
365 let byte = unsafe { *ptr };
366 if byte < 0x80 {
367 // SAFETY: the string length is known to be 128 bytes or
368 // longer.
369 return unsafe { NonZeroUsize::new_unchecked(len) };
370 }
371 // SAFETY: still in bounds of the same allocation.
372 ptr = unsafe { ptr.add(1) };
373 len += ((byte & 0x7f) as usize) << shift;
374 shift += 7;
375 }
376 }
377 }
378}
379
380// SAFETY: repr must be in the heap allocated representation, with varint header
381// and string contents already written.
923072b8 382unsafe fn ptr_as_str(repr: &NonNull<u8>) -> &str {
3c0e092e
XL
383 let ptr = repr_to_ptr(*repr);
384 let len = unsafe { decode_len(ptr) };
385 let header = bytes_for_varint(len);
386 let slice = unsafe { slice::from_raw_parts(ptr.add(header), len.get()) };
387 // SAFETY: all identifier contents are ASCII bytes, which are always valid
388 // UTF-8.
389 unsafe { str::from_utf8_unchecked(slice) }
390}
391
392// Number of base-128 digits required for the varint representation of a length.
393fn bytes_for_varint(len: NonZeroUsize) -> usize {
394 #[cfg(no_nonzero_bitscan)] // rustc <1.53
395 let len = len.get();
396
397 let usize_bits = mem::size_of::<usize>() * 8;
398 let len_bits = usize_bits - len.leading_zeros() as usize;
399 (len_bits + 6) / 7
400}