1 // Copyright 2014 The Servo Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
10 use crate::dynamic_set
::{Entry, DYNAMIC_SET}
;
11 use crate::static_sets
::StaticAtomSet
;
12 use debug_unreachable
::debug_unreachable
;
15 use std
::cmp
::Ordering
::{self, Equal}
;
17 use std
::hash
::{Hash, Hasher}
;
18 use std
::marker
::PhantomData
;
20 use std
::num
::NonZeroU64
;
24 use std
::sync
::atomic
::Ordering
::SeqCst
;
26 const DYNAMIC_TAG
: u8 = 0b_00;
27 const INLINE_TAG
: u8 = 0b_01; // len in upper nybble
28 const STATIC_TAG
: u8 = 0b_10;
29 const TAG_MASK
: u64 = 0b_11;
30 const LEN_OFFSET
: u64 = 4;
31 const LEN_MASK
: u64 = 0xF0;
33 const MAX_INLINE_LEN
: usize = 7;
34 const STATIC_SHIFT_BITS
: usize = 32;
36 /// Represents a string that has been interned.
38 /// While the type definition for `Atom` indicates that it generic on a particular
39 /// implementation of an atom set, you don't need to worry about this. Atoms can be static
40 /// and come from a `StaticAtomSet` generated by the `string_cache_codegen` crate, or they
41 /// can be dynamic and created by you on an `EmptyStaticAtomSet`.
43 /// `Atom` implements `Clone` but not `Copy`, since internally atoms are reference-counted;
44 /// this means that you may need to `.clone()` an atom to keep copies to it in different
45 /// places, or when passing it to a function that takes an `Atom` rather than an `&Atom`.
47 /// ## Creating an atom at runtime
49 /// If you use `string_cache_codegen` to generate a precomputed list of atoms, your code
50 /// may then do something like read data from somewhere and extract tokens that need to be
51 /// compared to the atoms. In this case, you can use `Atom::from(&str)` or
52 /// `Atom::from(String)`. These create a reference-counted atom which will be
53 /// automatically freed when all references to it are dropped.
55 /// This means that your application can safely have a loop which tokenizes data, creates
56 /// atoms from the tokens, and compares the atoms to a predefined set of keywords, without
57 /// running the risk of arbitrary memory consumption from creating large numbers of atoms —
58 /// as long as your application does not store clones of the atoms it creates along the
61 /// For example, the following is safe and will not consume arbitrary amounts of memory:
64 /// let untrusted_data = "large amounts of text ...";
66 /// for token in untrusted_data.split_whitespace() {
67 /// let atom = Atom::from(token); // interns the string
69 /// if atom == Atom::from("keyword") {
70 /// // handle that keyword
71 /// } else if atom == Atom::from("another_keyword") {
72 /// // handle that keyword
74 /// println!("unknown keyword");
76 /// } // atom is dropped here, so it is not kept around in memory
78 #[derive(PartialEq, Eq)]
79 // NOTE: Deriving PartialEq requires that a given string must always be interned the same way.
80 pub struct Atom
<Static
> {
81 unsafe_data
: NonZeroU64
,
82 phantom
: PhantomData
<Static
>,
85 // FIXME: bound removed from the struct definition before of this error for pack_static:
86 // "error[E0723]: trait bounds other than `Sized` on const fn parameters are unstable"
87 // https://github.com/rust-lang/rust/issues/57563
88 impl<Static
> Atom
<Static
> {
89 /// For the atom!() macros
92 pub const fn pack_static(n
: u32) -> Self {
95 // STATIC_TAG ensures this is non-zero
96 NonZeroU64
::new_unchecked((STATIC_TAG
as u64) | ((n
as u64) << STATIC_SHIFT_BITS
))
102 fn tag(&self) -> u8 {
103 (self.unsafe_data
.get() & TAG_MASK
) as u8
107 impl<Static
: StaticAtomSet
> Atom
<Static
> {
108 /// Return the internal representation. For testing.
110 pub fn unsafe_data(&self) -> u64 {
111 self.unsafe_data
.get()
114 /// Return true if this is a static Atom. For testing.
116 pub fn is_static(&self) -> bool
{
117 self.tag() == STATIC_TAG
120 /// Return true if this is a dynamic Atom. For testing.
122 pub fn is_dynamic(&self) -> bool
{
123 self.tag() == DYNAMIC_TAG
126 /// Return true if this is an inline Atom. For testing.
128 pub fn is_inline(&self) -> bool
{
129 self.tag() == INLINE_TAG
132 fn static_index(&self) -> u64 {
133 self.unsafe_data
.get() >> STATIC_SHIFT_BITS
136 /// Get the hash of the string as it is stored in the set.
137 pub fn get_hash(&self) -> u32 {
140 let entry
= self.unsafe_data
.get() as *const Entry
;
141 unsafe { (*entry).hash }
143 STATIC_TAG
=> Static
::get().hashes
[self.static_index() as usize],
145 let data
= self.unsafe_data
.get();
146 // This may or may not be great...
147 ((data
>> 32) ^ data
) as u32
149 _
=> unsafe { debug_unreachable!() }
,
153 pub fn try_static(string_to_add
: &str) -> Option
<Self> {
154 Self::try_static_internal(string_to_add
).ok()
157 fn try_static_internal(string_to_add
: &str) -> Result
<Self, phf_shared
::Hashes
> {
158 let static_set
= Static
::get();
159 let hash
= phf_shared
::hash(&*string_to_add
, &static_set
.key
);
160 let index
= phf_shared
::get_index(&hash
, static_set
.disps
, static_set
.atoms
.len());
162 if static_set
.atoms
[index
as usize] == string_to_add
{
163 Ok(Self::pack_static(index
))
170 impl<Static
: StaticAtomSet
> Default
for Atom
<Static
> {
172 fn default() -> Self {
173 Atom
::pack_static(Static
::empty_string_index())
177 impl<Static
: StaticAtomSet
> Hash
for Atom
<Static
> {
179 fn hash
<H
>(&self, state
: &mut H
)
183 state
.write_u32(self.get_hash())
187 impl<'a
, Static
: StaticAtomSet
> From
<Cow
<'a
, str>> for Atom
<Static
> {
188 fn from(string_to_add
: Cow
<'a
, str>) -> Self {
189 Self::try_static_internal(&*string_to_add
).unwrap_or_else(|hash
| {
190 let len
= string_to_add
.len();
191 if len
<= MAX_INLINE_LEN
{
192 let mut data
: u64 = (INLINE_TAG
as u64) | ((len
as u64) << LEN_OFFSET
);
194 let dest
= inline_atom_slice_mut(&mut data
);
195 dest
[..len
].copy_from_slice(string_to_add
.as_bytes())
198 // INLINE_TAG ensures this is never zero
199 unsafe_data
: unsafe { NonZeroU64::new_unchecked(data) }
,
200 phantom
: PhantomData
,
203 let ptr
: std
::ptr
::NonNull
<Entry
> = DYNAMIC_SET
.insert(string_to_add
, hash
.g
);
204 let data
= ptr
.as_ptr() as u64;
205 debug_assert
!(0 == data
& TAG_MASK
);
207 // The address of a ptr::NonNull is non-zero
208 unsafe_data
: unsafe { NonZeroU64::new_unchecked(data) }
,
209 phantom
: PhantomData
,
216 impl<Static
: StaticAtomSet
> Clone
for Atom
<Static
> {
218 fn clone(&self) -> Self {
219 if self.tag() == DYNAMIC_TAG
{
220 let entry
= self.unsafe_data
.get() as *const Entry
;
221 unsafe { &*entry }
.ref_count
.fetch_add(1, SeqCst
);
227 impl<Static
> Drop
for Atom
<Static
> {
230 if self.tag() == DYNAMIC_TAG
{
231 let entry
= self.unsafe_data
.get() as *const Entry
;
232 if unsafe { &*entry }
.ref_count
.fetch_sub(1, SeqCst
) == 1 {
237 // Out of line to guide inlining.
238 fn drop_slow
<Static
>(this
: &mut Atom
<Static
>) {
239 DYNAMIC_SET
.remove(this
.unsafe_data
.get() as *mut Entry
);
244 impl<Static
: StaticAtomSet
> ops
::Deref
for Atom
<Static
> {
248 fn deref(&self) -> &str {
252 let entry
= self.unsafe_data
.get() as *const Entry
;
256 let len
= (self.unsafe_data() & LEN_MASK
) >> LEN_OFFSET
;
257 let src
= inline_atom_slice(&self.unsafe_data
);
258 str::from_utf8_unchecked(&src
[..(len
as usize)])
260 STATIC_TAG
=> Static
::get().atoms
[self.static_index() as usize],
261 _
=> debug_unreachable
!(),
267 impl<Static
: StaticAtomSet
> fmt
::Debug
for Atom
<Static
> {
269 fn fmt(&self, f
: &mut fmt
::Formatter
) -> fmt
::Result
{
270 let ty_str
= unsafe {
272 DYNAMIC_TAG
=> "dynamic",
273 INLINE_TAG
=> "inline",
274 STATIC_TAG
=> "static",
275 _
=> debug_unreachable
!(),
279 write
!(f
, "Atom('{}' type={})", &*self, ty_str
)
283 impl<Static
: StaticAtomSet
> PartialOrd
for Atom
<Static
> {
285 fn partial_cmp(&self, other
: &Self) -> Option
<Ordering
> {
286 if self.unsafe_data
== other
.unsafe_data
{
289 self.as_ref().partial_cmp(other
.as_ref())
293 impl<Static
: StaticAtomSet
> Ord
for Atom
<Static
> {
295 fn cmp(&self, other
: &Self) -> Ordering
{
296 if self.unsafe_data
== other
.unsafe_data
{
299 self.as_ref().cmp(other
.as_ref())
303 // AsciiExt requires mutating methods, so we just implement the non-mutating ones.
304 // We don't need to implement is_ascii because there's no performance improvement
305 // over the one from &str.
306 impl<Static
: StaticAtomSet
> Atom
<Static
> {
307 fn from_mutated_str
<F
: FnOnce(&mut str)>(s
: &str, f
: F
) -> Self {
308 let mut buffer
= mem
::MaybeUninit
::<[u8; 64]>::uninit();
309 let buffer
= unsafe { &mut *buffer.as_mut_ptr() }
;
311 if let Some(buffer_prefix
) = buffer
.get_mut(..s
.len()) {
312 buffer_prefix
.copy_from_slice(s
.as_bytes());
313 let as_str
= unsafe { ::std::str::from_utf8_unchecked_mut(buffer_prefix) }
;
317 let mut string
= s
.to_owned();
323 /// Like [`to_ascii_uppercase`].
325 /// [`to_ascii_uppercase`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.to_ascii_uppercase
326 pub fn to_ascii_uppercase(&self) -> Self {
327 for (i
, b
) in self.bytes().enumerate() {
328 if let b'a'
..=b'z'
= b
{
329 return Atom
::from_mutated_str(self, |s
| s
[i
..].make_ascii_uppercase());
335 /// Like [`to_ascii_lowercase`].
337 /// [`to_ascii_lowercase`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.to_ascii_lowercase
338 pub fn to_ascii_lowercase(&self) -> Self {
339 for (i
, b
) in self.bytes().enumerate() {
340 if let b'A'
..=b'Z'
= b
{
341 return Atom
::from_mutated_str(self, |s
| s
[i
..].make_ascii_lowercase());
347 /// Like [`eq_ignore_ascii_case`].
349 /// [`eq_ignore_ascii_case`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.eq_ignore_ascii_case
350 pub fn eq_ignore_ascii_case(&self, other
: &Self) -> bool
{
351 (self == other
) || self.eq_str_ignore_ascii_case(&**other
)
354 /// Like [`eq_ignore_ascii_case`], but takes an unhashed string as `other`.
356 /// [`eq_ignore_ascii_case`]: https://doc.rust-lang.org/std/ascii/trait.AsciiExt.html#tymethod.eq_ignore_ascii_case
357 pub fn eq_str_ignore_ascii_case(&self, other
: &str) -> bool
{
358 (&**self).eq_ignore_ascii_case(other
)
363 fn inline_atom_slice(x
: &NonZeroU64
) -> &[u8] {
365 let x
: *const NonZeroU64
= x
;
366 let mut data
= x
as *const u8;
367 // All except the lowest byte, which is first in little-endian, last in big-endian.
368 if cfg
!(target_endian
= "little") {
369 data
= data
.offset(1);
372 slice
::from_raw_parts(data
, len
)
377 fn inline_atom_slice_mut(x
: &mut u64) -> &mut [u8] {
380 let mut data
= x
as *mut u8;
381 // All except the lowest byte, which is first in little-endian, last in big-endian.
382 if cfg
!(target_endian
= "little") {
383 data
= data
.offset(1);
386 slice
::from_raw_parts_mut(data
, len
)