]>
Commit | Line | Data |
---|---|---|
48663c56 XL |
1 | //! A string table implementation with a tree-like encoding. |
2 | //! | |
60c5eb7d | 3 | //! Each entry in the table represents a string and is encoded as a list of |
48663c56 XL |
4 | //! components where each component can either be |
5 | //! | |
60c5eb7d XL |
6 | //! 1. a string _value_ that contains actual UTF-8 string content, |
7 | //! 2. a string _ID_ that contains a reference to another entry, or | |
8 | //! 3. a terminator tag which marks the end of a component list. | |
48663c56 | 9 | //! |
60c5eb7d XL |
10 | //! The string _content_ of an entry is defined as the concatenation of the |
11 | //! content of its components. The content of a string value is its actual | |
12 | //! UTF-8 bytes. The content of a string ID is the contents of the entry | |
48663c56 | 13 | //! it references. |
e74abb32 | 14 | //! |
60c5eb7d XL |
15 | //! The byte-level encoding of component lists uses the structure of UTF-8 in |
16 | //! order to save space: | |
e74abb32 | 17 | //! |
60c5eb7d XL |
18 | //! - A valid UTF-8 codepoint never starts with the bits `10` as this bit |
19 | //! prefix is reserved for bytes in the middle of a UTF-8 codepoint byte | |
20 | //! sequence. We make use of this fact by letting all string ID components | |
21 | //! start with this `10` prefix. Thus when we parse the contents of a value | |
22 | //! we know to stop if the start byte of the next codepoint has this prefix. | |
23 | //! | |
24 | //! - A valid UTF-8 string cannot contain the `0xFF` byte and since string IDs | |
25 | //! start with `10` as described above, they also cannot start with a `0xFF` | |
26 | //! byte. Thus we can safely use `0xFF` as our component list terminator. | |
27 | //! | |
28 | //! The sample composite string ["abc", ID(42), "def", TERMINATOR] would thus be | |
29 | //! encoded as: | |
30 | //! | |
31 | //! ```ignore | |
32 | //! ['a', 'b' , 'c', 128, 0, 0, 42, 'd', 'e', 'f', 255] | |
33 | //! ^^^^^^^^^^^^^ ^^^ | |
34 | //! string ID 42 with 0b10 prefix terminator (0xFF) | |
35 | //! ``` | |
36 | //! | |
37 | //! As you can see string IDs are encoded in big endian format so that highest | |
38 | //! order bits show up in the first byte we encounter. | |
39 | //! | |
40 | //! ---------------------------------------------------------------------------- | |
41 | //! | |
42 | //! Each string in the table is referred to via a `StringId`. `StringId`s may | |
43 | //! be generated in two ways: | |
44 | //! | |
45 | //! 1. Calling `StringTable::alloc()` which returns the `StringId` for the | |
46 | //! allocated string. | |
47 | //! 2. Calling `StringTable::alloc_with_reserved_id()` and `StringId::reserved()`. | |
48 | //! | |
49 | //! String IDs allow you to deduplicate strings by allocating a string | |
50 | //! once and then referring to it by id over and over. This is a useful trick | |
51 | //! for strings which are recorded many times and it can significantly reduce | |
52 | //! the size of profile trace files. | |
e74abb32 XL |
53 | //! |
54 | //! `StringId`s are partitioned according to type: | |
55 | //! | |
56 | //! > [0 .. MAX_PRE_RESERVED_STRING_ID, METADATA_STRING_ID, .. ] | |
57 | //! | |
58 | //! From `0` to `MAX_PRE_RESERVED_STRING_ID` are the allowed values for reserved strings. | |
59 | //! After `MAX_PRE_RESERVED_STRING_ID`, there is one string id (`METADATA_STRING_ID`) which is used | |
60 | //! internally by `measureme` to record additional metadata about the profiling session. | |
61 | //! After `METADATA_STRING_ID` are all other `StringId` values. | |
60c5eb7d | 62 | //! |
e74abb32 XL |
63 | |
64 | use crate::file_header::{ | |
60c5eb7d | 65 | write_file_header, FILE_MAGIC_STRINGTABLE_DATA, FILE_MAGIC_STRINGTABLE_INDEX, |
e74abb32 | 66 | }; |
48663c56 | 67 | use crate::serialization::{Addr, SerializationSink}; |
60c5eb7d | 68 | use byteorder::{BigEndian, ByteOrder, LittleEndian}; |
48663c56 XL |
69 | use std::sync::atomic::{AtomicU32, Ordering}; |
70 | use std::sync::Arc; | |
71 | ||
72 | /// A `StringId` is used to identify a string in the `StringTable`. | |
73 | #[derive(Clone, Copy, Eq, PartialEq, Debug, Hash)] | |
74 | #[repr(C)] | |
75 | pub struct StringId(u32); | |
76 | ||
77 | impl StringId { | |
78 | #[inline] | |
79 | pub fn reserved(id: u32) -> StringId { | |
60c5eb7d | 80 | assert!(id == id & STRING_ID_MASK); |
48663c56 XL |
81 | StringId(id) |
82 | } | |
48663c56 | 83 | |
60c5eb7d XL |
84 | #[inline] |
85 | pub fn as_u32(self) -> u32 { | |
86 | self.0 | |
87 | } | |
88 | } | |
48663c56 | 89 | |
60c5eb7d XL |
90 | // See module-level documentation for more information on the encoding. |
91 | pub const TERMINATOR: u8 = 0xFF; | |
48663c56 | 92 | |
60c5eb7d XL |
93 | // All 1s except for the two highest bits. |
94 | pub const MAX_STRING_ID: u32 = 0x3FFF_FFFF; | |
95 | pub const STRING_ID_MASK: u32 = 0x3FFF_FFFF; | |
48663c56 | 96 | |
e74abb32 | 97 | /// The maximum id value a prereserved string may be. |
60c5eb7d | 98 | const MAX_PRE_RESERVED_STRING_ID: u32 = MAX_STRING_ID / 2; |
48663c56 | 99 | |
e74abb32 | 100 | /// The id of the profile metadata string entry. |
60c5eb7d | 101 | pub const METADATA_STRING_ID: u32 = MAX_PRE_RESERVED_STRING_ID + 1; |
e74abb32 | 102 | |
48663c56 XL |
103 | /// Write-only version of the string table |
104 | pub struct StringTableBuilder<S: SerializationSink> { | |
105 | data_sink: Arc<S>, | |
106 | index_sink: Arc<S>, | |
e74abb32 | 107 | id_counter: AtomicU32, // initialized to METADATA_STRING_ID + 1 |
48663c56 XL |
108 | } |
109 | ||
110 | /// Anything that implements `SerializableString` can be written to a | |
111 | /// `StringTable`. | |
112 | pub trait SerializableString { | |
113 | fn serialized_size(&self) -> usize; | |
114 | fn serialize(&self, bytes: &mut [u8]); | |
115 | } | |
116 | ||
60c5eb7d | 117 | // A single string is encoded as `[UTF-8 bytes][TERMINATOR]` |
48663c56 XL |
118 | impl SerializableString for str { |
119 | #[inline] | |
120 | fn serialized_size(&self) -> usize { | |
48663c56 XL |
121 | self.len() + // actual bytes |
122 | 1 // terminator | |
123 | } | |
124 | ||
125 | #[inline] | |
126 | fn serialize(&self, bytes: &mut [u8]) { | |
48663c56 | 127 | let last_byte_index = bytes.len() - 1; |
60c5eb7d XL |
128 | bytes[0..last_byte_index].copy_from_slice(self.as_bytes()); |
129 | bytes[last_byte_index] = TERMINATOR; | |
48663c56 XL |
130 | } |
131 | } | |
132 | ||
133 | /// A single component of a string. Used for building composite table entries. | |
134 | pub enum StringComponent<'s> { | |
135 | Value(&'s str), | |
136 | Ref(StringId), | |
137 | } | |
138 | ||
60c5eb7d XL |
139 | impl<'s> StringComponent<'s> { |
140 | #[inline] | |
141 | fn serialized_size(&self) -> usize { | |
142 | match *self { | |
143 | StringComponent::Value(s) => s.len(), | |
144 | StringComponent::Ref(_) => 4, | |
145 | } | |
146 | } | |
147 | ||
148 | #[inline] | |
149 | fn serialize<'b>(&self, bytes: &'b mut [u8]) -> &'b mut [u8] { | |
150 | match *self { | |
151 | StringComponent::Value(s) => { | |
152 | bytes[..s.len()].copy_from_slice(s.as_bytes()); | |
153 | &mut bytes[s.len()..] | |
154 | } | |
155 | StringComponent::Ref(string_id) => { | |
156 | assert!(string_id.0 == string_id.0 & STRING_ID_MASK); | |
157 | let tagged = string_id.0 | (1u32 << 31); | |
158 | ||
159 | BigEndian::write_u32(&mut bytes[0..4], tagged); | |
160 | &mut bytes[4..] | |
161 | } | |
162 | } | |
163 | } | |
164 | } | |
165 | ||
48663c56 XL |
166 | impl<'a> SerializableString for [StringComponent<'a>] { |
167 | #[inline] | |
168 | fn serialized_size(&self) -> usize { | |
60c5eb7d XL |
169 | self.iter().map(|c| c.serialized_size()).sum::<usize>() + // size of components |
170 | 1 // terminator | |
48663c56 XL |
171 | } |
172 | ||
173 | #[inline] | |
60c5eb7d XL |
174 | fn serialize(&self, mut bytes: &mut [u8]) { |
175 | assert!(bytes.len() == self.serialized_size()); | |
176 | for component in self.iter() { | |
177 | bytes = component.serialize(bytes); | |
178 | } | |
179 | ||
180 | // Assert that we used the exact number of bytes we anticipated. | |
181 | assert!(bytes.len() == 1); | |
182 | bytes[0] = TERMINATOR; | |
48663c56 XL |
183 | } |
184 | } | |
185 | ||
60c5eb7d XL |
186 | macro_rules! impl_serializable_string_for_fixed_size { |
187 | ($n:expr) => { | |
188 | impl<'a> SerializableString for [StringComponent<'a>; $n] { | |
189 | #[inline(always)] | |
190 | fn serialized_size(&self) -> usize { | |
191 | (&self[..]).serialized_size() | |
192 | } | |
193 | ||
194 | #[inline(always)] | |
195 | fn serialize(&self, bytes: &mut [u8]) { | |
196 | (&self[..]).serialize(bytes); | |
197 | } | |
198 | } | |
199 | }; | |
200 | } | |
201 | ||
202 | impl_serializable_string_for_fixed_size!(0); | |
203 | impl_serializable_string_for_fixed_size!(1); | |
204 | impl_serializable_string_for_fixed_size!(2); | |
205 | impl_serializable_string_for_fixed_size!(3); | |
206 | impl_serializable_string_for_fixed_size!(4); | |
207 | impl_serializable_string_for_fixed_size!(5); | |
208 | impl_serializable_string_for_fixed_size!(6); | |
209 | impl_serializable_string_for_fixed_size!(7); | |
210 | impl_serializable_string_for_fixed_size!(8); | |
211 | impl_serializable_string_for_fixed_size!(9); | |
212 | impl_serializable_string_for_fixed_size!(10); | |
213 | impl_serializable_string_for_fixed_size!(11); | |
214 | impl_serializable_string_for_fixed_size!(12); | |
215 | impl_serializable_string_for_fixed_size!(13); | |
216 | impl_serializable_string_for_fixed_size!(14); | |
217 | impl_serializable_string_for_fixed_size!(15); | |
218 | impl_serializable_string_for_fixed_size!(16); | |
219 | ||
48663c56 XL |
220 | fn serialize_index_entry<S: SerializationSink>(sink: &S, id: StringId, addr: Addr) { |
221 | sink.write_atomic(8, |bytes| { | |
222 | LittleEndian::write_u32(&mut bytes[0..4], id.0); | |
223 | LittleEndian::write_u32(&mut bytes[4..8], addr.0); | |
224 | }); | |
225 | } | |
226 | ||
48663c56 XL |
227 | impl<S: SerializationSink> StringTableBuilder<S> { |
228 | pub fn new(data_sink: Arc<S>, index_sink: Arc<S>) -> StringTableBuilder<S> { | |
48663c56 XL |
229 | // The first thing in every file we generate must be the file header. |
230 | write_file_header(&*data_sink, FILE_MAGIC_STRINGTABLE_DATA); | |
231 | write_file_header(&*index_sink, FILE_MAGIC_STRINGTABLE_INDEX); | |
232 | ||
233 | StringTableBuilder { | |
234 | data_sink, | |
235 | index_sink, | |
e74abb32 | 236 | id_counter: AtomicU32::new(METADATA_STRING_ID + 1), |
48663c56 XL |
237 | } |
238 | } | |
239 | ||
48663c56 XL |
240 | pub fn alloc_with_reserved_id<STR: SerializableString + ?Sized>( |
241 | &self, | |
242 | id: StringId, | |
243 | s: &STR, | |
244 | ) -> StringId { | |
245 | assert!(id.0 <= MAX_PRE_RESERVED_STRING_ID); | |
246 | self.alloc_unchecked(id, s); | |
247 | id | |
248 | } | |
249 | ||
e74abb32 XL |
250 | pub(crate) fn alloc_metadata<STR: SerializableString + ?Sized>(&self, s: &STR) -> StringId { |
251 | let id = StringId(METADATA_STRING_ID); | |
252 | self.alloc_unchecked(id, s); | |
253 | id | |
254 | } | |
255 | ||
48663c56 XL |
256 | pub fn alloc<STR: SerializableString + ?Sized>(&self, s: &STR) -> StringId { |
257 | let id = StringId(self.id_counter.fetch_add(1, Ordering::SeqCst)); | |
60c5eb7d XL |
258 | assert!(id.0 > METADATA_STRING_ID); |
259 | assert!(id.0 <= MAX_STRING_ID); | |
48663c56 XL |
260 | self.alloc_unchecked(id, s); |
261 | id | |
262 | } | |
263 | ||
264 | #[inline] | |
265 | fn alloc_unchecked<STR: SerializableString + ?Sized>(&self, id: StringId, s: &STR) { | |
266 | let size_in_bytes = s.serialized_size(); | |
267 | let addr = self.data_sink.write_atomic(size_in_bytes, |mem| { | |
268 | s.serialize(mem); | |
269 | }); | |
270 | ||
271 | serialize_index_entry(&*self.index_sink, id, addr); | |
272 | } | |
273 | } |