]> git.proxmox.com Git - rustc.git/blob - vendor/measureme/src/stringtable.rs
New upstream version 1.52.0~beta.3+dfsg1
[rustc.git] / vendor / measureme / src / stringtable.rs
1 //! A string table implementation with a tree-like encoding.
2 //!
3 //! Each entry in the table represents a string and is encoded as a list of
4 //! components where each component can either be
5 //!
6 //! 1. a string _value_ that contains actual UTF-8 string content,
7 //! 2. a string _ID_ that contains a reference to another entry, or
8 //! 3. a terminator tag which marks the end of a component list.
9 //!
10 //! The string _content_ of an entry is defined as the concatenation of the
11 //! content of its components. The content of a string value is its actual
12 //! UTF-8 bytes. The content of a string ID is the contents of the entry
13 //! it references.
14 //!
15 //! The byte-level encoding of component lists uses the structure of UTF-8 in
16 //! order to save space:
17 //!
18 //! - A valid UTF-8 codepoint never starts with the byte `0xFE`. We make use
19 //! of this fact by letting all string ID components start with this `0xFE`
20 //! prefix. Thus when we parse the contents of a value we know to stop if
21 //! we encounter this byte.
22 //!
23 //! - A valid UTF-8 string cannot contain the `0xFF` byte. Thus we can safely
24 //! use `0xFF` as our component list terminator.
25 //!
26 //! The sample composite string ["abc", ID(42), "def", TERMINATOR] would thus be
27 //! encoded as:
28 //!
29 //! ```ignore
30 //! ['a', 'b' , 'c', 254, 42, 0, 0, 0, 'd', 'e', 'f', 255]
31 //! ^^^^^^^^^^^^^^^^ ^^^
32 //! string ID with 0xFE prefix terminator (0xFF)
33 //! ```
34 //!
35 //! As you can see string IDs are encoded in little endian format.
36 //!
37 //! ----------------------------------------------------------------------------
38 //!
39 //! Each string in the table is referred to via a `StringId`. `StringId`s may
40 //! be generated in two ways:
41 //!
42 //! 1. Calling `StringTableBuilder::alloc()` which returns the `StringId` for
43 //! the allocated string.
44 //! 2. Calling `StringId::new_virtual()` to create a "virtual" `StringId` that
45 //! later can be mapped to an actual string via
46 //! `StringTableBuilder::map_virtual_to_concrete_string()`.
47 //!
48 //! String IDs allow you to deduplicate strings by allocating a string
49 //! once and then referring to it by id over and over. This is a useful trick
50 //! for strings which are recorded many times and it can significantly reduce
51 //! the size of profile trace files.
52 //!
53 //! `StringId`s are partitioned according to type:
54 //!
55 //! > [0 .. MAX_VIRTUAL_STRING_ID, METADATA_STRING_ID, .. ]
56 //!
57 //! From `0` to `MAX_VIRTUAL_STRING_ID` are the allowed values for virtual strings.
58 //! After `MAX_VIRTUAL_STRING_ID`, there is one string id (`METADATA_STRING_ID`)
59 //! which is used internally by `measureme` to record additional metadata about
60 //! the profiling session. After `METADATA_STRING_ID` are all other `StringId`
61 //! values.
62
63 use crate::file_header::{
64 write_file_header, FILE_MAGIC_STRINGTABLE_DATA, FILE_MAGIC_STRINGTABLE_INDEX,
65 };
66 use crate::serialization::Addr;
67 use crate::serialization::SerializationSink;
68 use std::{error::Error, sync::Arc};
69
70 /// A `StringId` is used to identify a string in the `StringTable`. It is
71 /// either a regular `StringId`, meaning that it contains the absolute address
72 /// of a string within the string table data. Or it is "virtual", which means
73 /// that the address it points to is resolved via the string table index data,
74 /// that maps virtual `StringId`s to addresses.
75 #[derive(Clone, Copy, Eq, PartialEq, Debug, Hash)]
76 #[repr(C)]
77 pub struct StringId(u32);
78
79 impl StringId {
80 pub const INVALID: StringId = StringId(INVALID_STRING_ID);
81
82 #[inline]
83 pub fn new(id: u32) -> StringId {
84 StringId(id)
85 }
86
87 #[inline]
88 pub fn new_virtual(id: u32) -> StringId {
89 assert!(id <= MAX_USER_VIRTUAL_STRING_ID);
90 StringId(id)
91 }
92
93 #[inline]
94 pub fn is_virtual(self) -> bool {
95 self.0 <= METADATA_STRING_ID
96 }
97
98 #[inline]
99 pub fn as_u32(self) -> u32 {
100 self.0
101 }
102
103 #[inline]
104 pub fn from_addr(addr: Addr) -> StringId {
105 let id = addr.0.checked_add(FIRST_REGULAR_STRING_ID).unwrap();
106 StringId::new(id)
107 }
108
109 #[inline]
110 pub fn to_addr(self) -> Addr {
111 Addr(self.0.checked_sub(FIRST_REGULAR_STRING_ID).unwrap())
112 }
113 }
114
115 // See module-level documentation for more information on the encoding.
116 pub const TERMINATOR: u8 = 0xFF;
117 pub const STRING_REF_TAG: u8 = 0xFE;
118 pub const STRING_REF_ENCODED_SIZE: usize = 5;
119
120 /// The maximum id value a virtual string may be.
121 const MAX_USER_VIRTUAL_STRING_ID: u32 = 100_000_000;
122
123 /// The id of the profile metadata string entry.
124 pub const METADATA_STRING_ID: u32 = MAX_USER_VIRTUAL_STRING_ID + 1;
125
126 /// Some random string ID that we make sure cannot be generated or assigned to.
127 const INVALID_STRING_ID: u32 = METADATA_STRING_ID + 1;
128
129 pub const FIRST_REGULAR_STRING_ID: u32 = INVALID_STRING_ID + 1;
130
131 /// Write-only version of the string table
132 pub struct StringTableBuilder {
133 data_sink: Arc<SerializationSink>,
134 index_sink: Arc<SerializationSink>,
135 }
136
137 /// Anything that implements `SerializableString` can be written to a
138 /// `StringTable`.
139 pub trait SerializableString {
140 fn serialized_size(&self) -> usize;
141 fn serialize(&self, bytes: &mut [u8]);
142 }
143
144 // A single string is encoded as `[UTF-8 bytes][TERMINATOR]`
145 impl SerializableString for str {
146 #[inline]
147 fn serialized_size(&self) -> usize {
148 self.len() + // actual bytes
149 1 // terminator
150 }
151
152 #[inline]
153 fn serialize(&self, bytes: &mut [u8]) {
154 let last_byte_index = bytes.len() - 1;
155 bytes[0..last_byte_index].copy_from_slice(self.as_bytes());
156 bytes[last_byte_index] = TERMINATOR;
157 }
158 }
159
160 /// A single component of a string. Used for building composite table entries.
161 pub enum StringComponent<'s> {
162 Value(&'s str),
163 Ref(StringId),
164 }
165
166 impl<'s> StringComponent<'s> {
167 #[inline]
168 fn serialized_size(&self) -> usize {
169 match *self {
170 StringComponent::Value(s) => s.len(),
171 StringComponent::Ref(_) => STRING_REF_ENCODED_SIZE,
172 }
173 }
174
175 #[inline]
176 fn serialize<'b>(&self, bytes: &'b mut [u8]) -> &'b mut [u8] {
177 match *self {
178 StringComponent::Value(s) => {
179 bytes[..s.len()].copy_from_slice(s.as_bytes());
180 &mut bytes[s.len()..]
181 }
182 StringComponent::Ref(string_id) => {
183 // The code below assumes we use a 5-byte encoding for string
184 // refs, where the first byte is STRING_REF_TAG and the
185 // following 4 bytes are a little-endian u32 string ID value.
186 assert!(STRING_REF_ENCODED_SIZE == 5);
187
188 bytes[0] = STRING_REF_TAG;
189 &mut bytes[1..5].copy_from_slice(&string_id.0.to_le_bytes());
190 &mut bytes[5..]
191 }
192 }
193 }
194 }
195
196 impl<'a> SerializableString for [StringComponent<'a>] {
197 #[inline]
198 fn serialized_size(&self) -> usize {
199 self.iter().map(|c| c.serialized_size()).sum::<usize>() + // size of components
200 1 // terminator
201 }
202
203 #[inline]
204 fn serialize(&self, mut bytes: &mut [u8]) {
205 assert!(bytes.len() == self.serialized_size());
206 for component in self.iter() {
207 bytes = component.serialize(bytes);
208 }
209
210 // Assert that we used the exact number of bytes we anticipated.
211 assert!(bytes.len() == 1);
212 bytes[0] = TERMINATOR;
213 }
214 }
215
216 macro_rules! impl_serializable_string_for_fixed_size {
217 ($n:expr) => {
218 impl<'a> SerializableString for [StringComponent<'a>; $n] {
219 #[inline(always)]
220 fn serialized_size(&self) -> usize {
221 (&self[..]).serialized_size()
222 }
223
224 #[inline(always)]
225 fn serialize(&self, bytes: &mut [u8]) {
226 (&self[..]).serialize(bytes);
227 }
228 }
229 };
230 }
231
232 impl_serializable_string_for_fixed_size!(0);
233 impl_serializable_string_for_fixed_size!(1);
234 impl_serializable_string_for_fixed_size!(2);
235 impl_serializable_string_for_fixed_size!(3);
236 impl_serializable_string_for_fixed_size!(4);
237 impl_serializable_string_for_fixed_size!(5);
238 impl_serializable_string_for_fixed_size!(6);
239 impl_serializable_string_for_fixed_size!(7);
240 impl_serializable_string_for_fixed_size!(8);
241 impl_serializable_string_for_fixed_size!(9);
242 impl_serializable_string_for_fixed_size!(10);
243 impl_serializable_string_for_fixed_size!(11);
244 impl_serializable_string_for_fixed_size!(12);
245 impl_serializable_string_for_fixed_size!(13);
246 impl_serializable_string_for_fixed_size!(14);
247 impl_serializable_string_for_fixed_size!(15);
248 impl_serializable_string_for_fixed_size!(16);
249
250 fn serialize_index_entry(sink: &SerializationSink, id: StringId, addr: Addr) {
251 sink.write_atomic(8, |bytes| {
252 bytes[0..4].copy_from_slice(&id.0.to_le_bytes());
253 bytes[4..8].copy_from_slice(&addr.0.to_le_bytes());
254 });
255 }
256
257 impl StringTableBuilder {
258 pub fn new(
259 data_sink: Arc<SerializationSink>,
260 index_sink: Arc<SerializationSink>,
261 ) -> Result<StringTableBuilder, Box<dyn Error + Send + Sync>> {
262 // The first thing in every stream we generate must be the stream header.
263 write_file_header(&mut data_sink.as_std_write(), FILE_MAGIC_STRINGTABLE_DATA)?;
264 write_file_header(&mut index_sink.as_std_write(), FILE_MAGIC_STRINGTABLE_INDEX)?;
265
266 Ok(StringTableBuilder {
267 data_sink,
268 index_sink,
269 })
270 }
271
272 /// Creates a mapping so that `virtual_id` will resolve to the contents of
273 /// `concrete_id` when reading the string table.
274 pub fn map_virtual_to_concrete_string(&self, virtual_id: StringId, concrete_id: StringId) {
275 // This assertion does not use `is_virtual` on purpose because that
276 // would also allow to overwrite `METADATA_STRING_ID`.
277 assert!(virtual_id.0 <= MAX_USER_VIRTUAL_STRING_ID);
278 serialize_index_entry(&*self.index_sink, virtual_id, concrete_id.to_addr());
279 }
280
281 pub fn bulk_map_virtual_to_single_concrete_string<I>(
282 &self,
283 virtual_ids: I,
284 concrete_id: StringId,
285 ) where
286 I: Iterator<Item = StringId> + ExactSizeIterator,
287 {
288 // TODO: Index data encoding could have a special bulk mode that assigns
289 // multiple StringIds to the same addr, so we don't have to repeat
290 // the `concrete_id` over and over.
291
292 type MappingEntry = [u32; 2];
293 assert!(std::mem::size_of::<MappingEntry>() == 8);
294
295 let to_addr_le = concrete_id.to_addr().0.to_le();
296
297 let serialized: Vec<MappingEntry> = virtual_ids
298 .map(|from| {
299 let id = from.0;
300 assert!(id <= MAX_USER_VIRTUAL_STRING_ID);
301 [id.to_le(), to_addr_le]
302 })
303 .collect();
304
305 let num_bytes = serialized.len() * std::mem::size_of::<MappingEntry>();
306 let byte_ptr = serialized.as_ptr() as *const u8;
307
308 let bytes = unsafe { std::slice::from_raw_parts(byte_ptr, num_bytes) };
309
310 self.index_sink.write_bytes_atomic(bytes);
311 }
312
313 pub(crate) fn alloc_metadata<STR: SerializableString + ?Sized>(&self, s: &STR) {
314 let concrete_id = self.alloc(s);
315 let virtual_id = StringId(METADATA_STRING_ID);
316 assert!(virtual_id.is_virtual());
317 serialize_index_entry(&*self.index_sink, virtual_id, concrete_id.to_addr());
318 }
319
320 pub fn alloc<STR: SerializableString + ?Sized>(&self, s: &STR) -> StringId {
321 let size_in_bytes = s.serialized_size();
322 let addr = self.data_sink.write_atomic(size_in_bytes, |mem| {
323 s.serialize(mem);
324 });
325
326 StringId::from_addr(addr)
327 }
328 }