]> git.proxmox.com Git - rustc.git/blame - vendor/measureme/src/stringtable.rs
New upstream version 1.41.1+dfsg1
[rustc.git] / vendor / measureme / src / stringtable.rs
CommitLineData
48663c56
XL
1//! A string table implementation with a tree-like encoding.
2//!
60c5eb7d 3//! Each entry in the table represents a string and is encoded as a list of
48663c56
XL
4//! components where each component can either be
5//!
60c5eb7d
XL
6//! 1. a string _value_ that contains actual UTF-8 string content,
7//! 2. a string _ID_ that contains a reference to another entry, or
8//! 3. a terminator tag which marks the end of a component list.
48663c56 9//!
60c5eb7d
XL
10//! The string _content_ of an entry is defined as the concatenation of the
11//! content of its components. The content of a string value is its actual
12//! UTF-8 bytes. The content of a string ID is the contents of the entry
48663c56 13//! it references.
e74abb32 14//!
60c5eb7d
XL
15//! The byte-level encoding of component lists uses the structure of UTF-8 in
16//! order to save space:
e74abb32 17//!
60c5eb7d
XL
18//! - A valid UTF-8 codepoint never starts with the bits `10` as this bit
19//! prefix is reserved for bytes in the middle of a UTF-8 codepoint byte
20//! sequence. We make use of this fact by letting all string ID components
21//! start with this `10` prefix. Thus when we parse the contents of a value
22//! we know to stop if the start byte of the next codepoint has this prefix.
23//!
24//! - A valid UTF-8 string cannot contain the `0xFF` byte and since string IDs
25//! start with `10` as described above, they also cannot start with a `0xFF`
26//! byte. Thus we can safely use `0xFF` as our component list terminator.
27//!
28//! The sample composite string ["abc", ID(42), "def", TERMINATOR] would thus be
29//! encoded as:
30//!
31//! ```ignore
32//! ['a', 'b' , 'c', 128, 0, 0, 42, 'd', 'e', 'f', 255]
33//! ^^^^^^^^^^^^^ ^^^
34//! string ID 42 with 0b10 prefix terminator (0xFF)
35//! ```
36//!
37//! As you can see string IDs are encoded in big endian format so that highest
38//! order bits show up in the first byte we encounter.
39//!
40//! ----------------------------------------------------------------------------
41//!
42//! Each string in the table is referred to via a `StringId`. `StringId`s may
43//! be generated in two ways:
44//!
45//! 1. Calling `StringTable::alloc()` which returns the `StringId` for the
46//! allocated string.
47//! 2. Calling `StringTable::alloc_with_reserved_id()` and `StringId::reserved()`.
48//!
49//! String IDs allow you to deduplicate strings by allocating a string
50//! once and then referring to it by id over and over. This is a useful trick
51//! for strings which are recorded many times and it can significantly reduce
52//! the size of profile trace files.
e74abb32
XL
53//!
54//! `StringId`s are partitioned according to type:
55//!
56//! > [0 .. MAX_PRE_RESERVED_STRING_ID, METADATA_STRING_ID, .. ]
57//!
58//! From `0` to `MAX_PRE_RESERVED_STRING_ID` are the allowed values for reserved strings.
59//! After `MAX_PRE_RESERVED_STRING_ID`, there is one string id (`METADATA_STRING_ID`) which is used
60//! internally by `measureme` to record additional metadata about the profiling session.
61//! After `METADATA_STRING_ID` are all other `StringId` values.
60c5eb7d 62//!
e74abb32
XL
63
64use crate::file_header::{
60c5eb7d 65 write_file_header, FILE_MAGIC_STRINGTABLE_DATA, FILE_MAGIC_STRINGTABLE_INDEX,
e74abb32 66};
48663c56 67use crate::serialization::{Addr, SerializationSink};
60c5eb7d 68use byteorder::{BigEndian, ByteOrder, LittleEndian};
48663c56
XL
69use std::sync::atomic::{AtomicU32, Ordering};
70use std::sync::Arc;
71
72/// A `StringId` is used to identify a string in the `StringTable`.
73#[derive(Clone, Copy, Eq, PartialEq, Debug, Hash)]
74#[repr(C)]
75pub struct StringId(u32);
76
77impl StringId {
78 #[inline]
79 pub fn reserved(id: u32) -> StringId {
60c5eb7d 80 assert!(id == id & STRING_ID_MASK);
48663c56
XL
81 StringId(id)
82 }
48663c56 83
60c5eb7d
XL
84 #[inline]
85 pub fn as_u32(self) -> u32 {
86 self.0
87 }
88}
48663c56 89
60c5eb7d
XL
90// See module-level documentation for more information on the encoding.
91pub const TERMINATOR: u8 = 0xFF;
48663c56 92
60c5eb7d
XL
93// All 1s except for the two highest bits.
94pub const MAX_STRING_ID: u32 = 0x3FFF_FFFF;
95pub const STRING_ID_MASK: u32 = 0x3FFF_FFFF;
48663c56 96
e74abb32 97/// The maximum id value a prereserved string may be.
60c5eb7d 98const MAX_PRE_RESERVED_STRING_ID: u32 = MAX_STRING_ID / 2;
48663c56 99
e74abb32 100/// The id of the profile metadata string entry.
60c5eb7d 101pub const METADATA_STRING_ID: u32 = MAX_PRE_RESERVED_STRING_ID + 1;
e74abb32 102
48663c56
XL
103/// Write-only version of the string table
104pub struct StringTableBuilder<S: SerializationSink> {
105 data_sink: Arc<S>,
106 index_sink: Arc<S>,
e74abb32 107 id_counter: AtomicU32, // initialized to METADATA_STRING_ID + 1
48663c56
XL
108}
109
110/// Anything that implements `SerializableString` can be written to a
111/// `StringTable`.
112pub trait SerializableString {
113 fn serialized_size(&self) -> usize;
114 fn serialize(&self, bytes: &mut [u8]);
115}
116
60c5eb7d 117// A single string is encoded as `[UTF-8 bytes][TERMINATOR]`
48663c56
XL
118impl SerializableString for str {
119 #[inline]
120 fn serialized_size(&self) -> usize {
48663c56
XL
121 self.len() + // actual bytes
122 1 // terminator
123 }
124
125 #[inline]
126 fn serialize(&self, bytes: &mut [u8]) {
48663c56 127 let last_byte_index = bytes.len() - 1;
60c5eb7d
XL
128 bytes[0..last_byte_index].copy_from_slice(self.as_bytes());
129 bytes[last_byte_index] = TERMINATOR;
48663c56
XL
130 }
131}
132
133/// A single component of a string. Used for building composite table entries.
134pub enum StringComponent<'s> {
135 Value(&'s str),
136 Ref(StringId),
137}
138
60c5eb7d
XL
139impl<'s> StringComponent<'s> {
140 #[inline]
141 fn serialized_size(&self) -> usize {
142 match *self {
143 StringComponent::Value(s) => s.len(),
144 StringComponent::Ref(_) => 4,
145 }
146 }
147
148 #[inline]
149 fn serialize<'b>(&self, bytes: &'b mut [u8]) -> &'b mut [u8] {
150 match *self {
151 StringComponent::Value(s) => {
152 bytes[..s.len()].copy_from_slice(s.as_bytes());
153 &mut bytes[s.len()..]
154 }
155 StringComponent::Ref(string_id) => {
156 assert!(string_id.0 == string_id.0 & STRING_ID_MASK);
157 let tagged = string_id.0 | (1u32 << 31);
158
159 BigEndian::write_u32(&mut bytes[0..4], tagged);
160 &mut bytes[4..]
161 }
162 }
163 }
164}
165
48663c56
XL
166impl<'a> SerializableString for [StringComponent<'a>] {
167 #[inline]
168 fn serialized_size(&self) -> usize {
60c5eb7d
XL
169 self.iter().map(|c| c.serialized_size()).sum::<usize>() + // size of components
170 1 // terminator
48663c56
XL
171 }
172
173 #[inline]
60c5eb7d
XL
174 fn serialize(&self, mut bytes: &mut [u8]) {
175 assert!(bytes.len() == self.serialized_size());
176 for component in self.iter() {
177 bytes = component.serialize(bytes);
178 }
179
180 // Assert that we used the exact number of bytes we anticipated.
181 assert!(bytes.len() == 1);
182 bytes[0] = TERMINATOR;
48663c56
XL
183 }
184}
185
60c5eb7d
XL
186macro_rules! impl_serializable_string_for_fixed_size {
187 ($n:expr) => {
188 impl<'a> SerializableString for [StringComponent<'a>; $n] {
189 #[inline(always)]
190 fn serialized_size(&self) -> usize {
191 (&self[..]).serialized_size()
192 }
193
194 #[inline(always)]
195 fn serialize(&self, bytes: &mut [u8]) {
196 (&self[..]).serialize(bytes);
197 }
198 }
199 };
200}
201
202impl_serializable_string_for_fixed_size!(0);
203impl_serializable_string_for_fixed_size!(1);
204impl_serializable_string_for_fixed_size!(2);
205impl_serializable_string_for_fixed_size!(3);
206impl_serializable_string_for_fixed_size!(4);
207impl_serializable_string_for_fixed_size!(5);
208impl_serializable_string_for_fixed_size!(6);
209impl_serializable_string_for_fixed_size!(7);
210impl_serializable_string_for_fixed_size!(8);
211impl_serializable_string_for_fixed_size!(9);
212impl_serializable_string_for_fixed_size!(10);
213impl_serializable_string_for_fixed_size!(11);
214impl_serializable_string_for_fixed_size!(12);
215impl_serializable_string_for_fixed_size!(13);
216impl_serializable_string_for_fixed_size!(14);
217impl_serializable_string_for_fixed_size!(15);
218impl_serializable_string_for_fixed_size!(16);
219
48663c56
XL
220fn serialize_index_entry<S: SerializationSink>(sink: &S, id: StringId, addr: Addr) {
221 sink.write_atomic(8, |bytes| {
222 LittleEndian::write_u32(&mut bytes[0..4], id.0);
223 LittleEndian::write_u32(&mut bytes[4..8], addr.0);
224 });
225}
226
48663c56
XL
227impl<S: SerializationSink> StringTableBuilder<S> {
228 pub fn new(data_sink: Arc<S>, index_sink: Arc<S>) -> StringTableBuilder<S> {
48663c56
XL
229 // The first thing in every file we generate must be the file header.
230 write_file_header(&*data_sink, FILE_MAGIC_STRINGTABLE_DATA);
231 write_file_header(&*index_sink, FILE_MAGIC_STRINGTABLE_INDEX);
232
233 StringTableBuilder {
234 data_sink,
235 index_sink,
e74abb32 236 id_counter: AtomicU32::new(METADATA_STRING_ID + 1),
48663c56
XL
237 }
238 }
239
48663c56
XL
240 pub fn alloc_with_reserved_id<STR: SerializableString + ?Sized>(
241 &self,
242 id: StringId,
243 s: &STR,
244 ) -> StringId {
245 assert!(id.0 <= MAX_PRE_RESERVED_STRING_ID);
246 self.alloc_unchecked(id, s);
247 id
248 }
249
e74abb32
XL
250 pub(crate) fn alloc_metadata<STR: SerializableString + ?Sized>(&self, s: &STR) -> StringId {
251 let id = StringId(METADATA_STRING_ID);
252 self.alloc_unchecked(id, s);
253 id
254 }
255
48663c56
XL
256 pub fn alloc<STR: SerializableString + ?Sized>(&self, s: &STR) -> StringId {
257 let id = StringId(self.id_counter.fetch_add(1, Ordering::SeqCst));
60c5eb7d
XL
258 assert!(id.0 > METADATA_STRING_ID);
259 assert!(id.0 <= MAX_STRING_ID);
48663c56
XL
260 self.alloc_unchecked(id, s);
261 id
262 }
263
264 #[inline]
265 fn alloc_unchecked<STR: SerializableString + ?Sized>(&self, id: StringId, s: &STR) {
266 let size_in_bytes = s.serialized_size();
267 let addr = self.data_sink.write_atomic(size_in_bytes, |mem| {
268 s.serialize(mem);
269 });
270
271 serialize_index_entry(&*self.index_sink, id, addr);
272 }
273}