3 /// A representation of byte oriented equivalence classes.
5 /// This is used in an FSM to reduce the size of the transition table. This can
6 /// have a particularly large impact not only on the total size of an FSM, but
7 /// also on compile times.
9 pub struct ByteClasses([u8; 256]);
12 /// Creates a new set of equivalence classes where all bytes are mapped to
14 pub fn empty() -> ByteClasses
{
18 /// Creates a new set of equivalence classes where each byte belongs to
19 /// its own equivalence class.
20 pub fn singletons() -> ByteClasses
{
21 let mut classes
= ByteClasses
::empty();
23 classes
.set(i
as u8, i
as u8);
28 /// Set the equivalence class for the given byte.
30 pub fn set(&mut self, byte
: u8, class
: u8) {
31 self.0[byte
as usize] = class
;
34 /// Get the equivalence class for the given byte.
36 pub fn get(&self, byte
: u8) -> u8 {
37 // SAFETY: This is safe because all dense transitions have
38 // exactly 256 elements, so all u8 values are valid indices.
42 /// Return the total number of elements in the alphabet represented by
43 /// these equivalence classes. Equivalently, this returns the total number
44 /// of equivalence classes.
46 pub fn alphabet_len(&self) -> usize {
47 self.0[255] as usize + 1
50 /// Returns true if and only if every byte in this class maps to its own
51 /// equivalence class. Equivalently, there are 256 equivalence classes
52 /// and each class contains exactly one byte.
54 pub fn is_singleton(&self) -> bool
{
55 self.alphabet_len() == 256
58 /// Returns an iterator over a sequence of representative bytes from each
59 /// equivalence class. Namely, this yields exactly N items, where N is
60 /// equivalent to the number of equivalence classes. Each item is an
61 /// arbitrary byte drawn from each equivalence class.
63 /// This is useful when one is determinizing an NFA and the NFA's alphabet
64 /// hasn't been converted to equivalence classes yet. Picking an arbitrary
65 /// byte from each equivalence class then permits a full exploration of
66 /// the NFA instead of using every possible byte value.
67 pub fn representatives(&self) -> ByteClassRepresentatives
<'_
> {
68 ByteClassRepresentatives { classes: self, byte: 0, last_class: None }
71 /// Returns all of the bytes in the given equivalence class.
73 /// The second element in the tuple indicates the number of elements in
75 fn elements(&self, equiv
: u8) -> ([u8; 256], usize) {
76 let (mut array
, mut len
) = ([0; 256], 0);
78 if self.get(b
as u8) == equiv
{
87 impl fmt
::Debug
for ByteClasses
{
88 fn fmt(&self, f
: &mut fmt
::Formatter
<'_
>) -> fmt
::Result
{
89 if self.is_singleton() {
90 write
!(f
, "ByteClasses({{singletons}})")
92 write
!(f
, "ByteClasses(")?
;
93 for equiv
in 0..self.alphabet_len() {
94 let (members
, len
) = self.elements(equiv
as u8);
95 write
!(f
, " {} => {:?}", equiv
, &members
[..len
])?
;
102 /// An iterator over representative bytes from each equivalence class.
104 pub struct ByteClassRepresentatives
<'a
> {
105 classes
: &'a ByteClasses
,
107 last_class
: Option
<u8>,
110 impl<'a
> Iterator
for ByteClassRepresentatives
<'a
> {
113 fn next(&mut self) -> Option
<u8> {
114 while self.byte
< 256 {
115 let byte
= self.byte
as u8;
116 let class
= self.classes
.get(byte
);
119 if self.last_class
!= Some(class
) {
120 self.last_class
= Some(class
);
128 /// A byte class builder keeps track of an *approximation* of equivalence
129 /// classes of bytes during NFA construction. That is, every byte in an
130 /// equivalence class cannot discriminate between a match and a non-match.
132 /// For example, in the literals `abc` and `xyz`, the bytes [\x00-`], [d-w]
133 /// and [{-\xFF] never discriminate between a match and a non-match, precisely
134 /// because they never occur in the literals anywhere.
136 /// Note though that this does not necessarily compute the minimal set of
137 /// equivalence classes. For example, in the literals above, the byte ranges
138 /// [\x00-`], [d-w] and [{-\xFF] are all treated as distinct equivalence
139 /// classes even though they could be treated a single class. The reason for
140 /// this is implementation complexity. In the future, we should endeavor to
141 /// compute the minimal equivalence classes since they can have a rather large
142 /// impact on the size of the DFA.
144 /// The representation here is 256 booleans, all initially set to false. Each
145 /// boolean maps to its corresponding byte based on position. A `true` value
146 /// indicates the end of an equivalence class, where its corresponding byte
147 /// and all of the bytes corresponding to all previous contiguous `false`
148 /// values are in the same equivalence class.
150 /// This particular representation only permits contiguous ranges of bytes to
151 /// be in the same equivalence class, which means that we can never discover
152 /// the true minimal set of equivalence classes.
154 pub struct ByteClassBuilder(Vec
<bool
>);
156 impl ByteClassBuilder
{
157 /// Create a new builder of byte classes where all bytes are part of the
158 /// same equivalence class.
159 pub fn new() -> ByteClassBuilder
{
160 ByteClassBuilder(vec
![false; 256])
163 /// Indicate the the range of byte given (inclusive) can discriminate a
164 /// match between it and all other bytes outside of the range.
165 pub fn set_range(&mut self, start
: u8, end
: u8) {
166 debug_assert
!(start
<= end
);
168 self.0[start
as usize - 1] = true;
170 self.0[end
as usize] = true;
173 /// Build byte classes that map all byte values to their corresponding
174 /// equivalence class. The last mapping indicates the largest equivalence
175 /// class identifier (which is never bigger than 255).
176 pub fn build(&self) -> ByteClasses
{
177 let mut classes
= ByteClasses
::empty();
181 classes
.set(i
as u8, class
as u8);
186 class
= class
.checked_add(1).unwrap();
200 let mut set
= ByteClassBuilder
::new();
201 set
.set_range(b'a'
, b'z'
);
203 let classes
= set
.build();
204 assert_eq
!(classes
.get(0), 0);
205 assert_eq
!(classes
.get(1), 0);
206 assert_eq
!(classes
.get(2), 0);
207 assert_eq
!(classes
.get(b'a'
- 1), 0);
208 assert_eq
!(classes
.get(b'a'
), 1);
209 assert_eq
!(classes
.get(b'm'
), 1);
210 assert_eq
!(classes
.get(b'z'
), 1);
211 assert_eq
!(classes
.get(b'z'
+ 1), 2);
212 assert_eq
!(classes
.get(254), 2);
213 assert_eq
!(classes
.get(255), 2);
215 let mut set
= ByteClassBuilder
::new();
218 let classes
= set
.build();
219 assert_eq
!(classes
.get(0), 0);
220 assert_eq
!(classes
.get(1), 0);
221 assert_eq
!(classes
.get(2), 0);
222 assert_eq
!(classes
.get(3), 1);
223 assert_eq
!(classes
.get(4), 2);
224 assert_eq
!(classes
.get(5), 2);
225 assert_eq
!(classes
.get(6), 2);
226 assert_eq
!(classes
.get(7), 3);
227 assert_eq
!(classes
.get(255), 3);
231 fn full_byte_classes() {
232 let mut set
= ByteClassBuilder
::new();
234 set
.set_range(i
as u8, i
as u8);
236 assert_eq
!(set
.build().alphabet_len(), 256);