2 This is the Teddy searcher, but ported to AVX2.
4 See the module comments in the SSSE3 Teddy searcher for a more in depth
5 explanation of how this algorithm works. For the most part, this port is
6 basically the same as the SSSE3 version, but using 256-bit vectors instead of
7 128-bit vectors, which increases throughput.
12 use aho_corasick
::{self, AhoCorasick, AhoCorasickBuilder}
;
13 use syntax
::hir
::literal
::Literals
;
15 use vector
::avx2
::{AVX2VectorBuilder, u8x32}
;
17 /// Corresponds to the number of bytes read at a time in the haystack.
18 const BLOCK_SIZE
: usize = 32;
20 /// Match reports match information.
21 #[derive(Debug, Clone)]
23 /// The index of the pattern that matched. The index is in correspondence
24 /// with the order of the patterns given at construction.
26 /// The start byte offset of the match.
28 /// The end byte offset of the match. This is always `start + pat.len()`.
32 /// A SIMD accelerated multi substring searcher.
33 #[derive(Debug, Clone)]
35 /// A builder for AVX2 empowered vectors.
36 vb
: AVX2VectorBuilder
,
37 /// A list of substrings to match.
39 /// An Aho-Corasick automaton of the patterns. We use this when we need to
40 /// search pieces smaller than the Teddy block size.
42 /// A set of 8 buckets. Each bucket corresponds to a single member of a
43 /// bitset. A bucket contains zero or more substrings. This is useful
44 /// when the number of substrings exceeds 8, since our bitsets cannot have
45 /// more than 8 members.
46 buckets
: Vec
<Vec
<usize>>,
47 /// Our set of masks. There's one mask for each byte in the fingerprint.
52 /// Returns true if and only if Teddy is supported on this platform.
54 /// If this returns `false`, then `Teddy::new(...)` is guaranteed to
56 pub fn available() -> bool
{
57 AVX2VectorBuilder
::new().is_some()
60 /// Create a new `Teddy` multi substring matcher.
62 /// If a `Teddy` matcher could not be created (e.g., `pats` is empty or has
63 /// an empty substring), then `None` is returned.
64 pub fn new(pats
: &Literals
) -> Option
<Teddy
> {
65 let vb
= match AVX2VectorBuilder
::new() {
69 if !Teddy
::available() {
73 let pats
: Vec
<_
> = pats
.literals().iter().map(|p
|p
.to_vec()).collect();
74 let min_len
= pats
.iter().map(|p
| p
.len()).min().unwrap_or(0);
75 // Don't allow any empty patterns and require that we have at
80 // Pick the largest mask possible, but no larger than 3.
81 let nmasks
= cmp
::min(3, min_len
);
82 let mut masks
= Masks
::new(vb
, nmasks
);
83 let mut buckets
= vec
![vec
![]; 8];
84 // Assign a substring to each bucket, and add the bucket's bitfield to
85 // the appropriate position in the mask.
86 for (pati
, pat
) in pats
.iter().enumerate() {
87 let bucket
= pati
% 8;
88 buckets
[bucket
].push(pati
);
89 masks
.add(bucket
as u8, pat
);
91 let ac
= AhoCorasickBuilder
::new()
92 .match_kind(aho_corasick
::MatchKind
::LeftmostFirst
)
105 /// Returns all of the substrings matched by this `Teddy`.
106 pub fn patterns(&self) -> &[Vec
<u8>] {
110 /// Returns the number of substrings in this matcher.
111 pub fn len(&self) -> usize {
115 /// Returns the approximate size on the heap used by this matcher.
116 pub fn approximate_size(&self) -> usize {
117 self.pats
.iter().fold(0, |a
, b
| a
+ b
.len())
120 /// Searches `haystack` for the substrings in this `Teddy`. If a match was
121 /// found, then it is returned. Otherwise, `None` is returned.
122 pub fn find(&self, haystack
: &[u8]) -> Option
<Match
> {
123 // This is safe because the only way we can construct a Teddy type
124 // is if AVX2 is available.
125 unsafe { self.find_impl(haystack) }
128 #[allow(unused_attributes)]
129 #[target_feature(enable = "avx2")]
130 unsafe fn find_impl(&self, haystack
: &[u8]) -> Option
<Match
> {
131 // If our haystack is smaller than the block size, then fall back to
132 // a naive brute force search.
133 if haystack
.is_empty() || haystack
.len() < (BLOCK_SIZE
+ 2) {
134 return self.slow(haystack
, 0);
136 match self.masks
.len() {
138 1 => self.find1(haystack
),
139 2 => self.find2(haystack
),
140 3 => self.find3(haystack
),
145 /// `find1` is used when there is only 1 mask. This is the easy case and is
146 /// pretty much as described in the module documentation.
148 fn find1(&self, haystack
: &[u8]) -> Option
<Match
> {
150 let zero
= self.vb
.u8x32_splat(0);
151 let len
= haystack
.len();
152 debug_assert
!(len
>= BLOCK_SIZE
);
153 while pos
<= len
- BLOCK_SIZE
{
155 // I tried and failed to eliminate bounds checks in safe code.
156 // This is safe because of our loop invariant: pos is always
158 let p
= haystack
.get_unchecked(pos
..);
159 self.vb
.u8x32_load_unchecked_unaligned(p
)
161 // N.B. `res0` is our `C` in the module documentation.
162 let res0
= self.masks
.members1(h
);
163 // Only do expensive verification if there are any non-zero bits.
164 let bitfield
= res0
.ne(zero
).movemask();
166 if let Some(m
) = self.verify(haystack
, pos
, res0
, bitfield
) {
172 self.slow(haystack
, pos
)
175 /// `find2` is used when there are 2 masks, e.g., the fingerprint is 2 bytes
178 fn find2(&self, haystack
: &[u8]) -> Option
<Match
> {
179 // This is an exotic way to right shift a SIMD vector across lanes.
180 // See below at use for more details.
181 let zero
= self.vb
.u8x32_splat(0);
182 let len
= haystack
.len();
183 // The previous value of `C` (from the module documentation) for the
184 // *first* byte in the fingerprint. On subsequent iterations, we take
185 // the last bitset from the previous `C` and insert it into the first
186 // position of the current `C`, shifting all other bitsets to the right
187 // one lane. This causes `C` for the first byte to line up with `C` for
188 // the second byte, so that they can be `AND`'d together.
189 let mut prev0
= self.vb
.u8x32_splat(0xFF);
191 debug_assert
!(len
>= BLOCK_SIZE
);
192 while pos
<= len
- BLOCK_SIZE
{
194 // I tried and failed to eliminate bounds checks in safe code.
195 // This is safe because of our loop invariant: pos is always
197 let p
= haystack
.get_unchecked(pos
..);
198 self.vb
.u8x32_load_unchecked_unaligned(p
)
200 let (res0
, res1
) = self.masks
.members2(h
);
204 // (prev0 << 15) | (res0 >> 1)
206 // This lets us line up our C values for each byte.
207 let res0prev0
= res0
.alignr_15(prev0
);
209 // `AND`'s our `C` values together.
210 let res
= res0prev0
.and(res1
);
213 let bitfield
= res
.ne(zero
).movemask();
215 let pos
= pos
.checked_sub(1).unwrap();
216 if let Some(m
) = self.verify(haystack
, pos
, res
, bitfield
) {
222 // The windowing above doesn't check the last byte in the last
223 // window, so start the slow search at the last byte of the last
225 self.slow(haystack
, pos
.checked_sub(1).unwrap())
228 /// `find3` is used when there are 3 masks, e.g., the fingerprint is 3 bytes
231 /// N.B. This is a straight-forward extrapolation of `find2`. The only
232 /// difference is that we need to keep track of two previous values of `C`,
233 /// since we now need to align for three bytes.
235 fn find3(&self, haystack
: &[u8]) -> Option
<Match
> {
236 let zero
= self.vb
.u8x32_splat(0);
237 let len
= haystack
.len();
238 let mut prev0
= self.vb
.u8x32_splat(0xFF);
239 let mut prev1
= self.vb
.u8x32_splat(0xFF);
242 while pos
<= len
- BLOCK_SIZE
{
244 // I tried and failed to eliminate bounds checks in safe code.
245 // This is safe because of our loop invariant: pos is always
247 let p
= haystack
.get_unchecked(pos
..);
248 self.vb
.u8x32_load_unchecked_unaligned(p
)
250 let (res0
, res1
, res2
) = self.masks
.members3(h
);
252 let res0prev0
= res0
.alignr_14(prev0
);
253 let res1prev1
= res1
.alignr_15(prev1
);
254 let res
= res0prev0
.and(res1prev1
).and(res2
);
259 let bitfield
= res
.ne(zero
).movemask();
261 let pos
= pos
.checked_sub(2).unwrap();
262 if let Some(m
) = self.verify(haystack
, pos
, res
, bitfield
) {
268 // The windowing above doesn't check the last two bytes in the last
269 // window, so start the slow search at the penultimate byte of the
271 // self.slow(haystack, pos.saturating_sub(2))
272 self.slow(haystack
, pos
.checked_sub(2).unwrap())
275 /// Runs the verification procedure on `res` (i.e., `C` from the module
276 /// documentation), where the haystack block starts at `pos` in
277 /// `haystack`. `bitfield` has ones in the bit positions that `res` has
280 /// If a match exists, it returns the first one.
289 let patterns
= res
.bytes();
290 while bitfield
!= 0 {
291 // The next offset, relative to pos, where some fingerprint
293 let byte_pos
= bitfield
.trailing_zeros() as usize;
294 bitfield
&= !(1 << byte_pos
);
296 // Offset relative to the beginning of the haystack.
297 let start
= pos
+ byte_pos
;
299 // The bitfield telling us which patterns had fingerprints that
300 // match at this starting position.
301 let mut patterns
= patterns
[byte_pos
];
302 while patterns
!= 0 {
303 let bucket
= patterns
.trailing_zeros() as usize;
304 patterns
&= !(1 << bucket
);
306 // Actual substring search verification.
307 if let Some(m
) = self.verify_bucket(haystack
, bucket
, start
) {
316 /// Verifies whether any substring in the given bucket matches in haystack
317 /// at the given starting position.
325 // This cycles through the patterns in the bucket in the order that
326 // the patterns were given. Therefore, we guarantee leftmost-first
328 for &pati
in &self.buckets
[bucket
] {
329 let pat
= &*self.pats
[pati
];
330 if start
+ pat
.len() > haystack
.len() {
333 if pat
== &haystack
[start
..start
+ pat
.len()] {
337 end
: start
+ pat
.len(),
344 /// Slow substring search through all patterns in this matcher.
346 /// This is used when we don't have enough bytes in the haystack for our
347 /// block based approach.
349 fn slow(&self, haystack
: &[u8], pos
: usize) -> Option
<Match
> {
350 self.ac
.find(&haystack
[pos
..]).map(|m
| {
353 start
: pos
+ m
.start(),
360 /// A list of masks. This has length equal to the length of the fingerprint.
361 /// The length of the fingerprint is always `min(3, len(smallest_substring))`.
362 #[derive(Debug, Clone)]
364 vb
: AVX2VectorBuilder
,
370 /// Create a new set of masks of size `n`, where `n` corresponds to the
371 /// number of bytes in a fingerprint.
372 fn new(vb
: AVX2VectorBuilder
, n
: usize) -> Masks
{
375 masks
: [Mask
::new(vb
), Mask
::new(vb
), Mask
::new(vb
)],
380 /// Returns the number of masks.
381 fn len(&self) -> usize {
385 /// Adds the given pattern to the given bucket. The bucket should be a
386 /// power of `2 <= 2^7`.
387 fn add(&mut self, bucket
: u8, pat
: &[u8]) {
388 for i
in 0..self.len() {
389 self.masks
[i
].add(bucket
, pat
[i
]);
393 /// Finds the fingerprints that are in the given haystack block. i.e., this
394 /// returns `C` as described in the module documentation.
396 /// More specifically, `for i in 0..16` and `j in 0..8, C[i][j] == 1` if and
397 /// only if `haystack_block[i]` corresponds to a fingerprint that is part
398 /// of a pattern in bucket `j`.
400 fn members1(&self, haystack_block
: u8x32
) -> u8x32
{
401 let masklo
= self.vb
.u8x32_splat(0xF);
402 let hlo
= haystack_block
.and(masklo
);
403 let hhi
= haystack_block
.bit_shift_right_4().and(masklo
);
405 self.masks
[0].lo
.shuffle(hlo
).and(self.masks
[0].hi
.shuffle(hhi
))
408 /// Like members1, but computes C for the first and second bytes in the
411 fn members2(&self, haystack_block
: u8x32
) -> (u8x32
, u8x32
) {
412 let masklo
= self.vb
.u8x32_splat(0xF);
413 let hlo
= haystack_block
.and(masklo
);
414 let hhi
= haystack_block
.bit_shift_right_4().and(masklo
);
417 self.masks
[0].lo
.shuffle(hlo
).and(self.masks
[0].hi
.shuffle(hhi
));
419 self.masks
[1].lo
.shuffle(hlo
).and(self.masks
[1].hi
.shuffle(hhi
));
423 /// Like `members1`, but computes `C` for the first, second and third bytes
424 /// in the fingerprint.
426 fn members3(&self, haystack_block
: u8x32
) -> (u8x32
, u8x32
, u8x32
) {
427 let masklo
= self.vb
.u8x32_splat(0xF);
428 let hlo
= haystack_block
.and(masklo
);
429 let hhi
= haystack_block
.bit_shift_right_4().and(masklo
);
432 self.masks
[0].lo
.shuffle(hlo
).and(self.masks
[0].hi
.shuffle(hhi
));
434 self.masks
[1].lo
.shuffle(hlo
).and(self.masks
[1].hi
.shuffle(hhi
));
436 self.masks
[2].lo
.shuffle(hlo
).and(self.masks
[2].hi
.shuffle(hhi
));
442 #[derive(Debug, Clone, Copy)]
444 /// Bitsets for the low nybbles in a fingerprint.
446 /// Bitsets for the high nybbles in a fingerprint.
451 /// Create a new mask with no members.
452 fn new(vb
: AVX2VectorBuilder
) -> Mask
{
454 lo
: vb
.u8x32_splat(0),
455 hi
: vb
.u8x32_splat(0),
459 /// Adds the given byte to the given bucket.
460 fn add(&mut self, bucket
: u8, byte
: u8) {
461 // Split our byte into two nybbles, and add each nybble to our
463 let byte_lo
= (byte
& 0xF) as usize;
464 let byte_hi
= (byte
>> 4) as usize;
467 let mut lo_bytes
= self.lo
.bytes();
468 let lo
= lo_bytes
[byte_lo
] | ((1 << bucket
) as u8);
469 lo_bytes
[byte_lo
] = lo
;
470 lo_bytes
[byte_lo
+ 16] = lo
;
471 self.lo
.replace_bytes(lo_bytes
);
475 let mut hi_bytes
= self.hi
.bytes();
476 let hi
= hi_bytes
[byte_hi
] | ((1 << bucket
) as u8);
477 hi_bytes
[byte_hi
] = hi
;
478 hi_bytes
[byte_hi
+ 16] = hi
;
479 self.hi
.replace_bytes(hi_bytes
);