]> git.proxmox.com Git - cargo.git/blob - vendor/regex/src/literal/teddy_avx2/imp.rs
New upstream version 0.37.0
[cargo.git] / vendor / regex / src / literal / teddy_avx2 / imp.rs
1 /*!
2 This is the Teddy searcher, but ported to AVX2.
3
4 See the module comments in the SSSE3 Teddy searcher for a more in depth
5 explanation of how this algorithm works. For the most part, this port is
6 basically the same as the SSSE3 version, but using 256-bit vectors instead of
7 128-bit vectors, which increases throughput.
8 */
9
10 use std::cmp;
11
12 use aho_corasick::{self, AhoCorasick, AhoCorasickBuilder};
13 use syntax::hir::literal::Literals;
14
15 use vector::avx2::{AVX2VectorBuilder, u8x32};
16
17 /// Corresponds to the number of bytes read at a time in the haystack.
18 const BLOCK_SIZE: usize = 32;
19
20 /// Match reports match information.
21 #[derive(Debug, Clone)]
22 pub struct Match {
23 /// The index of the pattern that matched. The index is in correspondence
24 /// with the order of the patterns given at construction.
25 pub pat: usize,
26 /// The start byte offset of the match.
27 pub start: usize,
28 /// The end byte offset of the match. This is always `start + pat.len()`.
29 pub end: usize,
30 }
31
32 /// A SIMD accelerated multi substring searcher.
33 #[derive(Debug, Clone)]
34 pub struct Teddy {
35 /// A builder for AVX2 empowered vectors.
36 vb: AVX2VectorBuilder,
37 /// A list of substrings to match.
38 pats: Vec<Vec<u8>>,
39 /// An Aho-Corasick automaton of the patterns. We use this when we need to
40 /// search pieces smaller than the Teddy block size.
41 ac: AhoCorasick,
42 /// A set of 8 buckets. Each bucket corresponds to a single member of a
43 /// bitset. A bucket contains zero or more substrings. This is useful
44 /// when the number of substrings exceeds 8, since our bitsets cannot have
45 /// more than 8 members.
46 buckets: Vec<Vec<usize>>,
47 /// Our set of masks. There's one mask for each byte in the fingerprint.
48 masks: Masks,
49 }
50
51 impl Teddy {
52 /// Returns true if and only if Teddy is supported on this platform.
53 ///
54 /// If this returns `false`, then `Teddy::new(...)` is guaranteed to
55 /// return `None`.
56 pub fn available() -> bool {
57 AVX2VectorBuilder::new().is_some()
58 }
59
60 /// Create a new `Teddy` multi substring matcher.
61 ///
62 /// If a `Teddy` matcher could not be created (e.g., `pats` is empty or has
63 /// an empty substring), then `None` is returned.
64 pub fn new(pats: &Literals) -> Option<Teddy> {
65 let vb = match AVX2VectorBuilder::new() {
66 None => return None,
67 Some(vb) => vb,
68 };
69 if !Teddy::available() {
70 return None;
71 }
72
73 let pats: Vec<_> = pats.literals().iter().map(|p|p.to_vec()).collect();
74 let min_len = pats.iter().map(|p| p.len()).min().unwrap_or(0);
75 // Don't allow any empty patterns and require that we have at
76 // least one pattern.
77 if min_len < 1 {
78 return None;
79 }
80 // Pick the largest mask possible, but no larger than 3.
81 let nmasks = cmp::min(3, min_len);
82 let mut masks = Masks::new(vb, nmasks);
83 let mut buckets = vec![vec![]; 8];
84 // Assign a substring to each bucket, and add the bucket's bitfield to
85 // the appropriate position in the mask.
86 for (pati, pat) in pats.iter().enumerate() {
87 let bucket = pati % 8;
88 buckets[bucket].push(pati);
89 masks.add(bucket as u8, pat);
90 }
91 let ac = AhoCorasickBuilder::new()
92 .match_kind(aho_corasick::MatchKind::LeftmostFirst)
93 .dfa(true)
94 .prefilter(false)
95 .build(&pats);
96 Some(Teddy {
97 vb: vb,
98 pats: pats.to_vec(),
99 ac: ac,
100 buckets: buckets,
101 masks: masks,
102 })
103 }
104
105 /// Returns all of the substrings matched by this `Teddy`.
106 pub fn patterns(&self) -> &[Vec<u8>] {
107 &self.pats
108 }
109
110 /// Returns the number of substrings in this matcher.
111 pub fn len(&self) -> usize {
112 self.pats.len()
113 }
114
115 /// Returns the approximate size on the heap used by this matcher.
116 pub fn approximate_size(&self) -> usize {
117 self.pats.iter().fold(0, |a, b| a + b.len())
118 }
119
120 /// Searches `haystack` for the substrings in this `Teddy`. If a match was
121 /// found, then it is returned. Otherwise, `None` is returned.
122 pub fn find(&self, haystack: &[u8]) -> Option<Match> {
123 // This is safe because the only way we can construct a Teddy type
124 // is if AVX2 is available.
125 unsafe { self.find_impl(haystack) }
126 }
127
128 #[allow(unused_attributes)]
129 #[target_feature(enable = "avx2")]
130 unsafe fn find_impl(&self, haystack: &[u8]) -> Option<Match> {
131 // If our haystack is smaller than the block size, then fall back to
132 // a naive brute force search.
133 if haystack.is_empty() || haystack.len() < (BLOCK_SIZE + 2) {
134 return self.slow(haystack, 0);
135 }
136 match self.masks.len() {
137 0 => None,
138 1 => self.find1(haystack),
139 2 => self.find2(haystack),
140 3 => self.find3(haystack),
141 _ => unreachable!(),
142 }
143 }
144
145 /// `find1` is used when there is only 1 mask. This is the easy case and is
146 /// pretty much as described in the module documentation.
147 #[inline(always)]
148 fn find1(&self, haystack: &[u8]) -> Option<Match> {
149 let mut pos = 0;
150 let zero = self.vb.u8x32_splat(0);
151 let len = haystack.len();
152 debug_assert!(len >= BLOCK_SIZE);
153 while pos <= len - BLOCK_SIZE {
154 let h = unsafe {
155 // I tried and failed to eliminate bounds checks in safe code.
156 // This is safe because of our loop invariant: pos is always
157 // <= len-32.
158 let p = haystack.get_unchecked(pos..);
159 self.vb.u8x32_load_unchecked_unaligned(p)
160 };
161 // N.B. `res0` is our `C` in the module documentation.
162 let res0 = self.masks.members1(h);
163 // Only do expensive verification if there are any non-zero bits.
164 let bitfield = res0.ne(zero).movemask();
165 if bitfield != 0 {
166 if let Some(m) = self.verify(haystack, pos, res0, bitfield) {
167 return Some(m);
168 }
169 }
170 pos += BLOCK_SIZE;
171 }
172 self.slow(haystack, pos)
173 }
174
175 /// `find2` is used when there are 2 masks, e.g., the fingerprint is 2 bytes
176 /// long.
177 #[inline(always)]
178 fn find2(&self, haystack: &[u8]) -> Option<Match> {
179 // This is an exotic way to right shift a SIMD vector across lanes.
180 // See below at use for more details.
181 let zero = self.vb.u8x32_splat(0);
182 let len = haystack.len();
183 // The previous value of `C` (from the module documentation) for the
184 // *first* byte in the fingerprint. On subsequent iterations, we take
185 // the last bitset from the previous `C` and insert it into the first
186 // position of the current `C`, shifting all other bitsets to the right
187 // one lane. This causes `C` for the first byte to line up with `C` for
188 // the second byte, so that they can be `AND`'d together.
189 let mut prev0 = self.vb.u8x32_splat(0xFF);
190 let mut pos = 1;
191 debug_assert!(len >= BLOCK_SIZE);
192 while pos <= len - BLOCK_SIZE {
193 let h = unsafe {
194 // I tried and failed to eliminate bounds checks in safe code.
195 // This is safe because of our loop invariant: pos is always
196 // <= len-32.
197 let p = haystack.get_unchecked(pos..);
198 self.vb.u8x32_load_unchecked_unaligned(p)
199 };
200 let (res0, res1) = self.masks.members2(h);
201
202 // Do this:
203 //
204 // (prev0 << 15) | (res0 >> 1)
205 //
206 // This lets us line up our C values for each byte.
207 let res0prev0 = res0.alignr_15(prev0);
208
209 // `AND`'s our `C` values together.
210 let res = res0prev0.and(res1);
211 prev0 = res0;
212
213 let bitfield = res.ne(zero).movemask();
214 if bitfield != 0 {
215 let pos = pos.checked_sub(1).unwrap();
216 if let Some(m) = self.verify(haystack, pos, res, bitfield) {
217 return Some(m);
218 }
219 }
220 pos += BLOCK_SIZE;
221 }
222 // The windowing above doesn't check the last byte in the last
223 // window, so start the slow search at the last byte of the last
224 // window.
225 self.slow(haystack, pos.checked_sub(1).unwrap())
226 }
227
228 /// `find3` is used when there are 3 masks, e.g., the fingerprint is 3 bytes
229 /// long.
230 ///
231 /// N.B. This is a straight-forward extrapolation of `find2`. The only
232 /// difference is that we need to keep track of two previous values of `C`,
233 /// since we now need to align for three bytes.
234 #[inline(always)]
235 fn find3(&self, haystack: &[u8]) -> Option<Match> {
236 let zero = self.vb.u8x32_splat(0);
237 let len = haystack.len();
238 let mut prev0 = self.vb.u8x32_splat(0xFF);
239 let mut prev1 = self.vb.u8x32_splat(0xFF);
240 let mut pos = 2;
241
242 while pos <= len - BLOCK_SIZE {
243 let h = unsafe {
244 // I tried and failed to eliminate bounds checks in safe code.
245 // This is safe because of our loop invariant: pos is always
246 // <= len-32.
247 let p = haystack.get_unchecked(pos..);
248 self.vb.u8x32_load_unchecked_unaligned(p)
249 };
250 let (res0, res1, res2) = self.masks.members3(h);
251
252 let res0prev0 = res0.alignr_14(prev0);
253 let res1prev1 = res1.alignr_15(prev1);
254 let res = res0prev0.and(res1prev1).and(res2);
255
256 prev0 = res0;
257 prev1 = res1;
258
259 let bitfield = res.ne(zero).movemask();
260 if bitfield != 0 {
261 let pos = pos.checked_sub(2).unwrap();
262 if let Some(m) = self.verify(haystack, pos, res, bitfield) {
263 return Some(m);
264 }
265 }
266 pos += BLOCK_SIZE;
267 }
268 // The windowing above doesn't check the last two bytes in the last
269 // window, so start the slow search at the penultimate byte of the
270 // last window.
271 // self.slow(haystack, pos.saturating_sub(2))
272 self.slow(haystack, pos.checked_sub(2).unwrap())
273 }
274
275 /// Runs the verification procedure on `res` (i.e., `C` from the module
276 /// documentation), where the haystack block starts at `pos` in
277 /// `haystack`. `bitfield` has ones in the bit positions that `res` has
278 /// non-zero bytes.
279 ///
280 /// If a match exists, it returns the first one.
281 #[inline(always)]
282 fn verify(
283 &self,
284 haystack: &[u8],
285 pos: usize,
286 res: u8x32,
287 mut bitfield: u32,
288 ) -> Option<Match> {
289 let patterns = res.bytes();
290 while bitfield != 0 {
291 // The next offset, relative to pos, where some fingerprint
292 // matched.
293 let byte_pos = bitfield.trailing_zeros() as usize;
294 bitfield &= !(1 << byte_pos);
295
296 // Offset relative to the beginning of the haystack.
297 let start = pos + byte_pos;
298
299 // The bitfield telling us which patterns had fingerprints that
300 // match at this starting position.
301 let mut patterns = patterns[byte_pos];
302 while patterns != 0 {
303 let bucket = patterns.trailing_zeros() as usize;
304 patterns &= !(1 << bucket);
305
306 // Actual substring search verification.
307 if let Some(m) = self.verify_bucket(haystack, bucket, start) {
308 return Some(m);
309 }
310 }
311 }
312
313 None
314 }
315
316 /// Verifies whether any substring in the given bucket matches in haystack
317 /// at the given starting position.
318 #[inline(always)]
319 fn verify_bucket(
320 &self,
321 haystack: &[u8],
322 bucket: usize,
323 start: usize,
324 ) -> Option<Match> {
325 // This cycles through the patterns in the bucket in the order that
326 // the patterns were given. Therefore, we guarantee leftmost-first
327 // semantics.
328 for &pati in &self.buckets[bucket] {
329 let pat = &*self.pats[pati];
330 if start + pat.len() > haystack.len() {
331 continue;
332 }
333 if pat == &haystack[start..start + pat.len()] {
334 return Some(Match {
335 pat: pati,
336 start: start,
337 end: start + pat.len(),
338 });
339 }
340 }
341 None
342 }
343
344 /// Slow substring search through all patterns in this matcher.
345 ///
346 /// This is used when we don't have enough bytes in the haystack for our
347 /// block based approach.
348 #[inline(never)]
349 fn slow(&self, haystack: &[u8], pos: usize) -> Option<Match> {
350 self.ac.find(&haystack[pos..]).map(|m| {
351 Match {
352 pat: m.pattern(),
353 start: pos + m.start(),
354 end: pos + m.end(),
355 }
356 })
357 }
358 }
359
360 /// A list of masks. This has length equal to the length of the fingerprint.
361 /// The length of the fingerprint is always `min(3, len(smallest_substring))`.
362 #[derive(Debug, Clone)]
363 struct Masks {
364 vb: AVX2VectorBuilder,
365 masks: [Mask; 3],
366 size: usize,
367 }
368
369 impl Masks {
370 /// Create a new set of masks of size `n`, where `n` corresponds to the
371 /// number of bytes in a fingerprint.
372 fn new(vb: AVX2VectorBuilder, n: usize) -> Masks {
373 Masks {
374 vb: vb,
375 masks: [Mask::new(vb), Mask::new(vb), Mask::new(vb)],
376 size: n,
377 }
378 }
379
380 /// Returns the number of masks.
381 fn len(&self) -> usize {
382 self.size
383 }
384
385 /// Adds the given pattern to the given bucket. The bucket should be a
386 /// power of `2 <= 2^7`.
387 fn add(&mut self, bucket: u8, pat: &[u8]) {
388 for i in 0..self.len() {
389 self.masks[i].add(bucket, pat[i]);
390 }
391 }
392
393 /// Finds the fingerprints that are in the given haystack block. i.e., this
394 /// returns `C` as described in the module documentation.
395 ///
396 /// More specifically, `for i in 0..16` and `j in 0..8, C[i][j] == 1` if and
397 /// only if `haystack_block[i]` corresponds to a fingerprint that is part
398 /// of a pattern in bucket `j`.
399 #[inline(always)]
400 fn members1(&self, haystack_block: u8x32) -> u8x32 {
401 let masklo = self.vb.u8x32_splat(0xF);
402 let hlo = haystack_block.and(masklo);
403 let hhi = haystack_block.bit_shift_right_4().and(masklo);
404
405 self.masks[0].lo.shuffle(hlo).and(self.masks[0].hi.shuffle(hhi))
406 }
407
408 /// Like members1, but computes C for the first and second bytes in the
409 /// fingerprint.
410 #[inline(always)]
411 fn members2(&self, haystack_block: u8x32) -> (u8x32, u8x32) {
412 let masklo = self.vb.u8x32_splat(0xF);
413 let hlo = haystack_block.and(masklo);
414 let hhi = haystack_block.bit_shift_right_4().and(masklo);
415
416 let res0 =
417 self.masks[0].lo.shuffle(hlo).and(self.masks[0].hi.shuffle(hhi));
418 let res1 =
419 self.masks[1].lo.shuffle(hlo).and(self.masks[1].hi.shuffle(hhi));
420 (res0, res1)
421 }
422
423 /// Like `members1`, but computes `C` for the first, second and third bytes
424 /// in the fingerprint.
425 #[inline(always)]
426 fn members3(&self, haystack_block: u8x32) -> (u8x32, u8x32, u8x32) {
427 let masklo = self.vb.u8x32_splat(0xF);
428 let hlo = haystack_block.and(masklo);
429 let hhi = haystack_block.bit_shift_right_4().and(masklo);
430
431 let res0 =
432 self.masks[0].lo.shuffle(hlo).and(self.masks[0].hi.shuffle(hhi));
433 let res1 =
434 self.masks[1].lo.shuffle(hlo).and(self.masks[1].hi.shuffle(hhi));
435 let res2 =
436 self.masks[2].lo.shuffle(hlo).and(self.masks[2].hi.shuffle(hhi));
437 (res0, res1, res2)
438 }
439 }
440
441 /// A single mask.
442 #[derive(Debug, Clone, Copy)]
443 struct Mask {
444 /// Bitsets for the low nybbles in a fingerprint.
445 lo: u8x32,
446 /// Bitsets for the high nybbles in a fingerprint.
447 hi: u8x32,
448 }
449
450 impl Mask {
451 /// Create a new mask with no members.
452 fn new(vb: AVX2VectorBuilder) -> Mask {
453 Mask {
454 lo: vb.u8x32_splat(0),
455 hi: vb.u8x32_splat(0),
456 }
457 }
458
459 /// Adds the given byte to the given bucket.
460 fn add(&mut self, bucket: u8, byte: u8) {
461 // Split our byte into two nybbles, and add each nybble to our
462 // mask.
463 let byte_lo = (byte & 0xF) as usize;
464 let byte_hi = (byte >> 4) as usize;
465
466 {
467 let mut lo_bytes = self.lo.bytes();
468 let lo = lo_bytes[byte_lo] | ((1 << bucket) as u8);
469 lo_bytes[byte_lo] = lo;
470 lo_bytes[byte_lo + 16] = lo;
471 self.lo.replace_bytes(lo_bytes);
472 }
473
474 {
475 let mut hi_bytes = self.hi.bytes();
476 let hi = hi_bytes[byte_hi] | ((1 << bucket) as u8);
477 hi_bytes[byte_hi] = hi;
478 hi_bytes[byte_hi + 16] = hi;
479 self.hi.replace_bytes(hi_bytes);
480 }
481 }
482 }