]>
Commit | Line | Data |
---|---|---|
f20569fa XL |
1 | #[cfg(target_arch = "x86")] |
2 | use std::arch::x86::{ | |
3 | __m128i, | |
4 | _mm_and_si128, | |
5 | _mm_cmpeq_epi8, | |
ed00b5ec | 6 | _mm_cvtsi128_si32, |
f20569fa XL |
7 | _mm_loadu_si128, |
8 | _mm_sad_epu8, | |
9 | _mm_set1_epi8, | |
10 | _mm_setzero_si128, | |
ed00b5ec | 11 | _mm_shuffle_epi32, |
f20569fa XL |
12 | _mm_sub_epi8, |
13 | _mm_xor_si128, | |
14 | }; | |
15 | ||
16 | #[cfg(target_arch = "x86_64")] | |
17 | use std::arch::x86_64::{ | |
18 | __m128i, | |
19 | _mm_and_si128, | |
20 | _mm_cmpeq_epi8, | |
ed00b5ec | 21 | _mm_cvtsi128_si32, |
f20569fa XL |
22 | _mm_loadu_si128, |
23 | _mm_sad_epu8, | |
24 | _mm_set1_epi8, | |
25 | _mm_setzero_si128, | |
ed00b5ec | 26 | _mm_shuffle_epi32, |
f20569fa XL |
27 | _mm_sub_epi8, |
28 | _mm_xor_si128, | |
29 | }; | |
30 | ||
/// Broadcasts an unsigned byte into every 8-bit lane of a 128-bit vector.
///
/// This is the unsigned counterpart of `_mm_set1_epi8`: the bit pattern of `a`
/// is kept as-is and only reinterpreted as `i8` to satisfy the intrinsic.
///
/// # Safety
///
/// The executing CPU must support SSE2.
#[target_feature(enable = "sse2")]
pub unsafe fn _mm_set1_epu8(a: u8) -> __m128i {
    // Same eight bits, signed type: values above 127 become negative.
    let signed = a as i8;
    _mm_set1_epi8(signed)
}
35 | ||
/// Lane-wise "not equal" comparison of two vectors of sixteen bytes.
///
/// Each output lane is `0xFF` where the corresponding lanes of `a` and `b`
/// differ and `0x00` where they are equal.
///
/// # Safety
///
/// The executing CPU must support SSE2.
#[target_feature(enable = "sse2")]
pub unsafe fn mm_cmpneq_epi8(a: __m128i, b: __m128i) -> __m128i {
    let equal_lanes = _mm_cmpeq_epi8(a, b);
    let all_bits_set = _mm_set1_epi8(-1);
    // XOR with all-ones inverts the equality mask.
    _mm_xor_si128(equal_lanes, all_bits_set)
}
40 | ||
/// Sliding byte mask: sixteen clear bytes followed by sixteen set bytes.
///
/// A 16-byte load starting at index `r` (with `0 <= r <= 16`) yields a vector
/// whose last `r` lanes are `0xFF` and whose remaining lanes are zero. The
/// counting routines use this to keep only the trailing bytes of their final,
/// overlapping load.
const MASK: [u8; 32] = [
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
];
45 | ||
/// Loads sixteen bytes from `slice`, starting at byte index `offset`, using an
/// unaligned load.
///
/// # Safety
///
/// The executing CPU must support SSE2, and the caller must guarantee that
/// `offset + 16 <= slice.len()`; otherwise the load reads past the end of the
/// slice. Debug builds check this bound and panic before performing the load.
#[target_feature(enable = "sse2")]
unsafe fn mm_from_offset(slice: &[u8], offset: usize) -> __m128i {
    debug_assert!(
        slice.len() >= 16 && offset <= slice.len() - 16,
        "16-byte load at offset {} overruns slice of length {}",
        offset,
        slice.len()
    );
    // `add` takes the `usize` index directly, avoiding the lossy `as isize` cast.
    _mm_loadu_si128(slice.as_ptr().add(offset) as *const __m128i)
}
50 | ||
/// Adds up the sixteen unsigned byte lanes of `u8s`.
///
/// # Safety
///
/// The executing CPU must support SSE2.
#[target_feature(enable = "sse2")]
unsafe fn sum(u8s: &__m128i) -> usize {
    // A sum-of-absolute-differences against zero collapses each 8-byte half
    // into one small total stored at the bottom of its 64-bit lane.
    let half_totals = _mm_sad_epu8(*u8s, _mm_setzero_si128());
    let lower = _mm_cvtsi128_si32(half_totals);
    // 0b10_10_10_10 picks 32-bit lane 2 (the start of the upper 64-bit half)
    // for every output lane, so the upper total lands in lane 0.
    let upper = _mm_cvtsi128_si32(_mm_shuffle_epi32(half_totals, 0b10_10_10_10));
    (lower + upper) as usize
}
56 | ||
57 | #[target_feature(enable = "sse2")] | |
58 | pub unsafe fn chunk_count(haystack: &[u8], needle: u8) -> usize { | |
59 | assert!(haystack.len() >= 16); | |
60 | ||
61 | let mut offset = 0; | |
62 | let mut count = 0; | |
63 | ||
64 | let needles = _mm_set1_epu8(needle); | |
65 | ||
66 | // 4080 | |
67 | while haystack.len() >= offset + 16 * 255 { | |
68 | let mut counts = _mm_setzero_si128(); | |
69 | for _ in 0..255 { | |
70 | counts = _mm_sub_epi8( | |
71 | counts, | |
72 | _mm_cmpeq_epi8(mm_from_offset(haystack, offset), needles) | |
73 | ); | |
74 | offset += 16; | |
75 | } | |
76 | count += sum(&counts); | |
77 | } | |
78 | ||
79 | // 2048 | |
80 | if haystack.len() >= offset + 16 * 128 { | |
81 | let mut counts = _mm_setzero_si128(); | |
82 | for _ in 0..128 { | |
83 | counts = _mm_sub_epi8( | |
84 | counts, | |
85 | _mm_cmpeq_epi8(mm_from_offset(haystack, offset), needles) | |
86 | ); | |
87 | offset += 16; | |
88 | } | |
89 | count += sum(&counts); | |
90 | } | |
91 | ||
92 | // 16 | |
93 | let mut counts = _mm_setzero_si128(); | |
94 | for i in 0..(haystack.len() - offset) / 16 { | |
95 | counts = _mm_sub_epi8( | |
96 | counts, | |
97 | _mm_cmpeq_epi8(mm_from_offset(haystack, offset + i * 16), needles) | |
98 | ); | |
99 | } | |
100 | if haystack.len() % 16 != 0 { | |
101 | counts = _mm_sub_epi8( | |
102 | counts, | |
103 | _mm_and_si128( | |
104 | _mm_cmpeq_epi8(mm_from_offset(haystack, haystack.len() - 16), needles), | |
105 | mm_from_offset(&MASK, haystack.len() % 16) | |
106 | ) | |
107 | ); | |
108 | } | |
109 | count += sum(&counts); | |
110 | ||
111 | count | |
112 | } | |
113 | ||
114 | #[target_feature(enable = "sse2")] | |
115 | unsafe fn is_leading_utf8_byte(u8s: __m128i) -> __m128i { | |
116 | mm_cmpneq_epi8(_mm_and_si128(u8s, _mm_set1_epu8(0b1100_0000)), _mm_set1_epu8(0b1000_0000)) | |
117 | } | |
118 | ||
119 | #[target_feature(enable = "sse2")] | |
120 | pub unsafe fn chunk_num_chars(utf8_chars: &[u8]) -> usize { | |
121 | assert!(utf8_chars.len() >= 16); | |
122 | ||
123 | let mut offset = 0; | |
124 | let mut count = 0; | |
125 | ||
126 | // 4080 | |
127 | while utf8_chars.len() >= offset + 16 * 255 { | |
128 | let mut counts = _mm_setzero_si128(); | |
129 | ||
130 | for _ in 0..255 { | |
131 | counts = _mm_sub_epi8( | |
132 | counts, | |
133 | is_leading_utf8_byte(mm_from_offset(utf8_chars, offset)) | |
134 | ); | |
135 | offset += 16; | |
136 | } | |
137 | count += sum(&counts); | |
138 | } | |
139 | ||
140 | // 2048 | |
141 | if utf8_chars.len() >= offset + 16 * 128 { | |
142 | let mut counts = _mm_setzero_si128(); | |
143 | for _ in 0..128 { | |
144 | counts = _mm_sub_epi8( | |
145 | counts, | |
146 | is_leading_utf8_byte(mm_from_offset(utf8_chars, offset)) | |
147 | ); | |
148 | offset += 16; | |
149 | } | |
150 | count += sum(&counts); | |
151 | } | |
152 | ||
153 | // 16 | |
154 | let mut counts = _mm_setzero_si128(); | |
155 | for i in 0..(utf8_chars.len() - offset) / 16 { | |
156 | counts = _mm_sub_epi8( | |
157 | counts, | |
158 | is_leading_utf8_byte(mm_from_offset(utf8_chars, offset + i * 16)) | |
159 | ); | |
160 | } | |
161 | if utf8_chars.len() % 16 != 0 { | |
162 | counts = _mm_sub_epi8( | |
163 | counts, | |
164 | _mm_and_si128( | |
165 | is_leading_utf8_byte(mm_from_offset(utf8_chars, utf8_chars.len() - 16)), | |
166 | mm_from_offset(&MASK, utf8_chars.len() % 16) | |
167 | ) | |
168 | ); | |
169 | } | |
170 | count += sum(&counts); | |
171 | ||
172 | count | |
173 | } |