]>
Commit | Line | Data |
---|---|---|
0531ce1d XL |
1 | //! An example showing runtime dispatch to an architecture-optimized |
2 | //! implementation. | |
3 | //! | |
4 | //! This program implements hex encoding a slice into a predetermined | |
5 | //! destination using various different instruction sets. This selects at | |
6 | //! runtime the most optimized implementation and uses that rather than being | |
7 | //! required to be compiled differently. | |
8 | //! | |
9 | //! You can test out this program via: | |
10 | //! | |
11 | //! echo test | cargo +nightly run --release --example hex -p stdsimd | |
12 | //! | |
13 | //! and you should see `746573740a` get printed out. | |
14 | ||
83c7162d | 15 | #![feature(stdsimd)] |
0531ce1d | 16 | #![cfg_attr(test, feature(test))] |
8faf50e0 XL |
17 | #![cfg_attr( |
18 | feature = "cargo-clippy", | |
19 | allow( | |
20 | result_unwrap_used, print_stdout, option_unwrap_used, shadow_reuse, | |
21 | cast_possible_wrap, cast_sign_loss, missing_docs_in_private_items | |
22 | ) | |
23 | )] | |
0531ce1d XL |
24 | |
25 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] | |
26 | #[macro_use] | |
27 | extern crate stdsimd; | |
28 | ||
29 | #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] | |
30 | extern crate stdsimd; | |
31 | ||
32 | #[cfg(test)] | |
33 | #[macro_use] | |
34 | extern crate quickcheck; | |
35 | ||
0531ce1d | 36 | use std::io::{self, Read}; |
83c7162d | 37 | use std::str; |
0531ce1d XL |
38 | |
39 | #[cfg(target_arch = "x86")] | |
40 | use stdsimd::arch::x86::*; | |
41 | #[cfg(target_arch = "x86_64")] | |
42 | use stdsimd::arch::x86_64::*; | |
43 | ||
44 | fn main() { | |
45 | let mut input = Vec::new(); | |
46 | io::stdin().read_to_end(&mut input).unwrap(); | |
47 | let mut dst = vec![0; 2 * input.len()]; | |
48 | let s = hex_encode(&input, &mut dst).unwrap(); | |
49 | println!("{}", s); | |
50 | } | |
51 | ||
52 | fn hex_encode<'a>(src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> { | |
53 | let len = src.len().checked_mul(2).unwrap(); | |
54 | if dst.len() < len { | |
55 | return Err(len); | |
56 | } | |
57 | ||
58 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] | |
59 | { | |
60 | if is_x86_feature_detected!("avx2") { | |
61 | return unsafe { hex_encode_avx2(src, dst) }; | |
62 | } | |
63 | if is_x86_feature_detected!("sse4.1") { | |
64 | return unsafe { hex_encode_sse41(src, dst) }; | |
65 | } | |
66 | } | |
67 | ||
68 | hex_encode_fallback(src, dst) | |
69 | } | |
70 | ||
/// Hex-encodes `src` into `dst` 32 input bytes at a time using AVX2,
/// delegating the remaining (< 32 byte) tail to the SSE4.1 version.
///
/// # Safety
///
/// The caller must ensure the CPU supports AVX2 (and SSE4.1 for the
/// tail call) and that `dst` holds at least `2 * src.len()` bytes; the
/// raw stores below do not re-check the destination length.
#[target_feature(enable = "avx2")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe fn hex_encode_avx2<'a>(
    mut src: &[u8], dst: &'a mut [u8],
) -> Result<&'a str, usize> {
    let ascii_zero = _mm256_set1_epi8(b'0' as i8);
    let nines = _mm256_set1_epi8(9);
    // Offset chosen so that `nibble + ascii_a` yields 'a'..'f' for 10..15.
    let ascii_a = _mm256_set1_epi8((b'a' - 9 - 1) as i8);
    let and4bits = _mm256_set1_epi8(0xf);

    // Count of input bytes consumed so far.
    let mut i = 0_isize;
    while src.len() >= 32 {
        let invec = _mm256_loadu_si256(src.as_ptr() as *const _);

        // Low and high nibble of every byte. The 64-bit-lane shift is
        // fine here because the 0x0f mask discards any bits that crossed
        // a byte boundary.
        let masked1 = _mm256_and_si256(invec, and4bits);
        let masked2 = _mm256_and_si256(_mm256_srli_epi64(invec, 4), and4bits);

        // return 0xff corresponding to the elements > 9, or 0x00 otherwise
        let cmpmask1 = _mm256_cmpgt_epi8(masked1, nines);
        let cmpmask2 = _mm256_cmpgt_epi8(masked2, nines);

        // add '0' or the offset depending on the masks
        let masked1 = _mm256_add_epi8(
            masked1,
            _mm256_blendv_epi8(ascii_zero, ascii_a, cmpmask1),
        );
        let masked2 = _mm256_add_epi8(
            masked2,
            _mm256_blendv_epi8(ascii_zero, ascii_a, cmpmask2),
        );

        // interleave masked1 and masked2 bytes (high nibble first)
        let res1 = _mm256_unpacklo_epi8(masked2, masked1);
        let res2 = _mm256_unpackhi_epi8(masked2, masked1);

        // Store everything into the right destination now.
        // unpack operates within each 128-bit lane, so the four 16-byte
        // halves are scattered via storeu2 to land contiguously in `dst`.
        let base = dst.as_mut_ptr().offset(i * 2);
        let base1 = base.offset(0) as *mut _;
        let base2 = base.offset(16) as *mut _;
        let base3 = base.offset(32) as *mut _;
        let base4 = base.offset(48) as *mut _;
        _mm256_storeu2_m128i(base3, base1, res1);
        _mm256_storeu2_m128i(base4, base2, res2);
        src = &src[32..];
        i += 32;
    }

    let i = i as usize;
    // Encode whatever is left (< 32 bytes) with the narrower implementation;
    // the dst slice was pre-checked by the caller, so the Err case is moot.
    let _ = hex_encode_sse41(src, &mut dst[i * 2..]);

    Ok(str::from_utf8_unchecked(&dst[..src.len() * 2 + i * 2]))
}
123 | ||
// copied from https://github.com/Matherunner/bin2hex-sse/blob/master/base16_sse4.cpp
/// Hex-encodes `src` into `dst` 16 input bytes at a time using SSE4.1,
/// delegating the remaining (< 16 byte) tail to the scalar fallback.
///
/// # Safety
///
/// The caller must ensure the CPU supports SSE4.1 and that `dst` holds
/// at least `2 * src.len()` bytes; the raw stores below do not re-check
/// the destination length.
#[target_feature(enable = "sse4.1")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe fn hex_encode_sse41<'a>(
    mut src: &[u8], dst: &'a mut [u8],
) -> Result<&'a str, usize> {
    let ascii_zero = _mm_set1_epi8(b'0' as i8);
    let nines = _mm_set1_epi8(9);
    // Offset chosen so that `nibble + ascii_a` yields 'a'..'f' for 10..15.
    let ascii_a = _mm_set1_epi8((b'a' - 9 - 1) as i8);
    let and4bits = _mm_set1_epi8(0xf);

    // Count of input bytes consumed so far.
    let mut i = 0_isize;
    while src.len() >= 16 {
        let invec = _mm_loadu_si128(src.as_ptr() as *const _);

        // Low and high nibble of every byte. The 64-bit-lane shift is
        // fine here because the 0x0f mask discards any bits that crossed
        // a byte boundary.
        let masked1 = _mm_and_si128(invec, and4bits);
        let masked2 = _mm_and_si128(_mm_srli_epi64(invec, 4), and4bits);

        // return 0xff corresponding to the elements > 9, or 0x00 otherwise
        let cmpmask1 = _mm_cmpgt_epi8(masked1, nines);
        let cmpmask2 = _mm_cmpgt_epi8(masked2, nines);

        // add '0' or the offset depending on the masks
        let masked1 = _mm_add_epi8(
            masked1,
            _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask1),
        );
        let masked2 = _mm_add_epi8(
            masked2,
            _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask2),
        );

        // interleave masked1 and masked2 bytes (high nibble first)
        let res1 = _mm_unpacklo_epi8(masked2, masked1);
        let res2 = _mm_unpackhi_epi8(masked2, masked1);

        // Every input byte produces two output bytes, hence `i * 2`.
        _mm_storeu_si128(dst.as_mut_ptr().offset(i * 2) as *mut _, res1);
        _mm_storeu_si128(dst.as_mut_ptr().offset(i * 2 + 16) as *mut _, res2);
        src = &src[16..];
        i += 16;
    }

    let i = i as usize;
    // Encode whatever is left (< 16 bytes) one byte at a time; the dst
    // slice was pre-checked by the caller, so the Err case is moot.
    let _ = hex_encode_fallback(src, &mut dst[i * 2..]);

    Ok(str::from_utf8_unchecked(&dst[..src.len() * 2 + i * 2]))
}
171 | ||
/// Scalar hex encoder: one table lookup per nibble.
///
/// Writes `2 * src.len()` ASCII bytes into the front of `dst` and
/// returns them as a `&str`. Like the SIMD variants, it assumes the
/// caller already verified `dst` is large enough.
fn hex_encode_fallback<'a>(
    src: &[u8], dst: &'a mut [u8],
) -> Result<&'a str, usize> {
    const TABLE: &[u8] = b"0123456789abcdef";

    for (pair, &byte) in dst.chunks_mut(2).zip(src.iter()) {
        pair[0] = TABLE[(byte >> 4) as usize];
        pair[1] = TABLE[(byte & 0xf) as usize];
    }

    // The table only ever emits ASCII, so the output is valid UTF-8.
    unsafe { Ok(str::from_utf8_unchecked(&dst[..src.len() * 2])) }
}
187 | ||
// Run these with `cargo +nightly test --example hex -p stdsimd`
#[cfg(test)]
mod tests {
    use std::iter;

    use super::*;

    /// Asserts that every implementation available on this CPU encodes
    /// `input` to exactly `output`.
    fn test(input: &[u8], output: &str) {
        // Fresh zeroed destination for each implementation under test.
        let tmp = || vec![0; input.len() * 2];

        assert_eq!(hex_encode_fallback(input, &mut tmp()).unwrap(), output);
        assert_eq!(hex_encode(input, &mut tmp()).unwrap(), output);

        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        unsafe {
            if is_x86_feature_detected!("avx2") {
                assert_eq!(
                    hex_encode_avx2(input, &mut tmp()).unwrap(),
                    output
                );
            }
            if is_x86_feature_detected!("sse4.1") {
                assert_eq!(
                    hex_encode_sse41(input, &mut tmp()).unwrap(),
                    output
                );
            }
        }
    }

    #[test]
    fn empty() {
        test(b"", "");
    }

    // 1024 bytes: exercises many full 32-byte AVX2 iterations.
    #[test]
    fn big() {
        test(
            &[0; 1024],
            &iter::repeat('0').take(2048).collect::<String>(),
        );
    }

    // 313 = 9 * 32 + 25: leaves a tail for both SIMD remainders.
    #[test]
    fn odd() {
        test(
            &[0; 313],
            &iter::repeat('0').take(313 * 2).collect::<String>(),
        );
    }

    // 33 bytes: one full AVX2 chunk plus a 1-byte tail, with nonzero
    // values scattered to catch interleaving/ordering mistakes.
    #[test]
    fn avx_works() {
        let mut input = [0; 33];
        input[4] = 3;
        input[16] = 3;
        input[17] = 0x30;
        input[21] = 1;
        input[31] = 0x24;
        test(
            &input,
            "\
             0000000003000000\
             0000000000000000\
             0330000000010000\
             0000000000000024\
             00\
             ",
        );
    }

    quickcheck! {
        fn encode_equals_fallback(input: Vec<u8>) -> bool {
            let mut space1 = vec![0; input.len() * 2];
            let mut space2 = vec![0; input.len() * 2];
            let a = hex_encode(&input, &mut space1).unwrap();
            let b = hex_encode_fallback(&input, &mut space2).unwrap();
            a == b
        }

        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        fn avx_equals_fallback(input: Vec<u8>) -> bool {
            if !is_x86_feature_detected!("avx2") {
                return true
            }
            let mut space1 = vec![0; input.len() * 2];
            let mut space2 = vec![0; input.len() * 2];
            let a = unsafe { hex_encode_avx2(&input, &mut space1).unwrap() };
            let b = hex_encode_fallback(&input, &mut space2).unwrap();
            a == b
        }

        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        fn sse41_equals_fallback(input: Vec<u8>) -> bool {
            // Gate on the feature actually exercised. This previously
            // checked "avx2", which skipped the test entirely on
            // SSE4.1-only machines.
            if !is_x86_feature_detected!("sse4.1") {
                return true
            }
            let mut space1 = vec![0; input.len() * 2];
            let mut space2 = vec![0; input.len() * 2];
            let a = unsafe { hex_encode_sse41(&input, &mut space1).unwrap() };
            let b = hex_encode_fallback(&input, &mut space2).unwrap();
            a == b
        }
    }
}
293 | ||
// Run these with `cargo +nightly bench --example hex -p stdsimd`
#[cfg(test)]
mod benches {
    extern crate rand;
    extern crate test;

    use self::rand::Rng;

    use super::*;

    // One size that fits in cache and one large enough to stress
    // memory bandwidth.
    const SMALL_LEN: usize = 117;
    const LARGE_LEN: usize = 1024 * 1024;

    /// Benchmarks `f` over `len` random input bytes.
    ///
    /// Returning `dst[0]` from the closure keeps the optimizer from
    /// discarding the encoded output as dead.
    fn doit(
        b: &mut test::Bencher, len: usize,
        f: for<'a> unsafe fn(&[u8], &'a mut [u8]) -> Result<&'a str, usize>,
    ) {
        let input = rand::thread_rng()
            .gen_iter::<u8>()
            .take(len)
            .collect::<Vec<_>>();
        let mut dst = vec![0; input.len() * 2];
        b.bytes = len as u64;
        b.iter(|| unsafe {
            f(&input, &mut dst).unwrap();
            dst[0]
        });
    }

    #[bench]
    fn small_default(b: &mut test::Bencher) {
        doit(b, SMALL_LEN, hex_encode);
    }

    #[bench]
    fn small_fallback(b: &mut test::Bencher) {
        doit(b, SMALL_LEN, hex_encode_fallback);
    }

    #[bench]
    fn large_default(b: &mut test::Bencher) {
        doit(b, LARGE_LEN, hex_encode);
    }

    #[bench]
    fn large_fallback(b: &mut test::Bencher) {
        doit(b, LARGE_LEN, hex_encode_fallback);
    }

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    mod x86 {
        use super::*;

        #[bench]
        fn small_avx2(b: &mut test::Bencher) {
            if is_x86_feature_detected!("avx2") {
                doit(b, SMALL_LEN, hex_encode_avx2);
            }
        }

        #[bench]
        fn small_sse41(b: &mut test::Bencher) {
            if is_x86_feature_detected!("sse4.1") {
                doit(b, SMALL_LEN, hex_encode_sse41);
            }
        }

        #[bench]
        fn large_avx2(b: &mut test::Bencher) {
            if is_x86_feature_detected!("avx2") {
                doit(b, LARGE_LEN, hex_encode_avx2);
            }
        }

        #[bench]
        fn large_sse41(b: &mut test::Bencher) {
            if is_x86_feature_detected!("sse4.1") {
                doit(b, LARGE_LEN, hex_encode_sse41);
            }
        }
    }
}