]>
Commit | Line | Data |
---|---|---|
0531ce1d XL |
1 | //! An example showing runtime dispatch to an architecture-optimized |
2 | //! implementation. | |
3 | //! | |
4 | //! This program implements hex encoding a slice into a predetermined | |
5 | //! destination using various different instruction sets. This selects at | |
6 | //! runtime the most optimized implementation and uses that rather than being | |
7 | //! required to be compiled differently. | |
8 | //! | |
9 | //! You can test out this program via: | |
10 | //! | |
9fa01778 | 11 | //! echo test | cargo +nightly run --release hex |
0531ce1d XL |
12 | //! |
13 | //! and you should see `746573740a` get printed out. | |
14 | ||
83c7162d | 15 | #![feature(stdsimd)] |
0531ce1d | 16 | #![cfg_attr(test, feature(test))] |
48663c56 XL |
17 | #![allow( |
18 | clippy::result_unwrap_used, | |
19 | clippy::print_stdout, | |
20 | clippy::option_unwrap_used, | |
21 | clippy::shadow_reuse, | |
22 | clippy::cast_possible_wrap, | |
23 | clippy::cast_ptr_alignment, | |
24 | clippy::cast_sign_loss, | |
25 | clippy::missing_docs_in_private_items | |
8faf50e0 | 26 | )] |
0531ce1d XL |
27 | |
28 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] | |
9fa01778 XL |
29 | #[macro_use(is_x86_feature_detected)] |
30 | extern crate std_detect; | |
0531ce1d | 31 | |
9fa01778 | 32 | extern crate core_arch; |
0531ce1d XL |
33 | |
34 | #[cfg(test)] | |
35 | #[macro_use] | |
36 | extern crate quickcheck; | |
37 | ||
48663c56 XL |
38 | use std::{ |
39 | io::{self, Read}, | |
40 | str, | |
41 | }; | |
0531ce1d XL |
42 | |
43 | #[cfg(target_arch = "x86")] | |
9fa01778 | 44 | use core_arch::x86::*; |
0531ce1d | 45 | #[cfg(target_arch = "x86_64")] |
9fa01778 | 46 | use core_arch::x86_64::*; |
0531ce1d XL |
47 | |
48 | fn main() { | |
49 | let mut input = Vec::new(); | |
50 | io::stdin().read_to_end(&mut input).unwrap(); | |
51 | let mut dst = vec![0; 2 * input.len()]; | |
52 | let s = hex_encode(&input, &mut dst).unwrap(); | |
53 | println!("{}", s); | |
54 | } | |
55 | ||
56 | fn hex_encode<'a>(src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> { | |
57 | let len = src.len().checked_mul(2).unwrap(); | |
58 | if dst.len() < len { | |
59 | return Err(len); | |
60 | } | |
61 | ||
62 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] | |
63 | { | |
64 | if is_x86_feature_detected!("avx2") { | |
65 | return unsafe { hex_encode_avx2(src, dst) }; | |
66 | } | |
67 | if is_x86_feature_detected!("sse4.1") { | |
68 | return unsafe { hex_encode_sse41(src, dst) }; | |
69 | } | |
70 | } | |
71 | ||
72 | hex_encode_fallback(src, dst) | |
73 | } | |
74 | ||
/// Hex-encodes `src` into `dst` 32 bytes at a time with AVX2, then hands
/// the (< 32-byte) remainder off to the SSE4.1 implementation.
///
/// Returns the encoded prefix of `dst` as a `&str`. The caller
/// (`hex_encode`) has already verified `dst.len() >= 2 * src.len()`.
///
/// # Safety
///
/// The CPU must support AVX2 (and SSE4.1 for the tail path); `hex_encode`
/// checks this with `is_x86_feature_detected!` before calling.
#[target_feature(enable = "avx2")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe fn hex_encode_avx2<'a>(mut src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> {
    // Broadcast constants: ASCII '0', the digit/letter threshold (9), the
    // offset turning values 10..=15 into 'a'..='f', and a low-nibble mask.
    let ascii_zero = _mm256_set1_epi8(b'0' as i8);
    let nines = _mm256_set1_epi8(9);
    let ascii_a = _mm256_set1_epi8((b'a' - 9 - 1) as i8);
    let and4bits = _mm256_set1_epi8(0xf);

    // `i` counts input bytes consumed so far; `i * 2` is the output offset.
    let mut i = 0_isize;
    while src.len() >= 32 {
        let invec = _mm256_loadu_si256(src.as_ptr() as *const _);

        // Split each byte into its low nibble (masked1) and its high nibble
        // (masked2: shift down 4, then mask off the bits that bled across
        // byte boundaries in the 64-bit shift).
        let masked1 = _mm256_and_si256(invec, and4bits);
        let masked2 = _mm256_and_si256(_mm256_srli_epi64(invec, 4), and4bits);

        // return 0xff corresponding to the elements > 9, or 0x00 otherwise
        let cmpmask1 = _mm256_cmpgt_epi8(masked1, nines);
        let cmpmask2 = _mm256_cmpgt_epi8(masked2, nines);

        // add '0' or the offset depending on the masks
        let masked1 = _mm256_add_epi8(masked1, _mm256_blendv_epi8(ascii_zero, ascii_a, cmpmask1));
        let masked2 = _mm256_add_epi8(masked2, _mm256_blendv_epi8(ascii_zero, ascii_a, cmpmask2));

        // interleave masked1 and masked2 bytes: high nibble first so the
        // output reads most-significant digit first
        let res1 = _mm256_unpacklo_epi8(masked2, masked1);
        let res2 = _mm256_unpackhi_epi8(masked2, masked1);

        // Store everything into the right destination now. The 256-bit
        // unpacks operate within each 128-bit lane, so the four 16-byte
        // halves must be stored to interleaved offsets (0/32 and 16/48)
        // to restore the correct overall byte order.
        let base = dst.as_mut_ptr().offset(i * 2);
        let base1 = base.offset(0) as *mut _;
        let base2 = base.offset(16) as *mut _;
        let base3 = base.offset(32) as *mut _;
        let base4 = base.offset(48) as *mut _;
        _mm256_storeu2_m128i(base3, base1, res1);
        _mm256_storeu2_m128i(base4, base2, res2);
        src = &src[32..];
        i += 32;
    }

    // Encode whatever is left (< 32 bytes) with the narrower SSE4.1 path;
    // its Ok result is recomputed below, so the return value is discarded.
    let i = i as usize;
    let _ = hex_encode_sse41(src, &mut dst[i * 2..]);

    // SAFETY(UTF-8): only ASCII hex digits were written to this prefix.
    Ok(str::from_utf8_unchecked(&dst[..src.len() * 2 + i * 2]))
}
119 | ||
// copied from https://github.com/Matherunner/bin2hex-sse/blob/master/base16_sse4.cpp
/// Hex-encodes `src` into `dst` 16 bytes at a time with SSE4.1, then hands
/// the (< 16-byte) remainder off to the scalar fallback.
///
/// Returns the encoded prefix of `dst` as a `&str`. Callers have already
/// verified `dst.len() >= 2 * src.len()`.
///
/// # Safety
///
/// The CPU must support SSE4.1 (required by `_mm_blendv_epi8`); callers
/// check this with `is_x86_feature_detected!` first.
#[target_feature(enable = "sse4.1")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe fn hex_encode_sse41<'a>(mut src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> {
    // Broadcast constants: ASCII '0', the digit/letter threshold (9), the
    // offset turning values 10..=15 into 'a'..='f', and a low-nibble mask.
    let ascii_zero = _mm_set1_epi8(b'0' as i8);
    let nines = _mm_set1_epi8(9);
    let ascii_a = _mm_set1_epi8((b'a' - 9 - 1) as i8);
    let and4bits = _mm_set1_epi8(0xf);

    // `i` counts input bytes consumed so far; `i * 2` is the output offset.
    let mut i = 0_isize;
    while src.len() >= 16 {
        let invec = _mm_loadu_si128(src.as_ptr() as *const _);

        // Split each byte into its low nibble (masked1) and its high nibble
        // (masked2: shift down 4, then mask off cross-byte shift residue).
        let masked1 = _mm_and_si128(invec, and4bits);
        let masked2 = _mm_and_si128(_mm_srli_epi64(invec, 4), and4bits);

        // return 0xff corresponding to the elements > 9, or 0x00 otherwise
        let cmpmask1 = _mm_cmpgt_epi8(masked1, nines);
        let cmpmask2 = _mm_cmpgt_epi8(masked2, nines);

        // add '0' or the offset depending on the masks
        let masked1 = _mm_add_epi8(masked1, _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask1));
        let masked2 = _mm_add_epi8(masked2, _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask2));

        // interleave masked1 and masked2 bytes: high nibble first so the
        // output reads most-significant digit first
        let res1 = _mm_unpacklo_epi8(masked2, masked1);
        let res2 = _mm_unpackhi_epi8(masked2, masked1);

        // 16 input bytes produce 32 output bytes: two 16-byte stores.
        _mm_storeu_si128(dst.as_mut_ptr().offset(i * 2) as *mut _, res1);
        _mm_storeu_si128(dst.as_mut_ptr().offset(i * 2 + 16) as *mut _, res2);
        src = &src[16..];
        i += 16;
    }

    // Encode whatever is left (< 16 bytes) with the scalar fallback; its
    // Ok result is recomputed below, so the return value is discarded.
    let i = i as usize;
    let _ = hex_encode_fallback(src, &mut dst[i * 2..]);

    // SAFETY(UTF-8): only ASCII hex digits were written to this prefix.
    Ok(str::from_utf8_unchecked(&dst[..src.len() * 2 + i * 2]))
}
159 | ||
/// Scalar hex encoding, used when no SIMD implementation is available and
/// for the tail bytes left over by the vectorized paths.
///
/// Writes two lowercase hex digits per input byte into `dst` and returns
/// the encoded prefix as a `&str`. Always returns `Ok`.
fn hex_encode_fallback<'a>(src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> {
    // Maps a nibble value (0..=15) to its lowercase ASCII hex digit.
    fn hex(byte: u8) -> u8 {
        static TABLE: &[u8] = b"0123456789abcdef";
        TABLE[byte as usize]
    }

    for (idx, byte) in src.iter().enumerate() {
        // High nibble first, then low nibble.
        dst[idx * 2] = hex(byte >> 4);
        dst[idx * 2 + 1] = hex(byte & 0xf);
    }

    // SAFETY: every byte written above is an ASCII hex digit, so the
    // encoded prefix is valid UTF-8.
    unsafe { Ok(str::from_utf8_unchecked(&dst[..src.len() * 2])) }
}
173 | ||
// Run these with `cargo +nightly test --example hex -p stdarch`
#[cfg(test)]
mod tests {
    use std::iter;

    use super::*;

    /// Asserts that every implementation available on this CPU encodes
    /// `input` to exactly `output`.
    fn test(input: &[u8], output: &str) {
        // Fresh, exactly-sized destination buffer for each implementation.
        let tmp = || vec![0; input.len() * 2];

        assert_eq!(hex_encode_fallback(input, &mut tmp()).unwrap(), output);
        assert_eq!(hex_encode(input, &mut tmp()).unwrap(), output);

        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        unsafe {
            if is_x86_feature_detected!("avx2") {
                assert_eq!(hex_encode_avx2(input, &mut tmp()).unwrap(), output);
            }
            if is_x86_feature_detected!("sse4.1") {
                assert_eq!(hex_encode_sse41(input, &mut tmp()).unwrap(), output);
            }
        }
    }

    #[test]
    fn empty() {
        test(b"", "");
    }

    #[test]
    fn big() {
        test(
            &[0; 1024],
            &iter::repeat('0').take(2048).collect::<String>(),
        );
    }

    // 313 is deliberately not a multiple of 16 or 32, exercising the
    // SIMD tail-handling paths.
    #[test]
    fn odd() {
        test(
            &[0; 313],
            &iter::repeat('0').take(313 * 2).collect::<String>(),
        );
    }

    #[test]
    fn avx_works() {
        let mut input = [0; 33];
        input[4] = 3;
        input[16] = 3;
        input[17] = 0x30;
        input[21] = 1;
        input[31] = 0x24;
        test(
            &input,
            "\
             0000000003000000\
             0000000000000000\
             0330000000010000\
             0000000000000024\
             00\
             ",
        );
    }

    quickcheck! {
        fn encode_equals_fallback(input: Vec<u8>) -> bool {
            let mut space1 = vec![0; input.len() * 2];
            let mut space2 = vec![0; input.len() * 2];
            let a = hex_encode(&input, &mut space1).unwrap();
            let b = hex_encode_fallback(&input, &mut space2).unwrap();
            a == b
        }

        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        fn avx_equals_fallback(input: Vec<u8>) -> bool {
            if !is_x86_feature_detected!("avx2") {
                return true
            }
            let mut space1 = vec![0; input.len() * 2];
            let mut space2 = vec![0; input.len() * 2];
            let a = unsafe { hex_encode_avx2(&input, &mut space1).unwrap() };
            let b = hex_encode_fallback(&input, &mut space2).unwrap();
            a == b
        }

        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        fn sse41_equals_fallback(input: Vec<u8>) -> bool {
            // Fixed: previously this gated on "avx2" (a copy-paste of the
            // test above), silently skipping the SSE4.1 check on CPUs with
            // SSE4.1 but no AVX2 — and calling hex_encode_sse41 without
            // having verified SSE4.1 at all.
            if !is_x86_feature_detected!("sse4.1") {
                return true
            }
            let mut space1 = vec![0; input.len() * 2];
            let mut space2 = vec![0; input.len() * 2];
            let a = unsafe { hex_encode_sse41(&input, &mut space1).unwrap() };
            let b = hex_encode_fallback(&input, &mut space2).unwrap();
            a == b
        }
    }
}
273 | ||
// Run these with `cargo +nightly bench --example hex -p stdarch`
#[cfg(test)]
mod benches {
    extern crate rand;
    extern crate test;

    use self::rand::Rng;

    use super::*;

    // A small odd size (exercising the tail paths) and a 1 MiB buffer.
    const SMALL_LEN: usize = 117;
    const LARGE_LEN: usize = 1024 * 1024;

    /// Benchmarks encoder `f` over `len` random input bytes.
    fn doit(
        b: &mut test::Bencher,
        len: usize,
        f: for<'a> unsafe fn(&[u8], &'a mut [u8]) -> Result<&'a str, usize>,
    ) {
        let mut rng = rand::thread_rng();
        let input: Vec<u8> = (0..len).map(|_| rng.gen()).collect();
        let mut dst = vec![0; input.len() * 2];
        b.bytes = len as u64;
        b.iter(|| unsafe {
            f(&input, &mut dst).unwrap();
            // Return a byte of the output so the call isn't optimized away.
            dst[0]
        });
    }

    #[bench]
    fn small_default(b: &mut test::Bencher) {
        doit(b, SMALL_LEN, hex_encode);
    }

    #[bench]
    fn small_fallback(b: &mut test::Bencher) {
        doit(b, SMALL_LEN, hex_encode_fallback);
    }

    #[bench]
    fn large_default(b: &mut test::Bencher) {
        doit(b, LARGE_LEN, hex_encode);
    }

    #[bench]
    fn large_fallback(b: &mut test::Bencher) {
        doit(b, LARGE_LEN, hex_encode_fallback);
    }

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    mod x86 {
        use super::*;

        #[bench]
        fn small_avx2(b: &mut test::Bencher) {
            if is_x86_feature_detected!("avx2") {
                doit(b, SMALL_LEN, hex_encode_avx2);
            }
        }

        #[bench]
        fn small_sse41(b: &mut test::Bencher) {
            if is_x86_feature_detected!("sse4.1") {
                doit(b, SMALL_LEN, hex_encode_sse41);
            }
        }

        #[bench]
        fn large_avx2(b: &mut test::Bencher) {
            if is_x86_feature_detected!("avx2") {
                doit(b, LARGE_LEN, hex_encode_avx2);
            }
        }

        #[bench]
        fn large_sse41(b: &mut test::Bencher) {
            if is_x86_feature_detected!("sse4.1") {
                doit(b, LARGE_LEN, hex_encode_sse41);
            }
        }
    }
}