//! An example showing runtime dispatch to an architecture-optimized implementation.
4 //! This program implements hex encoding a slice into a predetermined
5 //! destination using various different instruction sets. This selects at
6 //! runtime the most optimized implementation and uses that rather than being
7 //! required to be compiled differently.
9 //! You can test out this program via:
11 //! echo test | cargo +nightly run --release --example hex -p stdsimd
13 //! and you should see `746573740a` get printed out.
15 #![feature(cfg_target_feature, target_feature, stdsimd)]
16 #![cfg_attr(test, feature(test))]
17 #![cfg_attr(feature = "cargo-clippy",
18 allow(result_unwrap_used
, print_stdout
, option_unwrap_used
,
19 shadow_reuse
, cast_possible_wrap
, cast_sign_loss
,
20 missing_docs_in_private_items
))]
22 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
26 #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
31 extern crate quickcheck
;
34 use std
::io
::{self, Read}
;
36 #[cfg(target_arch = "x86")]
37 use stdsimd
::arch
::x86
::*;
38 #[cfg(target_arch = "x86_64")]
39 use stdsimd
::arch
::x86_64
::*;
42 let mut input
= Vec
::new();
43 io
::stdin().read_to_end(&mut input
).unwrap();
44 let mut dst
= vec
![0; 2 * input
.len()];
45 let s
= hex_encode(&input
, &mut dst
).unwrap();
49 fn hex_encode
<'a
>(src
: &[u8], dst
: &'a
mut [u8]) -> Result
<&'a
str, usize> {
50 let len
= src
.len().checked_mul(2).unwrap();
55 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
57 if is_x86_feature_detected
!("avx2") {
58 return unsafe { hex_encode_avx2(src, dst) }
;
60 if is_x86_feature_detected
!("sse4.1") {
61 return unsafe { hex_encode_sse41(src, dst) }
;
65 hex_encode_fallback(src
, dst
)
68 #[target_feature(enable = "avx2")]
69 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
70 unsafe fn hex_encode_avx2
<'a
>(
71 mut src
: &[u8], dst
: &'a
mut [u8]
72 ) -> Result
<&'a
str, usize> {
73 let ascii_zero
= _mm256_set1_epi8(b'
0'
as i8);
74 let nines
= _mm256_set1_epi8(9);
75 let ascii_a
= _mm256_set1_epi8((b'a'
- 9 - 1) as i8);
76 let and4bits
= _mm256_set1_epi8(0xf);
79 while src
.len() >= 32 {
80 let invec
= _mm256_loadu_si256(src
.as_ptr() as *const _
);
82 let masked1
= _mm256_and_si256(invec
, and4bits
);
83 let masked2
= _mm256_and_si256(_mm256_srli_epi64(invec
, 4), and4bits
);
85 // return 0xff corresponding to the elements > 9, or 0x00 otherwise
86 let cmpmask1
= _mm256_cmpgt_epi8(masked1
, nines
);
87 let cmpmask2
= _mm256_cmpgt_epi8(masked2
, nines
);
89 // add '0' or the offset depending on the masks
90 let masked1
= _mm256_add_epi8(
92 _mm256_blendv_epi8(ascii_zero
, ascii_a
, cmpmask1
),
94 let masked2
= _mm256_add_epi8(
96 _mm256_blendv_epi8(ascii_zero
, ascii_a
, cmpmask2
),
99 // interleave masked1 and masked2 bytes
100 let res1
= _mm256_unpacklo_epi8(masked2
, masked1
);
101 let res2
= _mm256_unpackhi_epi8(masked2
, masked1
);
103 // Store everything into the right destination now
104 let base
= dst
.as_mut_ptr().offset(i
* 2);
105 let base1
= base
.offset(0) as *mut _
;
106 let base2
= base
.offset(16) as *mut _
;
107 let base3
= base
.offset(32) as *mut _
;
108 let base4
= base
.offset(48) as *mut _
;
109 _mm256_storeu2_m128i(base3
, base1
, res1
);
110 _mm256_storeu2_m128i(base4
, base2
, res2
);
116 let _
= hex_encode_sse41(src
, &mut dst
[i
* 2..]);
118 Ok(str::from_utf8_unchecked(&dst
[..src
.len() * 2 + i
* 2]))
121 // copied from https://github.com/Matherunner/bin2hex-sse/blob/master/base16_sse4.cpp
122 #[target_feature(enable = "sse4.1")]
123 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
124 unsafe fn hex_encode_sse41
<'a
>(
125 mut src
: &[u8], dst
: &'a
mut [u8]
126 ) -> Result
<&'a
str, usize> {
127 let ascii_zero
= _mm_set1_epi8(b'
0'
as i8);
128 let nines
= _mm_set1_epi8(9);
129 let ascii_a
= _mm_set1_epi8((b'a'
- 9 - 1) as i8);
130 let and4bits
= _mm_set1_epi8(0xf);
133 while src
.len() >= 16 {
134 let invec
= _mm_loadu_si128(src
.as_ptr() as *const _
);
136 let masked1
= _mm_and_si128(invec
, and4bits
);
137 let masked2
= _mm_and_si128(_mm_srli_epi64(invec
, 4), and4bits
);
139 // return 0xff corresponding to the elements > 9, or 0x00 otherwise
140 let cmpmask1
= _mm_cmpgt_epi8(masked1
, nines
);
141 let cmpmask2
= _mm_cmpgt_epi8(masked2
, nines
);
143 // add '0' or the offset depending on the masks
144 let masked1
= _mm_add_epi8(
146 _mm_blendv_epi8(ascii_zero
, ascii_a
, cmpmask1
),
148 let masked2
= _mm_add_epi8(
150 _mm_blendv_epi8(ascii_zero
, ascii_a
, cmpmask2
),
153 // interleave masked1 and masked2 bytes
154 let res1
= _mm_unpacklo_epi8(masked2
, masked1
);
155 let res2
= _mm_unpackhi_epi8(masked2
, masked1
);
157 _mm_storeu_si128(dst
.as_mut_ptr().offset(i
* 2) as *mut _
, res1
);
158 _mm_storeu_si128(dst
.as_mut_ptr().offset(i
* 2 + 16) as *mut _
, res2
);
164 let _
= hex_encode_fallback(src
, &mut dst
[i
* 2..]);
166 Ok(str::from_utf8_unchecked(&dst
[..src
.len() * 2 + i
* 2]))
/// Scalar fallback: hex-encodes `src` into `dst` one byte at a time.
///
/// Writes two lowercase hex digits per input byte and returns the encoded
/// prefix of `dst` as a `&str`, or `Err(needed_len)` when `dst` is shorter
/// than `src.len() * 2` (previously this case panicked on the final slice
/// even though the signature advertises a `usize` error).
fn hex_encode_fallback<'a>(
    src: &[u8],
    dst: &'a mut [u8],
) -> Result<&'a str, usize> {
    // Map a nibble (0..=15) to its lowercase ASCII hex digit.
    fn hex(byte: u8) -> u8 {
        static TABLE: &[u8] = b"0123456789abcdef";
        TABLE[byte as usize]
    }

    let len = src.len() * 2;
    if dst.len() < len {
        return Err(len);
    }

    for (byte, slots) in src.iter().zip(dst.chunks_mut(2)) {
        slots[0] = hex((*byte >> 4) & 0xf);
        slots[1] = hex(*byte & 0xf);
    }

    // SAFETY: only ASCII hex digits were written into this range, so it is
    // valid UTF-8.
    unsafe { Ok(str::from_utf8_unchecked(&dst[..src.len() * 2])) }
}
185 // Run these with `cargo +nightly test --example hex -p stdsimd`
192 fn test(input
: &[u8], output
: &str) {
193 let tmp
= || vec
![0; input
.len() * 2];
195 assert_eq
!(hex_encode_fallback(input
, &mut tmp()).unwrap(), output
);
196 assert_eq
!(hex_encode(input
, &mut tmp()).unwrap(), output
);
198 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
200 if is_x86_feature_detected
!("avx2") {
202 hex_encode_avx2(input
, &mut tmp()).unwrap(),
206 if is_x86_feature_detected
!("sse4.1") {
208 hex_encode_sse41(input
, &mut tmp()).unwrap(),
// NOTE(review): the fragments below are the interiors of #[test] functions
// whose headers are missing from this chunk. They appear to build all-'0'
// expected strings for zero-filled inputs (2048 hex digits for a 1024-byte
// input; 313 * 2 digits for a 313-byte input) passed to the `test` helper —
// confirm against the full file before relying on this.
224 &iter
::repeat('
0'
).take(2048).collect
::<String
>(),
// presumably the expected output for a 313-byte all-zero input — TODO confirm
232 &iter
::repeat('
0'
).take(313 * 2).collect
::<String
>(),
// NOTE(review): 33-byte input — looks like it exercises the tail path that
// is not a multiple of the 16/32-byte SIMD stride; verify in the full file.
238 let mut input
= [0; 33];
257 fn encode_equals_fallback(input
: Vec
<u8>) -> bool
{
258 let mut space1
= vec
![0; input
.len() * 2];
259 let mut space2
= vec
![0; input
.len() * 2];
260 let a
= hex_encode(&input
, &mut space1
).unwrap();
261 let b
= hex_encode_fallback(&input
, &mut space2
).unwrap();
265 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
266 fn avx_equals_fallback(input
: Vec
<u8>) -> bool
{
267 if !is_x86_feature_detected
!("avx2") {
270 let mut space1
= vec
![0; input
.len() * 2];
271 let mut space2
= vec
![0; input
.len() * 2];
272 let a
= unsafe { hex_encode_avx2(&input, &mut space1).unwrap() }
;
273 let b
= hex_encode_fallback(&input
, &mut space2
).unwrap();
277 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
278 fn sse41_equals_fallback(input
: Vec
<u8>) -> bool
{
279 if !is_x86_feature_detected
!("avx2") {
282 let mut space1
= vec
![0; input
.len() * 2];
283 let mut space2
= vec
![0; input
.len() * 2];
284 let a
= unsafe { hex_encode_sse41(&input, &mut space1).unwrap() }
;
285 let b
= hex_encode_fallback(&input
, &mut space2
).unwrap();
291 // Run these with `cargo +nightly bench --example hex -p stdsimd`
301 const SMALL_LEN
: usize = 117;
302 const LARGE_LEN
: usize = 1 * 1024 * 1024;
305 b
: &mut test
::Bencher
, len
: usize,
306 f
: for<'a
> unsafe fn(&[u8], &'a
mut [u8]) -> Result
<&'a
str, usize>,
308 let input
= rand
::thread_rng()
311 .collect
::<Vec
<_
>>();
312 let mut dst
= vec
![0; input
.len() * 2];
313 b
.bytes
= len
as u64;
315 f(&input
, &mut dst
).unwrap();
321 fn small_default(b
: &mut test
::Bencher
) {
322 doit(b
, SMALL_LEN
, hex_encode
);
326 fn small_fallback(b
: &mut test
::Bencher
) {
327 doit(b
, SMALL_LEN
, hex_encode_fallback
);
331 fn large_default(b
: &mut test
::Bencher
) {
332 doit(b
, LARGE_LEN
, hex_encode
);
336 fn large_fallback(b
: &mut test
::Bencher
) {
337 doit(b
, LARGE_LEN
, hex_encode_fallback
);
340 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
345 fn small_avx2(b
: &mut test
::Bencher
) {
346 if is_x86_feature_detected
!("avx2") {
347 doit(b
, SMALL_LEN
, hex_encode_avx2
);
352 fn small_sse41(b
: &mut test
::Bencher
) {
353 if is_x86_feature_detected
!("sse4.1") {
354 doit(b
, SMALL_LEN
, hex_encode_sse41
);
359 fn large_avx2(b
: &mut test
::Bencher
) {
360 if is_x86_feature_detected
!("avx2") {
361 doit(b
, LARGE_LEN
, hex_encode_avx2
);
366 fn large_sse41(b
: &mut test
::Bencher
) {
367 if is_x86_feature_detected
!("sse4.1") {
368 doit(b
, LARGE_LEN
, hex_encode_sse41
);