//! `stdsimd`

/// SIMD and vendor intrinsics module.
///
/// This module is intended to be the gateway to architecture-specific
/// intrinsic functions, typically related to SIMD (but not always!). Each
/// architecture that Rust compiles to may contain a submodule here, which
/// means that this is not a portable module! If you're writing a portable
/// library take care when using these APIs!
///
/// Under this module you'll find an architecture-named module, such as
/// `x86_64`. Each `#[cfg(target_arch)]` that Rust can compile to may have a
/// module entry here, only present on that particular target. For example the
/// `i686-pc-windows-msvc` target will have an `x86` module here, whereas
/// `x86_64-pc-windows-msvc` has `x86_64`.
///
/// > **Note**: This module is currently unstable. It was designed in
/// > [RFC 2325][rfc] and is currently [tracked] for stabilization.
///
/// [rfc]: https://github.com/rust-lang/rfcs/pull/2325
/// [tracked]: https://github.com/rust-lang/rust/issues/48556
///
/// # Overview
///
/// This module exposes vendor-specific intrinsics that typically correspond to
/// a single machine instruction. These intrinsics are not portable: their
/// availability is architecture-dependent, and not all machines of that
/// architecture might provide the intrinsic.
///
/// The `arch` module is intended to be a low-level implementation detail for
/// higher-level APIs. Using it correctly can be quite tricky as you need to
/// ensure at least a few guarantees are upheld:
///
/// * The correct architecture's module is used. For example the `arm` module
///   isn't available on the `x86_64-unknown-linux-gnu` target. This is
///   typically ensured by using `#[cfg]` appropriately when working with this
///   module.
/// * The CPU the program is currently running on supports the function being
///   called. For example it is unsafe to call an AVX2 function on a CPU that
///   doesn't actually support AVX2.
///
/// As a result of the latter guarantee, all intrinsics in this module are
/// `unsafe` and extra care needs to be taken when calling them!
///
/// # CPU Feature Detection
///
/// In order to call these APIs in a safe fashion there are a number of
/// mechanisms available to ensure that the correct CPU feature is available
/// to call an intrinsic. Let's consider, for example, the `_mm256_add_epi64`
/// intrinsic on the `x86` and `x86_64` architectures. This function requires
/// the AVX2 feature as [documented by Intel][intel-dox], so to correctly call
/// this function we need to (a) guarantee we only call it on `x86`/`x86_64`
/// and (b) ensure that the CPU feature is available.
///
/// [intel-dox]: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi64&expand=100
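///
/// To see what is at stake, here is a minimal sketch of what a bare call to
/// this intrinsic looks like (the `raw_add` function and its constant inputs
/// are purely illustrative). Nothing in this function proves that the CPU
/// supports AVX2, which is exactly why the call must be `unsafe`:
///
/// ```ignore
/// #[cfg(target_arch = "x86_64")]
/// unsafe fn raw_add() {
///     use std::arch::x86_64::{_mm256_add_epi64, _mm256_set1_epi64x};
///
///     // Build two 256-bit vectors of four 64-bit lanes each and add them
///     // lane-wise. Executing this on a CPU without AVX2 is undefined
///     // behavior, so callers must check the feature first.
///     let a = _mm256_set1_epi64x(1);
///     let b = _mm256_set1_epi64x(2);
///     let _sum = _mm256_add_epi64(a, b);
/// }
/// ```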
///
/// ## Static CPU Feature Detection
///
/// The first option available to us is to conditionally compile code via the
/// `#[cfg]` attribute. CPU features are exposed through the `target_feature`
/// cfg, which can be used like so:
///
/// ```ignore
/// #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"),
///           target_feature = "avx2"))]
/// fn foo() {
///     #[cfg(target_arch = "x86")]
///     use std::arch::x86::_mm256_add_epi64;
///     #[cfg(target_arch = "x86_64")]
///     use std::arch::x86_64::_mm256_add_epi64;
///
///     unsafe {
///         _mm256_add_epi64(...);
///     }
/// }
/// ```
///
/// Here we're using `#[cfg(target_feature = "avx2")]` to conditionally compile
/// this function into our module. This means that if the `avx2` feature is
/// *enabled statically* then we'll use the `_mm256_add_epi64` function at
/// runtime. The `unsafe` block here can be justified through the usage of
/// `#[cfg]` to only compile the code in situations where the safety guarantees
/// are upheld.
///
/// Statically enabling a feature is typically done with the `-C
/// target-feature` or `-C target-cpu` flags to the compiler. For example if
/// your local CPU supports AVX2 then you can compile the above function with:
///
/// ```sh
/// $ RUSTFLAGS='-C target-cpu=native' cargo build
/// ```
///
/// Alternatively, you can enable just the AVX2 feature:
///
/// ```sh
/// $ RUSTFLAGS='-C target-feature=+avx2' cargo build
/// ```
///
/// Note that when you compile a binary with a particular feature enabled it's
/// important to ensure that you only run the binary on systems which satisfy
/// the required feature set.
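///
/// One way to uphold that requirement, sketched here as an illustrative
/// `assert_feature_support` helper (not part of any API), is to verify at
/// startup that the features the binary was compiled with are actually
/// present on the host before any intrinsics run:
///
/// ```ignore
/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// fn assert_feature_support() {
///     // If the whole binary was built with `-C target-feature=+avx2`,
///     // abort early on CPUs that don't actually provide AVX2 instead of
///     // hitting an illegal instruction later.
///     #[cfg(target_feature = "avx2")]
///     assert!(is_x86_feature_detected!("avx2"),
///             "this binary requires AVX2, which this CPU does not support");
/// }
/// ```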
///
/// ## Dynamic CPU Feature Detection
///
/// Sometimes statically dispatching isn't quite what you want. Instead you
/// might want to build a portable binary that runs across a variety of CPUs,
/// but at runtime it selects the most optimized implementation available. This
/// allows you to build a "least common denominator" binary which has certain
/// sections more optimized for different CPUs.
///
/// Taking our previous example from before, we're going to compile our binary
/// *without* AVX2 support, but we'd like to enable it for just one function.
/// We can do that in a manner like:
///
/// ```ignore
/// fn foo() {
///     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
///     {
///         if is_x86_feature_detected!("avx2") {
///             return unsafe { foo_avx2() };
///         }
///     }
///
///     // fallback implementation without using AVX2
/// }
///
/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// #[target_feature(enable = "avx2")]
/// unsafe fn foo_avx2() {
///     #[cfg(target_arch = "x86")]
///     use std::arch::x86::_mm256_add_epi64;
///     #[cfg(target_arch = "x86_64")]
///     use std::arch::x86_64::_mm256_add_epi64;
///
///     _mm256_add_epi64(...);
/// }
/// ```
///
/// There are a couple of components in play here, so let's go through them in
/// detail!
///
/// * First up we notice the `is_x86_feature_detected!` macro. Provided by
///   the standard library, this macro will perform necessary runtime detection
///   to determine whether the CPU the program is running on supports the
///   specified feature. In this case the macro will expand to a boolean
///   expression evaluating to whether the local CPU has the AVX2 feature or
///   not (see the sketch after this list).
///
///   Note that this macro, like the `arch` module, is platform-specific. The
///   name of the macro is the same across platforms, but the arguments to the
///   macro are only the features for the current platform. For example calling
///   `is_x86_feature_detected!("avx2")` on ARM will be a compile-time
///   error. To ensure we don't hit this error a statement-level `#[cfg]` is
///   used to only compile usage of the macro on `x86`/`x86_64`.
///
/// * Next up we see our AVX2-enabled function, `foo_avx2`. This function is
///   decorated with the `#[target_feature]` attribute which enables a CPU
///   feature for just this one function. Using a compiler flag like `-C
///   target-feature=+avx2` will enable AVX2 for the entire program, but using
///   an attribute will only enable it for the one function. Usage of the
///   `#[target_feature]` attribute currently requires the function to also be
///   `unsafe`, as we see here. This is because the function can only be
///   correctly called on systems which have the AVX2 feature (like the
///   intrinsics themselves).
///
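/// Because `is_x86_feature_detected!` expands to a plain `bool` expression,
/// its result can be stored or combined with other checks. A minimal sketch
/// (the `has_simd_support` function is purely illustrative):
///
/// ```ignore
/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// fn has_simd_support() -> bool {
///     // Each invocation checks one named feature; results compose freely.
///     is_x86_feature_detected!("sse4.1") && is_x86_feature_detected!("avx2")
/// }
/// ```
///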
/// And with all that we should have a working program! This program will run
/// across all machines and it'll use the optimized AVX2 implementation on
/// machines where support is detected.
///
/// # Ergonomics
///
/// It's important to note that using the `arch` module is not the easiest
/// thing in the world, so if you're curious to try it out you may want to
/// brace yourself for some wordiness!
///
/// The primary purpose of this module is to enable stable crates on crates.io
/// to build up much more ergonomic abstractions which end up using SIMD under
/// the hood. Over time these abstractions may also move into the standard
/// library itself, but for now this module is tasked with providing the bare
/// minimum necessary to use vendor intrinsics on stable Rust.
///
/// # Other architectures
///
/// This documentation is only for one particular architecture; you can find
/// others at:
///
/// * [`x86`]
/// * [`x86_64`]
/// * [`arm`]
/// * [`aarch64`]
///
/// [`x86`]: https://rust-lang-nursery.github.io/stdsimd/i686/stdsimd/arch/x86/index.html
/// [`x86_64`]: https://rust-lang-nursery.github.io/stdsimd/x86_64/stdsimd/arch/x86_64/index.html
/// [`arm`]: https://rust-lang-nursery.github.io/stdsimd/arm/stdsimd/arch/arm/index.html
/// [`aarch64`]: https://rust-lang-nursery.github.io/stdsimd/aarch64/stdsimd/arch/aarch64/index.html
///
/// # Examples
///
/// First let's take a look at not actually using any intrinsics but instead
/// using LLVM's auto-vectorization to produce optimized vectorized code for
/// AVX2 and also for the default platform.
///
/// ```rust
/// #![feature(cfg_target_feature, target_feature, stdsimd)]
///
/// # #[cfg(not(dox))]
/// # #[macro_use]
/// # extern crate stdsimd;
///
/// fn main() {
///     let mut dst = [0];
///     add_quickly(&[1], &[2], &mut dst);
///     assert_eq!(dst[0], 3);
/// }
///
/// fn add_quickly(a: &[u8], b: &[u8], c: &mut [u8]) {
///     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
///     {
///         // Note that this `unsafe` block is safe because we're testing
///         // that the `avx2` feature is indeed available on our CPU.
///         if is_x86_feature_detected!("avx2") {
///             return unsafe { add_quickly_avx2(a, b, c) };
///         }
///     }
///
///     add_quickly_fallback(a, b, c)
/// }
///
/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// #[target_feature(enable = "avx2")]
/// unsafe fn add_quickly_avx2(a: &[u8], b: &[u8], c: &mut [u8]) {
///     add_quickly_fallback(a, b, c) // the function below is inlined here
/// }
///
/// fn add_quickly_fallback(a: &[u8], b: &[u8], c: &mut [u8]) {
///     for ((a, b), c) in a.iter().zip(b).zip(c) {
///         *c = *a + *b;
///     }
/// }
/// ```
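///
/// If you'd like to confirm which code the compiler actually produced, one
/// option (sketched here; `myprogram` is an illustrative binary name) is to
/// disassemble the release build and look for 256-bit `vpadd*` instructions:
///
/// ```sh
/// $ objdump -d target/release/myprogram | grep vpadd
/// ```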
///
/// Next up let's take a look at an example of manually using intrinsics. Here
/// we'll be using SSE4.1 features to implement hex encoding.
///
/// ```
/// #![feature(cfg_target_feature, target_feature, stdsimd)]
/// # #![cfg_attr(not(dox), no_std)]
/// # #[cfg(not(dox))]
/// # extern crate std as real_std;
/// # #[cfg(not(dox))]
/// # #[macro_use]
/// # extern crate stdsimd as std;
///
/// fn main() {
///     let mut dst = [0; 32];
///     hex_encode(b"\x01\x02\x03", &mut dst);
///     assert_eq!(&dst[..6], b"010203");
///
///     let mut src = [0; 16];
///     for i in 0..16 {
///         src[i] = (i + 1) as u8;
///     }
///     hex_encode(&src, &mut dst);
///     assert_eq!(&dst, b"0102030405060708090a0b0c0d0e0f10");
/// }
///
/// pub fn hex_encode(src: &[u8], dst: &mut [u8]) {
///     let len = src.len().checked_mul(2).unwrap();
///     assert!(dst.len() >= len);
///
///     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
///     {
///         if is_x86_feature_detected!("sse4.1") {
///             return unsafe { hex_encode_sse41(src, dst) };
///         }
///     }
///
///     hex_encode_fallback(src, dst)
/// }
///
/// // translated from https://github.com/Matherunner/bin2hex-sse/blob/master/base16_sse4.cpp
/// #[target_feature(enable = "sse4.1")]
/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// unsafe fn hex_encode_sse41(mut src: &[u8], dst: &mut [u8]) {
///     #[cfg(target_arch = "x86")]
///     use std::arch::x86::*;
///     #[cfg(target_arch = "x86_64")]
///     use std::arch::x86_64::*;
///
///     let ascii_zero = _mm_set1_epi8(b'0' as i8);
///     let nines = _mm_set1_epi8(9);
///     let ascii_a = _mm_set1_epi8((b'a' - 9 - 1) as i8);
///     let and4bits = _mm_set1_epi8(0xf);
///
///     let mut i = 0_isize;
///     while src.len() >= 16 {
///         let invec = _mm_loadu_si128(src.as_ptr() as *const _);
///
///         // extract the low and high nibble of each input byte
///         let masked1 = _mm_and_si128(invec, and4bits);
///         let masked2 = _mm_and_si128(_mm_srli_epi64(invec, 4), and4bits);
///
///         // return 0xff corresponding to the elements > 9, or 0x00 otherwise
///         let cmpmask1 = _mm_cmpgt_epi8(masked1, nines);
///         let cmpmask2 = _mm_cmpgt_epi8(masked2, nines);
///
///         // add '0' or the offset depending on the masks
///         let masked1 = _mm_add_epi8(
///             masked1,
///             _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask1),
///         );
///         let masked2 = _mm_add_epi8(
///             masked2,
///             _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask2),
///         );
///
///         // interleave masked1 and masked2 bytes (high nibble char first)
///         let res1 = _mm_unpacklo_epi8(masked2, masked1);
///         let res2 = _mm_unpackhi_epi8(masked2, masked1);
///
///         _mm_storeu_si128(dst.as_mut_ptr().offset(i * 2) as *mut _, res1);
///         _mm_storeu_si128(dst.as_mut_ptr().offset(i * 2 + 16) as *mut _, res2);
///         src = &src[16..];
///         i += 16;
///     }
///
///     // encode any remaining tail bytes with the scalar fallback
///     let i = i as usize;
///     hex_encode_fallback(src, &mut dst[i * 2..]);
/// }
///
/// fn hex_encode_fallback(src: &[u8], dst: &mut [u8]) {
///     fn hex(byte: u8) -> u8 {
///         static TABLE: &[u8] = b"0123456789abcdef";
///         TABLE[byte as usize]
///     }
///
///     for (byte, slots) in src.iter().zip(dst.chunks_mut(2)) {
///         slots[0] = hex((*byte >> 4) & 0xf);
///         slots[1] = hex(*byte & 0xf);
///     }
/// }
/// ```
#[unstable(feature = "stdsimd", issue = "0")]
pub mod arch {
    #[cfg(all(not(dox), target_arch = "x86"))]
    pub use coresimd::arch::x86;

    #[cfg(all(not(dox), target_arch = "x86_64"))]
    pub use coresimd::arch::x86_64;

    #[cfg(all(not(dox), target_arch = "arm"))]
    pub use coresimd::arch::arm;

    #[cfg(all(not(dox), target_arch = "aarch64"))]
    pub use coresimd::arch::aarch64;

    #[cfg(target_arch = "wasm32")]
    pub use coresimd::arch::wasm32;

    #[doc(hidden)] // unstable implementation detail
    pub mod detect;

    /// Platform-specific intrinsics for the `x86` platform.
    ///
    /// The documentation with the full listing of `x86` intrinsics is
    /// available in [libcore], but the module is re-exported here in std
    /// as well.
    ///
    /// [libcore]: ../../../core/arch/x86/index.html
    #[cfg(dox)]
    #[doc(cfg(target_arch = "x86"))]
    pub mod x86 {}

    /// Platform-specific intrinsics for the `x86_64` platform.
    ///
    /// The documentation with the full listing of `x86_64` intrinsics is
    /// available in [libcore], but the module is re-exported here in std
    /// as well.
    ///
    /// [libcore]: ../../../core/arch/x86_64/index.html
    #[cfg(dox)]
    #[doc(cfg(target_arch = "x86_64"))]
    pub mod x86_64 {}

    /// Platform-specific intrinsics for the `arm` platform.
    ///
    /// The documentation with the full listing of `arm` intrinsics is
    /// available in [libcore], but the module is re-exported here in std
    /// as well.
    ///
    /// [libcore]: ../../../core/arch/arm/index.html
    #[cfg(dox)]
    #[doc(cfg(target_arch = "arm"))]
    pub mod arm {}

    /// Platform-specific intrinsics for the `aarch64` platform.
    ///
    /// The documentation with the full listing of `aarch64` intrinsics is
    /// available in [libcore], but the module is re-exported here in std
    /// as well.
    ///
    /// [libcore]: ../../../core/arch/aarch64/index.html
    #[cfg(dox)]
    #[doc(cfg(target_arch = "aarch64"))]
    pub mod aarch64 {}
}

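/// Portable SIMD vector types, re-exported from `coresimd`.
///
/// A minimal usage sketch, assuming the `u32x4` type with the `new` and
/// `splat` constructors that this crate version provides:
///
/// ```ignore
/// use stdsimd::simd::u32x4;
///
/// let a = u32x4::new(1, 2, 3, 4);  // four 32-bit lanes
/// let b = u32x4::splat(10);        // broadcast 10 to every lane
/// assert_eq!(a + b, u32x4::new(11, 12, 13, 14)); // lane-wise addition
/// ```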
#[unstable(feature = "stdsimd", issue = "0")]
pub use coresimd::simd;