//! `stdsimd`

/// SIMD and vendor intrinsics module.
///
/// This module is intended to be the gateway to architecture-specific
/// intrinsic functions, typically related to SIMD (but not always!). Each
/// architecture that Rust compiles to may contain a submodule here, which
/// means that this is not a portable module! If you're writing a portable
/// library take care when using these APIs!
///
/// Under this module you'll find an architecture-named module, such as
/// `x86_64`. Each `#[cfg(target_arch)]` that Rust can compile to may have a
/// module entry here, only present on that particular target. For example the
/// `i686-pc-windows-msvc` target will have an `x86` module here, whereas
/// `x86_64-pc-windows-msvc` has `x86_64`.
///
/// > **Note**: This module is currently unstable. It was designed in
/// > [RFC 2325][rfc] and is currently [tracked] for stabilization.
///
/// [rfc]: https://github.com/rust-lang/rfcs/pull/2325
/// [tracked]: https://github.com/rust-lang/rust/issues/48556
///
/// # Overview
///
/// This module exposes vendor-specific intrinsics that typically correspond to
/// a single machine instruction. These intrinsics are not portable: their
/// availability is architecture-dependent, and not all machines of that
/// architecture might provide the intrinsic.
///
/// The `arch` module is intended to be a low-level implementation detail for
/// higher-level APIs. Using it correctly can be quite tricky as you need to
/// ensure at least a few guarantees are upheld:
///
/// * The correct architecture's module is used. For example the `arm` module
///   isn't available on the `x86_64-unknown-linux-gnu` target. This is
///   typically ensured by using `#[cfg]` appropriately when working with this
///   module.
/// * The CPU the program is currently running on supports the function being
///   called. For example it is unsafe to call an AVX2 function on a CPU that
///   doesn't actually support AVX2.
///
/// As a result of the latter guarantee, all intrinsics in this module are
/// `unsafe` and extra care needs to be taken when calling them!
///
/// # CPU Feature Detection
///
/// In order to call these APIs in a safe fashion there are a number of
/// mechanisms available to ensure that the correct CPU feature is available
/// to call an intrinsic. Let's consider, for example, the `_mm256_add_epi64`
/// intrinsic on the `x86` and `x86_64` architectures. This function requires
/// the AVX2 feature as [documented by Intel][intel-dox], so to correctly call
/// this function we need to (a) guarantee we only call it on `x86`/`x86_64`
/// and (b) ensure that the CPU feature is available.
///
/// [intel-dox]: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi64&expand=100
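///
/// To see what is at stake, here is a minimal sketch of what a bare call to
/// this intrinsic looks like (the `raw_add` function and its constant inputs
/// are purely illustrative). Nothing in this function proves that the CPU
/// supports AVX2, which is exactly why the call must be `unsafe`:
///
/// ```ignore
/// #[cfg(target_arch = "x86_64")]
/// unsafe fn raw_add() {
///     use std::arch::x86_64::{_mm256_add_epi64, _mm256_set1_epi64x};
///
///     // Build two 256-bit vectors of four 64-bit lanes each and add them
///     // lane-wise. Executing this on a CPU without AVX2 is undefined
///     // behavior, so callers must check the feature first.
///     let a = _mm256_set1_epi64x(1);
///     let b = _mm256_set1_epi64x(2);
///     let _sum = _mm256_add_epi64(a, b);
/// }
/// ```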
///
/// ## Static CPU Feature Detection
///
/// The first option available to us is to conditionally compile code via the
/// `#[cfg]` attribute. CPU features are exposed through the `target_feature`
/// cfg, which can be used like so:
///
/// ```ignore
/// #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"),
///           target_feature = "avx2"))]
/// fn foo() {
///     #[cfg(target_arch = "x86")]
///     use std::arch::x86::_mm256_add_epi64;
///     #[cfg(target_arch = "x86_64")]
///     use std::arch::x86_64::_mm256_add_epi64;
///
///     unsafe {
///         _mm256_add_epi64(...);
///     }
/// }
/// ```
///
/// Here we're using `#[cfg(target_feature = "avx2")]` to conditionally compile
/// this function into our module. This means that if the `avx2` feature is
/// *enabled statically* then we'll use the `_mm256_add_epi64` function at
/// runtime. The `unsafe` block here can be justified through the usage of
/// `#[cfg]` to only compile the code in situations where the safety guarantees
/// are upheld.
///
/// Statically enabling a feature is typically done with the `-C
/// target-feature` or `-C target-cpu` flags to the compiler. For example if
/// your local CPU supports AVX2 then you can compile the above function with:
///
/// ```sh
/// $ RUSTFLAGS='-C target-cpu=native' cargo build
/// ```
///
/// Alternatively, you can enable just the AVX2 feature:
///
/// ```sh
/// $ RUSTFLAGS='-C target-feature=+avx2' cargo build
/// ```
///
/// Note that when you compile a binary with a particular feature enabled it's
/// important to ensure that you only run the binary on systems which satisfy
/// the required feature set.
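///
/// One way to uphold that requirement, sketched here as an illustrative
/// `assert_feature_support` helper (not part of any API), is to verify at
/// startup that the features the binary was compiled with are actually
/// present on the host before any intrinsics run:
///
/// ```ignore
/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// fn assert_feature_support() {
///     // If the whole binary was built with `-C target-feature=+avx2`,
///     // abort early on CPUs that don't actually provide AVX2 instead of
///     // hitting an illegal instruction later.
///     #[cfg(target_feature = "avx2")]
///     assert!(is_x86_feature_detected!("avx2"),
///             "this binary requires AVX2, which this CPU does not support");
/// }
/// ```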
///
/// ## Dynamic CPU Feature Detection
///
/// Sometimes statically dispatching isn't quite what you want. Instead you
/// might want to build a portable binary that runs across a variety of CPUs,
/// but at runtime it selects the most optimized implementation available. This
/// allows you to build a "least common denominator" binary which has certain
/// sections more optimized for different CPUs.
///
/// Taking our previous example from before, we're going to compile our binary
/// *without* AVX2 support, but we'd like to enable it for just one function.
/// We can do that in a manner like:
///
/// ```ignore
/// fn foo() {
///     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
///     {
///         if is_x86_feature_detected!("avx2") {
///             return unsafe { foo_avx2() };
///         }
///     }
///
///     // fallback implementation without using AVX2
/// }
///
/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// #[target_feature(enable = "avx2")]
/// unsafe fn foo_avx2() {
///     #[cfg(target_arch = "x86")]
///     use std::arch::x86::_mm256_add_epi64;
///     #[cfg(target_arch = "x86_64")]
///     use std::arch::x86_64::_mm256_add_epi64;
///
///     _mm256_add_epi64(...);
/// }
/// ```
///
/// There are a couple of components in play here, so let's go through them in
/// detail!
///
/// * First up we notice the `is_x86_feature_detected!` macro. Provided by
///   the standard library, this macro will perform necessary runtime detection
///   to determine whether the CPU the program is running on supports the
///   specified feature. In this case the macro will expand to a boolean
///   expression evaluating to whether the local CPU has the AVX2 feature or
///   not (see the sketch after this list).
///
///   Note that this macro, like the `arch` module, is platform-specific. The
///   name of the macro is the same across platforms, but the arguments to the
///   macro are only the features for the current platform. For example calling
///   `is_x86_feature_detected!("avx2")` on ARM will be a compile-time
///   error. To ensure we don't hit this error a statement-level `#[cfg]` is
///   used to only compile usage of the macro on `x86`/`x86_64`.
///
/// * Next up we see our AVX2-enabled function, `foo_avx2`. This function is
///   decorated with the `#[target_feature]` attribute which enables a CPU
///   feature for just this one function. Using a compiler flag like `-C
///   target-feature=+avx2` will enable AVX2 for the entire program, but using
///   an attribute will only enable it for the one function. Usage of the
///   `#[target_feature]` attribute currently requires the function to also be
///   `unsafe`, as we see here. This is because the function can only be
///   correctly called on systems which have the AVX2 feature (like the
///   intrinsics themselves).
///
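/// Because `is_x86_feature_detected!` expands to a plain `bool` expression,
/// its result can be stored or combined with other checks. A minimal sketch
/// (the `has_simd_support` function is purely illustrative):
///
/// ```ignore
/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// fn has_simd_support() -> bool {
///     // Each invocation checks one named feature; results compose freely.
///     is_x86_feature_detected!("sse4.1") && is_x86_feature_detected!("avx2")
/// }
/// ```
///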
/// And with all that we should have a working program! This program will run
/// across all machines and it'll use the optimized AVX2 implementation on
/// machines where support is detected.
///
/// # Ergonomics
///
/// It's important to note that using the `arch` module is not the easiest
/// thing in the world, so if you're curious to try it out you may want to
/// brace yourself for some wordiness!
///
/// The primary purpose of this module is to enable stable crates on crates.io
/// to build up much more ergonomic abstractions which end up using SIMD under
/// the hood. Over time these abstractions may also move into the standard
/// library itself, but for now this module is tasked with providing the bare
/// minimum necessary to use vendor intrinsics on stable Rust.
///
/// # Other architectures
///
/// This documentation is only for one particular architecture; you can find
/// others at:
///
/// * [`x86`]
/// * [`x86_64`]
/// * [`arm`]
/// * [`aarch64`]
///
/// [`x86`]: https://rust-lang-nursery.github.io/stdsimd/i686/stdsimd/arch/x86/index.html
/// [`x86_64`]: https://rust-lang-nursery.github.io/stdsimd/x86_64/stdsimd/arch/x86_64/index.html
/// [`arm`]: https://rust-lang-nursery.github.io/stdsimd/arm/stdsimd/arch/arm/index.html
/// [`aarch64`]: https://rust-lang-nursery.github.io/stdsimd/aarch64/stdsimd/arch/aarch64/index.html
///
/// # Examples
///
/// First let's take a look at not actually using any intrinsics but instead
/// using LLVM's auto-vectorization to produce optimized vectorized code for
/// AVX2 and also for the default platform.
///
/// ```rust
/// #![feature(cfg_target_feature, target_feature, stdsimd)]
///
/// # #[cfg(not(dox))]
/// # #[macro_use]
/// # extern crate stdsimd;
///
/// fn main() {
///     let mut dst = [0];
///     add_quickly(&[1], &[2], &mut dst);
///     assert_eq!(dst[0], 3);
/// }
///
/// fn add_quickly(a: &[u8], b: &[u8], c: &mut [u8]) {
///     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
///     {
///         // Note that this `unsafe` block is safe because we're testing
///         // that the `avx2` feature is indeed available on our CPU.
///         if is_x86_feature_detected!("avx2") {
///             return unsafe { add_quickly_avx2(a, b, c) };
///         }
///     }
///
///     add_quickly_fallback(a, b, c)
/// }
///
/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// #[target_feature(enable = "avx2")]
/// unsafe fn add_quickly_avx2(a: &[u8], b: &[u8], c: &mut [u8]) {
///     add_quickly_fallback(a, b, c) // the function below is inlined here
/// }
///
/// fn add_quickly_fallback(a: &[u8], b: &[u8], c: &mut [u8]) {
///     for ((a, b), c) in a.iter().zip(b).zip(c) {
///         *c = *a + *b;
///     }
/// }
/// ```
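///
/// If you'd like to confirm which code the compiler actually produced, one
/// option (sketched here; `myprogram` is an illustrative binary name) is to
/// disassemble the release build and look for 256-bit `vpadd*` instructions:
///
/// ```sh
/// $ objdump -d target/release/myprogram | grep vpadd
/// ```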
///
/// Next up let's take a look at an example of manually using intrinsics. Here
/// we'll be using SSE4.1 features to implement hex encoding.
///
/// ```
/// #![feature(cfg_target_feature, target_feature, stdsimd)]
/// # #![cfg_attr(not(dox), no_std)]
/// # #[cfg(not(dox))]
/// # extern crate std as real_std;
/// # #[cfg(not(dox))]
/// # #[macro_use]
/// # extern crate stdsimd as std;
///
/// fn main() {
///     let mut dst = [0; 32];
///     hex_encode(b"\x01\x02\x03", &mut dst);
///     assert_eq!(&dst[..6], b"010203");
///
///     let mut src = [0; 16];
///     for i in 0..16 {
///         src[i] = (i + 1) as u8;
///     }
///     hex_encode(&src, &mut dst);
///     assert_eq!(&dst, b"0102030405060708090a0b0c0d0e0f10");
/// }
///
/// pub fn hex_encode(src: &[u8], dst: &mut [u8]) {
///     let len = src.len().checked_mul(2).unwrap();
///     assert!(dst.len() >= len);
///
///     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
///     {
///         if is_x86_feature_detected!("sse4.1") {
///             return unsafe { hex_encode_sse41(src, dst) };
///         }
///     }
///
///     hex_encode_fallback(src, dst)
/// }
///
/// // translated from https://github.com/Matherunner/bin2hex-sse/blob/master/base16_sse4.cpp
/// #[target_feature(enable = "sse4.1")]
/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// unsafe fn hex_encode_sse41(mut src: &[u8], dst: &mut [u8]) {
///     #[cfg(target_arch = "x86")]
///     use std::arch::x86::*;
///     #[cfg(target_arch = "x86_64")]
///     use std::arch::x86_64::*;
///
///     let ascii_zero = _mm_set1_epi8(b'0' as i8);
///     let nines = _mm_set1_epi8(9);
///     let ascii_a = _mm_set1_epi8((b'a' - 9 - 1) as i8);
///     let and4bits = _mm_set1_epi8(0xf);
///
///     let mut i = 0_isize;
///     while src.len() >= 16 {
///         let invec = _mm_loadu_si128(src.as_ptr() as *const _);
///
///         // extract the low and high nibble of each input byte
///         let masked1 = _mm_and_si128(invec, and4bits);
///         let masked2 = _mm_and_si128(_mm_srli_epi64(invec, 4), and4bits);
///
///         // return 0xff corresponding to the elements > 9, or 0x00 otherwise
///         let cmpmask1 = _mm_cmpgt_epi8(masked1, nines);
///         let cmpmask2 = _mm_cmpgt_epi8(masked2, nines);
///
///         // add '0' or the offset depending on the masks
///         let masked1 = _mm_add_epi8(
///             masked1,
///             _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask1),
///         );
///         let masked2 = _mm_add_epi8(
///             masked2,
///             _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask2),
///         );
///
///         // interleave masked1 and masked2 bytes (high nibble char first)
///         let res1 = _mm_unpacklo_epi8(masked2, masked1);
///         let res2 = _mm_unpackhi_epi8(masked2, masked1);
///
///         _mm_storeu_si128(dst.as_mut_ptr().offset(i * 2) as *mut _, res1);
///         _mm_storeu_si128(dst.as_mut_ptr().offset(i * 2 + 16) as *mut _, res2);
///         src = &src[16..];
///         i += 16;
///     }
///
///     // encode any remaining tail bytes with the scalar fallback
///     let i = i as usize;
///     hex_encode_fallback(src, &mut dst[i * 2..]);
/// }
///
/// fn hex_encode_fallback(src: &[u8], dst: &mut [u8]) {
///     fn hex(byte: u8) -> u8 {
///         static TABLE: &[u8] = b"0123456789abcdef";
///         TABLE[byte as usize]
///     }
///
///     for (byte, slots) in src.iter().zip(dst.chunks_mut(2)) {
///         slots[0] = hex((*byte >> 4) & 0xf);
///         slots[1] = hex(*byte & 0xf);
///     }
/// }
/// ```
#[unstable(feature = "stdsimd", issue = "0")]
pub mod arch {
    #[cfg(all(not(dox), target_arch = "x86"))]
    pub use coresimd::arch::x86;

    #[cfg(all(not(dox), target_arch = "x86_64"))]
    pub use coresimd::arch::x86_64;

    #[cfg(all(not(dox), target_arch = "arm"))]
    pub use coresimd::arch::arm;

    #[cfg(all(not(dox), target_arch = "aarch64"))]
    pub use coresimd::arch::aarch64;

    #[cfg(target_arch = "wasm32")]
    pub use coresimd::arch::wasm32;

    #[doc(hidden)] // unstable implementation detail
    pub mod detect;

    /// Platform-specific intrinsics for the `x86` platform.
    ///
    /// The documentation with the full listing of `x86` intrinsics is
    /// available in [libcore], but the module is re-exported here in std
    /// as well.
    ///
    /// [libcore]: ../../../core/arch/x86/index.html
    #[cfg(dox)]
    #[doc(cfg(target_arch = "x86"))]
    pub mod x86 {}

    /// Platform-specific intrinsics for the `x86_64` platform.
    ///
    /// The documentation with the full listing of `x86_64` intrinsics is
    /// available in [libcore], but the module is re-exported here in std
    /// as well.
    ///
    /// [libcore]: ../../../core/arch/x86_64/index.html
    #[cfg(dox)]
    #[doc(cfg(target_arch = "x86_64"))]
    pub mod x86_64 {}

    /// Platform-specific intrinsics for the `arm` platform.
    ///
    /// The documentation with the full listing of `arm` intrinsics is
    /// available in [libcore], but the module is re-exported here in std
    /// as well.
    ///
    /// [libcore]: ../../../core/arch/arm/index.html
    #[cfg(dox)]
    #[doc(cfg(target_arch = "arm"))]
    pub mod arm {}

    /// Platform-specific intrinsics for the `aarch64` platform.
    ///
    /// The documentation with the full listing of `aarch64` intrinsics is
    /// available in [libcore], but the module is re-exported here in std
    /// as well.
    ///
    /// [libcore]: ../../../core/arch/aarch64/index.html
    #[cfg(dox)]
    #[doc(cfg(target_arch = "aarch64"))]
    pub mod aarch64 {}
}

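/// Portable SIMD vector types, re-exported from `coresimd`.
///
/// A minimal usage sketch, assuming the `u32x4` type with the `new` and
/// `splat` constructors that this crate version provides:
///
/// ```ignore
/// use stdsimd::simd::u32x4;
///
/// let a = u32x4::new(1, 2, 3, 4);  // four 32-bit lanes
/// let b = u32x4::splat(10);        // broadcast 10 to every lane
/// assert_eq!(a + b, u32x4::new(11, 12, 13, 14)); // lane-wise addition
/// ```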
#[unstable(feature = "stdsimd", issue = "0")]
pub use coresimd::simd;