1 //! # Portable packed SIMD vectors
3 //! This crate is proposed for stabilization as `std::packed_simd` in [RFC2366:
4 //! `std::simd`](https://github.com/rust-lang/rfcs/pull/2366) .
6 //! The examples available in the
7 //! [`examples/`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples)
8 //! sub-directory of the crate showcase how to use the library in practice.
10 //! ## Table of contents
12 //! - [Introduction](#introduction)
13 //! - [Vector types](#vector-types)
14 //! - [Conditional operations](#conditional-operations)
15 //! - [Conversions](#conversions)
17 //! guide](https://rust-lang-nursery.github.io/packed_simd/perf-guide/)
21 //! This crate exports [`Simd<[T; N]>`][`Simd`]: a packed vector of `N`
22 //! elements of type `T` as well as many type aliases for this type: for
23 //! example, [`f32x4`], which is just an alias for `Simd<[f32; 4]>`.
25 //! The operations on packed vectors are, by default, "vertical", that is, they
26 //! are applied to each vector lane in isolation of the others:
29 //! # use packed_simd::*;
30 //! let a = i32x4::new(1, 2, 3, 4);
31 //! let b = i32x4::new(5, 6, 7, 8);
32 //! assert_eq!(a + b, i32x4::new(6, 8, 10, 12));
35 //! Many "horizontal" operations are also provided:
38 //! # use packed_simd::*;
39 //! # let a = i32x4::new(1, 2, 3, 4);
40 //! assert_eq!(a.wrapping_sum(), 10);
43 //! In virtually all architectures vertical operations are fast, while
44 //! horizontal operations are, by comparison, much slower. That is, the
45 //! most portably-efficient way of performing a reduction over a slice
46 //! is to collect the results into a vector using vertical operations,
47 //! and performing a single horizontal operation at the end:
50 //! # use packed_simd::*;
51 //! fn reduce(x: &[i32]) -> i32 {
52 //! assert!(x.len() % 4 == 0);
53 //! let mut sum = i32x4::splat(0); // [0, 0, 0, 0]
54 //! for i in (0..x.len()).step_by(4) {
55 //! sum += i32x4::from_slice_unaligned(&x[i..]);
57 //! sum.wrapping_sum()
60 //! let x = [0, 1, 2, 3, 4, 5, 6, 7];
61 //! assert_eq!(reduce(&x), 28);
66 //! The vector type aliases are named according to the following scheme:
68 //! > `{element_type}x{number_of_lanes} == Simd<[element_type;
69 //! number_of_lanes]>`
71 //! where the following element types are supported:
73 //! * `i{element_width}`: signed integer
74 //! * `u{element_width}`: unsigned integer
75 //! * `f{element_width}`: float
76 //! * `m{element_width}`: mask (see below)
77 //! * `*{const,mut} T`: `const` and `mut` pointers
79 //! ## Basic operations
82 //! # use packed_simd::*;
83 //! // Sets all elements to `0`:
84 //! let a = i32x4::splat(0);
86 //! // Reads a vector from a slice:
87 //! let mut arr = [0, 0, 0, 1, 2, 3, 4, 5];
88 //! let b = i32x4::from_slice_unaligned(&arr);
90 //! // Reads the 4-th element of a vector:
91 //! assert_eq!(b.extract(3), 1);
93 //! // Returns a new vector where the 4-th element is replaced with `1`:
94 //! let a = a.replace(3, 1);
97 //! // Writes a vector to a slice:
98 //! let a = a.replace(2, 1);
99 //! a.write_to_slice_unaligned(&mut arr[4..]);
100 //! assert_eq!(arr, [0, 0, 0, 1, 0, 0, 1, 1]);
103 //! ## Conditional operations
105 //! One often needs to perform an operation on some lanes of the vector. Vector
106 //! masks, like `m32x4`, allow selecting on which vector lanes an operation is
110 //! # use packed_simd::*;
111 //! let a = i32x4::new(1, 1, 2, 2);
113 //! // Add `1` to the first two lanes of the vector.
114 //! let m = m16x4::new(true, true, false, false);
115 //! let a = m.select(a + 1, a);
116 //! assert_eq!(a, i32x4::splat(2));
119 //! The elements of a vector mask are either `true` or `false`. Here `true`
120 //! means that a lane is "selected", while `false` means that a lane is not
123 //! All vector masks implement a `mask.select(a: T, b: T) -> T` method that
124 //! works on all vectors that have the same number of lanes as the mask. The
125 //! resulting vector contains the elements of `a` for those lanes for which the
126 //! mask is `true`, and the elements of `b` otherwise.
128 //! The example constructs a mask with the first two lanes set to `true` and
129 //! the last two lanes set to `false`. This selects the first two lanes of `a +
130 //! 1` and the last two lanes of `a`, producing a vector where the first two
131 //! lanes have been incremented by `1`.
133 //! > note: mask `select` can be used on vector types that have the same number
134 //! > of lanes as the mask. The example shows this by using [`m16x4`] instead
135 //! > of [`m32x4`]. It is _typically_ more performant to use a mask element
136 //! > width equal to the element width of the vectors being operated upon.
137 //! > This is, however, not true for 512-bit wide vectors when targeting
138 //! > AVX-512, where the most efficient masks use only 1-bit per element.
140 //! All vertical comparison operations return masks:
143 //! # use packed_simd::*;
144 //! let a = i32x4::new(1, 1, 3, 3);
145 //! let b = i32x4::new(2, 2, 0, 0);
147 //! // ge: >= (Greater Equal; see also lt, le, gt, eq, ne).
148 //! let m = a.ge(i32x4::splat(2));
151 //! // all / any / none allow coherent control flow
152 //! let d = m.select(a, b);
153 //! assert_eq!(d, i32x4::new(2, 2, 3, 3));
159 //! * **lossless widening conversions**: [`From`]/[`Into`] are implemented for
160 //! vectors with the same number of lanes when the conversion is value
161 //! preserving (same as in `std`).
163 //! * **safe bitwise conversions**: The cargo feature `into_bits` provides the
164 //! `IntoBits/FromBits` traits (`x.into_bits()`). These perform safe bitwise
165 //! `transmute`s when all bit patterns of the source type are valid bit
166 //! patterns of the target type and are also implemented for the
167 //! architecture-specific vector types of `std::arch`. For example, `let x:
168 //! u8x8 = m8x8::splat(true).into_bits();` is provided because all `m8x8` bit
169 //! patterns are valid `u8x8` bit patterns. However, the opposite is not
170 //! true, not all `u8x8` bit patterns are valid `m8x8` bit-patterns, so this
171 //! operation cannot be performed safely using `x.into_bits()`; one needs to
172 //! use `unsafe { crate::mem::transmute(x) }` for that, making sure that the
173 //! value in the `u8x8` is a valid bit-pattern of `m8x8`.
175 //! * **numeric casts** (`as`): are performed using [`FromCast`]/[`Cast`]
176 //! (`x.cast()`), just like `as`:
178 //! * casting integer vectors whose lane types have the same size (e.g.
179 //! `i32xN` -> `u32xN`) is a **no-op**,
181 //! * casting from a larger integer to a smaller integer (e.g. `u32xN` ->
182 //! `u8xN`) will **truncate**,
184 //! * casting from a smaller integer to a larger integer (e.g. `u8xN` ->
186 //! * **zero-extend** if the source is unsigned, or
187 //! * **sign-extend** if the source is signed,
189 //! * casting from a float to an integer will **round the float towards
192 //! * casting from an integer to float will produce the floating point
193 //! representation of the integer, **rounding to nearest, ties to even**,
195 //! * casting from an `f32` to an `f64` is perfect and lossless,
197 //! * casting from an `f64` to an `f32` **rounds to nearest, ties to even**.
199 //! Numeric casts are not very "precise": sometimes lossy, sometimes value
208 aarch64_target_feature
,
210 link_llvm_intrinsics
,
212 stmt_expr_attributes
,
213 crate_visibility_modifier
,
214 custom_inner_attributes
,
217 #![allow(non_camel_case_types, non_snake_case,
218 // FIXME: these types are unsound in C FFI already
219 // See https://github.com/rust-lang/rust/issues/53346
220 improper_ctypes_definitions
,
221 clippy
::cast_possible_truncation
,
222 clippy
::cast_lossless
,
223 clippy
::cast_possible_wrap
,
224 clippy
::cast_precision_loss
,
225 // TODO: manually add the `#[must_use]` attribute where appropriate
226 clippy
::must_use_candidate
,
227 // This lint is currently broken for generic code
228 // See https://github.com/rust-lang/rust-clippy/issues/3410
230 clippy
::wrong_self_convention
,
232 #![cfg_attr(test, feature(hashmap_internals))]
233 #![deny(rust_2018_idioms, clippy::missing_inline_in_public_items)]
239 if #[cfg(feature = "core_arch")] {
240 #[allow(unused_imports)]
241 use core_arch
as arch
;
243 #[allow(unused_imports)]
248 #[cfg(all(target_arch = "wasm32", test))]
249 use wasm_bindgen_test
::*;
251 #[allow(unused_imports)]
253 /* arch (handled above), */ cmp
, f32, f64, fmt
, hash
, hint
, i128
,
254 i16, i32, i64, i8, intrinsics
, isize, iter
, marker
, mem
, ops
, ptr
, slice
,
255 u128
, u16, u32, u64, u8, usize,
265 pub use crate::sealed
::{Simd as SimdVector, Shuffle, SimdArray, Mask}
;
267 /// Packed SIMD vector type.
272 /// # use packed_simd::Simd;
273 /// let v = Simd::<[i32; 4]>::new(0, 1, 2, 3);
274 /// assert_eq!(v.extract(2), 2);
277 #[derive(Copy, Clone)]
278 pub struct Simd
<A
: sealed
::SimdArray
>(
279 // FIXME: this type should be private,
280 // but it currently must be public for the
281 // `shuffle!` macro to work: it needs to
282 // access the internal `repr(simd)` type
283 // to call the shuffle intrinsics.
284 #[doc(hidden)] pub <A as sealed::SimdArray>::Tuple,
287 impl<A
: sealed
::SimdArray
> sealed
::Seal
for Simd
<A
> {}
/// Wrapper over `T` implementing a lexicographical order via the
/// `PartialOrd` and/or `Ord` traits.
#[derive(Copy, Clone, Debug)]
#[allow(clippy::missing_inline_in_public_items)]
pub struct LexicographicallyOrdered<T>(T);
297 pub use self::masks
::*;
300 pub use self::v16
::*;
303 pub use self::v32
::*;
306 pub use self::v64
::*;
309 pub use self::v128
::*;
312 pub use self::v256
::*;
315 pub use self::v512
::*;
318 pub use self::vSize
::*;
321 pub use self::vPtr
::*;
323 pub use self::api
::cast
::*;
325 #[cfg(feature = "into_bits")]
326 pub use self::api
::into_bits
::*;
328 // Re-export the shuffle intrinsics required by the `shuffle!` macro.
330 pub use self::codegen
::llvm
::{
331 __shuffle_vector16
, __shuffle_vector2
, __shuffle_vector32
,
332 __shuffle_vector4
, __shuffle_vector64
, __shuffle_vector8
,
336 crate use crate::codegen
::llvm
::*;