1 //! # Portable packed SIMD vectors
3 //! This crate is proposed for stabilization as `std::packed_simd` in [RFC2366:
4 //! `std::simd`](https://github.com/rust-lang/rfcs/pull/2366) .
6 //! The examples available in the
7 //! [`examples/`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples)
8 //! sub-directory of the crate showcase how to use the library in practice.
10 //! ## Table of contents
12 //! - [Introduction](#introduction)
13 //! - [Vector types](#vector-types)
14 //! - [Conditional operations](#conditional-operations)
15 //! - [Conversions](#conversions)
17 //! guide](https://rust-lang-nursery.github.io/packed_simd/perf-guide/)
21 //! This crate exports [`Simd<[T; N]>`][`Simd`]: a packed vector of `N`
22 //! elements of type `T` as well as many type aliases for this type: for
23 //! example, [`f32x4`], which is just an alias for `Simd<[f32; 4]>`.
25 //! The operations on packed vectors are, by default, "vertical", that is, they
26 //! are applied to each vector lane in isolation of the others:
29 //! # use packed_simd::*;
30 //! let a = i32x4::new(1, 2, 3, 4);
31 //! let b = i32x4::new(5, 6, 7, 8);
32 //! assert_eq!(a + b, i32x4::new(6, 8, 10, 12));
35 //! Many "horizontal" operations are also provided:
38 //! # use packed_simd::*;
39 //! # let a = i32x4::new(1, 2, 3, 4);
40 //! assert_eq!(a.wrapping_sum(), 10);
43 //! In virtually all architectures vertical operations are fast, while
44 //! horizontal operations are, by comparison, much slower. That is, the
45 //! most portably-efficient way of performing a reduction over a slice
46 //! is to collect the results into a vector using vertical operations,
47 //! and performing a single horizontal operation at the end:
50 //! # use packed_simd::*;
51 //! fn reduce(x: &[i32]) -> i32 {
52 //! assert!(x.len() % 4 == 0);
53 //! let mut sum = i32x4::splat(0); // [0, 0, 0, 0]
54 //! for i in (0..x.len()).step_by(4) {
55 //! sum += i32x4::from_slice_unaligned(&x[i..]);
57 //! sum.wrapping_sum()
60 //! let x = [0, 1, 2, 3, 4, 5, 6, 7];
61 //! assert_eq!(reduce(&x), 28);
66 //! The vector type aliases are named according to the following scheme:
68 //! > `{element_type}x{number_of_lanes} == Simd<[element_type;
69 //! number_of_lanes]>`
71 //! where the following element types are supported:
73 //! * `i{element_width}`: signed integer
74 //! * `u{element_width}`: unsigned integer
75 //! * `f{element_width}`: float
76 //! * `m{element_width}`: mask (see below)
77 //! * `*{const,mut} T`: `const` and `mut` pointers
79 //! ## Basic operations
82 //! # use packed_simd::*;
83 //! // Sets all elements to `0`:
84 //! let a = i32x4::splat(0);
86 //! // Reads a vector from a slice:
87 //! let mut arr = [0, 0, 0, 1, 2, 3, 4, 5];
88 //! let b = i32x4::from_slice_unaligned(&arr);
90 //! // Reads the 4-th element of a vector:
91 //! assert_eq!(b.extract(3), 1);
93 //! // Returns a new vector where the 4-th element is replaced with `1`:
94 //! let a = a.replace(3, 1);
97 //! // Writes a vector to a slice:
98 //! let a = a.replace(2, 1);
99 //! a.write_to_slice_unaligned(&mut arr[4..]);
100 //! assert_eq!(arr, [0, 0, 0, 1, 0, 0, 1, 1]);
103 //! ## Conditional operations
105 //! One often needs to perform an operation on some lanes of the vector. Vector
106 //! masks, like `m32x4`, allow selecting on which vector lanes an operation is
110 //! # use packed_simd::*;
111 //! let a = i32x4::new(1, 1, 2, 2);
113 //! // Add `1` to the first two lanes of the vector.
114 //! let m = m16x4::new(true, true, false, false);
115 //! let a = m.select(a + 1, a);
116 //! assert_eq!(a, i32x4::splat(2));
119 //! The elements of a vector mask are either `true` or `false`. Here `true`
120 //! means that a lane is "selected", while `false` means that a lane is not
123 //! All vector masks implement a `mask.select(a: T, b: T) -> T` method that
124 //! works on all vectors that have the same number of lanes as the mask. The
125 //! resulting vector contains the elements of `a` for those lanes for which the
126 //! mask is `true`, and the elements of `b` otherwise.
128 //! The example constructs a mask with the first two lanes set to `true` and
129 //! the last two lanes set to `false`. This selects the first two lanes of `a +
130 //! 1` and the last two lanes of `a`, producing a vector where the first two
131 //! lanes have been incremented by `1`.
133 //! > note: mask `select` can be used on vector types that have the same number
134 //! > of lanes as the mask. The example shows this by using [`m16x4`] instead
135 //! > of [`m32x4`]. It is _typically_ more performant to use a mask element
136 //! > width equal to the element width of the vectors being operated upon.
137 //! > This is, however, not true for 512-bit wide vectors when targeting
138 //! > AVX-512, where the most efficient masks use only 1-bit per element.
140 //! All vertical comparison operations return masks:
143 //! # use packed_simd::*;
144 //! let a = i32x4::new(1, 1, 3, 3);
145 //! let b = i32x4::new(2, 2, 0, 0);
147 //! // ge: >= (Greater Equal; see also lt, le, gt, eq, ne).
148 //! let m = a.ge(i32x4::splat(2));
151 //! // all / any / none allow coherent control flow
152 //! let d = m.select(a, b);
153 //! assert_eq!(d, i32x4::new(2, 2, 3, 3));
159 //! * **lossless widening conversions**: [`From`]/[`Into`] are implemented for
160 //! vectors with the same number of lanes when the conversion is value
161 //! preserving (same as in `std`).
163 //! * **safe bitwise conversions**: The cargo feature `into_bits` provides the
164 //! `IntoBits/FromBits` traits (`x.into_bits()`). These perform safe bitwise
165 //! `transmute`s when all bit patterns of the source type are valid bit
166 //! patterns of the target type and are also implemented for the
167 //! architecture-specific vector types of `std::arch`. For example, `let x:
168 //! u8x8 = m8x8::splat(true).into_bits();` is provided because all `m8x8` bit
169 //! patterns are valid `u8x8` bit patterns. However, the opposite is not
170 //! true, not all `u8x8` bit patterns are valid `m8x8` bit-patterns, so this
171 //! operation cannot be performed safely using `x.into_bits()`; one needs to
172 //! use `unsafe { crate::mem::transmute(x) }` for that, making sure that the
173 //! value in the `u8x8` is a valid bit-pattern of `m8x8`.
175 //! * **numeric casts** (`as`): are performed using [`FromCast`]/[`Cast`]
176 //! (`x.cast()`), just like `as`:
178 //! * casting integer vectors whose lane types have the same size (e.g.
179 //! `i32xN` -> `u32xN`) is a **no-op**,
181 //! * casting from a larger integer to a smaller integer (e.g. `u32xN` ->
182 //! `u8xN`) will **truncate**,
184 //! * casting from a smaller integer to a larger integer (e.g. `u8xN` ->
186 //! * **zero-extend** if the source is unsigned, or
187 //! * **sign-extend** if the source is signed,
189 //! * casting from a float to an integer will **round the float towards
192 //! * casting from an integer to float will produce the floating point
193 //! representation of the integer, **rounding to nearest, ties to even**,
195 //! * casting from an `f32` to an `f64` is perfect and lossless,
197 //! * casting from an `f64` to an `f32` **rounds to nearest, ties to even**.
199 //! Numeric casts are not very "precise": sometimes lossy, sometimes value
208 aarch64_target_feature
,
210 link_llvm_intrinsics
,
212 stmt_expr_attributes
,
213 crate_visibility_modifier
,
214 custom_inner_attributes
,
217 #![allow(non_camel_case_types, non_snake_case,
218 // FIXME: these types are unsound in C FFI already
219 // See https://github.com/rust-lang/rust/issues/53346
220 improper_ctypes_definitions
,
221 clippy
::cast_possible_truncation
,
222 clippy
::cast_lossless
,
223 clippy
::cast_possible_wrap
,
224 clippy
::cast_precision_loss
,
225 // TODO: manually add the `#[must_use]` attribute where appropriate
226 clippy
::must_use_candidate
,
227 // This lint is currently broken for generic code
228 // See https://github.com/rust-lang/rust-clippy/issues/3410
230 clippy
::wrong_self_convention
,
232 #![cfg_attr(test, feature(hashmap_internals))]
233 #![deny(rust_2018_idioms, clippy::missing_inline_in_public_items)]
239 if #[cfg(feature = "core_arch")] {
240 #[allow(unused_imports)]
241 use core_arch
as arch
;
243 #[allow(unused_imports)]
248 #[cfg(all(target_arch = "wasm32", test))]
249 use wasm_bindgen_test
::*;
251 #[allow(unused_imports)]
253 /* arch (handled above), */ cmp
, f32, f64, fmt
, hash
, hint
, i128
,
254 i16, i32, i64, i8, intrinsics
, isize, iter
, marker
, mem
, ops
, ptr
, slice
,
255 u128
, u16, u32, u64, u8, usize,
265 pub use crate::sealed
::{Simd as SimdVector, Shuffle, SimdArray, Mask}
;
267 /// Packed SIMD vector type.
272 /// # use packed_simd::Simd;
273 /// let v = Simd::<[i32; 4]>::new(0, 1, 2, 3);
274 /// assert_eq!(v.extract(2), 2);
277 #[derive(Copy, Clone)]
278 pub struct Simd
<A
: sealed
::SimdArray
>(
279 // FIXME: this type should be private,
280 // but it currently must be public for the
281 // `shuffle!` macro to work: it needs to
282 // access the internal `repr(simd)` type
283 // to call the shuffle intrinsics.
284 #[doc(hidden)] pub <A as sealed::SimdArray>::Tuple,
287 impl<A
: sealed
::SimdArray
> sealed
::Seal
for Simd
<A
> {}
/// Wrapper over `T` implementing a lexicographical order via the
/// `PartialOrd` and/or `Ord` traits.
#[derive(Copy, Clone, Debug)]
#[allow(clippy::missing_inline_in_public_items)]
pub struct LexicographicallyOrdered<T>(T);
297 pub use self::masks
::*;
300 pub use self::v16
::*;
303 pub use self::v32
::*;
306 pub use self::v64
::*;
309 pub use self::v128
::*;
312 pub use self::v256
::*;
315 pub use self::v512
::*;
318 pub use self::vSize
::*;
321 pub use self::vPtr
::*;
323 pub use self::api
::cast
::*;
325 #[cfg(feature = "into_bits")]
326 pub use self::api
::into_bits
::*;
328 // Re-export the shuffle intrinsics required by the `shuffle!` macro.
330 pub use self::codegen
::llvm
::{
331 __shuffle_vector16
, __shuffle_vector2
, __shuffle_vector32
,
332 __shuffle_vector4
, __shuffle_vector64
, __shuffle_vector8
,
336 crate use crate::codegen
::llvm
::*;