]>
Commit | Line | Data |
---|---|---|
f20569fa XL |
1 | //! Shuffle vector lanes with run-time indices. |
2 | ||
3 | use crate::*; | |
4 | ||
/// Lane shuffle whose source indices are only known at run time.
///
/// An implementation returns a vector of the same type as `self`, built
/// from the lanes of `self` as directed by `indices`.
pub trait Shuffle1Dyn {
    /// Vector type that carries the per-lane source indices.
    type Indices;

    /// Shuffles the lanes of `self` according to `indices`.
    fn shuffle1_dyn(self, indices: Self::Indices) -> Self;
}
9 | ||
10 | // Fallback implementation | |
/// Generates a portable, lane-by-lane `Shuffle1Dyn` implementation for `$id`.
///
/// The vector type serves as its own index type: output lane `i` receives
/// the lane of `self` selected by lane `i` of `indices`.
macro_rules! impl_fallback {
    ($id:ident) => {
        impl Shuffle1Dyn for $id {
            type Indices = Self;
            #[inline]
            fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
                // Start from an all-zero vector and overwrite one lane per
                // step; every lane is written exactly once.
                (0..$id::lanes()).fold(Self::splat(0), |acc, lane| {
                    let source = indices.extract(lane) as usize;
                    acc.replace(lane, self.extract(source))
                })
            }
        }
    };
}
27 | ||
/// Implements `Shuffle1Dyn` for an unsigned integer vector type.
///
/// * `u8x8` / `u8x16` use a byte table-lookup intrinsic when the target
///   provides one (`_mm_shuffle_pi8` / `_mm_shuffle_epi8` with SSSE3,
///   `vtbl1_u8` / `vqtbl1q_u8` / `vtbl2_u8` with NEON) and otherwise the
///   lane-by-lane `impl_fallback!`.
/// * `u16x8` is lowered onto the `u8x16` shuffle; `u32x4` / `u64x2` use
///   `_mm_permutevar_ps` / `_mm_permutevar_pd` with AVX and are otherwise
///   lowered onto the `u8x16` shuffle as well.
/// * `u128x1` has a single lane, so the shuffle is the identity.
/// * Every other type uses `impl_fallback!`.
///
/// The NEON branches are gated on `target_arch`; a misspelled cfg key here
/// would silently make the branch unreachable because unknown keys simply
/// evaluate to false.
macro_rules! impl_shuffle1_dyn {
    (u8x8) => {
        cfg_if! {
            if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"),
                         target_feature = "ssse3"))] {
                impl Shuffle1Dyn for u8x8 {
                    type Indices = Self;
                    #[inline]
                    fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
                        #[cfg(target_arch = "x86")]
                        use crate::arch::x86::_mm_shuffle_pi8;
                        #[cfg(target_arch = "x86_64")]
                        use crate::arch::x86_64::_mm_shuffle_pi8;

                        // This is safe because the binary is compiled with
                        // ssse3 enabled at compile-time and can therefore only
                        // run on CPUs that have it enabled.
                        unsafe {
                            crate::mem::transmute(
                                _mm_shuffle_pi8(
                                    crate::mem::transmute(self.0),
                                    crate::mem::transmute(indices.0)
                                )
                            )
                        }
                    }
                }
            } else if #[cfg(all(
                any(
                    all(target_arch = "aarch64", target_feature = "neon"),
                    all(target_arch = "arm", target_feature = "v7",
                        target_feature = "neon")
                ),
                any(feature = "core_arch", libcore_neon)
            ))] {
                impl Shuffle1Dyn for u8x8 {
                    type Indices = Self;
                    #[inline]
                    fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
                        #[cfg(target_arch = "aarch64")]
                        use crate::arch::aarch64::vtbl1_u8;
                        #[cfg(target_arch = "arm")]
                        use crate::arch::arm::vtbl1_u8;

                        // This is safe because the binary is compiled with
                        // neon enabled at compile-time and can therefore only
                        // run on CPUs that have it enabled.
                        unsafe {
                            Simd(crate::mem::transmute(
                                vtbl1_u8(crate::mem::transmute(self.0),
                                         crate::mem::transmute(indices.0))
                            ))
                        }
                    }
                }
            } else {
                impl_fallback!(u8x8);
            }
        }
    };
    (u8x16) => {
        cfg_if! {
            if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"),
                         target_feature = "ssse3"))] {
                impl Shuffle1Dyn for u8x16 {
                    type Indices = Self;
                    #[inline]
                    fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
                        #[cfg(target_arch = "x86")]
                        use crate::arch::x86::_mm_shuffle_epi8;
                        #[cfg(target_arch = "x86_64")]
                        use crate::arch::x86_64::_mm_shuffle_epi8;

                        // This is safe because the binary is compiled with
                        // ssse3 enabled at compile-time and can therefore only
                        // run on CPUs that have it enabled.
                        unsafe {
                            Simd(crate::mem::transmute(
                                _mm_shuffle_epi8(
                                    crate::mem::transmute(self.0),
                                    crate::mem::transmute(indices.0)
                                )
                            ))
                        }
                    }
                }
            } else if #[cfg(all(target_arch = "aarch64",
                                target_feature = "neon",
                                any(feature = "core_arch", libcore_neon)))] {
                impl Shuffle1Dyn for u8x16 {
                    type Indices = Self;
                    #[inline]
                    fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
                        use crate::arch::aarch64::vqtbl1q_u8;

                        // This is safe because the binary is compiled with
                        // neon enabled at compile-time and can therefore only
                        // run on CPUs that have it enabled.
                        unsafe {
                            Simd(crate::mem::transmute(
                                vqtbl1q_u8(crate::mem::transmute(self.0),
                                           crate::mem::transmute(indices.0))
                            ))
                        }
                    }
                }
            } else if #[cfg(all(target_arch = "arm", target_feature = "v7",
                                target_feature = "neon",
                                any(feature = "core_arch", libcore_neon)))] {
                impl Shuffle1Dyn for u8x16 {
                    type Indices = Self;
                    #[inline]
                    fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
                        use crate::arch::arm::vtbl2_u8;

                        // This is safe because the binary is compiled with
                        // neon enabled at compile-time and can therefore only
                        // run on CPUs that have it enabled. The transmutes
                        // only reinterpret 16 bytes as two 8-byte halves (and
                        // back); arrays have a guaranteed contiguous layout.
                        unsafe {
                            // `vtbl2_u8` resolves 8 indices per call against
                            // the 16-byte table formed by `self`, so the
                            // indices are processed as a low and a high half.
                            let halves: [u8x8; 2] =
                                crate::mem::transmute(indices);

                            let lo: u8x8 = crate::mem::transmute(vtbl2_u8(
                                crate::mem::transmute(self),
                                crate::mem::transmute(halves[0])
                            ));
                            let hi: u8x8 = crate::mem::transmute(vtbl2_u8(
                                crate::mem::transmute(self),
                                crate::mem::transmute(halves[1])
                            ));

                            crate::mem::transmute([lo, hi])
                        }
                    }
                }
            } else {
                impl_fallback!(u8x16);
            }
        }
    };
    (u16x8) => {
        impl Shuffle1Dyn for u16x8 {
            type Indices = Self;
            #[inline]
            fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
                // Turn each 16-bit lane index `i` into the two byte indices
                // `2 * i` and `2 * i + 1`, then shuffle the bytes.
                let indices: u8x8 = (indices * 2).cast();
                let indices: u8x16 = shuffle!(
                    indices, [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7]
                );
                let v = u8x16::new(
                    0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
                );
                let indices = indices + v;
                // Both vectors are 128 bits wide, so the transmutes only
                // reinterpret the same bytes.
                unsafe {
                    let s: u8x16 = crate::mem::transmute(self);
                    crate::mem::transmute(s.shuffle1_dyn(indices))
                }
            }
        }
    };
    (u32x4) => {
        cfg_if! {
            if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"),
                         target_feature = "avx"))] {
                impl Shuffle1Dyn for u32x4 {
                    type Indices = Self;
                    #[inline]
                    fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
                        #[cfg(target_arch = "x86")]
                        use crate::arch::x86::{_mm_permutevar_ps};
                        #[cfg(target_arch = "x86_64")]
                        use crate::arch::x86_64::{_mm_permutevar_ps};

                        // This is safe because the binary is compiled with
                        // avx enabled at compile-time and can therefore only
                        // run on CPUs that have it enabled.
                        unsafe {
                            crate::mem::transmute(
                                _mm_permutevar_ps(
                                    crate::mem::transmute(self.0),
                                    crate::mem::transmute(indices.0)
                                )
                            )
                        }
                    }
                }
            } else {
                impl Shuffle1Dyn for u32x4 {
                    type Indices = Self;
                    #[inline]
                    fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
                        // Turn each 32-bit lane index `i` into the four byte
                        // indices `4 * i ..= 4 * i + 3`, then shuffle bytes.
                        let indices: u8x4 = (indices * 4).cast();
                        let indices: u8x16 = shuffle!(
                            indices,
                            [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3]
                        );
                        let v = u8x16::new(
                            0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
                        );
                        let indices = indices + v;
                        // Both vectors are 128 bits wide, so the transmutes
                        // only reinterpret the same bytes.
                        unsafe {
                            let s: u8x16 = crate::mem::transmute(self);
                            crate::mem::transmute(s.shuffle1_dyn(indices))
                        }
                    }
                }
            }
        }
    };
    (u64x2) => {
        cfg_if! {
            if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"),
                         target_feature = "avx"))] {
                impl Shuffle1Dyn for u64x2 {
                    type Indices = Self;
                    #[inline]
                    fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
                        #[cfg(target_arch = "x86")]
                        use crate::arch::x86::{_mm_permutevar_pd};
                        #[cfg(target_arch = "x86_64")]
                        use crate::arch::x86_64::{_mm_permutevar_pd};
                        // _mm_permutevar_pd uses the _second_ bit of each
                        // element to perform the selection, that is: 0b00 => 0,
                        // 0b10 => 1:
                        let indices = indices << 1;
                        // This is safe because the binary is compiled with
                        // avx enabled at compile-time and can therefore only
                        // run on CPUs that have it enabled.
                        unsafe {
                            crate::mem::transmute(
                                _mm_permutevar_pd(
                                    crate::mem::transmute(self),
                                    crate::mem::transmute(indices)
                                )
                            )
                        }
                    }
                }
            } else {
                impl Shuffle1Dyn for u64x2 {
                    type Indices = Self;
                    #[inline]
                    fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
                        // Turn each 64-bit lane index `i` into the eight byte
                        // indices `8 * i ..= 8 * i + 7`, then shuffle bytes.
                        let indices: u8x2 = (indices * 8).cast();
                        let indices: u8x16 = shuffle!(
                            indices,
                            [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
                        );
                        let v = u8x16::new(
                            0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7
                        );
                        let indices = indices + v;
                        // Both vectors are 128 bits wide, so the transmutes
                        // only reinterpret the same bytes.
                        unsafe {
                            let s: u8x16 = crate::mem::transmute(self);
                            crate::mem::transmute(s.shuffle1_dyn(indices))
                        }
                    }
                }
            }
        }
    };
    (u128x1) => {
        impl Shuffle1Dyn for u128x1 {
            type Indices = Self;
            #[inline]
            fn shuffle1_dyn(self, _indices: Self::Indices) -> Self {
                // A single-lane vector is returned as is; the index is
                // ignored.
                self
            }
        }
    };
    ($id:ident) => { impl_fallback!($id); }
}
294 | ||
295 | impl_shuffle1_dyn!(u8x2); | |
296 | impl_shuffle1_dyn!(u8x4); | |
297 | impl_shuffle1_dyn!(u8x8); | |
298 | impl_shuffle1_dyn!(u8x16); | |
299 | impl_shuffle1_dyn!(u8x32); | |
300 | impl_shuffle1_dyn!(u8x64); | |
301 | ||
302 | impl_shuffle1_dyn!(u16x2); | |
303 | impl_shuffle1_dyn!(u16x4); | |
304 | impl_shuffle1_dyn!(u16x8); | |
305 | impl_shuffle1_dyn!(u16x16); | |
306 | impl_shuffle1_dyn!(u16x32); | |
307 | ||
308 | impl_shuffle1_dyn!(u32x2); | |
309 | impl_shuffle1_dyn!(u32x4); | |
310 | impl_shuffle1_dyn!(u32x8); | |
311 | impl_shuffle1_dyn!(u32x16); | |
312 | ||
313 | impl_shuffle1_dyn!(u64x2); | |
314 | impl_shuffle1_dyn!(u64x4); | |
315 | impl_shuffle1_dyn!(u64x8); | |
316 | ||
317 | impl_shuffle1_dyn!(usizex2); | |
318 | impl_shuffle1_dyn!(usizex4); | |
319 | impl_shuffle1_dyn!(usizex8); | |
320 | ||
321 | impl_shuffle1_dyn!(u128x1); | |
322 | impl_shuffle1_dyn!(u128x2); | |
323 | impl_shuffle1_dyn!(u128x4); | |
324 | ||
325 | // Implementation for non-unsigned vector types | |
/// Implements `Shuffle1Dyn` for `$id` by delegating to the implementation
/// of the unsigned vector type `$uid`, which also serves as the index type.
///
/// `$id` and `$uid` must have the same size: the value is reinterpreted as
/// `$uid`, shuffled there, and reinterpreted back.
macro_rules! impl_shuffle1_dyn_non_u {
    ($id:ident, $uid:ident) => {
        impl Shuffle1Dyn for $id {
            type Indices = $uid;
            #[inline]
            fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
                // The transmutes only reinterpret the bits; `transmute`
                // itself rejects the pair at compile time if the sizes
                // differ.
                unsafe {
                    let bits: $uid = crate::mem::transmute(self);
                    let shuffled = bits.shuffle1_dyn(indices);
                    crate::mem::transmute(shuffled)
                }
            }
        }
    };
}
340 | ||
341 | impl_shuffle1_dyn_non_u!(i8x2, u8x2); | |
342 | impl_shuffle1_dyn_non_u!(i8x4, u8x4); | |
343 | impl_shuffle1_dyn_non_u!(i8x8, u8x8); | |
344 | impl_shuffle1_dyn_non_u!(i8x16, u8x16); | |
345 | impl_shuffle1_dyn_non_u!(i8x32, u8x32); | |
346 | impl_shuffle1_dyn_non_u!(i8x64, u8x64); | |
347 | ||
348 | impl_shuffle1_dyn_non_u!(i16x2, u16x2); | |
349 | impl_shuffle1_dyn_non_u!(i16x4, u16x4); | |
350 | impl_shuffle1_dyn_non_u!(i16x8, u16x8); | |
351 | impl_shuffle1_dyn_non_u!(i16x16, u16x16); | |
352 | impl_shuffle1_dyn_non_u!(i16x32, u16x32); | |
353 | ||
354 | impl_shuffle1_dyn_non_u!(i32x2, u32x2); | |
355 | impl_shuffle1_dyn_non_u!(i32x4, u32x4); | |
356 | impl_shuffle1_dyn_non_u!(i32x8, u32x8); | |
357 | impl_shuffle1_dyn_non_u!(i32x16, u32x16); | |
358 | ||
359 | impl_shuffle1_dyn_non_u!(i64x2, u64x2); | |
360 | impl_shuffle1_dyn_non_u!(i64x4, u64x4); | |
361 | impl_shuffle1_dyn_non_u!(i64x8, u64x8); | |
362 | ||
363 | impl_shuffle1_dyn_non_u!(isizex2, usizex2); | |
364 | impl_shuffle1_dyn_non_u!(isizex4, usizex4); | |
365 | impl_shuffle1_dyn_non_u!(isizex8, usizex8); | |
366 | ||
367 | impl_shuffle1_dyn_non_u!(i128x1, u128x1); | |
368 | impl_shuffle1_dyn_non_u!(i128x2, u128x2); | |
369 | impl_shuffle1_dyn_non_u!(i128x4, u128x4); | |
370 | ||
371 | impl_shuffle1_dyn_non_u!(m8x2, u8x2); | |
372 | impl_shuffle1_dyn_non_u!(m8x4, u8x4); | |
373 | impl_shuffle1_dyn_non_u!(m8x8, u8x8); | |
374 | impl_shuffle1_dyn_non_u!(m8x16, u8x16); | |
375 | impl_shuffle1_dyn_non_u!(m8x32, u8x32); | |
376 | impl_shuffle1_dyn_non_u!(m8x64, u8x64); | |
377 | ||
378 | impl_shuffle1_dyn_non_u!(m16x2, u16x2); | |
379 | impl_shuffle1_dyn_non_u!(m16x4, u16x4); | |
380 | impl_shuffle1_dyn_non_u!(m16x8, u16x8); | |
381 | impl_shuffle1_dyn_non_u!(m16x16, u16x16); | |
382 | impl_shuffle1_dyn_non_u!(m16x32, u16x32); | |
383 | ||
384 | impl_shuffle1_dyn_non_u!(m32x2, u32x2); | |
385 | impl_shuffle1_dyn_non_u!(m32x4, u32x4); | |
386 | impl_shuffle1_dyn_non_u!(m32x8, u32x8); | |
387 | impl_shuffle1_dyn_non_u!(m32x16, u32x16); | |
388 | ||
389 | impl_shuffle1_dyn_non_u!(m64x2, u64x2); | |
390 | impl_shuffle1_dyn_non_u!(m64x4, u64x4); | |
391 | impl_shuffle1_dyn_non_u!(m64x8, u64x8); | |
392 | ||
393 | impl_shuffle1_dyn_non_u!(msizex2, usizex2); | |
394 | impl_shuffle1_dyn_non_u!(msizex4, usizex4); | |
395 | impl_shuffle1_dyn_non_u!(msizex8, usizex8); | |
396 | ||
397 | impl_shuffle1_dyn_non_u!(m128x1, u128x1); | |
398 | impl_shuffle1_dyn_non_u!(m128x2, u128x2); | |
399 | impl_shuffle1_dyn_non_u!(m128x4, u128x4); | |
400 | ||
401 | impl_shuffle1_dyn_non_u!(f32x2, u32x2); | |
402 | impl_shuffle1_dyn_non_u!(f32x4, u32x4); | |
403 | impl_shuffle1_dyn_non_u!(f32x8, u32x8); | |
404 | impl_shuffle1_dyn_non_u!(f32x16, u32x16); | |
405 | ||
406 | impl_shuffle1_dyn_non_u!(f64x2, u64x2); | |
407 | impl_shuffle1_dyn_non_u!(f64x4, u64x4); | |
408 | impl_shuffle1_dyn_non_u!(f64x8, u64x8); | |
409 | ||
410 | // Implementation for pointer vector types | |
/// Implements `Shuffle1Dyn` for the generic pointer vector `$id<T>` by
/// delegating to the implementation of the unsigned vector type `$uid`,
/// which also serves as the index type.
///
/// `$id<T>` and `$uid` must have the same size: the value is reinterpreted
/// as `$uid`, shuffled there, and reinterpreted back.
macro_rules! impl_shuffle1_dyn_ptr {
    ($id:ident, $uid:ident) => {
        impl<T> Shuffle1Dyn for $id<T> {
            type Indices = $uid;
            #[inline]
            fn shuffle1_dyn(self, indices: Self::Indices) -> Self {
                // The transmutes only reinterpret the bits; `transmute`
                // itself rejects the pair at compile time if the sizes
                // differ.
                unsafe {
                    let bits: $uid = crate::mem::transmute(self);
                    let shuffled = bits.shuffle1_dyn(indices);
                    crate::mem::transmute(shuffled)
                }
            }
        }
    };
}
425 | ||
426 | impl_shuffle1_dyn_ptr!(cptrx2, usizex2); | |
427 | impl_shuffle1_dyn_ptr!(cptrx4, usizex4); | |
428 | impl_shuffle1_dyn_ptr!(cptrx8, usizex8); | |
429 | ||
430 | impl_shuffle1_dyn_ptr!(mptrx2, usizex2); | |
431 | impl_shuffle1_dyn_ptr!(mptrx4, usizex4); | |
432 | impl_shuffle1_dyn_ptr!(mptrx8, usizex8); |