// vendor/ppv-lite86/src/x86_64/sse2.rs (rustc 1.38.0 vendored sources)
use crate::soft::{x2, x4};
use crate::types::*;
use crate::vec128_storage;
use crate::x86_64::Avx2Machine;
use crate::x86_64::SseMachine as Machine86;
use crate::x86_64::{NoS3, NoS4, YesS3, YesS4};
use core::arch::x86_64::*;
use core::marker::PhantomData;
use core::ops::{
    Add, AddAssign, BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Not,
};

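// Editorial gloss (not upstream doc comments): S3, S4 and NI are compile-time
// feature markers, resolved to YesS3/NoS3 (SSSE3) and YesS4/NoS4 (SSE4.1) by
// the Machine that constructs these vectors; NI plays the analogous role for
// the AES-NI-dependent types elsewhere in the crate. Each impl below selects
// the best instruction sequence that its marker set allows.
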
macro_rules! impl_binop {
    ($vec:ident, $trait:ident, $fn:ident, $impl_fn:ident) => {
        impl<S3, S4, NI> $trait for $vec<S3, S4, NI> {
            type Output = Self;
            #[inline(always)]
            fn $fn(self, rhs: Self) -> Self::Output {
                Self::new(unsafe { $impl_fn(self.x, rhs.x) })
            }
        }
    };
}

macro_rules! impl_binop_assign {
    ($vec:ident, $trait:ident, $fn_assign:ident, $fn:ident) => {
        impl<S3, S4, NI> $trait for $vec<S3, S4, NI>
        where
            $vec<S3, S4, NI>: Copy,
        {
            #[inline(always)]
            fn $fn_assign(&mut self, rhs: Self) {
                *self = self.$fn(rhs);
            }
        }
    };
}

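// All three SSE2 vector types defined below are thin newtypes over a single
// __m128i register; the PhantomData fields only carry the feature markers
// through the type system and have no runtime representation.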
macro_rules! def_vec {
    ($vec:ident, $word:ident) => {
        #[allow(non_camel_case_types)]
        #[derive(Copy, Clone)]
        pub struct $vec<S3, S4, NI> {
            x: __m128i,
            s3: PhantomData<S3>,
            s4: PhantomData<S4>,
            ni: PhantomData<NI>,
        }

        impl<S3, S4, NI> Store<vec128_storage> for $vec<S3, S4, NI> {
            #[inline(always)]
            unsafe fn unpack(x: vec128_storage) -> Self {
                Self::new(x.sse2)
            }
        }
        impl<S3, S4, NI> From<$vec<S3, S4, NI>> for vec128_storage {
            #[inline(always)]
            fn from(x: $vec<S3, S4, NI>) -> Self {
                vec128_storage { sse2: x.x }
            }
        }
        impl<S3, S4, NI> $vec<S3, S4, NI> {
            #[inline(always)]
            fn new(x: __m128i) -> Self {
                $vec {
                    x,
                    s3: PhantomData,
                    s4: PhantomData,
                    ni: PhantomData,
                }
            }
        }

        impl<S3, S4, NI> StoreBytes for $vec<S3, S4, NI>
        where
            Self: BSwap,
        {
            #[inline(always)]
            unsafe fn unsafe_read_le(input: &[u8]) -> Self {
                assert_eq!(input.len(), 16);
                Self::new(_mm_loadu_si128(input.as_ptr() as *const _))
            }
            #[inline(always)]
            unsafe fn unsafe_read_be(input: &[u8]) -> Self {
                assert_eq!(input.len(), 16);
                Self::new(_mm_loadu_si128(input.as_ptr() as *const _)).bswap()
            }
            #[inline(always)]
            fn write_le(self, out: &mut [u8]) {
                assert_eq!(out.len(), 16);
                unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut _, self.x) }
            }
            #[inline(always)]
            fn write_be(self, out: &mut [u8]) {
                assert_eq!(out.len(), 16);
                let x = self.bswap().x;
                unsafe {
                    _mm_storeu_si128(out.as_mut_ptr() as *mut _, x);
                }
            }
        }

        impl<S3, S4, NI> Default for $vec<S3, S4, NI> {
            #[inline(always)]
            fn default() -> Self {
                Self::new(unsafe { _mm_setzero_si128() })
            }
        }

        impl<S3, S4, NI> Not for $vec<S3, S4, NI> {
            type Output = Self;
            #[inline(always)]
            fn not(self) -> Self::Output {
                unsafe {
                    let ff = _mm_set1_epi64x(-1i64);
                    self ^ Self::new(ff)
                }
            }
        }

        impl<S3: Copy, S4: Copy, NI: Copy> BitOps0 for $vec<S3, S4, NI> {}
        impl_binop!($vec, BitAnd, bitand, _mm_and_si128);
        impl_binop!($vec, BitOr, bitor, _mm_or_si128);
        impl_binop!($vec, BitXor, bitxor, _mm_xor_si128);
        impl_binop_assign!($vec, BitAndAssign, bitand_assign, bitand);
        impl_binop_assign!($vec, BitOrAssign, bitor_assign, bitor);
        impl_binop_assign!($vec, BitXorAssign, bitxor_assign, bitxor);
        impl<S3: Copy, S4: Copy, NI: Copy> AndNot for $vec<S3, S4, NI> {
            type Output = Self;
            #[inline(always)]
            fn andnot(self, rhs: Self) -> Self {
                Self::new(unsafe { _mm_andnot_si128(self.x, rhs.x) })
            }
        }
    };
}

macro_rules! impl_bitops32 {
    ($vec:ident) => {
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps32 for $vec<S3, S4, NI> where
            $vec<S3, S4, NI>: RotateEachWord32
        {
        }
    };
}

macro_rules! impl_bitops64 {
    ($vec:ident) => {
        impl_bitops32!($vec);
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps64 for $vec<S3, S4, NI> where
            $vec<S3, S4, NI>: RotateEachWord64 + RotateEachWord32
        {
        }
    };
}

macro_rules! impl_bitops128 {
    ($vec:ident) => {
        impl_bitops64!($vec);
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps128 for $vec<S3, S4, NI> where
            $vec<S3, S4, NI>: RotateEachWord128
        {
        }
    };
}

macro_rules! rotr_32_s3 {
    ($name:ident, $k0:expr, $k1:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) })
        }
    };
}
macro_rules! rotr_32 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_epi32(self.x, $i as i32),
                    _mm_slli_epi32(self.x, 32 - $i as i32),
                )
            })
        }
    };
}
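// Two rotation strategies: rotations by a multiple of 8 bits are pure byte
// permutations, so with SSSE3 they compile to a single pshufb with the masks
// given to rotr_32_s3!. On bare SSE2 the generic fallback is the classic
// shift-shift-or identity rotr(x, n) == (x >> n) | (x << (32 - n)); the
// 16-bit case can still use a word shuffle (see swap16_s2 below).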
impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<YesS3, S4, NI> {
    rotr_32!(rotate_each_word_right7, 7);
    rotr_32_s3!(
        rotate_each_word_right8,
        0x0c0f0e0d_080b0a09,
        0x04070605_00030201
    );
    rotr_32!(rotate_each_word_right11, 11);
    rotr_32!(rotate_each_word_right12, 12);
    rotr_32_s3!(
        rotate_each_word_right16,
        0x0d0c0f0e_09080b0a,
        0x05040706_01000302
    );
    rotr_32!(rotate_each_word_right20, 20);
    rotr_32_s3!(
        rotate_each_word_right24,
        0x0e0d0c0f_0a09080b,
        0x06050407_02010003
    );
    rotr_32!(rotate_each_word_right25, 25);
}
impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<NoS3, S4, NI> {
    rotr_32!(rotate_each_word_right7, 7);
    rotr_32!(rotate_each_word_right8, 8);
    rotr_32!(rotate_each_word_right11, 11);
    rotr_32!(rotate_each_word_right12, 12);
    #[inline(always)]
    fn rotate_each_word_right16(self) -> Self {
        Self::new(swap16_s2(self.x))
    }
    rotr_32!(rotate_each_word_right20, 20);
    rotr_32!(rotate_each_word_right24, 24);
    rotr_32!(rotate_each_word_right25, 25);
}

macro_rules! rotr_64_s3 {
    ($name:ident, $k0:expr, $k1:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) })
        }
    };
}
macro_rules! rotr_64 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_epi64(self.x, $i as i32),
                    _mm_slli_epi64(self.x, 64 - $i as i32),
                )
            })
        }
    };
}
impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<YesS3, S4, NI> {
    rotr_64!(rotate_each_word_right7, 7);
    rotr_64_s3!(
        rotate_each_word_right8,
        0x080f_0e0d_0c0b_0a09,
        0x0007_0605_0403_0201
    );
    rotr_64!(rotate_each_word_right11, 11);
    rotr_64!(rotate_each_word_right12, 12);
    rotr_64_s3!(
        rotate_each_word_right16,
        0x0908_0f0e_0d0c_0b0a,
        0x0100_0706_0504_0302
    );
    rotr_64!(rotate_each_word_right20, 20);
    rotr_64_s3!(
        rotate_each_word_right24,
        0x0a09_080f_0e0d_0c0b,
        0x0201_0007_0605_0403
    );
    rotr_64!(rotate_each_word_right25, 25);
}
impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<NoS3, S4, NI> {
    rotr_64!(rotate_each_word_right7, 7);
    rotr_64!(rotate_each_word_right8, 8);
    rotr_64!(rotate_each_word_right11, 11);
    rotr_64!(rotate_each_word_right12, 12);
    #[inline(always)]
    fn rotate_each_word_right16(self) -> Self {
        Self::new(swap16_s2(self.x))
    }
    rotr_64!(rotate_each_word_right20, 20);
    rotr_64!(rotate_each_word_right24, 24);
    rotr_64!(rotate_each_word_right25, 25);
}
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u64x2_sse2<S3, S4, NI> {
    #[inline(always)]
    fn rotate_each_word_right32(self) -> Self {
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
    }
}

macro_rules! rotr_128 {
    ($name:ident, $i:expr) => {
        #[inline(always)]
        fn $name(self) -> Self {
            // _mm_srli_si128/_mm_slli_si128 shift by *bytes*, so the original
            // (x >> $i bytes) | (x << (128 - $i) bytes) was not a bit rotation.
            // Rotate the 128-bit word right by $i bits (0 < $i < 64) instead by
            // combining 64-bit lane shifts with the halves swapped: for lanes
            // (lo, hi), rotr128 yields
            //   lo' = (lo >> i) | (hi << (64 - i)),
            //   hi' = (hi >> i) | (lo << (64 - i)).
            Self::new(unsafe {
                let swapped = _mm_shuffle_epi32(self.x, 0b0100_1110);
                _mm_or_si128(
                    _mm_srli_epi64(self.x, $i as i32),
                    _mm_slli_epi64(swapped, 64 - $i as i32),
                )
            })
        }
    };
}
// TODO: completely unoptimized
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord32 for u128x1_sse2<S3, S4, NI> {
    rotr_128!(rotate_each_word_right7, 7);
    rotr_128!(rotate_each_word_right8, 8);
    rotr_128!(rotate_each_word_right11, 11);
    rotr_128!(rotate_each_word_right12, 12);
    rotr_128!(rotate_each_word_right16, 16);
    rotr_128!(rotate_each_word_right20, 20);
    rotr_128!(rotate_each_word_right24, 24);
    rotr_128!(rotate_each_word_right25, 25);
}
// TODO: completely unoptimized
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u128x1_sse2<S3, S4, NI> {
    rotr_128!(rotate_each_word_right32, 32);
}
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord128 for u128x1_sse2<S3, S4, NI> {}

def_vec!(u32x4_sse2, u32);
def_vec!(u64x2_sse2, u64);
def_vec!(u128x1_sse2, u128);

impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u32; 4] {
        unsafe {
            let x = _mm_cvtsi128_si64(self.x) as u64;
            let y = _mm_extract_epi64(self.x, 1) as u64;
            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u32; 4]) -> Self {
        unsafe {
            let mut x = _mm_cvtsi64_si128((xs[0] as u64 | ((xs[1] as u64) << 32)) as i64);
            x = _mm_insert_epi64(x, (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64, 1);
            Self::new(x)
        }
    }
}
impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u32; 4] {
        unsafe {
            let x = _mm_cvtsi128_si64(self.x) as u64;
            let y = _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64;
            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u32; 4]) -> Self {
        unsafe {
            let x = (xs[0] as u64 | ((xs[1] as u64) << 32)) as i64;
            let y = (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64;
            let x = _mm_cvtsi64_si128(x);
            let y = _mm_slli_si128(_mm_cvtsi64_si128(y), 8);
            Self::new(_mm_or_si128(x, y))
        }
    }
}
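// Without SSE4.1 there is no pextrd/pinsrd (or pextrq/pinsrq), so the NoS4
// impls emulate lane extraction and insertion with SSE2-only moves
// (cvtsi128/cvtsi64), shuffles and whole-register byte shifts.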
impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u64; 2] {
        unsafe {
            [
                _mm_cvtsi128_si64(self.x) as u64,
                _mm_extract_epi64(self.x, 1) as u64,
            ]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 2]) -> Self {
        unsafe {
            let mut x = _mm_cvtsi64_si128(xs[0] as i64);
            x = _mm_insert_epi64(x, xs[1] as i64, 1);
            Self::new(x)
        }
    }
}
impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u64; 2] {
        unsafe {
            [
                _mm_cvtsi128_si64(self.x) as u64,
                _mm_cvtsi128_si64(_mm_srli_si128(self.x, 8)) as u64,
            ]
        }
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 2]) -> Self {
        unsafe {
            let x = _mm_cvtsi64_si128(xs[0] as i64);
            let y = _mm_slli_si128(_mm_cvtsi64_si128(xs[1] as i64), 8);
            Self::new(_mm_or_si128(x, y))
        }
    }
}
impl<S3, S4, NI> MultiLane<[u128; 1]> for u128x1_sse2<S3, S4, NI> {
    #[inline(always)]
    fn to_lanes(self) -> [u128; 1] {
        unimplemented!()
    }
    #[inline(always)]
    fn from_lanes(_xs: [u128; 1]) -> Self {
        unimplemented!()
    }
}

impl<S3, S4, NI> MultiLane<[u64; 4]> for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: MultiLane<[u64; 2]> + Copy,
{
    #[inline(always)]
    fn to_lanes(self) -> [u64; 4] {
        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
        [a[0], a[1], b[0], b[1]]
    }
    #[inline(always)]
    fn from_lanes(xs: [u64; 4]) -> Self {
        let (a, b) = (
            u64x2_sse2::from_lanes([xs[0], xs[1]]),
            u64x2_sse2::from_lanes([xs[2], xs[3]]),
        );
        x2::new([a, b])
    }
}

macro_rules! impl_into {
    ($from:ident, $to:ident) => {
        impl<S3, S4, NI> From<$from<S3, S4, NI>> for $to<S3, S4, NI> {
            #[inline(always)]
            fn from(x: $from<S3, S4, NI>) -> Self {
                $to::new(x.x)
            }
        }
    };
}

impl_into!(u128x1_sse2, u32x4_sse2);
impl_into!(u128x1_sse2, u64x2_sse2);

impl_bitops32!(u32x4_sse2);
impl_bitops64!(u64x2_sse2);
impl_bitops128!(u128x1_sse2);

impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u32x4_sse2<S3, S4, NI> where
    u32x4_sse2<S3, S4, NI>: BSwap
{
}
impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u64x2_sse2<S3, S4, NI> where
    u64x2_sse2<S3, S4, NI>: BSwap
{
}
impl_binop!(u32x4_sse2, Add, add, _mm_add_epi32);
impl_binop!(u64x2_sse2, Add, add, _mm_add_epi64);
impl_binop_assign!(u32x4_sse2, AddAssign, add_assign, add);
impl_binop_assign!(u64x2_sse2, AddAssign, add_assign, add);

impl<S3: Copy, S4: Copy, NI: Copy> u32x4<Machine86<S3, S4, NI>> for u32x4_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>,
    Machine86<S3, S4, NI>: Machine,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2<Machine86<S3, S4, NI>> for u64x2_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>:
        RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>,
    Machine86<S3, S4, NI>: Machine,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x1<Machine86<S3, S4, NI>> for u128x1_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4>,
    u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}

impl<NI: Copy> u32x4<Avx2Machine<NI>> for u32x4_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>,
    Machine86<YesS3, YesS4, NI>: Machine,
{
}
impl<NI: Copy> u64x2<Avx2Machine<NI>> for u64x2_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>:
        RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>,
    Machine86<YesS3, YesS4, NI>: Machine,
{
}
impl<NI: Copy> u128x1<Avx2Machine<NI>> for u128x1_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<YesS3, YesS4, NI>: Machine,
    u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u32x4>,
    u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u64x2>,
{
}

impl<S3, S4, NI> UnsafeFrom<[u32; 4]> for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    unsafe fn unsafe_from(xs: [u32; 4]) -> Self {
        Self::new(_mm_set_epi32(
            xs[3] as i32,
            xs[2] as i32,
            xs[1] as i32,
            xs[0] as i32,
        ))
    }
}

impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, YesS4, NI>
where
    Self: MultiLane<[u32; 4]>,
{
    #[inline(always)]
    fn extract(self, i: u32) -> u32 {
        self.to_lanes()[i as usize]
    }
    #[inline(always)]
    fn insert(self, v: u32, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => _mm_insert_epi32(self.x, v as i32, 0),
                1 => _mm_insert_epi32(self.x, v as i32, 1),
                2 => _mm_insert_epi32(self.x, v as i32, 2),
                3 => _mm_insert_epi32(self.x, v as i32, 3),
                _ => unreachable!(),
            }
        })
    }
}
impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, NoS4, NI>
where
    Self: MultiLane<[u32; 4]>,
{
    #[inline(always)]
    fn extract(self, i: u32) -> u32 {
        self.to_lanes()[i as usize]
    }
    #[inline(always)]
    fn insert(self, v: u32, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => {
                    let x = _mm_andnot_si128(_mm_cvtsi32_si128(-1), self.x);
                    _mm_or_si128(x, _mm_cvtsi32_si128(v as i32))
                }
                1 => {
                    let mut x = _mm_shuffle_epi32(self.x, 0b0111_1000);
                    x = _mm_slli_si128(x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b1110_0001)
                }
                2 => {
                    let mut x = _mm_shuffle_epi32(self.x, 0b1011_0100);
                    x = _mm_slli_si128(x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b1100_1001)
                }
                3 => {
                    let mut x = _mm_slli_si128(self.x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b0011_1001)
                }
                _ => unreachable!(),
            }
        })
    }
}
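// The NoS4 insert above avoids pinsrd (SSE4.1): each arm shuffles the kept
// lanes out of the way, frees lane 0 with a 4-byte left shift, ORs the new
// word into lane 0, and then shuffles every lane back to its home position.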

impl<S3, S4, NI> LaneWords4 for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn shuffle_lane_words2301(self) -> Self {
        self.shuffle2301()
    }
    #[inline(always)]
    fn shuffle_lane_words1230(self) -> Self {
        self.shuffle1230()
    }
    #[inline(always)]
    fn shuffle_lane_words3012(self) -> Self {
        self.shuffle3012()
    }
}

impl<S3, S4, NI> Words4 for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b1001_0011) })
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0011_1001) })
    }
}

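// The Words4 shuffles (here and below for u64x4) are whole-lane permutations:
// shuffle2301 swaps the two halves of the vector, while shuffle3012 and
// shuffle1230 rotate the four lanes by one position in opposite directions
// and are mutual inverses, which the shuffle tests at the bottom rely on.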
impl<S4, NI> Words4 for u64x4_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)])
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        unsafe {
            x2::new([
                u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)),
                u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)),
            ])
        }
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        unsafe {
            x2::new([
                u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)),
                u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)),
            ])
        }
    }
}
impl<S4, NI> Words4 for u64x4_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn shuffle2301(self) -> Self {
        x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)])
    }
    #[inline(always)]
    fn shuffle3012(self) -> Self {
        unsafe {
            let a = _mm_srli_si128(self.0[0].x, 8);
            let b = _mm_slli_si128(self.0[0].x, 8);
            let c = _mm_srli_si128(self.0[1].x, 8);
            let d = _mm_slli_si128(self.0[1].x, 8);
            let da = _mm_or_si128(d, a);
            let bc = _mm_or_si128(b, c);
            x2::new([u64x2_sse2::new(da), u64x2_sse2::new(bc)])
        }
    }
    #[inline(always)]
    fn shuffle1230(self) -> Self {
        unsafe {
            let a = _mm_srli_si128(self.0[0].x, 8);
            let b = _mm_slli_si128(self.0[0].x, 8);
            let c = _mm_srli_si128(self.0[1].x, 8);
            let d = _mm_slli_si128(self.0[1].x, 8);
            let da = _mm_or_si128(d, a);
            let bc = _mm_or_si128(b, c);
            x2::new([u64x2_sse2::new(bc), u64x2_sse2::new(da)])
        }
    }
}

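// With SSSE3 the cross-register u64 shuffles above are a single palignr per
// half; the NoS3 fallback builds the same lane movement out of four 8-byte
// shifts and two ORs.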
impl<S3, S4, NI> UnsafeFrom<[u64; 2]> for u64x2_sse2<S3, S4, NI> {
    #[inline(always)]
    unsafe fn unsafe_from(xs: [u64; 2]) -> Self {
        Self::new(_mm_set_epi64x(xs[1] as i64, xs[0] as i64))
    }
}

impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, YesS4, NI> {
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        unsafe {
            match i {
                0 => _mm_cvtsi128_si64(self.x) as u64,
                1 => _mm_extract_epi64(self.x, 1) as u64,
                _ => unreachable!(),
            }
        }
    }
    #[inline(always)]
    fn insert(self, x: u64, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => _mm_insert_epi64(self.x, x as i64, 0),
                1 => _mm_insert_epi64(self.x, x as i64, 1),
                _ => unreachable!(),
            }
        })
    }
}
impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, NoS4, NI> {
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        unsafe {
            match i {
                0 => _mm_cvtsi128_si64(self.x) as u64,
                1 => _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64,
                _ => unreachable!(),
            }
        }
    }
    #[inline(always)]
    fn insert(self, x: u64, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => _mm_or_si128(
                    _mm_andnot_si128(_mm_cvtsi64_si128(-1), self.x),
                    _mm_cvtsi64_si128(x as i64),
                ),
                1 => _mm_or_si128(
                    _mm_move_epi64(self.x),
                    _mm_slli_si128(_mm_cvtsi64_si128(x as i64), 8),
                ),
                _ => unreachable!(),
            }
        })
    }
}

impl<S4, NI> BSwap for u32x4_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            let k = _mm_set_epi64x(0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
#[inline(always)]
fn bswap32_s2(x: __m128i) -> __m128i {
    unsafe {
        let mut y = _mm_unpacklo_epi8(x, _mm_setzero_si128());
        y = _mm_shufflehi_epi16(y, 0b0001_1011);
        y = _mm_shufflelo_epi16(y, 0b0001_1011);
        let mut z = _mm_unpackhi_epi8(x, _mm_setzero_si128());
        z = _mm_shufflehi_epi16(z, 0b0001_1011);
        z = _mm_shufflelo_epi16(z, 0b0001_1011);
        _mm_packus_epi16(y, z)
    }
}
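// bswap32_s2 reverses the bytes of each 32-bit word with SSE2 only: it widens
// the bytes to 16 bits (interleave with zero), reverses each resulting quad of
// u16s with shufflehi/shufflelo, and narrows back with a saturating pack,
// which is lossless here because every u16 holds a value <= 0xff.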
impl<S4, NI> BSwap for u32x4_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(bswap32_s2(self.x))
    }
}

impl<S4, NI> BSwap for u64x2_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            let k = _mm_set_epi64x(0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
impl<S4, NI> BSwap for u64x2_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe { bswap32_s2(_mm_shuffle_epi32(self.x, 0b1011_0001)) })
    }
}

impl<S4, NI> BSwap for u128x1_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        Self::new(unsafe {
            // pshufb mask that reverses all 16 bytes of the 128-bit word
            // (the mask previously given here selected every byte in place).
            let k = _mm_set_epi64x(0x0001_0203_0405_0607, 0x0809_0a0b_0c0d_0e0f);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
impl<S4, NI> BSwap for u128x1_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn bswap(self) -> Self {
        // SSE2 fallback: reverse the bytes within each 32-bit word, then
        // reverse the order of the four words.
        Self::new(unsafe { _mm_shuffle_epi32(bswap32_s2(self.x), 0b0001_1011) })
    }
}

macro_rules! swapi {
    ($x:expr, $i:expr, $k:expr) => {
        unsafe {
            const K: u8 = $k;
            let k = _mm_set1_epi8(K as i8);
            u128x1_sse2::new(_mm_or_si128(
                _mm_srli_epi16(_mm_and_si128($x.x, k), $i),
                _mm_and_si128(_mm_slli_epi16($x.x, $i), k),
            ))
        }
    };
}
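// swapi!(x, i, k) exchanges adjacent i-bit groups using the classic
// masked-shift identity: the groups selected by mask k move right by i bits
// and the complementary groups move left, e.g. swap1 with mask 0xaa swaps
// the odd and even bits of every byte.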
#[inline(always)]
fn swap16_s2(x: __m128i) -> __m128i {
    unsafe { _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0b1011_0001), 0b1011_0001) }
}
impl<S4, NI> Swap64 for u128x1_sse2<YesS3, S4, NI> {
    #[inline(always)]
    fn swap1(self) -> Self {
        swapi!(self, 1, 0xaa)
    }
    #[inline(always)]
    fn swap2(self) -> Self {
        swapi!(self, 2, 0xcc)
    }
    #[inline(always)]
    fn swap4(self) -> Self {
        swapi!(self, 4, 0xf0)
    }
    #[inline(always)]
    fn swap8(self) -> Self {
        u128x1_sse2::new(unsafe {
            let k = _mm_set_epi64x(0x0e0f_0c0d_0a0b_0809, 0x0607_0405_0203_0001);
            _mm_shuffle_epi8(self.x, k)
        })
    }
    #[inline(always)]
    fn swap16(self) -> Self {
        u128x1_sse2::new(unsafe {
            let k = _mm_set_epi64x(0x0d0c_0f0e_0908_0b0a, 0x0504_0706_0100_0302);
            _mm_shuffle_epi8(self.x, k)
        })
    }
    #[inline(always)]
    fn swap32(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
    }
    #[inline(always)]
    fn swap64(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
}
impl<S4, NI> Swap64 for u128x1_sse2<NoS3, S4, NI> {
    #[inline(always)]
    fn swap1(self) -> Self {
        swapi!(self, 1, 0xaa)
    }
    #[inline(always)]
    fn swap2(self) -> Self {
        swapi!(self, 2, 0xcc)
    }
    #[inline(always)]
    fn swap4(self) -> Self {
        swapi!(self, 4, 0xf0)
    }
    #[inline(always)]
    fn swap8(self) -> Self {
        u128x1_sse2::new(unsafe {
            _mm_or_si128(_mm_slli_epi16(self.x, 8), _mm_srli_epi16(self.x, 8))
        })
    }
    #[inline(always)]
    fn swap16(self) -> Self {
        u128x1_sse2::new(swap16_s2(self.x))
    }
    #[inline(always)]
    fn swap32(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
    }
    #[inline(always)]
    fn swap64(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
}

#[derive(Copy, Clone)]
pub struct G0;
#[derive(Copy, Clone)]
pub struct G1;

#[allow(non_camel_case_types)]
pub type u32x4x2_sse2<S3, S4, NI> = x2<u32x4_sse2<S3, S4, NI>, G0>;
#[allow(non_camel_case_types)]
pub type u64x2x2_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G0>;
#[allow(non_camel_case_types)]
pub type u64x4_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G1>;
#[allow(non_camel_case_types)]
pub type u128x2_sse2<S3, S4, NI> = x2<u128x1_sse2<S3, S4, NI>, G0>;

#[allow(non_camel_case_types)]
pub type u32x4x4_sse2<S3, S4, NI> = x4<u32x4_sse2<S3, S4, NI>>;
#[allow(non_camel_case_types)]
pub type u64x2x4_sse2<S3, S4, NI> = x4<u64x2_sse2<S3, S4, NI>>;
#[allow(non_camel_case_types)]
pub type u128x4_sse2<S3, S4, NI> = x4<u128x1_sse2<S3, S4, NI>>;

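// The wider types are built from the soft x2/x4 combinators: a u32x4x4 on
// plain SSE is literally four __m128i values operated on element-wise, which
// the avx2 module at the bottom of this file replaces with two __m256i
// registers.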
impl<S3: Copy, S4: Copy, NI: Copy> u32x4x2<Machine86<S3, S4, NI>> for u32x4x2_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u32x4x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 2]>,
    u32x4x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u32x4>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2x2<Machine86<S3, S4, NI>> for u64x2x2_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x2x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 2]>,
    u64x2x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x4<Machine86<S3, S4, NI>> for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x4_sse2<S3, S4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x2<Machine86<S3, S4, NI>> for u128x2_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 2]>,
    u128x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u128x1>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x2>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x2>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x4>,
{
}

impl<NI: Copy> u32x4x2<Avx2Machine<NI>> for u32x4x2_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u32x4x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 2]>,
    u32x4x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u32x4>,
{
}
impl<NI: Copy> u64x2x2<Avx2Machine<NI>> for u64x2x2_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x2x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 2]>,
    u64x2x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u64x2>,
{
}
impl<NI: Copy> u64x4<Avx2Machine<NI>> for u64x4_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x4_sse2<YesS3, YesS4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4,
{
}
impl<NI: Copy> u128x2<Avx2Machine<NI>> for u128x2_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap,
    Avx2Machine<NI>: Machine,
    u128x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 2]>,
    u128x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u128x1>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x2>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x2>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x4>,
{
}

impl<S3, S4, NI> Vec4<u64> for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: Copy + Vec2<u64>,
{
    #[inline(always)]
    fn extract(self, i: u32) -> u64 {
        match i {
            0 => self.0[0].extract(0),
            1 => self.0[0].extract(1),
            2 => self.0[1].extract(0),
            3 => self.0[1].extract(1),
            _ => panic!(),
        }
    }
    #[inline(always)]
    fn insert(mut self, w: u64, i: u32) -> Self {
        match i {
            0 => self.0[0] = self.0[0].insert(w, 0),
            1 => self.0[0] = self.0[0].insert(w, 1),
            2 => self.0[1] = self.0[1].insert(w, 0),
            3 => self.0[1] = self.0[1].insert(w, 1),
            _ => panic!(),
        };
        self
    }
}

impl<S3: Copy, S4: Copy, NI: Copy> u32x4x4<Machine86<S3, S4, NI>> for u32x4x4_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u32x4x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 4]>,
    u32x4x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u32x4>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2x4<Machine86<S3, S4, NI>> for u64x2x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x2x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 4]>,
    u64x2x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x4<Machine86<S3, S4, NI>> for u128x4_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 4]>,
    u128x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u128x1>,
    u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x4>,
    u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x4>,
{
}

impl<NI: Copy> u32x4x4<Avx2Machine<NI>> for u32x4x4_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u32x4x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 4]>,
    u32x4x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u32x4>,
{
}
impl<NI: Copy> u64x2x4<Avx2Machine<NI>> for u64x2x4_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x2x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 4]>,
    u64x2x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u64x2>,
{
}
impl<NI: Copy> u128x4<Avx2Machine<NI>> for u128x4_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap,
    Avx2Machine<NI>: Machine,
    u128x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 4]>,
    u128x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u128x1>,
    u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x4>,
    u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x4>,
{
}

macro_rules! impl_into_x {
    ($from:ident, $to:ident) => {
        impl<S3: Copy, S4: Copy, NI: Copy, Gf, Gt> From<x2<$from<S3, S4, NI>, Gf>>
            for x2<$to<S3, S4, NI>, Gt>
        {
            #[inline(always)]
            fn from(x: x2<$from<S3, S4, NI>, Gf>) -> Self {
                x2::new([$to::from(x.0[0]), $to::from(x.0[1])])
            }
        }
        impl<S3: Copy, S4: Copy, NI: Copy> From<x4<$from<S3, S4, NI>>> for x4<$to<S3, S4, NI>> {
            #[inline(always)]
            fn from(x: x4<$from<S3, S4, NI>>) -> Self {
                x4::new([
                    $to::from(x.0[0]),
                    $to::from(x.0[1]),
                    $to::from(x.0[2]),
                    $to::from(x.0[3]),
                ])
            }
        }
    };
}
impl_into_x!(u128x1_sse2, u64x2_sse2);
impl_into_x!(u128x1_sse2, u32x4_sse2);

///// Debugging

use core::fmt::{Debug, Formatter, Result};

impl<W: PartialEq, G> PartialEq for x2<W, G> {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        self.0[0] == rhs.0[0] && self.0[1] == rhs.0[1]
    }
}

#[inline(always)]
unsafe fn eq128_s4(x: __m128i, y: __m128i) -> bool {
    let q = _mm_shuffle_epi32(_mm_cmpeq_epi64(x, y), 0b1100_0110);
    _mm_cvtsi128_si64(q) == -1
}

#[inline(always)]
unsafe fn eq128_s2(x: __m128i, y: __m128i) -> bool {
    let q = _mm_cmpeq_epi32(x, y);
    let p = _mm_cvtsi128_si64(_mm_srli_si128(q, 8));
    let q = _mm_cvtsi128_si64(q);
    (p & q) == -1
}

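// eq128_s2 reduces a full-width compare to scalar tests: both 64-bit halves of
// the pcmpeqd mask must be all-ones. eq128_s4 does the same with a single
// pcmpeqq (SSE4.1) plus a shuffle that folds one 32-bit slice of each compare
// lane into the low 64 bits.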
impl<S3, S4, NI> PartialEq for u32x4_sse2<S3, S4, NI> {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        unsafe { eq128_s2(self.x, rhs.x) }
    }
}
impl<S3, S4, NI> Debug for u32x4_sse2<S3, S4, NI>
where
    Self: Copy + MultiLane<[u32; 4]>,
{
    #[cold]
    fn fmt(&self, fmt: &mut Formatter) -> Result {
        fmt.write_fmt(format_args!("{:08x?}", &self.to_lanes()))
    }
}

impl<S3, S4, NI> PartialEq for u64x2_sse2<S3, S4, NI> {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        unsafe { eq128_s2(self.x, rhs.x) }
    }
}
impl<S3, S4, NI> Debug for u64x2_sse2<S3, S4, NI>
where
    Self: Copy + MultiLane<[u64; 2]>,
{
    #[cold]
    fn fmt(&self, fmt: &mut Formatter) -> Result {
        fmt.write_fmt(format_args!("{:016x?}", &self.to_lanes()))
    }
}

impl<S3, S4, NI> Debug for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: Copy + MultiLane<[u64; 2]>,
{
    #[cold]
    fn fmt(&self, fmt: &mut Formatter) -> Result {
        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
        fmt.write_fmt(format_args!("{:016x?}", &[a[0], a[1], b[0], b[1]]))
    }
}

#[cfg(test)]
mod test {
    use super::*;
    use crate::x86_64::{SSE2, SSE41, SSSE3};
    use crate::Machine;

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_bswap32_s2_vs_s3() {
        let xs = [0x0f0e_0d0c, 0x0b0a_0908, 0x0706_0504, 0x0302_0100];
        let ys = [0x0c0d_0e0f, 0x0809_0a0b, 0x0405_0607, 0x0001_0203];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            x_s2.bswap()
        };

        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            x_s3.bswap()
        };

        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });
        assert_eq!(x_s2, s2.vec(ys));
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_bswap64_s2_vs_s3() {
        let xs = [0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100];
        let ys = [0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
            x_s2.bswap()
        };

        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs);
            x_s3.bswap()
        };

        assert_eq!(x_s2, s2.vec(ys));
        // Cross-check the SSSE3 result against the SSE2 result, as in
        // test_bswap32_s2_vs_s3.
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_shuffle32_s2_vs_s3() {
        let xs = [0x0, 0x1, 0x2, 0x3];
        let ys = [0x2, 0x3, 0x0, 0x1];
        let zs = [0x1, 0x2, 0x3, 0x0];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            x_s2.shuffle2301()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            x_s3.shuffle2301()
        };
        assert_eq!(x_s2, s2.vec(ys));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            x_s2.shuffle3012()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            x_s3.shuffle3012()
        };
        assert_eq!(x_s2, s2.vec(zs));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });

        let x_s2 = x_s2.shuffle1230();
        let x_s3 = x_s3.shuffle1230();
        assert_eq!(x_s2, s2.vec(xs));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_shuffle64_s2_vs_s3() {
        let xs = [0x0, 0x1, 0x2, 0x3];
        let ys = [0x2, 0x3, 0x0, 0x1];
        let zs = [0x1, 0x2, 0x3, 0x0];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs);
            x_s2.shuffle2301()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs);
            x_s3.shuffle2301()
        };
        assert_eq!(x_s2, s2.vec(ys));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });

        let x_s2 = {
            let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs);
            x_s2.shuffle3012()
        };
        let x_s3 = {
            let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs);
            x_s3.shuffle3012()
        };
        assert_eq!(x_s2, s2.vec(zs));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });

        let x_s2 = x_s2.shuffle1230();
        let x_s3 = x_s3.shuffle1230();
        assert_eq!(x_s2, s2.vec(xs));
        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_lanes_u32x4() {
        let xs = [0x1, 0x2, 0x3, 0x4];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };
        let s4 = unsafe { SSE41::instance() };

        {
            let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
            let y_s2 = <SSE2 as Machine>::u32x4::from_lanes(xs);
            assert_eq!(x_s2, y_s2);
            assert_eq!(xs, y_s2.to_lanes());
        }

        {
            let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
            let y_s3 = <SSSE3 as Machine>::u32x4::from_lanes(xs);
            assert_eq!(x_s3, y_s3);
            assert_eq!(xs, y_s3.to_lanes());
        }

        {
            let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs);
            let y_s4 = <SSE41 as Machine>::u32x4::from_lanes(xs);
            assert_eq!(x_s4, y_s4);
            assert_eq!(xs, y_s4.to_lanes());
        }
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_lanes_u64x2() {
        let xs = [0x1, 0x2];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };
        let s4 = unsafe { SSE41::instance() };

        {
            let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
            let y_s2 = <SSE2 as Machine>::u64x2::from_lanes(xs);
            assert_eq!(x_s2, y_s2);
            assert_eq!(xs, y_s2.to_lanes());
        }

        {
            let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs);
            let y_s3 = <SSSE3 as Machine>::u64x2::from_lanes(xs);
            assert_eq!(x_s3, y_s3);
            assert_eq!(xs, y_s3.to_lanes());
        }

        {
            let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs);
            let y_s4 = <SSE41 as Machine>::u64x2::from_lanes(xs);
            assert_eq!(x_s4, y_s4);
            assert_eq!(xs, y_s4.to_lanes());
        }
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_vec4_u32x4_s2() {
        let xs = [1, 2, 3, 4];
        let s2 = unsafe { SSE2::instance() };
        let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
        assert_eq!(x_s2.extract(0), 1);
        assert_eq!(x_s2.extract(1), 2);
        assert_eq!(x_s2.extract(2), 3);
        assert_eq!(x_s2.extract(3), 4);
        assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2, 3, 4]));
        assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf, 3, 4]));
        assert_eq!(x_s2.insert(0xf, 2), s2.vec([1, 2, 0xf, 4]));
        assert_eq!(x_s2.insert(0xf, 3), s2.vec([1, 2, 3, 0xf]));
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_vec4_u32x4_s4() {
        let xs = [1, 2, 3, 4];
        let s4 = unsafe { SSE41::instance() };
        let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs);
        assert_eq!(x_s4.extract(0), 1);
        assert_eq!(x_s4.extract(1), 2);
        assert_eq!(x_s4.extract(2), 3);
        assert_eq!(x_s4.extract(3), 4);
        assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2, 3, 4]));
        assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf, 3, 4]));
        assert_eq!(x_s4.insert(0xf, 2), s4.vec([1, 2, 0xf, 4]));
        assert_eq!(x_s4.insert(0xf, 3), s4.vec([1, 2, 3, 0xf]));
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_vec2_u64x2_s2() {
        let xs = [0x1, 0x2];
        let s2 = unsafe { SSE2::instance() };
        let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
        assert_eq!(x_s2.extract(0), 1);
        assert_eq!(x_s2.extract(1), 2);
        assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2]));
        assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf]));
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_vec4_u64x2_s4() {
        let xs = [0x1, 0x2];
        let s4 = unsafe { SSE41::instance() };
        let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs);
        assert_eq!(x_s4.extract(0), 1);
        assert_eq!(x_s4.extract(1), 2);
        assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2]));
        assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf]));
    }
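
    // An illustrative addition (not in upstream ppv-lite86): the SSE2
    // shift/shuffle fallback and the SSSE3 pshufb path should agree on a
    // 16-bit rotation of each 32-bit word.
    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_rotate16_s2_vs_s3() {
        let xs = [0x1234_5678, 0x9abc_def0, 0x0011_2233, 0x4455_6677];
        let ys = [0x5678_1234, 0xdef0_9abc, 0x2233_0011, 0x6677_4455];
        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };
        let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
        let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
        assert_eq!(x_s2.rotate_each_word_right16(), s2.vec(ys));
        assert_eq!(x_s3.rotate_each_word_right16(), s3.vec(ys));
    }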
}

pub mod avx2 {
    #![allow(non_camel_case_types)]
    use crate::soft::x4;
    use crate::types::*;
    use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2};
    use crate::x86_64::{vec256_storage, vec512_storage, Avx2Machine, YesS3, YesS4};
    use core::arch::x86_64::*;
    use core::marker::PhantomData;
    use core::ops::*;

    #[derive(Copy, Clone)]
    pub struct u32x4x4_avx2<NI> {
        x: [__m256i; 2],
        ni: PhantomData<NI>,
    }

    impl<NI> u32x4x4_avx2<NI> {
        #[inline(always)]
        fn new(x: [__m256i; 2]) -> Self {
            Self { x, ni: PhantomData }
        }
    }
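
    // Representation: four logical u32x4 lanes packed into two 256-bit
    // registers, x[0] holding lanes 0 and 1 and x[1] lanes 2 and 3, so each
    // lane-wise operation costs two AVX2 instructions instead of four SSE ones.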

    impl<NI> u32x4x4<Avx2Machine<NI>> for u32x4x4_avx2<NI> where NI: Copy {}
    impl<NI> Store<vec512_storage> for u32x4x4_avx2<NI> {
        #[inline(always)]
        unsafe fn unpack(p: vec512_storage) -> Self {
            Self::new([p.avx[0].avx, p.avx[1].avx])
        }
    }
    impl<NI> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 4]> for u32x4x4_avx2<NI> {
        #[inline(always)]
        fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 4] {
            unsafe {
                [
                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)),
                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)),
                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)),
                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)),
                ]
            }
        }
        #[inline(always)]
        fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 4]) -> Self {
            Self::new(unsafe {
                [
                    _mm256_setr_m128i(x[0].x, x[1].x),
                    _mm256_setr_m128i(x[2].x, x[3].x),
                ]
            })
        }
    }
    impl<NI> Vec4<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
        #[inline(always)]
        fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> {
            unsafe {
                match i {
                    0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)),
                    1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)),
                    2 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)),
                    3 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)),
                    _ => panic!(),
                }
            }
        }
        #[inline(always)]
        fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self {
            Self::new(unsafe {
                match i {
                    0 => [_mm256_inserti128_si256(self.x[0], w.x, 0), self.x[1]],
                    1 => [_mm256_inserti128_si256(self.x[0], w.x, 1), self.x[1]],
                    2 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 0)],
                    3 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 1)],
                    _ => panic!(),
                }
            })
        }
    }
    impl<NI> LaneWords4 for u32x4x4_avx2<NI> {
        #[inline(always)]
        fn shuffle_lane_words1230(self) -> Self {
            Self::new(unsafe {
                [
                    _mm256_shuffle_epi32(self.x[0], 0b1001_0011),
                    _mm256_shuffle_epi32(self.x[1], 0b1001_0011),
                ]
            })
        }
        #[inline(always)]
        fn shuffle_lane_words2301(self) -> Self {
            Self::new(unsafe {
                [
                    _mm256_shuffle_epi32(self.x[0], 0b0100_1110),
                    _mm256_shuffle_epi32(self.x[1], 0b0100_1110),
                ]
            })
        }
        #[inline(always)]
        fn shuffle_lane_words3012(self) -> Self {
            Self::new(unsafe {
                [
                    _mm256_shuffle_epi32(self.x[0], 0b0011_1001),
                    _mm256_shuffle_epi32(self.x[1], 0b0011_1001),
                ]
            })
        }
    }
    impl<NI> BitOps32 for u32x4x4_avx2<NI> where NI: Copy {}
    impl<NI> ArithOps for u32x4x4_avx2<NI> where NI: Copy {}
    macro_rules! shuf_lane_bytes {
        ($name:ident, $k0:expr, $k1:expr) => {
            #[inline(always)]
            fn $name(self) -> Self {
                Self::new(unsafe {
                    [
                        _mm256_shuffle_epi8(self.x[0], _mm256_set_epi64x($k0, $k1, $k0, $k1)),
                        _mm256_shuffle_epi8(self.x[1], _mm256_set_epi64x($k0, $k1, $k0, $k1)),
                    ]
                })
            }
        };
    }
    macro_rules! rotr_32 {
        ($name:ident, $i:expr) => {
            #[inline(always)]
            fn $name(self) -> Self {
                Self::new(unsafe {
                    [
                        _mm256_or_si256(
                            _mm256_srli_epi32(self.x[0], $i as i32),
                            _mm256_slli_epi32(self.x[0], 32 - $i as i32),
                        ),
                        _mm256_or_si256(
                            _mm256_srli_epi32(self.x[1], $i as i32),
                            _mm256_slli_epi32(self.x[1], 32 - $i as i32),
                        ),
                    ]
                })
            }
        };
    }
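    // Same two rotation strategies as the 128-bit code above, widened to 256
    // bits: vpshufb with the byte mask repeated in both 128-bit lanes for
    // rotations by whole bytes, shift-shift-or for everything else.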
    impl<NI: Copy> RotateEachWord32 for u32x4x4_avx2<NI> {
        rotr_32!(rotate_each_word_right7, 7);
        shuf_lane_bytes!(
            rotate_each_word_right8,
            0x0c0f0e0d_080b0a09,
            0x04070605_00030201
        );
        rotr_32!(rotate_each_word_right11, 11);
        rotr_32!(rotate_each_word_right12, 12);
        shuf_lane_bytes!(
            rotate_each_word_right16,
            0x0d0c0f0e_09080b0a,
            0x05040706_01000302
        );
        rotr_32!(rotate_each_word_right20, 20);
        shuf_lane_bytes!(
            rotate_each_word_right24,
            0x0e0d0c0f_0a09080b,
            0x06050407_02010003
        );
        rotr_32!(rotate_each_word_right25, 25);
    }
    impl<NI> BitOps0 for u32x4x4_avx2<NI> where NI: Copy {}
    impl<NI> From<u32x4x4_avx2<NI>> for vec512_storage {
        #[inline(always)]
        fn from(x: u32x4x4_avx2<NI>) -> Self {
            Self {
                avx: [
                    vec256_storage { avx: x.x[0] },
                    vec256_storage { avx: x.x[1] },
                ],
            }
        }
    }

    macro_rules! impl_assign {
        ($vec:ident, $Assign:ident, $assign_fn:ident, $bin_fn:ident) => {
            impl<NI> $Assign for $vec<NI>
            where
                NI: Copy,
            {
                #[inline(always)]
                fn $assign_fn(&mut self, rhs: Self) {
                    *self = self.$bin_fn(rhs);
                }
            }
        };
    }
    impl_assign!(u32x4x4_avx2, BitXorAssign, bitxor_assign, bitxor);
    impl_assign!(u32x4x4_avx2, BitOrAssign, bitor_assign, bitor);
    impl_assign!(u32x4x4_avx2, BitAndAssign, bitand_assign, bitand);
    impl_assign!(u32x4x4_avx2, AddAssign, add_assign, add);

    macro_rules! impl_bitop_x2 {
        ($vec:ident, $Op:ident, $op_fn:ident, $impl_fn:ident) => {
            impl<NI> $Op for $vec<NI> {
                type Output = Self;
                #[inline(always)]
                fn $op_fn(self, rhs: Self) -> Self::Output {
                    Self::new(unsafe {
                        [$impl_fn(self.x[0], rhs.x[0]), $impl_fn(self.x[1], rhs.x[1])]
                    })
                }
            }
        };
    }
    impl_bitop_x2!(u32x4x4_avx2, BitXor, bitxor, _mm256_xor_si256);
    impl_bitop_x2!(u32x4x4_avx2, BitOr, bitor, _mm256_or_si256);
    impl_bitop_x2!(u32x4x4_avx2, BitAnd, bitand, _mm256_and_si256);
    impl_bitop_x2!(u32x4x4_avx2, AndNot, andnot, _mm256_andnot_si256);
    impl_bitop_x2!(u32x4x4_avx2, Add, add, _mm256_add_epi32);

    impl<NI> Not for u32x4x4_avx2<NI> {
        type Output = Self;
        #[inline(always)]
        fn not(self) -> Self::Output {
            unsafe {
                // XOR with an all-ones mask flips every bit.
                let f = _mm256_set1_epi64x(-1);
                Self::new([f, f]) ^ self
            }
        }
    }

    impl<NI> BSwap for u32x4x4_avx2<NI> {
        shuf_lane_bytes!(bswap, 0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
    }

    impl<NI> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI>
    where
        NI: Copy,
    {
        #[inline(always)]
        fn from(x: x4<u128x1_sse2<YesS3, YesS4, NI>>) -> Self {
            Self::new(unsafe {
                [
                    _mm256_setr_m128i(x.0[0].x, x.0[1].x),
                    _mm256_setr_m128i(x.0[2].x, x.0[3].x),
                ]
            })
        }
    }
}