3 //! - Section 8.5 "32-bit SIMD intrinsics" of ACLE
5 //! Intrinsics that could live here
66 use stdarch_test
::assert_instr
;
68 use crate::{core_arch::acle::dsp::int16x2_t, mem::transmute}
;
/// ARM-specific 32-bit wide vector of four packed `i8`.
// Derives restored: these vectors are passed by value into the intrinsics and
// compared in tests, so they must be `Copy`/`Clone`/`Debug`/`PartialEq`.
#[allow(non_camel_case_types)]
#[derive(Copy, Clone, Debug, PartialEq)]
pub struct int8x4_t(i8, i8, i8, i8);
/// ARM-specific 32-bit wide vector of four packed `u8`.
// Derives restored: these vectors are passed by value into the intrinsics and
// compared in tests, so they must be `Copy`/`Clone`/`Debug`/`PartialEq`.
#[allow(non_camel_case_types)]
#[derive(Copy, Clone, Debug, PartialEq)]
pub struct uint8x4_t(u8, u8, u8, u8);
// Transmutes both SIMD operands to `i32`, invokes the raw LLVM intrinsic,
// and transmutes the `i32` result back to the caller's expected vector type.
// (Closing braces restored — the macro definition was truncated.)
macro_rules! dsp_call {
    ($name:expr, $a:expr, $b:expr) => {
        transmute($name(transmute($a), transmute($b)))
    };
}
84 #[link_name = "llvm.arm.qadd8"]
85 fn arm_qadd8(a
: i32, b
: i32) -> i32;
87 #[link_name = "llvm.arm.qsub8"]
88 fn arm_qsub8(a
: i32, b
: i32) -> i32;
90 #[link_name = "llvm.arm.qsub16"]
91 fn arm_qsub16(a
: i32, b
: i32) -> i32;
93 #[link_name = "llvm.arm.qadd16"]
94 fn arm_qadd16(a
: i32, b
: i32) -> i32;
96 #[link_name = "llvm.arm.qasx"]
97 fn arm_qasx(a
: i32, b
: i32) -> i32;
99 #[link_name = "llvm.arm.qsax"]
100 fn arm_qsax(a
: i32, b
: i32) -> i32;
102 #[link_name = "llvm.arm.sadd16"]
103 fn arm_sadd16(a
: i32, b
: i32) -> i32;
105 #[link_name = "llvm.arm.sadd8"]
106 fn arm_sadd8(a
: i32, b
: i32) -> i32;
108 #[link_name = "llvm.arm.smlad"]
109 fn arm_smlad(a
: i32, b
: i32, c
: i32) -> i32;
111 #[link_name = "llvm.arm.smlsd"]
112 fn arm_smlsd(a
: i32, b
: i32, c
: i32) -> i32;
114 #[link_name = "llvm.arm.sasx"]
115 fn arm_sasx(a
: i32, b
: i32) -> i32;
117 #[link_name = "llvm.arm.sel"]
118 fn arm_sel(a
: i32, b
: i32) -> i32;
120 #[link_name = "llvm.arm.shadd8"]
121 fn arm_shadd8(a
: i32, b
: i32) -> i32;
123 #[link_name = "llvm.arm.shadd16"]
124 fn arm_shadd16(a
: i32, b
: i32) -> i32;
126 #[link_name = "llvm.arm.shsub8"]
127 fn arm_shsub8(a
: i32, b
: i32) -> i32;
129 #[link_name = "llvm.arm.ssub8"]
130 fn arm_ssub8(a
: i32, b
: i32) -> i32;
132 #[link_name = "llvm.arm.usub8"]
133 fn arm_usub8(a
: i32, b
: i32) -> i32;
135 #[link_name = "llvm.arm.shsub16"]
136 fn arm_shsub16(a
: i32, b
: i32) -> i32;
138 #[link_name = "llvm.arm.smuad"]
139 fn arm_smuad(a
: i32, b
: i32) -> i32;
141 #[link_name = "llvm.arm.smuadx"]
142 fn arm_smuadx(a
: i32, b
: i32) -> i32;
144 #[link_name = "llvm.arm.smusd"]
145 fn arm_smusd(a
: i32, b
: i32) -> i32;
147 #[link_name = "llvm.arm.smusdx"]
148 fn arm_smusdx(a
: i32, b
: i32) -> i32;
150 #[link_name = "llvm.arm.usad8"]
151 fn arm_usad8(a
: i32, b
: i32) -> u32;
154 /// Saturating four 8-bit integer additions
156 /// Returns the 8-bit signed equivalent of
158 /// res\[0\] = a\[0\] + b\[0\]
159 /// res\[1\] = a\[1\] + b\[1\]
160 /// res\[2\] = a\[2\] + b\[2\]
161 /// res\[3\] = a\[3\] + b\[3\]
163 #[cfg_attr(test, assert_instr(qadd8))]
164 pub unsafe fn __qadd8(a
: int8x4_t
, b
: int8x4_t
) -> int8x4_t
{
165 dsp_call
!(arm_qadd8
, a
, b
)
168 /// Saturating two 8-bit integer subtraction
170 /// Returns the 8-bit signed equivalent of
172 /// res\[0\] = a\[0\] - b\[0\]
173 /// res\[1\] = a\[1\] - b\[1\]
174 /// res\[2\] = a\[2\] - b\[2\]
175 /// res\[3\] = a\[3\] - b\[3\]
177 #[cfg_attr(test, assert_instr(qsub8))]
178 pub unsafe fn __qsub8(a
: int8x4_t
, b
: int8x4_t
) -> int8x4_t
{
179 dsp_call
!(arm_qsub8
, a
, b
)
182 /// Saturating two 16-bit integer subtraction
184 /// Returns the 16-bit signed equivalent of
186 /// res\[0\] = a\[0\] - b\[0\]
187 /// res\[1\] = a\[1\] - b\[1\]
189 #[cfg_attr(test, assert_instr(qsub16))]
190 pub unsafe fn __qsub16(a
: int16x2_t
, b
: int16x2_t
) -> int16x2_t
{
191 dsp_call
!(arm_qsub16
, a
, b
)
194 /// Saturating two 16-bit integer additions
196 /// Returns the 16-bit signed equivalent of
198 /// res\[0\] = a\[0\] + b\[0\]
199 /// res\[1\] = a\[1\] + b\[1\]
201 #[cfg_attr(test, assert_instr(qadd16))]
202 pub unsafe fn __qadd16(a
: int16x2_t
, b
: int16x2_t
) -> int16x2_t
{
203 dsp_call
!(arm_qadd16
, a
, b
)
206 /// Returns the 16-bit signed saturated equivalent of
208 /// res\[0\] = a\[0\] - b\[1\]
209 /// res\[1\] = a\[1\] + b\[0\]
211 #[cfg_attr(test, assert_instr(qasx))]
212 pub unsafe fn __qasx(a
: int16x2_t
, b
: int16x2_t
) -> int16x2_t
{
213 dsp_call
!(arm_qasx
, a
, b
)
216 /// Returns the 16-bit signed saturated equivalent of
218 /// res\[0\] = a\[0\] + b\[1\]
219 /// res\[1\] = a\[1\] - b\[0\]
221 #[cfg_attr(test, assert_instr(qsax))]
222 pub unsafe fn __qsax(a
: int16x2_t
, b
: int16x2_t
) -> int16x2_t
{
223 dsp_call
!(arm_qsax
, a
, b
)
226 /// Returns the 16-bit signed saturated equivalent of
228 /// res\[0\] = a\[0\] + b\[1\]
229 /// res\[1\] = a\[1\] + b\[0\]
231 /// and the GE bits of the APSR are set.
233 #[cfg_attr(test, assert_instr(sadd16))]
234 pub unsafe fn __sadd16(a
: int16x2_t
, b
: int16x2_t
) -> int16x2_t
{
235 dsp_call
!(arm_sadd16
, a
, b
)
238 /// Returns the 8-bit signed saturated equivalent of
240 /// res\[0\] = a\[0\] + b\[1\]
241 /// res\[1\] = a\[1\] + b\[0\]
242 /// res\[2\] = a\[2\] + b\[2\]
243 /// res\[3\] = a\[3\] + b\[3\]
245 /// and the GE bits of the APSR are set.
247 #[cfg_attr(test, assert_instr(sadd8))]
248 pub unsafe fn __sadd8(a
: int8x4_t
, b
: int8x4_t
) -> int8x4_t
{
249 dsp_call
!(arm_sadd8
, a
, b
)
252 /// Dual 16-bit Signed Multiply with Addition of products
253 /// and 32-bit accumulation.
255 /// Returns the 16-bit signed equivalent of
256 /// res = a\[0\] * b\[0\] + a\[1\] * b\[1\] + c
258 #[cfg_attr(test, assert_instr(smlad))]
259 pub unsafe fn __smlad(a
: int16x2_t
, b
: int16x2_t
, c
: i32) -> i32 {
260 arm_smlad(transmute(a
), transmute(b
), c
)
263 /// Dual 16-bit Signed Multiply with Subtraction of products
264 /// and 32-bit accumulation and overflow detection.
266 /// Returns the 16-bit signed equivalent of
267 /// res = a\[0\] * b\[0\] - a\[1\] * b\[1\] + c
269 #[cfg_attr(test, assert_instr(smlsd))]
270 pub unsafe fn __smlsd(a
: int16x2_t
, b
: int16x2_t
, c
: i32) -> i32 {
271 arm_smlsd(transmute(a
), transmute(b
), c
)
274 /// Returns the 16-bit signed equivalent of
276 /// res\[0\] = a\[0\] - b\[1\]
277 /// res\[1\] = a\[1\] + b\[0\]
279 /// and the GE bits of the APSR are set.
281 #[cfg_attr(test, assert_instr(sasx))]
282 pub unsafe fn __sasx(a
: int16x2_t
, b
: int16x2_t
) -> int16x2_t
{
283 dsp_call
!(arm_sasx
, a
, b
)
286 /// Select bytes from each operand according to APSR GE flags
288 /// Returns the equivalent of
290 /// res\[0\] = GE\[0\] ? a\[0\] : b\[0\]
291 /// res\[1\] = GE\[1\] ? a\[1\] : b\[1\]
292 /// res\[2\] = GE\[2\] ? a\[2\] : b\[2\]
293 /// res\[3\] = GE\[3\] ? a\[3\] : b\[3\]
295 /// where GE are bits of APSR
297 #[cfg_attr(test, assert_instr(sel))]
298 pub unsafe fn __sel(a
: int8x4_t
, b
: int8x4_t
) -> int8x4_t
{
299 dsp_call
!(arm_sel
, a
, b
)
302 /// Signed halving parallel byte-wise addition.
304 /// Returns the 8-bit signed equivalent of
306 /// res\[0\] = (a\[0\] + b\[0\]) / 2
307 /// res\[1\] = (a\[1\] + b\[1\]) / 2
308 /// res\[2\] = (a\[2\] + b\[2\]) / 2
309 /// res\[3\] = (a\[3\] + b\[3\]) / 2
311 #[cfg_attr(test, assert_instr(shadd8))]
312 pub unsafe fn __shadd8(a
: int8x4_t
, b
: int8x4_t
) -> int8x4_t
{
313 dsp_call
!(arm_shadd8
, a
, b
)
316 /// Signed halving parallel halfword-wise addition.
318 /// Returns the 16-bit signed equivalent of
320 /// res\[0\] = (a\[0\] + b\[0\]) / 2
321 /// res\[1\] = (a\[1\] + b\[1\]) / 2
323 #[cfg_attr(test, assert_instr(shadd16))]
324 pub unsafe fn __shadd16(a
: int16x2_t
, b
: int16x2_t
) -> int16x2_t
{
325 dsp_call
!(arm_shadd16
, a
, b
)
328 /// Signed halving parallel byte-wise subtraction.
330 /// Returns the 8-bit signed equivalent of
332 /// res\[0\] = (a\[0\] - b\[0\]) / 2
333 /// res\[1\] = (a\[1\] - b\[1\]) / 2
334 /// res\[2\] = (a\[2\] - b\[2\]) / 2
335 /// res\[3\] = (a\[3\] - b\[3\]) / 2
337 #[cfg_attr(test, assert_instr(shsub8))]
338 pub unsafe fn __shsub8(a
: int8x4_t
, b
: int8x4_t
) -> int8x4_t
{
339 dsp_call
!(arm_shsub8
, a
, b
)
342 /// Inserts a `USUB8` instruction.
344 /// Returns the 8-bit unsigned equivalent of
346 /// res\[0\] = a\[0\] - a\[0\]
347 /// res\[1\] = a\[1\] - a\[1\]
348 /// res\[2\] = a\[2\] - a\[2\]
349 /// res\[3\] = a\[3\] - a\[3\]
351 /// where \[0\] is the lower 8 bits and \[3\] is the upper 8 bits.
352 /// The GE bits of the APSR are set.
354 #[cfg_attr(test, assert_instr(usub8))]
355 pub unsafe fn __usub8(a
: uint8x4_t
, b
: uint8x4_t
) -> uint8x4_t
{
356 dsp_call
!(arm_usub8
, a
, b
)
359 /// Inserts a `SSUB8` instruction.
361 /// Returns the 8-bit signed equivalent of
363 /// res\[0\] = a\[0\] - a\[0\]
364 /// res\[1\] = a\[1\] - a\[1\]
365 /// res\[2\] = a\[2\] - a\[2\]
366 /// res\[3\] = a\[3\] - a\[3\]
368 /// where \[0\] is the lower 8 bits and \[3\] is the upper 8 bits.
369 /// The GE bits of the APSR are set.
371 #[cfg_attr(test, assert_instr(ssub8))]
372 pub unsafe fn __ssub8(a
: int8x4_t
, b
: int8x4_t
) -> int8x4_t
{
373 dsp_call
!(arm_ssub8
, a
, b
)
376 /// Signed halving parallel halfword-wise subtraction.
378 /// Returns the 16-bit signed equivalent of
380 /// res\[0\] = (a\[0\] - b\[0\]) / 2
381 /// res\[1\] = (a\[1\] - b\[1\]) / 2
383 #[cfg_attr(test, assert_instr(shsub16))]
384 pub unsafe fn __shsub16(a
: int16x2_t
, b
: int16x2_t
) -> int16x2_t
{
385 dsp_call
!(arm_shsub16
, a
, b
)
388 /// Signed Dual Multiply Add.
390 /// Returns the equivalent of
392 /// res = a\[0\] * b\[0\] + a\[1\] * b\[1\]
394 /// and sets the Q flag if overflow occurs on the addition.
396 #[cfg_attr(test, assert_instr(smuad))]
397 pub unsafe fn __smuad(a
: int16x2_t
, b
: int16x2_t
) -> i32 {
398 arm_smuad(transmute(a
), transmute(b
))
401 /// Signed Dual Multiply Add Reversed.
403 /// Returns the equivalent of
405 /// res = a\[0\] * b\[1\] + a\[1\] * b\[0\]
407 /// and sets the Q flag if overflow occurs on the addition.
409 #[cfg_attr(test, assert_instr(smuadx))]
410 pub unsafe fn __smuadx(a
: int16x2_t
, b
: int16x2_t
) -> i32 {
411 arm_smuadx(transmute(a
), transmute(b
))
414 /// Signed Dual Multiply Subtract.
416 /// Returns the equivalent of
418 /// res = a\[0\] * b\[0\] - a\[1\] * b\[1\]
420 /// and sets the Q flag if overflow occurs on the addition.
422 #[cfg_attr(test, assert_instr(smusd))]
423 pub unsafe fn __smusd(a
: int16x2_t
, b
: int16x2_t
) -> i32 {
424 arm_smusd(transmute(a
), transmute(b
))
427 /// Signed Dual Multiply Subtract Reversed.
429 /// Returns the equivalent of
431 /// res = a\[0\] * b\[1\] - a\[1\] * b\[0\]
433 /// and sets the Q flag if overflow occurs on the addition.
435 #[cfg_attr(test, assert_instr(smusdx))]
436 pub unsafe fn __smusdx(a
: int16x2_t
, b
: int16x2_t
) -> i32 {
437 arm_smusdx(transmute(a
), transmute(b
))
440 /// Sum of 8-bit absolute differences.
442 /// Returns the 8-bit unsigned equivalent of
444 /// res = abs(a\[0\] - b\[0\]) + abs(a\[1\] - b\[1\]) +\
445 /// (a\[2\] - b\[2\]) + (a\[3\] - b\[3\])
447 #[cfg_attr(test, assert_instr(usad8))]
448 pub unsafe fn __usad8(a
: int8x4_t
, b
: int8x4_t
) -> u32 {
449 arm_usad8(transmute(a
), transmute(b
))
452 /// Sum of 8-bit absolute differences and constant.
454 /// Returns the 8-bit unsigned equivalent of
456 /// res = abs(a\[0\] - b\[0\]) + abs(a\[1\] - b\[1\]) +\
457 /// (a\[2\] - b\[2\]) + (a\[3\] - b\[3\]) + c
459 #[cfg_attr(test, assert_instr(usad8))]
460 pub unsafe fn __usada8(a
: int8x4_t
, b
: int8x4_t
, c
: u32) -> u32 {
#[cfg(test)]
mod tests {
    use crate::core_arch::simd::{i16x2, i8x4, u8x4};
    use std::mem::transmute;
    use stdarch_test::simd_test;

    // NOTE(review): the `#[cfg(test)] mod tests` wrapper, the `#[simd_test]`
    // fn headers, and most final `assert_eq!` lines were lost in extraction
    // and are reconstructed here from the surviving `let` bindings. The
    // `enable = "dsp"` feature string is assumed — confirm against the
    // crate's target-feature list.

    #[simd_test(enable = "dsp")]
    unsafe fn qadd8() {
        let a = i8x4::new(1, 2, 3, i8::MAX);
        let b = i8x4::new(2, -1, 0, 1);
        let c = i8x4::new(3, 1, 3, i8::MAX);
        let r: i8x4 = dsp_call!(super::__qadd8, a, b);
        assert_eq!(r, c);
    }

    #[simd_test(enable = "dsp")]
    unsafe fn qsub8() {
        let a = i8x4::new(1, 2, 3, i8::MIN);
        let b = i8x4::new(2, -1, 0, 1);
        let c = i8x4::new(-1, 3, 3, i8::MIN);
        let r: i8x4 = dsp_call!(super::__qsub8, a, b);
        assert_eq!(r, c);
    }

    #[simd_test(enable = "dsp")]
    unsafe fn qadd16() {
        let a = i16x2::new(1, 2);
        let b = i16x2::new(2, -1);
        let c = i16x2::new(3, 1);
        let r: i16x2 = dsp_call!(super::__qadd16, a, b);
        assert_eq!(r, c);
    }

    #[simd_test(enable = "dsp")]
    unsafe fn qsub16() {
        let a = i16x2::new(10, 20);
        let b = i16x2::new(20, -10);
        let c = i16x2::new(-10, 30);
        let r: i16x2 = dsp_call!(super::__qsub16, a, b);
        assert_eq!(r, c);
    }

    #[simd_test(enable = "dsp")]
    unsafe fn qasx() {
        let a = i16x2::new(1, i16::MAX);
        let b = i16x2::new(2, 2);
        let c = i16x2::new(-1, i16::MAX);
        let r: i16x2 = dsp_call!(super::__qasx, a, b);
        assert_eq!(r, c);
    }

    #[simd_test(enable = "dsp")]
    unsafe fn qsax() {
        let a = i16x2::new(1, i16::MAX);
        let b = i16x2::new(2, 2);
        let c = i16x2::new(3, i16::MAX - 2);
        let r: i16x2 = dsp_call!(super::__qsax, a, b);
        assert_eq!(r, c);
    }

    #[simd_test(enable = "dsp")]
    unsafe fn sadd16() {
        let a = i16x2::new(1, i16::MAX);
        let b = i16x2::new(2, 2);
        let c = i16x2::new(3, -i16::MAX);
        let r: i16x2 = dsp_call!(super::__sadd16, a, b);
        assert_eq!(r, c);
    }

    #[simd_test(enable = "dsp")]
    unsafe fn sadd8() {
        let a = i8x4::new(1, 2, 3, i8::MAX);
        let b = i8x4::new(4, 3, 2, 2);
        let c = i8x4::new(5, 5, 5, -i8::MAX);
        let r: i8x4 = dsp_call!(super::__sadd8, a, b);
        assert_eq!(r, c);
    }

    #[simd_test(enable = "dsp")]
    unsafe fn sasx() {
        let a = i16x2::new(1, 2);
        let b = i16x2::new(2, 1);
        let c = i16x2::new(0, 4);
        let r: i16x2 = dsp_call!(super::__sasx, a, b);
        assert_eq!(r, c);
    }

    #[simd_test(enable = "dsp")]
    unsafe fn smlad() {
        let a = i16x2::new(1, 2);
        let b = i16x2::new(3, 4);
        let r = super::__smlad(transmute(a), transmute(b), 10);
        assert_eq!(r, (1 * 3) + (2 * 4) + 10);
    }

    #[simd_test(enable = "dsp")]
    unsafe fn smlsd() {
        let a = i16x2::new(1, 2);
        let b = i16x2::new(3, 4);
        let r = super::__smlsd(transmute(a), transmute(b), 10);
        assert_eq!(r, ((1 * 3) - (2 * 4)) + 10);
    }

    #[simd_test(enable = "dsp")]
    unsafe fn sel() {
        let a = i8x4::new(1, 2, 3, i8::MAX);
        let b = i8x4::new(4, 3, 2, 2);
        // call sadd8() to set GE bits
        super::__sadd8(transmute(a), transmute(b));
        let c = i8x4::new(1, 2, 3, i8::MAX);
        let r: i8x4 = dsp_call!(super::__sel, a, b);
        assert_eq!(r, c);
    }

    #[simd_test(enable = "dsp")]
    unsafe fn shadd8() {
        let a = i8x4::new(1, 2, 3, 4);
        let b = i8x4::new(5, 4, 3, 2);
        let c = i8x4::new(3, 3, 3, 3);
        let r: i8x4 = dsp_call!(super::__shadd8, a, b);
        assert_eq!(r, c);
    }

    #[simd_test(enable = "dsp")]
    unsafe fn shadd16() {
        let a = i16x2::new(1, 2);
        let b = i16x2::new(5, 4);
        let c = i16x2::new(3, 3);
        let r: i16x2 = dsp_call!(super::__shadd16, a, b);
        assert_eq!(r, c);
    }

    #[simd_test(enable = "dsp")]
    unsafe fn shsub8() {
        let a = i8x4::new(1, 2, 3, 4);
        let b = i8x4::new(5, 4, 3, 2);
        let c = i8x4::new(-2, -1, 0, 1);
        let r: i8x4 = dsp_call!(super::__shsub8, a, b);
        assert_eq!(r, c);
    }

    #[simd_test(enable = "dsp")]
    unsafe fn ssub8() {
        let a = i8x4::new(1, 2, 3, 4);
        let b = i8x4::new(5, 4, 3, 2);
        let c = i8x4::new(-4, -2, 0, 2);
        let r: i8x4 = dsp_call!(super::__ssub8, a, b);
        assert_eq!(r, c);
    }

    #[simd_test(enable = "dsp")]
    unsafe fn usub8() {
        let a = u8x4::new(1, 2, 3, 4);
        let b = u8x4::new(5, 4, 3, 2);
        let c = u8x4::new(252, 254, 0, 2);
        let r: u8x4 = dsp_call!(super::__usub8, a, b);
        assert_eq!(r, c);
    }

    #[simd_test(enable = "dsp")]
    unsafe fn shsub16() {
        let a = i16x2::new(1, 2);
        let b = i16x2::new(5, 4);
        let c = i16x2::new(-2, -1);
        let r: i16x2 = dsp_call!(super::__shsub16, a, b);
        assert_eq!(r, c);
    }

    #[simd_test(enable = "dsp")]
    unsafe fn smuad() {
        let a = i16x2::new(1, 2);
        let b = i16x2::new(5, 4);
        let r = super::__smuad(transmute(a), transmute(b));
        // 1 * 5 + 2 * 4
        assert_eq!(r, 13);
    }

    #[simd_test(enable = "dsp")]
    unsafe fn smuadx() {
        let a = i16x2::new(1, 2);
        let b = i16x2::new(5, 4);
        let r = super::__smuadx(transmute(a), transmute(b));
        // 1 * 4 + 2 * 5
        assert_eq!(r, 14);
    }

    #[simd_test(enable = "dsp")]
    unsafe fn smusd() {
        let a = i16x2::new(1, 2);
        let b = i16x2::new(5, 4);
        let r = super::__smusd(transmute(a), transmute(b));
        // 1 * 5 - 2 * 4
        assert_eq!(r, -3);
    }

    #[simd_test(enable = "dsp")]
    unsafe fn smusdx() {
        let a = i16x2::new(1, 2);
        let b = i16x2::new(5, 4);
        let r = super::__smusdx(transmute(a), transmute(b));
        // 1 * 4 - 2 * 5
        assert_eq!(r, -6);
    }

    #[simd_test(enable = "dsp")]
    unsafe fn usad8() {
        let a = i8x4::new(1, 2, 3, 4);
        let b = i8x4::new(4, 3, 2, 1);
        let r = super::__usad8(transmute(a), transmute(b));
        // |1-4| + |2-3| + |3-2| + |4-1|
        assert_eq!(r, 8);
    }

    #[simd_test(enable = "dsp")]
    unsafe fn usada8() {
        let a = i8x4::new(1, 2, 3, 4);
        let b = i8x4::new(4, 3, 2, 1);
        // NOTE(review): the binding for `c` was dropped in extraction; the
        // surviving assertion uses `8 + c`, so any u32 constant is valid.
        let c: u32 = 10;
        let r = super::__usada8(transmute(a), transmute(b), c);
        assert_eq!(r, 8 + c);
    }
}