#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "tcg/tcg.h"
+#include "vec_internal.h"
/* Note that vector data is stored in host-endian 64-bit chunks,
DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
+static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
+{
+ int8_t n1 = n, n2 = n >> 8;
+ return m + n1 + n2;
+}
+
+static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
+{
+ int16_t n1 = n, n2 = n >> 16;
+ return m + n1 + n2;
+}
+
+static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
+{
+ int32_t n1 = n, n2 = n >> 32;
+ return m + n1 + n2;
+}
+
+DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
+DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
+DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
+
+static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
+{
+ uint8_t n1 = n, n2 = n >> 8;
+ return m + n1 + n2;
+}
+
+static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
+{
+ uint16_t n1 = n, n2 = n >> 16;
+ return m + n1 + n2;
+}
+
+static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
+{
+ uint32_t n1 = n, n2 = n >> 32;
+ return m + n1 + n2;
+}
+
+DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
+DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
+DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
+
+#define do_srshl_b(n, m) do_sqrshl_bhs(n, m, 8, true, NULL)
+#define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL)
+#define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL)
+#define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL)
+
+DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
+DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
+DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
+DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)
+
+#define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
+#define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
+#define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL)
+#define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL)
+
+DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
+DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
+DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
+DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
+
+/*
+ * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
+ * We pass in a pointer to a dummy saturation field to trigger
+ * the saturating arithmetic but discard the information about
+ * whether it has occurred.
+ */
+#define do_sqshl_b(n, m) \
+ ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
+#define do_sqshl_h(n, m) \
+ ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
+#define do_sqshl_s(n, m) \
+ ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
+#define do_sqshl_d(n, m) \
+ ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
+
+DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b)
+DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
+DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
+DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
+
+#define do_uqshl_b(n, m) \
+ ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
+#define do_uqshl_h(n, m) \
+ ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
+#define do_uqshl_s(n, m) \
+ ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
+#define do_uqshl_d(n, m) \
+ ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
+
+DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b)
+DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
+DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
+DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
+
+#define do_sqrshl_b(n, m) \
+ ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
+#define do_sqrshl_h(n, m) \
+ ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
+#define do_sqrshl_s(n, m) \
+ ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
+#define do_sqrshl_d(n, m) \
+ ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
+
+DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b)
+DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
+DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
+DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
+
+#undef do_sqrshl_d
+
+#define do_uqrshl_b(n, m) \
+ ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
+#define do_uqrshl_h(n, m) \
+ ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
+#define do_uqrshl_s(n, m) \
+ ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
+#define do_uqrshl_d(n, m) \
+ ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
+
+DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b)
+DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
+DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
+DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
+
+#undef do_uqrshl_d
+
+#define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1)
+#define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1))
+
+DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
+DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
+DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
+DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)
+
+DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
+DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
+DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
+DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)
+
+#define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1)
+#define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1))
+
+DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
+DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
+DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
+DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)
+
+DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
+DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
+DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
+DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)
+
+#define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1)
+#define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1))
+
+DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
+DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
+DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
+DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)
+
+DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
+DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
+DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
+DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
+
+static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
+{
+ return val >= max ? max : val <= min ? min : val;
+}
+
+#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
+#define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
+#define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)
+
+static inline int64_t do_sqadd_d(int64_t n, int64_t m)
+{
+ int64_t r = n + m;
+ if (((r ^ n) & ~(n ^ m)) < 0) {
+ /* Signed overflow. */
+ return r < 0 ? INT64_MAX : INT64_MIN;
+ }
+ return r;
+}
+
+DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
+DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
+DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
+DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)
+
+#define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
+#define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
+#define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)
+
+static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
+{
+ uint64_t r = n + m;
+ return r < n ? UINT64_MAX : r;
+}
+
+DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
+DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
+DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
+DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)
+
+#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
+#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
+#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)
+
+static inline int64_t do_sqsub_d(int64_t n, int64_t m)
+{
+ int64_t r = n - m;
+ if (((r ^ n) & (n ^ m)) < 0) {
+ /* Signed overflow. */
+ return r < 0 ? INT64_MAX : INT64_MIN;
+ }
+ return r;
+}
+
+DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
+DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
+DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
+DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)
+
+#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
+#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
+#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)
+
+static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
+{
+ return n > m ? n - m : 0;
+}
+
+DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
+DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
+DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
+DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)
+
+#define DO_SUQADD_B(n, m) \
+ do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
+#define DO_SUQADD_H(n, m) \
+ do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
+#define DO_SUQADD_S(n, m) \
+ do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)
+
+static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
+{
+ uint64_t r = n + m;
+
+ if (n < 0) {
+ /* Note that m - abs(n) cannot underflow. */
+ if (r > INT64_MAX) {
+ /* Result is either very large positive or negative. */
+ if (m > -n) {
+ /* m > abs(n), so r is a very large positive. */
+ return INT64_MAX;
+ }
+ /* Result is negative. */
+ }
+ } else {
+ /* Both inputs are positive: check for overflow. */
+ if (r < m || r > INT64_MAX) {
+ return INT64_MAX;
+ }
+ }
+ return r;
+}
+
+DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
+DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
+DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
+DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)
+
+#define DO_USQADD_B(n, m) \
+ do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
+#define DO_USQADD_H(n, m) \
+ do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
+#define DO_USQADD_S(n, m) \
+ do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)
+
+static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
+{
+ uint64_t r = n + m;
+
+ if (m < 0) {
+ return n < -m ? 0 : r;
+ }
+ return r < n ? UINT64_MAX : r;
+}
+
+DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
+DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
+DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
+DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)
+
#undef DO_ZPZZ
#undef DO_ZPZZ_D
+/*
+ * Three operand expander, operating on element pairs.
+ * If the slot I is even, the elements from from VN {I, I+1}.
+ * If the slot I is odd, the elements from from VM {I-1, I}.
+ * Load all of the input elements in each pair before overwriting output.
+ */
+#define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ for (i = 0; i < opr_sz; ) { \
+ uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
+ do { \
+ TYPE n0 = *(TYPE *)(vn + H(i)); \
+ TYPE m0 = *(TYPE *)(vm + H(i)); \
+ TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
+ TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
+ if (pg & 1) { \
+ *(TYPE *)(vd + H(i)) = OP(n0, n1); \
+ } \
+ i += sizeof(TYPE), pg >>= sizeof(TYPE); \
+ if (pg & 1) { \
+ *(TYPE *)(vd + H(i)) = OP(m0, m1); \
+ } \
+ i += sizeof(TYPE), pg >>= sizeof(TYPE); \
+ } while (i & 15); \
+ } \
+}
+
+/* Similarly, specialized for 64-bit operands. */
+#define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
+ TYPE *d = vd, *n = vn, *m = vm; \
+ uint8_t *pg = vg; \
+ for (i = 0; i < opr_sz; i += 2) { \
+ TYPE n0 = n[i], n1 = n[i + 1]; \
+ TYPE m0 = m[i], m1 = m[i + 1]; \
+ if (pg[H1(i)] & 1) { \
+ d[i] = OP(n0, n1); \
+ } \
+ if (pg[H1(i + 1)] & 1) { \
+ d[i + 1] = OP(m0, m1); \
+ } \
+ } \
+}
+
+DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
+DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
+DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
+DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)
+
+DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
+DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
+DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
+DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)
+
+DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
+DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
+DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
+DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)
+
+DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
+DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
+DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
+DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)
+
+DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
+DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
+DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
+DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)
+
+#undef DO_ZPZZ_PAIR
+#undef DO_ZPZZ_PAIR_D
+
+#define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
+ void *status, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ for (i = 0; i < opr_sz; ) { \
+ uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
+ do { \
+ TYPE n0 = *(TYPE *)(vn + H(i)); \
+ TYPE m0 = *(TYPE *)(vm + H(i)); \
+ TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
+ TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
+ if (pg & 1) { \
+ *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \
+ } \
+ i += sizeof(TYPE), pg >>= sizeof(TYPE); \
+ if (pg & 1) { \
+ *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \
+ } \
+ i += sizeof(TYPE), pg >>= sizeof(TYPE); \
+ } while (i & 15); \
+ } \
+}
+
+DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
+DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
+DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, , float64_add)
+
+DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
+DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
+DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, , float64_maxnum)
+
+DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
+DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
+DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, , float64_minnum)
+
+DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
+DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
+DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, , float64_max)
+
+DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
+DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
+DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, , float64_min)
+
+#undef DO_ZPZZ_PAIR_FP
+
/* Three-operand expander, controlled by a predicate, in which the
* third operand is "wide". That is, for D = N op M, the same 64-bit
* value of M is used with all of the narrower values of N.
DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
+#define DO_SQABS(X) \
+ ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
+ x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })
+
+DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
+DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
+DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
+DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)
+
+#define DO_SQNEG(X) \
+ ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
+ x_ == min_ ? -min_ - 1 : -x_; })
+
+DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
+DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
+DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
+DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)
+
+DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
+DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
+
/* Three-operand expander, unpredicated, in which the third operand is "wide".
*/
#define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
-#undef DO_ZZW
+#undef DO_ZZW
+
+#undef DO_CLS_B
+#undef DO_CLS_H
+#undef DO_CLZ_B
+#undef DO_CLZ_H
+#undef DO_CNOT
+#undef DO_FABS
+#undef DO_FNEG
+#undef DO_ABS
+#undef DO_NEG
+#undef DO_ZPZ
+#undef DO_ZPZ_D
+
+/*
+ * Three-operand expander, unpredicated, in which the two inputs are
+ * selected from the top or bottom half of the wide column.
+ */
+#define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
+ int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
+ for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
+ TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
+ TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
+ *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
+ } \
+}
+
+DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
+DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
+DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, , H1_4, DO_ADD)
+
+DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
+DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
+DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, , H1_4, DO_SUB)
+
+DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
+DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
+DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, , H1_4, DO_ABD)
+
+DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
+DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
+DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, , H1_4, DO_ADD)
+
+DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
+DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
+DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, , H1_4, DO_SUB)
+
+DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
+DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
+DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, , H1_4, DO_ABD)
+
+DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
+DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
+DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, , H1_4, DO_MUL)
+
+DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
+DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
+DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, , H1_4, DO_MUL)
+
+/* Note that the multiply cannot overflow, but the doubling can. */
+static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
+{
+ int16_t val = n * m;
+ return DO_SQADD_H(val, val);
+}
+
+static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
+{
+ int32_t val = n * m;
+ return DO_SQADD_S(val, val);
+}
+
+static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
+{
+ int64_t val = n * m;
+ return do_sqadd_d(val, val);
+}
+
+DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
+DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
+DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, , H1_4, do_sqdmull_d)
+
+#undef DO_ZZZ_TB
+
+#define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
+ for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
+ TYPEW nn = *(TYPEW *)(vn + HW(i)); \
+ TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
+ *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
+ } \
+}
+
+DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
+DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
+DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, , H1_4, DO_ADD)
+
+DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
+DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
+DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, , H1_4, DO_SUB)
+
+DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
+DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
+DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, , H1_4, DO_ADD)
+
+DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
+DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
+DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, , H1_4, DO_SUB)
+
+#undef DO_ZZZ_WTB
+
+#define DO_ZZZ_NTB(NAME, TYPE, H, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
+ intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
+ for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
+ TYPE nn = *(TYPE *)(vn + H(i + sel1)); \
+ TYPE mm = *(TYPE *)(vm + H(i + sel2)); \
+ *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \
+ } \
+}
+
+DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
+DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
+DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
+DO_ZZZ_NTB(sve2_eoril_d, uint64_t, , DO_EOR)
+
+#undef DO_ZZZ_NTB
+
+#define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \
+ for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
+ TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
+ TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \
+ TYPEW aa = *(TYPEW *)(va + HW(i)); \
+ *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \
+ } \
+}
+
+DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
+DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
+DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, , H1_4, DO_ABD)
+
+DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
+DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
+DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, , H1_4, DO_ABD)
+
+DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
+DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
+DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, , H1_4, DO_MUL)
+
+DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
+DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
+DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, , H1_4, DO_MUL)
+
+#define DO_NMUL(N, M) -(N * M)
+
+DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
+DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
+DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, , H1_4, DO_NMUL)
+
+DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
+DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
+DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, , H1_4, DO_NMUL)
+
+#undef DO_ZZZW_ACC
+
+#define DO_XTNB(NAME, TYPE, OP) \
+void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
+ TYPE nn = *(TYPE *)(vn + i); \
+ nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \
+ *(TYPE *)(vd + i) = nn; \
+ } \
+}
+
+#define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \
+void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \
+ for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
+ TYPE nn = *(TYPE *)(vn + i); \
+ *(TYPEN *)(vd + i + odd) = OP(nn); \
+ } \
+}
+
+#define DO_SQXTN_H(n) do_sat_bhs(n, INT8_MIN, INT8_MAX)
+#define DO_SQXTN_S(n) do_sat_bhs(n, INT16_MIN, INT16_MAX)
+#define DO_SQXTN_D(n) do_sat_bhs(n, INT32_MIN, INT32_MAX)
+
+DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
+DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
+DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)
+
+DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
+DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
+DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)
+
+#define DO_UQXTN_H(n) do_sat_bhs(n, 0, UINT8_MAX)
+#define DO_UQXTN_S(n) do_sat_bhs(n, 0, UINT16_MAX)
+#define DO_UQXTN_D(n) do_sat_bhs(n, 0, UINT32_MAX)
+
+DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
+DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
+DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)
+
+DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
+DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
+DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)
+
+DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
+DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
+DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)
+
+DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
+DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
+DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)
+
+#undef DO_XTNB
+#undef DO_XTNT
+
+void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
+ uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
+ uint32_t *a = va, *n = vn;
+ uint64_t *d = vd, *m = vm;
+
+ for (i = 0; i < opr_sz / 8; ++i) {
+ uint32_t e1 = a[2 * i + H4(0)];
+ uint32_t e2 = n[2 * i + sel] ^ inv;
+ uint64_t c = extract64(m[i], 32, 1);
+ /* Compute and store the entire 33-bit result at once. */
+ d[i] = c + e1 + e2;
+ }
+}
+
+void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
+ uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
+ uint64_t *d = vd, *a = va, *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz / 8; i += 2) {
+ Int128 e1 = int128_make64(a[i]);
+ Int128 e2 = int128_make64(n[i + sel] ^ inv);
+ Int128 c = int128_make64(m[i + 1] & 1);
+ Int128 r = int128_add(int128_add(e1, e2), c);
+ d[i + 0] = int128_getlo(r);
+ d[i + 1] = int128_gethi(r);
+ }
+}
+
+#define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
+ int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
+ for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
+ TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
+ TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
+ TYPEW aa = *(TYPEW *)(va + HW(i)); \
+ *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \
+ } \
+}
+
+DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
+ do_sqdmull_h, DO_SQADD_H)
+DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
+ do_sqdmull_s, DO_SQADD_S)
+DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, , H1_4,
+ do_sqdmull_d, do_sqadd_d)
+
+DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
+ do_sqdmull_h, DO_SQSUB_H)
+DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
+ do_sqdmull_s, DO_SQSUB_S)
+DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, , H1_4,
+ do_sqdmull_d, do_sqsub_d)
+
+#undef DO_SQDMLAL
+
+#define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
+ int rot = simd_data(desc); \
+ int sel_a = rot & 1, sel_b = sel_a ^ 1; \
+ bool sub_r = rot == 1 || rot == 2; \
+ bool sub_i = rot >= 2; \
+ TYPE *d = vd, *n = vn, *m = vm, *a = va; \
+ for (i = 0; i < opr_sz; i += 2) { \
+ TYPE elt1_a = n[H(i + sel_a)]; \
+ TYPE elt2_a = m[H(i + sel_a)]; \
+ TYPE elt2_b = m[H(i + sel_b)]; \
+ d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r); \
+ d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i); \
+ } \
+}
+
+#define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))
+
+DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
+DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
+DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
+DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, , DO_CMLA)
+
+#define DO_SQRDMLAH_B(N, M, A, S) \
+ do_sqrdmlah_b(N, M, A, S, true)
+#define DO_SQRDMLAH_H(N, M, A, S) \
+ ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
+#define DO_SQRDMLAH_S(N, M, A, S) \
+ ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
+#define DO_SQRDMLAH_D(N, M, A, S) \
+ do_sqrdmlah_d(N, M, A, S, true)
+
+DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
+DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
+DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
+DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, , DO_SQRDMLAH_D)
+
+#undef DO_CMLA
+#undef DO_CMLA_FUNC
+#undef DO_SQRDMLAH_B
+#undef DO_SQRDMLAH_H
+#undef DO_SQRDMLAH_S
+#undef DO_SQRDMLAH_D
+
+#define DO_BITPERM(NAME, TYPE, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
+ TYPE nn = *(TYPE *)(vn + i); \
+ TYPE mm = *(TYPE *)(vm + i); \
+ *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \
+ } \
+}
+
+static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
+{
+ uint64_t res = 0;
+ int db, rb = 0;
+
+ for (db = 0; db < n; ++db) {
+ if ((mask >> db) & 1) {
+ res |= ((data >> db) & 1) << rb;
+ ++rb;
+ }
+ }
+ return res;
+}
+
+DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
+DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
+DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
+DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
+
+static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
+{
+ uint64_t res = 0;
+ int rb, db = 0;
+
+ for (rb = 0; rb < n; ++rb) {
+ if ((mask >> rb) & 1) {
+ res |= ((data >> db) & 1) << rb;
+ ++db;
+ }
+ }
+ return res;
+}
+
+DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
+DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
+DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
+DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
+
+static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
+{
+ uint64_t resm = 0, resu = 0;
+ int db, rbm = 0, rbu = 0;
+
+ for (db = 0; db < n; ++db) {
+ uint64_t val = (data >> db) & 1;
+ if ((mask >> db) & 1) {
+ resm |= val << rbm++;
+ } else {
+ resu |= val << rbu++;
+ }
+ }
+
+ return resm | (resu << rbm);
+}
+
+DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
+DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
+DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
+DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
+
+#undef DO_BITPERM
+
+#define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ int sub_r = simd_data(desc); \
+ if (sub_r) { \
+ for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
+ TYPE acc_r = *(TYPE *)(vn + H(i)); \
+ TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
+ TYPE el2_r = *(TYPE *)(vm + H(i)); \
+ TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
+ acc_r = ADD_OP(acc_r, el2_i); \
+ acc_i = SUB_OP(acc_i, el2_r); \
+ *(TYPE *)(vd + H(i)) = acc_r; \
+ *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
+ } \
+ } else { \
+ for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
+ TYPE acc_r = *(TYPE *)(vn + H(i)); \
+ TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
+ TYPE el2_r = *(TYPE *)(vm + H(i)); \
+ TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
+ acc_r = SUB_OP(acc_r, el2_i); \
+ acc_i = ADD_OP(acc_i, el2_r); \
+ *(TYPE *)(vd + H(i)) = acc_r; \
+ *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
+ } \
+ } \
+}
+
+DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
+DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
+DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
+DO_CADD(sve2_cadd_d, int64_t, , DO_ADD, DO_SUB)
+
+DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
+DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
+DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
+DO_CADD(sve2_sqcadd_d, int64_t, , do_sqadd_d, do_sqsub_d)
+
+#undef DO_CADD
+
+#define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
+void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \
+ int shift = simd_data(desc) >> 1; \
+ for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
+ TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \
+ *(TYPEW *)(vd + HW(i)) = nn << shift; \
+ } \
+}
+
+DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
+DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
+DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, , H1_4)
+
+DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
+DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
+DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, , H1_4)
-#undef DO_CLS_B
-#undef DO_CLS_H
-#undef DO_CLZ_B
-#undef DO_CLZ_H
-#undef DO_CNOT
-#undef DO_FABS
-#undef DO_FNEG
-#undef DO_ABS
-#undef DO_NEG
-#undef DO_ZPZ
-#undef DO_ZPZ_D
+#undef DO_ZZI_SHLL
/* Two-operand reduction expander, controlled by a predicate.
* The difference between TYPERED and TYPERET has to do with
return (intptr_t)-1 << esz;
}
-uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words)
+uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
{
+ intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
uint32_t flags = PREDTEST_INIT;
uint64_t *d = vd, *g = vg;
intptr_t i = 0;
uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
{
- intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS);
- intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
+ intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
+ intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
uint32_t flags = PREDTEST_INIT;
uint64_t *d = vd, *g = vg, esz_mask;
intptr_t i, next;
when N is negative, add 2**M-1. */
#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
+static inline uint64_t do_urshr(uint64_t x, unsigned sh)
+{
+ if (likely(sh < 64)) {
+ return (x >> sh) + ((x >> (sh - 1)) & 1);
+ } else if (sh == 64) {
+ return x >> 63;
+ } else {
+ return 0;
+ }
+}
+
+static inline int64_t do_srshr(int64_t x, unsigned sh)
+{
+ if (likely(sh < 64)) {
+ return (x >> sh) + ((x >> (sh - 1)) & 1);
+ } else {
+ /* Rounding the sign bit always produces 0. */
+ return 0;
+ }
+}
+
DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
-#undef DO_SHR
-#undef DO_SHL
#undef DO_ASRD
#undef DO_ZPZI
#undef DO_ZPZI_D
+#define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \
+void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ int shift = simd_data(desc); \
+ for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
+ TYPEW nn = *(TYPEW *)(vn + i); \
+ *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \
+ } \
+}
+
+#define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \
+void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ int shift = simd_data(desc); \
+ for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
+ TYPEW nn = *(TYPEW *)(vn + HW(i)); \
+ *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \
+ } \
+}
+
+DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
+DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
+DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)
+
+DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
+DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
+DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, , H1_4, DO_SHR)
+
+DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
+DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
+DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)
+
+DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
+DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
+DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, , H1_4, do_urshr)
+
+#define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
+#define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
+#define DO_SQSHRUN_D(x, sh) \
+ do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)
+
+DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
+DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
+DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)
+
+DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
+DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
+DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, , H1_4, DO_SQSHRUN_D)
+
+#define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
+#define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
+#define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)
+
+DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
+DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
+DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)
+
+DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
+DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
+DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, , H1_4, DO_SQRSHRUN_D)
+
+#define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
+#define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
+#define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)
+
+DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
+DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
+DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)
+
+DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
+DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
+DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, , H1_4, DO_SQSHRN_D)
+
+#define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
+#define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
+#define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)
+
+DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
+DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
+DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)
+
+DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
+DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
+DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, , H1_4, DO_SQRSHRN_D)
+
+#define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
+#define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
+#define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
+
+DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
+DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
+DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)
+
+DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
+DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
+DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, , H1_4, DO_UQSHRN_D)
+
+#define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
+#define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
+#define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)
+
+DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
+DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
+DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)
+
+DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
+DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
+DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, , H1_4, DO_UQRSHRN_D)
+
+#undef DO_SHRNB
+#undef DO_SHRNT
+
+#define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
+ TYPEW nn = *(TYPEW *)(vn + i); \
+ TYPEW mm = *(TYPEW *)(vm + i); \
+ *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT); \
+ } \
+}
+
+#define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
+ TYPEW nn = *(TYPEW *)(vn + HW(i)); \
+ TYPEW mm = *(TYPEW *)(vm + HW(i)); \
+ *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \
+ } \
+}
+
+#define DO_ADDHN(N, M, SH) ((N + M) >> SH)
+#define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
+#define DO_SUBHN(N, M, SH) ((N - M) >> SH)
+#define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)
+
+DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
+DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
+DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)
+
+DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
+DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
+DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, , H1_4, DO_ADDHN)
+
+DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
+DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
+DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)
+
+DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
+DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
+DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, , H1_4, DO_RADDHN)
+
+DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
+DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
+DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)
+
+DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
+DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
+DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, , H1_4, DO_SUBHN)
+
+DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
+DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
+DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)
+
+DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
+DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
+DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, , H1_4, DO_RSUBHN)
+
+#undef DO_RSUBHN
+#undef DO_SUBHN
+#undef DO_RADDHN
+#undef DO_ADDHN
+
+#undef DO_BINOPNB
+
/* Fully general four-operand expander, controlled by a predicate.
*/
#define DO_ZPZZZ(NAME, TYPE, H, OP) \
intptr_t i, oprsz = simd_oprsz(desc);
for (i = 0; i < oprsz; i += sizeof(int8_t)) {
- int r = *(int8_t *)(a + i) + b;
- if (r > INT8_MAX) {
- r = INT8_MAX;
- } else if (r < INT8_MIN) {
- r = INT8_MIN;
- }
- *(int8_t *)(d + i) = r;
+ *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
}
}
intptr_t i, oprsz = simd_oprsz(desc);
for (i = 0; i < oprsz; i += sizeof(int16_t)) {
- int r = *(int16_t *)(a + i) + b;
- if (r > INT16_MAX) {
- r = INT16_MAX;
- } else if (r < INT16_MIN) {
- r = INT16_MIN;
- }
- *(int16_t *)(d + i) = r;
+ *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
}
}
intptr_t i, oprsz = simd_oprsz(desc);
for (i = 0; i < oprsz; i += sizeof(int32_t)) {
- int64_t r = *(int32_t *)(a + i) + b;
- if (r > INT32_MAX) {
- r = INT32_MAX;
- } else if (r < INT32_MIN) {
- r = INT32_MIN;
- }
- *(int32_t *)(d + i) = r;
+ *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
}
}
intptr_t i, oprsz = simd_oprsz(desc);
for (i = 0; i < oprsz; i += sizeof(int64_t)) {
- int64_t ai = *(int64_t *)(a + i);
- int64_t r = ai + b;
- if (((r ^ ai) & ~(ai ^ b)) < 0) {
- /* Signed overflow. */
- r = (r < 0 ? INT64_MAX : INT64_MIN);
- }
- *(int64_t *)(d + i) = r;
+ *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
}
}
intptr_t i, oprsz = simd_oprsz(desc);
for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
- int r = *(uint8_t *)(a + i) + b;
- if (r > UINT8_MAX) {
- r = UINT8_MAX;
- } else if (r < 0) {
- r = 0;
- }
- *(uint8_t *)(d + i) = r;
+ *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
}
}
intptr_t i, oprsz = simd_oprsz(desc);
for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
- int r = *(uint16_t *)(a + i) + b;
- if (r > UINT16_MAX) {
- r = UINT16_MAX;
- } else if (r < 0) {
- r = 0;
- }
- *(uint16_t *)(d + i) = r;
+ *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
}
}
intptr_t i, oprsz = simd_oprsz(desc);
for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
- int64_t r = *(uint32_t *)(a + i) + b;
- if (r > UINT32_MAX) {
- r = UINT32_MAX;
- } else if (r < 0) {
- r = 0;
- }
- *(uint32_t *)(d + i) = r;
+ *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
}
}
intptr_t i, oprsz = simd_oprsz(desc);
for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
- uint64_t r = *(uint64_t *)(a + i) + b;
- if (r < b) {
- r = UINT64_MAX;
- }
- *(uint64_t *)(d + i) = r;
+ *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
}
}
intptr_t i, oprsz = simd_oprsz(desc);
for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
- uint64_t ai = *(uint64_t *)(a + i);
- *(uint64_t *)(d + i) = (ai < b ? 0 : ai - b);
+ *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
}
}
void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
- intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
- int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
- intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
+ intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
+ int esize = 1 << esz;
uint64_t *d = vd;
intptr_t i;
mm = extract64(mm, high * half, half);
nn = expand_bits(nn, esz);
mm = expand_bits(mm, esz);
- d[0] = nn + (mm << (1 << esz));
+ d[0] = nn | (mm << esize);
} else {
- ARMPredicateReg tmp_n, tmp_m;
+ ARMPredicateReg tmp;
/* We produce output faster than we consume input.
Therefore we must be mindful of possible overlap. */
- if ((vn - vd) < (uintptr_t)oprsz) {
- vn = memcpy(&tmp_n, vn, oprsz);
- }
- if ((vm - vd) < (uintptr_t)oprsz) {
- vm = memcpy(&tmp_m, vm, oprsz);
+ if (vd == vn) {
+ vn = memcpy(&tmp, vn, oprsz);
+ if (vd == vm) {
+ vm = vn;
+ }
+ } else if (vd == vm) {
+ vm = memcpy(&tmp, vm, oprsz);
}
if (high) {
high = oprsz >> 1;
}
- if ((high & 3) == 0) {
+ if ((oprsz & 7) == 0) {
uint32_t *n = vn, *m = vm;
high >>= 2;
- for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
+ for (i = 0; i < oprsz / 8; i++) {
uint64_t nn = n[H4(high + i)];
uint64_t mm = m[H4(high + i)];
nn = expand_bits(nn, esz);
mm = expand_bits(mm, esz);
- d[i] = nn + (mm << (1 << esz));
+ d[i] = nn | (mm << esize);
}
} else {
uint8_t *n = vn, *m = vm;
nn = expand_bits(nn, esz);
mm = expand_bits(mm, esz);
- d16[H2(i)] = nn + (mm << (1 << esz));
+ d16[H2(i)] = nn | (mm << esize);
}
}
}
void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
- intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
- int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
- int odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1) << esz;
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
+ int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
uint64_t *d = vd, *n = vn, *m = vm;
uint64_t l, h;
intptr_t i;
if (oprsz <= 8) {
l = compress_bits(n[0] >> odd, esz);
h = compress_bits(m[0] >> odd, esz);
- d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz);
+ d[0] = l | (h << (4 * oprsz));
} else {
ARMPredicateReg tmp_m;
intptr_t oprsz_16 = oprsz / 16;
h = n[2 * i + 1];
l = compress_bits(l >> odd, esz);
h = compress_bits(h >> odd, esz);
- d[i] = l + (h << 32);
+ d[i] = l | (h << 32);
}
- /* For VL which is not a power of 2, the results from M do not
- align nicely with the uint64_t for D. Put the aligned results
- from M into TMP_M and then copy it into place afterward. */
+ /*
+ * For VL which is not a multiple of 512, the results from M do not
+ * align nicely with the uint64_t for D. Put the aligned results
+ * from M into TMP_M and then copy it into place afterward.
+ */
if (oprsz & 15) {
- d[i] = compress_bits(n[2 * i] >> odd, esz);
+ int final_shift = (oprsz & 15) * 2;
+
+ l = n[2 * i + 0];
+ h = n[2 * i + 1];
+ l = compress_bits(l >> odd, esz);
+ h = compress_bits(h >> odd, esz);
+ d[i] = l | (h << final_shift);
for (i = 0; i < oprsz_16; i++) {
l = m[2 * i + 0];
h = m[2 * i + 1];
l = compress_bits(l >> odd, esz);
h = compress_bits(h >> odd, esz);
- tmp_m.p[i] = l + (h << 32);
+ tmp_m.p[i] = l | (h << 32);
}
- tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz);
+ l = m[2 * i + 0];
+ h = m[2 * i + 1];
+ l = compress_bits(l >> odd, esz);
+ h = compress_bits(h >> odd, esz);
+ tmp_m.p[i] = l | (h << final_shift);
swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
} else {
h = m[2 * i + 1];
l = compress_bits(l >> odd, esz);
h = compress_bits(h >> odd, esz);
- d[oprsz_16 + i] = l + (h << 32);
+ d[oprsz_16 + i] = l | (h << 32);
}
}
}
void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
- intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
- uintptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
- bool odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
+ int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
uint64_t *d = vd, *n = vn, *m = vm;
uint64_t mask;
int shr, shl;
void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
{
- intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
- int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
intptr_t i, oprsz_2 = oprsz / 2;
if (oprsz <= 8) {
void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
{
- intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
- intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
uint64_t *d = vd;
intptr_t i;
high = oprsz >> 1;
}
- if ((high & 3) == 0) {
+ if ((oprsz & 7) == 0) {
uint32_t *n = vn;
high >>= 2;
- for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
+ for (i = 0; i < oprsz / 8; i++) {
uint64_t nn = n[H4(high + i)];
d[i] = expand_bits(nn, 0);
}
*/
int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
{
- intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
- intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
+ intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
+ intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
- return last_active_element(vg, DIV_ROUND_UP(oprsz, 8), esz);
+ return last_active_element(vg, words, esz);
}
void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
uint32_t pred_desc)
{
- intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
if (last_active_pred(vn, vg, oprsz)) {
compute_brk_z(vd, vm, vg, oprsz, true);
} else {
uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
uint32_t pred_desc)
{
- intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
if (last_active_pred(vn, vg, oprsz)) {
return compute_brks_z(vd, vm, vg, oprsz, true);
} else {
void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
uint32_t pred_desc)
{
- intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
if (last_active_pred(vn, vg, oprsz)) {
compute_brk_z(vd, vm, vg, oprsz, false);
} else {
uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
uint32_t pred_desc)
{
- intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
if (last_active_pred(vn, vg, oprsz)) {
return compute_brks_z(vd, vm, vg, oprsz, false);
} else {
void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
- intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
compute_brk_z(vd, vn, vg, oprsz, true);
}
uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
- intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
return compute_brks_z(vd, vn, vg, oprsz, true);
}
void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
- intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
compute_brk_z(vd, vn, vg, oprsz, false);
}
uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
- intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
return compute_brks_z(vd, vn, vg, oprsz, false);
}
void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
- intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
compute_brk_m(vd, vn, vg, oprsz, true);
}
uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
- intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
return compute_brks_m(vd, vn, vg, oprsz, true);
}
void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
- intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
compute_brk_m(vd, vn, vg, oprsz, false);
}
uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
- intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
return compute_brks_m(vd, vn, vg, oprsz, false);
}
void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
- intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
-
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
if (!last_active_pred(vn, vg, oprsz)) {
do_zero(vd, oprsz);
}
uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
- intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
-
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
if (last_active_pred(vn, vg, oprsz)) {
return predtest_ones(vd, oprsz, -1);
} else {
uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
{
- intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
- intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
+ intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
+ intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
intptr_t i;
- for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
+ for (i = 0; i < words; ++i) {
uint64_t t = n[i] & g[i] & mask;
sum += ctpop64(t);
}
return sum;
}
-uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc)
+uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
{
- uintptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
- intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
uint64_t esz_mask = pred_esz_masks[esz];
ARMPredicateReg *d = vd;
uint32_t flags;
return predtest_ones(d, oprsz, esz_mask);
}
+uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
+{
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
+ uint64_t esz_mask = pred_esz_masks[esz];
+ ARMPredicateReg *d = vd;
+ intptr_t i, invcount, oprbits;
+ uint64_t bits;
+
+ if (count == 0) {
+ return do_zero(d, oprsz);
+ }
+
+ oprbits = oprsz * 8;
+ tcg_debug_assert(count <= oprbits);
+
+ bits = esz_mask;
+ if (oprbits & 63) {
+ bits &= MAKE_64BIT_MASK(0, oprbits & 63);
+ }
+
+ invcount = oprbits - count;
+ for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
+ d->p[i] = bits;
+ bits = esz_mask;
+ }
+
+ d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);
+
+ while (--i >= 0) {
+ d->p[i] = 0;
+ }
+
+ return predtest_ones(d, oprsz, esz_mask);
+}
+
/* Recursive reduction on a function;
* C.f. the ARM ARM function ReducePredicated.
*
} \
uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
{ \
- uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_maxsz(desc); \
+ uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \
TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
for (i = 0; i < oprsz; ) { \
uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
#endif
}
-typedef uint64_t mte_check_fn(CPUARMState *, uint32_t, uint64_t, uintptr_t);
-
-static inline QEMU_ALWAYS_INLINE
-void sve_cont_ldst_mte_check_int(SVEContLdSt *info, CPUARMState *env,
- uint64_t *vg, target_ulong addr, int esize,
- int msize, uint32_t mtedesc, uintptr_t ra,
- mte_check_fn *check)
+static void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
+ uint64_t *vg, target_ulong addr, int esize,
+ int msize, uint32_t mtedesc, uintptr_t ra)
{
intptr_t mem_off, reg_off, reg_last;
uint64_t pg = vg[reg_off >> 6];
do {
if ((pg >> (reg_off & 63)) & 1) {
- check(env, mtedesc, addr, ra);
+ mte_check(env, mtedesc, addr, ra);
}
reg_off += esize;
mem_off += msize;
uint64_t pg = vg[reg_off >> 6];
do {
if ((pg >> (reg_off & 63)) & 1) {
- check(env, mtedesc, addr, ra);
+ mte_check(env, mtedesc, addr, ra);
}
reg_off += esize;
mem_off += msize;
}
}
-typedef void sve_cont_ldst_mte_check_fn(SVEContLdSt *info, CPUARMState *env,
- uint64_t *vg, target_ulong addr,
- int esize, int msize, uint32_t mtedesc,
- uintptr_t ra);
-
-static void sve_cont_ldst_mte_check1(SVEContLdSt *info, CPUARMState *env,
- uint64_t *vg, target_ulong addr,
- int esize, int msize, uint32_t mtedesc,
- uintptr_t ra)
-{
- sve_cont_ldst_mte_check_int(info, env, vg, addr, esize, msize,
- mtedesc, ra, mte_check1);
-}
-
-static void sve_cont_ldst_mte_checkN(SVEContLdSt *info, CPUARMState *env,
- uint64_t *vg, target_ulong addr,
- int esize, int msize, uint32_t mtedesc,
- uintptr_t ra)
-{
- sve_cont_ldst_mte_check_int(info, env, vg, addr, esize, msize,
- mtedesc, ra, mte_checkN);
-}
-
-
/*
* Common helper for all contiguous 1,2,3,4-register predicated stores.
*/
uint32_t desc, const uintptr_t retaddr,
const int esz, const int msz, const int N, uint32_t mtedesc,
sve_ldst1_host_fn *host_fn,
- sve_ldst1_tlb_fn *tlb_fn,
- sve_cont_ldst_mte_check_fn *mte_check_fn)
+ sve_ldst1_tlb_fn *tlb_fn)
{
const unsigned rd = simd_data(desc);
const intptr_t reg_max = simd_oprsz(desc);
* Handle mte checks for all active elements.
* Since TBI must be set for MTE, !mtedesc => !mte_active.
*/
- if (mte_check_fn && mtedesc) {
- mte_check_fn(&info, env, vg, addr, 1 << esz, N << msz,
- mtedesc, retaddr);
+ if (mtedesc) {
+ sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
+ mtedesc, retaddr);
}
flags = info.page[0].flags | info.page[1].flags;
mtedesc = 0;
}
- sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn,
- N == 1 ? sve_cont_ldst_mte_check1 : sve_cont_ldst_mte_checkN);
+ sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
}
#define DO_LD1_1(NAME, ESZ) \
target_ulong addr, uint32_t desc) \
{ \
sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \
- sve_##NAME##_host, sve_##NAME##_tlb, NULL); \
+ sve_##NAME##_host, sve_##NAME##_tlb); \
} \
void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \
target_ulong addr, uint32_t desc) \
target_ulong addr, uint32_t desc) \
{ \
sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
- sve_##NAME##_le_host, sve_##NAME##_le_tlb, NULL); \
+ sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
} \
void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
target_ulong addr, uint32_t desc) \
{ \
sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
- sve_##NAME##_be_host, sve_##NAME##_be_tlb, NULL); \
+ sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
} \
void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
+ target_ulong addr, uint32_t desc) \
{ \
sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
} \
void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
+ target_ulong addr, uint32_t desc) \
{ \
sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
target_ulong addr, uint32_t desc) \
{ \
sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \
- sve_ld1bb_host, sve_ld1bb_tlb, NULL); \
+ sve_ld1bb_host, sve_ld1bb_tlb); \
} \
void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \
target_ulong addr, uint32_t desc) \
target_ulong addr, uint32_t desc) \
{ \
sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
- sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb, NULL); \
+ sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
} \
void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \
target_ulong addr, uint32_t desc) \
{ \
sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
- sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb, NULL); \
+ sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
} \
void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \
target_ulong addr, uint32_t desc) \
if (fault == FAULT_FIRST) {
/* Trapping mte check for the first-fault element. */
if (mtedesc) {
- mte_check1(env, mtedesc, addr + mem_off, retaddr);
+ mte_check(env, mtedesc, addr + mem_off, retaddr);
}
/*
/* Watchpoint hit, see below. */
goto do_fault;
}
- if (mtedesc && !mte_probe1(env, mtedesc, addr + mem_off)) {
+ if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
goto do_fault;
}
/*
& BP_MEM_READ)) {
goto do_fault;
}
- if (mtedesc && !mte_probe1(env, mtedesc, addr + mem_off)) {
+ if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
goto do_fault;
}
host_fn(vd, reg_off, host + mem_off);
uint32_t desc, const uintptr_t retaddr,
const int esz, const int msz, const int N, uint32_t mtedesc,
sve_ldst1_host_fn *host_fn,
- sve_ldst1_tlb_fn *tlb_fn,
- sve_cont_ldst_mte_check_fn *mte_check_fn)
+ sve_ldst1_tlb_fn *tlb_fn)
{
const unsigned rd = simd_data(desc);
const intptr_t reg_max = simd_oprsz(desc);
* Handle mte checks for all active elements.
* Since TBI must be set for MTE, !mtedesc => !mte_active.
*/
- if (mte_check_fn && mtedesc) {
- mte_check_fn(&info, env, vg, addr, 1 << esz, N << msz,
- mtedesc, retaddr);
+ if (mtedesc) {
+ sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
+ mtedesc, retaddr);
}
flags = info.page[0].flags | info.page[1].flags;
mtedesc = 0;
}
- sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn,
- N == 1 ? sve_cont_ldst_mte_check1 : sve_cont_ldst_mte_checkN);
+ sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
}
#define DO_STN_1(N, NAME, ESZ) \
target_ulong addr, uint32_t desc) \
{ \
sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \
- sve_st1##NAME##_host, sve_st1##NAME##_tlb, NULL); \
+ sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
} \
void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \
target_ulong addr, uint32_t desc) \
target_ulong addr, uint32_t desc) \
{ \
sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
- sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb, NULL); \
+ sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
} \
void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \
target_ulong addr, uint32_t desc) \
{ \
sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
- sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb, NULL); \
+ sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
} \
void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
target_ulong addr, uint32_t desc) \
info.attrs, BP_MEM_READ, retaddr);
}
if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
- mte_check1(env, mtedesc, addr, retaddr);
+ mte_check(env, mtedesc, addr, retaddr);
}
host_fn(&scratch, reg_off, info.host);
} else {
BP_MEM_READ, retaddr);
}
if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
- mte_check1(env, mtedesc, addr, retaddr);
+ mte_check(env, mtedesc, addr, retaddr);
}
tlb_fn(env, &scratch, reg_off, addr, retaddr);
}
*/
addr = base + (off_fn(vm, reg_off) << scale);
if (mtedesc) {
- mte_check1(env, mtedesc, addr, retaddr);
+ mte_check(env, mtedesc, addr, retaddr);
}
tlb_fn(env, vd, reg_off, addr, retaddr);
}
if (mtedesc &&
arm_tlb_mte_tagged(&info.attrs) &&
- !mte_probe1(env, mtedesc, addr)) {
+ !mte_probe(env, mtedesc, addr)) {
goto fault;
}
}
if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
- mte_check1(env, mtedesc, addr, retaddr);
+ mte_check(env, mtedesc, addr, retaddr);
}
}
i += 1;
#undef DO_ST1_ZPZ_S
#undef DO_ST1_ZPZ_D
+
+void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
+
+ for (i = 0; i < opr_sz; ++i) {
+ d[i] = n[i] ^ m[i] ^ k[i];
+ }
+}
+
+void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
+
+ for (i = 0; i < opr_sz; ++i) {
+ d[i] = n[i] ^ (m[i] & ~k[i]);
+ }
+}
+
+void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
+
+ for (i = 0; i < opr_sz; ++i) {
+ d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
+ }
+}
+
+void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
+
+ for (i = 0; i < opr_sz; ++i) {
+ d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
+ }
+}
+
+void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
+
+ for (i = 0; i < opr_sz; ++i) {
+ d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
+ }
+}
+
+/*
+ * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
+ * See hasless(v,1) from
+ * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
+ */
+static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
+{
+ int bits = 8 << esz;
+ uint64_t ones = dup_const(esz, 1);
+ uint64_t signs = ones << (bits - 1);
+ uint64_t cmp0, cmp1;
+
+ cmp1 = dup_const(esz, n);
+ cmp0 = cmp1 ^ m0;
+ cmp1 = cmp1 ^ m1;
+ cmp0 = (cmp0 - ones) & ~cmp0;
+ cmp1 = (cmp1 - ones) & ~cmp1;
+ return (cmp0 | cmp1) & signs;
+}
+
+static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
+ uint32_t desc, int esz, bool nmatch)
+{
+ uint16_t esz_mask = pred_esz_masks[esz];
+ intptr_t opr_sz = simd_oprsz(desc);
+ uint32_t flags = PREDTEST_INIT;
+ intptr_t i, j, k;
+
+ for (i = 0; i < opr_sz; i += 16) {
+ uint64_t m0 = *(uint64_t *)(vm + i);
+ uint64_t m1 = *(uint64_t *)(vm + i + 8);
+ uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
+ uint16_t out = 0;
+
+ for (j = 0; j < 16; j += 8) {
+ uint64_t n = *(uint64_t *)(vn + i + j);
+
+ for (k = 0; k < 8; k += 1 << esz) {
+ if (pg & (1 << (j + k))) {
+ bool o = do_match2(n >> (k * 8), m0, m1, esz);
+ out |= (o ^ nmatch) << (j + k);
+ }
+ }
+ }
+ *(uint16_t *)(vd + H1_2(i >> 3)) = out;
+ flags = iter_predtest_fwd(out, pg, flags);
+ }
+ return flags;
+}
+
+#define DO_PPZZ_MATCH(NAME, ESZ, INV) \
+uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
+{ \
+ return do_match(vd, vn, vm, vg, desc, ESZ, INV); \
+}
+
+DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
+DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)
+
+DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
+DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)
+
+#undef DO_PPZZ_MATCH
+
+void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
+ uint32_t desc)
+{
+ ARMVectorReg scratch;
+ intptr_t i, j;
+ intptr_t opr_sz = simd_oprsz(desc);
+ uint32_t *d = vd, *n = vn, *m = vm;
+ uint8_t *pg = vg;
+
+ if (d == n) {
+ n = memcpy(&scratch, n, opr_sz);
+ if (d == m) {
+ m = n;
+ }
+ } else if (d == m) {
+ m = memcpy(&scratch, m, opr_sz);
+ }
+
+ for (i = 0; i < opr_sz; i += 4) {
+ uint64_t count = 0;
+ uint8_t pred;
+
+ pred = pg[H1(i >> 3)] >> (i & 7);
+ if (pred & 1) {
+ uint32_t nn = n[H4(i >> 2)];
+
+ for (j = 0; j <= i; j += 4) {
+ pred = pg[H1(j >> 3)] >> (j & 7);
+ if ((pred & 1) && nn == m[H4(j >> 2)]) {
+ ++count;
+ }
+ }
+ }
+ d[H4(i >> 2)] = count;
+ }
+}
+
+void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
+ uint32_t desc)
+{
+ ARMVectorReg scratch;
+ intptr_t i, j;
+ intptr_t opr_sz = simd_oprsz(desc);
+ uint64_t *d = vd, *n = vn, *m = vm;
+ uint8_t *pg = vg;
+
+ if (d == n) {
+ n = memcpy(&scratch, n, opr_sz);
+ if (d == m) {
+ m = n;
+ }
+ } else if (d == m) {
+ m = memcpy(&scratch, m, opr_sz);
+ }
+
+ for (i = 0; i < opr_sz / 8; ++i) {
+ uint64_t count = 0;
+ if (pg[H1(i)] & 1) {
+ uint64_t nn = n[i];
+ for (j = 0; j <= i; ++j) {
+ if ((pg[H1(j)] & 1) && nn == m[j]) {
+ ++count;
+ }
+ }
+ }
+ d[i] = count;
+ }
+}
+
+/*
+ * Returns the number of bytes in m0 and m1 that match n.
+ * Unlike do_match2 we don't just need true/false, we need an exact count.
+ * This requires two extra logical operations.
+ */
+static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
+{
+ const uint64_t mask = dup_const(MO_8, 0x7f);
+ uint64_t cmp0, cmp1;
+
+ cmp1 = dup_const(MO_8, n);
+ cmp0 = cmp1 ^ m0;
+ cmp1 = cmp1 ^ m1;
+
+ /*
+ * 1: clear msb of each byte to avoid carry to next byte (& mask)
+ * 2: carry in to msb if byte != 0 (+ mask)
+ * 3: set msb if cmp has msb set (| cmp)
+ * 4: set ~msb to ignore them (| mask)
+ * We now have 0xff for byte != 0 or 0x7f for byte == 0.
+ * 5: invert, resulting in 0x80 if and only if byte == 0.
+ */
+ cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
+ cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);
+
+ /*
+ * Combine the two compares in a way that the bits do
+ * not overlap, and so preserves the count of set bits.
+ * If the host has an efficient instruction for ctpop,
+ * then ctpop(x) + ctpop(y) has the same number of
+ * operations as ctpop(x | (y >> 1)). If the host does
+ * not have an efficient ctpop, then we only want to
+ * use it once.
+ */
+ return ctpop64(cmp0 | (cmp1 >> 1));
+}
+
+void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, j;
+ intptr_t opr_sz = simd_oprsz(desc);
+
+ for (i = 0; i < opr_sz; i += 16) {
+ uint64_t n0 = *(uint64_t *)(vn + i);
+ uint64_t m0 = *(uint64_t *)(vm + i);
+ uint64_t n1 = *(uint64_t *)(vn + i + 8);
+ uint64_t m1 = *(uint64_t *)(vm + i + 8);
+ uint64_t out0 = 0;
+ uint64_t out1 = 0;
+
+ for (j = 0; j < 64; j += 8) {
+ uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
+ uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
+ out0 |= cnt0 << j;
+ out1 |= cnt1 << j;
+ }
+
+ *(uint64_t *)(vd + i) = out0;
+ *(uint64_t *)(vd + i + 8) = out1;
+ }
+}
+
+void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ int shr = simd_data(desc);
+ int shl = 8 - shr;
+ uint64_t mask = dup_const(MO_8, 0xff >> shr);
+ uint64_t *d = vd, *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz; ++i) {
+ uint64_t t = n[i] ^ m[i];
+ d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
+ }
+}
+
+void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ int shr = simd_data(desc);
+ int shl = 16 - shr;
+ uint64_t mask = dup_const(MO_16, 0xffff >> shr);
+ uint64_t *d = vd, *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz; ++i) {
+ uint64_t t = n[i] ^ m[i];
+ d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
+ }
+}
+
+void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 4;
+ int shr = simd_data(desc);
+ uint32_t *d = vd, *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz; ++i) {
+ d[i] = ror32(n[i] ^ m[i], shr);
+ }
+}