+++ /dev/null
-/*
- * crypto_helper.c - emulate v8 Crypto Extensions instructions
- *
- * Copyright (C) 2013 - 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- */
-
-#include "qemu/osdep.h"
-
-#include "cpu.h"
-#include "exec/helper-proto.h"
-#include "tcg/tcg-gvec-desc.h"
-#include "crypto/aes.h"
-#include "crypto/sm4.h"
-#include "vec_internal.h"
-
-union CRYPTO_STATE {
- uint8_t bytes[16];
- uint32_t words[4];
- uint64_t l[2];
-};
-
-#if HOST_BIG_ENDIAN
-#define CR_ST_BYTE(state, i) ((state).bytes[(15 - (i)) ^ 8])
-#define CR_ST_WORD(state, i) ((state).words[(3 - (i)) ^ 2])
-#else
-#define CR_ST_BYTE(state, i) ((state).bytes[i])
-#define CR_ST_WORD(state, i) ((state).words[i])
-#endif
-
-/*
- * The caller has not been converted to full gvec, and so only
- * modifies the low 16 bytes of the vector register.
- */
-static void clear_tail_16(void *vd, uint32_t desc)
-{
- int opr_sz = simd_oprsz(desc);
- int max_sz = simd_maxsz(desc);
-
- assert(opr_sz == 16);
- clear_tail(vd, opr_sz, max_sz);
-}
-
-static void do_crypto_aese(uint64_t *rd, uint64_t *rn,
- uint64_t *rm, bool decrypt)
-{
- static uint8_t const * const sbox[2] = { AES_sbox, AES_isbox };
- static uint8_t const * const shift[2] = { AES_shifts, AES_ishifts };
- union CRYPTO_STATE rk = { .l = { rm[0], rm[1] } };
- union CRYPTO_STATE st = { .l = { rn[0], rn[1] } };
- int i;
-
- /* xor state vector with round key */
- rk.l[0] ^= st.l[0];
- rk.l[1] ^= st.l[1];
-
- /* combine ShiftRows operation and sbox substitution */
- for (i = 0; i < 16; i++) {
- CR_ST_BYTE(st, i) = sbox[decrypt][CR_ST_BYTE(rk, shift[decrypt][i])];
- }
-
- rd[0] = st.l[0];
- rd[1] = st.l[1];
-}
-
-void HELPER(crypto_aese)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- bool decrypt = simd_data(desc);
-
- for (i = 0; i < opr_sz; i += 16) {
- do_crypto_aese(vd + i, vn + i, vm + i, decrypt);
- }
- clear_tail(vd, opr_sz, simd_maxsz(desc));
-}
-
-static void do_crypto_aesmc(uint64_t *rd, uint64_t *rm, bool decrypt)
-{
- static uint32_t const mc[][256] = { {
- /* MixColumns lookup table */
- 0x00000000, 0x03010102, 0x06020204, 0x05030306,
- 0x0c040408, 0x0f05050a, 0x0a06060c, 0x0907070e,
- 0x18080810, 0x1b090912, 0x1e0a0a14, 0x1d0b0b16,
- 0x140c0c18, 0x170d0d1a, 0x120e0e1c, 0x110f0f1e,
- 0x30101020, 0x33111122, 0x36121224, 0x35131326,
- 0x3c141428, 0x3f15152a, 0x3a16162c, 0x3917172e,
- 0x28181830, 0x2b191932, 0x2e1a1a34, 0x2d1b1b36,
- 0x241c1c38, 0x271d1d3a, 0x221e1e3c, 0x211f1f3e,
- 0x60202040, 0x63212142, 0x66222244, 0x65232346,
- 0x6c242448, 0x6f25254a, 0x6a26264c, 0x6927274e,
- 0x78282850, 0x7b292952, 0x7e2a2a54, 0x7d2b2b56,
- 0x742c2c58, 0x772d2d5a, 0x722e2e5c, 0x712f2f5e,
- 0x50303060, 0x53313162, 0x56323264, 0x55333366,
- 0x5c343468, 0x5f35356a, 0x5a36366c, 0x5937376e,
- 0x48383870, 0x4b393972, 0x4e3a3a74, 0x4d3b3b76,
- 0x443c3c78, 0x473d3d7a, 0x423e3e7c, 0x413f3f7e,
- 0xc0404080, 0xc3414182, 0xc6424284, 0xc5434386,
- 0xcc444488, 0xcf45458a, 0xca46468c, 0xc947478e,
- 0xd8484890, 0xdb494992, 0xde4a4a94, 0xdd4b4b96,
- 0xd44c4c98, 0xd74d4d9a, 0xd24e4e9c, 0xd14f4f9e,
- 0xf05050a0, 0xf35151a2, 0xf65252a4, 0xf55353a6,
- 0xfc5454a8, 0xff5555aa, 0xfa5656ac, 0xf95757ae,
- 0xe85858b0, 0xeb5959b2, 0xee5a5ab4, 0xed5b5bb6,
- 0xe45c5cb8, 0xe75d5dba, 0xe25e5ebc, 0xe15f5fbe,
- 0xa06060c0, 0xa36161c2, 0xa66262c4, 0xa56363c6,
- 0xac6464c8, 0xaf6565ca, 0xaa6666cc, 0xa96767ce,
- 0xb86868d0, 0xbb6969d2, 0xbe6a6ad4, 0xbd6b6bd6,
- 0xb46c6cd8, 0xb76d6dda, 0xb26e6edc, 0xb16f6fde,
- 0x907070e0, 0x937171e2, 0x967272e4, 0x957373e6,
- 0x9c7474e8, 0x9f7575ea, 0x9a7676ec, 0x997777ee,
- 0x887878f0, 0x8b7979f2, 0x8e7a7af4, 0x8d7b7bf6,
- 0x847c7cf8, 0x877d7dfa, 0x827e7efc, 0x817f7ffe,
- 0x9b80801b, 0x98818119, 0x9d82821f, 0x9e83831d,
- 0x97848413, 0x94858511, 0x91868617, 0x92878715,
- 0x8388880b, 0x80898909, 0x858a8a0f, 0x868b8b0d,
- 0x8f8c8c03, 0x8c8d8d01, 0x898e8e07, 0x8a8f8f05,
- 0xab90903b, 0xa8919139, 0xad92923f, 0xae93933d,
- 0xa7949433, 0xa4959531, 0xa1969637, 0xa2979735,
- 0xb398982b, 0xb0999929, 0xb59a9a2f, 0xb69b9b2d,
- 0xbf9c9c23, 0xbc9d9d21, 0xb99e9e27, 0xba9f9f25,
- 0xfba0a05b, 0xf8a1a159, 0xfda2a25f, 0xfea3a35d,
- 0xf7a4a453, 0xf4a5a551, 0xf1a6a657, 0xf2a7a755,
- 0xe3a8a84b, 0xe0a9a949, 0xe5aaaa4f, 0xe6abab4d,
- 0xefacac43, 0xecadad41, 0xe9aeae47, 0xeaafaf45,
- 0xcbb0b07b, 0xc8b1b179, 0xcdb2b27f, 0xceb3b37d,
- 0xc7b4b473, 0xc4b5b571, 0xc1b6b677, 0xc2b7b775,
- 0xd3b8b86b, 0xd0b9b969, 0xd5baba6f, 0xd6bbbb6d,
- 0xdfbcbc63, 0xdcbdbd61, 0xd9bebe67, 0xdabfbf65,
- 0x5bc0c09b, 0x58c1c199, 0x5dc2c29f, 0x5ec3c39d,
- 0x57c4c493, 0x54c5c591, 0x51c6c697, 0x52c7c795,
- 0x43c8c88b, 0x40c9c989, 0x45caca8f, 0x46cbcb8d,
- 0x4fcccc83, 0x4ccdcd81, 0x49cece87, 0x4acfcf85,
- 0x6bd0d0bb, 0x68d1d1b9, 0x6dd2d2bf, 0x6ed3d3bd,
- 0x67d4d4b3, 0x64d5d5b1, 0x61d6d6b7, 0x62d7d7b5,
- 0x73d8d8ab, 0x70d9d9a9, 0x75dadaaf, 0x76dbdbad,
- 0x7fdcdca3, 0x7cdddda1, 0x79dedea7, 0x7adfdfa5,
- 0x3be0e0db, 0x38e1e1d9, 0x3de2e2df, 0x3ee3e3dd,
- 0x37e4e4d3, 0x34e5e5d1, 0x31e6e6d7, 0x32e7e7d5,
- 0x23e8e8cb, 0x20e9e9c9, 0x25eaeacf, 0x26ebebcd,
- 0x2fececc3, 0x2cededc1, 0x29eeeec7, 0x2aefefc5,
- 0x0bf0f0fb, 0x08f1f1f9, 0x0df2f2ff, 0x0ef3f3fd,
- 0x07f4f4f3, 0x04f5f5f1, 0x01f6f6f7, 0x02f7f7f5,
- 0x13f8f8eb, 0x10f9f9e9, 0x15fafaef, 0x16fbfbed,
- 0x1ffcfce3, 0x1cfdfde1, 0x19fefee7, 0x1affffe5,
- }, {
- /* Inverse MixColumns lookup table */
- 0x00000000, 0x0b0d090e, 0x161a121c, 0x1d171b12,
- 0x2c342438, 0x27392d36, 0x3a2e3624, 0x31233f2a,
- 0x58684870, 0x5365417e, 0x4e725a6c, 0x457f5362,
- 0x745c6c48, 0x7f516546, 0x62467e54, 0x694b775a,
- 0xb0d090e0, 0xbbdd99ee, 0xa6ca82fc, 0xadc78bf2,
- 0x9ce4b4d8, 0x97e9bdd6, 0x8afea6c4, 0x81f3afca,
- 0xe8b8d890, 0xe3b5d19e, 0xfea2ca8c, 0xf5afc382,
- 0xc48cfca8, 0xcf81f5a6, 0xd296eeb4, 0xd99be7ba,
- 0x7bbb3bdb, 0x70b632d5, 0x6da129c7, 0x66ac20c9,
- 0x578f1fe3, 0x5c8216ed, 0x41950dff, 0x4a9804f1,
- 0x23d373ab, 0x28de7aa5, 0x35c961b7, 0x3ec468b9,
- 0x0fe75793, 0x04ea5e9d, 0x19fd458f, 0x12f04c81,
- 0xcb6bab3b, 0xc066a235, 0xdd71b927, 0xd67cb029,
- 0xe75f8f03, 0xec52860d, 0xf1459d1f, 0xfa489411,
- 0x9303e34b, 0x980eea45, 0x8519f157, 0x8e14f859,
- 0xbf37c773, 0xb43ace7d, 0xa92dd56f, 0xa220dc61,
- 0xf66d76ad, 0xfd607fa3, 0xe07764b1, 0xeb7a6dbf,
- 0xda595295, 0xd1545b9b, 0xcc434089, 0xc74e4987,
- 0xae053edd, 0xa50837d3, 0xb81f2cc1, 0xb31225cf,
- 0x82311ae5, 0x893c13eb, 0x942b08f9, 0x9f2601f7,
- 0x46bde64d, 0x4db0ef43, 0x50a7f451, 0x5baafd5f,
- 0x6a89c275, 0x6184cb7b, 0x7c93d069, 0x779ed967,
- 0x1ed5ae3d, 0x15d8a733, 0x08cfbc21, 0x03c2b52f,
- 0x32e18a05, 0x39ec830b, 0x24fb9819, 0x2ff69117,
- 0x8dd64d76, 0x86db4478, 0x9bcc5f6a, 0x90c15664,
- 0xa1e2694e, 0xaaef6040, 0xb7f87b52, 0xbcf5725c,
- 0xd5be0506, 0xdeb30c08, 0xc3a4171a, 0xc8a91e14,
- 0xf98a213e, 0xf2872830, 0xef903322, 0xe49d3a2c,
- 0x3d06dd96, 0x360bd498, 0x2b1ccf8a, 0x2011c684,
- 0x1132f9ae, 0x1a3ff0a0, 0x0728ebb2, 0x0c25e2bc,
- 0x656e95e6, 0x6e639ce8, 0x737487fa, 0x78798ef4,
- 0x495ab1de, 0x4257b8d0, 0x5f40a3c2, 0x544daacc,
- 0xf7daec41, 0xfcd7e54f, 0xe1c0fe5d, 0xeacdf753,
- 0xdbeec879, 0xd0e3c177, 0xcdf4da65, 0xc6f9d36b,
- 0xafb2a431, 0xa4bfad3f, 0xb9a8b62d, 0xb2a5bf23,
- 0x83868009, 0x888b8907, 0x959c9215, 0x9e919b1b,
- 0x470a7ca1, 0x4c0775af, 0x51106ebd, 0x5a1d67b3,
- 0x6b3e5899, 0x60335197, 0x7d244a85, 0x7629438b,
- 0x1f6234d1, 0x146f3ddf, 0x097826cd, 0x02752fc3,
- 0x335610e9, 0x385b19e7, 0x254c02f5, 0x2e410bfb,
- 0x8c61d79a, 0x876cde94, 0x9a7bc586, 0x9176cc88,
- 0xa055f3a2, 0xab58faac, 0xb64fe1be, 0xbd42e8b0,
- 0xd4099fea, 0xdf0496e4, 0xc2138df6, 0xc91e84f8,
- 0xf83dbbd2, 0xf330b2dc, 0xee27a9ce, 0xe52aa0c0,
- 0x3cb1477a, 0x37bc4e74, 0x2aab5566, 0x21a65c68,
- 0x10856342, 0x1b886a4c, 0x069f715e, 0x0d927850,
- 0x64d90f0a, 0x6fd40604, 0x72c31d16, 0x79ce1418,
- 0x48ed2b32, 0x43e0223c, 0x5ef7392e, 0x55fa3020,
- 0x01b79aec, 0x0aba93e2, 0x17ad88f0, 0x1ca081fe,
- 0x2d83bed4, 0x268eb7da, 0x3b99acc8, 0x3094a5c6,
- 0x59dfd29c, 0x52d2db92, 0x4fc5c080, 0x44c8c98e,
- 0x75ebf6a4, 0x7ee6ffaa, 0x63f1e4b8, 0x68fcedb6,
- 0xb1670a0c, 0xba6a0302, 0xa77d1810, 0xac70111e,
- 0x9d532e34, 0x965e273a, 0x8b493c28, 0x80443526,
- 0xe90f427c, 0xe2024b72, 0xff155060, 0xf418596e,
- 0xc53b6644, 0xce366f4a, 0xd3217458, 0xd82c7d56,
- 0x7a0ca137, 0x7101a839, 0x6c16b32b, 0x671bba25,
- 0x5638850f, 0x5d358c01, 0x40229713, 0x4b2f9e1d,
- 0x2264e947, 0x2969e049, 0x347efb5b, 0x3f73f255,
- 0x0e50cd7f, 0x055dc471, 0x184adf63, 0x1347d66d,
- 0xcadc31d7, 0xc1d138d9, 0xdcc623cb, 0xd7cb2ac5,
- 0xe6e815ef, 0xede51ce1, 0xf0f207f3, 0xfbff0efd,
- 0x92b479a7, 0x99b970a9, 0x84ae6bbb, 0x8fa362b5,
- 0xbe805d9f, 0xb58d5491, 0xa89a4f83, 0xa397468d,
- } };
-
- union CRYPTO_STATE st = { .l = { rm[0], rm[1] } };
- int i;
-
- for (i = 0; i < 16; i += 4) {
- CR_ST_WORD(st, i >> 2) =
- mc[decrypt][CR_ST_BYTE(st, i)] ^
- rol32(mc[decrypt][CR_ST_BYTE(st, i + 1)], 8) ^
- rol32(mc[decrypt][CR_ST_BYTE(st, i + 2)], 16) ^
- rol32(mc[decrypt][CR_ST_BYTE(st, i + 3)], 24);
- }
-
- rd[0] = st.l[0];
- rd[1] = st.l[1];
-}
-
-void HELPER(crypto_aesmc)(void *vd, void *vm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- bool decrypt = simd_data(desc);
-
- for (i = 0; i < opr_sz; i += 16) {
- do_crypto_aesmc(vd + i, vm + i, decrypt);
- }
- clear_tail(vd, opr_sz, simd_maxsz(desc));
-}
-
-/*
- * SHA-1 logical functions
- */
-
-static uint32_t cho(uint32_t x, uint32_t y, uint32_t z)
-{
- return (x & (y ^ z)) ^ z;
-}
-
-static uint32_t par(uint32_t x, uint32_t y, uint32_t z)
-{
- return x ^ y ^ z;
-}
-
-static uint32_t maj(uint32_t x, uint32_t y, uint32_t z)
-{
- return (x & y) | ((x | y) & z);
-}
-
-void HELPER(crypto_sha1su0)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- uint64_t *d = vd, *n = vn, *m = vm;
- uint64_t d0, d1;
-
- d0 = d[1] ^ d[0] ^ m[0];
- d1 = n[0] ^ d[1] ^ m[1];
- d[0] = d0;
- d[1] = d1;
-
- clear_tail_16(vd, desc);
-}
-
-static inline void crypto_sha1_3reg(uint64_t *rd, uint64_t *rn,
- uint64_t *rm, uint32_t desc,
- uint32_t (*fn)(union CRYPTO_STATE *d))
-{
- union CRYPTO_STATE d = { .l = { rd[0], rd[1] } };
- union CRYPTO_STATE n = { .l = { rn[0], rn[1] } };
- union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
- int i;
-
- for (i = 0; i < 4; i++) {
- uint32_t t = fn(&d);
-
- t += rol32(CR_ST_WORD(d, 0), 5) + CR_ST_WORD(n, 0)
- + CR_ST_WORD(m, i);
-
- CR_ST_WORD(n, 0) = CR_ST_WORD(d, 3);
- CR_ST_WORD(d, 3) = CR_ST_WORD(d, 2);
- CR_ST_WORD(d, 2) = ror32(CR_ST_WORD(d, 1), 2);
- CR_ST_WORD(d, 1) = CR_ST_WORD(d, 0);
- CR_ST_WORD(d, 0) = t;
- }
- rd[0] = d.l[0];
- rd[1] = d.l[1];
-
- clear_tail_16(rd, desc);
-}
-
-static uint32_t do_sha1c(union CRYPTO_STATE *d)
-{
- return cho(CR_ST_WORD(*d, 1), CR_ST_WORD(*d, 2), CR_ST_WORD(*d, 3));
-}
-
-void HELPER(crypto_sha1c)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- crypto_sha1_3reg(vd, vn, vm, desc, do_sha1c);
-}
-
-static uint32_t do_sha1p(union CRYPTO_STATE *d)
-{
- return par(CR_ST_WORD(*d, 1), CR_ST_WORD(*d, 2), CR_ST_WORD(*d, 3));
-}
-
-void HELPER(crypto_sha1p)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- crypto_sha1_3reg(vd, vn, vm, desc, do_sha1p);
-}
-
-static uint32_t do_sha1m(union CRYPTO_STATE *d)
-{
- return maj(CR_ST_WORD(*d, 1), CR_ST_WORD(*d, 2), CR_ST_WORD(*d, 3));
-}
-
-void HELPER(crypto_sha1m)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- crypto_sha1_3reg(vd, vn, vm, desc, do_sha1m);
-}
-
-void HELPER(crypto_sha1h)(void *vd, void *vm, uint32_t desc)
-{
- uint64_t *rd = vd;
- uint64_t *rm = vm;
- union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
-
- CR_ST_WORD(m, 0) = ror32(CR_ST_WORD(m, 0), 2);
- CR_ST_WORD(m, 1) = CR_ST_WORD(m, 2) = CR_ST_WORD(m, 3) = 0;
-
- rd[0] = m.l[0];
- rd[1] = m.l[1];
-
- clear_tail_16(vd, desc);
-}
-
-void HELPER(crypto_sha1su1)(void *vd, void *vm, uint32_t desc)
-{
- uint64_t *rd = vd;
- uint64_t *rm = vm;
- union CRYPTO_STATE d = { .l = { rd[0], rd[1] } };
- union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
-
- CR_ST_WORD(d, 0) = rol32(CR_ST_WORD(d, 0) ^ CR_ST_WORD(m, 1), 1);
- CR_ST_WORD(d, 1) = rol32(CR_ST_WORD(d, 1) ^ CR_ST_WORD(m, 2), 1);
- CR_ST_WORD(d, 2) = rol32(CR_ST_WORD(d, 2) ^ CR_ST_WORD(m, 3), 1);
- CR_ST_WORD(d, 3) = rol32(CR_ST_WORD(d, 3) ^ CR_ST_WORD(d, 0), 1);
-
- rd[0] = d.l[0];
- rd[1] = d.l[1];
-
- clear_tail_16(vd, desc);
-}
-
-/*
- * The SHA-256 logical functions, according to
- * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf
- */
-
-static uint32_t S0(uint32_t x)
-{
- return ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22);
-}
-
-static uint32_t S1(uint32_t x)
-{
- return ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25);
-}
-
-static uint32_t s0(uint32_t x)
-{
- return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3);
-}
-
-static uint32_t s1(uint32_t x)
-{
- return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10);
-}
-
-void HELPER(crypto_sha256h)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- uint64_t *rd = vd;
- uint64_t *rn = vn;
- uint64_t *rm = vm;
- union CRYPTO_STATE d = { .l = { rd[0], rd[1] } };
- union CRYPTO_STATE n = { .l = { rn[0], rn[1] } };
- union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
- int i;
-
- for (i = 0; i < 4; i++) {
- uint32_t t = cho(CR_ST_WORD(n, 0), CR_ST_WORD(n, 1), CR_ST_WORD(n, 2))
- + CR_ST_WORD(n, 3) + S1(CR_ST_WORD(n, 0))
- + CR_ST_WORD(m, i);
-
- CR_ST_WORD(n, 3) = CR_ST_WORD(n, 2);
- CR_ST_WORD(n, 2) = CR_ST_WORD(n, 1);
- CR_ST_WORD(n, 1) = CR_ST_WORD(n, 0);
- CR_ST_WORD(n, 0) = CR_ST_WORD(d, 3) + t;
-
- t += maj(CR_ST_WORD(d, 0), CR_ST_WORD(d, 1), CR_ST_WORD(d, 2))
- + S0(CR_ST_WORD(d, 0));
-
- CR_ST_WORD(d, 3) = CR_ST_WORD(d, 2);
- CR_ST_WORD(d, 2) = CR_ST_WORD(d, 1);
- CR_ST_WORD(d, 1) = CR_ST_WORD(d, 0);
- CR_ST_WORD(d, 0) = t;
- }
-
- rd[0] = d.l[0];
- rd[1] = d.l[1];
-
- clear_tail_16(vd, desc);
-}
-
-void HELPER(crypto_sha256h2)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- uint64_t *rd = vd;
- uint64_t *rn = vn;
- uint64_t *rm = vm;
- union CRYPTO_STATE d = { .l = { rd[0], rd[1] } };
- union CRYPTO_STATE n = { .l = { rn[0], rn[1] } };
- union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
- int i;
-
- for (i = 0; i < 4; i++) {
- uint32_t t = cho(CR_ST_WORD(d, 0), CR_ST_WORD(d, 1), CR_ST_WORD(d, 2))
- + CR_ST_WORD(d, 3) + S1(CR_ST_WORD(d, 0))
- + CR_ST_WORD(m, i);
-
- CR_ST_WORD(d, 3) = CR_ST_WORD(d, 2);
- CR_ST_WORD(d, 2) = CR_ST_WORD(d, 1);
- CR_ST_WORD(d, 1) = CR_ST_WORD(d, 0);
- CR_ST_WORD(d, 0) = CR_ST_WORD(n, 3 - i) + t;
- }
-
- rd[0] = d.l[0];
- rd[1] = d.l[1];
-
- clear_tail_16(vd, desc);
-}
-
-void HELPER(crypto_sha256su0)(void *vd, void *vm, uint32_t desc)
-{
- uint64_t *rd = vd;
- uint64_t *rm = vm;
- union CRYPTO_STATE d = { .l = { rd[0], rd[1] } };
- union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
-
- CR_ST_WORD(d, 0) += s0(CR_ST_WORD(d, 1));
- CR_ST_WORD(d, 1) += s0(CR_ST_WORD(d, 2));
- CR_ST_WORD(d, 2) += s0(CR_ST_WORD(d, 3));
- CR_ST_WORD(d, 3) += s0(CR_ST_WORD(m, 0));
-
- rd[0] = d.l[0];
- rd[1] = d.l[1];
-
- clear_tail_16(vd, desc);
-}
-
-void HELPER(crypto_sha256su1)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- uint64_t *rd = vd;
- uint64_t *rn = vn;
- uint64_t *rm = vm;
- union CRYPTO_STATE d = { .l = { rd[0], rd[1] } };
- union CRYPTO_STATE n = { .l = { rn[0], rn[1] } };
- union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
-
- CR_ST_WORD(d, 0) += s1(CR_ST_WORD(m, 2)) + CR_ST_WORD(n, 1);
- CR_ST_WORD(d, 1) += s1(CR_ST_WORD(m, 3)) + CR_ST_WORD(n, 2);
- CR_ST_WORD(d, 2) += s1(CR_ST_WORD(d, 0)) + CR_ST_WORD(n, 3);
- CR_ST_WORD(d, 3) += s1(CR_ST_WORD(d, 1)) + CR_ST_WORD(m, 0);
-
- rd[0] = d.l[0];
- rd[1] = d.l[1];
-
- clear_tail_16(vd, desc);
-}
-
-/*
- * The SHA-512 logical functions (same as above but using 64-bit operands)
- */
-
-static uint64_t cho512(uint64_t x, uint64_t y, uint64_t z)
-{
- return (x & (y ^ z)) ^ z;
-}
-
-static uint64_t maj512(uint64_t x, uint64_t y, uint64_t z)
-{
- return (x & y) | ((x | y) & z);
-}
-
-static uint64_t S0_512(uint64_t x)
-{
- return ror64(x, 28) ^ ror64(x, 34) ^ ror64(x, 39);
-}
-
-static uint64_t S1_512(uint64_t x)
-{
- return ror64(x, 14) ^ ror64(x, 18) ^ ror64(x, 41);
-}
-
-static uint64_t s0_512(uint64_t x)
-{
- return ror64(x, 1) ^ ror64(x, 8) ^ (x >> 7);
-}
-
-static uint64_t s1_512(uint64_t x)
-{
- return ror64(x, 19) ^ ror64(x, 61) ^ (x >> 6);
-}
-
-void HELPER(crypto_sha512h)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- uint64_t *rd = vd;
- uint64_t *rn = vn;
- uint64_t *rm = vm;
- uint64_t d0 = rd[0];
- uint64_t d1 = rd[1];
-
- d1 += S1_512(rm[1]) + cho512(rm[1], rn[0], rn[1]);
- d0 += S1_512(d1 + rm[0]) + cho512(d1 + rm[0], rm[1], rn[0]);
-
- rd[0] = d0;
- rd[1] = d1;
-
- clear_tail_16(vd, desc);
-}
-
-void HELPER(crypto_sha512h2)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- uint64_t *rd = vd;
- uint64_t *rn = vn;
- uint64_t *rm = vm;
- uint64_t d0 = rd[0];
- uint64_t d1 = rd[1];
-
- d1 += S0_512(rm[0]) + maj512(rn[0], rm[1], rm[0]);
- d0 += S0_512(d1) + maj512(d1, rm[0], rm[1]);
-
- rd[0] = d0;
- rd[1] = d1;
-
- clear_tail_16(vd, desc);
-}
-
-void HELPER(crypto_sha512su0)(void *vd, void *vn, uint32_t desc)
-{
- uint64_t *rd = vd;
- uint64_t *rn = vn;
- uint64_t d0 = rd[0];
- uint64_t d1 = rd[1];
-
- d0 += s0_512(rd[1]);
- d1 += s0_512(rn[0]);
-
- rd[0] = d0;
- rd[1] = d1;
-
- clear_tail_16(vd, desc);
-}
-
-void HELPER(crypto_sha512su1)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- uint64_t *rd = vd;
- uint64_t *rn = vn;
- uint64_t *rm = vm;
-
- rd[0] += s1_512(rn[0]) + rm[0];
- rd[1] += s1_512(rn[1]) + rm[1];
-
- clear_tail_16(vd, desc);
-}
-
-void HELPER(crypto_sm3partw1)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- uint64_t *rd = vd;
- uint64_t *rn = vn;
- uint64_t *rm = vm;
- union CRYPTO_STATE d = { .l = { rd[0], rd[1] } };
- union CRYPTO_STATE n = { .l = { rn[0], rn[1] } };
- union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
- uint32_t t;
-
- t = CR_ST_WORD(d, 0) ^ CR_ST_WORD(n, 0) ^ ror32(CR_ST_WORD(m, 1), 17);
- CR_ST_WORD(d, 0) = t ^ ror32(t, 17) ^ ror32(t, 9);
-
- t = CR_ST_WORD(d, 1) ^ CR_ST_WORD(n, 1) ^ ror32(CR_ST_WORD(m, 2), 17);
- CR_ST_WORD(d, 1) = t ^ ror32(t, 17) ^ ror32(t, 9);
-
- t = CR_ST_WORD(d, 2) ^ CR_ST_WORD(n, 2) ^ ror32(CR_ST_WORD(m, 3), 17);
- CR_ST_WORD(d, 2) = t ^ ror32(t, 17) ^ ror32(t, 9);
-
- t = CR_ST_WORD(d, 3) ^ CR_ST_WORD(n, 3) ^ ror32(CR_ST_WORD(d, 0), 17);
- CR_ST_WORD(d, 3) = t ^ ror32(t, 17) ^ ror32(t, 9);
-
- rd[0] = d.l[0];
- rd[1] = d.l[1];
-
- clear_tail_16(vd, desc);
-}
-
-void HELPER(crypto_sm3partw2)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- uint64_t *rd = vd;
- uint64_t *rn = vn;
- uint64_t *rm = vm;
- union CRYPTO_STATE d = { .l = { rd[0], rd[1] } };
- union CRYPTO_STATE n = { .l = { rn[0], rn[1] } };
- union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
- uint32_t t = CR_ST_WORD(n, 0) ^ ror32(CR_ST_WORD(m, 0), 25);
-
- CR_ST_WORD(d, 0) ^= t;
- CR_ST_WORD(d, 1) ^= CR_ST_WORD(n, 1) ^ ror32(CR_ST_WORD(m, 1), 25);
- CR_ST_WORD(d, 2) ^= CR_ST_WORD(n, 2) ^ ror32(CR_ST_WORD(m, 2), 25);
- CR_ST_WORD(d, 3) ^= CR_ST_WORD(n, 3) ^ ror32(CR_ST_WORD(m, 3), 25) ^
- ror32(t, 17) ^ ror32(t, 2) ^ ror32(t, 26);
-
- rd[0] = d.l[0];
- rd[1] = d.l[1];
-
- clear_tail_16(vd, desc);
-}
-
-static inline void QEMU_ALWAYS_INLINE
-crypto_sm3tt(uint64_t *rd, uint64_t *rn, uint64_t *rm,
- uint32_t desc, uint32_t opcode)
-{
- union CRYPTO_STATE d = { .l = { rd[0], rd[1] } };
- union CRYPTO_STATE n = { .l = { rn[0], rn[1] } };
- union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
- uint32_t imm2 = simd_data(desc);
- uint32_t t;
-
- assert(imm2 < 4);
-
- if (opcode == 0 || opcode == 2) {
- /* SM3TT1A, SM3TT2A */
- t = par(CR_ST_WORD(d, 3), CR_ST_WORD(d, 2), CR_ST_WORD(d, 1));
- } else if (opcode == 1) {
- /* SM3TT1B */
- t = maj(CR_ST_WORD(d, 3), CR_ST_WORD(d, 2), CR_ST_WORD(d, 1));
- } else if (opcode == 3) {
- /* SM3TT2B */
- t = cho(CR_ST_WORD(d, 3), CR_ST_WORD(d, 2), CR_ST_WORD(d, 1));
- } else {
- qemu_build_not_reached();
- }
-
- t += CR_ST_WORD(d, 0) + CR_ST_WORD(m, imm2);
-
- CR_ST_WORD(d, 0) = CR_ST_WORD(d, 1);
-
- if (opcode < 2) {
- /* SM3TT1A, SM3TT1B */
- t += CR_ST_WORD(n, 3) ^ ror32(CR_ST_WORD(d, 3), 20);
-
- CR_ST_WORD(d, 1) = ror32(CR_ST_WORD(d, 2), 23);
- } else {
- /* SM3TT2A, SM3TT2B */
- t += CR_ST_WORD(n, 3);
- t ^= rol32(t, 9) ^ rol32(t, 17);
-
- CR_ST_WORD(d, 1) = ror32(CR_ST_WORD(d, 2), 13);
- }
-
- CR_ST_WORD(d, 2) = CR_ST_WORD(d, 3);
- CR_ST_WORD(d, 3) = t;
-
- rd[0] = d.l[0];
- rd[1] = d.l[1];
-
- clear_tail_16(rd, desc);
-}
-
-#define DO_SM3TT(NAME, OPCODE) \
- void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
- { crypto_sm3tt(vd, vn, vm, desc, OPCODE); }
-
-DO_SM3TT(crypto_sm3tt1a, 0)
-DO_SM3TT(crypto_sm3tt1b, 1)
-DO_SM3TT(crypto_sm3tt2a, 2)
-DO_SM3TT(crypto_sm3tt2b, 3)
-
-#undef DO_SM3TT
-
-static void do_crypto_sm4e(uint64_t *rd, uint64_t *rn, uint64_t *rm)
-{
- union CRYPTO_STATE d = { .l = { rn[0], rn[1] } };
- union CRYPTO_STATE n = { .l = { rm[0], rm[1] } };
- uint32_t t, i;
-
- for (i = 0; i < 4; i++) {
- t = CR_ST_WORD(d, (i + 1) % 4) ^
- CR_ST_WORD(d, (i + 2) % 4) ^
- CR_ST_WORD(d, (i + 3) % 4) ^
- CR_ST_WORD(n, i);
-
- t = sm4_sbox[t & 0xff] |
- sm4_sbox[(t >> 8) & 0xff] << 8 |
- sm4_sbox[(t >> 16) & 0xff] << 16 |
- sm4_sbox[(t >> 24) & 0xff] << 24;
-
- CR_ST_WORD(d, i) ^= t ^ rol32(t, 2) ^ rol32(t, 10) ^ rol32(t, 18) ^
- rol32(t, 24);
- }
-
- rd[0] = d.l[0];
- rd[1] = d.l[1];
-}
-
-void HELPER(crypto_sm4e)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
-
- for (i = 0; i < opr_sz; i += 16) {
- do_crypto_sm4e(vd + i, vn + i, vm + i);
- }
- clear_tail(vd, opr_sz, simd_maxsz(desc));
-}
-
-static void do_crypto_sm4ekey(uint64_t *rd, uint64_t *rn, uint64_t *rm)
-{
- union CRYPTO_STATE d;
- union CRYPTO_STATE n = { .l = { rn[0], rn[1] } };
- union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
- uint32_t t, i;
-
- d = n;
- for (i = 0; i < 4; i++) {
- t = CR_ST_WORD(d, (i + 1) % 4) ^
- CR_ST_WORD(d, (i + 2) % 4) ^
- CR_ST_WORD(d, (i + 3) % 4) ^
- CR_ST_WORD(m, i);
-
- t = sm4_sbox[t & 0xff] |
- sm4_sbox[(t >> 8) & 0xff] << 8 |
- sm4_sbox[(t >> 16) & 0xff] << 16 |
- sm4_sbox[(t >> 24) & 0xff] << 24;
-
- CR_ST_WORD(d, i) ^= t ^ rol32(t, 13) ^ rol32(t, 23);
- }
-
- rd[0] = d.l[0];
- rd[1] = d.l[1];
-}
-
-void HELPER(crypto_sm4ekey)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
-
- for (i = 0; i < opr_sz; i += 16) {
- do_crypto_sm4ekey(vd + i, vn + i, vm + i);
- }
- clear_tail(vd, opr_sz, simd_maxsz(desc));
-}
-
-void HELPER(crypto_rax1)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- uint64_t *d = vd, *n = vn, *m = vm;
-
- for (i = 0; i < opr_sz / 8; ++i) {
- d[i] = n[i] ^ rol64(m[i], 1);
- }
- clear_tail(vd, opr_sz, simd_maxsz(desc));
-}
+++ /dev/null
-/*
- * AArch64 specific helpers
- *
- * Copyright (c) 2013 Alexander Graf <agraf@suse.de>
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "qemu/osdep.h"
-#include "qemu/units.h"
-#include "cpu.h"
-#include "exec/gdbstub.h"
-#include "exec/helper-proto.h"
-#include "qemu/host-utils.h"
-#include "qemu/log.h"
-#include "qemu/main-loop.h"
-#include "qemu/bitops.h"
-#include "internals.h"
-#include "qemu/crc32c.h"
-#include "exec/exec-all.h"
-#include "exec/cpu_ldst.h"
-#include "qemu/int128.h"
-#include "qemu/atomic128.h"
-#include "fpu/softfloat.h"
-#include <zlib.h> /* For crc32 */
-
-/* C2.4.7 Multiply and divide */
-/* special cases for 0 and LLONG_MIN are mandated by the standard */
-uint64_t HELPER(udiv64)(uint64_t num, uint64_t den)
-{
- if (den == 0) {
- return 0;
- }
- return num / den;
-}
-
-int64_t HELPER(sdiv64)(int64_t num, int64_t den)
-{
- if (den == 0) {
- return 0;
- }
- if (num == LLONG_MIN && den == -1) {
- return LLONG_MIN;
- }
- return num / den;
-}
-
-uint64_t HELPER(rbit64)(uint64_t x)
-{
- return revbit64(x);
-}
-
-void HELPER(msr_i_spsel)(CPUARMState *env, uint32_t imm)
-{
- update_spsel(env, imm);
-}
-
-static void daif_check(CPUARMState *env, uint32_t op,
- uint32_t imm, uintptr_t ra)
-{
- /* DAIF update to PSTATE. This is OK from EL0 only if UMA is set. */
- if (arm_current_el(env) == 0 && !(arm_sctlr(env, 0) & SCTLR_UMA)) {
- raise_exception_ra(env, EXCP_UDEF,
- syn_aa64_sysregtrap(0, extract32(op, 0, 3),
- extract32(op, 3, 3), 4,
- imm, 0x1f, 0),
- exception_target_el(env), ra);
- }
-}
-
-void HELPER(msr_i_daifset)(CPUARMState *env, uint32_t imm)
-{
- daif_check(env, 0x1e, imm, GETPC());
- env->daif |= (imm << 6) & PSTATE_DAIF;
- arm_rebuild_hflags(env);
-}
-
-void HELPER(msr_i_daifclear)(CPUARMState *env, uint32_t imm)
-{
- daif_check(env, 0x1f, imm, GETPC());
- env->daif &= ~((imm << 6) & PSTATE_DAIF);
- arm_rebuild_hflags(env);
-}
-
-/* Convert a softfloat float_relation_ (as returned by
- * the float*_compare functions) to the correct ARM
- * NZCV flag state.
- */
-static inline uint32_t float_rel_to_flags(int res)
-{
- uint64_t flags;
- switch (res) {
- case float_relation_equal:
- flags = PSTATE_Z | PSTATE_C;
- break;
- case float_relation_less:
- flags = PSTATE_N;
- break;
- case float_relation_greater:
- flags = PSTATE_C;
- break;
- case float_relation_unordered:
- default:
- flags = PSTATE_C | PSTATE_V;
- break;
- }
- return flags;
-}
-
-uint64_t HELPER(vfp_cmph_a64)(uint32_t x, uint32_t y, void *fp_status)
-{
- return float_rel_to_flags(float16_compare_quiet(x, y, fp_status));
-}
-
-uint64_t HELPER(vfp_cmpeh_a64)(uint32_t x, uint32_t y, void *fp_status)
-{
- return float_rel_to_flags(float16_compare(x, y, fp_status));
-}
-
-uint64_t HELPER(vfp_cmps_a64)(float32 x, float32 y, void *fp_status)
-{
- return float_rel_to_flags(float32_compare_quiet(x, y, fp_status));
-}
-
-uint64_t HELPER(vfp_cmpes_a64)(float32 x, float32 y, void *fp_status)
-{
- return float_rel_to_flags(float32_compare(x, y, fp_status));
-}
-
-uint64_t HELPER(vfp_cmpd_a64)(float64 x, float64 y, void *fp_status)
-{
- return float_rel_to_flags(float64_compare_quiet(x, y, fp_status));
-}
-
-uint64_t HELPER(vfp_cmped_a64)(float64 x, float64 y, void *fp_status)
-{
- return float_rel_to_flags(float64_compare(x, y, fp_status));
-}
-
-float32 HELPER(vfp_mulxs)(float32 a, float32 b, void *fpstp)
-{
- float_status *fpst = fpstp;
-
- a = float32_squash_input_denormal(a, fpst);
- b = float32_squash_input_denormal(b, fpst);
-
- if ((float32_is_zero(a) && float32_is_infinity(b)) ||
- (float32_is_infinity(a) && float32_is_zero(b))) {
- /* 2.0 with the sign bit set to sign(A) XOR sign(B) */
- return make_float32((1U << 30) |
- ((float32_val(a) ^ float32_val(b)) & (1U << 31)));
- }
- return float32_mul(a, b, fpst);
-}
-
-float64 HELPER(vfp_mulxd)(float64 a, float64 b, void *fpstp)
-{
- float_status *fpst = fpstp;
-
- a = float64_squash_input_denormal(a, fpst);
- b = float64_squash_input_denormal(b, fpst);
-
- if ((float64_is_zero(a) && float64_is_infinity(b)) ||
- (float64_is_infinity(a) && float64_is_zero(b))) {
- /* 2.0 with the sign bit set to sign(A) XOR sign(B) */
- return make_float64((1ULL << 62) |
- ((float64_val(a) ^ float64_val(b)) & (1ULL << 63)));
- }
- return float64_mul(a, b, fpst);
-}
-
-/* 64bit/double versions of the neon float compare functions */
-uint64_t HELPER(neon_ceq_f64)(float64 a, float64 b, void *fpstp)
-{
- float_status *fpst = fpstp;
- return -float64_eq_quiet(a, b, fpst);
-}
-
-uint64_t HELPER(neon_cge_f64)(float64 a, float64 b, void *fpstp)
-{
- float_status *fpst = fpstp;
- return -float64_le(b, a, fpst);
-}
-
-uint64_t HELPER(neon_cgt_f64)(float64 a, float64 b, void *fpstp)
-{
- float_status *fpst = fpstp;
- return -float64_lt(b, a, fpst);
-}
-
-/* Reciprocal step and sqrt step. Note that unlike the A32/T32
- * versions, these do a fully fused multiply-add or
- * multiply-add-and-halve.
- */
-
-uint32_t HELPER(recpsf_f16)(uint32_t a, uint32_t b, void *fpstp)
-{
- float_status *fpst = fpstp;
-
- a = float16_squash_input_denormal(a, fpst);
- b = float16_squash_input_denormal(b, fpst);
-
- a = float16_chs(a);
- if ((float16_is_infinity(a) && float16_is_zero(b)) ||
- (float16_is_infinity(b) && float16_is_zero(a))) {
- return float16_two;
- }
- return float16_muladd(a, b, float16_two, 0, fpst);
-}
-
-float32 HELPER(recpsf_f32)(float32 a, float32 b, void *fpstp)
-{
- float_status *fpst = fpstp;
-
- a = float32_squash_input_denormal(a, fpst);
- b = float32_squash_input_denormal(b, fpst);
-
- a = float32_chs(a);
- if ((float32_is_infinity(a) && float32_is_zero(b)) ||
- (float32_is_infinity(b) && float32_is_zero(a))) {
- return float32_two;
- }
- return float32_muladd(a, b, float32_two, 0, fpst);
-}
-
-float64 HELPER(recpsf_f64)(float64 a, float64 b, void *fpstp)
-{
- float_status *fpst = fpstp;
-
- a = float64_squash_input_denormal(a, fpst);
- b = float64_squash_input_denormal(b, fpst);
-
- a = float64_chs(a);
- if ((float64_is_infinity(a) && float64_is_zero(b)) ||
- (float64_is_infinity(b) && float64_is_zero(a))) {
- return float64_two;
- }
- return float64_muladd(a, b, float64_two, 0, fpst);
-}
-
-uint32_t HELPER(rsqrtsf_f16)(uint32_t a, uint32_t b, void *fpstp)
-{
- float_status *fpst = fpstp;
-
- a = float16_squash_input_denormal(a, fpst);
- b = float16_squash_input_denormal(b, fpst);
-
- a = float16_chs(a);
- if ((float16_is_infinity(a) && float16_is_zero(b)) ||
- (float16_is_infinity(b) && float16_is_zero(a))) {
- return float16_one_point_five;
- }
- return float16_muladd(a, b, float16_three, float_muladd_halve_result, fpst);
-}
-
-float32 HELPER(rsqrtsf_f32)(float32 a, float32 b, void *fpstp)
-{
- float_status *fpst = fpstp;
-
- a = float32_squash_input_denormal(a, fpst);
- b = float32_squash_input_denormal(b, fpst);
-
- a = float32_chs(a);
- if ((float32_is_infinity(a) && float32_is_zero(b)) ||
- (float32_is_infinity(b) && float32_is_zero(a))) {
- return float32_one_point_five;
- }
- return float32_muladd(a, b, float32_three, float_muladd_halve_result, fpst);
-}
-
-float64 HELPER(rsqrtsf_f64)(float64 a, float64 b, void *fpstp)
-{
- float_status *fpst = fpstp;
-
- a = float64_squash_input_denormal(a, fpst);
- b = float64_squash_input_denormal(b, fpst);
-
- a = float64_chs(a);
- if ((float64_is_infinity(a) && float64_is_zero(b)) ||
- (float64_is_infinity(b) && float64_is_zero(a))) {
- return float64_one_point_five;
- }
- return float64_muladd(a, b, float64_three, float_muladd_halve_result, fpst);
-}
-
-/* Pairwise long add: add pairs of adjacent elements into
- * double-width elements in the result (eg _s8 is an 8x8->16 op)
- */
-uint64_t HELPER(neon_addlp_s8)(uint64_t a)
-{
- uint64_t nsignmask = 0x0080008000800080ULL;
- uint64_t wsignmask = 0x8000800080008000ULL;
- uint64_t elementmask = 0x00ff00ff00ff00ffULL;
- uint64_t tmp1, tmp2;
- uint64_t res, signres;
-
- /* Extract odd elements, sign extend each to a 16 bit field */
- tmp1 = a & elementmask;
- tmp1 ^= nsignmask;
- tmp1 |= wsignmask;
- tmp1 = (tmp1 - nsignmask) ^ wsignmask;
- /* Ditto for the even elements */
- tmp2 = (a >> 8) & elementmask;
- tmp2 ^= nsignmask;
- tmp2 |= wsignmask;
- tmp2 = (tmp2 - nsignmask) ^ wsignmask;
-
- /* calculate the result by summing bits 0..14, 16..30, etc,
- * and then adjusting the sign bits 15, 31, etc manually.
- * This ensures the addition can't overflow the 16 bit field.
- */
- signres = (tmp1 ^ tmp2) & wsignmask;
- res = (tmp1 & ~wsignmask) + (tmp2 & ~wsignmask);
- res ^= signres;
-
- return res;
-}
-
-uint64_t HELPER(neon_addlp_u8)(uint64_t a)
-{
- uint64_t tmp;
-
- tmp = a & 0x00ff00ff00ff00ffULL;
- tmp += (a >> 8) & 0x00ff00ff00ff00ffULL;
- return tmp;
-}
-
-uint64_t HELPER(neon_addlp_s16)(uint64_t a)
-{
- int32_t reslo, reshi;
-
- reslo = (int32_t)(int16_t)a + (int32_t)(int16_t)(a >> 16);
- reshi = (int32_t)(int16_t)(a >> 32) + (int32_t)(int16_t)(a >> 48);
-
- return (uint32_t)reslo | (((uint64_t)reshi) << 32);
-}
-
-uint64_t HELPER(neon_addlp_u16)(uint64_t a)
-{
- uint64_t tmp;
-
- tmp = a & 0x0000ffff0000ffffULL;
- tmp += (a >> 16) & 0x0000ffff0000ffffULL;
- return tmp;
-}
-
-/* Floating-point reciprocal exponent - see FPRecpX in ARM ARM */
-uint32_t HELPER(frecpx_f16)(uint32_t a, void *fpstp)
-{
- float_status *fpst = fpstp;
- uint16_t val16, sbit;
- int16_t exp;
-
- if (float16_is_any_nan(a)) {
- float16 nan = a;
- if (float16_is_signaling_nan(a, fpst)) {
- float_raise(float_flag_invalid, fpst);
- if (!fpst->default_nan_mode) {
- nan = float16_silence_nan(a, fpst);
- }
- }
- if (fpst->default_nan_mode) {
- nan = float16_default_nan(fpst);
- }
- return nan;
- }
-
- a = float16_squash_input_denormal(a, fpst);
-
- val16 = float16_val(a);
- sbit = 0x8000 & val16;
- exp = extract32(val16, 10, 5);
-
- if (exp == 0) {
- return make_float16(deposit32(sbit, 10, 5, 0x1e));
- } else {
- return make_float16(deposit32(sbit, 10, 5, ~exp));
- }
-}
-
-float32 HELPER(frecpx_f32)(float32 a, void *fpstp)
-{
- float_status *fpst = fpstp;
- uint32_t val32, sbit;
- int32_t exp;
-
- if (float32_is_any_nan(a)) {
- float32 nan = a;
- if (float32_is_signaling_nan(a, fpst)) {
- float_raise(float_flag_invalid, fpst);
- if (!fpst->default_nan_mode) {
- nan = float32_silence_nan(a, fpst);
- }
- }
- if (fpst->default_nan_mode) {
- nan = float32_default_nan(fpst);
- }
- return nan;
- }
-
- a = float32_squash_input_denormal(a, fpst);
-
- val32 = float32_val(a);
- sbit = 0x80000000ULL & val32;
- exp = extract32(val32, 23, 8);
-
- if (exp == 0) {
- return make_float32(sbit | (0xfe << 23));
- } else {
- return make_float32(sbit | (~exp & 0xff) << 23);
- }
-}
-
-float64 HELPER(frecpx_f64)(float64 a, void *fpstp)
-{
- float_status *fpst = fpstp;
- uint64_t val64, sbit;
- int64_t exp;
-
- if (float64_is_any_nan(a)) {
- float64 nan = a;
- if (float64_is_signaling_nan(a, fpst)) {
- float_raise(float_flag_invalid, fpst);
- if (!fpst->default_nan_mode) {
- nan = float64_silence_nan(a, fpst);
- }
- }
- if (fpst->default_nan_mode) {
- nan = float64_default_nan(fpst);
- }
- return nan;
- }
-
- a = float64_squash_input_denormal(a, fpst);
-
- val64 = float64_val(a);
- sbit = 0x8000000000000000ULL & val64;
- exp = extract64(float64_val(a), 52, 11);
-
- if (exp == 0) {
- return make_float64(sbit | (0x7feULL << 52));
- } else {
- return make_float64(sbit | (~exp & 0x7ffULL) << 52);
- }
-}
-
-float32 HELPER(fcvtx_f64_to_f32)(float64 a, CPUARMState *env)
-{
- /* Von Neumann rounding is implemented by using round-to-zero
- * and then setting the LSB of the result if Inexact was raised.
- */
- float32 r;
- float_status *fpst = &env->vfp.fp_status;
- float_status tstat = *fpst;
- int exflags;
-
- set_float_rounding_mode(float_round_to_zero, &tstat);
- set_float_exception_flags(0, &tstat);
- r = float64_to_float32(a, &tstat);
- exflags = get_float_exception_flags(&tstat);
- if (exflags & float_flag_inexact) {
- r = make_float32(float32_val(r) | 1);
- }
- exflags |= get_float_exception_flags(fpst);
- set_float_exception_flags(exflags, fpst);
- return r;
-}
-
-/* 64-bit versions of the CRC helpers. Note that although the operation
- * (and the prototypes of crc32c() and crc32()) means that only the bottom
- * 32 bits of the accumulator and result are used, we pass and return
- * uint64_t for convenience of the generated code. Unlike the 32-bit
- * instruction set versions, val may genuinely have 64 bits of data in it.
- * The upper bytes of val (above the number specified by 'bytes') must have
- * been zeroed out by the caller.
- */
-uint64_t HELPER(crc32_64)(uint64_t acc, uint64_t val, uint32_t bytes)
-{
- uint8_t buf[8];
-
- stq_le_p(buf, val);
-
- /* zlib crc32 converts the accumulator and output to one's complement. */
- return crc32(acc ^ 0xffffffff, buf, bytes) ^ 0xffffffff;
-}
-
-uint64_t HELPER(crc32c_64)(uint64_t acc, uint64_t val, uint32_t bytes)
-{
- uint8_t buf[8];
-
- stq_le_p(buf, val);
-
- /* Linux crc32c converts the output to one's complement. */
- return crc32c(acc, buf, bytes) ^ 0xffffffff;
-}
-
-/*
- * AdvSIMD half-precision
- */
-
-#define ADVSIMD_HELPER(name, suffix) HELPER(glue(glue(advsimd_, name), suffix))
-
-#define ADVSIMD_HALFOP(name) \
-uint32_t ADVSIMD_HELPER(name, h)(uint32_t a, uint32_t b, void *fpstp) \
-{ \
- float_status *fpst = fpstp; \
- return float16_ ## name(a, b, fpst); \
-}
-
-ADVSIMD_HALFOP(add)
-ADVSIMD_HALFOP(sub)
-ADVSIMD_HALFOP(mul)
-ADVSIMD_HALFOP(div)
-ADVSIMD_HALFOP(min)
-ADVSIMD_HALFOP(max)
-ADVSIMD_HALFOP(minnum)
-ADVSIMD_HALFOP(maxnum)
-
-#define ADVSIMD_TWOHALFOP(name) \
-uint32_t ADVSIMD_HELPER(name, 2h)(uint32_t two_a, uint32_t two_b, void *fpstp) \
-{ \
- float16 a1, a2, b1, b2; \
- uint32_t r1, r2; \
- float_status *fpst = fpstp; \
- a1 = extract32(two_a, 0, 16); \
- a2 = extract32(two_a, 16, 16); \
- b1 = extract32(two_b, 0, 16); \
- b2 = extract32(two_b, 16, 16); \
- r1 = float16_ ## name(a1, b1, fpst); \
- r2 = float16_ ## name(a2, b2, fpst); \
- return deposit32(r1, 16, 16, r2); \
-}
-
-ADVSIMD_TWOHALFOP(add)
-ADVSIMD_TWOHALFOP(sub)
-ADVSIMD_TWOHALFOP(mul)
-ADVSIMD_TWOHALFOP(div)
-ADVSIMD_TWOHALFOP(min)
-ADVSIMD_TWOHALFOP(max)
-ADVSIMD_TWOHALFOP(minnum)
-ADVSIMD_TWOHALFOP(maxnum)
-
-/* Data processing - scalar floating-point and advanced SIMD */
-static float16 float16_mulx(float16 a, float16 b, void *fpstp)
-{
- float_status *fpst = fpstp;
-
- a = float16_squash_input_denormal(a, fpst);
- b = float16_squash_input_denormal(b, fpst);
-
- if ((float16_is_zero(a) && float16_is_infinity(b)) ||
- (float16_is_infinity(a) && float16_is_zero(b))) {
- /* 2.0 with the sign bit set to sign(A) XOR sign(B) */
- return make_float16((1U << 14) |
- ((float16_val(a) ^ float16_val(b)) & (1U << 15)));
- }
- return float16_mul(a, b, fpst);
-}
-
-ADVSIMD_HALFOP(mulx)
-ADVSIMD_TWOHALFOP(mulx)
-
-/* fused multiply-accumulate */
-uint32_t HELPER(advsimd_muladdh)(uint32_t a, uint32_t b, uint32_t c,
- void *fpstp)
-{
- float_status *fpst = fpstp;
- return float16_muladd(a, b, c, 0, fpst);
-}
-
-uint32_t HELPER(advsimd_muladd2h)(uint32_t two_a, uint32_t two_b,
- uint32_t two_c, void *fpstp)
-{
- float_status *fpst = fpstp;
- float16 a1, a2, b1, b2, c1, c2;
- uint32_t r1, r2;
- a1 = extract32(two_a, 0, 16);
- a2 = extract32(two_a, 16, 16);
- b1 = extract32(two_b, 0, 16);
- b2 = extract32(two_b, 16, 16);
- c1 = extract32(two_c, 0, 16);
- c2 = extract32(two_c, 16, 16);
- r1 = float16_muladd(a1, b1, c1, 0, fpst);
- r2 = float16_muladd(a2, b2, c2, 0, fpst);
- return deposit32(r1, 16, 16, r2);
-}
-
-/*
- * Floating point comparisons produce an integer result. Softfloat
- * routines return float_relation types which we convert to the 0/-1
- * Neon requires.
- */
-
-#define ADVSIMD_CMPRES(test) (test) ? 0xffff : 0
-
-uint32_t HELPER(advsimd_ceq_f16)(uint32_t a, uint32_t b, void *fpstp)
-{
- float_status *fpst = fpstp;
- int compare = float16_compare_quiet(a, b, fpst);
- return ADVSIMD_CMPRES(compare == float_relation_equal);
-}
-
-uint32_t HELPER(advsimd_cge_f16)(uint32_t a, uint32_t b, void *fpstp)
-{
- float_status *fpst = fpstp;
- int compare = float16_compare(a, b, fpst);
- return ADVSIMD_CMPRES(compare == float_relation_greater ||
- compare == float_relation_equal);
-}
-
-uint32_t HELPER(advsimd_cgt_f16)(uint32_t a, uint32_t b, void *fpstp)
-{
- float_status *fpst = fpstp;
- int compare = float16_compare(a, b, fpst);
- return ADVSIMD_CMPRES(compare == float_relation_greater);
-}
-
-uint32_t HELPER(advsimd_acge_f16)(uint32_t a, uint32_t b, void *fpstp)
-{
- float_status *fpst = fpstp;
- float16 f0 = float16_abs(a);
- float16 f1 = float16_abs(b);
- int compare = float16_compare(f0, f1, fpst);
- return ADVSIMD_CMPRES(compare == float_relation_greater ||
- compare == float_relation_equal);
-}
-
-uint32_t HELPER(advsimd_acgt_f16)(uint32_t a, uint32_t b, void *fpstp)
-{
- float_status *fpst = fpstp;
- float16 f0 = float16_abs(a);
- float16 f1 = float16_abs(b);
- int compare = float16_compare(f0, f1, fpst);
- return ADVSIMD_CMPRES(compare == float_relation_greater);
-}
-
-/* round to integral */
-uint32_t HELPER(advsimd_rinth_exact)(uint32_t x, void *fp_status)
-{
- return float16_round_to_int(x, fp_status);
-}
-
-uint32_t HELPER(advsimd_rinth)(uint32_t x, void *fp_status)
-{
- int old_flags = get_float_exception_flags(fp_status), new_flags;
- float16 ret;
-
- ret = float16_round_to_int(x, fp_status);
-
- /* Suppress any inexact exceptions the conversion produced */
- if (!(old_flags & float_flag_inexact)) {
- new_flags = get_float_exception_flags(fp_status);
- set_float_exception_flags(new_flags & ~float_flag_inexact, fp_status);
- }
-
- return ret;
-}
-
-/*
- * Half-precision floating point conversion functions
- *
- * There are a multitude of conversion functions with various
- * different rounding modes. This is dealt with by the calling code
- * setting the mode appropriately before calling the helper.
- */
-
-uint32_t HELPER(advsimd_f16tosinth)(uint32_t a, void *fpstp)
-{
- float_status *fpst = fpstp;
-
- /* Invalid if we are passed a NaN */
- if (float16_is_any_nan(a)) {
- float_raise(float_flag_invalid, fpst);
- return 0;
- }
- return float16_to_int16(a, fpst);
-}
-
-uint32_t HELPER(advsimd_f16touinth)(uint32_t a, void *fpstp)
-{
- float_status *fpst = fpstp;
-
- /* Invalid if we are passed a NaN */
- if (float16_is_any_nan(a)) {
- float_raise(float_flag_invalid, fpst);
- return 0;
- }
- return float16_to_uint16(a, fpst);
-}
-
-static int el_from_spsr(uint32_t spsr)
-{
- /* Return the exception level that this SPSR is requesting a return to,
- * or -1 if it is invalid (an illegal return)
- */
- if (spsr & PSTATE_nRW) {
- switch (spsr & CPSR_M) {
- case ARM_CPU_MODE_USR:
- return 0;
- case ARM_CPU_MODE_HYP:
- return 2;
- case ARM_CPU_MODE_FIQ:
- case ARM_CPU_MODE_IRQ:
- case ARM_CPU_MODE_SVC:
- case ARM_CPU_MODE_ABT:
- case ARM_CPU_MODE_UND:
- case ARM_CPU_MODE_SYS:
- return 1;
- case ARM_CPU_MODE_MON:
- /* Returning to Mon from AArch64 is never possible,
- * so this is an illegal return.
- */
- default:
- return -1;
- }
- } else {
- if (extract32(spsr, 1, 1)) {
- /* Return with reserved M[1] bit set */
- return -1;
- }
- if (extract32(spsr, 0, 4) == 1) {
- /* return to EL0 with M[0] bit set */
- return -1;
- }
- return extract32(spsr, 2, 2);
- }
-}
-
-static void cpsr_write_from_spsr_elx(CPUARMState *env,
- uint32_t val)
-{
- uint32_t mask;
-
- /* Save SPSR_ELx.SS into PSTATE. */
- env->pstate = (env->pstate & ~PSTATE_SS) | (val & PSTATE_SS);
- val &= ~PSTATE_SS;
-
- /* Move DIT to the correct location for CPSR */
- if (val & PSTATE_DIT) {
- val &= ~PSTATE_DIT;
- val |= CPSR_DIT;
- }
-
- mask = aarch32_cpsr_valid_mask(env->features,
- &env_archcpu(env)->isar);
- cpsr_write(env, val, mask, CPSRWriteRaw);
-}
-
-void HELPER(exception_return)(CPUARMState *env, uint64_t new_pc)
-{
- int cur_el = arm_current_el(env);
- unsigned int spsr_idx = aarch64_banked_spsr_index(cur_el);
- uint32_t spsr = env->banked_spsr[spsr_idx];
- int new_el;
- bool return_to_aa64 = (spsr & PSTATE_nRW) == 0;
-
- aarch64_save_sp(env, cur_el);
-
- arm_clear_exclusive(env);
-
- /* We must squash the PSTATE.SS bit to zero unless both of the
- * following hold:
- * 1. debug exceptions are currently disabled
- * 2. singlestep will be active in the EL we return to
- * We check 1 here and 2 after we've done the pstate/cpsr write() to
- * transition to the EL we're going to.
- */
- if (arm_generate_debug_exceptions(env)) {
- spsr &= ~PSTATE_SS;
- }
-
- new_el = el_from_spsr(spsr);
- if (new_el == -1) {
- goto illegal_return;
- }
- if (new_el > cur_el || (new_el == 2 && !arm_is_el2_enabled(env))) {
- /* Disallow return to an EL which is unimplemented or higher
- * than the current one.
- */
- goto illegal_return;
- }
-
- if (new_el != 0 && arm_el_is_aa64(env, new_el) != return_to_aa64) {
- /* Return to an EL which is configured for a different register width */
- goto illegal_return;
- }
-
- if (new_el == 1 && (arm_hcr_el2_eff(env) & HCR_TGE)) {
- goto illegal_return;
- }
-
- qemu_mutex_lock_iothread();
- arm_call_pre_el_change_hook(env_archcpu(env));
- qemu_mutex_unlock_iothread();
-
- if (!return_to_aa64) {
- env->aarch64 = false;
- /* We do a raw CPSR write because aarch64_sync_64_to_32()
- * will sort the register banks out for us, and we've already
- * caught all the bad-mode cases in el_from_spsr().
- */
- cpsr_write_from_spsr_elx(env, spsr);
- if (!arm_singlestep_active(env)) {
- env->pstate &= ~PSTATE_SS;
- }
- aarch64_sync_64_to_32(env);
-
- if (spsr & CPSR_T) {
- env->regs[15] = new_pc & ~0x1;
- } else {
- env->regs[15] = new_pc & ~0x3;
- }
- helper_rebuild_hflags_a32(env, new_el);
- qemu_log_mask(CPU_LOG_INT, "Exception return from AArch64 EL%d to "
- "AArch32 EL%d PC 0x%" PRIx32 "\n",
- cur_el, new_el, env->regs[15]);
- } else {
- int tbii;
-
- env->aarch64 = true;
- spsr &= aarch64_pstate_valid_mask(&env_archcpu(env)->isar);
- pstate_write(env, spsr);
- if (!arm_singlestep_active(env)) {
- env->pstate &= ~PSTATE_SS;
- }
- aarch64_restore_sp(env, new_el);
- helper_rebuild_hflags_a64(env, new_el);
-
- /*
- * Apply TBI to the exception return address. We had to delay this
- * until after we selected the new EL, so that we could select the
- * correct TBI+TBID bits. This is made easier by waiting until after
- * the hflags rebuild, since we can pull the composite TBII field
- * from there.
- */
- tbii = EX_TBFLAG_A64(env->hflags, TBII);
- if ((tbii >> extract64(new_pc, 55, 1)) & 1) {
- /* TBI is enabled. */
- int core_mmu_idx = cpu_mmu_index(env, false);
- if (regime_has_2_ranges(core_to_aa64_mmu_idx(core_mmu_idx))) {
- new_pc = sextract64(new_pc, 0, 56);
- } else {
- new_pc = extract64(new_pc, 0, 56);
- }
- }
- env->pc = new_pc;
-
- qemu_log_mask(CPU_LOG_INT, "Exception return from AArch64 EL%d to "
- "AArch64 EL%d PC 0x%" PRIx64 "\n",
- cur_el, new_el, env->pc);
- }
-
- /*
- * Note that cur_el can never be 0. If new_el is 0, then
- * el0_a64 is return_to_aa64, else el0_a64 is ignored.
- */
- aarch64_sve_change_el(env, cur_el, new_el, return_to_aa64);
-
- qemu_mutex_lock_iothread();
- arm_call_el_change_hook(env_archcpu(env));
- qemu_mutex_unlock_iothread();
-
- return;
-
-illegal_return:
- /* Illegal return events of various kinds have architecturally
- * mandated behaviour:
- * restore NZCV and DAIF from SPSR_ELx
- * set PSTATE.IL
- * restore PC from ELR_ELx
- * no change to exception level, execution state or stack pointer
- */
- env->pstate |= PSTATE_IL;
- env->pc = new_pc;
- spsr &= PSTATE_NZCV | PSTATE_DAIF;
- spsr |= pstate_read(env) & ~(PSTATE_NZCV | PSTATE_DAIF);
- pstate_write(env, spsr);
- if (!arm_singlestep_active(env)) {
- env->pstate &= ~PSTATE_SS;
- }
- helper_rebuild_hflags_a64(env, cur_el);
- qemu_log_mask(LOG_GUEST_ERROR, "Illegal exception return at EL%d: "
- "resuming execution at 0x%" PRIx64 "\n", cur_el, env->pc);
-}
-
-/*
- * Square Root and Reciprocal square root
- */
-
-uint32_t HELPER(sqrt_f16)(uint32_t a, void *fpstp)
-{
- float_status *s = fpstp;
-
- return float16_sqrt(a, s);
-}
-
-void HELPER(dc_zva)(CPUARMState *env, uint64_t vaddr_in)
-{
- /*
- * Implement DC ZVA, which zeroes a fixed-length block of memory.
- * Note that we do not implement the (architecturally mandated)
- * alignment fault for attempts to use this on Device memory
- * (which matches the usual QEMU behaviour of not implementing either
- * alignment faults or any memory attribute handling).
- */
- int blocklen = 4 << env_archcpu(env)->dcz_blocksize;
- uint64_t vaddr = vaddr_in & ~(blocklen - 1);
- int mmu_idx = cpu_mmu_index(env, false);
- void *mem;
-
- /*
- * Trapless lookup. In addition to actual invalid page, may
- * return NULL for I/O, watchpoints, clean pages, etc.
- */
- mem = tlb_vaddr_to_host(env, vaddr, MMU_DATA_STORE, mmu_idx);
-
-#ifndef CONFIG_USER_ONLY
- if (unlikely(!mem)) {
- uintptr_t ra = GETPC();
-
- /*
- * Trap if accessing an invalid page. DC_ZVA requires that we supply
- * the original pointer for an invalid page. But watchpoints require
- * that we probe the actual space. So do both.
- */
- (void) probe_write(env, vaddr_in, 1, mmu_idx, ra);
- mem = probe_write(env, vaddr, blocklen, mmu_idx, ra);
-
- if (unlikely(!mem)) {
- /*
- * The only remaining reason for mem == NULL is I/O.
- * Just do a series of byte writes as the architecture demands.
- */
- for (int i = 0; i < blocklen; i++) {
- cpu_stb_mmuidx_ra(env, vaddr + i, 0, mmu_idx, ra);
- }
- return;
- }
- }
-#endif
-
- memset(mem, 0, blocklen);
-}
+++ /dev/null
-/*
- * iwMMXt micro operations for XScale.
- *
- * Copyright (c) 2007 OpenedHand, Ltd.
- * Written by Andrzej Zaborowski <andrew@openedhand.com>
- * Copyright (c) 2008 CodeSourcery
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "qemu/osdep.h"
-
-#include "cpu.h"
-#include "exec/helper-proto.h"
-
-/* iwMMXt macros extracted from GNU gdb. */
-
-/* Set the SIMD wCASF flags for 8, 16, 32 or 64-bit operations. */
-#define SIMD8_SET(v, n, b) ((v != 0) << ((((b) + 1) * 4) + (n)))
-#define SIMD16_SET(v, n, h) ((v != 0) << ((((h) + 1) * 8) + (n)))
-#define SIMD32_SET(v, n, w) ((v != 0) << ((((w) + 1) * 16) + (n)))
-#define SIMD64_SET(v, n) ((v != 0) << (32 + (n)))
-/* Flags to pass as "n" above. */
-#define SIMD_NBIT -1
-#define SIMD_ZBIT -2
-#define SIMD_CBIT -3
-#define SIMD_VBIT -4
-/* Various status bit macros. */
-#define NBIT8(x) ((x) & 0x80)
-#define NBIT16(x) ((x) & 0x8000)
-#define NBIT32(x) ((x) & 0x80000000)
-#define NBIT64(x) ((x) & 0x8000000000000000ULL)
-#define ZBIT8(x) (((x) & 0xff) == 0)
-#define ZBIT16(x) (((x) & 0xffff) == 0)
-#define ZBIT32(x) (((x) & 0xffffffff) == 0)
-#define ZBIT64(x) ((x) == 0)
-/* Sign extension macros. */
-#define EXTEND8H(a) ((uint16_t) (int8_t) (a))
-#define EXTEND8(a) ((uint32_t) (int8_t) (a))
-#define EXTEND16(a) ((uint32_t) (int16_t) (a))
-#define EXTEND16S(a) ((int32_t) (int16_t) (a))
-#define EXTEND32(a) ((uint64_t) (int32_t) (a))
-
-uint64_t HELPER(iwmmxt_maddsq)(uint64_t a, uint64_t b)
-{
- a = ((
- EXTEND16S((a >> 0) & 0xffff) * EXTEND16S((b >> 0) & 0xffff) +
- EXTEND16S((a >> 16) & 0xffff) * EXTEND16S((b >> 16) & 0xffff)
- ) & 0xffffffff) | ((uint64_t) (
- EXTEND16S((a >> 32) & 0xffff) * EXTEND16S((b >> 32) & 0xffff) +
- EXTEND16S((a >> 48) & 0xffff) * EXTEND16S((b >> 48) & 0xffff)
- ) << 32);
- return a;
-}
-
-uint64_t HELPER(iwmmxt_madduq)(uint64_t a, uint64_t b)
-{
- a = ((
- ((a >> 0) & 0xffff) * ((b >> 0) & 0xffff) +
- ((a >> 16) & 0xffff) * ((b >> 16) & 0xffff)
- ) & 0xffffffff) | ((
- ((a >> 32) & 0xffff) * ((b >> 32) & 0xffff) +
- ((a >> 48) & 0xffff) * ((b >> 48) & 0xffff)
- ) << 32);
- return a;
-}
-
-uint64_t HELPER(iwmmxt_sadb)(uint64_t a, uint64_t b)
-{
-#define abs(x) (((x) >= 0) ? (x) : -(x))
-#define SADB(SHR) abs((int) ((a >> SHR) & 0xff) - (int) ((b >> SHR) & 0xff))
- return
- SADB(0) + SADB(8) + SADB(16) + SADB(24) +
- SADB(32) + SADB(40) + SADB(48) + SADB(56);
-#undef SADB
-}
-
-uint64_t HELPER(iwmmxt_sadw)(uint64_t a, uint64_t b)
-{
-#define SADW(SHR) \
- abs((int) ((a >> SHR) & 0xffff) - (int) ((b >> SHR) & 0xffff))
- return SADW(0) + SADW(16) + SADW(32) + SADW(48);
-#undef SADW
-}
-
-uint64_t HELPER(iwmmxt_mulslw)(uint64_t a, uint64_t b)
-{
-#define MULS(SHR) ((uint64_t) ((( \
- EXTEND16S((a >> SHR) & 0xffff) * EXTEND16S((b >> SHR) & 0xffff) \
- ) >> 0) & 0xffff) << SHR)
- return MULS(0) | MULS(16) | MULS(32) | MULS(48);
-#undef MULS
-}
-
-uint64_t HELPER(iwmmxt_mulshw)(uint64_t a, uint64_t b)
-{
-#define MULS(SHR) ((uint64_t) ((( \
- EXTEND16S((a >> SHR) & 0xffff) * EXTEND16S((b >> SHR) & 0xffff) \
- ) >> 16) & 0xffff) << SHR)
- return MULS(0) | MULS(16) | MULS(32) | MULS(48);
-#undef MULS
-}
-
-uint64_t HELPER(iwmmxt_mululw)(uint64_t a, uint64_t b)
-{
-#define MULU(SHR) ((uint64_t) ((( \
- ((a >> SHR) & 0xffff) * ((b >> SHR) & 0xffff) \
- ) >> 0) & 0xffff) << SHR)
- return MULU(0) | MULU(16) | MULU(32) | MULU(48);
-#undef MULU
-}
-
-uint64_t HELPER(iwmmxt_muluhw)(uint64_t a, uint64_t b)
-{
-#define MULU(SHR) ((uint64_t) ((( \
- ((a >> SHR) & 0xffff) * ((b >> SHR) & 0xffff) \
- ) >> 16) & 0xffff) << SHR)
- return MULU(0) | MULU(16) | MULU(32) | MULU(48);
-#undef MULU
-}
-
-uint64_t HELPER(iwmmxt_macsw)(uint64_t a, uint64_t b)
-{
-#define MACS(SHR) ( \
- EXTEND16((a >> SHR) & 0xffff) * EXTEND16S((b >> SHR) & 0xffff))
- return (int64_t) (MACS(0) + MACS(16) + MACS(32) + MACS(48));
-#undef MACS
-}
-
-uint64_t HELPER(iwmmxt_macuw)(uint64_t a, uint64_t b)
-{
-#define MACU(SHR) ( \
- (uint32_t) ((a >> SHR) & 0xffff) * \
- (uint32_t) ((b >> SHR) & 0xffff))
- return MACU(0) + MACU(16) + MACU(32) + MACU(48);
-#undef MACU
-}
-
-#define NZBIT8(x, i) \
- SIMD8_SET(NBIT8((x) & 0xff), SIMD_NBIT, i) | \
- SIMD8_SET(ZBIT8((x) & 0xff), SIMD_ZBIT, i)
-#define NZBIT16(x, i) \
- SIMD16_SET(NBIT16((x) & 0xffff), SIMD_NBIT, i) | \
- SIMD16_SET(ZBIT16((x) & 0xffff), SIMD_ZBIT, i)
-#define NZBIT32(x, i) \
- SIMD32_SET(NBIT32((x) & 0xffffffff), SIMD_NBIT, i) | \
- SIMD32_SET(ZBIT32((x) & 0xffffffff), SIMD_ZBIT, i)
-#define NZBIT64(x) \
- SIMD64_SET(NBIT64(x), SIMD_NBIT) | \
- SIMD64_SET(ZBIT64(x), SIMD_ZBIT)
-#define IWMMXT_OP_UNPACK(S, SH0, SH1, SH2, SH3) \
-uint64_t HELPER(glue(iwmmxt_unpack, glue(S, b)))(CPUARMState *env, \
- uint64_t a, uint64_t b) \
-{ \
- a = \
- (((a >> SH0) & 0xff) << 0) | (((b >> SH0) & 0xff) << 8) | \
- (((a >> SH1) & 0xff) << 16) | (((b >> SH1) & 0xff) << 24) | \
- (((a >> SH2) & 0xff) << 32) | (((b >> SH2) & 0xff) << 40) | \
- (((a >> SH3) & 0xff) << 48) | (((b >> SH3) & 0xff) << 56); \
- env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \
- NZBIT8(a >> 0, 0) | NZBIT8(a >> 8, 1) | \
- NZBIT8(a >> 16, 2) | NZBIT8(a >> 24, 3) | \
- NZBIT8(a >> 32, 4) | NZBIT8(a >> 40, 5) | \
- NZBIT8(a >> 48, 6) | NZBIT8(a >> 56, 7); \
- return a; \
-} \
-uint64_t HELPER(glue(iwmmxt_unpack, glue(S, w)))(CPUARMState *env, \
- uint64_t a, uint64_t b) \
-{ \
- a = \
- (((a >> SH0) & 0xffff) << 0) | \
- (((b >> SH0) & 0xffff) << 16) | \
- (((a >> SH2) & 0xffff) << 32) | \
- (((b >> SH2) & 0xffff) << 48); \
- env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \
- NZBIT16(a >> 0, 0) | NZBIT16(a >> 16, 1) | \
- NZBIT16(a >> 32, 2) | NZBIT16(a >> 48, 3); \
- return a; \
-} \
-uint64_t HELPER(glue(iwmmxt_unpack, glue(S, l)))(CPUARMState *env, \
- uint64_t a, uint64_t b) \
-{ \
- a = \
- (((a >> SH0) & 0xffffffff) << 0) | \
- (((b >> SH0) & 0xffffffff) << 32); \
- env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \
- NZBIT32(a >> 0, 0) | NZBIT32(a >> 32, 1); \
- return a; \
-} \
-uint64_t HELPER(glue(iwmmxt_unpack, glue(S, ub)))(CPUARMState *env, \
- uint64_t x) \
-{ \
- x = \
- (((x >> SH0) & 0xff) << 0) | \
- (((x >> SH1) & 0xff) << 16) | \
- (((x >> SH2) & 0xff) << 32) | \
- (((x >> SH3) & 0xff) << 48); \
- env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \
- NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) | \
- NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3); \
- return x; \
-} \
-uint64_t HELPER(glue(iwmmxt_unpack, glue(S, uw)))(CPUARMState *env, \
- uint64_t x) \
-{ \
- x = \
- (((x >> SH0) & 0xffff) << 0) | \
- (((x >> SH2) & 0xffff) << 32); \
- env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \
- NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1); \
- return x; \
-} \
-uint64_t HELPER(glue(iwmmxt_unpack, glue(S, ul)))(CPUARMState *env, \
- uint64_t x) \
-{ \
- x = (((x >> SH0) & 0xffffffff) << 0); \
- env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x >> 0); \
- return x; \
-} \
-uint64_t HELPER(glue(iwmmxt_unpack, glue(S, sb)))(CPUARMState *env, \
- uint64_t x) \
-{ \
- x = \
- ((uint64_t) EXTEND8H((x >> SH0) & 0xff) << 0) | \
- ((uint64_t) EXTEND8H((x >> SH1) & 0xff) << 16) | \
- ((uint64_t) EXTEND8H((x >> SH2) & 0xff) << 32) | \
- ((uint64_t) EXTEND8H((x >> SH3) & 0xff) << 48); \
- env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \
- NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) | \
- NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3); \
- return x; \
-} \
-uint64_t HELPER(glue(iwmmxt_unpack, glue(S, sw)))(CPUARMState *env, \
- uint64_t x) \
-{ \
- x = \
- ((uint64_t) EXTEND16((x >> SH0) & 0xffff) << 0) | \
- ((uint64_t) EXTEND16((x >> SH2) & 0xffff) << 32); \
- env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \
- NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1); \
- return x; \
-} \
-uint64_t HELPER(glue(iwmmxt_unpack, glue(S, sl)))(CPUARMState *env, \
- uint64_t x) \
-{ \
- x = EXTEND32((x >> SH0) & 0xffffffff); \
- env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x >> 0); \
- return x; \
-}
-IWMMXT_OP_UNPACK(l, 0, 8, 16, 24)
-IWMMXT_OP_UNPACK(h, 32, 40, 48, 56)
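/*
 * Editorial note (not part of the original file): the two instantiations
 * above generate iwmmxt_unpack{l,h}{b,w,l}, which interleave the low or
 * high half of the two source operands, plus iwmmxt_unpack{l,h}{ub,uw,ul}
 * and iwmmxt_unpack{l,h}{sb,sw,sl}, which zero- or sign-extend the low or
 * high half of a single operand; SH0..SH3 select the lanes of that half.
 */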
-
-#define IWMMXT_OP_CMP(SUFF, Tb, Tw, Tl, O) \
-uint64_t HELPER(glue(iwmmxt_, glue(SUFF, b)))(CPUARMState *env, \
- uint64_t a, uint64_t b) \
-{ \
- a = \
- CMP(0, Tb, O, 0xff) | CMP(8, Tb, O, 0xff) | \
- CMP(16, Tb, O, 0xff) | CMP(24, Tb, O, 0xff) | \
- CMP(32, Tb, O, 0xff) | CMP(40, Tb, O, 0xff) | \
- CMP(48, Tb, O, 0xff) | CMP(56, Tb, O, 0xff); \
- env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \
- NZBIT8(a >> 0, 0) | NZBIT8(a >> 8, 1) | \
- NZBIT8(a >> 16, 2) | NZBIT8(a >> 24, 3) | \
- NZBIT8(a >> 32, 4) | NZBIT8(a >> 40, 5) | \
- NZBIT8(a >> 48, 6) | NZBIT8(a >> 56, 7); \
- return a; \
-} \
-uint64_t HELPER(glue(iwmmxt_, glue(SUFF, w)))(CPUARMState *env, \
- uint64_t a, uint64_t b) \
-{ \
- a = CMP(0, Tw, O, 0xffff) | CMP(16, Tw, O, 0xffff) | \
- CMP(32, Tw, O, 0xffff) | CMP(48, Tw, O, 0xffff); \
- env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \
- NZBIT16(a >> 0, 0) | NZBIT16(a >> 16, 1) | \
- NZBIT16(a >> 32, 2) | NZBIT16(a >> 48, 3); \
- return a; \
-} \
-uint64_t HELPER(glue(iwmmxt_, glue(SUFF, l)))(CPUARMState *env, \
- uint64_t a, uint64_t b) \
-{ \
- a = CMP(0, Tl, O, 0xffffffff) | \
- CMP(32, Tl, O, 0xffffffff); \
- env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \
- NZBIT32(a >> 0, 0) | NZBIT32(a >> 32, 1); \
- return a; \
-}
-#define CMP(SHR, TYPE, OPER, MASK) ((((TYPE) ((a >> SHR) & MASK) OPER \
- (TYPE) ((b >> SHR) & MASK)) ? (uint64_t) MASK : 0) << SHR)
-IWMMXT_OP_CMP(cmpeq, uint8_t, uint16_t, uint32_t, ==)
-IWMMXT_OP_CMP(cmpgts, int8_t, int16_t, int32_t, >)
-IWMMXT_OP_CMP(cmpgtu, uint8_t, uint16_t, uint32_t, >)
-#undef CMP
-#define CMP(SHR, TYPE, OPER, MASK) ((((TYPE) ((a >> SHR) & MASK) OPER \
- (TYPE) ((b >> SHR) & MASK)) ? a : b) & ((uint64_t) MASK << SHR))
-IWMMXT_OP_CMP(mins, int8_t, int16_t, int32_t, <)
-IWMMXT_OP_CMP(minu, uint8_t, uint16_t, uint32_t, <)
-IWMMXT_OP_CMP(maxs, int8_t, int16_t, int32_t, >)
-IWMMXT_OP_CMP(maxu, uint8_t, uint16_t, uint32_t, >)
-#undef CMP
-#define CMP(SHR, TYPE, OPER, MASK) ((uint64_t) (((TYPE) ((a >> SHR) & MASK) \
- OPER (TYPE) ((b >> SHR) & MASK)) & MASK) << SHR)
-IWMMXT_OP_CMP(subn, uint8_t, uint16_t, uint32_t, -)
-IWMMXT_OP_CMP(addn, uint8_t, uint16_t, uint32_t, +)
-#undef CMP
-/* TODO Signed- and Unsigned-Saturation */
-#define CMP(SHR, TYPE, OPER, MASK) ((uint64_t) (((TYPE) ((a >> SHR) & MASK) \
- OPER (TYPE) ((b >> SHR) & MASK)) & MASK) << SHR)
-IWMMXT_OP_CMP(subu, uint8_t, uint16_t, uint32_t, -)
-IWMMXT_OP_CMP(addu, uint8_t, uint16_t, uint32_t, +)
-IWMMXT_OP_CMP(subs, int8_t, int16_t, int32_t, -)
-IWMMXT_OP_CMP(adds, int8_t, int16_t, int32_t, +)
-#undef CMP
-#undef IWMMXT_OP_CMP
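/*
 * Editorial illustration (an addition, not original code): with the first
 * CMP definition above, CMP(0, uint8_t, ==, 0xff) expands to
 *
 *     (((uint8_t)((a >> 0) & 0xff) == (uint8_t)((b >> 0) & 0xff))
 *         ? (uint64_t)0xff : 0) << 0
 *
 * so each lane becomes all-ones when the comparison holds and zero
 * otherwise.  The min/max flavour instead keeps the winning operand's
 * lane, and the add/sub flavour keeps the (currently non-saturating)
 * arithmetic result masked back to the lane width.
 */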
-
-#define AVGB(SHR) ((( \
- ((a >> SHR) & 0xff) + ((b >> SHR) & 0xff) + round) >> 1) << SHR)
-#define IWMMXT_OP_AVGB(r) \
-uint64_t HELPER(iwmmxt_avgb##r)(CPUARMState *env, uint64_t a, uint64_t b) \
-{ \
- const int round = r; \
- a = AVGB(0) | AVGB(8) | AVGB(16) | AVGB(24) | \
- AVGB(32) | AVGB(40) | AVGB(48) | AVGB(56); \
- env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \
- SIMD8_SET(ZBIT8((a >> 0) & 0xff), SIMD_ZBIT, 0) | \
- SIMD8_SET(ZBIT8((a >> 8) & 0xff), SIMD_ZBIT, 1) | \
- SIMD8_SET(ZBIT8((a >> 16) & 0xff), SIMD_ZBIT, 2) | \
- SIMD8_SET(ZBIT8((a >> 24) & 0xff), SIMD_ZBIT, 3) | \
- SIMD8_SET(ZBIT8((a >> 32) & 0xff), SIMD_ZBIT, 4) | \
- SIMD8_SET(ZBIT8((a >> 40) & 0xff), SIMD_ZBIT, 5) | \
- SIMD8_SET(ZBIT8((a >> 48) & 0xff), SIMD_ZBIT, 6) | \
- SIMD8_SET(ZBIT8((a >> 56) & 0xff), SIMD_ZBIT, 7); \
- return a; \
-}
-IWMMXT_OP_AVGB(0)
-IWMMXT_OP_AVGB(1)
-#undef IWMMXT_OP_AVGB
-#undef AVGB
-
-#define AVGW(SHR) ((( \
- ((a >> SHR) & 0xffff) + ((b >> SHR) & 0xffff) + round) >> 1) << SHR)
-#define IWMMXT_OP_AVGW(r) \
-uint64_t HELPER(iwmmxt_avgw##r)(CPUARMState *env, uint64_t a, uint64_t b) \
-{ \
- const int round = r; \
- a = AVGW(0) | AVGW(16) | AVGW(32) | AVGW(48); \
- env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \
- SIMD16_SET(ZBIT16((a >> 0) & 0xffff), SIMD_ZBIT, 0) | \
- SIMD16_SET(ZBIT16((a >> 16) & 0xffff), SIMD_ZBIT, 1) | \
- SIMD16_SET(ZBIT16((a >> 32) & 0xffff), SIMD_ZBIT, 2) | \
- SIMD16_SET(ZBIT16((a >> 48) & 0xffff), SIMD_ZBIT, 3); \
- return a; \
-}
-IWMMXT_OP_AVGW(0)
-IWMMXT_OP_AVGW(1)
-#undef IWMMXT_OP_AVGW
-#undef AVGW
-
-uint64_t HELPER(iwmmxt_align)(uint64_t a, uint64_t b, uint32_t n)
-{
- a >>= n << 3;
- a |= b << (64 - (n << 3));
- return a;
-}
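/*
 * Editorial sketch (an addition, not original code): the same byte-wise
 * alignment written with an explicit guard.  Shifting a uint64_t left by
 * 64 is undefined behaviour in C, so the n == 0 case is handled
 * separately; n is assumed to be in the range 0..7 as for the WALIGN
 * operations.
 */
#include <stdint.h>

static uint64_t iwmmxt_align_sketch(uint64_t a, uint64_t b, unsigned n)
{
    unsigned shift = n * 8;

    if (shift == 0) {
        return a;                       /* result is the low half only */
    }
    return (a >> shift) | (b << (64 - shift));
}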
-
-uint64_t HELPER(iwmmxt_insr)(uint64_t x, uint32_t a, uint32_t b, uint32_t n)
-{
- x &= ~((uint64_t) b << n);
- x |= (uint64_t) (a & b) << n;
- return x;
-}
-
-uint32_t HELPER(iwmmxt_setpsr_nz)(uint64_t x)
-{
- return SIMD64_SET((x == 0), SIMD_ZBIT) |
- SIMD64_SET((x & (1ULL << 63)), SIMD_NBIT);
-}
-
-uint64_t HELPER(iwmmxt_bcstb)(uint32_t arg)
-{
- arg &= 0xff;
- return
- ((uint64_t) arg << 0 ) | ((uint64_t) arg << 8 ) |
- ((uint64_t) arg << 16) | ((uint64_t) arg << 24) |
- ((uint64_t) arg << 32) | ((uint64_t) arg << 40) |
- ((uint64_t) arg << 48) | ((uint64_t) arg << 56);
-}
-
-uint64_t HELPER(iwmmxt_bcstw)(uint32_t arg)
-{
- arg &= 0xffff;
- return
- ((uint64_t) arg << 0 ) | ((uint64_t) arg << 16) |
- ((uint64_t) arg << 32) | ((uint64_t) arg << 48);
-}
-
-uint64_t HELPER(iwmmxt_bcstl)(uint32_t arg)
-{
- return arg | ((uint64_t) arg << 32);
-}
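/*
 * Editorial sketch (an addition, not original code): the three broadcast
 * helpers above can equivalently be written as a multiplication by a
 * replicating constant.
 */
#include <stdint.h>

static uint64_t bcstb_sketch(uint32_t arg)
{
    return (arg & 0xff) * 0x0101010101010101ULL;    /* byte in all 8 lanes */
}

static uint64_t bcstw_sketch(uint32_t arg)
{
    return (arg & 0xffff) * 0x0001000100010001ULL;  /* halfword in 4 lanes */
}

static uint64_t bcstl_sketch(uint32_t arg)
{
    return (uint64_t)arg * 0x0000000100000001ULL;   /* word in both lanes */
}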
-
-uint64_t HELPER(iwmmxt_addcb)(uint64_t x)
-{
- return
- ((x >> 0) & 0xff) + ((x >> 8) & 0xff) +
- ((x >> 16) & 0xff) + ((x >> 24) & 0xff) +
- ((x >> 32) & 0xff) + ((x >> 40) & 0xff) +
- ((x >> 48) & 0xff) + ((x >> 56) & 0xff);
-}
-
-uint64_t HELPER(iwmmxt_addcw)(uint64_t x)
-{
- return
- ((x >> 0) & 0xffff) + ((x >> 16) & 0xffff) +
- ((x >> 32) & 0xffff) + ((x >> 48) & 0xffff);
-}
-
-uint64_t HELPER(iwmmxt_addcl)(uint64_t x)
-{
- return (x & 0xffffffff) + (x >> 32);
-}
-
-uint32_t HELPER(iwmmxt_msbb)(uint64_t x)
-{
- return
- ((x >> 7) & 0x01) | ((x >> 14) & 0x02) |
- ((x >> 21) & 0x04) | ((x >> 28) & 0x08) |
- ((x >> 35) & 0x10) | ((x >> 42) & 0x20) |
- ((x >> 49) & 0x40) | ((x >> 56) & 0x80);
-}
-
-uint32_t HELPER(iwmmxt_msbw)(uint64_t x)
-{
- return
- ((x >> 15) & 0x01) | ((x >> 30) & 0x02) |
- ((x >> 45) & 0x04) | ((x >> 52) & 0x08);
-}
-
-uint32_t HELPER(iwmmxt_msbl)(uint64_t x)
-{
- return ((x >> 31) & 0x01) | ((x >> 62) & 0x02);
-}
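/*
 * Editorial sketch (an addition, not original code): the byte variant
 * above gathers the top bit of each 8-bit lane into a packed mask; the
 * same operation written as a loop:
 */
#include <stdint.h>

static uint32_t msbb_sketch(uint64_t x)
{
    uint32_t mask = 0;
    int i;

    for (i = 0; i < 8; i++) {
        mask |= ((x >> (8 * i + 7)) & 1) << i;  /* sign bit of lane i -> bit i */
    }
    return mask;
}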
-
-/* FIXME: Split wCASF setting into a separate op to avoid env use. */
-uint64_t HELPER(iwmmxt_srlw)(CPUARMState *env, uint64_t x, uint32_t n)
-{
- x = (((x & (0xffffll << 0)) >> n) & (0xffffll << 0)) |
- (((x & (0xffffll << 16)) >> n) & (0xffffll << 16)) |
- (((x & (0xffffll << 32)) >> n) & (0xffffll << 32)) |
- (((x & (0xffffll << 48)) >> n) & (0xffffll << 48));
- env->iwmmxt.cregs[ARM_IWMMXT_wCASF] =
- NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) |
- NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3);
- return x;
-}
-
-uint64_t HELPER(iwmmxt_srll)(CPUARMState *env, uint64_t x, uint32_t n)
-{
- x = ((x & (0xffffffffll << 0)) >> n) |
- ((x >> n) & (0xffffffffll << 32));
- env->iwmmxt.cregs[ARM_IWMMXT_wCASF] =
- NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1);
- return x;
-}
-
-uint64_t HELPER(iwmmxt_srlq)(CPUARMState *env, uint64_t x, uint32_t n)
-{
- x >>= n;
- env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x);
- return x;
-}
-
-uint64_t HELPER(iwmmxt_sllw)(CPUARMState *env, uint64_t x, uint32_t n)
-{
- x = (((x & (0xffffll << 0)) << n) & (0xffffll << 0)) |
- (((x & (0xffffll << 16)) << n) & (0xffffll << 16)) |
- (((x & (0xffffll << 32)) << n) & (0xffffll << 32)) |
- (((x & (0xffffll << 48)) << n) & (0xffffll << 48));
- env->iwmmxt.cregs[ARM_IWMMXT_wCASF] =
- NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) |
- NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3);
- return x;
-}
-
-uint64_t HELPER(iwmmxt_slll)(CPUARMState *env, uint64_t x, uint32_t n)
-{
- x = ((x << n) & (0xffffffffll << 0)) |
- ((x & (0xffffffffll << 32)) << n);
- env->iwmmxt.cregs[ARM_IWMMXT_wCASF] =
- NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1);
- return x;
-}
-
-uint64_t HELPER(iwmmxt_sllq)(CPUARMState *env, uint64_t x, uint32_t n)
-{
- x <<= n;
- env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x);
- return x;
-}
-
-uint64_t HELPER(iwmmxt_sraw)(CPUARMState *env, uint64_t x, uint32_t n)
-{
- x = ((uint64_t) ((EXTEND16(x >> 0) >> n) & 0xffff) << 0) |
- ((uint64_t) ((EXTEND16(x >> 16) >> n) & 0xffff) << 16) |
- ((uint64_t) ((EXTEND16(x >> 32) >> n) & 0xffff) << 32) |
- ((uint64_t) ((EXTEND16(x >> 48) >> n) & 0xffff) << 48);
- env->iwmmxt.cregs[ARM_IWMMXT_wCASF] =
- NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) |
- NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3);
- return x;
-}
-
-uint64_t HELPER(iwmmxt_sral)(CPUARMState *env, uint64_t x, uint32_t n)
-{
- x = (((EXTEND32(x >> 0) >> n) & 0xffffffff) << 0) |
- (((EXTEND32(x >> 32) >> n) & 0xffffffff) << 32);
- env->iwmmxt.cregs[ARM_IWMMXT_wCASF] =
- NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1);
- return x;
-}
-
-uint64_t HELPER(iwmmxt_sraq)(CPUARMState *env, uint64_t x, uint32_t n)
-{
- x = (int64_t) x >> n;
- env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x);
- return x;
-}
-
-uint64_t HELPER(iwmmxt_rorw)(CPUARMState *env, uint64_t x, uint32_t n)
-{
- x = ((((x & (0xffffll << 0)) >> n) |
- ((x & (0xffffll << 0)) << (16 - n))) & (0xffffll << 0)) |
- ((((x & (0xffffll << 16)) >> n) |
- ((x & (0xffffll << 16)) << (16 - n))) & (0xffffll << 16)) |
- ((((x & (0xffffll << 32)) >> n) |
- ((x & (0xffffll << 32)) << (16 - n))) & (0xffffll << 32)) |
- ((((x & (0xffffll << 48)) >> n) |
- ((x & (0xffffll << 48)) << (16 - n))) & (0xffffll << 48));
- env->iwmmxt.cregs[ARM_IWMMXT_wCASF] =
- NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) |
- NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3);
- return x;
-}
-
-uint64_t HELPER(iwmmxt_rorl)(CPUARMState *env, uint64_t x, uint32_t n)
-{
- x = ((x & (0xffffffffll << 0)) >> n) |
- ((x >> n) & (0xffffffffll << 32)) |
- ((x << (32 - n)) & (0xffffffffll << 0)) |
- ((x & (0xffffffffll << 32)) << (32 - n));
- env->iwmmxt.cregs[ARM_IWMMXT_wCASF] =
- NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1);
- return x;
-}
-
-uint64_t HELPER(iwmmxt_rorq)(CPUARMState *env, uint64_t x, uint32_t n)
-{
- x = ror64(x, n);
- env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x);
- return x;
-}
-
-uint64_t HELPER(iwmmxt_shufh)(CPUARMState *env, uint64_t x, uint32_t n)
-{
- x = (((x >> ((n << 4) & 0x30)) & 0xffff) << 0) |
- (((x >> ((n << 2) & 0x30)) & 0xffff) << 16) |
- (((x >> ((n << 0) & 0x30)) & 0xffff) << 32) |
- (((x >> ((n >> 2) & 0x30)) & 0xffff) << 48);
- env->iwmmxt.cregs[ARM_IWMMXT_wCASF] =
- NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) |
- NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3);
- return x;
-}
-
-/* TODO: Unsigned-Saturation */
-uint64_t HELPER(iwmmxt_packuw)(CPUARMState *env, uint64_t a, uint64_t b)
-{
- a = (((a >> 0) & 0xff) << 0) | (((a >> 16) & 0xff) << 8) |
- (((a >> 32) & 0xff) << 16) | (((a >> 48) & 0xff) << 24) |
- (((b >> 0) & 0xff) << 32) | (((b >> 16) & 0xff) << 40) |
- (((b >> 32) & 0xff) << 48) | (((b >> 48) & 0xff) << 56);
- env->iwmmxt.cregs[ARM_IWMMXT_wCASF] =
- NZBIT8(a >> 0, 0) | NZBIT8(a >> 8, 1) |
- NZBIT8(a >> 16, 2) | NZBIT8(a >> 24, 3) |
- NZBIT8(a >> 32, 4) | NZBIT8(a >> 40, 5) |
- NZBIT8(a >> 48, 6) | NZBIT8(a >> 56, 7);
- return a;
-}
-
-uint64_t HELPER(iwmmxt_packul)(CPUARMState *env, uint64_t a, uint64_t b)
-{
- a = (((a >> 0) & 0xffff) << 0) | (((a >> 32) & 0xffff) << 16) |
- (((b >> 0) & 0xffff) << 32) | (((b >> 32) & 0xffff) << 48);
- env->iwmmxt.cregs[ARM_IWMMXT_wCASF] =
- NZBIT16(a >> 0, 0) | NZBIT16(a >> 16, 1) |
- NZBIT16(a >> 32, 2) | NZBIT16(a >> 48, 3);
- return a;
-}
-
-uint64_t HELPER(iwmmxt_packuq)(CPUARMState *env, uint64_t a, uint64_t b)
-{
- a = (a & 0xffffffff) | ((b & 0xffffffff) << 32);
- env->iwmmxt.cregs[ARM_IWMMXT_wCASF] =
- NZBIT32(a >> 0, 0) | NZBIT32(a >> 32, 1);
- return a;
-}
-
-/* TODO: Signed-Saturation */
-uint64_t HELPER(iwmmxt_packsw)(CPUARMState *env, uint64_t a, uint64_t b)
-{
- a = (((a >> 0) & 0xff) << 0) | (((a >> 16) & 0xff) << 8) |
- (((a >> 32) & 0xff) << 16) | (((a >> 48) & 0xff) << 24) |
- (((b >> 0) & 0xff) << 32) | (((b >> 16) & 0xff) << 40) |
- (((b >> 32) & 0xff) << 48) | (((b >> 48) & 0xff) << 56);
- env->iwmmxt.cregs[ARM_IWMMXT_wCASF] =
- NZBIT8(a >> 0, 0) | NZBIT8(a >> 8, 1) |
- NZBIT8(a >> 16, 2) | NZBIT8(a >> 24, 3) |
- NZBIT8(a >> 32, 4) | NZBIT8(a >> 40, 5) |
- NZBIT8(a >> 48, 6) | NZBIT8(a >> 56, 7);
- return a;
-}
-
-uint64_t HELPER(iwmmxt_packsl)(CPUARMState *env, uint64_t a, uint64_t b)
-{
- a = (((a >> 0) & 0xffff) << 0) | (((a >> 32) & 0xffff) << 16) |
- (((b >> 0) & 0xffff) << 32) | (((b >> 32) & 0xffff) << 48);
- env->iwmmxt.cregs[ARM_IWMMXT_wCASF] =
- NZBIT16(a >> 0, 0) | NZBIT16(a >> 16, 1) |
- NZBIT16(a >> 32, 2) | NZBIT16(a >> 48, 3);
- return a;
-}
-
-uint64_t HELPER(iwmmxt_packsq)(CPUARMState *env, uint64_t a, uint64_t b)
-{
- a = (a & 0xffffffff) | ((b & 0xffffffff) << 32);
- env->iwmmxt.cregs[ARM_IWMMXT_wCASF] =
- NZBIT32(a >> 0, 0) | NZBIT32(a >> 32, 1);
- return a;
-}
-
-uint64_t HELPER(iwmmxt_muladdsl)(uint64_t c, uint32_t a, uint32_t b)
-{
- return c + ((int32_t) EXTEND32(a) * (int32_t) EXTEND32(b));
-}
-
-uint64_t HELPER(iwmmxt_muladdsw)(uint64_t c, uint32_t a, uint32_t b)
-{
- c += EXTEND32(EXTEND16S((a >> 0) & 0xffff) *
- EXTEND16S((b >> 0) & 0xffff));
- c += EXTEND32(EXTEND16S((a >> 16) & 0xffff) *
- EXTEND16S((b >> 16) & 0xffff));
- return c;
-}
-
-uint64_t HELPER(iwmmxt_muladdswl)(uint64_t c, uint32_t a, uint32_t b)
-{
- return c + (EXTEND32(EXTEND16S(a & 0xffff) *
- EXTEND16S(b & 0xffff)));
-}
+++ /dev/null
-/*
- * ARM generic helpers.
- *
- * This code is licensed under the GNU GPL v2 or later.
- *
- * SPDX-License-Identifier: GPL-2.0-or-later
- */
-
-#include "qemu/osdep.h"
-#include "cpu.h"
-#include "internals.h"
-#include "exec/helper-proto.h"
-#include "qemu/main-loop.h"
-#include "qemu/bitops.h"
-#include "qemu/log.h"
-#include "exec/exec-all.h"
-#ifdef CONFIG_TCG
-#include "exec/cpu_ldst.h"
-#include "semihosting/common-semi.h"
-#endif
-#if !defined(CONFIG_USER_ONLY)
-#include "hw/intc/armv7m_nvic.h"
-#endif
-
-static void v7m_msr_xpsr(CPUARMState *env, uint32_t mask,
- uint32_t reg, uint32_t val)
-{
- /* Only APSR is actually writable */
- if (!(reg & 4)) {
- uint32_t apsrmask = 0;
-
- if (mask & 8) {
- apsrmask |= XPSR_NZCV | XPSR_Q;
- }
- if ((mask & 4) && arm_feature(env, ARM_FEATURE_THUMB_DSP)) {
- apsrmask |= XPSR_GE;
- }
- xpsr_write(env, val, apsrmask);
- }
-}
-
-static uint32_t v7m_mrs_xpsr(CPUARMState *env, uint32_t reg, unsigned el)
-{
- uint32_t mask = 0;
-
- if ((reg & 1) && el) {
- mask |= XPSR_EXCP; /* IPSR (unpriv. reads as zero) */
- }
- if (!(reg & 4)) {
- mask |= XPSR_NZCV | XPSR_Q; /* APSR */
- if (arm_feature(env, ARM_FEATURE_THUMB_DSP)) {
- mask |= XPSR_GE;
- }
- }
- /* EPSR reads as zero */
- return xpsr_read(env) & mask;
-}
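/*
 * Editorial note (an addition, based on the ARMv7-M SYSm encoding): for
 * reg values 0..7 the instruction selects APSR(0), IAPSR(1), EAPSR(2),
 * XPSR(3), IPSR(5), EPSR(6) or IEPSR(7).  Bit 0 of reg requests the IPSR
 * field and bit 2 excludes the APSR field, which is what the masking
 * above implements; EPSR always reads as zero from this helper.
 */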
-
-static uint32_t v7m_mrs_control(CPUARMState *env, uint32_t secure)
-{
- uint32_t value = env->v7m.control[secure];
-
- if (!secure) {
- /* SFPA is RAZ/WI from NS; FPCA is stored in the M_REG_S bank */
- value |= env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK;
- }
- return value;
-}
-
-#ifdef CONFIG_USER_ONLY
-
-void HELPER(v7m_msr)(CPUARMState *env, uint32_t maskreg, uint32_t val)
-{
- uint32_t mask = extract32(maskreg, 8, 4);
- uint32_t reg = extract32(maskreg, 0, 8);
-
- switch (reg) {
- case 0 ... 7: /* xPSR sub-fields */
- v7m_msr_xpsr(env, mask, reg, val);
- break;
- case 20: /* CONTROL */
- /* There are no sub-fields that are actually writable from EL0. */
- break;
- default:
- /* Unprivileged writes to other registers are ignored */
- break;
- }
-}
-
-uint32_t HELPER(v7m_mrs)(CPUARMState *env, uint32_t reg)
-{
- switch (reg) {
- case 0 ... 7: /* xPSR sub-fields */
- return v7m_mrs_xpsr(env, reg, 0);
- case 20: /* CONTROL */
- return v7m_mrs_control(env, 0);
- default:
- /* Unprivileged reads others as zero. */
- return 0;
- }
-}
-
-void HELPER(v7m_bxns)(CPUARMState *env, uint32_t dest)
-{
- /* translate.c should never generate calls here in user-only mode */
- g_assert_not_reached();
-}
-
-void HELPER(v7m_blxns)(CPUARMState *env, uint32_t dest)
-{
- /* translate.c should never generate calls here in user-only mode */
- g_assert_not_reached();
-}
-
-void HELPER(v7m_preserve_fp_state)(CPUARMState *env)
-{
- /* translate.c should never generate calls here in user-only mode */
- g_assert_not_reached();
-}
-
-void HELPER(v7m_vlstm)(CPUARMState *env, uint32_t fptr)
-{
- /* translate.c should never generate calls here in user-only mode */
- g_assert_not_reached();
-}
-
-void HELPER(v7m_vlldm)(CPUARMState *env, uint32_t fptr)
-{
- /* translate.c should never generate calls here in user-only mode */
- g_assert_not_reached();
-}
-
-uint32_t HELPER(v7m_tt)(CPUARMState *env, uint32_t addr, uint32_t op)
-{
- /*
- * The TT instructions can be used by unprivileged code, but in
- * user-only emulation we don't have the MPU.
- * Luckily since we know we are NonSecure unprivileged (and that in
- * turn means that the A flag wasn't specified), all the bits in the
- * register must be zero:
- * IREGION: 0 because IRVALID is 0
- * IRVALID: 0 because NS
- * S: 0 because NS
- * NSRW: 0 because NS
- * NSR: 0 because NS
- * RW: 0 because unpriv and A flag not set
- * R: 0 because unpriv and A flag not set
- * SRVALID: 0 because NS
- * MRVALID: 0 because unpriv and A flag not set
- * SREGION: 0 because SRVALID is 0
- * MREGION: 0 because MRVALID is 0
- */
- return 0;
-}
-
-ARMMMUIdx arm_v7m_mmu_idx_for_secstate(CPUARMState *env, bool secstate)
-{
- return ARMMMUIdx_MUser;
-}
-
-#else /* !CONFIG_USER_ONLY */
-
-static ARMMMUIdx arm_v7m_mmu_idx_all(CPUARMState *env,
- bool secstate, bool priv, bool negpri)
-{
- ARMMMUIdx mmu_idx = ARM_MMU_IDX_M;
-
- if (priv) {
- mmu_idx |= ARM_MMU_IDX_M_PRIV;
- }
-
- if (negpri) {
- mmu_idx |= ARM_MMU_IDX_M_NEGPRI;
- }
-
- if (secstate) {
- mmu_idx |= ARM_MMU_IDX_M_S;
- }
-
- return mmu_idx;
-}
-
-static ARMMMUIdx arm_v7m_mmu_idx_for_secstate_and_priv(CPUARMState *env,
- bool secstate, bool priv)
-{
- bool negpri = armv7m_nvic_neg_prio_requested(env->nvic, secstate);
-
- return arm_v7m_mmu_idx_all(env, secstate, priv, negpri);
-}
-
-/* Return the MMU index for a v7M CPU in the specified security state */
-ARMMMUIdx arm_v7m_mmu_idx_for_secstate(CPUARMState *env, bool secstate)
-{
- bool priv = arm_v7m_is_handler_mode(env) ||
- !(env->v7m.control[secstate] & 1);
-
- return arm_v7m_mmu_idx_for_secstate_and_priv(env, secstate, priv);
-}
-
-/*
- * What kind of stack write are we doing? This affects how exceptions
- * generated during the stacking are treated.
- */
-typedef enum StackingMode {
- STACK_NORMAL,
- STACK_IGNFAULTS,
- STACK_LAZYFP,
-} StackingMode;
-
-static bool v7m_stack_write(ARMCPU *cpu, uint32_t addr, uint32_t value,
- ARMMMUIdx mmu_idx, StackingMode mode)
-{
- CPUState *cs = CPU(cpu);
- CPUARMState *env = &cpu->env;
- MemTxResult txres;
- GetPhysAddrResult res = {};
- ARMMMUFaultInfo fi = {};
- bool secure = mmu_idx & ARM_MMU_IDX_M_S;
- int exc;
- bool exc_secure;
-
- if (get_phys_addr(env, addr, MMU_DATA_STORE, mmu_idx, &res, &fi)) {
- /* MPU/SAU lookup failed */
- if (fi.type == ARMFault_QEMU_SFault) {
- if (mode == STACK_LAZYFP) {
- qemu_log_mask(CPU_LOG_INT,
- "...SecureFault with SFSR.LSPERR "
- "during lazy stacking\n");
- env->v7m.sfsr |= R_V7M_SFSR_LSPERR_MASK;
- } else {
- qemu_log_mask(CPU_LOG_INT,
- "...SecureFault with SFSR.AUVIOL "
- "during stacking\n");
- env->v7m.sfsr |= R_V7M_SFSR_AUVIOL_MASK;
- }
- env->v7m.sfsr |= R_V7M_SFSR_SFARVALID_MASK;
- env->v7m.sfar = addr;
- exc = ARMV7M_EXCP_SECURE;
- exc_secure = false;
- } else {
- if (mode == STACK_LAZYFP) {
- qemu_log_mask(CPU_LOG_INT,
- "...MemManageFault with CFSR.MLSPERR\n");
- env->v7m.cfsr[secure] |= R_V7M_CFSR_MLSPERR_MASK;
- } else {
- qemu_log_mask(CPU_LOG_INT,
- "...MemManageFault with CFSR.MSTKERR\n");
- env->v7m.cfsr[secure] |= R_V7M_CFSR_MSTKERR_MASK;
- }
- exc = ARMV7M_EXCP_MEM;
- exc_secure = secure;
- }
- goto pend_fault;
- }
- address_space_stl_le(arm_addressspace(cs, res.f.attrs), res.f.phys_addr,
- value, res.f.attrs, &txres);
- if (txres != MEMTX_OK) {
- /* BusFault trying to write the data */
- if (mode == STACK_LAZYFP) {
- qemu_log_mask(CPU_LOG_INT, "...BusFault with BFSR.LSPERR\n");
- env->v7m.cfsr[M_REG_NS] |= R_V7M_CFSR_LSPERR_MASK;
- } else {
- qemu_log_mask(CPU_LOG_INT, "...BusFault with BFSR.STKERR\n");
- env->v7m.cfsr[M_REG_NS] |= R_V7M_CFSR_STKERR_MASK;
- }
- exc = ARMV7M_EXCP_BUS;
- exc_secure = false;
- goto pend_fault;
- }
- return true;
-
-pend_fault:
- /*
- * By pending the exception at this point we are making
- * the IMPDEF choice "overridden exceptions pended" (see the
- * MergeExcInfo() pseudocode). The other choice would be to not
- * pend them now and then make a choice about which to throw away
- * later if we have two derived exceptions.
- * The only case when we must not pend the exception but instead
- * throw it away is if we are doing the push of the callee registers
- * and we've already generated a derived exception (this is indicated
- * by the caller passing STACK_IGNFAULTS). Even in this case we will
- * still update the fault status registers.
- */
- switch (mode) {
- case STACK_NORMAL:
- armv7m_nvic_set_pending_derived(env->nvic, exc, exc_secure);
- break;
- case STACK_LAZYFP:
- armv7m_nvic_set_pending_lazyfp(env->nvic, exc, exc_secure);
- break;
- case STACK_IGNFAULTS:
- break;
- }
- return false;
-}
-
-static bool v7m_stack_read(ARMCPU *cpu, uint32_t *dest, uint32_t addr,
- ARMMMUIdx mmu_idx)
-{
- CPUState *cs = CPU(cpu);
- CPUARMState *env = &cpu->env;
- MemTxResult txres;
- GetPhysAddrResult res = {};
- ARMMMUFaultInfo fi = {};
- bool secure = mmu_idx & ARM_MMU_IDX_M_S;
- int exc;
- bool exc_secure;
- uint32_t value;
-
- if (get_phys_addr(env, addr, MMU_DATA_LOAD, mmu_idx, &res, &fi)) {
- /* MPU/SAU lookup failed */
- if (fi.type == ARMFault_QEMU_SFault) {
- qemu_log_mask(CPU_LOG_INT,
- "...SecureFault with SFSR.AUVIOL during unstack\n");
- env->v7m.sfsr |= R_V7M_SFSR_AUVIOL_MASK | R_V7M_SFSR_SFARVALID_MASK;
- env->v7m.sfar = addr;
- exc = ARMV7M_EXCP_SECURE;
- exc_secure = false;
- } else {
- qemu_log_mask(CPU_LOG_INT,
- "...MemManageFault with CFSR.MUNSTKERR\n");
- env->v7m.cfsr[secure] |= R_V7M_CFSR_MUNSTKERR_MASK;
- exc = ARMV7M_EXCP_MEM;
- exc_secure = secure;
- }
- goto pend_fault;
- }
-
- value = address_space_ldl(arm_addressspace(cs, res.f.attrs),
- res.f.phys_addr, res.f.attrs, &txres);
- if (txres != MEMTX_OK) {
- /* BusFault trying to read the data */
- qemu_log_mask(CPU_LOG_INT, "...BusFault with BFSR.UNSTKERR\n");
- env->v7m.cfsr[M_REG_NS] |= R_V7M_CFSR_UNSTKERR_MASK;
- exc = ARMV7M_EXCP_BUS;
- exc_secure = false;
- goto pend_fault;
- }
-
- *dest = value;
- return true;
-
-pend_fault:
- /*
- * By pending the exception at this point we are making
- * the IMPDEF choice "overridden exceptions pended" (see the
- * MergeExcInfo() pseudocode). The other choice would be to not
- * pend them now and then make a choice about which to throw away
- * later if we have two derived exceptions.
- */
- armv7m_nvic_set_pending(env->nvic, exc, exc_secure);
- return false;
-}
-
-void HELPER(v7m_preserve_fp_state)(CPUARMState *env)
-{
- /*
- * Preserve FP state (because LSPACT was set and we are about
- * to execute an FP instruction). This corresponds to the
- * PreserveFPState() pseudocode.
- * We may throw an exception if the stacking fails.
- */
- ARMCPU *cpu = env_archcpu(env);
- bool is_secure = env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_S_MASK;
- bool negpri = !(env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_HFRDY_MASK);
- bool is_priv = !(env->v7m.fpccr[is_secure] & R_V7M_FPCCR_USER_MASK);
- bool splimviol = env->v7m.fpccr[is_secure] & R_V7M_FPCCR_SPLIMVIOL_MASK;
- uint32_t fpcar = env->v7m.fpcar[is_secure];
- bool stacked_ok = true;
- bool ts = is_secure && (env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_TS_MASK);
- bool take_exception;
-
- /* Take the iothread lock as we are going to touch the NVIC */
- qemu_mutex_lock_iothread();
-
- /* Check the background context had access to the FPU */
- if (!v7m_cpacr_pass(env, is_secure, is_priv)) {
- armv7m_nvic_set_pending_lazyfp(env->nvic, ARMV7M_EXCP_USAGE, is_secure);
- env->v7m.cfsr[is_secure] |= R_V7M_CFSR_NOCP_MASK;
- stacked_ok = false;
- } else if (!is_secure && !extract32(env->v7m.nsacr, 10, 1)) {
- armv7m_nvic_set_pending_lazyfp(env->nvic, ARMV7M_EXCP_USAGE, M_REG_S);
- env->v7m.cfsr[M_REG_S] |= R_V7M_CFSR_NOCP_MASK;
- stacked_ok = false;
- }
-
- if (!splimviol && stacked_ok) {
- /* We only stack if the stack limit wasn't violated */
- int i;
- ARMMMUIdx mmu_idx;
-
- mmu_idx = arm_v7m_mmu_idx_all(env, is_secure, is_priv, negpri);
- for (i = 0; i < (ts ? 32 : 16); i += 2) {
- uint64_t dn = *aa32_vfp_dreg(env, i / 2);
- uint32_t faddr = fpcar + 4 * i;
- uint32_t slo = extract64(dn, 0, 32);
- uint32_t shi = extract64(dn, 32, 32);
-
- if (i >= 16) {
- faddr += 8; /* skip the slot for the FPSCR/VPR */
- }
- stacked_ok = stacked_ok &&
- v7m_stack_write(cpu, faddr, slo, mmu_idx, STACK_LAZYFP) &&
- v7m_stack_write(cpu, faddr + 4, shi, mmu_idx, STACK_LAZYFP);
- }
-
- stacked_ok = stacked_ok &&
- v7m_stack_write(cpu, fpcar + 0x40,
- vfp_get_fpscr(env), mmu_idx, STACK_LAZYFP);
- if (cpu_isar_feature(aa32_mve, cpu)) {
- stacked_ok = stacked_ok &&
- v7m_stack_write(cpu, fpcar + 0x44,
- env->v7m.vpr, mmu_idx, STACK_LAZYFP);
- }
- }
-
- /*
- * We definitely pended an exception, but it's possible that it
- * might not be able to be taken now. If its priority permits us
- * to take it now, then we must not update the LSPACT or FP regs,
- * but instead jump out to take the exception immediately.
- * If it's just pending and won't be taken until the current
- * handler exits, then we do update LSPACT and the FP regs.
- */
- take_exception = !stacked_ok &&
- armv7m_nvic_can_take_pending_exception(env->nvic);
-
- qemu_mutex_unlock_iothread();
-
- if (take_exception) {
- raise_exception_ra(env, EXCP_LAZYFP, 0, 1, GETPC());
- }
-
- env->v7m.fpccr[is_secure] &= ~R_V7M_FPCCR_LSPACT_MASK;
-
- if (ts) {
- /* Clear s0 to s31 and the FPSCR and VPR */
- int i;
-
- for (i = 0; i < 32; i += 2) {
- *aa32_vfp_dreg(env, i / 2) = 0;
- }
- vfp_set_fpscr(env, 0);
- if (cpu_isar_feature(aa32_mve, cpu)) {
- env->v7m.vpr = 0;
- }
- }
- /*
- * Otherwise s0 to s15, FPSCR and VPR are UNKNOWN; we choose to leave them
- * unchanged.
- */
-}
-
-/*
- * Write to v7M CONTROL.SPSEL bit for the specified security bank.
- * This may change the current stack pointer between Main and Process
- * stack pointers if it is done for the CONTROL register for the current
- * security state.
- */
-static void write_v7m_control_spsel_for_secstate(CPUARMState *env,
- bool new_spsel,
- bool secstate)
-{
- bool old_is_psp = v7m_using_psp(env);
-
- env->v7m.control[secstate] =
- deposit32(env->v7m.control[secstate],
- R_V7M_CONTROL_SPSEL_SHIFT,
- R_V7M_CONTROL_SPSEL_LENGTH, new_spsel);
-
- if (secstate == env->v7m.secure) {
- bool new_is_psp = v7m_using_psp(env);
- uint32_t tmp;
-
- if (old_is_psp != new_is_psp) {
- tmp = env->v7m.other_sp;
- env->v7m.other_sp = env->regs[13];
- env->regs[13] = tmp;
- }
- }
-}
-
-/*
- * Write to v7M CONTROL.SPSEL bit. This may change the current
- * stack pointer between Main and Process stack pointers.
- */
-static void write_v7m_control_spsel(CPUARMState *env, bool new_spsel)
-{
- write_v7m_control_spsel_for_secstate(env, new_spsel, env->v7m.secure);
-}
-
-void write_v7m_exception(CPUARMState *env, uint32_t new_exc)
-{
- /*
- * Write a new value to v7m.exception, thus transitioning into or out
- * of Handler mode; this may result in a change of active stack pointer.
- */
- bool new_is_psp, old_is_psp = v7m_using_psp(env);
- uint32_t tmp;
-
- env->v7m.exception = new_exc;
-
- new_is_psp = v7m_using_psp(env);
-
- if (old_is_psp != new_is_psp) {
- tmp = env->v7m.other_sp;
- env->v7m.other_sp = env->regs[13];
- env->regs[13] = tmp;
- }
-}
-
-/* Switch M profile security state between NS and S */
-static void switch_v7m_security_state(CPUARMState *env, bool new_secstate)
-{
- uint32_t new_ss_msp, new_ss_psp;
-
- if (env->v7m.secure == new_secstate) {
- return;
- }
-
- /*
- * All the banked state is accessed by looking at env->v7m.secure
- * except for the stack pointer; rearrange the SP appropriately.
- */
- new_ss_msp = env->v7m.other_ss_msp;
- new_ss_psp = env->v7m.other_ss_psp;
-
- if (v7m_using_psp(env)) {
- env->v7m.other_ss_psp = env->regs[13];
- env->v7m.other_ss_msp = env->v7m.other_sp;
- } else {
- env->v7m.other_ss_msp = env->regs[13];
- env->v7m.other_ss_psp = env->v7m.other_sp;
- }
-
- env->v7m.secure = new_secstate;
-
- if (v7m_using_psp(env)) {
- env->regs[13] = new_ss_psp;
- env->v7m.other_sp = new_ss_msp;
- } else {
- env->regs[13] = new_ss_msp;
- env->v7m.other_sp = new_ss_psp;
- }
-}
-
-void HELPER(v7m_bxns)(CPUARMState *env, uint32_t dest)
-{
- /*
- * Handle v7M BXNS:
- * - if the return value is a magic value, do exception return (like BX)
- * - otherwise bit 0 of the return value is the target security state
- */
- uint32_t min_magic;
-
- if (arm_feature(env, ARM_FEATURE_M_SECURITY)) {
- /* Covers FNC_RETURN and EXC_RETURN magic */
- min_magic = FNC_RETURN_MIN_MAGIC;
- } else {
- /* EXC_RETURN magic only */
- min_magic = EXC_RETURN_MIN_MAGIC;
- }
-
- if (dest >= min_magic) {
- /*
- * This is an exception return magic value; put it where
- * do_v7m_exception_exit() expects and raise EXCEPTION_EXIT.
- * Note that if we ever add gen_ss_advance() singlestep support to
- * M profile this should count as an "instruction execution complete"
- * event (compare gen_bx_excret_final_code()).
- */
- env->regs[15] = dest & ~1;
- env->thumb = dest & 1;
- HELPER(exception_internal)(env, EXCP_EXCEPTION_EXIT);
- /* notreached */
- }
-
- /* translate.c should have made BXNS UNDEF unless we're secure */
- assert(env->v7m.secure);
-
- if (!(dest & 1)) {
- env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_SFPA_MASK;
- }
- switch_v7m_security_state(env, dest & 1);
- env->thumb = true;
- env->regs[15] = dest & ~1;
- arm_rebuild_hflags(env);
-}
-
-void HELPER(v7m_blxns)(CPUARMState *env, uint32_t dest)
-{
- /*
- * Handle v7M BLXNS:
- * - bit 0 of the destination address is the target security state
- */
-
- /* At this point regs[15] is the address just after the BLXNS */
- uint32_t nextinst = env->regs[15] | 1;
- uint32_t sp = env->regs[13] - 8;
- uint32_t saved_psr;
-
- /* translate.c will have made BLXNS UNDEF unless we're secure */
- assert(env->v7m.secure);
-
- if (dest & 1) {
- /*
- * Target is Secure, so this is just a normal BLX,
- * except that the low bit doesn't indicate Thumb/not.
- */
- env->regs[14] = nextinst;
- env->thumb = true;
- env->regs[15] = dest & ~1;
- return;
- }
-
- /* Target is non-secure: first push a stack frame */
- if (!QEMU_IS_ALIGNED(sp, 8)) {
- qemu_log_mask(LOG_GUEST_ERROR,
- "BLXNS with misaligned SP is UNPREDICTABLE\n");
- }
-
- if (sp < v7m_sp_limit(env)) {
- raise_exception(env, EXCP_STKOF, 0, 1);
- }
-
- saved_psr = env->v7m.exception;
- if (env->v7m.control[M_REG_S] & R_V7M_CONTROL_SFPA_MASK) {
- saved_psr |= XPSR_SFPA;
- }
-
- /* Note that these stores can throw exceptions on MPU faults */
- cpu_stl_data_ra(env, sp, nextinst, GETPC());
- cpu_stl_data_ra(env, sp + 4, saved_psr, GETPC());
-
- env->regs[13] = sp;
- env->regs[14] = 0xfeffffff;
- if (arm_v7m_is_handler_mode(env)) {
- /*
- * Write a dummy value to IPSR, to avoid leaking the current secure
- * exception number to non-secure code. This is guaranteed not
- * to cause write_v7m_exception() to actually change stacks.
- */
- write_v7m_exception(env, 1);
- }
- env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_SFPA_MASK;
- switch_v7m_security_state(env, 0);
- env->thumb = true;
- env->regs[15] = dest;
- arm_rebuild_hflags(env);
-}
-
-static uint32_t *get_v7m_sp_ptr(CPUARMState *env, bool secure, bool threadmode,
- bool spsel)
-{
- /*
- * Return a pointer to the location where we currently store the
- * stack pointer for the requested security state and thread mode.
- * This pointer will become invalid if the CPU state is updated
- * such that the stack pointers are switched around (e.g. changing
- * the SPSEL control bit).
- * Compare the v8M ARM ARM pseudocode LookUpSP_with_security_mode().
- * Unlike that pseudocode, we require the caller to pass us in the
- * SPSEL control bit value; this is because we also use this
- * function in handling of pushing of the callee-saves registers
- * part of the v8M stack frame (pseudocode PushCalleeStack()),
- * and in the tailchain codepath the SPSEL bit comes from the exception
- * return magic LR value from the previous exception. The pseudocode
- * open-codes the stack-selection in PushCalleeStack(), but we prefer
- * to make this utility function generic enough to do the job.
- */
- bool want_psp = threadmode && spsel;
-
- if (secure == env->v7m.secure) {
- if (want_psp == v7m_using_psp(env)) {
- return &env->regs[13];
- } else {
- return &env->v7m.other_sp;
- }
- } else {
- if (want_psp) {
- return &env->v7m.other_ss_psp;
- } else {
- return &env->v7m.other_ss_msp;
- }
- }
-}
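/*
 * Editorial summary (an addition): the selection above resolves to
 *   requested bank == current bank, want_psp == currently-using-PSP
 *       -> &env->regs[13]          (the live SP)
 *   requested bank == current bank, want_psp != currently-using-PSP
 *       -> &env->v7m.other_sp      (this bank's inactive SP)
 *   requested bank != current bank, want_psp
 *       -> &env->v7m.other_ss_psp
 *   requested bank != current bank, !want_psp
 *       -> &env->v7m.other_ss_msp
 */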
-
-static bool arm_v7m_load_vector(ARMCPU *cpu, int exc, bool targets_secure,
- uint32_t *pvec)
-{
- CPUState *cs = CPU(cpu);
- CPUARMState *env = &cpu->env;
- MemTxResult result;
- uint32_t addr = env->v7m.vecbase[targets_secure] + exc * 4;
- uint32_t vector_entry;
- MemTxAttrs attrs = {};
- ARMMMUIdx mmu_idx;
- bool exc_secure;
-
- qemu_log_mask(CPU_LOG_INT,
- "...loading from element %d of %s vector table at 0x%x\n",
- exc, targets_secure ? "secure" : "non-secure", addr);
-
- mmu_idx = arm_v7m_mmu_idx_for_secstate_and_priv(env, targets_secure, true);
-
- /*
- * We don't do a get_phys_addr() here because the rules for vector
- * loads are special: they always use the default memory map, and
- * the default memory map permits reads from all addresses.
- * Since there's no easy way to pass through to pmsav8_mpu_lookup()
- * that we want this special case which would always say "yes",
- * we just do the SAU lookup here followed by a direct physical load.
- */
- attrs.secure = targets_secure;
- attrs.user = false;
-
- if (arm_feature(env, ARM_FEATURE_M_SECURITY)) {
- V8M_SAttributes sattrs = {};
-
- v8m_security_lookup(env, addr, MMU_DATA_LOAD, mmu_idx,
- targets_secure, &sattrs);
- if (sattrs.ns) {
- attrs.secure = false;
- } else if (!targets_secure) {
- /*
- * NS access to S memory: the underlying exception which we escalate
- * to HardFault is SecureFault, which always targets Secure.
- */
- exc_secure = true;
- goto load_fail;
- }
- }
-
- vector_entry = address_space_ldl(arm_addressspace(cs, attrs), addr,
- attrs, &result);
- if (result != MEMTX_OK) {
- /*
- * Underlying exception is BusFault: its target security state
- * depends on BFHFNMINS.
- */
- exc_secure = !(cpu->env.v7m.aircr & R_V7M_AIRCR_BFHFNMINS_MASK);
- goto load_fail;
- }
- *pvec = vector_entry;
- qemu_log_mask(CPU_LOG_INT, "...loaded new PC 0x%x\n", *pvec);
- return true;
-
-load_fail:
- /*
- * All vector table fetch fails are reported as HardFault, with
- * HFSR.VECTTBL and .FORCED set. (FORCED is set because
- * technically the underlying exception is a SecureFault or BusFault
- * that is escalated to HardFault.) This is a terminal exception,
- * so we will either take the HardFault immediately or else enter
- * lockup (the latter case is handled in armv7m_nvic_set_pending_derived()).
- * The HardFault is Secure if BFHFNMINS is 0 (meaning that all HFs are
- * secure); otherwise it targets the same security state as the
- * underlying exception.
- * In v8.1M HardFaults from vector table fetch fails don't set FORCED.
- */
- if (!(cpu->env.v7m.aircr & R_V7M_AIRCR_BFHFNMINS_MASK)) {
- exc_secure = true;
- }
- env->v7m.hfsr |= R_V7M_HFSR_VECTTBL_MASK;
- if (!arm_feature(env, ARM_FEATURE_V8_1M)) {
- env->v7m.hfsr |= R_V7M_HFSR_FORCED_MASK;
- }
- armv7m_nvic_set_pending_derived(env->nvic, ARMV7M_EXCP_HARD, exc_secure);
- return false;
-}
-
-static uint32_t v7m_integrity_sig(CPUARMState *env, uint32_t lr)
-{
- /*
- * Return the integrity signature value for the callee-saves
- * stack frame section. @lr is the exception return payload/LR value
- * whose FType bit forms bit 0 of the signature if FP is present.
- */
- uint32_t sig = 0xfefa125a;
-
- if (!cpu_isar_feature(aa32_vfp_simd, env_archcpu(env))
- || (lr & R_V7M_EXCRET_FTYPE_MASK)) {
- sig |= 1;
- }
- return sig;
-}
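/*
 * Editorial note (an addition): with the code above the integrity
 * signature is 0xfefa125a when FP context is stacked in the frame
 * (an FPU is present and EXCRET.FType is clear) and 0xfefa125b
 * otherwise.
 */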
-
-static bool v7m_push_callee_stack(ARMCPU *cpu, uint32_t lr, bool dotailchain,
- bool ignore_faults)
-{
- /*
- * For v8M, push the callee-saves register part of the stack frame.
- * Compare the v8M pseudocode PushCalleeStack().
- * In the tailchaining case this may not be the current stack.
- */
- CPUARMState *env = &cpu->env;
- uint32_t *frame_sp_p;
- uint32_t frameptr;
- ARMMMUIdx mmu_idx;
- bool stacked_ok;
- uint32_t limit;
- bool want_psp;
- uint32_t sig;
- StackingMode smode = ignore_faults ? STACK_IGNFAULTS : STACK_NORMAL;
-
- if (dotailchain) {
- bool mode = lr & R_V7M_EXCRET_MODE_MASK;
- bool priv = !(env->v7m.control[M_REG_S] & R_V7M_CONTROL_NPRIV_MASK) ||
- !mode;
-
- mmu_idx = arm_v7m_mmu_idx_for_secstate_and_priv(env, M_REG_S, priv);
- frame_sp_p = get_v7m_sp_ptr(env, M_REG_S, mode,
- lr & R_V7M_EXCRET_SPSEL_MASK);
- want_psp = mode && (lr & R_V7M_EXCRET_SPSEL_MASK);
- if (want_psp) {
- limit = env->v7m.psplim[M_REG_S];
- } else {
- limit = env->v7m.msplim[M_REG_S];
- }
- } else {
- mmu_idx = arm_mmu_idx(env);
- frame_sp_p = &env->regs[13];
- limit = v7m_sp_limit(env);
- }
-
- frameptr = *frame_sp_p - 0x28;
- if (frameptr < limit) {
- /*
- * Stack limit failure: set SP to the limit value, and generate
- * STKOF UsageFault. Stack pushes below the limit must not be
- * performed. It is IMPDEF whether pushes above the limit are
- * performed; we choose not to.
- */
- qemu_log_mask(CPU_LOG_INT,
- "...STKOF during callee-saves register stacking\n");
- env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_STKOF_MASK;
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE,
- env->v7m.secure);
- *frame_sp_p = limit;
- return true;
- }
-
- /*
- * Write as much of the stack frame as we can. A write failure may
- * cause us to pend a derived exception.
- */
- sig = v7m_integrity_sig(env, lr);
- stacked_ok =
- v7m_stack_write(cpu, frameptr, sig, mmu_idx, smode) &&
- v7m_stack_write(cpu, frameptr + 0x8, env->regs[4], mmu_idx, smode) &&
- v7m_stack_write(cpu, frameptr + 0xc, env->regs[5], mmu_idx, smode) &&
- v7m_stack_write(cpu, frameptr + 0x10, env->regs[6], mmu_idx, smode) &&
- v7m_stack_write(cpu, frameptr + 0x14, env->regs[7], mmu_idx, smode) &&
- v7m_stack_write(cpu, frameptr + 0x18, env->regs[8], mmu_idx, smode) &&
- v7m_stack_write(cpu, frameptr + 0x1c, env->regs[9], mmu_idx, smode) &&
- v7m_stack_write(cpu, frameptr + 0x20, env->regs[10], mmu_idx, smode) &&
- v7m_stack_write(cpu, frameptr + 0x24, env->regs[11], mmu_idx, smode);
-
- /* Update SP regardless of whether any of the stack accesses failed. */
- *frame_sp_p = frameptr;
-
- return !stacked_ok;
-}
-
-static void v7m_exception_taken(ARMCPU *cpu, uint32_t lr, bool dotailchain,
- bool ignore_stackfaults)
-{
- /*
- * Do the "take the exception" parts of exception entry,
- * but not the pushing of state to the stack. This is
- * similar to the pseudocode ExceptionTaken() function.
- */
- CPUARMState *env = &cpu->env;
- uint32_t addr;
- bool targets_secure;
- int exc;
- bool push_failed = false;
-
- armv7m_nvic_get_pending_irq_info(env->nvic, &exc, &targets_secure);
- qemu_log_mask(CPU_LOG_INT, "...taking pending %s exception %d\n",
- targets_secure ? "secure" : "nonsecure", exc);
-
- if (dotailchain) {
- /* Sanitize LR FType and PREFIX bits */
- if (!cpu_isar_feature(aa32_vfp_simd, cpu)) {
- lr |= R_V7M_EXCRET_FTYPE_MASK;
- }
- lr = deposit32(lr, 24, 8, 0xff);
- }
-
- if (arm_feature(env, ARM_FEATURE_V8)) {
- if (arm_feature(env, ARM_FEATURE_M_SECURITY) &&
- (lr & R_V7M_EXCRET_S_MASK)) {
- /*
- * The background code (the owner of the registers in the
- * exception frame) is Secure. This means it may either already
- * have or now needs to push callee-saves registers.
- */
- if (targets_secure) {
- if (dotailchain && !(lr & R_V7M_EXCRET_ES_MASK)) {
- /*
- * We took an exception from Secure to NonSecure
- * (which means the callee-saved registers got stacked)
- * and are now tailchaining to a Secure exception.
- * Clear DCRS so eventual return from this Secure
- * exception unstacks the callee-saved registers.
- */
- lr &= ~R_V7M_EXCRET_DCRS_MASK;
- }
- } else {
- /*
- * We're going to a non-secure exception; push the
- * callee-saves registers to the stack now, if they're
- * not already saved.
- */
- if (lr & R_V7M_EXCRET_DCRS_MASK &&
- !(dotailchain && !(lr & R_V7M_EXCRET_ES_MASK))) {
- push_failed = v7m_push_callee_stack(cpu, lr, dotailchain,
- ignore_stackfaults);
- }
- lr |= R_V7M_EXCRET_DCRS_MASK;
- }
- }
-
- lr &= ~R_V7M_EXCRET_ES_MASK;
- if (targets_secure) {
- lr |= R_V7M_EXCRET_ES_MASK;
- }
- lr &= ~R_V7M_EXCRET_SPSEL_MASK;
- if (env->v7m.control[targets_secure] & R_V7M_CONTROL_SPSEL_MASK) {
- lr |= R_V7M_EXCRET_SPSEL_MASK;
- }
-
- /*
- * Clear registers if necessary to prevent non-secure exception
- * code being able to see register values from secure code.
- * Where register values become architecturally UNKNOWN we leave
- * them with their previous values. v8.1M is tighter than v8.0M
- * here and always zeroes the caller-saved registers regardless
- * of the security state the exception is targeting.
- */
- if (arm_feature(env, ARM_FEATURE_M_SECURITY)) {
- if (!targets_secure || arm_feature(env, ARM_FEATURE_V8_1M)) {
- /*
- * Always clear the caller-saved registers (they have been
- * pushed to the stack earlier in v7m_push_stack()).
- * Clear callee-saved registers if the background code is
- * Secure (in which case these regs were saved in
- * v7m_push_callee_stack()).
- */
- int i;
- /*
- * r4..r11 are callee-saves, zero only if background
- * state was Secure (EXCRET.S == 1) and exception
- * targets Non-secure state
- */
- bool zero_callee_saves = !targets_secure &&
- (lr & R_V7M_EXCRET_S_MASK);
-
- for (i = 0; i < 13; i++) {
- if (i < 4 || i > 11 || zero_callee_saves) {
- env->regs[i] = 0;
- }
- }
- /* Clear EAPSR */
- xpsr_write(env, 0, XPSR_NZCV | XPSR_Q | XPSR_GE | XPSR_IT);
- }
- }
- }
-
- if (push_failed && !ignore_stackfaults) {
- /*
- * Derived exception on callee-saves register stacking:
- * we might now want to take a different exception which
- * targets a different security state, so try again from the top.
- */
- qemu_log_mask(CPU_LOG_INT,
- "...derived exception on callee-saves register stacking");
- v7m_exception_taken(cpu, lr, true, true);
- return;
- }
-
- if (!arm_v7m_load_vector(cpu, exc, targets_secure, &addr)) {
- /* Vector load failed: derived exception */
- qemu_log_mask(CPU_LOG_INT, "...derived exception on vector table load");
- v7m_exception_taken(cpu, lr, true, true);
- return;
- }
-
- /*
- * Now we've done everything that might cause a derived exception
- * we can go ahead and activate whichever exception we're going to
- * take (which might now be the derived exception).
- */
- armv7m_nvic_acknowledge_irq(env->nvic);
-
- /* Switch to target security state -- must do this before writing SPSEL */
- switch_v7m_security_state(env, targets_secure);
- write_v7m_control_spsel(env, 0);
- arm_clear_exclusive(env);
- /* Clear SFPA and FPCA (has no effect if no FPU) */
- env->v7m.control[M_REG_S] &=
- ~(R_V7M_CONTROL_FPCA_MASK | R_V7M_CONTROL_SFPA_MASK);
- /* Clear IT bits */
- env->condexec_bits = 0;
- env->regs[14] = lr;
- env->regs[15] = addr & 0xfffffffe;
- env->thumb = addr & 1;
- arm_rebuild_hflags(env);
-}
-
-static void v7m_update_fpccr(CPUARMState *env, uint32_t frameptr,
- bool apply_splim)
-{
- /*
- * Like the pseudocode UpdateFPCCR: save state in FPCAR and FPCCR
- * that we will need later in order to do lazy FP reg stacking.
- */
- bool is_secure = env->v7m.secure;
- NVICState *nvic = env->nvic;
- /*
- * Some bits are unbanked and live always in fpccr[M_REG_S]; some bits
- * are banked and we want to update the bit in the bank for the
- * current security state; and in one case we want to specifically
- * update the NS banked version of a bit even if we are secure.
- */
- uint32_t *fpccr_s = &env->v7m.fpccr[M_REG_S];
- uint32_t *fpccr_ns = &env->v7m.fpccr[M_REG_NS];
- uint32_t *fpccr = &env->v7m.fpccr[is_secure];
- bool hfrdy, bfrdy, mmrdy, ns_ufrdy, s_ufrdy, sfrdy, monrdy;
-
- env->v7m.fpcar[is_secure] = frameptr & ~0x7;
-
- if (apply_splim && arm_feature(env, ARM_FEATURE_V8)) {
- bool splimviol;
- uint32_t splim = v7m_sp_limit(env);
- bool ign = armv7m_nvic_neg_prio_requested(nvic, is_secure) &&
- (env->v7m.ccr[is_secure] & R_V7M_CCR_STKOFHFNMIGN_MASK);
-
- splimviol = !ign && frameptr < splim;
- *fpccr = FIELD_DP32(*fpccr, V7M_FPCCR, SPLIMVIOL, splimviol);
- }
-
- *fpccr = FIELD_DP32(*fpccr, V7M_FPCCR, LSPACT, 1);
-
- *fpccr_s = FIELD_DP32(*fpccr_s, V7M_FPCCR, S, is_secure);
-
- *fpccr = FIELD_DP32(*fpccr, V7M_FPCCR, USER, arm_current_el(env) == 0);
-
- *fpccr = FIELD_DP32(*fpccr, V7M_FPCCR, THREAD,
- !arm_v7m_is_handler_mode(env));
-
- hfrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_HARD, false);
- *fpccr_s = FIELD_DP32(*fpccr_s, V7M_FPCCR, HFRDY, hfrdy);
-
- bfrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_BUS, false);
- *fpccr_s = FIELD_DP32(*fpccr_s, V7M_FPCCR, BFRDY, bfrdy);
-
- mmrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_MEM, is_secure);
- *fpccr = FIELD_DP32(*fpccr, V7M_FPCCR, MMRDY, mmrdy);
-
- ns_ufrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_USAGE, false);
- *fpccr_ns = FIELD_DP32(*fpccr_ns, V7M_FPCCR, UFRDY, ns_ufrdy);
-
- monrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_DEBUG, false);
- *fpccr_s = FIELD_DP32(*fpccr_s, V7M_FPCCR, MONRDY, monrdy);
-
- if (arm_feature(env, ARM_FEATURE_M_SECURITY)) {
- s_ufrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_USAGE, true);
- *fpccr_s = FIELD_DP32(*fpccr_s, V7M_FPCCR, UFRDY, s_ufrdy);
-
- sfrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_SECURE, false);
- *fpccr_s = FIELD_DP32(*fpccr_s, V7M_FPCCR, SFRDY, sfrdy);
- }
-}
-
-void HELPER(v7m_vlstm)(CPUARMState *env, uint32_t fptr)
-{
- /* fptr is the value of Rn, the frame pointer we store the FP regs to */
- ARMCPU *cpu = env_archcpu(env);
- bool s = env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_S_MASK;
- bool lspact = env->v7m.fpccr[s] & R_V7M_FPCCR_LSPACT_MASK;
- uintptr_t ra = GETPC();
-
- assert(env->v7m.secure);
-
- if (!(env->v7m.control[M_REG_S] & R_V7M_CONTROL_SFPA_MASK)) {
- return;
- }
-
- /* Check access to the coprocessor is permitted */
- if (!v7m_cpacr_pass(env, true, arm_current_el(env) != 0)) {
- raise_exception_ra(env, EXCP_NOCP, 0, 1, GETPC());
- }
-
- if (lspact) {
- /* LSPACT should not be active when there is active FP state */
- raise_exception_ra(env, EXCP_LSERR, 0, 1, GETPC());
- }
-
- if (fptr & 7) {
- raise_exception_ra(env, EXCP_UNALIGNED, 0, 1, GETPC());
- }
-
- /*
- * Note that we do not use v7m_stack_write() here, because the
- * accesses should not set the FSR bits for stacking errors if they
- * fail. (In pseudocode terms, they are AccType_NORMAL, not AccType_STACK
- * or AccType_LAZYFP). Faults in cpu_stl_data_ra() will throw exceptions
- * and longjmp out.
- */
- if (!(env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_LSPEN_MASK)) {
- bool ts = env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_TS_MASK;
- int i;
-
- for (i = 0; i < (ts ? 32 : 16); i += 2) {
- uint64_t dn = *aa32_vfp_dreg(env, i / 2);
- uint32_t faddr = fptr + 4 * i;
- uint32_t slo = extract64(dn, 0, 32);
- uint32_t shi = extract64(dn, 32, 32);
-
- if (i >= 16) {
- faddr += 8; /* skip the slot for the FPSCR */
- }
- cpu_stl_data_ra(env, faddr, slo, ra);
- cpu_stl_data_ra(env, faddr + 4, shi, ra);
- }
- cpu_stl_data_ra(env, fptr + 0x40, vfp_get_fpscr(env), ra);
- if (cpu_isar_feature(aa32_mve, cpu)) {
- cpu_stl_data_ra(env, fptr + 0x44, env->v7m.vpr, ra);
- }
-
- /*
- * If TS is 0 then s0 to s15, FPSCR and VPR are UNKNOWN; we choose to
- * leave them unchanged, matching our choice in v7m_preserve_fp_state.
- */
- if (ts) {
- for (i = 0; i < 32; i += 2) {
- *aa32_vfp_dreg(env, i / 2) = 0;
- }
- vfp_set_fpscr(env, 0);
- if (cpu_isar_feature(aa32_mve, cpu)) {
- env->v7m.vpr = 0;
- }
- }
- } else {
- v7m_update_fpccr(env, fptr, false);
- }
-
- env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_FPCA_MASK;
-}
-
-void HELPER(v7m_vlldm)(CPUARMState *env, uint32_t fptr)
-{
- ARMCPU *cpu = env_archcpu(env);
- uintptr_t ra = GETPC();
-
- /* fptr is the value of Rn, the frame pointer we load the FP regs from */
- assert(env->v7m.secure);
-
- if (!(env->v7m.control[M_REG_S] & R_V7M_CONTROL_SFPA_MASK)) {
- return;
- }
-
- /* Check access to the coprocessor is permitted */
- if (!v7m_cpacr_pass(env, true, arm_current_el(env) != 0)) {
- raise_exception_ra(env, EXCP_NOCP, 0, 1, GETPC());
- }
-
- if (env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_LSPACT_MASK) {
- /* State in FP is still valid */
- env->v7m.fpccr[M_REG_S] &= ~R_V7M_FPCCR_LSPACT_MASK;
- } else {
- bool ts = env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_TS_MASK;
- int i;
- uint32_t fpscr;
-
- if (fptr & 7) {
- raise_exception_ra(env, EXCP_UNALIGNED, 0, 1, GETPC());
- }
-
- for (i = 0; i < (ts ? 32 : 16); i += 2) {
- uint32_t slo, shi;
- uint64_t dn;
- uint32_t faddr = fptr + 4 * i;
-
- if (i >= 16) {
- faddr += 8; /* skip the slot for the FPSCR and VPR */
- }
-
- slo = cpu_ldl_data_ra(env, faddr, ra);
- shi = cpu_ldl_data_ra(env, faddr + 4, ra);
-
- dn = (uint64_t) shi << 32 | slo;
- *aa32_vfp_dreg(env, i / 2) = dn;
- }
- fpscr = cpu_ldl_data_ra(env, fptr + 0x40, ra);
- vfp_set_fpscr(env, fpscr);
- if (cpu_isar_feature(aa32_mve, cpu)) {
- env->v7m.vpr = cpu_ldl_data_ra(env, fptr + 0x44, ra);
- }
- }
-
- env->v7m.control[M_REG_S] |= R_V7M_CONTROL_FPCA_MASK;
-}
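/*
 * Editorial summary (an addition): the frame written by VLSTM and read
 * back by VLLDM above uses these offsets from fptr:
 *   0x00..0x3f  s0..s15  (stored as d0..d7)
 *   0x40        FPSCR
 *   0x44        VPR      (only when MVE is implemented)
 *   0x48..0x87  s16..s31 (stored as d8..d15, only when FPCCR_S.TS is set)
 */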
-
-static bool v7m_push_stack(ARMCPU *cpu)
-{
- /*
- * Do the "set up stack frame" part of exception entry,
- * similar to pseudocode PushStack().
- * Return true if we generate a derived exception (and so
- * should ignore further stack faults trying to process
- * that derived exception.)
- */
- bool stacked_ok = true, limitviol = false;
- CPUARMState *env = &cpu->env;
- uint32_t xpsr = xpsr_read(env);
- uint32_t frameptr = env->regs[13];
- ARMMMUIdx mmu_idx = arm_mmu_idx(env);
- uint32_t framesize;
- bool nsacr_cp10 = extract32(env->v7m.nsacr, 10, 1);
-
- if ((env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK) &&
- (env->v7m.secure || nsacr_cp10)) {
- if (env->v7m.secure &&
- env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_TS_MASK) {
- framesize = 0xa8;
- } else {
- framesize = 0x68;
- }
- } else {
- framesize = 0x20;
- }
-
- /* Align stack pointer if the guest wants that */
- if ((frameptr & 4) &&
- (env->v7m.ccr[env->v7m.secure] & R_V7M_CCR_STKALIGN_MASK)) {
- frameptr -= 4;
- xpsr |= XPSR_SPREALIGN;
- }
-
- xpsr &= ~XPSR_SFPA;
- if (env->v7m.secure &&
- (env->v7m.control[M_REG_S] & R_V7M_CONTROL_SFPA_MASK)) {
- xpsr |= XPSR_SFPA;
- }
-
- frameptr -= framesize;
-
- if (arm_feature(env, ARM_FEATURE_V8)) {
- uint32_t limit = v7m_sp_limit(env);
-
- if (frameptr < limit) {
- /*
- * Stack limit failure: set SP to the limit value, and generate
- * STKOF UsageFault. Stack pushes below the limit must not be
- * performed. It is IMPDEF whether pushes above the limit are
- * performed; we choose not to.
- */
- qemu_log_mask(CPU_LOG_INT,
- "...STKOF during stacking\n");
- env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_STKOF_MASK;
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE,
- env->v7m.secure);
- env->regs[13] = limit;
- /*
- * We won't try to perform any further memory accesses but
- * we must continue through the following code to check for
- * permission faults during FPU state preservation, and we
- * must update FPCCR if lazy stacking is enabled.
- */
- limitviol = true;
- stacked_ok = false;
- }
- }
-
- /*
- * Write as much of the stack frame as we can. If we fail a stack
- * write this will result in a derived exception being pended
- * (which may be taken in preference to the one we started with
- * if it has higher priority).
- */
- stacked_ok = stacked_ok &&
- v7m_stack_write(cpu, frameptr, env->regs[0], mmu_idx, STACK_NORMAL) &&
- v7m_stack_write(cpu, frameptr + 4, env->regs[1],
- mmu_idx, STACK_NORMAL) &&
- v7m_stack_write(cpu, frameptr + 8, env->regs[2],
- mmu_idx, STACK_NORMAL) &&
- v7m_stack_write(cpu, frameptr + 12, env->regs[3],
- mmu_idx, STACK_NORMAL) &&
- v7m_stack_write(cpu, frameptr + 16, env->regs[12],
- mmu_idx, STACK_NORMAL) &&
- v7m_stack_write(cpu, frameptr + 20, env->regs[14],
- mmu_idx, STACK_NORMAL) &&
- v7m_stack_write(cpu, frameptr + 24, env->regs[15],
- mmu_idx, STACK_NORMAL) &&
- v7m_stack_write(cpu, frameptr + 28, xpsr, mmu_idx, STACK_NORMAL);
-
- if (env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK) {
- /* FPU is active, try to save its registers */
- bool fpccr_s = env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_S_MASK;
- bool lspact = env->v7m.fpccr[fpccr_s] & R_V7M_FPCCR_LSPACT_MASK;
-
- if (lspact && arm_feature(env, ARM_FEATURE_M_SECURITY)) {
- qemu_log_mask(CPU_LOG_INT,
- "...SecureFault because LSPACT and FPCA both set\n");
- env->v7m.sfsr |= R_V7M_SFSR_LSERR_MASK;
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false);
- } else if (!env->v7m.secure && !nsacr_cp10) {
- qemu_log_mask(CPU_LOG_INT,
- "...Secure UsageFault with CFSR.NOCP because "
- "NSACR.CP10 prevents stacking FP regs\n");
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, M_REG_S);
- env->v7m.cfsr[M_REG_S] |= R_V7M_CFSR_NOCP_MASK;
- } else {
- if (!(env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_LSPEN_MASK)) {
- /* Lazy stacking disabled, save registers now */
- int i;
- bool cpacr_pass = v7m_cpacr_pass(env, env->v7m.secure,
- arm_current_el(env) != 0);
-
- if (stacked_ok && !cpacr_pass) {
- /*
- * Take UsageFault if CPACR forbids access. The pseudocode
- * here does a full CheckCPEnabled() but we know the NSACR
- * check can never fail as we have already handled that.
- */
- qemu_log_mask(CPU_LOG_INT,
- "...UsageFault with CFSR.NOCP because "
- "CPACR.CP10 prevents stacking FP regs\n");
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE,
- env->v7m.secure);
- env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_NOCP_MASK;
- stacked_ok = false;
- }
-
- for (i = 0; i < ((framesize == 0xa8) ? 32 : 16); i += 2) {
- uint64_t dn = *aa32_vfp_dreg(env, i / 2);
- uint32_t faddr = frameptr + 0x20 + 4 * i;
- uint32_t slo = extract64(dn, 0, 32);
- uint32_t shi = extract64(dn, 32, 32);
-
- if (i >= 16) {
- faddr += 8; /* skip the slot for the FPSCR and VPR */
- }
- stacked_ok = stacked_ok &&
- v7m_stack_write(cpu, faddr, slo,
- mmu_idx, STACK_NORMAL) &&
- v7m_stack_write(cpu, faddr + 4, shi,
- mmu_idx, STACK_NORMAL);
- }
- stacked_ok = stacked_ok &&
- v7m_stack_write(cpu, frameptr + 0x60,
- vfp_get_fpscr(env), mmu_idx, STACK_NORMAL);
- if (cpu_isar_feature(aa32_mve, cpu)) {
- stacked_ok = stacked_ok &&
- v7m_stack_write(cpu, frameptr + 0x64,
- env->v7m.vpr, mmu_idx, STACK_NORMAL);
- }
- if (cpacr_pass) {
- for (i = 0; i < ((framesize == 0xa8) ? 32 : 16); i += 2) {
- *aa32_vfp_dreg(env, i / 2) = 0;
- }
- vfp_set_fpscr(env, 0);
- if (cpu_isar_feature(aa32_mve, cpu)) {
- env->v7m.vpr = 0;
- }
- }
- } else {
- /* Lazy stacking enabled, save necessary info to stack later */
- v7m_update_fpccr(env, frameptr + 0x20, true);
- }
- }
- }
-
- /*
- * If we broke a stack limit then SP was already updated earlier;
- * otherwise we update SP regardless of whether any of the stack
- * accesses failed or we took some other kind of fault.
- */
- if (!limitviol) {
- env->regs[13] = frameptr;
- }
-
- return !stacked_ok;
-}
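/*
 * Editorial summary (an addition): the exception frame built above uses
 * these offsets from the final frame pointer:
 *   0x00 r0     0x04 r1    0x08 r2    0x0c r3
 *   0x10 r12    0x14 lr    0x18 return address   0x1c xPSR
 *   0x20..0x5f  s0..s15    (only when FP context is stacked)
 *   0x60 FPSCR  0x64 VPR   (VPR only with MVE)
 *   0x68..0xa7  s16..s31   (only for the 0xa8-byte secure TS frame)
 * which matches the 0x20, 0x68 and 0xa8 frame sizes chosen at the top of
 * the function.
 */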
-
-static void do_v7m_exception_exit(ARMCPU *cpu)
-{
- CPUARMState *env = &cpu->env;
- uint32_t excret;
- uint32_t xpsr, xpsr_mask;
- bool ufault = false;
- bool sfault = false;
- bool return_to_sp_process;
- bool return_to_handler;
- bool rettobase = false;
- bool exc_secure = false;
- bool return_to_secure;
- bool ftype;
- bool restore_s16_s31 = false;
-
- /*
- * If we're not in Handler mode then jumps to magic exception-exit
- * addresses don't have magic behaviour. However for the v8M
- * security extensions the magic secure-function-return has to
- * work in thread mode too, so to avoid doing an extra check in
- * the generated code we allow exception-exit magic to also cause the
- * internal exception and bring us here in thread mode. Correct code
- * will never try to do this (the following insn fetch will always
- * fault) so the overhead of having taken an unnecessary exception
- * doesn't matter.
- */
- if (!arm_v7m_is_handler_mode(env)) {
- return;
- }
-
- /*
- * In the spec pseudocode ExceptionReturn() is called directly
- * from BXWritePC() and gets the full target PC value including
- * bit zero. In QEMU's implementation we treat it as a normal
- * jump-to-register (which is then caught later on), and so split
- * the target value up between env->regs[15] and env->thumb in
- * gen_bx(). Reconstitute it.
- */
- excret = env->regs[15];
- if (env->thumb) {
- excret |= 1;
- }
-
- qemu_log_mask(CPU_LOG_INT, "Exception return: magic PC %" PRIx32
- " previous exception %d\n",
- excret, env->v7m.exception);
-
- if ((excret & R_V7M_EXCRET_RES1_MASK) != R_V7M_EXCRET_RES1_MASK) {
- qemu_log_mask(LOG_GUEST_ERROR, "M profile: zero high bits in exception "
- "exit PC value 0x%" PRIx32 " are UNPREDICTABLE\n",
- excret);
- }
-
- ftype = excret & R_V7M_EXCRET_FTYPE_MASK;
-
- if (!ftype && !cpu_isar_feature(aa32_vfp_simd, cpu)) {
- qemu_log_mask(LOG_GUEST_ERROR, "M profile: zero FTYPE in exception "
- "exit PC value 0x%" PRIx32 " is UNPREDICTABLE "
- "if FPU not present\n",
- excret);
- ftype = true;
- }
-
- if (arm_feature(env, ARM_FEATURE_M_SECURITY)) {
- /*
- * EXC_RETURN.ES validation check (R_SMFL). We must do this before
- * we pick which FAULTMASK to clear.
- */
- if (!env->v7m.secure &&
- ((excret & R_V7M_EXCRET_ES_MASK) ||
- !(excret & R_V7M_EXCRET_DCRS_MASK))) {
- sfault = 1;
- /* For all other purposes, treat ES as 0 (R_HXSR) */
- excret &= ~R_V7M_EXCRET_ES_MASK;
- }
- exc_secure = excret & R_V7M_EXCRET_ES_MASK;
- }
-
- if (env->v7m.exception != ARMV7M_EXCP_NMI) {
- /*
- * Auto-clear FAULTMASK on return from other than NMI.
- * If the security extension is implemented then this only
- * happens if the raw execution priority is >= 0; the
- * value of the ES bit in the exception return value indicates
- * which security state's faultmask to clear. (v8M ARM ARM R_KBNF.)
- */
- if (arm_feature(env, ARM_FEATURE_M_SECURITY)) {
- if (armv7m_nvic_raw_execution_priority(env->nvic) >= 0) {
- env->v7m.faultmask[exc_secure] = 0;
- }
- } else {
- env->v7m.faultmask[M_REG_NS] = 0;
- }
- }
-
- switch (armv7m_nvic_complete_irq(env->nvic, env->v7m.exception,
- exc_secure)) {
- case -1:
- /* attempt to exit an exception that isn't active */
- ufault = true;
- break;
- case 0:
- /* still an irq active now */
- break;
- case 1:
- /*
- * We returned to base exception level, no nesting.
- * (In the pseudocode this is written using "NestedActivation != 1"
- * where we have 'rettobase == false'.)
- */
- rettobase = true;
- break;
- default:
- g_assert_not_reached();
- }
-
- return_to_handler = !(excret & R_V7M_EXCRET_MODE_MASK);
- return_to_sp_process = excret & R_V7M_EXCRET_SPSEL_MASK;
- return_to_secure = arm_feature(env, ARM_FEATURE_M_SECURITY) &&
- (excret & R_V7M_EXCRET_S_MASK);
-
- if (arm_feature(env, ARM_FEATURE_V8)) {
- if (!arm_feature(env, ARM_FEATURE_M_SECURITY)) {
- /*
- * UNPREDICTABLE if S == 1 or DCRS == 0 or ES == 1 (R_XLCP);
- * we choose to take the UsageFault.
- */
- if ((excret & R_V7M_EXCRET_S_MASK) ||
- (excret & R_V7M_EXCRET_ES_MASK) ||
- !(excret & R_V7M_EXCRET_DCRS_MASK)) {
- ufault = true;
- }
- }
- if (excret & R_V7M_EXCRET_RES0_MASK) {
- ufault = true;
- }
- } else {
- /* For v7M we only recognize certain combinations of the low bits */
- switch (excret & 0xf) {
- case 1: /* Return to Handler */
- break;
- case 13: /* Return to Thread using Process stack */
- case 9: /* Return to Thread using Main stack */
- /*
- * We only need to check NONBASETHRDENA for v7M, because in
- * v8M this bit does not exist (it is RES1).
- */
- if (!rettobase &&
- !(env->v7m.ccr[env->v7m.secure] &
- R_V7M_CCR_NONBASETHRDENA_MASK)) {
- ufault = true;
- }
- break;
- default:
- ufault = true;
- }
- }
-
- /*
- * Set CONTROL.SPSEL from excret.SPSEL. Since we're still in
- * Handler mode (and will be until we write the new XPSR.Interrupt
- * field) this does not switch around the current stack pointer.
- * We must do this before we do any kind of tailchaining, including
- * for the derived exceptions on integrity check failures, or we will
- * give the guest an incorrect EXCRET.SPSEL value on exception entry.
- */
- write_v7m_control_spsel_for_secstate(env, return_to_sp_process, exc_secure);
-
- /*
- * Clear scratch FP values left in caller saved registers; this
- * must happen before any kind of tail chaining.
- */
- if ((env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_CLRONRET_MASK) &&
- (env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK)) {
- if (env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_LSPACT_MASK) {
- env->v7m.sfsr |= R_V7M_SFSR_LSERR_MASK;
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false);
- qemu_log_mask(CPU_LOG_INT, "...taking SecureFault on existing "
- "stackframe: error during lazy state deactivation\n");
- v7m_exception_taken(cpu, excret, true, false);
- return;
- } else {
- if (arm_feature(env, ARM_FEATURE_V8_1M)) {
- /* v8.1M adds this NOCP check */
- bool nsacr_pass = exc_secure ||
- extract32(env->v7m.nsacr, 10, 1);
- bool cpacr_pass = v7m_cpacr_pass(env, exc_secure, true);
- if (!nsacr_pass) {
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, true);
- env->v7m.cfsr[M_REG_S] |= R_V7M_CFSR_NOCP_MASK;
- qemu_log_mask(CPU_LOG_INT, "...taking UsageFault on existing "
- "stackframe: NSACR prevents clearing FPU registers\n");
- v7m_exception_taken(cpu, excret, true, false);
- return;
- } else if (!cpacr_pass) {
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE,
- exc_secure);
- env->v7m.cfsr[exc_secure] |= R_V7M_CFSR_NOCP_MASK;
- qemu_log_mask(CPU_LOG_INT, "...taking UsageFault on existing "
- "stackframe: CPACR prevents clearing FPU registers\n");
- v7m_exception_taken(cpu, excret, true, false);
- return;
- }
- }
- /* Clear s0..s15, FPSCR and VPR */
- int i;
-
- for (i = 0; i < 16; i += 2) {
- *aa32_vfp_dreg(env, i / 2) = 0;
- }
- vfp_set_fpscr(env, 0);
- if (cpu_isar_feature(aa32_mve, cpu)) {
- env->v7m.vpr = 0;
- }
- }
- }
-
- if (sfault) {
- env->v7m.sfsr |= R_V7M_SFSR_INVER_MASK;
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false);
- qemu_log_mask(CPU_LOG_INT, "...taking SecureFault on existing "
- "stackframe: failed EXC_RETURN.ES validity check\n");
- v7m_exception_taken(cpu, excret, true, false);
- return;
- }
-
- if (ufault) {
- /*
- * Bad exception return: instead of popping the exception
- * stack, directly take a usage fault on the current stack.
- */
- env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_INVPC_MASK;
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure);
- qemu_log_mask(CPU_LOG_INT, "...taking UsageFault on existing "
- "stackframe: failed exception return integrity check\n");
- v7m_exception_taken(cpu, excret, true, false);
- return;
- }
-
- /*
- * Tailchaining: if there is currently a pending exception that
- * is high enough priority to preempt execution at the level we're
- * about to return to, then just directly take that exception now,
- * avoiding an unstack-and-then-stack. Note that now we have
- * deactivated the previous exception by calling armv7m_nvic_complete_irq(),
- * our current execution priority is already the execution priority we are
- * returning to -- none of the state we would unstack or set based on
- * the EXCRET value affects it.
- */
- if (armv7m_nvic_can_take_pending_exception(env->nvic)) {
- qemu_log_mask(CPU_LOG_INT, "...tailchaining to pending exception\n");
- v7m_exception_taken(cpu, excret, true, false);
- return;
- }
-
- switch_v7m_security_state(env, return_to_secure);
-
- {
- /*
- * The stack pointer we should be reading the exception frame from
- * depends on bits in the magic exception return type value (and
- * for v8M isn't necessarily the stack pointer we will eventually
- * end up resuming execution with). Get a pointer to the location
- * in the CPU state struct where the SP we need is currently being
- * stored; we will use and modify it in place.
- * We use this limited C variable scope so we don't accidentally
- * use 'frame_sp_p' after we do something that makes it invalid.
- */
- bool spsel = env->v7m.control[return_to_secure] & R_V7M_CONTROL_SPSEL_MASK;
- uint32_t *frame_sp_p = get_v7m_sp_ptr(env,
- return_to_secure,
- !return_to_handler,
- spsel);
- uint32_t frameptr = *frame_sp_p;
- bool pop_ok = true;
- ARMMMUIdx mmu_idx;
- bool return_to_priv = return_to_handler ||
- !(env->v7m.control[return_to_secure] & R_V7M_CONTROL_NPRIV_MASK);
-
- mmu_idx = arm_v7m_mmu_idx_for_secstate_and_priv(env, return_to_secure,
- return_to_priv);
-
- if (!QEMU_IS_ALIGNED(frameptr, 8) &&
- arm_feature(env, ARM_FEATURE_V8)) {
- qemu_log_mask(LOG_GUEST_ERROR,
- "M profile exception return with non-8-aligned SP "
- "for destination state is UNPREDICTABLE\n");
- }
-
- /* Do we need to pop callee-saved registers? */
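- /*
- * The extended (callee-saved) frame holds the integrity signature at
- * offset 0, a reserved word at +0x4 and R4-R11 at +0x8..+0x24,
- * 0x28 bytes in all.
- */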
- if (return_to_secure &&
- ((excret & R_V7M_EXCRET_ES_MASK) == 0 ||
- (excret & R_V7M_EXCRET_DCRS_MASK) == 0)) {
- uint32_t actual_sig;
-
- pop_ok = v7m_stack_read(cpu, &actual_sig, frameptr, mmu_idx);
-
- if (pop_ok && v7m_integrity_sig(env, excret) != actual_sig) {
- /* Take a SecureFault on the current stack */
- env->v7m.sfsr |= R_V7M_SFSR_INVIS_MASK;
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false);
- qemu_log_mask(CPU_LOG_INT, "...taking SecureFault on existing "
- "stackframe: failed exception return integrity "
- "signature check\n");
- v7m_exception_taken(cpu, excret, true, false);
- return;
- }
-
- pop_ok = pop_ok &&
- v7m_stack_read(cpu, &env->regs[4], frameptr + 0x8, mmu_idx) &&
- v7m_stack_read(cpu, &env->regs[5], frameptr + 0xc, mmu_idx) &&
- v7m_stack_read(cpu, &env->regs[6], frameptr + 0x10, mmu_idx) &&
- v7m_stack_read(cpu, &env->regs[7], frameptr + 0x14, mmu_idx) &&
- v7m_stack_read(cpu, &env->regs[8], frameptr + 0x18, mmu_idx) &&
- v7m_stack_read(cpu, &env->regs[9], frameptr + 0x1c, mmu_idx) &&
- v7m_stack_read(cpu, &env->regs[10], frameptr + 0x20, mmu_idx) &&
- v7m_stack_read(cpu, &env->regs[11], frameptr + 0x24, mmu_idx);
-
- frameptr += 0x28;
- }
-
- /* Pop registers */
- pop_ok = pop_ok &&
- v7m_stack_read(cpu, &env->regs[0], frameptr, mmu_idx) &&
- v7m_stack_read(cpu, &env->regs[1], frameptr + 0x4, mmu_idx) &&
- v7m_stack_read(cpu, &env->regs[2], frameptr + 0x8, mmu_idx) &&
- v7m_stack_read(cpu, &env->regs[3], frameptr + 0xc, mmu_idx) &&
- v7m_stack_read(cpu, &env->regs[12], frameptr + 0x10, mmu_idx) &&
- v7m_stack_read(cpu, &env->regs[14], frameptr + 0x14, mmu_idx) &&
- v7m_stack_read(cpu, &env->regs[15], frameptr + 0x18, mmu_idx) &&
- v7m_stack_read(cpu, &xpsr, frameptr + 0x1c, mmu_idx);
-
- if (!pop_ok) {
- /*
- * v7m_stack_read() pended a fault, so take it (as a tail
- * chained exception on the same stack frame)
- */
- qemu_log_mask(CPU_LOG_INT, "...derived exception on unstacking\n");
- v7m_exception_taken(cpu, excret, true, false);
- return;
- }
-
- /*
- * Returning from an exception with a PC with bit 0 set is defined
- * behaviour on v8M (bit 0 is ignored), but for v7M it was specified
- * to be UNPREDICTABLE. In practice actual v7M hardware seems to ignore
- * the lsbit, and there are several RTOSes out there which incorrectly
- * assume the r15 in the stack frame should be a Thumb-style "lsbit
- * indicates ARM/Thumb" value, so ignore the bit on v7M as well, but
- * complain about the badly behaved guest.
- */
- if (env->regs[15] & 1) {
- env->regs[15] &= ~1U;
- if (!arm_feature(env, ARM_FEATURE_V8)) {
- qemu_log_mask(LOG_GUEST_ERROR,
- "M profile return from interrupt with misaligned "
- "PC is UNPREDICTABLE on v7M\n");
- }
- }
-
- if (arm_feature(env, ARM_FEATURE_V8)) {
- /*
- * For v8M we have to check whether the xPSR exception field
- * matches the EXCRET value for return to handler/thread
- * before we commit to changing the SP and xPSR.
- */
- bool will_be_handler = (xpsr & XPSR_EXCP) != 0;
- if (return_to_handler != will_be_handler) {
- /*
- * Take an INVPC UsageFault on the current stack.
- * By this point we will have switched to the security state
- * for the background state, so this UsageFault will target
- * that state.
- */
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE,
- env->v7m.secure);
- env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_INVPC_MASK;
- qemu_log_mask(CPU_LOG_INT, "...taking UsageFault on existing "
- "stackframe: failed exception return integrity "
- "check\n");
- v7m_exception_taken(cpu, excret, true, false);
- return;
- }
- }
-
- if (!ftype) {
- /* FP present and we need to handle it */
- if (!return_to_secure &&
- (env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_LSPACT_MASK)) {
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false);
- env->v7m.sfsr |= R_V7M_SFSR_LSERR_MASK;
- qemu_log_mask(CPU_LOG_INT,
- "...taking SecureFault on existing stackframe: "
- "Secure LSPACT set but exception return is "
- "not to secure state\n");
- v7m_exception_taken(cpu, excret, true, false);
- return;
- }
-
- restore_s16_s31 = return_to_secure &&
- (env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_TS_MASK);
-
- if (env->v7m.fpccr[return_to_secure] & R_V7M_FPCCR_LSPACT_MASK) {
- /* State in FPU is still valid, just clear LSPACT */
- env->v7m.fpccr[return_to_secure] &= ~R_V7M_FPCCR_LSPACT_MASK;
- } else {
- int i;
- uint32_t fpscr;
- bool cpacr_pass, nsacr_pass;
-
- cpacr_pass = v7m_cpacr_pass(env, return_to_secure,
- return_to_priv);
- nsacr_pass = return_to_secure ||
- extract32(env->v7m.nsacr, 10, 1);
-
- if (!cpacr_pass) {
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE,
- return_to_secure);
- env->v7m.cfsr[return_to_secure] |= R_V7M_CFSR_NOCP_MASK;
- qemu_log_mask(CPU_LOG_INT,
- "...taking UsageFault on existing "
- "stackframe: CPACR.CP10 prevents unstacking "
- "FP regs\n");
- v7m_exception_taken(cpu, excret, true, false);
- return;
- } else if (!nsacr_pass) {
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, true);
- env->v7m.cfsr[M_REG_S] |= R_V7M_CFSR_INVPC_MASK;
- qemu_log_mask(CPU_LOG_INT,
- "...taking Secure UsageFault on existing "
- "stackframe: NSACR.CP10 prevents unstacking "
- "FP regs\n");
- v7m_exception_taken(cpu, excret, true, false);
- return;
- }
-
- for (i = 0; i < (restore_s16_s31 ? 32 : 16); i += 2) {
- uint32_t slo, shi;
- uint64_t dn;
- uint32_t faddr = frameptr + 0x20 + 4 * i;
-
- if (i >= 16) {
- faddr += 8; /* Skip the slot for the FPSCR and VPR */
- }
-
- pop_ok = pop_ok &&
- v7m_stack_read(cpu, &slo, faddr, mmu_idx) &&
- v7m_stack_read(cpu, &shi, faddr + 4, mmu_idx);
-
- if (!pop_ok) {
- break;
- }
-
- dn = (uint64_t)shi << 32 | slo;
- *aa32_vfp_dreg(env, i / 2) = dn;
- }
- pop_ok = pop_ok &&
- v7m_stack_read(cpu, &fpscr, frameptr + 0x60, mmu_idx);
- if (pop_ok) {
- vfp_set_fpscr(env, fpscr);
- }
- if (cpu_isar_feature(aa32_mve, cpu)) {
- pop_ok = pop_ok &&
- v7m_stack_read(cpu, &env->v7m.vpr,
- frameptr + 0x64, mmu_idx);
- }
- if (!pop_ok) {
- /*
- * These regs are 0 if the security extension is present;
- * otherwise merely UNKNOWN. We always zero them.
- */
- for (i = 0; i < (restore_s16_s31 ? 32 : 16); i += 2) {
- *aa32_vfp_dreg(env, i / 2) = 0;
- }
- vfp_set_fpscr(env, 0);
- if (cpu_isar_feature(aa32_mve, cpu)) {
- env->v7m.vpr = 0;
- }
- }
- }
- }
- env->v7m.control[M_REG_S] = FIELD_DP32(env->v7m.control[M_REG_S],
- V7M_CONTROL, FPCA, !ftype);
-
- /* Commit to consuming the stack frame */
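- /*
- * Sizes: 0x20 for the basic 8-word frame, a further 0x48 for the
- * FP extension area (s0..s15, FPSCR and the VPR/reserved word),
- * and another 0x40 if s16..s31 were also stacked.
- */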
- frameptr += 0x20;
- if (!ftype) {
- frameptr += 0x48;
- if (restore_s16_s31) {
- frameptr += 0x40;
- }
- }
- /*
- * Undo stack alignment (the SPREALIGN bit indicates that the original
- * pre-exception SP was not 8-aligned and we added a padding word to
- * align it, so we undo this by ORing in the bit that increases it
- * from the current 8-aligned value to the 8-unaligned value. (Adding 4
- * would work too but a logical OR is how the pseudocode specifies it.)
- */
- if (xpsr & XPSR_SPREALIGN) {
- frameptr |= 4;
- }
- *frame_sp_p = frameptr;
- }
-
- xpsr_mask = ~(XPSR_SPREALIGN | XPSR_SFPA);
- if (!arm_feature(env, ARM_FEATURE_THUMB_DSP)) {
- xpsr_mask &= ~XPSR_GE;
- }
- /* This xpsr_write() will invalidate frame_sp_p as it may switch stack */
- xpsr_write(env, xpsr, xpsr_mask);
-
- if (env->v7m.secure) {
- bool sfpa = xpsr & XPSR_SFPA;
-
- env->v7m.control[M_REG_S] = FIELD_DP32(env->v7m.control[M_REG_S],
- V7M_CONTROL, SFPA, sfpa);
- }
-
- /*
- * The restored xPSR exception field will be zero if we're
- * resuming in Thread mode. If that doesn't match what the
- * exception return excret specified then this is a UsageFault.
- * v7M requires we make this check here; v8M did it earlier.
- */
- if (return_to_handler != arm_v7m_is_handler_mode(env)) {
- /*
- * Take an INVPC UsageFault by pushing the stack again;
- * we know we're v7M so this is never a Secure UsageFault.
- */
- bool ignore_stackfaults;
-
- assert(!arm_feature(env, ARM_FEATURE_V8));
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, false);
- env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_INVPC_MASK;
- ignore_stackfaults = v7m_push_stack(cpu);
- qemu_log_mask(CPU_LOG_INT, "...taking UsageFault on new stackframe: "
- "failed exception return integrity check\n");
- v7m_exception_taken(cpu, excret, false, ignore_stackfaults);
- return;
- }
-
- /* Otherwise, we have a successful exception exit. */
- arm_clear_exclusive(env);
- arm_rebuild_hflags(env);
- qemu_log_mask(CPU_LOG_INT, "...successful exception return\n");
-}
-
-static bool do_v7m_function_return(ARMCPU *cpu)
-{
- /*
- * v8M security extensions magic function return.
- * We may either:
- * (1) throw an exception (longjump)
- * (2) return true if we successfully handled the function return
- * (3) return false if we failed a consistency check and have
- * pended a UsageFault that needs to be taken now
- *
- * At this point the magic return value is split between env->regs[15]
- * and env->thumb. We don't bother to reconstitute it because we don't
- * need it (all values are handled the same way).
- */
- CPUARMState *env = &cpu->env;
- uint32_t newpc, newpsr, newpsr_exc;
-
- qemu_log_mask(CPU_LOG_INT, "...really v7M secure function return\n");
-
- {
- bool threadmode, spsel;
- MemOpIdx oi;
- ARMMMUIdx mmu_idx;
- uint32_t *frame_sp_p;
- uint32_t frameptr;
-
- /* Pull the return address and IPSR from the Secure stack */
- threadmode = !arm_v7m_is_handler_mode(env);
- spsel = env->v7m.control[M_REG_S] & R_V7M_CONTROL_SPSEL_MASK;
-
- frame_sp_p = get_v7m_sp_ptr(env, true, threadmode, spsel);
- frameptr = *frame_sp_p;
-
- /*
- * These loads may throw an exception (for MPU faults). We want to
- * do them as secure, so work out what MMU index that is.
- */
- mmu_idx = arm_v7m_mmu_idx_for_secstate(env, true);
- oi = make_memop_idx(MO_LEUL, arm_to_core_mmu_idx(mmu_idx));
- newpc = cpu_ldl_le_mmu(env, frameptr, oi, 0);
- newpsr = cpu_ldl_le_mmu(env, frameptr + 4, oi, 0);
-
- /* Consistency checks on new IPSR */
- newpsr_exc = newpsr & XPSR_EXCP;
- if (!((env->v7m.exception == 0 && newpsr_exc == 0) ||
- (env->v7m.exception == 1 && newpsr_exc != 0))) {
- /* Pend the fault and tell our caller to take it */
- env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_INVPC_MASK;
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE,
- env->v7m.secure);
- qemu_log_mask(CPU_LOG_INT,
- "...taking INVPC UsageFault: "
- "IPSR consistency check failed\n");
- return false;
- }
-
- *frame_sp_p = frameptr + 8;
- }
-
- /* This invalidates frame_sp_p */
- switch_v7m_security_state(env, true);
- env->v7m.exception = newpsr_exc;
- env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_SFPA_MASK;
- if (newpsr & XPSR_SFPA) {
- env->v7m.control[M_REG_S] |= R_V7M_CONTROL_SFPA_MASK;
- }
- xpsr_write(env, 0, XPSR_IT);
- env->thumb = newpc & 1;
- env->regs[15] = newpc & ~1;
- arm_rebuild_hflags(env);
-
- qemu_log_mask(CPU_LOG_INT, "...function return successful\n");
- return true;
-}
-
-static bool v7m_read_half_insn(ARMCPU *cpu, ARMMMUIdx mmu_idx, bool secure,
- uint32_t addr, uint16_t *insn)
-{
- /*
- * Load a 16-bit portion of a v7M instruction, returning true on success,
- * or false on failure (in which case we will have pended the appropriate
- * exception).
- * We need to do the instruction fetch's MPU and SAU checks
- * like this because there is no MMU index that would allow
- * doing the load with a single function call. Instead we must
- * first check that the security attributes permit the load
- * and that they don't mismatch on the two halves of the instruction,
- * and then we do the load as a secure load (ie using the security
- * attributes of the address, not the CPU, as architecturally required).
- */
- CPUState *cs = CPU(cpu);
- CPUARMState *env = &cpu->env;
- V8M_SAttributes sattrs = {};
- GetPhysAddrResult res = {};
- ARMMMUFaultInfo fi = {};
- MemTxResult txres;
-
- v8m_security_lookup(env, addr, MMU_INST_FETCH, mmu_idx, secure, &sattrs);
- if (!sattrs.nsc || sattrs.ns) {
- /*
- * This must be the second half of the insn, and it straddles a
- * region boundary with the second half not being S&NSC.
- */
- env->v7m.sfsr |= R_V7M_SFSR_INVEP_MASK;
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false);
- qemu_log_mask(CPU_LOG_INT,
- "...really SecureFault with SFSR.INVEP\n");
- return false;
- }
- if (get_phys_addr(env, addr, MMU_INST_FETCH, mmu_idx, &res, &fi)) {
- /* the MPU lookup failed */
- env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_IACCVIOL_MASK;
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_MEM, env->v7m.secure);
- qemu_log_mask(CPU_LOG_INT, "...really MemManage with CFSR.IACCVIOL\n");
- return false;
- }
- *insn = address_space_lduw_le(arm_addressspace(cs, res.f.attrs),
- res.f.phys_addr, res.f.attrs, &txres);
- if (txres != MEMTX_OK) {
- env->v7m.cfsr[M_REG_NS] |= R_V7M_CFSR_IBUSERR_MASK;
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_BUS, false);
- qemu_log_mask(CPU_LOG_INT, "...really BusFault with CFSR.IBUSERR\n");
- return false;
- }
- return true;
-}
-
-static bool v7m_read_sg_stack_word(ARMCPU *cpu, ARMMMUIdx mmu_idx,
- uint32_t addr, uint32_t *spdata)
-{
- /*
- * Read a word of data from the stack for the SG instruction,
- * writing the value into *spdata. If the load succeeds, return
- * true; otherwise pend an appropriate exception and return false.
- * (We can't use data load helpers here that throw an exception
- * because of the context we're called in, which is halfway through
- * arm_v7m_cpu_do_interrupt().)
- */
- CPUState *cs = CPU(cpu);
- CPUARMState *env = &cpu->env;
- MemTxResult txres;
- GetPhysAddrResult res = {};
- ARMMMUFaultInfo fi = {};
- uint32_t value;
-
- if (get_phys_addr(env, addr, MMU_DATA_LOAD, mmu_idx, &res, &fi)) {
- /* MPU/SAU lookup failed */
- if (fi.type == ARMFault_QEMU_SFault) {
- qemu_log_mask(CPU_LOG_INT,
- "...SecureFault during stack word read\n");
- env->v7m.sfsr |= R_V7M_SFSR_AUVIOL_MASK | R_V7M_SFSR_SFARVALID_MASK;
- env->v7m.sfar = addr;
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false);
- } else {
- qemu_log_mask(CPU_LOG_INT,
- "...MemManageFault during stack word read\n");
- env->v7m.cfsr[M_REG_S] |= R_V7M_CFSR_DACCVIOL_MASK |
- R_V7M_CFSR_MMARVALID_MASK;
- env->v7m.mmfar[M_REG_S] = addr;
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_MEM, false);
- }
- return false;
- }
- value = address_space_ldl(arm_addressspace(cs, res.f.attrs),
- res.f.phys_addr, res.f.attrs, &txres);
- if (txres != MEMTX_OK) {
- /* BusFault trying to read the data */
- qemu_log_mask(CPU_LOG_INT,
- "...BusFault during stack word read\n");
- env->v7m.cfsr[M_REG_NS] |=
- (R_V7M_CFSR_PRECISERR_MASK | R_V7M_CFSR_BFARVALID_MASK);
- env->v7m.bfar = addr;
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_BUS, false);
- return false;
- }
-
- *spdata = value;
- return true;
-}
-
-static bool v7m_handle_execute_nsc(ARMCPU *cpu)
-{
- /*
- * Check whether this attempt to execute code in a Secure & NS-Callable
- * memory region is for an SG instruction; if so, then emulate the
- * effect of the SG instruction and return true. Otherwise pend
- * the correct kind of exception and return false.
- */
- CPUARMState *env = &cpu->env;
- ARMMMUIdx mmu_idx;
- uint16_t insn;
-
- /*
- * We should never get here unless get_phys_addr_pmsav8() caused
- * an exception for NS executing in S&NSC memory.
- */
- assert(!env->v7m.secure);
- assert(arm_feature(env, ARM_FEATURE_M_SECURITY));
-
- /* We want to do the MPU lookup as secure; work out what mmu_idx that is */
- mmu_idx = arm_v7m_mmu_idx_for_secstate(env, true);
-
- if (!v7m_read_half_insn(cpu, mmu_idx, true, env->regs[15], &insn)) {
- return false;
- }
-
- if (!env->thumb) {
- goto gen_invep;
- }
-
- if (insn != 0xe97f) {
- /*
- * Not an SG instruction first half (we choose the IMPDEF
- * early-SG-check option).
- */
- goto gen_invep;
- }
-
- if (!v7m_read_half_insn(cpu, mmu_idx, true, env->regs[15] + 2, &insn)) {
- return false;
- }
-
- if (insn != 0xe97f) {
- /*
- * Not an SG instruction second half (yes, both halves of the SG
- * insn have the same hex value)
- */
- goto gen_invep;
- }
-
- /*
- * OK, we have confirmed that we really have an SG instruction.
- * We know we're NS in S memory so don't need to repeat those checks.
- */
- qemu_log_mask(CPU_LOG_INT, "...really an SG instruction at 0x%08" PRIx32
- ", executing it\n", env->regs[15]);
-
- if (cpu_isar_feature(aa32_m_sec_state, cpu) &&
- !arm_v7m_is_handler_mode(env)) {
- /*
- * v8.1M exception stack frame integrity check. Note that we
- * must perform the memory access even if CCR_S.TRD is zero
- * and we aren't going to check what the data loaded is.
- */
- uint32_t spdata, sp;
-
- /*
- * We know we are currently NS, so the S stack pointers must be
- * in other_ss_{psp,msp}, not in regs[13]/other_sp.
- */
- sp = v7m_using_psp(env) ? env->v7m.other_ss_psp : env->v7m.other_ss_msp;
- if (!v7m_read_sg_stack_word(cpu, mmu_idx, sp, &spdata)) {
- /* Stack access failed and an exception has been pended */
- return false;
- }
-
- if (env->v7m.ccr[M_REG_S] & R_V7M_CCR_TRD_MASK) {
- if (((spdata & ~1) == 0xfefa125a) ||
- !(env->v7m.control[M_REG_S] & 1)) {
- goto gen_invep;
- }
- }
- }
-
- env->regs[14] &= ~1;
- env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_SFPA_MASK;
- switch_v7m_security_state(env, true);
- xpsr_write(env, 0, XPSR_IT);
- env->regs[15] += 4;
- arm_rebuild_hflags(env);
- return true;
-
-gen_invep:
- env->v7m.sfsr |= R_V7M_SFSR_INVEP_MASK;
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false);
- qemu_log_mask(CPU_LOG_INT,
- "...really SecureFault with SFSR.INVEP\n");
- return false;
-}
-
-void arm_v7m_cpu_do_interrupt(CPUState *cs)
-{
- ARMCPU *cpu = ARM_CPU(cs);
- CPUARMState *env = &cpu->env;
- uint32_t lr;
- bool ignore_stackfaults;
-
- arm_log_exception(cs);
-
- /*
- * For exceptions we just mark as pending on the NVIC, and let that
- * handle it.
- */
- switch (cs->exception_index) {
- case EXCP_UDEF:
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure);
- env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_UNDEFINSTR_MASK;
- break;
- case EXCP_NOCP:
- {
- /*
- * NOCP might be directed to something other than the current
- * security state if this fault is because of NSACR; we indicate
- * the target security state using exception.target_el.
- */
- int target_secstate;
-
- if (env->exception.target_el == 3) {
- target_secstate = M_REG_S;
- } else {
- target_secstate = env->v7m.secure;
- }
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, target_secstate);
- env->v7m.cfsr[target_secstate] |= R_V7M_CFSR_NOCP_MASK;
- break;
- }
- case EXCP_INVSTATE:
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure);
- env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_INVSTATE_MASK;
- break;
- case EXCP_STKOF:
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure);
- env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_STKOF_MASK;
- break;
- case EXCP_LSERR:
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false);
- env->v7m.sfsr |= R_V7M_SFSR_LSERR_MASK;
- break;
- case EXCP_UNALIGNED:
- /* Unaligned faults reported by M-profile aware code */
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure);
- env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_UNALIGNED_MASK;
- break;
- case EXCP_DIVBYZERO:
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure);
- env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_DIVBYZERO_MASK;
- break;
- case EXCP_SWI:
- /* The PC already points to the next instruction. */
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SVC, env->v7m.secure);
- break;
- case EXCP_PREFETCH_ABORT:
- case EXCP_DATA_ABORT:
- /*
- * Note that for M profile we don't have a guest facing FSR, but
- * the env->exception.fsr will be populated by the code that
- * raises the fault, in the A profile short-descriptor format.
- *
- * Log the exception.vaddress now regardless of subtype, because
- * logging below only logs it when it goes into a guest visible
- * register.
- */
- qemu_log_mask(CPU_LOG_INT, "...at fault address 0x%x\n",
- (uint32_t)env->exception.vaddress);
- switch (env->exception.fsr & 0xf) {
- case M_FAKE_FSR_NSC_EXEC:
- /*
- * Exception generated when we try to execute code at an address
- * which is marked as Secure & Non-Secure Callable and the CPU
- * is in the Non-Secure state. The only instruction which can
- * be executed like this is SG (and that only if both halves of
- * the SG instruction have the same security attributes.)
- * Everything else must generate an INVEP SecureFault, so we
- * emulate the SG instruction here.
- */
- if (v7m_handle_execute_nsc(cpu)) {
- return;
- }
- break;
- case M_FAKE_FSR_SFAULT:
- /*
- * Various flavours of SecureFault for attempts to execute or
- * access data in the wrong security state.
- */
- switch (cs->exception_index) {
- case EXCP_PREFETCH_ABORT:
- if (env->v7m.secure) {
- env->v7m.sfsr |= R_V7M_SFSR_INVTRAN_MASK;
- qemu_log_mask(CPU_LOG_INT,
- "...really SecureFault with SFSR.INVTRAN\n");
- } else {
- env->v7m.sfsr |= R_V7M_SFSR_INVEP_MASK;
- qemu_log_mask(CPU_LOG_INT,
- "...really SecureFault with SFSR.INVEP\n");
- }
- break;
- case EXCP_DATA_ABORT:
- /* This must be an NS access to S memory */
- env->v7m.sfsr |= R_V7M_SFSR_AUVIOL_MASK;
- qemu_log_mask(CPU_LOG_INT,
- "...really SecureFault with SFSR.AUVIOL\n");
- break;
- }
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false);
- break;
- case 0x8: /* External Abort */
- switch (cs->exception_index) {
- case EXCP_PREFETCH_ABORT:
- env->v7m.cfsr[M_REG_NS] |= R_V7M_CFSR_IBUSERR_MASK;
- qemu_log_mask(CPU_LOG_INT, "...with CFSR.IBUSERR\n");
- break;
- case EXCP_DATA_ABORT:
- env->v7m.cfsr[M_REG_NS] |=
- (R_V7M_CFSR_PRECISERR_MASK | R_V7M_CFSR_BFARVALID_MASK);
- env->v7m.bfar = env->exception.vaddress;
- qemu_log_mask(CPU_LOG_INT,
- "...with CFSR.PRECISERR and BFAR 0x%x\n",
- env->v7m.bfar);
- break;
- }
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_BUS, false);
- break;
- case 0x1: /* Alignment fault reported by generic code */
- qemu_log_mask(CPU_LOG_INT,
- "...really UsageFault with UFSR.UNALIGNED\n");
- env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_UNALIGNED_MASK;
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE,
- env->v7m.secure);
- break;
- default:
- /*
- * All other FSR values are either MPU faults or "can't happen
- * for M profile" cases.
- */
- switch (cs->exception_index) {
- case EXCP_PREFETCH_ABORT:
- env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_IACCVIOL_MASK;
- qemu_log_mask(CPU_LOG_INT, "...with CFSR.IACCVIOL\n");
- break;
- case EXCP_DATA_ABORT:
- env->v7m.cfsr[env->v7m.secure] |=
- (R_V7M_CFSR_DACCVIOL_MASK | R_V7M_CFSR_MMARVALID_MASK);
- env->v7m.mmfar[env->v7m.secure] = env->exception.vaddress;
- qemu_log_mask(CPU_LOG_INT,
- "...with CFSR.DACCVIOL and MMFAR 0x%x\n",
- env->v7m.mmfar[env->v7m.secure]);
- break;
- }
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_MEM,
- env->v7m.secure);
- break;
- }
- break;
- case EXCP_SEMIHOST:
- qemu_log_mask(CPU_LOG_INT,
- "...handling as semihosting call 0x%x\n",
- env->regs[0]);
-#ifdef CONFIG_TCG
- do_common_semihosting(cs);
-#else
- g_assert_not_reached();
-#endif
- env->regs[15] += env->thumb ? 2 : 4;
- return;
- case EXCP_BKPT:
- armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_DEBUG, false);
- break;
- case EXCP_IRQ:
- break;
- case EXCP_EXCEPTION_EXIT:
- if (env->regs[15] < EXC_RETURN_MIN_MAGIC) {
- /* Must be v8M security extension function return */
- assert(env->regs[15] >= FNC_RETURN_MIN_MAGIC);
- assert(arm_feature(env, ARM_FEATURE_M_SECURITY));
- if (do_v7m_function_return(cpu)) {
- return;
- }
- } else {
- do_v7m_exception_exit(cpu);
- return;
- }
- break;
- case EXCP_LAZYFP:
- /*
- * We already pended the specific exception in the NVIC in the
- * v7m_preserve_fp_state() helper function.
- */
- break;
- default:
- cpu_abort(cs, "Unhandled exception 0x%x\n", cs->exception_index);
- return; /* Never happens. Keep compiler happy. */
- }
-
- if (arm_feature(env, ARM_FEATURE_V8)) {
- lr = R_V7M_EXCRET_RES1_MASK |
- R_V7M_EXCRET_DCRS_MASK;
- /*
- * The S bit indicates whether we should return to Secure
- * or NonSecure (ie our current state).
- * The ES bit indicates whether we're taking this exception
- * to Secure or NonSecure (ie our target state). We set it
- * later, in v7m_exception_taken().
- * The SPSEL bit is also set in v7m_exception_taken() for v8M.
- * This corresponds to the ARM ARM pseudocode for v8M setting
- * some LR bits in PushStack() and some in ExceptionTaken();
- * the distinction matters for the tailchain cases where we
- * can take an exception without pushing the stack.
- */
- if (env->v7m.secure) {
- lr |= R_V7M_EXCRET_S_MASK;
- }
- } else {
- lr = R_V7M_EXCRET_RES1_MASK |
- R_V7M_EXCRET_S_MASK |
- R_V7M_EXCRET_DCRS_MASK |
- R_V7M_EXCRET_ES_MASK;
- if (env->v7m.control[M_REG_NS] & R_V7M_CONTROL_SPSEL_MASK) {
- lr |= R_V7M_EXCRET_SPSEL_MASK;
- }
- }
- if (!(env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK)) {
- lr |= R_V7M_EXCRET_FTYPE_MASK;
- }
- if (!arm_v7m_is_handler_mode(env)) {
- lr |= R_V7M_EXCRET_MODE_MASK;
- }
-
- ignore_stackfaults = v7m_push_stack(cpu);
- v7m_exception_taken(cpu, lr, false, ignore_stackfaults);
-}
-
-uint32_t HELPER(v7m_mrs)(CPUARMState *env, uint32_t reg)
-{
- unsigned el = arm_current_el(env);
-
- /* First handle registers which unprivileged can read */
- switch (reg) {
- case 0 ... 7: /* xPSR sub-fields */
- return v7m_mrs_xpsr(env, reg, el);
- case 20: /* CONTROL */
- return v7m_mrs_control(env, env->v7m.secure);
- case 0x94: /* CONTROL_NS */
- /*
- * We have to handle this here because unprivileged Secure code
- * can read the NS CONTROL register.
- */
- if (!env->v7m.secure) {
- return 0;
- }
- return env->v7m.control[M_REG_NS] |
- (env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK);
- }
-
- if (el == 0) {
- return 0; /* unprivileged reads others as zero */
- }
-
- if (arm_feature(env, ARM_FEATURE_M_SECURITY)) {
- switch (reg) {
- case 0x88: /* MSP_NS */
- if (!env->v7m.secure) {
- return 0;
- }
- return env->v7m.other_ss_msp;
- case 0x89: /* PSP_NS */
- if (!env->v7m.secure) {
- return 0;
- }
- return env->v7m.other_ss_psp;
- case 0x8a: /* MSPLIM_NS */
- if (!env->v7m.secure) {
- return 0;
- }
- return env->v7m.msplim[M_REG_NS];
- case 0x8b: /* PSPLIM_NS */
- if (!env->v7m.secure) {
- return 0;
- }
- return env->v7m.psplim[M_REG_NS];
- case 0x90: /* PRIMASK_NS */
- if (!env->v7m.secure) {
- return 0;
- }
- return env->v7m.primask[M_REG_NS];
- case 0x91: /* BASEPRI_NS */
- if (!arm_feature(env, ARM_FEATURE_M_MAIN)) {
- goto bad_reg;
- }
- if (!env->v7m.secure) {
- return 0;
- }
- return env->v7m.basepri[M_REG_NS];
- case 0x93: /* FAULTMASK_NS */
- if (!arm_feature(env, ARM_FEATURE_M_MAIN)) {
- goto bad_reg;
- }
- if (!env->v7m.secure) {
- return 0;
- }
- return env->v7m.faultmask[M_REG_NS];
- case 0x98: /* SP_NS */
- {
- /*
- * This gives the non-secure SP selected based on whether we're
- * currently in handler mode or not, using the NS CONTROL.SPSEL.
- */
- bool spsel = env->v7m.control[M_REG_NS] & R_V7M_CONTROL_SPSEL_MASK;
-
- if (!env->v7m.secure) {
- return 0;
- }
- if (!arm_v7m_is_handler_mode(env) && spsel) {
- return env->v7m.other_ss_psp;
- } else {
- return env->v7m.other_ss_msp;
- }
- }
- default:
- break;
- }
- }
-
- switch (reg) {
- case 8: /* MSP */
- return v7m_using_psp(env) ? env->v7m.other_sp : env->regs[13];
- case 9: /* PSP */
- return v7m_using_psp(env) ? env->regs[13] : env->v7m.other_sp;
- case 10: /* MSPLIM */
- if (!arm_feature(env, ARM_FEATURE_V8)) {
- goto bad_reg;
- }
- return env->v7m.msplim[env->v7m.secure];
- case 11: /* PSPLIM */
- if (!arm_feature(env, ARM_FEATURE_V8)) {
- goto bad_reg;
- }
- return env->v7m.psplim[env->v7m.secure];
- case 16: /* PRIMASK */
- return env->v7m.primask[env->v7m.secure];
- case 17: /* BASEPRI */
- case 18: /* BASEPRI_MAX */
- if (!arm_feature(env, ARM_FEATURE_M_MAIN)) {
- goto bad_reg;
- }
- return env->v7m.basepri[env->v7m.secure];
- case 19: /* FAULTMASK */
- if (!arm_feature(env, ARM_FEATURE_M_MAIN)) {
- goto bad_reg;
- }
- return env->v7m.faultmask[env->v7m.secure];
- default:
- bad_reg:
- qemu_log_mask(LOG_GUEST_ERROR, "Attempt to read unknown special"
- " register %d\n", reg);
- return 0;
- }
-}
-
-void HELPER(v7m_msr)(CPUARMState *env, uint32_t maskreg, uint32_t val)
-{
- /*
- * We're passed bits [11..0] of the instruction; extract
- * SYSm and the mask bits.
- * Invalid combinations of SYSm and mask are UNPREDICTABLE;
- * we choose to treat them as if the mask bits were valid.
- * NB that the pseudocode 'mask' variable is bits [11..10],
- * whereas ours is [11..8].
- */
- uint32_t mask = extract32(maskreg, 8, 4);
- uint32_t reg = extract32(maskreg, 0, 8);
- int cur_el = arm_current_el(env);
-
- if (cur_el == 0 && reg > 7 && reg != 20) {
- /*
- * only xPSR sub-fields and CONTROL.SFPA may be written by
- * unprivileged code
- */
- return;
- }
-
- if (arm_feature(env, ARM_FEATURE_M_SECURITY)) {
- switch (reg) {
- case 0x88: /* MSP_NS */
- if (!env->v7m.secure) {
- return;
- }
- env->v7m.other_ss_msp = val & ~3;
- return;
- case 0x89: /* PSP_NS */
- if (!env->v7m.secure) {
- return;
- }
- env->v7m.other_ss_psp = val & ~3;
- return;
- case 0x8a: /* MSPLIM_NS */
- if (!env->v7m.secure) {
- return;
- }
- env->v7m.msplim[M_REG_NS] = val & ~7;
- return;
- case 0x8b: /* PSPLIM_NS */
- if (!env->v7m.secure) {
- return;
- }
- env->v7m.psplim[M_REG_NS] = val & ~7;
- return;
- case 0x90: /* PRIMASK_NS */
- if (!env->v7m.secure) {
- return;
- }
- env->v7m.primask[M_REG_NS] = val & 1;
- return;
- case 0x91: /* BASEPRI_NS */
- if (!arm_feature(env, ARM_FEATURE_M_MAIN)) {
- goto bad_reg;
- }
- if (!env->v7m.secure) {
- return;
- }
- env->v7m.basepri[M_REG_NS] = val & 0xff;
- return;
- case 0x93: /* FAULTMASK_NS */
- if (!arm_feature(env, ARM_FEATURE_M_MAIN)) {
- goto bad_reg;
- }
- if (!env->v7m.secure) {
- return;
- }
- env->v7m.faultmask[M_REG_NS] = val & 1;
- return;
- case 0x94: /* CONTROL_NS */
- if (!env->v7m.secure) {
- return;
- }
- write_v7m_control_spsel_for_secstate(env,
- val & R_V7M_CONTROL_SPSEL_MASK,
- M_REG_NS);
- if (arm_feature(env, ARM_FEATURE_M_MAIN)) {
- env->v7m.control[M_REG_NS] &= ~R_V7M_CONTROL_NPRIV_MASK;
- env->v7m.control[M_REG_NS] |= val & R_V7M_CONTROL_NPRIV_MASK;
- }
- /*
- * SFPA is RAZ/WI from NS. FPCA is RO if NSACR.CP10 == 0,
- * RES0 if the FPU is not present, and is stored in the S bank
- */
- if (cpu_isar_feature(aa32_vfp_simd, env_archcpu(env)) &&
- extract32(env->v7m.nsacr, 10, 1)) {
- env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_FPCA_MASK;
- env->v7m.control[M_REG_S] |= val & R_V7M_CONTROL_FPCA_MASK;
- }
- return;
- case 0x98: /* SP_NS */
- {
- /*
- * This gives the non-secure SP selected based on whether we're
- * currently in handler mode or not, using the NS CONTROL.SPSEL.
- */
- bool spsel = env->v7m.control[M_REG_NS] & R_V7M_CONTROL_SPSEL_MASK;
- bool is_psp = !arm_v7m_is_handler_mode(env) && spsel;
- uint32_t limit;
-
- if (!env->v7m.secure) {
- return;
- }
-
- limit = is_psp ? env->v7m.psplim[false] : env->v7m.msplim[false];
-
- val &= ~0x3;
-
- if (val < limit) {
- raise_exception_ra(env, EXCP_STKOF, 0, 1, GETPC());
- }
-
- if (is_psp) {
- env->v7m.other_ss_psp = val;
- } else {
- env->v7m.other_ss_msp = val;
- }
- return;
- }
- default:
- break;
- }
- }
-
- switch (reg) {
- case 0 ... 7: /* xPSR sub-fields */
- v7m_msr_xpsr(env, mask, reg, val);
- break;
- case 8: /* MSP */
- if (v7m_using_psp(env)) {
- env->v7m.other_sp = val & ~3;
- } else {
- env->regs[13] = val & ~3;
- }
- break;
- case 9: /* PSP */
- if (v7m_using_psp(env)) {
- env->regs[13] = val & ~3;
- } else {
- env->v7m.other_sp = val & ~3;
- }
- break;
- case 10: /* MSPLIM */
- if (!arm_feature(env, ARM_FEATURE_V8)) {
- goto bad_reg;
- }
- env->v7m.msplim[env->v7m.secure] = val & ~7;
- break;
- case 11: /* PSPLIM */
- if (!arm_feature(env, ARM_FEATURE_V8)) {
- goto bad_reg;
- }
- env->v7m.psplim[env->v7m.secure] = val & ~7;
- break;
- case 16: /* PRIMASK */
- env->v7m.primask[env->v7m.secure] = val & 1;
- break;
- case 17: /* BASEPRI */
- if (!arm_feature(env, ARM_FEATURE_M_MAIN)) {
- goto bad_reg;
- }
- env->v7m.basepri[env->v7m.secure] = val & 0xff;
- break;
- case 18: /* BASEPRI_MAX */
- if (!arm_feature(env, ARM_FEATURE_M_MAIN)) {
- goto bad_reg;
- }
- val &= 0xff;
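- /*
- * BASEPRI_MAX only raises the masking priority: write val only if
- * it is non-zero and either BASEPRI is currently 0 (disabled) or
- * val is a numerically lower (more urgent) priority.
- */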
- if (val != 0 && (val < env->v7m.basepri[env->v7m.secure]
- || env->v7m.basepri[env->v7m.secure] == 0)) {
- env->v7m.basepri[env->v7m.secure] = val;
- }
- break;
- case 19: /* FAULTMASK */
- if (!arm_feature(env, ARM_FEATURE_M_MAIN)) {
- goto bad_reg;
- }
- env->v7m.faultmask[env->v7m.secure] = val & 1;
- break;
- case 20: /* CONTROL */
- /*
- * Writing to the SPSEL bit only has an effect if we are in
- * thread mode; other bits can be updated by any privileged code.
- * write_v7m_control_spsel() deals with updating the SPSEL bit in
- * env->v7m.control, so we only need update the others.
- * For v7M, we must just ignore explicit writes to SPSEL in handler
- * mode; for v8M the write is permitted but will have no effect.
- * All these bits are writes-ignored from non-privileged code,
- * except for SFPA.
- */
- if (cur_el > 0 && (arm_feature(env, ARM_FEATURE_V8) ||
- !arm_v7m_is_handler_mode(env))) {
- write_v7m_control_spsel(env, (val & R_V7M_CONTROL_SPSEL_MASK) != 0);
- }
- if (cur_el > 0 && arm_feature(env, ARM_FEATURE_M_MAIN)) {
- env->v7m.control[env->v7m.secure] &= ~R_V7M_CONTROL_NPRIV_MASK;
- env->v7m.control[env->v7m.secure] |= val & R_V7M_CONTROL_NPRIV_MASK;
- }
- if (cpu_isar_feature(aa32_vfp_simd, env_archcpu(env))) {
- /*
- * SFPA is RAZ/WI from NS or if no FPU.
- * FPCA is RO if NSACR.CP10 == 0, RES0 if the FPU is not present.
- * Both are stored in the S bank.
- */
- if (env->v7m.secure) {
- env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_SFPA_MASK;
- env->v7m.control[M_REG_S] |= val & R_V7M_CONTROL_SFPA_MASK;
- }
- if (cur_el > 0 &&
- (env->v7m.secure || !arm_feature(env, ARM_FEATURE_M_SECURITY) ||
- extract32(env->v7m.nsacr, 10, 1))) {
- env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_FPCA_MASK;
- env->v7m.control[M_REG_S] |= val & R_V7M_CONTROL_FPCA_MASK;
- }
- }
- break;
- default:
- bad_reg:
- qemu_log_mask(LOG_GUEST_ERROR, "Attempt to write unknown special"
- " register %d\n", reg);
- return;
- }
-}
-
-uint32_t HELPER(v7m_tt)(CPUARMState *env, uint32_t addr, uint32_t op)
-{
- /* Implement the TT instruction. op is bits [7:6] of the insn. */
- bool forceunpriv = op & 1;
- bool alt = op & 2;
- V8M_SAttributes sattrs = {};
- uint32_t tt_resp;
- bool r, rw, nsr, nsrw, mrvalid;
- ARMMMUIdx mmu_idx;
- uint32_t mregion;
- bool targetpriv;
- bool targetsec = env->v7m.secure;
-
- /*
- * Work out what the security state and privilege level we're
- * interested in is...
- */
- if (alt) {
- targetsec = !targetsec;
- }
-
- if (forceunpriv) {
- targetpriv = false;
- } else {
- targetpriv = arm_v7m_is_handler_mode(env) ||
- !(env->v7m.control[targetsec] & R_V7M_CONTROL_NPRIV_MASK);
- }
-
- /* ...and then figure out which MMU index this is */
- mmu_idx = arm_v7m_mmu_idx_for_secstate_and_priv(env, targetsec, targetpriv);
-
- /*
- * We know that the MPU and SAU don't care about the access type
- * for our purposes beyond that we don't want to claim to be
- * an insn fetch, so we arbitrarily call this a read.
- */
-
- /*
- * MPU region info only available for privileged or if
- * inspecting the other MPU state.
- */
- if (arm_current_el(env) != 0 || alt) {
- GetPhysAddrResult res = {};
- ARMMMUFaultInfo fi = {};
-
- /* We can ignore the return value as prot is always set */
- pmsav8_mpu_lookup(env, addr, MMU_DATA_LOAD, mmu_idx, targetsec,
- &res, &fi, &mregion);
- if (mregion == -1) {
- mrvalid = false;
- mregion = 0;
- } else {
- mrvalid = true;
- }
- r = res.f.prot & PAGE_READ;
- rw = res.f.prot & PAGE_WRITE;
- } else {
- r = false;
- rw = false;
- mrvalid = false;
- mregion = 0;
- }
-
- if (env->v7m.secure) {
- v8m_security_lookup(env, addr, MMU_DATA_LOAD, mmu_idx,
- targetsec, &sattrs);
- nsr = sattrs.ns && r;
- nsrw = sattrs.ns && rw;
- } else {
- sattrs.ns = true;
- nsr = false;
- nsrw = false;
- }
-
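- /*
- * Assemble the TT response word: MREGION in bits [7:0], SREGION in
- * [15:8], MRVALID [16], SRVALID [17], R [18], RW [19], NSR [20],
- * NSRW [21], S [22], IRVALID [23], IREGION [31:24].
- */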
- tt_resp = (sattrs.iregion << 24) |
- (sattrs.irvalid << 23) |
- ((!sattrs.ns) << 22) |
- (nsrw << 21) |
- (nsr << 20) |
- (rw << 19) |
- (r << 18) |
- (sattrs.srvalid << 17) |
- (mrvalid << 16) |
- (sattrs.sregion << 8) |
- mregion;
-
- return tt_resp;
-}
-
-#endif /* !CONFIG_USER_ONLY */
arm_ss = ss.source_set()
arm_ss.add(files(
'cpu.c',
- 'crypto_helper.c',
'debug_helper.c',
'gdbstub.c',
'helper.c',
- 'iwmmxt_helper.c',
- 'm_helper.c',
- 'mve_helper.c',
- 'neon_helper.c',
- 'op_helper.c',
- 'tlb_helper.c',
- 'vec_helper.c',
'vfp_helper.c',
'cpu_tcg.c',
))
arm_ss.add(when: 'TARGET_AARCH64', if_true: files(
'cpu64.c',
'gdbstub64.c',
- 'helper-a64.c',
- 'mte_helper.c',
- 'pauth_helper.c',
- 'sve_helper.c',
- 'sme_helper.c',
))
arm_softmmu_ss = ss.source_set()
if 'CONFIG_TCG' in config_all
subdir('tcg')
+else
+ arm_ss.add(files('tcg-stubs.c'))
endif
target_arch += {'arm': arm_ss}
+++ /dev/null
-/*
- * ARM v8.5-MemTag Operations
- *
- * Copyright (c) 2020 Linaro, Ltd.
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "qemu/osdep.h"
-#include "qemu/log.h"
-#include "cpu.h"
-#include "internals.h"
-#include "exec/exec-all.h"
-#include "exec/ram_addr.h"
-#include "exec/cpu_ldst.h"
-#include "exec/helper-proto.h"
-#include "qapi/error.h"
-#include "qemu/guest-random.h"
-
-
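-/*
- * Advance 'offset' steps through the tags not present in the 'exclude'
- * bitmask, starting from 'tag' (offset 0 selects the first non-excluded
- * tag at or after 'tag'). If all sixteen tags are excluded, return 0,
- * as in the Arm pseudocode ChooseNonExcludedTag().
- */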
-static int choose_nonexcluded_tag(int tag, int offset, uint16_t exclude)
-{
- if (exclude == 0xffff) {
- return 0;
- }
- if (offset == 0) {
- while (exclude & (1 << tag)) {
- tag = (tag + 1) & 15;
- }
- } else {
- do {
- do {
- tag = (tag + 1) & 15;
- } while (exclude & (1 << tag));
- } while (--offset > 0);
- }
- return tag;
-}
-
-/**
- * allocation_tag_mem:
- * @env: the cpu environment
- * @ptr_mmu_idx: the addressing regime to use for the virtual address
- * @ptr: the virtual address for which to look up tag memory
- * @ptr_access: the access to use for the virtual address
- * @ptr_size: the number of bytes in the normal memory access
- * @tag_access: the access to use for the tag memory
- * @tag_size: the number of bytes in the tag memory access
- * @ra: the return address for exception handling
- *
- * Our tag memory is formatted as a sequence of little-endian nibbles.
- * That is, the byte at (addr >> (LOG2_TAG_GRANULE + 1)) contains two
- * tags, with the tag at [3:0] for the lower addr and the tag at [7:4]
- * for the higher addr.
- *
- * Here, resolve the physical address from the virtual address, and return
- * a pointer to the corresponding tag byte. Exit with exception if the
- * virtual address is not accessible for @ptr_access.
- *
- * The @ptr_size and @tag_size values may not have an obvious relation
- * due to the alignment of @ptr, and the number of tag checks required.
- *
- * If there is no tag storage corresponding to @ptr, return NULL.
- */
-static uint8_t *allocation_tag_mem(CPUARMState *env, int ptr_mmu_idx,
- uint64_t ptr, MMUAccessType ptr_access,
- int ptr_size, MMUAccessType tag_access,
- int tag_size, uintptr_t ra)
-{
-#ifdef CONFIG_USER_ONLY
- uint64_t clean_ptr = useronly_clean_ptr(ptr);
- int flags = page_get_flags(clean_ptr);
- uint8_t *tags;
- uintptr_t index;
-
- if (!(flags & (ptr_access == MMU_DATA_STORE ? PAGE_WRITE_ORG : PAGE_READ))) {
- cpu_loop_exit_sigsegv(env_cpu(env), ptr, ptr_access,
- !(flags & PAGE_VALID), ra);
- }
-
- /* Require both MAP_ANON and PROT_MTE for the page. */
- if (!(flags & PAGE_ANON) || !(flags & PAGE_MTE)) {
- return NULL;
- }
-
- tags = page_get_target_data(clean_ptr);
-
- index = extract32(ptr, LOG2_TAG_GRANULE + 1,
- TARGET_PAGE_BITS - LOG2_TAG_GRANULE - 1);
- return tags + index;
-#else
- CPUTLBEntryFull *full;
- MemTxAttrs attrs;
- int in_page, flags;
- hwaddr ptr_paddr, tag_paddr, xlat;
- MemoryRegion *mr;
- ARMASIdx tag_asi;
- AddressSpace *tag_as;
- void *host;
-
- /*
- * Probe the first byte of the virtual address. This raises an
- * exception for inaccessible pages, and resolves the virtual address
- * into the softmmu tlb.
- *
- * When RA == 0, this is for mte_probe. The page is expected to be
- * valid. Indicate to probe_access_full no-fault, then assert that
- * we received a valid page.
- */
- flags = probe_access_full(env, ptr, ptr_access, ptr_mmu_idx,
- ra == 0, &host, &full, ra);
- assert(!(flags & TLB_INVALID_MASK));
-
- /* If the virtual page MemAttr != Tagged, access unchecked. */
- if (full->pte_attrs != 0xf0) {
- return NULL;
- }
-
- /*
- * If not backed by host ram, there is no tag storage: access unchecked.
- * This is probably a guest os bug though, so log it.
- */
- if (unlikely(flags & TLB_MMIO)) {
- qemu_log_mask(LOG_GUEST_ERROR,
- "Page @ 0x%" PRIx64 " indicates Tagged Normal memory "
- "but is not backed by host ram\n", ptr);
- return NULL;
- }
-
- /*
- * Remember these values across the second lookup below,
- * which may invalidate this pointer via tlb resize.
- */
- ptr_paddr = full->phys_addr | (ptr & ~TARGET_PAGE_MASK);
- attrs = full->attrs;
- full = NULL;
-
- /*
- * The Normal memory access can extend to the next page. E.g. a single
- * 8-byte access to the last byte of a page will check only the last
- * tag on the first page.
- * Any page access exception has priority over tag check exception.
- */
- in_page = -(ptr | TARGET_PAGE_MASK);
- if (unlikely(ptr_size > in_page)) {
- flags |= probe_access_full(env, ptr + in_page, ptr_access,
- ptr_mmu_idx, ra == 0, &host, &full, ra);
- assert(!(flags & TLB_INVALID_MASK));
- }
-
- /* Any debug exception has priority over a tag check exception. */
- if (unlikely(flags & TLB_WATCHPOINT)) {
- int wp = ptr_access == MMU_DATA_LOAD ? BP_MEM_READ : BP_MEM_WRITE;
- assert(ra != 0);
- cpu_check_watchpoint(env_cpu(env), ptr, ptr_size, attrs, wp, ra);
- }
-
- /* Convert to the physical address in tag space. */
- tag_paddr = ptr_paddr >> (LOG2_TAG_GRANULE + 1);
-
- /* Look up the address in tag space. */
- tag_asi = attrs.secure ? ARMASIdx_TagS : ARMASIdx_TagNS;
- tag_as = cpu_get_address_space(env_cpu(env), tag_asi);
- mr = address_space_translate(tag_as, tag_paddr, &xlat, NULL,
- tag_access == MMU_DATA_STORE, attrs);
-
- /*
- * Note that @mr will never be NULL. If there is nothing in the address
- * space at @tag_paddr, the translation will return the unallocated memory
- * region. For our purposes, the result must be ram.
- */
- if (unlikely(!memory_region_is_ram(mr))) {
- /* ??? Failure is a board configuration error. */
- qemu_log_mask(LOG_UNIMP,
- "Tag Memory @ 0x%" HWADDR_PRIx " not found for "
- "Normal Memory @ 0x%" HWADDR_PRIx "\n",
- tag_paddr, ptr_paddr);
- return NULL;
- }
-
- /*
- * Ensure the tag memory is dirty on write, for migration.
- * Tag memory can never contain code or display memory (vga).
- */
- if (tag_access == MMU_DATA_STORE) {
- ram_addr_t tag_ra = memory_region_get_ram_addr(mr) + xlat;
- cpu_physical_memory_set_dirty_flag(tag_ra, DIRTY_MEMORY_MIGRATION);
- }
-
- return memory_region_get_ram_ptr(mr) + xlat;
-#endif
-}
-
-uint64_t HELPER(irg)(CPUARMState *env, uint64_t rn, uint64_t rm)
-{
- uint16_t exclude = extract32(rm | env->cp15.gcr_el1, 0, 16);
- int rrnd = extract32(env->cp15.gcr_el1, 16, 1);
- int start = extract32(env->cp15.rgsr_el1, 0, 4);
- int seed = extract32(env->cp15.rgsr_el1, 8, 16);
- int offset, i, rtag;
-
- /*
- * Our IMPDEF choice for GCR_EL1.RRND==1 is to continue to use the
- * deterministic algorithm. Except that with RRND==1 the kernel is
- * not required to have set RGSR_EL1.SEED != 0, which is required for
- * the deterministic algorithm to function. So we force a non-zero
- * SEED for that case.
- */
- if (unlikely(seed == 0) && rrnd) {
- do {
- Error *err = NULL;
- uint16_t two;
-
- if (qemu_guest_getrandom(&two, sizeof(two), &err) < 0) {
- /*
- * Failed, for unknown reasons in the crypto subsystem.
- * Best we can do is log the reason and use a constant seed.
- */
- qemu_log_mask(LOG_UNIMP, "IRG: Crypto failure: %s\n",
- error_get_pretty(err));
- error_free(err);
- two = 1;
- }
- seed = two;
- } while (seed == 0);
- }
-
- /* RandomTag */
- for (i = offset = 0; i < 4; ++i) {
- /* NextRandomTagBit */
- int top = (extract32(seed, 5, 1) ^ extract32(seed, 3, 1) ^
- extract32(seed, 2, 1) ^ extract32(seed, 0, 1));
- seed = (top << 15) | (seed >> 1);
- offset |= top << i;
- }
- rtag = choose_nonexcluded_tag(start, offset, exclude);
- env->cp15.rgsr_el1 = rtag | (seed << 8);
-
- return address_with_allocation_tag(rn, rtag);
-}
-
-uint64_t HELPER(addsubg)(CPUARMState *env, uint64_t ptr,
- int32_t offset, uint32_t tag_offset)
-{
- int start_tag = allocation_tag_from_addr(ptr);
- uint16_t exclude = extract32(env->cp15.gcr_el1, 0, 16);
- int rtag = choose_nonexcluded_tag(start_tag, tag_offset, exclude);
-
- return address_with_allocation_tag(ptr + offset, rtag);
-}
-
-static int load_tag1(uint64_t ptr, uint8_t *mem)
-{
- int ofs = extract32(ptr, LOG2_TAG_GRANULE, 1) * 4;
- return extract32(*mem, ofs, 4);
-}
-
-uint64_t HELPER(ldg)(CPUARMState *env, uint64_t ptr, uint64_t xt)
-{
- int mmu_idx = cpu_mmu_index(env, false);
- uint8_t *mem;
- int rtag = 0;
-
- /* Trap if accessing an invalid page. */
- mem = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_LOAD, 1,
- MMU_DATA_LOAD, 1, GETPC());
-
- /* Load if page supports tags. */
- if (mem) {
- rtag = load_tag1(ptr, mem);
- }
-
- return address_with_allocation_tag(xt, rtag);
-}
-
-static void check_tag_aligned(CPUARMState *env, uint64_t ptr, uintptr_t ra)
-{
- if (unlikely(!QEMU_IS_ALIGNED(ptr, TAG_GRANULE))) {
- arm_cpu_do_unaligned_access(env_cpu(env), ptr, MMU_DATA_STORE,
- cpu_mmu_index(env, false), ra);
- g_assert_not_reached();
- }
-}
-
-/* For use in a non-parallel context, store to the given nibble. */
-static void store_tag1(uint64_t ptr, uint8_t *mem, int tag)
-{
- int ofs = extract32(ptr, LOG2_TAG_GRANULE, 1) * 4;
- *mem = deposit32(*mem, ofs, 4, tag);
-}
-
-/* For use in a parallel context, atomically store to the given nibble. */
-static void store_tag1_parallel(uint64_t ptr, uint8_t *mem, int tag)
-{
- int ofs = extract32(ptr, LOG2_TAG_GRANULE, 1) * 4;
- uint8_t old = qatomic_read(mem);
-
- while (1) {
- uint8_t new = deposit32(old, ofs, 4, tag);
- uint8_t cmp = qatomic_cmpxchg(mem, old, new);
- if (likely(cmp == old)) {
- return;
- }
- old = cmp;
- }
-}
-
-typedef void stg_store1(uint64_t, uint8_t *, int);
-
-static inline void do_stg(CPUARMState *env, uint64_t ptr, uint64_t xt,
- uintptr_t ra, stg_store1 store1)
-{
- int mmu_idx = cpu_mmu_index(env, false);
- uint8_t *mem;
-
- check_tag_aligned(env, ptr, ra);
-
- /* Trap if accessing an invalid page. */
- mem = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_STORE, TAG_GRANULE,
- MMU_DATA_STORE, 1, ra);
-
- /* Store if page supports tags. */
- if (mem) {
- store1(ptr, mem, allocation_tag_from_addr(xt));
- }
-}
-
-void HELPER(stg)(CPUARMState *env, uint64_t ptr, uint64_t xt)
-{
- do_stg(env, ptr, xt, GETPC(), store_tag1);
-}
-
-void HELPER(stg_parallel)(CPUARMState *env, uint64_t ptr, uint64_t xt)
-{
- do_stg(env, ptr, xt, GETPC(), store_tag1_parallel);
-}
-
-void HELPER(stg_stub)(CPUARMState *env, uint64_t ptr)
-{
- int mmu_idx = cpu_mmu_index(env, false);
- uintptr_t ra = GETPC();
-
- check_tag_aligned(env, ptr, ra);
- probe_write(env, ptr, TAG_GRANULE, mmu_idx, ra);
-}
-
-static inline void do_st2g(CPUARMState *env, uint64_t ptr, uint64_t xt,
- uintptr_t ra, stg_store1 store1)
-{
- int mmu_idx = cpu_mmu_index(env, false);
- int tag = allocation_tag_from_addr(xt);
- uint8_t *mem1, *mem2;
-
- check_tag_aligned(env, ptr, ra);
-
- /*
- * Trap if accessing invalid page(s).
- * This takes priority over !allocation_tag_access_enabled.
- */
- if (ptr & TAG_GRANULE) {
- /* Two stores unaligned mod TAG_GRANULE*2 -- modify two bytes. */
- mem1 = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_STORE,
- TAG_GRANULE, MMU_DATA_STORE, 1, ra);
- mem2 = allocation_tag_mem(env, mmu_idx, ptr + TAG_GRANULE,
- MMU_DATA_STORE, TAG_GRANULE,
- MMU_DATA_STORE, 1, ra);
-
- /* Store if page(s) support tags. */
- if (mem1) {
- store1(TAG_GRANULE, mem1, tag);
- }
- if (mem2) {
- store1(0, mem2, tag);
- }
- } else {
- /* Two stores aligned mod TAG_GRANULE*2 -- modify one byte. */
- mem1 = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_STORE,
- 2 * TAG_GRANULE, MMU_DATA_STORE, 1, ra);
- if (mem1) {
- tag |= tag << 4;
- qatomic_set(mem1, tag);
- }
- }
-}
-
-void HELPER(st2g)(CPUARMState *env, uint64_t ptr, uint64_t xt)
-{
- do_st2g(env, ptr, xt, GETPC(), store_tag1);
-}
-
-void HELPER(st2g_parallel)(CPUARMState *env, uint64_t ptr, uint64_t xt)
-{
- do_st2g(env, ptr, xt, GETPC(), store_tag1_parallel);
-}
-
-void HELPER(st2g_stub)(CPUARMState *env, uint64_t ptr)
-{
- int mmu_idx = cpu_mmu_index(env, false);
- uintptr_t ra = GETPC();
- int in_page = -(ptr | TARGET_PAGE_MASK);
-
- check_tag_aligned(env, ptr, ra);
-
- if (likely(in_page >= 2 * TAG_GRANULE)) {
- probe_write(env, ptr, 2 * TAG_GRANULE, mmu_idx, ra);
- } else {
- probe_write(env, ptr, TAG_GRANULE, mmu_idx, ra);
- probe_write(env, ptr + TAG_GRANULE, TAG_GRANULE, mmu_idx, ra);
- }
-}
-
-#define LDGM_STGM_SIZE (4 << GMID_EL1_BS)
-
-uint64_t HELPER(ldgm)(CPUARMState *env, uint64_t ptr)
-{
- int mmu_idx = cpu_mmu_index(env, false);
- uintptr_t ra = GETPC();
- void *tag_mem;
-
- ptr = QEMU_ALIGN_DOWN(ptr, LDGM_STGM_SIZE);
-
- /* Trap if accessing an invalid page. */
- tag_mem = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_LOAD,
- LDGM_STGM_SIZE, MMU_DATA_LOAD,
- LDGM_STGM_SIZE / (2 * TAG_GRANULE), ra);
-
- /* The tag is squashed to zero if the page does not support tags. */
- if (!tag_mem) {
- return 0;
- }
-
- QEMU_BUILD_BUG_ON(GMID_EL1_BS != 6);
- /*
- * We are loading 64-bits worth of tags. The ordering of elements
- * within the word corresponds to a 64-bit little-endian operation.
- */
- return ldq_le_p(tag_mem);
-}
-
-void HELPER(stgm)(CPUARMState *env, uint64_t ptr, uint64_t val)
-{
- int mmu_idx = cpu_mmu_index(env, false);
- uintptr_t ra = GETPC();
- void *tag_mem;
-
- ptr = QEMU_ALIGN_DOWN(ptr, LDGM_STGM_SIZE);
-
- /* Trap if accessing an invalid page. */
- tag_mem = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_STORE,
- LDGM_STGM_SIZE, MMU_DATA_LOAD,
- LDGM_STGM_SIZE / (2 * TAG_GRANULE), ra);
-
- /*
- * Tag store only happens if the page supports tags,
- * and if the OS has enabled access to the tags.
- */
- if (!tag_mem) {
- return;
- }
-
- QEMU_BUILD_BUG_ON(GMID_EL1_BS != 6);
- /*
- * We are storing 64-bits worth of tags. The ordering of elements
- * within the word corresponds to a 64-bit little-endian operation.
- */
- stq_le_p(tag_mem, val);
-}
-
-void HELPER(stzgm_tags)(CPUARMState *env, uint64_t ptr, uint64_t val)
-{
- uintptr_t ra = GETPC();
- int mmu_idx = cpu_mmu_index(env, false);
- int log2_dcz_bytes, log2_tag_bytes;
- intptr_t dcz_bytes, tag_bytes;
- uint8_t *mem;
-
- /*
- * In arm_cpu_realizefn, we assert that dcz > LOG2_TAG_GRANULE+1,
- * i.e. 32 bytes, which is an unreasonably small dcz anyway,
- * to make sure that we can access one complete tag byte here.
- */
- log2_dcz_bytes = env_archcpu(env)->dcz_blocksize + 2;
- log2_tag_bytes = log2_dcz_bytes - (LOG2_TAG_GRANULE + 1);
- dcz_bytes = (intptr_t)1 << log2_dcz_bytes;
- tag_bytes = (intptr_t)1 << log2_tag_bytes;
- ptr &= -dcz_bytes;
-
- mem = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_STORE, dcz_bytes,
- MMU_DATA_STORE, tag_bytes, ra);
- if (mem) {
- int tag_pair = (val & 0xf) * 0x11;
- memset(mem, tag_pair, tag_bytes);
- }
-}
-
-static void mte_sync_check_fail(CPUARMState *env, uint32_t desc,
- uint64_t dirty_ptr, uintptr_t ra)
-{
- int is_write, syn;
-
- env->exception.vaddress = dirty_ptr;
-
- is_write = FIELD_EX32(desc, MTEDESC, WRITE);
- syn = syn_data_abort_no_iss(arm_current_el(env) != 0, 0, 0, 0, 0, is_write,
- 0x11);
- raise_exception_ra(env, EXCP_DATA_ABORT, syn, exception_target_el(env), ra);
- g_assert_not_reached();
-}
-
-static void mte_async_check_fail(CPUARMState *env, uint64_t dirty_ptr,
- uintptr_t ra, ARMMMUIdx arm_mmu_idx, int el)
-{
- int select;
-
- if (regime_has_2_ranges(arm_mmu_idx)) {
- select = extract64(dirty_ptr, 55, 1);
- } else {
- select = 0;
- }
- env->cp15.tfsr_el[el] |= 1 << select;
-#ifdef CONFIG_USER_ONLY
- /*
- * Stand in for a timer irq, setting _TIF_MTE_ASYNC_FAULT,
- * which then sends a SIGSEGV when the thread is next scheduled.
- * This cpu will return to the main loop at the end of the TB,
- * which is rather sooner than "normal". But the alternative
- * is waiting until the next syscall.
- */
- qemu_cpu_kick(env_cpu(env));
-#endif
-}
-
-/* Record a tag check failure. */
-static void mte_check_fail(CPUARMState *env, uint32_t desc,
- uint64_t dirty_ptr, uintptr_t ra)
-{
- int mmu_idx = FIELD_EX32(desc, MTEDESC, MIDX);
- ARMMMUIdx arm_mmu_idx = core_to_aa64_mmu_idx(mmu_idx);
- int el, reg_el, tcf;
- uint64_t sctlr;
-
- reg_el = regime_el(env, arm_mmu_idx);
- sctlr = env->cp15.sctlr_el[reg_el];
-
- switch (arm_mmu_idx) {
- case ARMMMUIdx_E10_0:
- case ARMMMUIdx_E20_0:
- el = 0;
- tcf = extract64(sctlr, 38, 2);
- break;
- default:
- el = reg_el;
- tcf = extract64(sctlr, 40, 2);
- }
-
- switch (tcf) {
- case 1:
- /* Tag check fail causes a synchronous exception. */
- mte_sync_check_fail(env, desc, dirty_ptr, ra);
- break;
-
- case 0:
- /*
- * Tag check fail does not affect the PE.
- * We eliminate this case by not setting MTE_ACTIVE
- * in tb_flags, so that we never make this runtime call.
- */
- g_assert_not_reached();
-
- case 2:
- /* Tag check fail causes asynchronous flag set. */
- mte_async_check_fail(env, dirty_ptr, ra, arm_mmu_idx, el);
- break;
-
- case 3:
- /*
- * Tag check fail causes asynchronous flag set for stores, or
- * a synchronous exception for loads.
- */
- if (FIELD_EX32(desc, MTEDESC, WRITE)) {
- mte_async_check_fail(env, dirty_ptr, ra, arm_mmu_idx, el);
- } else {
- mte_sync_check_fail(env, desc, dirty_ptr, ra);
- }
- break;
- }
-}
-
-/**
- * checkN:
- * @mem: tag memory to test
- * @odd: true to begin testing at the odd nibble
- * @cmp: the tag to compare against
- * @count: number of tags to test
- *
- * Return the number of successful tests.
- * Thus a return value < @count indicates a failure.
- *
- * A note about sizes: count is expected to be small.
- *
- * The most common use will be LDP/STP of two integer registers,
- * which means 16 bytes of memory touching at most 2 tags, but
- * often the access is aligned and thus just 1 tag.
- *
- * Using AdvSIMD LD/ST (multiple), one can access 64 bytes of memory,
- * touching at most 5 tags. SVE LDR/STR (vector) with the default
- * vector length is also 64 bytes; the maximum architectural length
- * is 256 bytes touching at most 9 tags.
- *
- * The loop below uses 7 logical operations and 1 memory operation
- * per tag pair. An implementation that loads an aligned word and
- * uses masking to ignore adjacent tags requires 18 logical operations
- * and thus does not begin to pay off until 6 tags.
- * Which, according to the survey above, is unlikely to be common.
- */
-static int checkN(uint8_t *mem, int odd, int cmp, int count)
-{
- int n = 0, diff;
-
- /* Replicate the test tag and compare. */
- cmp *= 0x11;
- diff = *mem++ ^ cmp;
-
- if (odd) {
- goto start_odd;
- }
-
- while (1) {
- /* Test even tag. */
- if (unlikely((diff) & 0x0f)) {
- break;
- }
- if (++n == count) {
- break;
- }
-
- start_odd:
- /* Test odd tag. */
- if (unlikely((diff) & 0xf0)) {
- break;
- }
- if (++n == count) {
- break;
- }
-
- diff = *mem++ ^ cmp;
- }
- return n;
-}
-
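For clarity (an editorial illustration, not recovered source): the contract documented above, "return the number of successful tests", can be restated as a plain reference loop over nibbles. The helper name check_n_reference and the tag values are invented for the example; tags are assumed packed two per byte, low nibble first, as in the rest of this file:

    #include <assert.h>
    #include <stdint.h>

    /*
     * Plain reference loop for checkN()'s contract: count how many of the
     * first @count tag nibbles (starting at the even or odd nibble of
     * mem[0]) equal @cmp.  Illustration only; the loop above is the
     * optimised form.
     */
    static int check_n_reference(const uint8_t *mem, int odd, int cmp, int count)
    {
        for (int n = 0; n < count; n++) {
            int nib = odd + n;                               /* nibble index */
            int tag = (mem[nib / 2] >> ((nib & 1) * 4)) & 0xf;
            if (tag != cmp) {
                return n;
            }
        }
        return count;
    }

    int main(void)
    {
        const uint8_t tags[2] = { 0x33, 0x53 };   /* granule tags 3, 3, 3, 5 */
        assert(check_n_reference(tags, 0, 3, 4) == 3);  /* 4th tag mismatches */
        assert(check_n_reference(tags, 0, 3, 3) == 3);  /* all three match */
        assert(check_n_reference(tags, 1, 3, 2) == 2);  /* start at odd nibble */
        return 0;
    }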
-/**
- * mte_probe_int() - helper for mte_probe and mte_check
- * @env: CPU environment
- * @desc: MTEDESC descriptor
- * @ptr: virtual address of the base of the access
- * @ra: host return address for unwinding, or 0 if no unwinding is needed
- * @fault: return virtual address of the first check failure
- *
- * Internal routine for both mte_probe and mte_check.
- * Return zero on failure, filling in *fault.
- * Return negative on trivial success for tbi disabled.
- * Return positive on success with tbi enabled.
- */
-static int mte_probe_int(CPUARMState *env, uint32_t desc, uint64_t ptr,
- uintptr_t ra, uint64_t *fault)
-{
- int mmu_idx, ptr_tag, bit55;
- uint64_t ptr_last, prev_page, next_page;
- uint64_t tag_first, tag_last;
- uint64_t tag_byte_first, tag_byte_last;
- uint32_t sizem1, tag_count, tag_size, n, c;
- uint8_t *mem1, *mem2;
- MMUAccessType type;
-
- bit55 = extract64(ptr, 55, 1);
- *fault = ptr;
-
- /* If TBI is disabled, the access is unchecked, and ptr is not dirty. */
- if (unlikely(!tbi_check(desc, bit55))) {
- return -1;
- }
-
- ptr_tag = allocation_tag_from_addr(ptr);
-
- if (tcma_check(desc, bit55, ptr_tag)) {
- return 1;
- }
-
- mmu_idx = FIELD_EX32(desc, MTEDESC, MIDX);
- type = FIELD_EX32(desc, MTEDESC, WRITE) ? MMU_DATA_STORE : MMU_DATA_LOAD;
- sizem1 = FIELD_EX32(desc, MTEDESC, SIZEM1);
-
- /* Find the addr of the end of the access */
- ptr_last = ptr + sizem1;
-
- /* Round the bounds to the tag granule, and compute the number of tags. */
- tag_first = QEMU_ALIGN_DOWN(ptr, TAG_GRANULE);
- tag_last = QEMU_ALIGN_DOWN(ptr_last, TAG_GRANULE);
- tag_count = ((tag_last - tag_first) / TAG_GRANULE) + 1;
-
- /* Round the bounds to twice the tag granule, and compute the bytes. */
- tag_byte_first = QEMU_ALIGN_DOWN(ptr, 2 * TAG_GRANULE);
- tag_byte_last = QEMU_ALIGN_DOWN(ptr_last, 2 * TAG_GRANULE);
-
- /* Locate the page boundaries. */
- prev_page = ptr & TARGET_PAGE_MASK;
- next_page = prev_page + TARGET_PAGE_SIZE;
-
- if (likely(tag_last - prev_page < TARGET_PAGE_SIZE)) {
- /* Memory access stays on one page. */
- tag_size = ((tag_byte_last - tag_byte_first) / (2 * TAG_GRANULE)) + 1;
- mem1 = allocation_tag_mem(env, mmu_idx, ptr, type, sizem1 + 1,
- MMU_DATA_LOAD, tag_size, ra);
- if (!mem1) {
- return 1;
- }
- /* Perform all of the comparisons. */
- n = checkN(mem1, ptr & TAG_GRANULE, ptr_tag, tag_count);
- } else {
- /* Memory access crosses to next page. */
- tag_size = (next_page - tag_byte_first) / (2 * TAG_GRANULE);
- mem1 = allocation_tag_mem(env, mmu_idx, ptr, type, next_page - ptr,
- MMU_DATA_LOAD, tag_size, ra);
-
- tag_size = ((tag_byte_last - next_page) / (2 * TAG_GRANULE)) + 1;
- mem2 = allocation_tag_mem(env, mmu_idx, next_page, type,
- ptr_last - next_page + 1,
- MMU_DATA_LOAD, tag_size, ra);
-
- /*
- * Perform all of the comparisons.
- * Note the possible but unlikely case of the operation spanning
- * two pages that do not both have tagging enabled.
- */
- n = c = (next_page - tag_first) / TAG_GRANULE;
- if (mem1) {
- n = checkN(mem1, ptr & TAG_GRANULE, ptr_tag, c);
- }
- if (n == c) {
- if (!mem2) {
- return 1;
- }
- n += checkN(mem2, 0, ptr_tag, tag_count - c);
- }
- }
-
- if (likely(n == tag_count)) {
- return 1;
- }
-
- /*
- * If we failed, we know which granule. For the first granule, the
- * failure address is @ptr, the first byte accessed. Otherwise the
- * failure address is the first byte of the nth granule.
- */
- if (n > 0) {
- *fault = tag_first + n * TAG_GRANULE;
- }
- return 0;
-}
-
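Editorial note: some worked numbers for the granule rounding above may help, assuming TAG_GRANULE is 16 as elsewhere in this file. A 16-byte access that starts 9 bytes into a granule spans two granules (so two tags are checked) but only one byte of tag storage:

    #include <assert.h>
    #include <stdint.h>

    #define TAG_GRANULE 16
    /* Local stand-in for QEMU_ALIGN_DOWN, power-of-two alignment only. */
    #define ALIGN_DOWN(x, a) ((x) & ~((uint64_t)(a) - 1))

    int main(void)
    {
        uint64_t ptr = 0x1009, sizem1 = 15;
        uint64_t ptr_last = ptr + sizem1;                        /* 0x1018 */

        uint64_t tag_first = ALIGN_DOWN(ptr, TAG_GRANULE);       /* 0x1000 */
        uint64_t tag_last = ALIGN_DOWN(ptr_last, TAG_GRANULE);   /* 0x1010 */
        uint32_t tag_count = (tag_last - tag_first) / TAG_GRANULE + 1;

        uint64_t tag_byte_first = ALIGN_DOWN(ptr, 2 * TAG_GRANULE);
        uint64_t tag_byte_last = ALIGN_DOWN(ptr_last, 2 * TAG_GRANULE);
        uint32_t tag_size =
            (tag_byte_last - tag_byte_first) / (2 * TAG_GRANULE) + 1;

        assert(tag_count == 2);   /* two granules are checked ...           */
        assert(tag_size == 1);    /* ... but they share a single tag byte   */
        return 0;
    }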
-uint64_t mte_check(CPUARMState *env, uint32_t desc, uint64_t ptr, uintptr_t ra)
-{
- uint64_t fault;
- int ret = mte_probe_int(env, desc, ptr, ra, &fault);
-
- if (unlikely(ret == 0)) {
- mte_check_fail(env, desc, fault, ra);
- } else if (ret < 0) {
- return ptr;
- }
- return useronly_clean_ptr(ptr);
-}
-
-uint64_t HELPER(mte_check)(CPUARMState *env, uint32_t desc, uint64_t ptr)
-{
- return mte_check(env, desc, ptr, GETPC());
-}
-
-/*
- * No-fault version of mte_check, to be used by SVE for MemSingleNF.
- * Returns false if the access is Checked and the check failed. This
- * is only intended to probe the tag -- the validity of the page must
- * be checked beforehand.
- */
-bool mte_probe(CPUARMState *env, uint32_t desc, uint64_t ptr)
-{
- uint64_t fault;
- int ret = mte_probe_int(env, desc, ptr, 0, &fault);
-
- return ret != 0;
-}
-
-/*
- * Perform an MTE checked access for DC_ZVA.
- */
-uint64_t HELPER(mte_check_zva)(CPUARMState *env, uint32_t desc, uint64_t ptr)
-{
- uintptr_t ra = GETPC();
- int log2_dcz_bytes, log2_tag_bytes;
- int mmu_idx, bit55;
- intptr_t dcz_bytes, tag_bytes, i;
- void *mem;
- uint64_t ptr_tag, mem_tag, align_ptr;
-
- bit55 = extract64(ptr, 55, 1);
-
- /* If TBI is disabled, the access is unchecked, and ptr is not dirty. */
- if (unlikely(!tbi_check(desc, bit55))) {
- return ptr;
- }
-
- ptr_tag = allocation_tag_from_addr(ptr);
-
- if (tcma_check(desc, bit55, ptr_tag)) {
- goto done;
- }
-
- /*
- * In arm_cpu_realizefn, we asserted that dcz > LOG2_TAG_GRANULE+1,
- * i.e. 32 bytes, which is an unreasonably small dcz anyway, to make
- * sure that we can access one complete tag byte here.
- */
- log2_dcz_bytes = env_archcpu(env)->dcz_blocksize + 2;
- log2_tag_bytes = log2_dcz_bytes - (LOG2_TAG_GRANULE + 1);
- dcz_bytes = (intptr_t)1 << log2_dcz_bytes;
- tag_bytes = (intptr_t)1 << log2_tag_bytes;
- align_ptr = ptr & -dcz_bytes;
-
- /*
- * Trap if accessing an invalid page. DC_ZVA requires that we supply
- * the original pointer for an invalid page. But watchpoints require
- * that we probe the actual space. So do both.
- */
- mmu_idx = FIELD_EX32(desc, MTEDESC, MIDX);
- (void) probe_write(env, ptr, 1, mmu_idx, ra);
- mem = allocation_tag_mem(env, mmu_idx, align_ptr, MMU_DATA_STORE,
- dcz_bytes, MMU_DATA_LOAD, tag_bytes, ra);
- if (!mem) {
- goto done;
- }
-
- /*
- * Unlike the reasoning for checkN, DC_ZVA is always aligned, and thus
- * it is quite easy to perform all of the comparisons at once without
- * any extra masking.
- *
- * The most common zva block size is 64; some of the thunderx cpus use
- * a block size of 128. For user-only, aarch64_max_initfn will set the
- * block size to 512. Fill out the other cases for future-proofing.
- *
- * In order to be able to find the first miscompare later, we want the
- * tag bytes to be in little-endian order.
- */
- switch (log2_tag_bytes) {
- case 0: /* zva_blocksize 32 */
- mem_tag = *(uint8_t *)mem;
- ptr_tag *= 0x11u;
- break;
- case 1: /* zva_blocksize 64 */
- mem_tag = cpu_to_le16(*(uint16_t *)mem);
- ptr_tag *= 0x1111u;
- break;
- case 2: /* zva_blocksize 128 */
- mem_tag = cpu_to_le32(*(uint32_t *)mem);
- ptr_tag *= 0x11111111u;
- break;
- case 3: /* zva_blocksize 256 */
- mem_tag = cpu_to_le64(*(uint64_t *)mem);
- ptr_tag *= 0x1111111111111111ull;
- break;
-
- default: /* zva_blocksize 512, 1024, 2048 */
- ptr_tag *= 0x1111111111111111ull;
- i = 0;
- do {
- mem_tag = cpu_to_le64(*(uint64_t *)(mem + i));
- if (unlikely(mem_tag != ptr_tag)) {
- goto fail;
- }
- i += 8;
- align_ptr += 16 * TAG_GRANULE;
- } while (i < tag_bytes);
- goto done;
- }
-
- if (likely(mem_tag == ptr_tag)) {
- goto done;
- }
-
- fail:
- /* Locate the first nibble that differs. */
- i = ctz64(mem_tag ^ ptr_tag) >> 4;
- mte_check_fail(env, desc, align_ptr + i * TAG_GRANULE, ra);
-
- done:
- return useronly_clean_ptr(ptr);
-}
+++ /dev/null
-/*
- * M-profile MVE Operations
- *
- * Copyright (c) 2021 Linaro, Ltd.
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "qemu/osdep.h"
-#include "cpu.h"
-#include "internals.h"
-#include "vec_internal.h"
-#include "exec/helper-proto.h"
-#include "exec/cpu_ldst.h"
-#include "exec/exec-all.h"
-#include "tcg/tcg.h"
-#include "fpu/softfloat.h"
-
-static uint16_t mve_eci_mask(CPUARMState *env)
-{
- /*
- * Return the mask of which elements in the MVE vector correspond
- * to beats still to be executed: 1 bits for the lanes this execution
- * will process, 0 bits where ECI says the beat was already completed.
- */
- int eci;
-
- if ((env->condexec_bits & 0xf) != 0) {
- return 0xffff;
- }
-
- eci = env->condexec_bits >> 4;
- switch (eci) {
- case ECI_NONE:
- return 0xffff;
- case ECI_A0:
- return 0xfff0;
- case ECI_A0A1:
- return 0xff00;
- case ECI_A0A1A2:
- case ECI_A0A1A2B0:
- return 0xf000;
- default:
- g_assert_not_reached();
- }
-}
-
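An editorial illustration (not from the deleted source): each beat covers four bytes of the 16-byte vector, so every already-completed beat clears four low bits of the mask; the A0A1A2B0 encoding also yields 0xf000 because only the current insn's A3 beat remains. The helper name below is invented:

    #include <assert.h>
    #include <stdint.h>

    /* Each beat covers 4 bytes of the vector, i.e. 4 bits of the mask. */
    static uint16_t beats_remaining_mask(int beats_already_done)
    {
        return 0xffff << (4 * beats_already_done);   /* truncated to 16 bits */
    }

    int main(void)
    {
        assert(beats_remaining_mask(0) == 0xffff);   /* ECI_NONE   */
        assert(beats_remaining_mask(1) == 0xfff0);   /* ECI_A0     */
        assert(beats_remaining_mask(2) == 0xff00);   /* ECI_A0A1   */
        assert(beats_remaining_mask(3) == 0xf000);   /* ECI_A0A1A2 */
        return 0;
    }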
-static uint16_t mve_element_mask(CPUARMState *env)
-{
- /*
- * Return the mask of which elements in the MVE vector should be
- * updated. This is a combination of multiple things:
- * (1) by default, we update every lane in the vector
- * (2) VPT predication stores its state in the VPR register;
- * (3) low-overhead-branch tail predication will mask out part
- * of the vector on the final iteration of the loop
- * (4) if EPSR.ECI is set then we must execute only some beats
- * of the insn
- * We combine all these into a 16-bit result with the same semantics
- * as VPR.P0: 0 to mask the lane, 1 if it is active.
- * 8-bit vector ops will look at all bits of the result;
- * 16-bit ops will look at bits 0, 2, 4, ...;
- * 32-bit ops will look at bits 0, 4, 8 and 12.
- * Compare pseudocode GetCurInstrBeat(), though that only returns
- * the 4-bit slice of the mask corresponding to a single beat.
- */
- uint16_t mask = FIELD_EX32(env->v7m.vpr, V7M_VPR, P0);
-
- if (!(env->v7m.vpr & R_V7M_VPR_MASK01_MASK)) {
- mask |= 0xff;
- }
- if (!(env->v7m.vpr & R_V7M_VPR_MASK23_MASK)) {
- mask |= 0xff00;
- }
-
- if (env->v7m.ltpsize < 4 &&
- env->regs[14] <= (1 << (4 - env->v7m.ltpsize))) {
- /*
- * Tail predication active, and this is the last loop iteration.
- * The element size is (1 << ltpsize), and we only want to process
- * loopcount elements, so we want to retain the least significant
- * (loopcount * esize) predicate bits and zero out bits above that.
- */
- int masklen = env->regs[14] << env->v7m.ltpsize;
- assert(masklen <= 16);
- uint16_t ltpmask = masklen ? MAKE_64BIT_MASK(0, masklen) : 0;
- mask &= ltpmask;
- }
-
- /*
- * ECI bits indicate which beats are already executed;
- * we handle this by effectively predicating them out.
- */
- mask &= mve_eci_mask(env);
- return mask;
-}
-
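Editorial note, not part of the removed file: a worked example of the tail-predication clamp above. With 16-bit elements (ltpsize 1) and LR = 3 elements remaining, masklen is 6, so only the predicate bits of the first three halfword lanes survive; with 32-bit elements and LR = 2, only word lanes 0 and 1 do:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        /* Final loop iteration, 16-bit elements: ltpsize = 1, LR = 3 left. */
        unsigned ltpsize = 1, lr = 3;
        unsigned masklen = lr << ltpsize;                      /* 6 */
        /* (1u << masklen) - 1 mirrors MAKE_64BIT_MASK(0, masklen) here. */
        uint16_t ltpmask = masklen ? (1u << masklen) - 1 : 0;  /* 0x003f */
        assert(ltpmask == 0x003f);   /* halfword lanes 0..2 stay active */

        /* 32-bit elements: ltpsize = 2, LR = 2 left. */
        ltpsize = 2;
        lr = 2;
        masklen = lr << ltpsize;                               /* 8 */
        ltpmask = masklen ? (1u << masklen) - 1 : 0;           /* 0x00ff */
        assert(ltpmask == 0x00ff);   /* word lanes 0 and 1 stay active */
        return 0;
    }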
-static void mve_advance_vpt(CPUARMState *env)
-{
- /* Advance the VPT and ECI state if necessary */
- uint32_t vpr = env->v7m.vpr;
- unsigned mask01, mask23;
- uint16_t inv_mask;
- uint16_t eci_mask = mve_eci_mask(env);
-
- if ((env->condexec_bits & 0xf) == 0) {
- env->condexec_bits = (env->condexec_bits == (ECI_A0A1A2B0 << 4)) ?
- (ECI_A0 << 4) : (ECI_NONE << 4);
- }
-
- if (!(vpr & (R_V7M_VPR_MASK01_MASK | R_V7M_VPR_MASK23_MASK))) {
- /* VPT not enabled, nothing to do */
- return;
- }
-
- /* Invert P0 bits if needed, but only for beats we actually executed */
- mask01 = FIELD_EX32(vpr, V7M_VPR, MASK01);
- mask23 = FIELD_EX32(vpr, V7M_VPR, MASK23);
- /* Start by assuming we invert all bits corresponding to executed beats */
- inv_mask = eci_mask;
- if (mask01 <= 8) {
- /* MASK01 says don't invert low half of P0 */
- inv_mask &= ~0xff;
- }
- if (mask23 <= 8) {
- /* MASK23 says don't invert high half of P0 */
- inv_mask &= ~0xff00;
- }
- vpr ^= inv_mask;
- /* Only update MASK01 if beat 1 executed */
- if (eci_mask & 0xf0) {
- vpr = FIELD_DP32(vpr, V7M_VPR, MASK01, mask01 << 1);
- }
- /* Beat 3 always executes, so update MASK23 */
- vpr = FIELD_DP32(vpr, V7M_VPR, MASK23, mask23 << 1);
- env->v7m.vpr = vpr;
-}
-
-/* For loads, predicated lanes are zeroed instead of keeping their old values */
-#define DO_VLDR(OP, MSIZE, LDTYPE, ESIZE, TYPE) \
- void HELPER(mve_##OP)(CPUARMState *env, void *vd, uint32_t addr) \
- { \
- TYPE *d = vd; \
- uint16_t mask = mve_element_mask(env); \
- uint16_t eci_mask = mve_eci_mask(env); \
- unsigned b, e; \
- /* \
- * R_SXTM allows the dest reg to become UNKNOWN for abandoned \
- * beats so we don't care if we update part of the dest and \
- * then take an exception. \
- */ \
- for (b = 0, e = 0; b < 16; b += ESIZE, e++) { \
- if (eci_mask & (1 << b)) { \
- d[H##ESIZE(e)] = (mask & (1 << b)) ? \
- cpu_##LDTYPE##_data_ra(env, addr, GETPC()) : 0; \
- } \
- addr += MSIZE; \
- } \
- mve_advance_vpt(env); \
- }
-
-#define DO_VSTR(OP, MSIZE, STTYPE, ESIZE, TYPE) \
- void HELPER(mve_##OP)(CPUARMState *env, void *vd, uint32_t addr) \
- { \
- TYPE *d = vd; \
- uint16_t mask = mve_element_mask(env); \
- unsigned b, e; \
- for (b = 0, e = 0; b < 16; b += ESIZE, e++) { \
- if (mask & (1 << b)) { \
- cpu_##STTYPE##_data_ra(env, addr, d[H##ESIZE(e)], GETPC()); \
- } \
- addr += MSIZE; \
- } \
- mve_advance_vpt(env); \
- }
-
-DO_VLDR(vldrb, 1, ldub, 1, uint8_t)
-DO_VLDR(vldrh, 2, lduw, 2, uint16_t)
-DO_VLDR(vldrw, 4, ldl, 4, uint32_t)
-
-DO_VSTR(vstrb, 1, stb, 1, uint8_t)
-DO_VSTR(vstrh, 2, stw, 2, uint16_t)
-DO_VSTR(vstrw, 4, stl, 4, uint32_t)
-
-DO_VLDR(vldrb_sh, 1, ldsb, 2, int16_t)
-DO_VLDR(vldrb_sw, 1, ldsb, 4, int32_t)
-DO_VLDR(vldrb_uh, 1, ldub, 2, uint16_t)
-DO_VLDR(vldrb_uw, 1, ldub, 4, uint32_t)
-DO_VLDR(vldrh_sw, 2, ldsw, 4, int32_t)
-DO_VLDR(vldrh_uw, 2, lduw, 4, uint32_t)
-
-DO_VSTR(vstrb_h, 1, stb, 2, int16_t)
-DO_VSTR(vstrb_w, 1, stb, 4, int32_t)
-DO_VSTR(vstrh_w, 2, stw, 4, int32_t)
-
-#undef DO_VLDR
-#undef DO_VSTR
-
-/*
- * Gather loads/scatter stores. Here each element of Qm specifies
- * an offset to use from the base register Rm. In the _os_ versions
- * that offset is scaled by the element size.
- * For loads, predicated lanes are zeroed instead of retaining
- * their previous values.
- */
-#define DO_VLDR_SG(OP, LDTYPE, ESIZE, TYPE, OFFTYPE, ADDRFN, WB) \
- void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \
- uint32_t base) \
- { \
- TYPE *d = vd; \
- OFFTYPE *m = vm; \
- uint16_t mask = mve_element_mask(env); \
- uint16_t eci_mask = mve_eci_mask(env); \
- unsigned e; \
- uint32_t addr; \
- for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE, eci_mask >>= ESIZE) { \
- if (!(eci_mask & 1)) { \
- continue; \
- } \
- addr = ADDRFN(base, m[H##ESIZE(e)]); \
- d[H##ESIZE(e)] = (mask & 1) ? \
- cpu_##LDTYPE##_data_ra(env, addr, GETPC()) : 0; \
- if (WB) { \
- m[H##ESIZE(e)] = addr; \
- } \
- } \
- mve_advance_vpt(env); \
- }
-
-/* We know here TYPE is unsigned so always the same as the offset type */
-#define DO_VSTR_SG(OP, STTYPE, ESIZE, TYPE, ADDRFN, WB) \
- void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \
- uint32_t base) \
- { \
- TYPE *d = vd; \
- TYPE *m = vm; \
- uint16_t mask = mve_element_mask(env); \
- uint16_t eci_mask = mve_eci_mask(env); \
- unsigned e; \
- uint32_t addr; \
- for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE, eci_mask >>= ESIZE) { \
- if (!(eci_mask & 1)) { \
- continue; \
- } \
- addr = ADDRFN(base, m[H##ESIZE(e)]); \
- if (mask & 1) { \
- cpu_##STTYPE##_data_ra(env, addr, d[H##ESIZE(e)], GETPC()); \
- } \
- if (WB) { \
- m[H##ESIZE(e)] = addr; \
- } \
- } \
- mve_advance_vpt(env); \
- }
-
-/*
- * 64-bit accesses are slightly different: they are done as two 32-bit
- * accesses, controlled by the predicate mask for the relevant beat,
- * and with a single 32-bit offset in the first of the two Qm elements.
- * Note that for QEMU our IMPDEF AIRCR.ENDIANNESS is always 0 (little).
- * Address writeback happens on the odd beats and updates the address
- * stored in the even-beat element.
- */
-#define DO_VLDR64_SG(OP, ADDRFN, WB) \
- void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \
- uint32_t base) \
- { \
- uint32_t *d = vd; \
- uint32_t *m = vm; \
- uint16_t mask = mve_element_mask(env); \
- uint16_t eci_mask = mve_eci_mask(env); \
- unsigned e; \
- uint32_t addr; \
- for (e = 0; e < 16 / 4; e++, mask >>= 4, eci_mask >>= 4) { \
- if (!(eci_mask & 1)) { \
- continue; \
- } \
- addr = ADDRFN(base, m[H4(e & ~1)]); \
- addr += 4 * (e & 1); \
- d[H4(e)] = (mask & 1) ? cpu_ldl_data_ra(env, addr, GETPC()) : 0; \
- if (WB && (e & 1)) { \
- m[H4(e & ~1)] = addr - 4; \
- } \
- } \
- mve_advance_vpt(env); \
- }
-
-#define DO_VSTR64_SG(OP, ADDRFN, WB) \
- void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \
- uint32_t base) \
- { \
- uint32_t *d = vd; \
- uint32_t *m = vm; \
- uint16_t mask = mve_element_mask(env); \
- uint16_t eci_mask = mve_eci_mask(env); \
- unsigned e; \
- uint32_t addr; \
- for (e = 0; e < 16 / 4; e++, mask >>= 4, eci_mask >>= 4) { \
- if (!(eci_mask & 1)) { \
- continue; \
- } \
- addr = ADDRFN(base, m[H4(e & ~1)]); \
- addr += 4 * (e & 1); \
- if (mask & 1) { \
- cpu_stl_data_ra(env, addr, d[H4(e)], GETPC()); \
- } \
- if (WB && (e & 1)) { \
- m[H4(e & ~1)] = addr - 4; \
- } \
- } \
- mve_advance_vpt(env); \
- }
-
-#define ADDR_ADD(BASE, OFFSET) ((BASE) + (OFFSET))
-#define ADDR_ADD_OSH(BASE, OFFSET) ((BASE) + ((OFFSET) << 1))
-#define ADDR_ADD_OSW(BASE, OFFSET) ((BASE) + ((OFFSET) << 2))
-#define ADDR_ADD_OSD(BASE, OFFSET) ((BASE) + ((OFFSET) << 3))
-
-DO_VLDR_SG(vldrb_sg_sh, ldsb, 2, int16_t, uint16_t, ADDR_ADD, false)
-DO_VLDR_SG(vldrb_sg_sw, ldsb, 4, int32_t, uint32_t, ADDR_ADD, false)
-DO_VLDR_SG(vldrh_sg_sw, ldsw, 4, int32_t, uint32_t, ADDR_ADD, false)
-
-DO_VLDR_SG(vldrb_sg_ub, ldub, 1, uint8_t, uint8_t, ADDR_ADD, false)
-DO_VLDR_SG(vldrb_sg_uh, ldub, 2, uint16_t, uint16_t, ADDR_ADD, false)
-DO_VLDR_SG(vldrb_sg_uw, ldub, 4, uint32_t, uint32_t, ADDR_ADD, false)
-DO_VLDR_SG(vldrh_sg_uh, lduw, 2, uint16_t, uint16_t, ADDR_ADD, false)
-DO_VLDR_SG(vldrh_sg_uw, lduw, 4, uint32_t, uint32_t, ADDR_ADD, false)
-DO_VLDR_SG(vldrw_sg_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD, false)
-DO_VLDR64_SG(vldrd_sg_ud, ADDR_ADD, false)
-
-DO_VLDR_SG(vldrh_sg_os_sw, ldsw, 4, int32_t, uint32_t, ADDR_ADD_OSH, false)
-DO_VLDR_SG(vldrh_sg_os_uh, lduw, 2, uint16_t, uint16_t, ADDR_ADD_OSH, false)
-DO_VLDR_SG(vldrh_sg_os_uw, lduw, 4, uint32_t, uint32_t, ADDR_ADD_OSH, false)
-DO_VLDR_SG(vldrw_sg_os_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD_OSW, false)
-DO_VLDR64_SG(vldrd_sg_os_ud, ADDR_ADD_OSD, false)
-
-DO_VSTR_SG(vstrb_sg_ub, stb, 1, uint8_t, ADDR_ADD, false)
-DO_VSTR_SG(vstrb_sg_uh, stb, 2, uint16_t, ADDR_ADD, false)
-DO_VSTR_SG(vstrb_sg_uw, stb, 4, uint32_t, ADDR_ADD, false)
-DO_VSTR_SG(vstrh_sg_uh, stw, 2, uint16_t, ADDR_ADD, false)
-DO_VSTR_SG(vstrh_sg_uw, stw, 4, uint32_t, ADDR_ADD, false)
-DO_VSTR_SG(vstrw_sg_uw, stl, 4, uint32_t, ADDR_ADD, false)
-DO_VSTR64_SG(vstrd_sg_ud, ADDR_ADD, false)
-
-DO_VSTR_SG(vstrh_sg_os_uh, stw, 2, uint16_t, ADDR_ADD_OSH, false)
-DO_VSTR_SG(vstrh_sg_os_uw, stw, 4, uint32_t, ADDR_ADD_OSH, false)
-DO_VSTR_SG(vstrw_sg_os_uw, stl, 4, uint32_t, ADDR_ADD_OSW, false)
-DO_VSTR64_SG(vstrd_sg_os_ud, ADDR_ADD_OSD, false)
-
-DO_VLDR_SG(vldrw_sg_wb_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD, true)
-DO_VLDR64_SG(vldrd_sg_wb_ud, ADDR_ADD, true)
-DO_VSTR_SG(vstrw_sg_wb_uw, stl, 4, uint32_t, ADDR_ADD, true)
-DO_VSTR64_SG(vstrd_sg_wb_ud, ADDR_ADD, true)
-
-/*
- * Deinterleaving loads/interleaving stores.
- *
- * For these helpers we are passed the index of the first Qreg
- * (VLD2/VST2 will also access Qn+1, VLD4/VST4 access Qn .. Qn+3)
- * and the value of the base address register Rn.
- * The helpers are specialized for pattern and element size, so
- * for instance vld42h is VLD4 with pattern 2, element size MO_16.
- *
- * These insns are beatwise but not predicated, so we must honour ECI,
- * but need not look at mve_element_mask().
- *
- * The pseudocode implements these insns with multiple memory accesses
- * of the element size, but rules R_VVVG and R_FXDM permit us to make
- * one 32-bit memory access per beat.
- */
-#define DO_VLD4B(OP, O1, O2, O3, O4) \
- void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \
- uint32_t base) \
- { \
- int beat, e; \
- uint16_t mask = mve_eci_mask(env); \
- static const uint8_t off[4] = { O1, O2, O3, O4 }; \
- uint32_t addr, data; \
- for (beat = 0; beat < 4; beat++, mask >>= 4) { \
- if ((mask & 1) == 0) { \
- /* ECI says skip this beat */ \
- continue; \
- } \
- addr = base + off[beat] * 4; \
- data = cpu_ldl_le_data_ra(env, addr, GETPC()); \
- for (e = 0; e < 4; e++, data >>= 8) { \
- uint8_t *qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + e); \
- qd[H1(off[beat])] = data; \
- } \
- } \
- }
-
-#define DO_VLD4H(OP, O1, O2) \
- void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \
- uint32_t base) \
- { \
- int beat; \
- uint16_t mask = mve_eci_mask(env); \
- static const uint8_t off[4] = { O1, O1, O2, O2 }; \
- uint32_t addr, data; \
- int y; /* y counts 0 2 0 2 */ \
- uint16_t *qd; \
- for (beat = 0, y = 0; beat < 4; beat++, mask >>= 4, y ^= 2) { \
- if ((mask & 1) == 0) { \
- /* ECI says skip this beat */ \
- continue; \
- } \
- addr = base + off[beat] * 8 + (beat & 1) * 4; \
- data = cpu_ldl_le_data_ra(env, addr, GETPC()); \
- qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + y); \
- qd[H2(off[beat])] = data; \
- data >>= 16; \
- qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + y + 1); \
- qd[H2(off[beat])] = data; \
- } \
- }
-
-#define DO_VLD4W(OP, O1, O2, O3, O4) \
- void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \
- uint32_t base) \
- { \
- int beat; \
- uint16_t mask = mve_eci_mask(env); \
- static const uint8_t off[4] = { O1, O2, O3, O4 }; \
- uint32_t addr, data; \
- uint32_t *qd; \
- int y; \
- for (beat = 0; beat < 4; beat++, mask >>= 4) { \
- if ((mask & 1) == 0) { \
- /* ECI says skip this beat */ \
- continue; \
- } \
- addr = base + off[beat] * 4; \
- data = cpu_ldl_le_data_ra(env, addr, GETPC()); \
- y = (beat + (O1 & 2)) & 3; \
- qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + y); \
- qd[H4(off[beat] >> 2)] = data; \
- } \
- }
-
-DO_VLD4B(vld40b, 0, 1, 10, 11)
-DO_VLD4B(vld41b, 2, 3, 12, 13)
-DO_VLD4B(vld42b, 4, 5, 14, 15)
-DO_VLD4B(vld43b, 6, 7, 8, 9)
-
-DO_VLD4H(vld40h, 0, 5)
-DO_VLD4H(vld41h, 1, 6)
-DO_VLD4H(vld42h, 2, 7)
-DO_VLD4H(vld43h, 3, 4)
-
-DO_VLD4W(vld40w, 0, 1, 10, 11)
-DO_VLD4W(vld41w, 2, 3, 12, 13)
-DO_VLD4W(vld42w, 4, 5, 14, 15)
-DO_VLD4W(vld43w, 6, 7, 8, 9)
-
-#define DO_VLD2B(OP, O1, O2, O3, O4) \
- void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \
- uint32_t base) \
- { \
- int beat, e; \
- uint16_t mask = mve_eci_mask(env); \
- static const uint8_t off[4] = { O1, O2, O3, O4 }; \
- uint32_t addr, data; \
- uint8_t *qd; \
- for (beat = 0; beat < 4; beat++, mask >>= 4) { \
- if ((mask & 1) == 0) { \
- /* ECI says skip this beat */ \
- continue; \
- } \
- addr = base + off[beat] * 2; \
- data = cpu_ldl_le_data_ra(env, addr, GETPC()); \
- for (e = 0; e < 4; e++, data >>= 8) { \
- qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + (e & 1)); \
- qd[H1(off[beat] + (e >> 1))] = data; \
- } \
- } \
- }
-
-#define DO_VLD2H(OP, O1, O2, O3, O4) \
- void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \
- uint32_t base) \
- { \
- int beat; \
- uint16_t mask = mve_eci_mask(env); \
- static const uint8_t off[4] = { O1, O2, O3, O4 }; \
- uint32_t addr, data; \
- int e; \
- uint16_t *qd; \
- for (beat = 0; beat < 4; beat++, mask >>= 4) { \
- if ((mask & 1) == 0) { \
- /* ECI says skip this beat */ \
- continue; \
- } \
- addr = base + off[beat] * 4; \
- data = cpu_ldl_le_data_ra(env, addr, GETPC()); \
- for (e = 0; e < 2; e++, data >>= 16) { \
- qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + e); \
- qd[H2(off[beat])] = data; \
- } \
- } \
- }
-
-#define DO_VLD2W(OP, O1, O2, O3, O4) \
- void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \
- uint32_t base) \
- { \
- int beat; \
- uint16_t mask = mve_eci_mask(env); \
- static const uint8_t off[4] = { O1, O2, O3, O4 }; \
- uint32_t addr, data; \
- uint32_t *qd; \
- for (beat = 0; beat < 4; beat++, mask >>= 4) { \
- if ((mask & 1) == 0) { \
- /* ECI says skip this beat */ \
- continue; \
- } \
- addr = base + off[beat]; \
- data = cpu_ldl_le_data_ra(env, addr, GETPC()); \
- qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + (beat & 1)); \
- qd[H4(off[beat] >> 3)] = data; \
- } \
- }
-
-DO_VLD2B(vld20b, 0, 2, 12, 14)
-DO_VLD2B(vld21b, 4, 6, 8, 10)
-
-DO_VLD2H(vld20h, 0, 1, 6, 7)
-DO_VLD2H(vld21h, 2, 3, 4, 5)
-
-DO_VLD2W(vld20w, 0, 4, 24, 28)
-DO_VLD2W(vld21w, 8, 12, 16, 20)
-
-#define DO_VST4B(OP, O1, O2, O3, O4) \
- void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \
- uint32_t base) \
- { \
- int beat, e; \
- uint16_t mask = mve_eci_mask(env); \
- static const uint8_t off[4] = { O1, O2, O3, O4 }; \
- uint32_t addr, data; \
- for (beat = 0; beat < 4; beat++, mask >>= 4) { \
- if ((mask & 1) == 0) { \
- /* ECI says skip this beat */ \
- continue; \
- } \
- addr = base + off[beat] * 4; \
- data = 0; \
- for (e = 3; e >= 0; e--) { \
- uint8_t *qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + e); \
- data = (data << 8) | qd[H1(off[beat])]; \
- } \
- cpu_stl_le_data_ra(env, addr, data, GETPC()); \
- } \
- }
-
-#define DO_VST4H(OP, O1, O2) \
- void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \
- uint32_t base) \
- { \
- int beat; \
- uint16_t mask = mve_eci_mask(env); \
- static const uint8_t off[4] = { O1, O1, O2, O2 }; \
- uint32_t addr, data; \
- int y; /* y counts 0 2 0 2 */ \
- uint16_t *qd; \
- for (beat = 0, y = 0; beat < 4; beat++, mask >>= 4, y ^= 2) { \
- if ((mask & 1) == 0) { \
- /* ECI says skip this beat */ \
- continue; \
- } \
- addr = base + off[beat] * 8 + (beat & 1) * 4; \
- qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + y); \
- data = qd[H2(off[beat])]; \
- qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + y + 1); \
- data |= qd[H2(off[beat])] << 16; \
- cpu_stl_le_data_ra(env, addr, data, GETPC()); \
- } \
- }
-
-#define DO_VST4W(OP, O1, O2, O3, O4) \
- void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \
- uint32_t base) \
- { \
- int beat; \
- uint16_t mask = mve_eci_mask(env); \
- static const uint8_t off[4] = { O1, O2, O3, O4 }; \
- uint32_t addr, data; \
- uint32_t *qd; \
- int y; \
- for (beat = 0; beat < 4; beat++, mask >>= 4) { \
- if ((mask & 1) == 0) { \
- /* ECI says skip this beat */ \
- continue; \
- } \
- addr = base + off[beat] * 4; \
- y = (beat + (O1 & 2)) & 3; \
- qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + y); \
- data = qd[H4(off[beat] >> 2)]; \
- cpu_stl_le_data_ra(env, addr, data, GETPC()); \
- } \
- }
-
-DO_VST4B(vst40b, 0, 1, 10, 11)
-DO_VST4B(vst41b, 2, 3, 12, 13)
-DO_VST4B(vst42b, 4, 5, 14, 15)
-DO_VST4B(vst43b, 6, 7, 8, 9)
-
-DO_VST4H(vst40h, 0, 5)
-DO_VST4H(vst41h, 1, 6)
-DO_VST4H(vst42h, 2, 7)
-DO_VST4H(vst43h, 3, 4)
-
-DO_VST4W(vst40w, 0, 1, 10, 11)
-DO_VST4W(vst41w, 2, 3, 12, 13)
-DO_VST4W(vst42w, 4, 5, 14, 15)
-DO_VST4W(vst43w, 6, 7, 8, 9)
-
-#define DO_VST2B(OP, O1, O2, O3, O4) \
- void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \
- uint32_t base) \
- { \
- int beat, e; \
- uint16_t mask = mve_eci_mask(env); \
- static const uint8_t off[4] = { O1, O2, O3, O4 }; \
- uint32_t addr, data; \
- uint8_t *qd; \
- for (beat = 0; beat < 4; beat++, mask >>= 4) { \
- if ((mask & 1) == 0) { \
- /* ECI says skip this beat */ \
- continue; \
- } \
- addr = base + off[beat] * 2; \
- data = 0; \
- for (e = 3; e >= 0; e--) { \
- qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + (e & 1)); \
- data = (data << 8) | qd[H1(off[beat] + (e >> 1))]; \
- } \
- cpu_stl_le_data_ra(env, addr, data, GETPC()); \
- } \
- }
-
-#define DO_VST2H(OP, O1, O2, O3, O4) \
- void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \
- uint32_t base) \
- { \
- int beat; \
- uint16_t mask = mve_eci_mask(env); \
- static const uint8_t off[4] = { O1, O2, O3, O4 }; \
- uint32_t addr, data; \
- int e; \
- uint16_t *qd; \
- for (beat = 0; beat < 4; beat++, mask >>= 4) { \
- if ((mask & 1) == 0) { \
- /* ECI says skip this beat */ \
- continue; \
- } \
- addr = base + off[beat] * 4; \
- data = 0; \
- for (e = 1; e >= 0; e--) { \
- qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + e); \
- data = (data << 16) | qd[H2(off[beat])]; \
- } \
- cpu_stl_le_data_ra(env, addr, data, GETPC()); \
- } \
- }
-
-#define DO_VST2W(OP, O1, O2, O3, O4) \
- void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \
- uint32_t base) \
- { \
- int beat; \
- uint16_t mask = mve_eci_mask(env); \
- static const uint8_t off[4] = { O1, O2, O3, O4 }; \
- uint32_t addr, data; \
- uint32_t *qd; \
- for (beat = 0; beat < 4; beat++, mask >>= 4) { \
- if ((mask & 1) == 0) { \
- /* ECI says skip this beat */ \
- continue; \
- } \
- addr = base + off[beat]; \
- qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + (beat & 1)); \
- data = qd[H4(off[beat] >> 3)]; \
- cpu_stl_le_data_ra(env, addr, data, GETPC()); \
- } \
- }
-
-DO_VST2B(vst20b, 0, 2, 12, 14)
-DO_VST2B(vst21b, 4, 6, 8, 10)
-
-DO_VST2H(vst20h, 0, 1, 6, 7)
-DO_VST2H(vst21h, 2, 3, 4, 5)
-
-DO_VST2W(vst20w, 0, 4, 24, 28)
-DO_VST2W(vst21w, 8, 12, 16, 20)
-
-/*
- * The mergemask(D, R, M) macro performs the operation "*D = R" but
- * storing only the bytes which correspond to 1 bits in M,
- * leaving other bytes in *D unchanged. We use _Generic
- * to select the correct implementation based on the type of D.
- */
-
-static void mergemask_ub(uint8_t *d, uint8_t r, uint16_t mask)
-{
- if (mask & 1) {
- *d = r;
- }
-}
-
-static void mergemask_sb(int8_t *d, int8_t r, uint16_t mask)
-{
- mergemask_ub((uint8_t *)d, r, mask);
-}
-
-static void mergemask_uh(uint16_t *d, uint16_t r, uint16_t mask)
-{
- uint16_t bmask = expand_pred_b(mask);
- *d = (*d & ~bmask) | (r & bmask);
-}
-
-static void mergemask_sh(int16_t *d, int16_t r, uint16_t mask)
-{
- mergemask_uh((uint16_t *)d, r, mask);
-}
-
-static void mergemask_uw(uint32_t *d, uint32_t r, uint16_t mask)
-{
- uint32_t bmask = expand_pred_b(mask);
- *d = (*d & ~bmask) | (r & bmask);
-}
-
-static void mergemask_sw(int32_t *d, int32_t r, uint16_t mask)
-{
- mergemask_uw((uint32_t *)d, r, mask);
-}
-
-static void mergemask_uq(uint64_t *d, uint64_t r, uint16_t mask)
-{
- uint64_t bmask = expand_pred_b(mask);
- *d = (*d & ~bmask) | (r & bmask);
-}
-
-static void mergemask_sq(int64_t *d, int64_t r, uint16_t mask)
-{
- mergemask_uq((uint64_t *)d, r, mask);
-}
-
-#define mergemask(D, R, M) \
- _Generic(D, \
- uint8_t *: mergemask_ub, \
- int8_t *: mergemask_sb, \
- uint16_t *: mergemask_uh, \
- int16_t *: mergemask_sh, \
- uint32_t *: mergemask_uw, \
- int32_t *: mergemask_sw, \
- uint64_t *: mergemask_uq, \
- int64_t *: mergemask_sq)(D, R, M)
-
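As an editorial sketch (not recovered source): expand_pred_b(), shared via vec_internal.h, turns each predicate bit into a full byte of mask, which is what lets the 16-, 32- and 64-bit mergemask variants merge per byte. A standalone restatement with an invented helper name:

    #include <assert.h>
    #include <stdint.h>

    /* Predicate bit i becomes byte i of the mask (sketch of expand_pred_b). */
    static uint64_t expand_bits_to_bytes(uint16_t pred, int nbytes)
    {
        uint64_t m = 0;
        for (int i = 0; i < nbytes; i++) {
            if (pred & (1u << i)) {
                m |= 0xffull << (8 * i);
            }
        }
        return m;
    }

    int main(void)
    {
        /* A 32-bit lane whose four byte-predicate bits are 0b0011. */
        uint32_t d = 0xaabbccdd, r = 0x11223344;
        uint32_t bmask = expand_bits_to_bytes(0x3, 4);   /* 0x0000ffff */
        d = (d & ~bmask) | (r & bmask);
        assert(d == 0xaabb3344);                         /* low half merged only */
        return 0;
    }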
-void HELPER(mve_vdup)(CPUARMState *env, void *vd, uint32_t val)
-{
- /*
- * The generated code already replicated an 8 or 16 bit constant
- * into the 32-bit value, so we only need to write the 32-bit
- * value to all elements of the Qreg, allowing for predication.
- */
- uint32_t *d = vd;
- uint16_t mask = mve_element_mask(env);
- unsigned e;
- for (e = 0; e < 16 / 4; e++, mask >>= 4) {
- mergemask(&d[H4(e)], val, mask);
- }
- mve_advance_vpt(env);
-}
-
-#define DO_1OP(OP, ESIZE, TYPE, FN) \
- void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm) \
- { \
- TYPE *d = vd, *m = vm; \
- uint16_t mask = mve_element_mask(env); \
- unsigned e; \
- for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
- mergemask(&d[H##ESIZE(e)], FN(m[H##ESIZE(e)]), mask); \
- } \
- mve_advance_vpt(env); \
- }
-
-#define DO_CLS_B(N) (clrsb32(N) - 24)
-#define DO_CLS_H(N) (clrsb32(N) - 16)
-
-DO_1OP(vclsb, 1, int8_t, DO_CLS_B)
-DO_1OP(vclsh, 2, int16_t, DO_CLS_H)
-DO_1OP(vclsw, 4, int32_t, clrsb32)
-
-#define DO_CLZ_B(N) (clz32(N) - 24)
-#define DO_CLZ_H(N) (clz32(N) - 16)
-
-DO_1OP(vclzb, 1, uint8_t, DO_CLZ_B)
-DO_1OP(vclzh, 2, uint16_t, DO_CLZ_H)
-DO_1OP(vclzw, 4, uint32_t, clz32)
-
-DO_1OP(vrev16b, 2, uint16_t, bswap16)
-DO_1OP(vrev32b, 4, uint32_t, bswap32)
-DO_1OP(vrev32h, 4, uint32_t, hswap32)
-DO_1OP(vrev64b, 8, uint64_t, bswap64)
-DO_1OP(vrev64h, 8, uint64_t, hswap64)
-DO_1OP(vrev64w, 8, uint64_t, wswap64)
-
-#define DO_NOT(N) (~(N))
-
-DO_1OP(vmvn, 8, uint64_t, DO_NOT)
-
-#define DO_ABS(N) ((N) < 0 ? -(N) : (N))
-#define DO_FABSH(N) ((N) & dup_const(MO_16, 0x7fff))
-#define DO_FABSS(N) ((N) & dup_const(MO_32, 0x7fffffff))
-
-DO_1OP(vabsb, 1, int8_t, DO_ABS)
-DO_1OP(vabsh, 2, int16_t, DO_ABS)
-DO_1OP(vabsw, 4, int32_t, DO_ABS)
-
-/* We can do these 64 bits at a time */
-DO_1OP(vfabsh, 8, uint64_t, DO_FABSH)
-DO_1OP(vfabss, 8, uint64_t, DO_FABSS)
-
-#define DO_NEG(N) (-(N))
-#define DO_FNEGH(N) ((N) ^ dup_const(MO_16, 0x8000))
-#define DO_FNEGS(N) ((N) ^ dup_const(MO_32, 0x80000000))
-
-DO_1OP(vnegb, 1, int8_t, DO_NEG)
-DO_1OP(vnegh, 2, int16_t, DO_NEG)
-DO_1OP(vnegw, 4, int32_t, DO_NEG)
-
-/* We can do these 64 bits at a time */
-DO_1OP(vfnegh, 8, uint64_t, DO_FNEGH)
-DO_1OP(vfnegs, 8, uint64_t, DO_FNEGS)
-
-/*
- * 1 operand immediates: Vda is destination and possibly also one source.
- * All these insns work at 64-bit widths.
- */
-#define DO_1OP_IMM(OP, FN) \
- void HELPER(mve_##OP)(CPUARMState *env, void *vda, uint64_t imm) \
- { \
- uint64_t *da = vda; \
- uint16_t mask = mve_element_mask(env); \
- unsigned e; \
- for (e = 0; e < 16 / 8; e++, mask >>= 8) { \
- mergemask(&da[H8(e)], FN(da[H8(e)], imm), mask); \
- } \
- mve_advance_vpt(env); \
- }
-
-#define DO_MOVI(N, I) (I)
-#define DO_ANDI(N, I) ((N) & (I))
-#define DO_ORRI(N, I) ((N) | (I))
-
-DO_1OP_IMM(vmovi, DO_MOVI)
-DO_1OP_IMM(vandi, DO_ANDI)
-DO_1OP_IMM(vorri, DO_ORRI)
-
-#define DO_2OP(OP, ESIZE, TYPE, FN) \
- void HELPER(glue(mve_, OP))(CPUARMState *env, \
- void *vd, void *vn, void *vm) \
- { \
- TYPE *d = vd, *n = vn, *m = vm; \
- uint16_t mask = mve_element_mask(env); \
- unsigned e; \
- for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
- mergemask(&d[H##ESIZE(e)], \
- FN(n[H##ESIZE(e)], m[H##ESIZE(e)]), mask); \
- } \
- mve_advance_vpt(env); \
- }
-
-/* provide unsigned 2-op helpers for all sizes */
-#define DO_2OP_U(OP, FN) \
- DO_2OP(OP##b, 1, uint8_t, FN) \
- DO_2OP(OP##h, 2, uint16_t, FN) \
- DO_2OP(OP##w, 4, uint32_t, FN)
-
-/* provide signed 2-op helpers for all sizes */
-#define DO_2OP_S(OP, FN) \
- DO_2OP(OP##b, 1, int8_t, FN) \
- DO_2OP(OP##h, 2, int16_t, FN) \
- DO_2OP(OP##w, 4, int32_t, FN)
-
-/*
- * "Long" operations where two half-sized inputs (taken from either the
- * top or the bottom of the input vector) produce a double-width result.
- * Here ESIZE, TYPE are for the input, and LESIZE, LTYPE for the output.
- */
-#define DO_2OP_L(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN) \
- void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, void *vm) \
- { \
- LTYPE *d = vd; \
- TYPE *n = vn, *m = vm; \
- uint16_t mask = mve_element_mask(env); \
- unsigned le; \
- for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \
- LTYPE r = FN((LTYPE)n[H##ESIZE(le * 2 + TOP)], \
- m[H##ESIZE(le * 2 + TOP)]); \
- mergemask(&d[H##LESIZE(le)], r, mask); \
- } \
- mve_advance_vpt(env); \
- }
-
-#define DO_2OP_SAT(OP, ESIZE, TYPE, FN) \
- void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, void *vm) \
- { \
- TYPE *d = vd, *n = vn, *m = vm; \
- uint16_t mask = mve_element_mask(env); \
- unsigned e; \
- bool qc = false; \
- for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
- bool sat = false; \
- TYPE r = FN(n[H##ESIZE(e)], m[H##ESIZE(e)], &sat); \
- mergemask(&d[H##ESIZE(e)], r, mask); \
- qc |= sat & mask & 1; \
- } \
- if (qc) { \
- env->vfp.qc[0] = qc; \
- } \
- mve_advance_vpt(env); \
- }
-
-/* provide unsigned 2-op helpers for all sizes */
-#define DO_2OP_SAT_U(OP, FN) \
- DO_2OP_SAT(OP##b, 1, uint8_t, FN) \
- DO_2OP_SAT(OP##h, 2, uint16_t, FN) \
- DO_2OP_SAT(OP##w, 4, uint32_t, FN)
-
-/* provide signed 2-op helpers for all sizes */
-#define DO_2OP_SAT_S(OP, FN) \
- DO_2OP_SAT(OP##b, 1, int8_t, FN) \
- DO_2OP_SAT(OP##h, 2, int16_t, FN) \
- DO_2OP_SAT(OP##w, 4, int32_t, FN)
-
-#define DO_AND(N, M) ((N) & (M))
-#define DO_BIC(N, M) ((N) & ~(M))
-#define DO_ORR(N, M) ((N) | (M))
-#define DO_ORN(N, M) ((N) | ~(M))
-#define DO_EOR(N, M) ((N) ^ (M))
-
-DO_2OP(vand, 8, uint64_t, DO_AND)
-DO_2OP(vbic, 8, uint64_t, DO_BIC)
-DO_2OP(vorr, 8, uint64_t, DO_ORR)
-DO_2OP(vorn, 8, uint64_t, DO_ORN)
-DO_2OP(veor, 8, uint64_t, DO_EOR)
-
-#define DO_ADD(N, M) ((N) + (M))
-#define DO_SUB(N, M) ((N) - (M))
-#define DO_MUL(N, M) ((N) * (M))
-
-DO_2OP_U(vadd, DO_ADD)
-DO_2OP_U(vsub, DO_SUB)
-DO_2OP_U(vmul, DO_MUL)
-
-DO_2OP_L(vmullbsb, 0, 1, int8_t, 2, int16_t, DO_MUL)
-DO_2OP_L(vmullbsh, 0, 2, int16_t, 4, int32_t, DO_MUL)
-DO_2OP_L(vmullbsw, 0, 4, int32_t, 8, int64_t, DO_MUL)
-DO_2OP_L(vmullbub, 0, 1, uint8_t, 2, uint16_t, DO_MUL)
-DO_2OP_L(vmullbuh, 0, 2, uint16_t, 4, uint32_t, DO_MUL)
-DO_2OP_L(vmullbuw, 0, 4, uint32_t, 8, uint64_t, DO_MUL)
-
-DO_2OP_L(vmulltsb, 1, 1, int8_t, 2, int16_t, DO_MUL)
-DO_2OP_L(vmulltsh, 1, 2, int16_t, 4, int32_t, DO_MUL)
-DO_2OP_L(vmulltsw, 1, 4, int32_t, 8, int64_t, DO_MUL)
-DO_2OP_L(vmulltub, 1, 1, uint8_t, 2, uint16_t, DO_MUL)
-DO_2OP_L(vmulltuh, 1, 2, uint16_t, 4, uint32_t, DO_MUL)
-DO_2OP_L(vmulltuw, 1, 4, uint32_t, 8, uint64_t, DO_MUL)
-
-/*
- * Polynomial multiply. We can always do this generating 64 bits
- * of the result at a time, so we don't need to use DO_2OP_L.
- */
-#define VMULLPH_MASK 0x00ff00ff00ff00ffULL
-#define VMULLPW_MASK 0x0000ffff0000ffffULL
-#define DO_VMULLPBH(N, M) pmull_h((N) & VMULLPH_MASK, (M) & VMULLPH_MASK)
-#define DO_VMULLPTH(N, M) DO_VMULLPBH((N) >> 8, (M) >> 8)
-#define DO_VMULLPBW(N, M) pmull_w((N) & VMULLPW_MASK, (M) & VMULLPW_MASK)
-#define DO_VMULLPTW(N, M) DO_VMULLPBW((N) >> 16, (M) >> 16)
-
-DO_2OP(vmullpbh, 8, uint64_t, DO_VMULLPBH)
-DO_2OP(vmullpth, 8, uint64_t, DO_VMULLPTH)
-DO_2OP(vmullpbw, 8, uint64_t, DO_VMULLPBW)
-DO_2OP(vmullptw, 8, uint64_t, DO_VMULLPTW)
-
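Editorial aside: pmull_h() and pmull_w(), defined elsewhere in the Arm vector helpers rather than in this patch, perform carry-less multiplies on the masked lanes. An illustrative standalone byte-wide version (the name clmul_8x8 is invented here):

    #include <assert.h>
    #include <stdint.h>

    /* Carry-less (polynomial) multiply of two bytes, 8x8 -> 16 bits. */
    static uint16_t clmul_8x8(uint8_t n, uint8_t m)
    {
        uint16_t r = 0;
        for (int i = 0; i < 8; i++) {
            if (n & (1u << i)) {
                r ^= (uint16_t)m << i;    /* XOR instead of add: no carries */
            }
        }
        return r;
    }

    int main(void)
    {
        assert(clmul_8x8(0x03, 0x03) == 0x0005);  /* integer multiply gives 9 */
        assert(clmul_8x8(0xff, 0xff) == 0x5555);
        return 0;
    }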
-/*
- * Because the computation type is at least twice as large as required,
- * these work for both signed and unsigned source types.
- */
-static inline uint8_t do_mulh_b(int32_t n, int32_t m)
-{
- return (n * m) >> 8;
-}
-
-static inline uint16_t do_mulh_h(int32_t n, int32_t m)
-{
- return (n * m) >> 16;
-}
-
-static inline uint32_t do_mulh_w(int64_t n, int64_t m)
-{
- return (n * m) >> 32;
-}
-
-static inline uint8_t do_rmulh_b(int32_t n, int32_t m)
-{
- return (n * m + (1U << 7)) >> 8;
-}
-
-static inline uint16_t do_rmulh_h(int32_t n, int32_t m)
-{
- return (n * m + (1U << 15)) >> 16;
-}
-
-static inline uint32_t do_rmulh_w(int64_t n, int64_t m)
-{
- return (n * m + (1U << 31)) >> 32;
-}
-
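A short editorial illustration of the comment above these helpers: because the product is formed in a type at least twice as wide as the inputs, the same high-half extraction serves signed and unsigned elements. mulh_b below is an invented stand-in for do_mulh_b():

    #include <assert.h>
    #include <stdint.h>

    /* Same shape as do_mulh_b(): compute in a wide type, keep the high half. */
    static uint8_t mulh_b(int32_t n, int32_t m)
    {
        return (n * m) >> 8;
    }

    int main(void)
    {
        /* Unsigned elements: 200 * 200 = 0x9c40, high byte 0x9c. */
        uint8_t un = 200, um = 200;
        assert(mulh_b(un, um) == 0x9c);

        /* Signed elements: -128 * -128 = 0x4000, high byte 0x40. */
        int8_t sn = -128, sm = -128;
        assert((int8_t)mulh_b(sn, sm) == 0x40);
        return 0;
    }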
-DO_2OP(vmulhsb, 1, int8_t, do_mulh_b)
-DO_2OP(vmulhsh, 2, int16_t, do_mulh_h)
-DO_2OP(vmulhsw, 4, int32_t, do_mulh_w)
-DO_2OP(vmulhub, 1, uint8_t, do_mulh_b)
-DO_2OP(vmulhuh, 2, uint16_t, do_mulh_h)
-DO_2OP(vmulhuw, 4, uint32_t, do_mulh_w)
-
-DO_2OP(vrmulhsb, 1, int8_t, do_rmulh_b)
-DO_2OP(vrmulhsh, 2, int16_t, do_rmulh_h)
-DO_2OP(vrmulhsw, 4, int32_t, do_rmulh_w)
-DO_2OP(vrmulhub, 1, uint8_t, do_rmulh_b)
-DO_2OP(vrmulhuh, 2, uint16_t, do_rmulh_h)
-DO_2OP(vrmulhuw, 4, uint32_t, do_rmulh_w)
-
-#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
-#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
-
-DO_2OP_S(vmaxs, DO_MAX)
-DO_2OP_U(vmaxu, DO_MAX)
-DO_2OP_S(vmins, DO_MIN)
-DO_2OP_U(vminu, DO_MIN)
-
-#define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
-
-DO_2OP_S(vabds, DO_ABD)
-DO_2OP_U(vabdu, DO_ABD)
-
-static inline uint32_t do_vhadd_u(uint32_t n, uint32_t m)
-{
- return ((uint64_t)n + m) >> 1;
-}
-
-static inline int32_t do_vhadd_s(int32_t n, int32_t m)
-{
- return ((int64_t)n + m) >> 1;
-}
-
-static inline uint32_t do_vhsub_u(uint32_t n, uint32_t m)
-{
- return ((uint64_t)n - m) >> 1;
-}
-
-static inline int32_t do_vhsub_s(int32_t n, int32_t m)
-{
- return ((int64_t)n - m) >> 1;
-}
-
-DO_2OP_S(vhadds, do_vhadd_s)
-DO_2OP_U(vhaddu, do_vhadd_u)
-DO_2OP_S(vhsubs, do_vhsub_s)
-DO_2OP_U(vhsubu, do_vhsub_u)
-
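Editorial note, not from the removed file: the halving helpers widen to 64 bits before adding or subtracting so that the carry or borrow is not lost, and the arithmetic shift rounds toward minus infinity for the signed forms. A minimal standalone check:

    #include <assert.h>
    #include <stdint.h>

    static uint32_t vhadd_u(uint32_t n, uint32_t m)
    {
        return ((uint64_t)n + m) >> 1;     /* widen first, then halve */
    }

    static int32_t vhadd_s(int32_t n, int32_t m)
    {
        return ((int64_t)n + m) >> 1;
    }

    int main(void)
    {
        /* The 64-bit intermediate keeps the carry a 32-bit add would drop. */
        assert(vhadd_u(0xffffffffu, 1u) == 0x80000000u);
        assert(vhadd_s(INT32_MAX, INT32_MAX) == INT32_MAX);
        assert(vhadd_s(-3, 0) == -2);      /* rounds toward minus infinity */
        return 0;
    }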
-#define DO_VSHLS(N, M) do_sqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, false, NULL)
-#define DO_VSHLU(N, M) do_uqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, false, NULL)
-#define DO_VRSHLS(N, M) do_sqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, true, NULL)
-#define DO_VRSHLU(N, M) do_uqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, true, NULL)
-
-DO_2OP_S(vshls, DO_VSHLS)
-DO_2OP_U(vshlu, DO_VSHLU)
-DO_2OP_S(vrshls, DO_VRSHLS)
-DO_2OP_U(vrshlu, DO_VRSHLU)
-
-#define DO_RHADD_S(N, M) (((int64_t)(N) + (M) + 1) >> 1)
-#define DO_RHADD_U(N, M) (((uint64_t)(N) + (M) + 1) >> 1)
-
-DO_2OP_S(vrhadds, DO_RHADD_S)
-DO_2OP_U(vrhaddu, DO_RHADD_U)
-
-static void do_vadc(CPUARMState *env, uint32_t *d, uint32_t *n, uint32_t *m,
- uint32_t inv, uint32_t carry_in, bool update_flags)
-{
- uint16_t mask = mve_element_mask(env);
- unsigned e;
-
- /* If any additions trigger, we will update flags. */
- if (mask & 0x1111) {
- update_flags = true;
- }
-
- for (e = 0; e < 16 / 4; e++, mask >>= 4) {
- uint64_t r = carry_in;
- r += n[H4(e)];
- r += m[H4(e)] ^ inv;
- if (mask & 1) {
- carry_in = r >> 32;
- }
- mergemask(&d[H4(e)], r, mask);
- }
-
- if (update_flags) {
- /* Store C, clear NZV. */
- env->vfp.xregs[ARM_VFP_FPSCR] &= ~FPCR_NZCV_MASK;
- env->vfp.xregs[ARM_VFP_FPSCR] |= carry_in * FPCR_C;
- }
- mve_advance_vpt(env);
-}
-
-void HELPER(mve_vadc)(CPUARMState *env, void *vd, void *vn, void *vm)
-{
- bool carry_in = env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_C;
- do_vadc(env, vd, vn, vm, 0, carry_in, false);
-}
-
-void HELPER(mve_vsbc)(CPUARMState *env, void *vd, void *vn, void *vm)
-{
- bool carry_in = env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_C;
- do_vadc(env, vd, vn, vm, -1, carry_in, false);
-}
-
-
-void HELPER(mve_vadci)(CPUARMState *env, void *vd, void *vn, void *vm)
-{
- do_vadc(env, vd, vn, vm, 0, 0, true);
-}
-
-void HELPER(mve_vsbci)(CPUARMState *env, void *vd, void *vn, void *vm)
-{
- do_vadc(env, vd, vn, vm, -1, 1, true);
-}
-
-#define DO_VCADD(OP, ESIZE, TYPE, FN0, FN1) \
- void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, void *vm) \
- { \
- TYPE *d = vd, *n = vn, *m = vm; \
- uint16_t mask = mve_element_mask(env); \
- unsigned e; \
- TYPE r[16 / ESIZE]; \
- /* Calculate all results first to avoid overwriting inputs */ \
- for (e = 0; e < 16 / ESIZE; e++) { \
- if (!(e & 1)) { \
- r[e] = FN0(n[H##ESIZE(e)], m[H##ESIZE(e + 1)]); \
- } else { \
- r[e] = FN1(n[H##ESIZE(e)], m[H##ESIZE(e - 1)]); \
- } \
- } \
- for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
- mergemask(&d[H##ESIZE(e)], r[e], mask); \
- } \
- mve_advance_vpt(env); \
- }
-
-#define DO_VCADD_ALL(OP, FN0, FN1) \
- DO_VCADD(OP##b, 1, int8_t, FN0, FN1) \
- DO_VCADD(OP##h, 2, int16_t, FN0, FN1) \
- DO_VCADD(OP##w, 4, int32_t, FN0, FN1)
-
-DO_VCADD_ALL(vcadd90, DO_SUB, DO_ADD)
-DO_VCADD_ALL(vcadd270, DO_ADD, DO_SUB)
-DO_VCADD_ALL(vhcadd90, do_vhsub_s, do_vhadd_s)
-DO_VCADD_ALL(vhcadd270, do_vhadd_s, do_vhsub_s)
-
-static inline int32_t do_sat_bhw(int64_t val, int64_t min, int64_t max, bool *s)
-{
- if (val > max) {
- *s = true;
- return max;
- } else if (val < min) {
- *s = true;
- return min;
- }
- return val;
-}
-
-#define DO_SQADD_B(n, m, s) do_sat_bhw((int64_t)n + m, INT8_MIN, INT8_MAX, s)
-#define DO_SQADD_H(n, m, s) do_sat_bhw((int64_t)n + m, INT16_MIN, INT16_MAX, s)
-#define DO_SQADD_W(n, m, s) do_sat_bhw((int64_t)n + m, INT32_MIN, INT32_MAX, s)
-
-#define DO_UQADD_B(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT8_MAX, s)
-#define DO_UQADD_H(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT16_MAX, s)
-#define DO_UQADD_W(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT32_MAX, s)
-
-#define DO_SQSUB_B(n, m, s) do_sat_bhw((int64_t)n - m, INT8_MIN, INT8_MAX, s)
-#define DO_SQSUB_H(n, m, s) do_sat_bhw((int64_t)n - m, INT16_MIN, INT16_MAX, s)
-#define DO_SQSUB_W(n, m, s) do_sat_bhw((int64_t)n - m, INT32_MIN, INT32_MAX, s)
-
-#define DO_UQSUB_B(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT8_MAX, s)
-#define DO_UQSUB_H(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT16_MAX, s)
-#define DO_UQSUB_W(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT32_MAX, s)
-
-/*
- * For QDMULH and QRDMULH we simplify "double and shift by esize" into
- * "shift by esize-1", adjusting the QRDMULH rounding constant to match.
- */
-#define DO_QDMULH_B(n, m, s) do_sat_bhw(((int64_t)n * m) >> 7, \
- INT8_MIN, INT8_MAX, s)
-#define DO_QDMULH_H(n, m, s) do_sat_bhw(((int64_t)n * m) >> 15, \
- INT16_MIN, INT16_MAX, s)
-#define DO_QDMULH_W(n, m, s) do_sat_bhw(((int64_t)n * m) >> 31, \
- INT32_MIN, INT32_MAX, s)
-
-#define DO_QRDMULH_B(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 6)) >> 7, \
- INT8_MIN, INT8_MAX, s)
-#define DO_QRDMULH_H(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 14)) >> 15, \
- INT16_MIN, INT16_MAX, s)
-#define DO_QRDMULH_W(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 30)) >> 31, \
- INT32_MIN, INT32_MAX, s)
-
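For the record (editorial, not part of the deleted file): the "shift by esize-1" simplification above is exact, because doubling the product and halving the rounding constant cancel; a quick standalone check of the halfword case:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        /*
         * Architectural form (2*p + (1 << 15)) >> 16 equals the
         * (p + (1 << 14)) >> 15 form used above, for any product p.
         */
        for (int32_t n = -32768; n <= 32767; n += 4095) {
            for (int32_t m = -32768; m <= 32767; m += 3277) {
                int64_t p = (int64_t)n * m;
                assert(((2 * p + (1 << 15)) >> 16) == ((p + (1 << 14)) >> 15));
            }
        }
        return 0;
    }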
-DO_2OP_SAT(vqdmulhb, 1, int8_t, DO_QDMULH_B)
-DO_2OP_SAT(vqdmulhh, 2, int16_t, DO_QDMULH_H)
-DO_2OP_SAT(vqdmulhw, 4, int32_t, DO_QDMULH_W)
-
-DO_2OP_SAT(vqrdmulhb, 1, int8_t, DO_QRDMULH_B)
-DO_2OP_SAT(vqrdmulhh, 2, int16_t, DO_QRDMULH_H)
-DO_2OP_SAT(vqrdmulhw, 4, int32_t, DO_QRDMULH_W)
-
-DO_2OP_SAT(vqaddub, 1, uint8_t, DO_UQADD_B)
-DO_2OP_SAT(vqadduh, 2, uint16_t, DO_UQADD_H)
-DO_2OP_SAT(vqadduw, 4, uint32_t, DO_UQADD_W)
-DO_2OP_SAT(vqaddsb, 1, int8_t, DO_SQADD_B)
-DO_2OP_SAT(vqaddsh, 2, int16_t, DO_SQADD_H)
-DO_2OP_SAT(vqaddsw, 4, int32_t, DO_SQADD_W)
-
-DO_2OP_SAT(vqsubub, 1, uint8_t, DO_UQSUB_B)
-DO_2OP_SAT(vqsubuh, 2, uint16_t, DO_UQSUB_H)
-DO_2OP_SAT(vqsubuw, 4, uint32_t, DO_UQSUB_W)
-DO_2OP_SAT(vqsubsb, 1, int8_t, DO_SQSUB_B)
-DO_2OP_SAT(vqsubsh, 2, int16_t, DO_SQSUB_H)
-DO_2OP_SAT(vqsubsw, 4, int32_t, DO_SQSUB_W)
-
-/*
- * This wrapper fixes up the impedance mismatch between do_sqrshl_bhs()
- * and friends wanting a uint32_t* sat and our needing a bool*.
- */
-#define WRAP_QRSHL_HELPER(FN, N, M, ROUND, satp) \
- ({ \
- uint32_t su32 = 0; \
- typeof(N) r = FN(N, (int8_t)(M), sizeof(N) * 8, ROUND, &su32); \
- if (su32) { \
- *satp = true; \
- } \
- r; \
- })
-
-#define DO_SQSHL_OP(N, M, satp) \
- WRAP_QRSHL_HELPER(do_sqrshl_bhs, N, M, false, satp)
-#define DO_UQSHL_OP(N, M, satp) \
- WRAP_QRSHL_HELPER(do_uqrshl_bhs, N, M, false, satp)
-#define DO_SQRSHL_OP(N, M, satp) \
- WRAP_QRSHL_HELPER(do_sqrshl_bhs, N, M, true, satp)
-#define DO_UQRSHL_OP(N, M, satp) \
- WRAP_QRSHL_HELPER(do_uqrshl_bhs, N, M, true, satp)
-#define DO_SUQSHL_OP(N, M, satp) \
- WRAP_QRSHL_HELPER(do_suqrshl_bhs, N, M, false, satp)
-
-DO_2OP_SAT_S(vqshls, DO_SQSHL_OP)
-DO_2OP_SAT_U(vqshlu, DO_UQSHL_OP)
-DO_2OP_SAT_S(vqrshls, DO_SQRSHL_OP)
-DO_2OP_SAT_U(vqrshlu, DO_UQRSHL_OP)
-
-/*
- * Multiply add dual returning high half
- * The 'FN' here takes four inputs A, B, C, D, a 0/1 indicator of
- * whether to add the rounding constant, and the pointer to the
- * saturation flag, and should do "(A * B + C * D) * 2 + rounding constant",
- * saturate to twice the input size and return the high half; or
- * (A * B - C * D) etc for VQDMLSDH.
- */
-#define DO_VQDMLADH_OP(OP, ESIZE, TYPE, XCHG, ROUND, FN) \
- void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \
- void *vm) \
- { \
- TYPE *d = vd, *n = vn, *m = vm; \
- uint16_t mask = mve_element_mask(env); \
- unsigned e; \
- bool qc = false; \
- for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
- bool sat = false; \
- if ((e & 1) == XCHG) { \
- TYPE r = FN(n[H##ESIZE(e)], \
- m[H##ESIZE(e - XCHG)], \
- n[H##ESIZE(e + (1 - 2 * XCHG))], \
- m[H##ESIZE(e + (1 - XCHG))], \
- ROUND, &sat); \
- mergemask(&d[H##ESIZE(e)], r, mask); \
- qc |= sat & mask & 1; \
- } \
- } \
- if (qc) { \
- env->vfp.qc[0] = qc; \
- } \
- mve_advance_vpt(env); \
- }
-
-static int8_t do_vqdmladh_b(int8_t a, int8_t b, int8_t c, int8_t d,
- int round, bool *sat)
-{
- int64_t r = ((int64_t)a * b + (int64_t)c * d) * 2 + (round << 7);
- return do_sat_bhw(r, INT16_MIN, INT16_MAX, sat) >> 8;
-}
-
-static int16_t do_vqdmladh_h(int16_t a, int16_t b, int16_t c, int16_t d,
- int round, bool *sat)
-{
- int64_t r = ((int64_t)a * b + (int64_t)c * d) * 2 + (round << 15);
- return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat) >> 16;
-}
-
-static int32_t do_vqdmladh_w(int32_t a, int32_t b, int32_t c, int32_t d,
- int round, bool *sat)
-{
- int64_t m1 = (int64_t)a * b;
- int64_t m2 = (int64_t)c * d;
- int64_t r;
- /*
- * Architecturally we should do the entire add, double, round
- * and then check for saturation. We do three saturating adds,
- * but we need to be careful about the order. If the first
- * m1 + m2 saturates then it's impossible for the *2+rc to
- * bring it back into the non-saturated range. However, if
- * m1 + m2 is negative then it's possible that doing the doubling
- * would take the intermediate result below INT64_MAX and the
- * addition of the rounding constant then brings it back in range.
- * So we add half the rounding constant before doubling rather
- * than adding the rounding constant after the doubling.
- */
- if (sadd64_overflow(m1, m2, &r) ||
- sadd64_overflow(r, (round << 30), &r) ||
- sadd64_overflow(r, r, &r)) {
- *sat = true;
- return r < 0 ? INT32_MAX : INT32_MIN;
- }
- return r >> 32;
-}
-
-static int8_t do_vqdmlsdh_b(int8_t a, int8_t b, int8_t c, int8_t d,
- int round, bool *sat)
-{
- int64_t r = ((int64_t)a * b - (int64_t)c * d) * 2 + (round << 7);
- return do_sat_bhw(r, INT16_MIN, INT16_MAX, sat) >> 8;
-}
-
-static int16_t do_vqdmlsdh_h(int16_t a, int16_t b, int16_t c, int16_t d,
- int round, bool *sat)
-{
- int64_t r = ((int64_t)a * b - (int64_t)c * d) * 2 + (round << 15);
- return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat) >> 16;
-}
-
-static int32_t do_vqdmlsdh_w(int32_t a, int32_t b, int32_t c, int32_t d,
- int round, bool *sat)
-{
- int64_t m1 = (int64_t)a * b;
- int64_t m2 = (int64_t)c * d;
- int64_t r;
- /* The same ordering issue as in do_vqdmladh_w applies here too */
- if (ssub64_overflow(m1, m2, &r) ||
- sadd64_overflow(r, (round << 30), &r) ||
- sadd64_overflow(r, r, &r)) {
- *sat = true;
- return r < 0 ? INT32_MAX : INT32_MIN;
- }
- return r >> 32;
-}
-
-DO_VQDMLADH_OP(vqdmladhb, 1, int8_t, 0, 0, do_vqdmladh_b)
-DO_VQDMLADH_OP(vqdmladhh, 2, int16_t, 0, 0, do_vqdmladh_h)
-DO_VQDMLADH_OP(vqdmladhw, 4, int32_t, 0, 0, do_vqdmladh_w)
-DO_VQDMLADH_OP(vqdmladhxb, 1, int8_t, 1, 0, do_vqdmladh_b)
-DO_VQDMLADH_OP(vqdmladhxh, 2, int16_t, 1, 0, do_vqdmladh_h)
-DO_VQDMLADH_OP(vqdmladhxw, 4, int32_t, 1, 0, do_vqdmladh_w)
-
-DO_VQDMLADH_OP(vqrdmladhb, 1, int8_t, 0, 1, do_vqdmladh_b)
-DO_VQDMLADH_OP(vqrdmladhh, 2, int16_t, 0, 1, do_vqdmladh_h)
-DO_VQDMLADH_OP(vqrdmladhw, 4, int32_t, 0, 1, do_vqdmladh_w)
-DO_VQDMLADH_OP(vqrdmladhxb, 1, int8_t, 1, 1, do_vqdmladh_b)
-DO_VQDMLADH_OP(vqrdmladhxh, 2, int16_t, 1, 1, do_vqdmladh_h)
-DO_VQDMLADH_OP(vqrdmladhxw, 4, int32_t, 1, 1, do_vqdmladh_w)
-
-DO_VQDMLADH_OP(vqdmlsdhb, 1, int8_t, 0, 0, do_vqdmlsdh_b)
-DO_VQDMLADH_OP(vqdmlsdhh, 2, int16_t, 0, 0, do_vqdmlsdh_h)
-DO_VQDMLADH_OP(vqdmlsdhw, 4, int32_t, 0, 0, do_vqdmlsdh_w)
-DO_VQDMLADH_OP(vqdmlsdhxb, 1, int8_t, 1, 0, do_vqdmlsdh_b)
-DO_VQDMLADH_OP(vqdmlsdhxh, 2, int16_t, 1, 0, do_vqdmlsdh_h)
-DO_VQDMLADH_OP(vqdmlsdhxw, 4, int32_t, 1, 0, do_vqdmlsdh_w)
-
-DO_VQDMLADH_OP(vqrdmlsdhb, 1, int8_t, 0, 1, do_vqdmlsdh_b)
-DO_VQDMLADH_OP(vqrdmlsdhh, 2, int16_t, 0, 1, do_vqdmlsdh_h)
-DO_VQDMLADH_OP(vqrdmlsdhw, 4, int32_t, 0, 1, do_vqdmlsdh_w)
-DO_VQDMLADH_OP(vqrdmlsdhxb, 1, int8_t, 1, 1, do_vqdmlsdh_b)
-DO_VQDMLADH_OP(vqrdmlsdhxh, 2, int16_t, 1, 1, do_vqdmlsdh_h)
-DO_VQDMLADH_OP(vqrdmlsdhxw, 4, int32_t, 1, 1, do_vqdmlsdh_w)
-
-#define DO_2OP_SCALAR(OP, ESIZE, TYPE, FN) \
- void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \
- uint32_t rm) \
- { \
- TYPE *d = vd, *n = vn; \
- TYPE m = rm; \
- uint16_t mask = mve_element_mask(env); \
- unsigned e; \
- for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
- mergemask(&d[H##ESIZE(e)], FN(n[H##ESIZE(e)], m), mask); \
- } \
- mve_advance_vpt(env); \
- }
-
-#define DO_2OP_SAT_SCALAR(OP, ESIZE, TYPE, FN) \
- void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \
- uint32_t rm) \
- { \
- TYPE *d = vd, *n = vn; \
- TYPE m = rm; \
- uint16_t mask = mve_element_mask(env); \
- unsigned e; \
- bool qc = false; \
- for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
- bool sat = false; \
- mergemask(&d[H##ESIZE(e)], FN(n[H##ESIZE(e)], m, &sat), \
- mask); \
- qc |= sat & mask & 1; \
- } \
- if (qc) { \
- env->vfp.qc[0] = qc; \
- } \
- mve_advance_vpt(env); \
- }
-
-/* "accumulating" version where FN takes d as well as n and m */
-#define DO_2OP_ACC_SCALAR(OP, ESIZE, TYPE, FN) \
- void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \
- uint32_t rm) \
- { \
- TYPE *d = vd, *n = vn; \
- TYPE m = rm; \
- uint16_t mask = mve_element_mask(env); \
- unsigned e; \
- for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
- mergemask(&d[H##ESIZE(e)], \
- FN(d[H##ESIZE(e)], n[H##ESIZE(e)], m), mask); \
- } \
- mve_advance_vpt(env); \
- }
-
-#define DO_2OP_SAT_ACC_SCALAR(OP, ESIZE, TYPE, FN) \
- void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \
- uint32_t rm) \
- { \
- TYPE *d = vd, *n = vn; \
- TYPE m = rm; \
- uint16_t mask = mve_element_mask(env); \
- unsigned e; \
- bool qc = false; \
- for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
- bool sat = false; \
- mergemask(&d[H##ESIZE(e)], \
- FN(d[H##ESIZE(e)], n[H##ESIZE(e)], m, &sat), \
- mask); \
- qc |= sat & mask & 1; \
- } \
- if (qc) { \
- env->vfp.qc[0] = qc; \
- } \
- mve_advance_vpt(env); \
- }
-
-/* provide unsigned 2-op scalar helpers for all sizes */
-#define DO_2OP_SCALAR_U(OP, FN) \
- DO_2OP_SCALAR(OP##b, 1, uint8_t, FN) \
- DO_2OP_SCALAR(OP##h, 2, uint16_t, FN) \
- DO_2OP_SCALAR(OP##w, 4, uint32_t, FN)
-#define DO_2OP_SCALAR_S(OP, FN) \
- DO_2OP_SCALAR(OP##b, 1, int8_t, FN) \
- DO_2OP_SCALAR(OP##h, 2, int16_t, FN) \
- DO_2OP_SCALAR(OP##w, 4, int32_t, FN)
-
-#define DO_2OP_ACC_SCALAR_U(OP, FN) \
- DO_2OP_ACC_SCALAR(OP##b, 1, uint8_t, FN) \
- DO_2OP_ACC_SCALAR(OP##h, 2, uint16_t, FN) \
- DO_2OP_ACC_SCALAR(OP##w, 4, uint32_t, FN)
-
-DO_2OP_SCALAR_U(vadd_scalar, DO_ADD)
-DO_2OP_SCALAR_U(vsub_scalar, DO_SUB)
-DO_2OP_SCALAR_U(vmul_scalar, DO_MUL)
-DO_2OP_SCALAR_S(vhadds_scalar, do_vhadd_s)
-DO_2OP_SCALAR_U(vhaddu_scalar, do_vhadd_u)
-DO_2OP_SCALAR_S(vhsubs_scalar, do_vhsub_s)
-DO_2OP_SCALAR_U(vhsubu_scalar, do_vhsub_u)
-
-DO_2OP_SAT_SCALAR(vqaddu_scalarb, 1, uint8_t, DO_UQADD_B)
-DO_2OP_SAT_SCALAR(vqaddu_scalarh, 2, uint16_t, DO_UQADD_H)
-DO_2OP_SAT_SCALAR(vqaddu_scalarw, 4, uint32_t, DO_UQADD_W)
-DO_2OP_SAT_SCALAR(vqadds_scalarb, 1, int8_t, DO_SQADD_B)
-DO_2OP_SAT_SCALAR(vqadds_scalarh, 2, int16_t, DO_SQADD_H)
-DO_2OP_SAT_SCALAR(vqadds_scalarw, 4, int32_t, DO_SQADD_W)
-
-DO_2OP_SAT_SCALAR(vqsubu_scalarb, 1, uint8_t, DO_UQSUB_B)
-DO_2OP_SAT_SCALAR(vqsubu_scalarh, 2, uint16_t, DO_UQSUB_H)
-DO_2OP_SAT_SCALAR(vqsubu_scalarw, 4, uint32_t, DO_UQSUB_W)
-DO_2OP_SAT_SCALAR(vqsubs_scalarb, 1, int8_t, DO_SQSUB_B)
-DO_2OP_SAT_SCALAR(vqsubs_scalarh, 2, int16_t, DO_SQSUB_H)
-DO_2OP_SAT_SCALAR(vqsubs_scalarw, 4, int32_t, DO_SQSUB_W)
-
-DO_2OP_SAT_SCALAR(vqdmulh_scalarb, 1, int8_t, DO_QDMULH_B)
-DO_2OP_SAT_SCALAR(vqdmulh_scalarh, 2, int16_t, DO_QDMULH_H)
-DO_2OP_SAT_SCALAR(vqdmulh_scalarw, 4, int32_t, DO_QDMULH_W)
-DO_2OP_SAT_SCALAR(vqrdmulh_scalarb, 1, int8_t, DO_QRDMULH_B)
-DO_2OP_SAT_SCALAR(vqrdmulh_scalarh, 2, int16_t, DO_QRDMULH_H)
-DO_2OP_SAT_SCALAR(vqrdmulh_scalarw, 4, int32_t, DO_QRDMULH_W)
-
-static int8_t do_vqdmlah_b(int8_t a, int8_t b, int8_t c, int round, bool *sat)
-{
- int64_t r = (int64_t)a * b * 2 + ((int64_t)c << 8) + (round << 7);
- return do_sat_bhw(r, INT16_MIN, INT16_MAX, sat) >> 8;
-}
-
-static int16_t do_vqdmlah_h(int16_t a, int16_t b, int16_t c,
- int round, bool *sat)
-{
- int64_t r = (int64_t)a * b * 2 + ((int64_t)c << 16) + (round << 15);
- return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat) >> 16;
-}
-
-static int32_t do_vqdmlah_w(int32_t a, int32_t b, int32_t c,
- int round, bool *sat)
-{
- /*
- * Architecturally we should do the entire add, double, round
- * and then check for saturation. We do three saturating adds,
- * but we need to be careful about the order. If the first
- * m1 + m2 saturates then it's impossible for the *2+rc to
- * bring it back into the non-saturated range. However, if
- * m1 + m2 is negative then it's possible that doing the doubling
- * would take the intermediate result below INT64_MIN and the
- * addition of the rounding constant then brings it back in range.
- * So we add half the rounding constant and half the "c << esize"
- * before doubling rather than adding the rounding constant after
- * the doubling.
- */
- int64_t m1 = (int64_t)a * b;
- int64_t m2 = (int64_t)c << 31;
- int64_t r;
- if (sadd64_overflow(m1, m2, &r) ||
- sadd64_overflow(r, (round << 30), &r) ||
- sadd64_overflow(r, r, &r)) {
- *sat = true;
- return r < 0 ? INT32_MAX : INT32_MIN;
- }
- return r >> 32;
-}
-
-/*
- * The *MLAH insns are vector * scalar + vector;
- * the *MLASH insns are vector * vector + scalar
- */
-#define DO_VQDMLAH_B(D, N, M, S) do_vqdmlah_b(N, M, D, 0, S)
-#define DO_VQDMLAH_H(D, N, M, S) do_vqdmlah_h(N, M, D, 0, S)
-#define DO_VQDMLAH_W(D, N, M, S) do_vqdmlah_w(N, M, D, 0, S)
-#define DO_VQRDMLAH_B(D, N, M, S) do_vqdmlah_b(N, M, D, 1, S)
-#define DO_VQRDMLAH_H(D, N, M, S) do_vqdmlah_h(N, M, D, 1, S)
-#define DO_VQRDMLAH_W(D, N, M, S) do_vqdmlah_w(N, M, D, 1, S)
-
-#define DO_VQDMLASH_B(D, N, M, S) do_vqdmlah_b(N, D, M, 0, S)
-#define DO_VQDMLASH_H(D, N, M, S) do_vqdmlah_h(N, D, M, 0, S)
-#define DO_VQDMLASH_W(D, N, M, S) do_vqdmlah_w(N, D, M, 0, S)
-#define DO_VQRDMLASH_B(D, N, M, S) do_vqdmlah_b(N, D, M, 1, S)
-#define DO_VQRDMLASH_H(D, N, M, S) do_vqdmlah_h(N, D, M, 1, S)
-#define DO_VQRDMLASH_W(D, N, M, S) do_vqdmlah_w(N, D, M, 1, S)
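-/*
- * In do_vqdmlah_*() the first two arguments are the multiplicands and
- * the third is the addend, hence the operand orderings above.
- */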
-
-DO_2OP_SAT_ACC_SCALAR(vqdmlahb, 1, int8_t, DO_VQDMLAH_B)
-DO_2OP_SAT_ACC_SCALAR(vqdmlahh, 2, int16_t, DO_VQDMLAH_H)
-DO_2OP_SAT_ACC_SCALAR(vqdmlahw, 4, int32_t, DO_VQDMLAH_W)
-DO_2OP_SAT_ACC_SCALAR(vqrdmlahb, 1, int8_t, DO_VQRDMLAH_B)
-DO_2OP_SAT_ACC_SCALAR(vqrdmlahh, 2, int16_t, DO_VQRDMLAH_H)
-DO_2OP_SAT_ACC_SCALAR(vqrdmlahw, 4, int32_t, DO_VQRDMLAH_W)
-
-DO_2OP_SAT_ACC_SCALAR(vqdmlashb, 1, int8_t, DO_VQDMLASH_B)
-DO_2OP_SAT_ACC_SCALAR(vqdmlashh, 2, int16_t, DO_VQDMLASH_H)
-DO_2OP_SAT_ACC_SCALAR(vqdmlashw, 4, int32_t, DO_VQDMLASH_W)
-DO_2OP_SAT_ACC_SCALAR(vqrdmlashb, 1, int8_t, DO_VQRDMLASH_B)
-DO_2OP_SAT_ACC_SCALAR(vqrdmlashh, 2, int16_t, DO_VQRDMLASH_H)
-DO_2OP_SAT_ACC_SCALAR(vqrdmlashw, 4, int32_t, DO_VQRDMLASH_W)
-
-/* Vector by scalar plus vector */
-#define DO_VMLA(D, N, M) ((N) * (M) + (D))
-
-DO_2OP_ACC_SCALAR_U(vmla, DO_VMLA)
-
-/* Vector by vector plus scalar */
-#define DO_VMLAS(D, N, M) ((N) * (D) + (M))
-
-DO_2OP_ACC_SCALAR_U(vmlas, DO_VMLAS)
-
-/*
- * Long saturating scalar ops. As with DO_2OP_L, TYPE and H are for the
- * input (smaller) type and LESIZE, LTYPE, LH for the output (long) type.
- * SATMASK specifies which bits of the predicate mask matter for determining
- * whether to propagate a saturation indication into FPSCR.QC -- for
- * the 16x16->32 case we must check only the bit corresponding to the T or B
- * half that we used, but for the 32x32->64 case we propagate if the mask
- * bit is set for either half.
- */
-#define DO_2OP_SAT_SCALAR_L(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN, SATMASK) \
- void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \
- uint32_t rm) \
- { \
- LTYPE *d = vd; \
- TYPE *n = vn; \
- TYPE m = rm; \
- uint16_t mask = mve_element_mask(env); \
- unsigned le; \
- bool qc = false; \
- for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \
- bool sat = false; \
- LTYPE r = FN((LTYPE)n[H##ESIZE(le * 2 + TOP)], m, &sat); \
- mergemask(&d[H##LESIZE(le)], r, mask); \
- qc |= sat && (mask & SATMASK); \
- } \
- if (qc) { \
- env->vfp.qc[0] = qc; \
- } \
- mve_advance_vpt(env); \
- }
-
-static inline int32_t do_qdmullh(int16_t n, int16_t m, bool *sat)
-{
- int64_t r = ((int64_t)n * m) * 2;
- return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat);
-}
-
-static inline int64_t do_qdmullw(int32_t n, int32_t m, bool *sat)
-{
- /* The multiply can't overflow, but the doubling might */
- int64_t r = (int64_t)n * m;
- if (r > INT64_MAX / 2) {
- *sat = true;
- return INT64_MAX;
- } else if (r < INT64_MIN / 2) {
- *sat = true;
- return INT64_MIN;
- } else {
- return r * 2;
- }
-}
-
-#define SATMASK16B 1
-#define SATMASK16T (1 << 2)
-#define SATMASK32 ((1 << 4) | 1)
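-/*
- * These pick out the predicate mask bits described above: bit 0 for the
- * bottom input halfword, bit 2 for the top input halfword, and bits 0
- * and 4 (one per input word) for the 32x32->64 case.
- */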
-
-DO_2OP_SAT_SCALAR_L(vqdmullb_scalarh, 0, 2, int16_t, 4, int32_t, \
- do_qdmullh, SATMASK16B)
-DO_2OP_SAT_SCALAR_L(vqdmullb_scalarw, 0, 4, int32_t, 8, int64_t, \
- do_qdmullw, SATMASK32)
-DO_2OP_SAT_SCALAR_L(vqdmullt_scalarh, 1, 2, int16_t, 4, int32_t, \
- do_qdmullh, SATMASK16T)
-DO_2OP_SAT_SCALAR_L(vqdmullt_scalarw, 1, 4, int32_t, 8, int64_t, \
- do_qdmullw, SATMASK32)
-
-/*
- * Long saturating ops
- */
-#define DO_2OP_SAT_L(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN, SATMASK) \
- void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \
- void *vm) \
- { \
- LTYPE *d = vd; \
- TYPE *n = vn, *m = vm; \
- uint16_t mask = mve_element_mask(env); \
- unsigned le; \
- bool qc = false; \
- for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \
- bool sat = false; \
- LTYPE op1 = n[H##ESIZE(le * 2 + TOP)]; \
- LTYPE op2 = m[H##ESIZE(le * 2 + TOP)]; \
- mergemask(&d[H##LESIZE(le)], FN(op1, op2, &sat), mask); \
- qc |= sat && (mask & SATMASK); \
- } \
- if (qc) { \
- env->vfp.qc[0] = qc; \
- } \
- mve_advance_vpt(env); \
- }
-
-DO_2OP_SAT_L(vqdmullbh, 0, 2, int16_t, 4, int32_t, do_qdmullh, SATMASK16B)
-DO_2OP_SAT_L(vqdmullbw, 0, 4, int32_t, 8, int64_t, do_qdmullw, SATMASK32)
-DO_2OP_SAT_L(vqdmullth, 1, 2, int16_t, 4, int32_t, do_qdmullh, SATMASK16T)
-DO_2OP_SAT_L(vqdmulltw, 1, 4, int32_t, 8, int64_t, do_qdmullw, SATMASK32)
-
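-/*
- * VBRSR: bit-reverse the bottom 'm' bits (capped at the element width)
- * of each element; the remaining result bits are zero. For example
- * do_vbrsrb(0x03, 3) reverses the low three bits 0b011 to give 0b110,
- * i.e. 0x06.
- */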
-static inline uint32_t do_vbrsrb(uint32_t n, uint32_t m)
-{
- m &= 0xff;
- if (m == 0) {
- return 0;
- }
- n = revbit8(n);
- if (m < 8) {
- n >>= 8 - m;
- }
- return n;
-}
-
-static inline uint32_t do_vbrsrh(uint32_t n, uint32_t m)
-{
- m &= 0xff;
- if (m == 0) {
- return 0;
- }
- n = revbit16(n);
- if (m < 16) {
- n >>= 16 - m;
- }
- return n;
-}
-
-static inline uint32_t do_vbrsrw(uint32_t n, uint32_t m)
-{
- m &= 0xff;
- if (m == 0) {
- return 0;
- }
- n = revbit32(n);
- if (m < 32) {
- n >>= 32 - m;
- }
- return n;
-}
-
-DO_2OP_SCALAR(vbrsrb, 1, uint8_t, do_vbrsrb)
-DO_2OP_SCALAR(vbrsrh, 2, uint16_t, do_vbrsrh)
-DO_2OP_SCALAR(vbrsrw, 4, uint32_t, do_vbrsrw)
-
-/*
- * Multiply add long dual accumulate ops.
- */
-#define DO_LDAV(OP, ESIZE, TYPE, XCHG, EVENACC, ODDACC) \
- uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \
- void *vm, uint64_t a) \
- { \
- uint16_t mask = mve_element_mask(env); \
- unsigned e; \
- TYPE *n = vn, *m = vm; \
- for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
- if (mask & 1) { \
- if (e & 1) { \
- a ODDACC \
- (int64_t)n[H##ESIZE(e - 1 * XCHG)] * m[H##ESIZE(e)]; \
- } else { \
- a EVENACC \
- (int64_t)n[H##ESIZE(e + 1 * XCHG)] * m[H##ESIZE(e)]; \
- } \
- } \
- } \
- mve_advance_vpt(env); \
- return a; \
- }
-
-DO_LDAV(vmlaldavsh, 2, int16_t, false, +=, +=)
-DO_LDAV(vmlaldavxsh, 2, int16_t, true, +=, +=)
-DO_LDAV(vmlaldavsw, 4, int32_t, false, +=, +=)
-DO_LDAV(vmlaldavxsw, 4, int32_t, true, +=, +=)
-
-DO_LDAV(vmlaldavuh, 2, uint16_t, false, +=, +=)
-DO_LDAV(vmlaldavuw, 4, uint32_t, false, +=, +=)
-
-DO_LDAV(vmlsldavsh, 2, int16_t, false, +=, -=)
-DO_LDAV(vmlsldavxsh, 2, int16_t, true, +=, -=)
-DO_LDAV(vmlsldavsw, 4, int32_t, false, +=, -=)
-DO_LDAV(vmlsldavxsw, 4, int32_t, true, +=, -=)
-
-/*
- * Multiply add dual accumulate ops
- */
-#define DO_DAV(OP, ESIZE, TYPE, XCHG, EVENACC, ODDACC) \
- uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \
- void *vm, uint32_t a) \
- { \
- uint16_t mask = mve_element_mask(env); \
- unsigned e; \
- TYPE *n = vn, *m = vm; \
- for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
- if (mask & 1) { \
- if (e & 1) { \
- a ODDACC \
- n[H##ESIZE(e - 1 * XCHG)] * m[H##ESIZE(e)]; \
- } else { \
- a EVENACC \
- n[H##ESIZE(e + 1 * XCHG)] * m[H##ESIZE(e)]; \
- } \
- } \
- } \
- mve_advance_vpt(env); \
- return a; \
- }
-
-#define DO_DAV_S(INSN, XCHG, EVENACC, ODDACC) \
- DO_DAV(INSN##b, 1, int8_t, XCHG, EVENACC, ODDACC) \
- DO_DAV(INSN##h, 2, int16_t, XCHG, EVENACC, ODDACC) \
- DO_DAV(INSN##w, 4, int32_t, XCHG, EVENACC, ODDACC)
-
-#define DO_DAV_U(INSN, XCHG, EVENACC, ODDACC) \
- DO_DAV(INSN##b, 1, uint8_t, XCHG, EVENACC, ODDACC) \
- DO_DAV(INSN##h, 2, uint16_t, XCHG, EVENACC, ODDACC) \
- DO_DAV(INSN##w, 4, uint32_t, XCHG, EVENACC, ODDACC)
-
-DO_DAV_S(vmladavs, false, +=, +=)
-DO_DAV_U(vmladavu, false, +=, +=)
-DO_DAV_S(vmlsdav, false, +=, -=)
-DO_DAV_S(vmladavsx, true, +=, +=)
-DO_DAV_S(vmlsdavx, true, +=, -=)
-
-/*
- * Rounding multiply add long dual accumulate high. In the pseudocode
- * this is implemented with a 72-bit internal accumulator value of which
- * the top 64 bits are returned. We optimize this to avoid having to
- * use 128-bit arithmetic -- we can do this because the 74-bit accumulator
- * is squashed back into 64-bits after each beat.
- */
-#define DO_LDAVH(OP, TYPE, LTYPE, XCHG, SUB) \
- uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \
- void *vm, uint64_t a) \
- { \
- uint16_t mask = mve_element_mask(env); \
- unsigned e; \
- TYPE *n = vn, *m = vm; \
- for (e = 0; e < 16 / 4; e++, mask >>= 4) { \
- if (mask & 1) { \
- LTYPE mul; \
- if (e & 1) { \
- mul = (LTYPE)n[H4(e - 1 * XCHG)] * m[H4(e)]; \
- if (SUB) { \
- mul = -mul; \
- } \
- } else { \
- mul = (LTYPE)n[H4(e + 1 * XCHG)] * m[H4(e)]; \
- } \
- mul = (mul >> 8) + ((mul >> 7) & 1); \
- a += mul; \
- } \
- } \
- mve_advance_vpt(env); \
- return a; \
- }
-
-DO_LDAVH(vrmlaldavhsw, int32_t, int64_t, false, false)
-DO_LDAVH(vrmlaldavhxsw, int32_t, int64_t, true, false)
-
-DO_LDAVH(vrmlaldavhuw, uint32_t, uint64_t, false, false)
-
-DO_LDAVH(vrmlsldavhsw, int32_t, int64_t, false, true)
-DO_LDAVH(vrmlsldavhxsw, int32_t, int64_t, true, true)
-
-/* Vector add across vector */
-#define DO_VADDV(OP, ESIZE, TYPE) \
- uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \
- uint32_t ra) \
- { \
- uint16_t mask = mve_element_mask(env); \
- unsigned e; \
- TYPE *m = vm; \
- for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
- if (mask & 1) { \
- ra += m[H##ESIZE(e)]; \
- } \
- } \
- mve_advance_vpt(env); \
- return ra; \
- } \
-
-DO_VADDV(vaddvsb, 1, int8_t)
-DO_VADDV(vaddvsh, 2, int16_t)
-DO_VADDV(vaddvsw, 4, int32_t)
-DO_VADDV(vaddvub, 1, uint8_t)
-DO_VADDV(vaddvuh, 2, uint16_t)
-DO_VADDV(vaddvuw, 4, uint32_t)
-
-/*
- * Vector max/min across vector. Unlike VADDV, we must
- * read ra as the element size, not its full width.
- * We work with int64_t internally for simplicity.
- */
-#define DO_VMAXMINV(OP, ESIZE, TYPE, RATYPE, FN) \
- uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \
- uint32_t ra_in) \
- { \
- uint16_t mask = mve_element_mask(env); \
- unsigned e; \
- TYPE *m = vm; \
- int64_t ra = (RATYPE)ra_in; \
- for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
- if (mask & 1) { \
- ra = FN(ra, m[H##ESIZE(e)]); \
- } \
- } \
- mve_advance_vpt(env); \
- return ra; \
- } \
-
-#define DO_VMAXMINV_U(INSN, FN) \
- DO_VMAXMINV(INSN##b, 1, uint8_t, uint8_t, FN) \
- DO_VMAXMINV(INSN##h, 2, uint16_t, uint16_t, FN) \
- DO_VMAXMINV(INSN##w, 4, uint32_t, uint32_t, FN)
-#define DO_VMAXMINV_S(INSN, FN) \
- DO_VMAXMINV(INSN##b, 1, int8_t, int8_t, FN) \
- DO_VMAXMINV(INSN##h, 2, int16_t, int16_t, FN) \
- DO_VMAXMINV(INSN##w, 4, int32_t, int32_t, FN)
-
-/*
- * Helpers for max and min of absolute values across vector:
- * note that we only take the absolute value of 'm', not 'n'
- */
-static int64_t do_maxa(int64_t n, int64_t m)
-{
- if (m < 0) {
- m = -m;
- }
- return MAX(n, m);
-}
-
-static int64_t do_mina(int64_t n, int64_t m)
-{
- if (m < 0) {
- m = -m;
- }
- return MIN(n, m);
-}
-
-DO_VMAXMINV_S(vmaxvs, DO_MAX)
-DO_VMAXMINV_U(vmaxvu, DO_MAX)
-DO_VMAXMINV_S(vminvs, DO_MIN)
-DO_VMAXMINV_U(vminvu, DO_MIN)
-/*
- * VMAXAV, VMINAV treat the general purpose input as unsigned
- * and the vector elements as signed.
- */
-DO_VMAXMINV(vmaxavb, 1, int8_t, uint8_t, do_maxa)
-DO_VMAXMINV(vmaxavh, 2, int16_t, uint16_t, do_maxa)
-DO_VMAXMINV(vmaxavw, 4, int32_t, uint32_t, do_maxa)
-DO_VMAXMINV(vminavb, 1, int8_t, uint8_t, do_mina)
-DO_VMAXMINV(vminavh, 2, int16_t, uint16_t, do_mina)
-DO_VMAXMINV(vminavw, 4, int32_t, uint32_t, do_mina)
-
-#define DO_VABAV(OP, ESIZE, TYPE) \
- uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \
- void *vm, uint32_t ra) \
- { \
- uint16_t mask = mve_element_mask(env); \
- unsigned e; \
- TYPE *m = vm, *n = vn; \
- for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
- if (mask & 1) { \
- int64_t n0 = n[H##ESIZE(e)]; \
- int64_t m0 = m[H##ESIZE(e)]; \
- uint32_t r = n0 >= m0 ? (n0 - m0) : (m0 - n0); \
- ra += r; \
- } \
- } \
- mve_advance_vpt(env); \
- return ra; \
- }
-
-DO_VABAV(vabavsb, 1, int8_t)
-DO_VABAV(vabavsh, 2, int16_t)
-DO_VABAV(vabavsw, 4, int32_t)
-DO_VABAV(vabavub, 1, uint8_t)
-DO_VABAV(vabavuh, 2, uint16_t)
-DO_VABAV(vabavuw, 4, uint32_t)
-
-#define DO_VADDLV(OP, TYPE, LTYPE) \
- uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \
- uint64_t ra) \
- { \
- uint16_t mask = mve_element_mask(env); \
- unsigned e; \
- TYPE *m = vm; \
- for (e = 0; e < 16 / 4; e++, mask >>= 4) { \
- if (mask & 1) { \
- ra += (LTYPE)m[H4(e)]; \
- } \
- } \
- mve_advance_vpt(env); \
- return ra; \
- } \
-
-DO_VADDLV(vaddlv_s, int32_t, int64_t)
-DO_VADDLV(vaddlv_u, uint32_t, uint64_t)
-
-/* Shifts by immediate */
-#define DO_2SHIFT(OP, ESIZE, TYPE, FN) \
- void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \
- void *vm, uint32_t shift) \
- { \
- TYPE *d = vd, *m = vm; \
- uint16_t mask = mve_element_mask(env); \
- unsigned e; \
- for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
- mergemask(&d[H##ESIZE(e)], \
- FN(m[H##ESIZE(e)], shift), mask); \
- } \
- mve_advance_vpt(env); \
- }
-
-#define DO_2SHIFT_SAT(OP, ESIZE, TYPE, FN) \
- void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \
- void *vm, uint32_t shift) \
- { \
- TYPE *d = vd, *m = vm; \
- uint16_t mask = mve_element_mask(env); \
- unsigned e; \
- bool qc = false; \
- for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
- bool sat = false; \
- mergemask(&d[H##ESIZE(e)], \
- FN(m[H##ESIZE(e)], shift, &sat), mask); \
- qc |= sat & mask & 1; \
- } \
- if (qc) { \
- env->vfp.qc[0] = qc; \
- } \
- mve_advance_vpt(env); \
- }
-
-/* provide unsigned 2-op shift helpers for all sizes */
-#define DO_2SHIFT_U(OP, FN) \
- DO_2SHIFT(OP##b, 1, uint8_t, FN) \
- DO_2SHIFT(OP##h, 2, uint16_t, FN) \
- DO_2SHIFT(OP##w, 4, uint32_t, FN)
-#define DO_2SHIFT_S(OP, FN) \
- DO_2SHIFT(OP##b, 1, int8_t, FN) \
- DO_2SHIFT(OP##h, 2, int16_t, FN) \
- DO_2SHIFT(OP##w, 4, int32_t, FN)
-
-#define DO_2SHIFT_SAT_U(OP, FN) \
- DO_2SHIFT_SAT(OP##b, 1, uint8_t, FN) \
- DO_2SHIFT_SAT(OP##h, 2, uint16_t, FN) \
- DO_2SHIFT_SAT(OP##w, 4, uint32_t, FN)
-#define DO_2SHIFT_SAT_S(OP, FN) \
- DO_2SHIFT_SAT(OP##b, 1, int8_t, FN) \
- DO_2SHIFT_SAT(OP##h, 2, int16_t, FN) \
- DO_2SHIFT_SAT(OP##w, 4, int32_t, FN)
-
-DO_2SHIFT_U(vshli_u, DO_VSHLU)
-DO_2SHIFT_S(vshli_s, DO_VSHLS)
-DO_2SHIFT_SAT_U(vqshli_u, DO_UQSHL_OP)
-DO_2SHIFT_SAT_S(vqshli_s, DO_SQSHL_OP)
-DO_2SHIFT_SAT_S(vqshlui_s, DO_SUQSHL_OP)
-DO_2SHIFT_U(vrshli_u, DO_VRSHLU)
-DO_2SHIFT_S(vrshli_s, DO_VRSHLS)
-DO_2SHIFT_SAT_U(vqrshli_u, DO_UQRSHL_OP)
-DO_2SHIFT_SAT_S(vqrshli_s, DO_SQRSHL_OP)
-
-/* Shift-and-insert; we always work with 64 bits at a time */
-#define DO_2SHIFT_INSERT(OP, ESIZE, SHIFTFN, MASKFN) \
- void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \
- void *vm, uint32_t shift) \
- { \
- uint64_t *d = vd, *m = vm; \
- uint16_t mask; \
- uint64_t shiftmask; \
- unsigned e; \
- if (shift == ESIZE * 8) { \
- /* \
- * Only VSRI can shift by <dt>; it should mean "don't \
- * update the destination". The generic logic can't handle \
- * this because it would try to shift by an out-of-range \
- * amount, so special case it here. \
- */ \
- goto done; \
- } \
- assert(shift < ESIZE * 8); \
- mask = mve_element_mask(env); \
- /* ESIZE / 2 gives the MO_* value if ESIZE is in [1,2,4] */ \
- shiftmask = dup_const(ESIZE / 2, MASKFN(ESIZE * 8, shift)); \
- for (e = 0; e < 16 / 8; e++, mask >>= 8) { \
- uint64_t r = (SHIFTFN(m[H8(e)], shift) & shiftmask) | \
- (d[H8(e)] & ~shiftmask); \
- mergemask(&d[H8(e)], r, mask); \
- } \
-done: \
- mve_advance_vpt(env); \
- }
-
-#define DO_SHL(N, SHIFT) ((N) << (SHIFT))
-#define DO_SHR(N, SHIFT) ((N) >> (SHIFT))
-#define SHL_MASK(EBITS, SHIFT) MAKE_64BIT_MASK((SHIFT), (EBITS) - (SHIFT))
-#define SHR_MASK(EBITS, SHIFT) MAKE_64BIT_MASK(0, (EBITS) - (SHIFT))
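-/*
- * e.g. for 8-bit elements and a shift of 3, SHL_MASK(8, 3) is 0xf8
- * (the destination bits VSLI updates) and SHR_MASK(8, 3) is 0x1f
- * (the destination bits VSRI updates).
- */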
-
-DO_2SHIFT_INSERT(vsrib, 1, DO_SHR, SHR_MASK)
-DO_2SHIFT_INSERT(vsrih, 2, DO_SHR, SHR_MASK)
-DO_2SHIFT_INSERT(vsriw, 4, DO_SHR, SHR_MASK)
-DO_2SHIFT_INSERT(vslib, 1, DO_SHL, SHL_MASK)
-DO_2SHIFT_INSERT(vslih, 2, DO_SHL, SHL_MASK)
-DO_2SHIFT_INSERT(vsliw, 4, DO_SHL, SHL_MASK)
-
-/*
- * Long shifts taking half-sized inputs from top or bottom of the input
- * vector and producing a double-width result. ESIZE, TYPE are for
- * the input, and LESIZE, LTYPE for the output.
- * Unlike the normal shift helpers, we do not handle negative shift counts,
- * because the long shift is strictly left-only.
- */
-#define DO_VSHLL(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE) \
- void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \
- void *vm, uint32_t shift) \
- { \
- LTYPE *d = vd; \
- TYPE *m = vm; \
- uint16_t mask = mve_element_mask(env); \
- unsigned le; \
- assert(shift <= 16); \
- for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \
- LTYPE r = (LTYPE)m[H##ESIZE(le * 2 + TOP)] << shift; \
- mergemask(&d[H##LESIZE(le)], r, mask); \
- } \
- mve_advance_vpt(env); \
- }
-
-#define DO_VSHLL_ALL(OP, TOP) \
- DO_VSHLL(OP##sb, TOP, 1, int8_t, 2, int16_t) \
- DO_VSHLL(OP##ub, TOP, 1, uint8_t, 2, uint16_t) \
- DO_VSHLL(OP##sh, TOP, 2, int16_t, 4, int32_t) \
- DO_VSHLL(OP##uh, TOP, 2, uint16_t, 4, uint32_t) \
-
-DO_VSHLL_ALL(vshllb, false)
-DO_VSHLL_ALL(vshllt, true)
-
-/*
- * Narrowing right shifts, taking a double sized input, shifting it
- * and putting the result in either the top or bottom half of the output.
- * ESIZE, TYPE are the output, and LESIZE, LTYPE the input.
- */
-#define DO_VSHRN(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN) \
- void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \
- void *vm, uint32_t shift) \
- { \
- LTYPE *m = vm; \
- TYPE *d = vd; \
- uint16_t mask = mve_element_mask(env); \
- unsigned le; \
- mask >>= ESIZE * TOP; \
- for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \
- TYPE r = FN(m[H##LESIZE(le)], shift); \
- mergemask(&d[H##ESIZE(le * 2 + TOP)], r, mask); \
- } \
- mve_advance_vpt(env); \
- }
-
-#define DO_VSHRN_ALL(OP, FN) \
- DO_VSHRN(OP##bb, false, 1, uint8_t, 2, uint16_t, FN) \
- DO_VSHRN(OP##bh, false, 2, uint16_t, 4, uint32_t, FN) \
- DO_VSHRN(OP##tb, true, 1, uint8_t, 2, uint16_t, FN) \
- DO_VSHRN(OP##th, true, 2, uint16_t, 4, uint32_t, FN)
-
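-/*
- * Rounding right shifts: e.g. do_urshr(0x1ff, 4) is 0x20, since the
- * rounding bit (bit 3) is set and so the truncated 0x1f rounds up.
- */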
-static inline uint64_t do_urshr(uint64_t x, unsigned sh)
-{
- if (likely(sh < 64)) {
- return (x >> sh) + ((x >> (sh - 1)) & 1);
- } else if (sh == 64) {
- return x >> 63;
- } else {
- return 0;
- }
-}
-
-static inline int64_t do_srshr(int64_t x, unsigned sh)
-{
- if (likely(sh < 64)) {
- return (x >> sh) + ((x >> (sh - 1)) & 1);
- } else {
- /* Rounding the sign bit always produces 0. */
- return 0;
- }
-}
-
-DO_VSHRN_ALL(vshrn, DO_SHR)
-DO_VSHRN_ALL(vrshrn, do_urshr)
-
-static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max,
- bool *satp)
-{
- if (val > max) {
- *satp = true;
- return max;
- } else if (val < min) {
- *satp = true;
- return min;
- } else {
- return val;
- }
-}
-
-/* Saturating narrowing right shifts */
-#define DO_VSHRN_SAT(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN) \
- void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \
- void *vm, uint32_t shift) \
- { \
- LTYPE *m = vm; \
- TYPE *d = vd; \
- uint16_t mask = mve_element_mask(env); \
- bool qc = false; \
- unsigned le; \
- mask >>= ESIZE * TOP; \
- for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \
- bool sat = false; \
- TYPE r = FN(m[H##LESIZE(le)], shift, &sat); \
- mergemask(&d[H##ESIZE(le * 2 + TOP)], r, mask); \
- qc |= sat & mask & 1; \
- } \
- if (qc) { \
- env->vfp.qc[0] = qc; \
- } \
- mve_advance_vpt(env); \
- }
-
-#define DO_VSHRN_SAT_UB(BOP, TOP, FN) \
- DO_VSHRN_SAT(BOP, false, 1, uint8_t, 2, uint16_t, FN) \
- DO_VSHRN_SAT(TOP, true, 1, uint8_t, 2, uint16_t, FN)
-
-#define DO_VSHRN_SAT_UH(BOP, TOP, FN) \
- DO_VSHRN_SAT(BOP, false, 2, uint16_t, 4, uint32_t, FN) \
- DO_VSHRN_SAT(TOP, true, 2, uint16_t, 4, uint32_t, FN)
-
-#define DO_VSHRN_SAT_SB(BOP, TOP, FN) \
- DO_VSHRN_SAT(BOP, false, 1, int8_t, 2, int16_t, FN) \
- DO_VSHRN_SAT(TOP, true, 1, int8_t, 2, int16_t, FN)
-
-#define DO_VSHRN_SAT_SH(BOP, TOP, FN) \
- DO_VSHRN_SAT(BOP, false, 2, int16_t, 4, int32_t, FN) \
- DO_VSHRN_SAT(TOP, true, 2, int16_t, 4, int32_t, FN)
-
-#define DO_SHRN_SB(N, M, SATP) \
- do_sat_bhs((int64_t)(N) >> (M), INT8_MIN, INT8_MAX, SATP)
-#define DO_SHRN_UB(N, M, SATP) \
- do_sat_bhs((uint64_t)(N) >> (M), 0, UINT8_MAX, SATP)
-#define DO_SHRUN_B(N, M, SATP) \
- do_sat_bhs((int64_t)(N) >> (M), 0, UINT8_MAX, SATP)
-
-#define DO_SHRN_SH(N, M, SATP) \
- do_sat_bhs((int64_t)(N) >> (M), INT16_MIN, INT16_MAX, SATP)
-#define DO_SHRN_UH(N, M, SATP) \
- do_sat_bhs((uint64_t)(N) >> (M), 0, UINT16_MAX, SATP)
-#define DO_SHRUN_H(N, M, SATP) \
- do_sat_bhs((int64_t)(N) >> (M), 0, UINT16_MAX, SATP)
-
-#define DO_RSHRN_SB(N, M, SATP) \
- do_sat_bhs(do_srshr(N, M), INT8_MIN, INT8_MAX, SATP)
-#define DO_RSHRN_UB(N, M, SATP) \
- do_sat_bhs(do_urshr(N, M), 0, UINT8_MAX, SATP)
-#define DO_RSHRUN_B(N, M, SATP) \
- do_sat_bhs(do_srshr(N, M), 0, UINT8_MAX, SATP)
-
-#define DO_RSHRN_SH(N, M, SATP) \
- do_sat_bhs(do_srshr(N, M), INT16_MIN, INT16_MAX, SATP)
-#define DO_RSHRN_UH(N, M, SATP) \
- do_sat_bhs(do_urshr(N, M), 0, UINT16_MAX, SATP)
-#define DO_RSHRUN_H(N, M, SATP) \
- do_sat_bhs(do_srshr(N, M), 0, UINT16_MAX, SATP)
-
-DO_VSHRN_SAT_SB(vqshrnb_sb, vqshrnt_sb, DO_SHRN_SB)
-DO_VSHRN_SAT_SH(vqshrnb_sh, vqshrnt_sh, DO_SHRN_SH)
-DO_VSHRN_SAT_UB(vqshrnb_ub, vqshrnt_ub, DO_SHRN_UB)
-DO_VSHRN_SAT_UH(vqshrnb_uh, vqshrnt_uh, DO_SHRN_UH)
-DO_VSHRN_SAT_SB(vqshrunbb, vqshruntb, DO_SHRUN_B)
-DO_VSHRN_SAT_SH(vqshrunbh, vqshrunth, DO_SHRUN_H)
-
-DO_VSHRN_SAT_SB(vqrshrnb_sb, vqrshrnt_sb, DO_RSHRN_SB)
-DO_VSHRN_SAT_SH(vqrshrnb_sh, vqrshrnt_sh, DO_RSHRN_SH)
-DO_VSHRN_SAT_UB(vqrshrnb_ub, vqrshrnt_ub, DO_RSHRN_UB)
-DO_VSHRN_SAT_UH(vqrshrnb_uh, vqrshrnt_uh, DO_RSHRN_UH)
-DO_VSHRN_SAT_SB(vqrshrunbb, vqrshruntb, DO_RSHRUN_B)
-DO_VSHRN_SAT_SH(vqrshrunbh, vqrshrunth, DO_RSHRUN_H)
-
-#define DO_VMOVN(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE) \
- void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm) \
- { \
- LTYPE *m = vm; \
- TYPE *d = vd; \
- uint16_t mask = mve_element_mask(env); \
- unsigned le; \
- mask >>= ESIZE * TOP; \
- for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \
- mergemask(&d[H##ESIZE(le * 2 + TOP)], \
- m[H##LESIZE(le)], mask); \
- } \
- mve_advance_vpt(env); \
- }
-
-DO_VMOVN(vmovnbb, false, 1, uint8_t, 2, uint16_t)
-DO_VMOVN(vmovnbh, false, 2, uint16_t, 4, uint32_t)
-DO_VMOVN(vmovntb, true, 1, uint8_t, 2, uint16_t)
-DO_VMOVN(vmovnth, true, 2, uint16_t, 4, uint32_t)
-
-#define DO_VMOVN_SAT(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN) \
- void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm) \
- { \
- LTYPE *m = vm; \
- TYPE *d = vd; \
- uint16_t mask = mve_element_mask(env); \
- bool qc = false; \
- unsigned le; \
- mask >>= ESIZE * TOP; \
- for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \
- bool sat = false; \
- TYPE r = FN(m[H##LESIZE(le)], &sat); \
- mergemask(&d[H##ESIZE(le * 2 + TOP)], r, mask); \
- qc |= sat & mask & 1; \
- } \
- if (qc) { \
- env->vfp.qc[0] = qc; \
- } \
- mve_advance_vpt(env); \
- }
-
-#define DO_VMOVN_SAT_UB(BOP, TOP, FN) \
- DO_VMOVN_SAT(BOP, false, 1, uint8_t, 2, uint16_t, FN) \
- DO_VMOVN_SAT(TOP, true, 1, uint8_t, 2, uint16_t, FN)
-
-#define DO_VMOVN_SAT_UH(BOP, TOP, FN) \
- DO_VMOVN_SAT(BOP, false, 2, uint16_t, 4, uint32_t, FN) \
- DO_VMOVN_SAT(TOP, true, 2, uint16_t, 4, uint32_t, FN)
-
-#define DO_VMOVN_SAT_SB(BOP, TOP, FN) \
- DO_VMOVN_SAT(BOP, false, 1, int8_t, 2, int16_t, FN) \
- DO_VMOVN_SAT(TOP, true, 1, int8_t, 2, int16_t, FN)
-
-#define DO_VMOVN_SAT_SH(BOP, TOP, FN) \
- DO_VMOVN_SAT(BOP, false, 2, int16_t, 4, int32_t, FN) \
- DO_VMOVN_SAT(TOP, true, 2, int16_t, 4, int32_t, FN)
-
-#define DO_VQMOVN_SB(N, SATP) \
- do_sat_bhs((int64_t)(N), INT8_MIN, INT8_MAX, SATP)
-#define DO_VQMOVN_UB(N, SATP) \
- do_sat_bhs((uint64_t)(N), 0, UINT8_MAX, SATP)
-#define DO_VQMOVUN_B(N, SATP) \
- do_sat_bhs((int64_t)(N), 0, UINT8_MAX, SATP)
-
-#define DO_VQMOVN_SH(N, SATP) \
- do_sat_bhs((int64_t)(N), INT16_MIN, INT16_MAX, SATP)
-#define DO_VQMOVN_UH(N, SATP) \
- do_sat_bhs((uint64_t)(N), 0, UINT16_MAX, SATP)
-#define DO_VQMOVUN_H(N, SATP) \
- do_sat_bhs((int64_t)(N), 0, UINT16_MAX, SATP)
-
-DO_VMOVN_SAT_SB(vqmovnbsb, vqmovntsb, DO_VQMOVN_SB)
-DO_VMOVN_SAT_SH(vqmovnbsh, vqmovntsh, DO_VQMOVN_SH)
-DO_VMOVN_SAT_UB(vqmovnbub, vqmovntub, DO_VQMOVN_UB)
-DO_VMOVN_SAT_UH(vqmovnbuh, vqmovntuh, DO_VQMOVN_UH)
-DO_VMOVN_SAT_SB(vqmovunbb, vqmovuntb, DO_VQMOVUN_B)
-DO_VMOVN_SAT_SH(vqmovunbh, vqmovunth, DO_VQMOVUN_H)
-
-uint32_t HELPER(mve_vshlc)(CPUARMState *env, void *vd, uint32_t rdm,
- uint32_t shift)
-{
- uint32_t *d = vd;
- uint16_t mask = mve_element_mask(env);
- unsigned e;
- uint32_t r;
-
- /*
- * For each 32-bit element, we shift it left, bringing in the
- * low 'shift' bits of rdm at the bottom. Bits shifted out at
- * the top become the new rdm, if the predicate mask permits.
- * The final rdm value is returned to update the register.
- * shift == 0 here means "shift by 32 bits".
- */
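- /*
- * e.g. with shift == 4 each element becomes (element << 4) | (rdm & 0xf),
- * and (if its predicate bit is set) the old element's top 4 bits become
- * the new rdm.
- */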
- if (shift == 0) {
- for (e = 0; e < 16 / 4; e++, mask >>= 4) {
- r = rdm;
- if (mask & 1) {
- rdm = d[H4(e)];
- }
- mergemask(&d[H4(e)], r, mask);
- }
- } else {
- uint32_t shiftmask = MAKE_64BIT_MASK(0, shift);
-
- for (e = 0; e < 16 / 4; e++, mask >>= 4) {
- r = (d[H4(e)] << shift) | (rdm & shiftmask);
- if (mask & 1) {
- rdm = d[H4(e)] >> (32 - shift);
- }
- mergemask(&d[H4(e)], r, mask);
- }
- }
- mve_advance_vpt(env);
- return rdm;
-}
-
-uint64_t HELPER(mve_sshrl)(CPUARMState *env, uint64_t n, uint32_t shift)
-{
- return do_sqrshl_d(n, -(int8_t)shift, false, NULL);
-}
-
-uint64_t HELPER(mve_ushll)(CPUARMState *env, uint64_t n, uint32_t shift)
-{
- return do_uqrshl_d(n, (int8_t)shift, false, NULL);
-}
-
-uint64_t HELPER(mve_sqshll)(CPUARMState *env, uint64_t n, uint32_t shift)
-{
- return do_sqrshl_d(n, (int8_t)shift, false, &env->QF);
-}
-
-uint64_t HELPER(mve_uqshll)(CPUARMState *env, uint64_t n, uint32_t shift)
-{
- return do_uqrshl_d(n, (int8_t)shift, false, &env->QF);
-}
-
-uint64_t HELPER(mve_sqrshrl)(CPUARMState *env, uint64_t n, uint32_t shift)
-{
- return do_sqrshl_d(n, -(int8_t)shift, true, &env->QF);
-}
-
-uint64_t HELPER(mve_uqrshll)(CPUARMState *env, uint64_t n, uint32_t shift)
-{
- return do_uqrshl_d(n, (int8_t)shift, true, &env->QF);
-}
-
-/* Operate on 64-bit values, but saturate at 48 bits */
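-/*
- * The saturated results used below are 0x00007fffffffffff (2^47 - 1)
- * and, sign-extended to 64 bits, 0xffff800000000000 (-2^47).
- */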
-static inline int64_t do_sqrshl48_d(int64_t src, int64_t shift,
- bool round, uint32_t *sat)
-{
- int64_t val, extval;
-
- if (shift <= -48) {
- /* Rounding the sign bit always produces 0. */
- if (round) {
- return 0;
- }
- return src >> 63;
- } else if (shift < 0) {
- if (round) {
- src >>= -shift - 1;
- val = (src >> 1) + (src & 1);
- } else {
- val = src >> -shift;
- }
- extval = sextract64(val, 0, 48);
- if (!sat || val == extval) {
- return extval;
- }
- } else if (shift < 48) {
- int64_t extval = sextract64(src << shift, 0, 48);
- if (!sat || src == (extval >> shift)) {
- return extval;
- }
- } else if (!sat || src == 0) {
- return 0;
- }
-
- *sat = 1;
- return src >= 0 ? MAKE_64BIT_MASK(0, 47) : MAKE_64BIT_MASK(47, 17);
-}
-
-/* Operate on 64-bit values, but saturate at 48 bits */
-static inline uint64_t do_uqrshl48_d(uint64_t src, int64_t shift,
- bool round, uint32_t *sat)
-{
- uint64_t val, extval;
-
- if (shift <= -(48 + round)) {
- return 0;
- } else if (shift < 0) {
- if (round) {
- val = src >> (-shift - 1);
- val = (val >> 1) + (val & 1);
- } else {
- val = src >> -shift;
- }
- extval = extract64(val, 0, 48);
- if (!sat || val == extval) {
- return extval;
- }
- } else if (shift < 48) {
- uint64_t extval = extract64(src << shift, 0, 48);
- if (!sat || src == (extval >> shift)) {
- return extval;
- }
- } else if (!sat || src == 0) {
- return 0;
- }
-
- *sat = 1;
- return MAKE_64BIT_MASK(0, 48);
-}
-
-uint64_t HELPER(mve_sqrshrl48)(CPUARMState *env, uint64_t n, uint32_t shift)
-{
- return do_sqrshl48_d(n, -(int8_t)shift, true, &env->QF);
-}
-
-uint64_t HELPER(mve_uqrshll48)(CPUARMState *env, uint64_t n, uint32_t shift)
-{
- return do_uqrshl48_d(n, (int8_t)shift, true, &env->QF);
-}
-
-uint32_t HELPER(mve_uqshl)(CPUARMState *env, uint32_t n, uint32_t shift)
-{
- return do_uqrshl_bhs(n, (int8_t)shift, 32, false, &env->QF);
-}
-
-uint32_t HELPER(mve_sqshl)(CPUARMState *env, uint32_t n, uint32_t shift)
-{
- return do_sqrshl_bhs(n, (int8_t)shift, 32, false, &env->QF);
-}
-
-uint32_t HELPER(mve_uqrshl)(CPUARMState *env, uint32_t n, uint32_t shift)
-{
- return do_uqrshl_bhs(n, (int8_t)shift, 32, true, &env->QF);
-}
-
-uint32_t HELPER(mve_sqrshr)(CPUARMState *env, uint32_t n, uint32_t shift)
-{
- return do_sqrshl_bhs(n, -(int8_t)shift, 32, true, &env->QF);
-}
-
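-/*
- * VIDUP and friends: write the sequence offset, FN(offset, ...), ... into
- * successive elements (subject to predication) and return the final
- * offset for writeback to the general purpose register.
- */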
-#define DO_VIDUP(OP, ESIZE, TYPE, FN) \
- uint32_t HELPER(mve_##OP)(CPUARMState *env, void *vd, \
- uint32_t offset, uint32_t imm) \
- { \
- TYPE *d = vd; \
- uint16_t mask = mve_element_mask(env); \
- unsigned e; \
- for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
- mergemask(&d[H##ESIZE(e)], offset, mask); \
- offset = FN(offset, imm); \
- } \
- mve_advance_vpt(env); \
- return offset; \
- }
-
-#define DO_VIWDUP(OP, ESIZE, TYPE, FN) \
- uint32_t HELPER(mve_##OP)(CPUARMState *env, void *vd, \
- uint32_t offset, uint32_t wrap, \
- uint32_t imm) \
- { \
- TYPE *d = vd; \
- uint16_t mask = mve_element_mask(env); \
- unsigned e; \
- for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
- mergemask(&d[H##ESIZE(e)], offset, mask); \
- offset = FN(offset, wrap, imm); \
- } \
- mve_advance_vpt(env); \
- return offset; \
- }
-
-#define DO_VIDUP_ALL(OP, FN) \
- DO_VIDUP(OP##b, 1, int8_t, FN) \
- DO_VIDUP(OP##h, 2, int16_t, FN) \
- DO_VIDUP(OP##w, 4, int32_t, FN)
-
-#define DO_VIWDUP_ALL(OP, FN) \
- DO_VIWDUP(OP##b, 1, int8_t, FN) \
- DO_VIWDUP(OP##h, 2, int16_t, FN) \
- DO_VIWDUP(OP##w, 4, int32_t, FN)
-
-static uint32_t do_add_wrap(uint32_t offset, uint32_t wrap, uint32_t imm)
-{
- offset += imm;
- if (offset == wrap) {
- offset = 0;
- }
- return offset;
-}
-
-static uint32_t do_sub_wrap(uint32_t offset, uint32_t wrap, uint32_t imm)
-{
- if (offset == 0) {
- offset = wrap;
- }
- offset -= imm;
- return offset;
-}
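-/*
- * e.g. with wrap == 8 and imm == 2, do_add_wrap(6, 8, 2) wraps to 0 and
- * do_sub_wrap(0, 8, 2) wraps back to 6.
- */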
-
-DO_VIDUP_ALL(vidup, DO_ADD)
-DO_VIWDUP_ALL(viwdup, do_add_wrap)
-DO_VIWDUP_ALL(vdwdup, do_sub_wrap)
-
-/*
- * Vector comparison.
- * P0 bits for non-executed beats (where eci_mask is 0) are unchanged.
- * P0 bits for predicated lanes in executed beats (where mask is 0) are 0.
- * P0 bits otherwise are updated with the results of the comparisons.
- * We must also keep unchanged the MASK fields at the top of v7m.vpr.
- */
-#define DO_VCMP(OP, ESIZE, TYPE, FN) \
- void HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, void *vm) \
- { \
- TYPE *n = vn, *m = vm; \
- uint16_t mask = mve_element_mask(env); \
- uint16_t eci_mask = mve_eci_mask(env); \
- uint16_t beatpred = 0; \
- uint16_t emask = MAKE_64BIT_MASK(0, ESIZE); \
- unsigned e; \
- for (e = 0; e < 16 / ESIZE; e++) { \
- bool r = FN(n[H##ESIZE(e)], m[H##ESIZE(e)]); \
- /* Comparison sets 0/1 bits for each byte in the element */ \
- beatpred |= r * emask; \
- emask <<= ESIZE; \
- } \
- beatpred &= mask; \
- env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | \
- (beatpred & eci_mask); \
- mve_advance_vpt(env); \
- }
-
-#define DO_VCMP_SCALAR(OP, ESIZE, TYPE, FN) \
- void HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \
- uint32_t rm) \
- { \
- TYPE *n = vn; \
- uint16_t mask = mve_element_mask(env); \
- uint16_t eci_mask = mve_eci_mask(env); \
- uint16_t beatpred = 0; \
- uint16_t emask = MAKE_64BIT_MASK(0, ESIZE); \
- unsigned e; \
- for (e = 0; e < 16 / ESIZE; e++) { \
- bool r = FN(n[H##ESIZE(e)], (TYPE)rm); \
- /* Comparison sets 0/1 bits for each byte in the element */ \
- beatpred |= r * emask; \
- emask <<= ESIZE; \
- } \
- beatpred &= mask; \
- env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | \
- (beatpred & eci_mask); \
- mve_advance_vpt(env); \
- }
-
-#define DO_VCMP_S(OP, FN) \
- DO_VCMP(OP##b, 1, int8_t, FN) \
- DO_VCMP(OP##h, 2, int16_t, FN) \
- DO_VCMP(OP##w, 4, int32_t, FN) \
- DO_VCMP_SCALAR(OP##_scalarb, 1, int8_t, FN) \
- DO_VCMP_SCALAR(OP##_scalarh, 2, int16_t, FN) \
- DO_VCMP_SCALAR(OP##_scalarw, 4, int32_t, FN)
-
-#define DO_VCMP_U(OP, FN) \
- DO_VCMP(OP##b, 1, uint8_t, FN) \
- DO_VCMP(OP##h, 2, uint16_t, FN) \
- DO_VCMP(OP##w, 4, uint32_t, FN) \
- DO_VCMP_SCALAR(OP##_scalarb, 1, uint8_t, FN) \
- DO_VCMP_SCALAR(OP##_scalarh, 2, uint16_t, FN) \
- DO_VCMP_SCALAR(OP##_scalarw, 4, uint32_t, FN)
-
-#define DO_EQ(N, M) ((N) == (M))
-#define DO_NE(N, M) ((N) != (M))
-#define DO_GE(N, M) ((N) >= (M))
-#define DO_LT(N, M) ((N) < (M))
-#define DO_GT(N, M) ((N) > (M))
-#define DO_LE(N, M) ((N) <= (M))
-
-DO_VCMP_U(vcmpeq, DO_EQ)
-DO_VCMP_U(vcmpne, DO_NE)
-DO_VCMP_U(vcmpcs, DO_GE)
-DO_VCMP_U(vcmphi, DO_GT)
-DO_VCMP_S(vcmpge, DO_GE)
-DO_VCMP_S(vcmplt, DO_LT)
-DO_VCMP_S(vcmpgt, DO_GT)
-DO_VCMP_S(vcmple, DO_LE)
-
-void HELPER(mve_vpsel)(CPUARMState *env, void *vd, void *vn, void *vm)
-{
- /*
- * Qd[n] = VPR.P0[n] ? Qn[n] : Qm[n]
- * but note that whether bytes are written to Qd is still subject
- * to (all forms of) predication in the usual way.
- */
- uint64_t *d = vd, *n = vn, *m = vm;
- uint16_t mask = mve_element_mask(env);
- uint16_t p0 = FIELD_EX32(env->v7m.vpr, V7M_VPR, P0);
- unsigned e;
- for (e = 0; e < 16 / 8; e++, mask >>= 8, p0 >>= 8) {
- uint64_t r = m[H8(e)];
- mergemask(&r, n[H8(e)], p0);
- mergemask(&d[H8(e)], r, mask);
- }
- mve_advance_vpt(env);
-}
-
-void HELPER(mve_vpnot)(CPUARMState *env)
-{
- /*
- * P0 bits for unexecuted beats (where eci_mask is 0) are unchanged.
- * P0 bits for predicated lanes in executed beats (where mask is 0) are 0.
- * P0 bits otherwise are inverted.
- * (This is the same logic as VCMP.)
- * This insn is itself subject to predication and to beat-wise execution,
- * and after it executes VPT state advances in the usual way.
- */
- uint16_t mask = mve_element_mask(env);
- uint16_t eci_mask = mve_eci_mask(env);
- uint16_t beatpred = ~env->v7m.vpr & mask;
- env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | (beatpred & eci_mask);
- mve_advance_vpt(env);
-}
-
-/*
- * VCTP: P0 unexecuted bits unchanged, predicated bits zeroed,
- * otherwise set according to value of Rn. The calculation of
- * newmask here works in the same way as the calculation of the
- * ltpmask in mve_element_mask(), but we have pre-calculated
- * the masklen in the generated code.
- */
-void HELPER(mve_vctp)(CPUARMState *env, uint32_t masklen)
-{
- uint16_t mask = mve_element_mask(env);
- uint16_t eci_mask = mve_eci_mask(env);
- uint16_t newmask;
-
- assert(masklen <= 16);
- newmask = masklen ? MAKE_64BIT_MASK(0, masklen) : 0;
- newmask &= mask;
- env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | (newmask & eci_mask);
- mve_advance_vpt(env);
-}
-
-#define DO_1OP_SAT(OP, ESIZE, TYPE, FN) \
- void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm) \
- { \
- TYPE *d = vd, *m = vm; \
- uint16_t mask = mve_element_mask(env); \
- unsigned e; \
- bool qc = false; \
- for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
- bool sat = false; \
- mergemask(&d[H##ESIZE(e)], FN(m[H##ESIZE(e)], &sat), mask); \
- qc |= sat & mask & 1; \
- } \
- if (qc) { \
- env->vfp.qc[0] = qc; \
- } \
- mve_advance_vpt(env); \
- }
-
-#define DO_VQABS_B(N, SATP) \
- do_sat_bhs(DO_ABS((int64_t)N), INT8_MIN, INT8_MAX, SATP)
-#define DO_VQABS_H(N, SATP) \
- do_sat_bhs(DO_ABS((int64_t)N), INT16_MIN, INT16_MAX, SATP)
-#define DO_VQABS_W(N, SATP) \
- do_sat_bhs(DO_ABS((int64_t)N), INT32_MIN, INT32_MAX, SATP)
-
-#define DO_VQNEG_B(N, SATP) do_sat_bhs(-(int64_t)N, INT8_MIN, INT8_MAX, SATP)
-#define DO_VQNEG_H(N, SATP) do_sat_bhs(-(int64_t)N, INT16_MIN, INT16_MAX, SATP)
-#define DO_VQNEG_W(N, SATP) do_sat_bhs(-(int64_t)N, INT32_MIN, INT32_MAX, SATP)
-
-DO_1OP_SAT(vqabsb, 1, int8_t, DO_VQABS_B)
-DO_1OP_SAT(vqabsh, 2, int16_t, DO_VQABS_H)
-DO_1OP_SAT(vqabsw, 4, int32_t, DO_VQABS_W)
-
-DO_1OP_SAT(vqnegb, 1, int8_t, DO_VQNEG_B)
-DO_1OP_SAT(vqnegh, 2, int16_t, DO_VQNEG_H)
-DO_1OP_SAT(vqnegw, 4, int32_t, DO_VQNEG_W)
-
-/*
- * VMAXA, VMINA: vd is unsigned; vm is signed, and we take its
- * absolute value; we then do an unsigned comparison.
- */
-#define DO_VMAXMINA(OP, ESIZE, STYPE, UTYPE, FN) \
- void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm) \
- { \
- UTYPE *d = vd; \
- STYPE *m = vm; \
- uint16_t mask = mve_element_mask(env); \
- unsigned e; \
- for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
- UTYPE r = DO_ABS(m[H##ESIZE(e)]); \
- r = FN(d[H##ESIZE(e)], r); \
- mergemask(&d[H##ESIZE(e)], r, mask); \
- } \
- mve_advance_vpt(env); \
- }
-
-DO_VMAXMINA(vmaxab, 1, int8_t, uint8_t, DO_MAX)
-DO_VMAXMINA(vmaxah, 2, int16_t, uint16_t, DO_MAX)
-DO_VMAXMINA(vmaxaw, 4, int32_t, uint32_t, DO_MAX)
-DO_VMAXMINA(vminab, 1, int8_t, uint8_t, DO_MIN)
-DO_VMAXMINA(vminah, 2, int16_t, uint16_t, DO_MIN)
-DO_VMAXMINA(vminaw, 4, int32_t, uint32_t, DO_MIN)
-
-/*
- * 2-operand floating point. Note that if an element is partially
- * predicated we must do the FP operation to update the non-predicated
- * bytes, but we must be careful to avoid updating the FP exception
- * state unless byte 0 of the element was unpredicated.
- */
-#define DO_2OP_FP(OP, ESIZE, TYPE, FN) \
- void HELPER(glue(mve_, OP))(CPUARMState *env, \
- void *vd, void *vn, void *vm) \
- { \
- TYPE *d = vd, *n = vn, *m = vm; \
- TYPE r; \
- uint16_t mask = mve_element_mask(env); \
- unsigned e; \
- float_status *fpst; \
- float_status scratch_fpst; \
- for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
- if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \
- continue; \
- } \
- fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \
- &env->vfp.standard_fp_status; \
- if (!(mask & 1)) { \
- /* We need the result but without updating flags */ \
- scratch_fpst = *fpst; \
- fpst = &scratch_fpst; \
- } \
- r = FN(n[H##ESIZE(e)], m[H##ESIZE(e)], fpst); \
- mergemask(&d[H##ESIZE(e)], r, mask); \
- } \
- mve_advance_vpt(env); \
- }
-
-#define DO_2OP_FP_ALL(OP, FN) \
- DO_2OP_FP(OP##h, 2, float16, float16_##FN) \
- DO_2OP_FP(OP##s, 4, float32, float32_##FN)
-
-DO_2OP_FP_ALL(vfadd, add)
-DO_2OP_FP_ALL(vfsub, sub)
-DO_2OP_FP_ALL(vfmul, mul)
-
-static inline float16 float16_abd(float16 a, float16 b, float_status *s)
-{
- return float16_abs(float16_sub(a, b, s));
-}
-
-static inline float32 float32_abd(float32 a, float32 b, float_status *s)
-{
- return float32_abs(float32_sub(a, b, s));
-}
-
-DO_2OP_FP_ALL(vfabd, abd)
-DO_2OP_FP_ALL(vmaxnm, maxnum)
-DO_2OP_FP_ALL(vminnm, minnum)
-
-static inline float16 float16_maxnuma(float16 a, float16 b, float_status *s)
-{
- return float16_maxnum(float16_abs(a), float16_abs(b), s);
-}
-
-static inline float32 float32_maxnuma(float32 a, float32 b, float_status *s)
-{
- return float32_maxnum(float32_abs(a), float32_abs(b), s);
-}
-
-static inline float16 float16_minnuma(float16 a, float16 b, float_status *s)
-{
- return float16_minnum(float16_abs(a), float16_abs(b), s);
-}
-
-static inline float32 float32_minnuma(float32 a, float32 b, float_status *s)
-{
- return float32_minnum(float32_abs(a), float32_abs(b), s);
-}
-
-DO_2OP_FP_ALL(vmaxnma, maxnuma)
-DO_2OP_FP_ALL(vminnma, minnuma)
-
-#define DO_VCADD_FP(OP, ESIZE, TYPE, FN0, FN1) \
- void HELPER(glue(mve_, OP))(CPUARMState *env, \
- void *vd, void *vn, void *vm) \
- { \
- TYPE *d = vd, *n = vn, *m = vm; \
- TYPE r[16 / ESIZE]; \
- uint16_t tm, mask = mve_element_mask(env); \
- unsigned e; \
- float_status *fpst; \
- float_status scratch_fpst; \
- /* Calculate all results first to avoid overwriting inputs */ \
- for (e = 0, tm = mask; e < 16 / ESIZE; e++, tm >>= ESIZE) { \
- if ((tm & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \
- r[e] = 0; \
- continue; \
- } \
- fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \
- &env->vfp.standard_fp_status; \
- if (!(tm & 1)) { \
- /* We need the result but without updating flags */ \
- scratch_fpst = *fpst; \
- fpst = &scratch_fpst; \
- } \
- if (!(e & 1)) { \
- r[e] = FN0(n[H##ESIZE(e)], m[H##ESIZE(e + 1)], fpst); \
- } else { \
- r[e] = FN1(n[H##ESIZE(e)], m[H##ESIZE(e - 1)], fpst); \
- } \
- } \
- for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
- mergemask(&d[H##ESIZE(e)], r[e], mask); \
- } \
- mve_advance_vpt(env); \
- }
-
-DO_VCADD_FP(vfcadd90h, 2, float16, float16_sub, float16_add)
-DO_VCADD_FP(vfcadd90s, 4, float32, float32_sub, float32_add)
-DO_VCADD_FP(vfcadd270h, 2, float16, float16_add, float16_sub)
-DO_VCADD_FP(vfcadd270s, 4, float32, float32_add, float32_sub)
-
-#define DO_VFMA(OP, ESIZE, TYPE, CHS) \
- void HELPER(glue(mve_, OP))(CPUARMState *env, \
- void *vd, void *vn, void *vm) \
- { \
- TYPE *d = vd, *n = vn, *m = vm; \
- TYPE r; \
- uint16_t mask = mve_element_mask(env); \
- unsigned e; \
- float_status *fpst; \
- float_status scratch_fpst; \
- for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
- if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \
- continue; \
- } \
- fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \
- &env->vfp.standard_fp_status; \
- if (!(mask & 1)) { \
- /* We need the result but without updating flags */ \
- scratch_fpst = *fpst; \
- fpst = &scratch_fpst; \
- } \
- r = n[H##ESIZE(e)]; \
- if (CHS) { \
- r = TYPE##_chs(r); \
- } \
- r = TYPE##_muladd(r, m[H##ESIZE(e)], d[H##ESIZE(e)], \
- 0, fpst); \
- mergemask(&d[H##ESIZE(e)], r, mask); \
- } \
- mve_advance_vpt(env); \
- }
-
-DO_VFMA(vfmah, 2, float16, false)
-DO_VFMA(vfmas, 4, float32, false)
-DO_VFMA(vfmsh, 2, float16, true)
-DO_VFMA(vfmss, 4, float32, true)
-
-#define DO_VCMLA(OP, ESIZE, TYPE, ROT, FN) \
- void HELPER(glue(mve_, OP))(CPUARMState *env, \
- void *vd, void *vn, void *vm) \
- { \
- TYPE *d = vd, *n = vn, *m = vm; \
- TYPE r0, r1, e1, e2, e3, e4; \
- uint16_t mask = mve_element_mask(env); \
- unsigned e; \
- float_status *fpst0, *fpst1; \
- float_status scratch_fpst; \
- /* We loop through pairs of elements at a time */ \
- for (e = 0; e < 16 / ESIZE; e += 2, mask >>= ESIZE * 2) { \
- if ((mask & MAKE_64BIT_MASK(0, ESIZE * 2)) == 0) { \
- continue; \
- } \
- fpst0 = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \
- &env->vfp.standard_fp_status; \
- fpst1 = fpst0; \
- if (!(mask & 1)) { \
- scratch_fpst = *fpst0; \
- fpst0 = &scratch_fpst; \
- } \
- if (!(mask & (1 << ESIZE))) { \
- scratch_fpst = *fpst1; \
- fpst1 = &scratch_fpst; \
- } \
- switch (ROT) { \
- case 0: \
- e1 = m[H##ESIZE(e)]; \
- e2 = n[H##ESIZE(e)]; \
- e3 = m[H##ESIZE(e + 1)]; \
- e4 = n[H##ESIZE(e)]; \
- break; \
- case 1: \
- e1 = TYPE##_chs(m[H##ESIZE(e + 1)]); \
- e2 = n[H##ESIZE(e + 1)]; \
- e3 = m[H##ESIZE(e)]; \
- e4 = n[H##ESIZE(e + 1)]; \
- break; \
- case 2: \
- e1 = TYPE##_chs(m[H##ESIZE(e)]); \
- e2 = n[H##ESIZE(e)]; \
- e3 = TYPE##_chs(m[H##ESIZE(e + 1)]); \
- e4 = n[H##ESIZE(e)]; \
- break; \
- case 3: \
- e1 = m[H##ESIZE(e + 1)]; \
- e2 = n[H##ESIZE(e + 1)]; \
- e3 = TYPE##_chs(m[H##ESIZE(e)]); \
- e4 = n[H##ESIZE(e + 1)]; \
- break; \
- default: \
- g_assert_not_reached(); \
- } \
- r0 = FN(e2, e1, d[H##ESIZE(e)], fpst0); \
- r1 = FN(e4, e3, d[H##ESIZE(e + 1)], fpst1); \
- mergemask(&d[H##ESIZE(e)], r0, mask); \
- mergemask(&d[H##ESIZE(e + 1)], r1, mask >> ESIZE); \
- } \
- mve_advance_vpt(env); \
- }
-
-#define DO_VCMULH(N, M, D, S) float16_mul(N, M, S)
-#define DO_VCMULS(N, M, D, S) float32_mul(N, M, S)
-
-#define DO_VCMLAH(N, M, D, S) float16_muladd(N, M, D, 0, S)
-#define DO_VCMLAS(N, M, D, S) float32_muladd(N, M, D, 0, S)
-
-DO_VCMLA(vcmul0h, 2, float16, 0, DO_VCMULH)
-DO_VCMLA(vcmul0s, 4, float32, 0, DO_VCMULS)
-DO_VCMLA(vcmul90h, 2, float16, 1, DO_VCMULH)
-DO_VCMLA(vcmul90s, 4, float32, 1, DO_VCMULS)
-DO_VCMLA(vcmul180h, 2, float16, 2, DO_VCMULH)
-DO_VCMLA(vcmul180s, 4, float32, 2, DO_VCMULS)
-DO_VCMLA(vcmul270h, 2, float16, 3, DO_VCMULH)
-DO_VCMLA(vcmul270s, 4, float32, 3, DO_VCMULS)
-
-DO_VCMLA(vcmla0h, 2, float16, 0, DO_VCMLAH)
-DO_VCMLA(vcmla0s, 4, float32, 0, DO_VCMLAS)
-DO_VCMLA(vcmla90h, 2, float16, 1, DO_VCMLAH)
-DO_VCMLA(vcmla90s, 4, float32, 1, DO_VCMLAS)
-DO_VCMLA(vcmla180h, 2, float16, 2, DO_VCMLAH)
-DO_VCMLA(vcmla180s, 4, float32, 2, DO_VCMLAS)
-DO_VCMLA(vcmla270h, 2, float16, 3, DO_VCMLAH)
-DO_VCMLA(vcmla270s, 4, float32, 3, DO_VCMLAS)
-
-#define DO_2OP_FP_SCALAR(OP, ESIZE, TYPE, FN) \
- void HELPER(glue(mve_, OP))(CPUARMState *env, \
- void *vd, void *vn, uint32_t rm) \
- { \
- TYPE *d = vd, *n = vn; \
- TYPE r, m = rm; \
- uint16_t mask = mve_element_mask(env); \
- unsigned e; \
- float_status *fpst; \
- float_status scratch_fpst; \
- for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
- if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \
- continue; \
- } \
- fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \
- &env->vfp.standard_fp_status; \
- if (!(mask & 1)) { \
- /* We need the result but without updating flags */ \
- scratch_fpst = *fpst; \
- fpst = &scratch_fpst; \
- } \
- r = FN(n[H##ESIZE(e)], m, fpst); \
- mergemask(&d[H##ESIZE(e)], r, mask); \
- } \
- mve_advance_vpt(env); \
- }
-
-#define DO_2OP_FP_SCALAR_ALL(OP, FN) \
- DO_2OP_FP_SCALAR(OP##h, 2, float16, float16_##FN) \
- DO_2OP_FP_SCALAR(OP##s, 4, float32, float32_##FN)
-
-DO_2OP_FP_SCALAR_ALL(vfadd_scalar, add)
-DO_2OP_FP_SCALAR_ALL(vfsub_scalar, sub)
-DO_2OP_FP_SCALAR_ALL(vfmul_scalar, mul)
-
-#define DO_2OP_FP_ACC_SCALAR(OP, ESIZE, TYPE, FN) \
- void HELPER(glue(mve_, OP))(CPUARMState *env, \
- void *vd, void *vn, uint32_t rm) \
- { \
- TYPE *d = vd, *n = vn; \
- TYPE r, m = rm; \
- uint16_t mask = mve_element_mask(env); \
- unsigned e; \
- float_status *fpst; \
- float_status scratch_fpst; \
- for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
- if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \
- continue; \
- } \
- fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \
- &env->vfp.standard_fp_status; \
- if (!(mask & 1)) { \
- /* We need the result but without updating flags */ \
- scratch_fpst = *fpst; \
- fpst = &scratch_fpst; \
- } \
- r = FN(n[H##ESIZE(e)], m, d[H##ESIZE(e)], 0, fpst); \
- mergemask(&d[H##ESIZE(e)], r, mask); \
- } \
- mve_advance_vpt(env); \
- }
-
-/* VFMAS is vector * vector + scalar, so swap op2 and op3 */
-#define DO_VFMAS_SCALARH(N, M, D, F, S) float16_muladd(N, D, M, F, S)
-#define DO_VFMAS_SCALARS(N, M, D, F, S) float32_muladd(N, D, M, F, S)
-
-/* VFMA is vector * scalar + vector */
-DO_2OP_FP_ACC_SCALAR(vfma_scalarh, 2, float16, float16_muladd)
-DO_2OP_FP_ACC_SCALAR(vfma_scalars, 4, float32, float32_muladd)
-DO_2OP_FP_ACC_SCALAR(vfmas_scalarh, 2, float16, DO_VFMAS_SCALARH)
-DO_2OP_FP_ACC_SCALAR(vfmas_scalars, 4, float32, DO_VFMAS_SCALARS)
-
-/* Floating point max/min across vector. */
-#define DO_FP_VMAXMINV(OP, ESIZE, TYPE, ABS, FN) \
- uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \
- uint32_t ra_in) \
- { \
- uint16_t mask = mve_element_mask(env); \
- unsigned e; \
- TYPE *m = vm; \
- TYPE ra = (TYPE)ra_in; \
- float_status *fpst = (ESIZE == 2) ? \
- &env->vfp.standard_fp_status_f16 : \
- &env->vfp.standard_fp_status; \
- for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
- if (mask & 1) { \
- TYPE v = m[H##ESIZE(e)]; \
- if (TYPE##_is_signaling_nan(ra, fpst)) { \
- ra = TYPE##_silence_nan(ra, fpst); \
- float_raise(float_flag_invalid, fpst); \
- } \
- if (TYPE##_is_signaling_nan(v, fpst)) { \
- v = TYPE##_silence_nan(v, fpst); \
- float_raise(float_flag_invalid, fpst); \
- } \
- if (ABS) { \
- v = TYPE##_abs(v); \
- } \
- ra = FN(ra, v, fpst); \
- } \
- } \
- mve_advance_vpt(env); \
- return ra; \
- } \
-
-#define NOP(X) (X)
-
-DO_FP_VMAXMINV(vmaxnmvh, 2, float16, false, float16_maxnum)
-DO_FP_VMAXMINV(vmaxnmvs, 4, float32, false, float32_maxnum)
-DO_FP_VMAXMINV(vminnmvh, 2, float16, false, float16_minnum)
-DO_FP_VMAXMINV(vminnmvs, 4, float32, false, float32_minnum)
-DO_FP_VMAXMINV(vmaxnmavh, 2, float16, true, float16_maxnum)
-DO_FP_VMAXMINV(vmaxnmavs, 4, float32, true, float32_maxnum)
-DO_FP_VMAXMINV(vminnmavh, 2, float16, true, float16_minnum)
-DO_FP_VMAXMINV(vminnmavs, 4, float32, true, float32_minnum)
-
-/* FP compares; note that all comparisons signal InvalidOp for QNaNs */
-#define DO_VCMP_FP(OP, ESIZE, TYPE, FN) \
- void HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, void *vm) \
- { \
- TYPE *n = vn, *m = vm; \
- uint16_t mask = mve_element_mask(env); \
- uint16_t eci_mask = mve_eci_mask(env); \
- uint16_t beatpred = 0; \
- uint16_t emask = MAKE_64BIT_MASK(0, ESIZE); \
- unsigned e; \
- float_status *fpst; \
- float_status scratch_fpst; \
- bool r; \
- for (e = 0; e < 16 / ESIZE; e++, emask <<= ESIZE) { \
- if ((mask & emask) == 0) { \
- continue; \
- } \
- fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \
- &env->vfp.standard_fp_status; \
- if (!(mask & (1 << (e * ESIZE)))) { \
- /* We need the result but without updating flags */ \
- scratch_fpst = *fpst; \
- fpst = &scratch_fpst; \
- } \
- r = FN(n[H##ESIZE(e)], m[H##ESIZE(e)], fpst); \
- /* Comparison sets 0/1 bits for each byte in the element */ \
- beatpred |= r * emask; \
- } \
- beatpred &= mask; \
- env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | \
- (beatpred & eci_mask); \
- mve_advance_vpt(env); \
- }
-
-#define DO_VCMP_FP_SCALAR(OP, ESIZE, TYPE, FN) \
- void HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \
- uint32_t rm) \
- { \
- TYPE *n = vn; \
- uint16_t mask = mve_element_mask(env); \
- uint16_t eci_mask = mve_eci_mask(env); \
- uint16_t beatpred = 0; \
- uint16_t emask = MAKE_64BIT_MASK(0, ESIZE); \
- unsigned e; \
- float_status *fpst; \
- float_status scratch_fpst; \
- bool r; \
- for (e = 0; e < 16 / ESIZE; e++, emask <<= ESIZE) { \
- if ((mask & emask) == 0) { \
- continue; \
- } \
- fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \
- &env->vfp.standard_fp_status; \
- if (!(mask & (1 << (e * ESIZE)))) { \
- /* We need the result but without updating flags */ \
- scratch_fpst = *fpst; \
- fpst = &scratch_fpst; \
- } \
- r = FN(n[H##ESIZE(e)], (TYPE)rm, fpst); \
- /* Comparison sets 0/1 bits for each byte in the element */ \
- beatpred |= r * emask; \
- } \
- beatpred &= mask; \
- env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | \
- (beatpred & eci_mask); \
- mve_advance_vpt(env); \
- }
-
-#define DO_VCMP_FP_BOTH(VOP, SOP, ESIZE, TYPE, FN) \
- DO_VCMP_FP(VOP, ESIZE, TYPE, FN) \
- DO_VCMP_FP_SCALAR(SOP, ESIZE, TYPE, FN)
-
-/*
- * Some care is needed here to get the correct result for the unordered case.
- * Architecturally EQ, GE and GT are defined to be false for unordered, but
- * the NE, LT and LE comparisons are defined as simple logical inverses of
- * EQ, GE and GT and so they must return true for unordered. The softfloat
- * comparison functions float*_{eq,le,lt} all return false for unordered.
- */
-#define DO_GE16(X, Y, S) float16_le(Y, X, S)
-#define DO_GE32(X, Y, S) float32_le(Y, X, S)
-#define DO_GT16(X, Y, S) float16_lt(Y, X, S)
-#define DO_GT32(X, Y, S) float32_lt(Y, X, S)
-
-DO_VCMP_FP_BOTH(vfcmpeqh, vfcmpeq_scalarh, 2, float16, float16_eq)
-DO_VCMP_FP_BOTH(vfcmpeqs, vfcmpeq_scalars, 4, float32, float32_eq)
-
-DO_VCMP_FP_BOTH(vfcmpneh, vfcmpne_scalarh, 2, float16, !float16_eq)
-DO_VCMP_FP_BOTH(vfcmpnes, vfcmpne_scalars, 4, float32, !float32_eq)
-
-DO_VCMP_FP_BOTH(vfcmpgeh, vfcmpge_scalarh, 2, float16, DO_GE16)
-DO_VCMP_FP_BOTH(vfcmpges, vfcmpge_scalars, 4, float32, DO_GE32)
-
-DO_VCMP_FP_BOTH(vfcmplth, vfcmplt_scalarh, 2, float16, !DO_GE16)
-DO_VCMP_FP_BOTH(vfcmplts, vfcmplt_scalars, 4, float32, !DO_GE32)
-
-DO_VCMP_FP_BOTH(vfcmpgth, vfcmpgt_scalarh, 2, float16, DO_GT16)
-DO_VCMP_FP_BOTH(vfcmpgts, vfcmpgt_scalars, 4, float32, DO_GT32)
-
-DO_VCMP_FP_BOTH(vfcmpleh, vfcmple_scalarh, 2, float16, !DO_GT16)
-DO_VCMP_FP_BOTH(vfcmples, vfcmple_scalars, 4, float32, !DO_GT32)
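As a standalone illustration of the unordered-case reasoning above (plain C outside QEMU; cmp_ge and cmp_gt are illustrative names, not the helper API), ordinary IEEE comparisons behave the same way: GE/GT built from a swapped less-than/less-or-equal are false when an operand is a NaN, so the negated forms used for NE/LT/LE come out true.

#include <math.h>
#include <stdbool.h>
#include <stdio.h>

/* GE(x, y) implemented as le(y, x); false whenever either operand is NaN */
static bool cmp_ge(float x, float y) { return y <= x; }
/* GT(x, y) implemented as lt(y, x); likewise false for the unordered case */
static bool cmp_gt(float x, float y) { return y < x; }

int main(void)
{
    float a = NAN, b = 1.0f;
    /* EQ, GE and GT are all false for unordered inputs... */
    printf("EQ=%d GE=%d GT=%d\n", a == b, cmp_ge(a, b), cmp_gt(a, b));
    /* ...so their logical inverses NE, LE and LT are all true */
    printf("NE=%d LE=%d LT=%d\n", a != b, !cmp_gt(a, b), !cmp_ge(a, b));
    return 0;
}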
-
-#define DO_VCVT_FIXED(OP, ESIZE, TYPE, FN) \
- void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vm, \
- uint32_t shift) \
- { \
- TYPE *d = vd, *m = vm; \
- TYPE r; \
- uint16_t mask = mve_element_mask(env); \
- unsigned e; \
- float_status *fpst; \
- float_status scratch_fpst; \
- for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
- if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \
- continue; \
- } \
- fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \
- &env->vfp.standard_fp_status; \
- if (!(mask & 1)) { \
- /* We need the result but without updating flags */ \
- scratch_fpst = *fpst; \
- fpst = &scratch_fpst; \
- } \
- r = FN(m[H##ESIZE(e)], shift, fpst); \
- mergemask(&d[H##ESIZE(e)], r, mask); \
- } \
- mve_advance_vpt(env); \
- }
-
-DO_VCVT_FIXED(vcvt_sh, 2, int16_t, helper_vfp_shtoh)
-DO_VCVT_FIXED(vcvt_uh, 2, uint16_t, helper_vfp_uhtoh)
-DO_VCVT_FIXED(vcvt_hs, 2, int16_t, helper_vfp_toshh_round_to_zero)
-DO_VCVT_FIXED(vcvt_hu, 2, uint16_t, helper_vfp_touhh_round_to_zero)
-DO_VCVT_FIXED(vcvt_sf, 4, int32_t, helper_vfp_sltos)
-DO_VCVT_FIXED(vcvt_uf, 4, uint32_t, helper_vfp_ultos)
-DO_VCVT_FIXED(vcvt_fs, 4, int32_t, helper_vfp_tosls_round_to_zero)
-DO_VCVT_FIXED(vcvt_fu, 4, uint32_t, helper_vfp_touls_round_to_zero)
-
-/* VCVT with specified rmode */
-#define DO_VCVT_RMODE(OP, ESIZE, TYPE, FN) \
- void HELPER(glue(mve_, OP))(CPUARMState *env, \
- void *vd, void *vm, uint32_t rmode) \
- { \
- TYPE *d = vd, *m = vm; \
- TYPE r; \
- uint16_t mask = mve_element_mask(env); \
- unsigned e; \
- float_status *fpst; \
- float_status scratch_fpst; \
- float_status *base_fpst = (ESIZE == 2) ? \
- &env->vfp.standard_fp_status_f16 : \
- &env->vfp.standard_fp_status; \
- uint32_t prev_rmode = get_float_rounding_mode(base_fpst); \
- set_float_rounding_mode(rmode, base_fpst); \
- for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
- if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \
- continue; \
- } \
- fpst = base_fpst; \
- if (!(mask & 1)) { \
- /* We need the result but without updating flags */ \
- scratch_fpst = *fpst; \
- fpst = &scratch_fpst; \
- } \
- r = FN(m[H##ESIZE(e)], 0, fpst); \
- mergemask(&d[H##ESIZE(e)], r, mask); \
- } \
- set_float_rounding_mode(prev_rmode, base_fpst); \
- mve_advance_vpt(env); \
- }
-
-DO_VCVT_RMODE(vcvt_rm_sh, 2, uint16_t, helper_vfp_toshh)
-DO_VCVT_RMODE(vcvt_rm_uh, 2, uint16_t, helper_vfp_touhh)
-DO_VCVT_RMODE(vcvt_rm_ss, 4, uint32_t, helper_vfp_tosls)
-DO_VCVT_RMODE(vcvt_rm_us, 4, uint32_t, helper_vfp_touls)
-
-#define DO_VRINT_RM_H(M, F, S) helper_rinth(M, S)
-#define DO_VRINT_RM_S(M, F, S) helper_rints(M, S)
-
-DO_VCVT_RMODE(vrint_rm_h, 2, uint16_t, DO_VRINT_RM_H)
-DO_VCVT_RMODE(vrint_rm_s, 4, uint32_t, DO_VRINT_RM_S)
-
-/*
- * VCVT between halfprec and singleprec. As usual for halfprec
- * conversions, FZ16 is ignored and AHP is observed.
- */
-static void do_vcvt_sh(CPUARMState *env, void *vd, void *vm, int top)
-{
- uint16_t *d = vd;
- uint32_t *m = vm;
- uint16_t r;
- uint16_t mask = mve_element_mask(env);
- bool ieee = !(env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_AHP);
- unsigned e;
- float_status *fpst;
- float_status scratch_fpst;
- float_status *base_fpst = &env->vfp.standard_fp_status;
- bool old_fz = get_flush_to_zero(base_fpst);
- set_flush_to_zero(false, base_fpst);
- for (e = 0; e < 16 / 4; e++, mask >>= 4) {
- if ((mask & MAKE_64BIT_MASK(0, 4)) == 0) {
- continue;
- }
- fpst = base_fpst;
- if (!(mask & 1)) {
- /* We need the result but without updating flags */
- scratch_fpst = *fpst;
- fpst = &scratch_fpst;
- }
- r = float32_to_float16(m[H4(e)], ieee, fpst);
- mergemask(&d[H2(e * 2 + top)], r, mask >> (top * 2));
- }
- set_flush_to_zero(old_fz, base_fpst);
- mve_advance_vpt(env);
-}
-
-static void do_vcvt_hs(CPUARMState *env, void *vd, void *vm, int top)
-{
- uint32_t *d = vd;
- uint16_t *m = vm;
- uint32_t r;
- uint16_t mask = mve_element_mask(env);
- bool ieee = !(env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_AHP);
- unsigned e;
- float_status *fpst;
- float_status scratch_fpst;
- float_status *base_fpst = &env->vfp.standard_fp_status;
- bool old_fiz = get_flush_inputs_to_zero(base_fpst);
- set_flush_inputs_to_zero(false, base_fpst);
- for (e = 0; e < 16 / 4; e++, mask >>= 4) {
- if ((mask & MAKE_64BIT_MASK(0, 4)) == 0) {
- continue;
- }
- fpst = base_fpst;
- if (!(mask & (1 << (top * 2)))) {
- /* We need the result but without updating flags */
- scratch_fpst = *fpst;
- fpst = &scratch_fpst;
- }
- r = float16_to_float32(m[H2(e * 2 + top)], ieee, fpst);
- mergemask(&d[H4(e)], r, mask);
- }
- set_flush_inputs_to_zero(old_fiz, base_fpst);
- mve_advance_vpt(env);
-}
-
-void HELPER(mve_vcvtb_sh)(CPUARMState *env, void *vd, void *vm)
-{
- do_vcvt_sh(env, vd, vm, 0);
-}
-void HELPER(mve_vcvtt_sh)(CPUARMState *env, void *vd, void *vm)
-{
- do_vcvt_sh(env, vd, vm, 1);
-}
-void HELPER(mve_vcvtb_hs)(CPUARMState *env, void *vd, void *vm)
-{
- do_vcvt_hs(env, vd, vm, 0);
-}
-void HELPER(mve_vcvtt_hs)(CPUARMState *env, void *vd, void *vm)
-{
- do_vcvt_hs(env, vd, vm, 1);
-}
-
-#define DO_1OP_FP(OP, ESIZE, TYPE, FN) \
- void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vm) \
- { \
- TYPE *d = vd, *m = vm; \
- TYPE r; \
- uint16_t mask = mve_element_mask(env); \
- unsigned e; \
- float_status *fpst; \
- float_status scratch_fpst; \
- for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
- if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \
- continue; \
- } \
- fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \
- &env->vfp.standard_fp_status; \
- if (!(mask & 1)) { \
- /* We need the result but without updating flags */ \
- scratch_fpst = *fpst; \
- fpst = &scratch_fpst; \
- } \
- r = FN(m[H##ESIZE(e)], fpst); \
- mergemask(&d[H##ESIZE(e)], r, mask); \
- } \
- mve_advance_vpt(env); \
- }
-
-DO_1OP_FP(vrintx_h, 2, float16, float16_round_to_int)
-DO_1OP_FP(vrintx_s, 4, float32, float32_round_to_int)
+++ /dev/null
-/*
- * ARM NEON vector operations.
- *
- * Copyright (c) 2007, 2008 CodeSourcery.
- * Written by Paul Brook
- *
- * This code is licensed under the GNU GPL v2.
- */
-#include "qemu/osdep.h"
-
-#include "cpu.h"
-#include "exec/helper-proto.h"
-#include "fpu/softfloat.h"
-#include "vec_internal.h"
-
-#define SIGNBIT (uint32_t)0x80000000
-#define SIGNBIT64 ((uint64_t)1 << 63)
-
-#define SET_QC() env->vfp.qc[0] = 1
-
-#define NEON_TYPE1(name, type) \
-typedef struct \
-{ \
- type v1; \
-} neon_##name;
-#if HOST_BIG_ENDIAN
-#define NEON_TYPE2(name, type) \
-typedef struct \
-{ \
- type v2; \
- type v1; \
-} neon_##name;
-#define NEON_TYPE4(name, type) \
-typedef struct \
-{ \
- type v4; \
- type v3; \
- type v2; \
- type v1; \
-} neon_##name;
-#else
-#define NEON_TYPE2(name, type) \
-typedef struct \
-{ \
- type v1; \
- type v2; \
-} neon_##name;
-#define NEON_TYPE4(name, type) \
-typedef struct \
-{ \
- type v1; \
- type v2; \
- type v3; \
- type v4; \
-} neon_##name;
-#endif
-
-NEON_TYPE4(s8, int8_t)
-NEON_TYPE4(u8, uint8_t)
-NEON_TYPE2(s16, int16_t)
-NEON_TYPE2(u16, uint16_t)
-NEON_TYPE1(s32, int32_t)
-NEON_TYPE1(u32, uint32_t)
-#undef NEON_TYPE4
-#undef NEON_TYPE2
-#undef NEON_TYPE1
-
-/* Copy from a uint32_t to a vector structure type. */
-#define NEON_UNPACK(vtype, dest, val) do { \
- union { \
- vtype v; \
- uint32_t i; \
- } conv_u; \
- conv_u.i = (val); \
- dest = conv_u.v; \
- } while(0)
-
-/* Copy from a vector structure type to a uint32_t. */
-#define NEON_PACK(vtype, dest, val) do { \
- union { \
- vtype v; \
- uint32_t i; \
- } conv_u; \
- conv_u.v = (val); \
- dest = conv_u.i; \
- } while(0)
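A standalone sketch of the same union-based pack/unpack trick (plain C; the lanes_u8 struct and its field names are illustrative and assume a little-endian host): a packed uint32_t is viewed as named per-lane fields, one lane is updated, and the value is repacked.

#include <stdint.h>
#include <stdio.h>

typedef struct { uint8_t v1, v2, v3, v4; } lanes_u8;   /* little-endian lane order */

int main(void)
{
    union { lanes_u8 v; uint32_t i; } conv_u;

    conv_u.i = 0x44332211u;          /* unpack: lanes are 0x11, 0x22, 0x33, 0x44 */
    conv_u.v.v3 = 0xaa;              /* operate on a single lane */
    printf("0x%08x\n", conv_u.i);    /* pack: prints 0x44aa2211 */
    return 0;
}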
-
-#define NEON_DO1 \
- NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
-#define NEON_DO2 \
- NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
- NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
-#define NEON_DO4 \
- NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
- NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
- NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
- NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);
-
-#define NEON_VOP_BODY(vtype, n) \
-{ \
- uint32_t res; \
- vtype vsrc1; \
- vtype vsrc2; \
- vtype vdest; \
- NEON_UNPACK(vtype, vsrc1, arg1); \
- NEON_UNPACK(vtype, vsrc2, arg2); \
- NEON_DO##n; \
- NEON_PACK(vtype, res, vdest); \
- return res; \
-}
-
-#define NEON_VOP(name, vtype, n) \
-uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
-NEON_VOP_BODY(vtype, n)
-
-#define NEON_VOP_ENV(name, vtype, n) \
-uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
-NEON_VOP_BODY(vtype, n)
-
-/* Pairwise operations. */
-/* For 32-bit elements each segment only contains a single element, so
- the elementwise and pairwise operations are the same. */
-#define NEON_PDO2 \
- NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
- NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
-#define NEON_PDO4 \
- NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
- NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
- NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
- NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \
-
-#define NEON_POP(name, vtype, n) \
-uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
-{ \
- uint32_t res; \
- vtype vsrc1; \
- vtype vsrc2; \
- vtype vdest; \
- NEON_UNPACK(vtype, vsrc1, arg1); \
- NEON_UNPACK(vtype, vsrc2, arg2); \
- NEON_PDO##n; \
- NEON_PACK(vtype, res, vdest); \
- return res; \
-}
-
-/* Unary operators. */
-#define NEON_VOP1(name, vtype, n) \
-uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
-{ \
- vtype vsrc1; \
- vtype vdest; \
- NEON_UNPACK(vtype, vsrc1, arg); \
- NEON_DO##n; \
- NEON_PACK(vtype, arg, vdest); \
- return arg; \
-}
-
-
-#define NEON_USAT(dest, src1, src2, type) do { \
- uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
- if (tmp != (type)tmp) { \
- SET_QC(); \
- dest = ~0; \
- } else { \
- dest = tmp; \
- }} while(0)
-#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
-NEON_VOP_ENV(qadd_u8, neon_u8, 4)
-#undef NEON_FN
-#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
-NEON_VOP_ENV(qadd_u16, neon_u16, 2)
-#undef NEON_FN
-#undef NEON_USAT
-
-uint32_t HELPER(neon_qadd_u32)(CPUARMState *env, uint32_t a, uint32_t b)
-{
- uint32_t res = a + b;
- if (res < a) {
- SET_QC();
- res = ~0;
- }
- return res;
-}
-
-uint64_t HELPER(neon_qadd_u64)(CPUARMState *env, uint64_t src1, uint64_t src2)
-{
- uint64_t res;
-
- res = src1 + src2;
- if (res < src1) {
- SET_QC();
- res = ~(uint64_t)0;
- }
- return res;
-}
-
-#define NEON_SSAT(dest, src1, src2, type) do { \
- int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
- if (tmp != (type)tmp) { \
- SET_QC(); \
- if (src2 > 0) { \
- tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
- } else { \
- tmp = 1 << (sizeof(type) * 8 - 1); \
- } \
- } \
- dest = tmp; \
- } while(0)
-#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
-NEON_VOP_ENV(qadd_s8, neon_s8, 4)
-#undef NEON_FN
-#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
-NEON_VOP_ENV(qadd_s16, neon_s16, 2)
-#undef NEON_FN
-#undef NEON_SSAT
-
-uint32_t HELPER(neon_qadd_s32)(CPUARMState *env, uint32_t a, uint32_t b)
-{
- uint32_t res = a + b;
- if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) {
- SET_QC();
- res = ~(((int32_t)a >> 31) ^ SIGNBIT);
- }
- return res;
-}
-
-uint64_t HELPER(neon_qadd_s64)(CPUARMState *env, uint64_t src1, uint64_t src2)
-{
- uint64_t res;
-
- res = src1 + src2;
- if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) {
- SET_QC();
- res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
- }
- return res;
-}
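A minimal standalone sketch of the signed-overflow test used by the saturating adds above (plain C outside QEMU; qadd_s32 is an illustrative name and the QC update is omitted): signed overflow happened exactly when both operands had the same sign and the result's sign differs, and the saturation value follows from the sign of one operand.

#include <stdint.h>
#include <stdio.h>

#define SIGNBIT 0x80000000u

static uint32_t qadd_s32(uint32_t a, uint32_t b)
{
    uint32_t res = a + b;                      /* wrapping add on the bit patterns */
    if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) {
        /* INT32_MAX if a was non-negative, INT32_MIN if a was negative */
        res = ~(((int32_t)a >> 31) ^ SIGNBIT);
    }
    return res;
}

int main(void)
{
    printf("%d\n", (int32_t)qadd_s32(0x7fffffffu, 1));            /* 2147483647  */
    printf("%d\n", (int32_t)qadd_s32(0x80000000u, (uint32_t)-1)); /* -2147483648 */
    return 0;
}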
-
-/* Unsigned saturating accumulate of signed value
- *
- * Op1/Rn is treated as signed
- * Op2/Rd is treated as unsigned
- *
- * Explicit casting is used to ensure the correct sign extension of
- * inputs. The result is treated as an unsigned value and saturated as such.
- *
- * We use a macro for the 8/16 bit cases which expects signed integers va,
- * vb and vr for the interim calculation and an unsigned 32 bit result value r.
- */
-
-#define USATACC(bits, shift) \
- do { \
- va = sextract32(a, shift, bits); \
- vb = extract32(b, shift, bits); \
- vr = va + vb; \
- if (vr > UINT##bits##_MAX) { \
- SET_QC(); \
- vr = UINT##bits##_MAX; \
- } else if (vr < 0) { \
- SET_QC(); \
- vr = 0; \
- } \
- r = deposit32(r, shift, bits, vr); \
- } while (0)
-
-uint32_t HELPER(neon_uqadd_s8)(CPUARMState *env, uint32_t a, uint32_t b)
-{
- int16_t va, vb, vr;
- uint32_t r = 0;
-
- USATACC(8, 0);
- USATACC(8, 8);
- USATACC(8, 16);
- USATACC(8, 24);
- return r;
-}
-
-uint32_t HELPER(neon_uqadd_s16)(CPUARMState *env, uint32_t a, uint32_t b)
-{
- int32_t va, vb, vr;
- uint64_t r = 0;
-
- USATACC(16, 0);
- USATACC(16, 16);
- return r;
-}
-
-#undef USATACC
-
-uint32_t HELPER(neon_uqadd_s32)(CPUARMState *env, uint32_t a, uint32_t b)
-{
- int64_t va = (int32_t)a;
- int64_t vb = (uint32_t)b;
- int64_t vr = va + vb;
- if (vr > UINT32_MAX) {
- SET_QC();
- vr = UINT32_MAX;
- } else if (vr < 0) {
- SET_QC();
- vr = 0;
- }
- return vr;
-}
-
-uint64_t HELPER(neon_uqadd_s64)(CPUARMState *env, uint64_t a, uint64_t b)
-{
- uint64_t res;
- res = a + b;
- /* We only need to look at the pattern of SIGN bits to detect
- * +ve/-ve saturation
- */
- if (~a & b & ~res & SIGNBIT64) {
- SET_QC();
- res = UINT64_MAX;
- } else if (a & ~b & res & SIGNBIT64) {
- SET_QC();
- res = 0;
- }
- return res;
-}
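A standalone sketch of one 8-bit lane of this operation (plain C; uqadd8_lane is an illustrative name and the QC update is left out): the signed addend is sign-extended, the unsigned accumulator is zero-extended, and the sum is clamped to [0, 255].

#include <stdint.h>
#include <stdio.h>

static uint8_t uqadd8_lane(int8_t signed_op, uint8_t unsigned_acc)
{
    int32_t sum = (int32_t)signed_op + (int32_t)unsigned_acc;  /* widen both */
    if (sum > UINT8_MAX) {
        return UINT8_MAX;      /* the real helper would also SET_QC() here */
    }
    if (sum < 0) {
        return 0;              /* the real helper would also SET_QC() here */
    }
    return (uint8_t)sum;
}

int main(void)
{
    printf("%u\n", uqadd8_lane(-20, 10));    /* 0: saturated low    */
    printf("%u\n", uqadd8_lane(100, 200));   /* 255: saturated high */
    printf("%u\n", uqadd8_lane(-5, 50));     /* 45 */
    return 0;
}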
-
-/* Signed saturating accumulate of unsigned value
- *
- * Op1/Rn is treated as unsigned
- * Op2/Rd is treated as signed
- *
- * The result is treated as a signed value and saturated as such
- *
- * We use a macro for the 8/16 bit cases which expects signed integers va,
- * vb and vr for the interim calculation and an unsigned 32 bit result value r.
- */
-
-#define SSATACC(bits, shift) \
- do { \
- va = extract32(a, shift, bits); \
- vb = sextract32(b, shift, bits); \
- vr = va + vb; \
- if (vr > INT##bits##_MAX) { \
- SET_QC(); \
- vr = INT##bits##_MAX; \
- } else if (vr < INT##bits##_MIN) { \
- SET_QC(); \
- vr = INT##bits##_MIN; \
- } \
- r = deposit32(r, shift, bits, vr); \
- } while (0)
-
-uint32_t HELPER(neon_sqadd_u8)(CPUARMState *env, uint32_t a, uint32_t b)
-{
- int16_t va, vb, vr;
- uint32_t r = 0;
-
- SSATACC(8, 0);
- SSATACC(8, 8);
- SSATACC(8, 16);
- SSATACC(8, 24);
- return r;
-}
-
-uint32_t HELPER(neon_sqadd_u16)(CPUARMState *env, uint32_t a, uint32_t b)
-{
- int32_t va, vb, vr;
- uint32_t r = 0;
-
- SSATACC(16, 0);
- SSATACC(16, 16);
-
- return r;
-}
-
-#undef SSATACC
-
-uint32_t HELPER(neon_sqadd_u32)(CPUARMState *env, uint32_t a, uint32_t b)
-{
- int64_t res;
- int64_t op1 = (uint32_t)a;
- int64_t op2 = (int32_t)b;
- res = op1 + op2;
- if (res > INT32_MAX) {
- SET_QC();
- res = INT32_MAX;
- } else if (res < INT32_MIN) {
- SET_QC();
- res = INT32_MIN;
- }
- return res;
-}
-
-uint64_t HELPER(neon_sqadd_u64)(CPUARMState *env, uint64_t a, uint64_t b)
-{
- uint64_t res;
- res = a + b;
- /* We only need to look at the pattern of SIGN bits to detect an overflow */
- if (((a & res)
- | (~b & res)
- | (a & ~b)) & SIGNBIT64) {
- SET_QC();
- res = INT64_MAX;
- }
- return res;
-}
-
-
-#define NEON_USAT(dest, src1, src2, type) do { \
- uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
- if (tmp != (type)tmp) { \
- SET_QC(); \
- dest = 0; \
- } else { \
- dest = tmp; \
- }} while(0)
-#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
-NEON_VOP_ENV(qsub_u8, neon_u8, 4)
-#undef NEON_FN
-#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
-NEON_VOP_ENV(qsub_u16, neon_u16, 2)
-#undef NEON_FN
-#undef NEON_USAT
-
-uint32_t HELPER(neon_qsub_u32)(CPUARMState *env, uint32_t a, uint32_t b)
-{
- uint32_t res = a - b;
- if (res > a) {
- SET_QC();
- res = 0;
- }
- return res;
-}
-
-uint64_t HELPER(neon_qsub_u64)(CPUARMState *env, uint64_t src1, uint64_t src2)
-{
- uint64_t res;
-
- if (src1 < src2) {
- SET_QC();
- res = 0;
- } else {
- res = src1 - src2;
- }
- return res;
-}
-
-#define NEON_SSAT(dest, src1, src2, type) do { \
- int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
- if (tmp != (type)tmp) { \
- SET_QC(); \
- if (src2 < 0) { \
- tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
- } else { \
- tmp = 1 << (sizeof(type) * 8 - 1); \
- } \
- } \
- dest = tmp; \
- } while(0)
-#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
-NEON_VOP_ENV(qsub_s8, neon_s8, 4)
-#undef NEON_FN
-#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
-NEON_VOP_ENV(qsub_s16, neon_s16, 2)
-#undef NEON_FN
-#undef NEON_SSAT
-
-uint32_t HELPER(neon_qsub_s32)(CPUARMState *env, uint32_t a, uint32_t b)
-{
- uint32_t res = a - b;
- if (((res ^ a) & SIGNBIT) && ((a ^ b) & SIGNBIT)) {
- SET_QC();
- res = ~(((int32_t)a >> 31) ^ SIGNBIT);
- }
- return res;
-}
-
-uint64_t HELPER(neon_qsub_s64)(CPUARMState *env, uint64_t src1, uint64_t src2)
-{
- uint64_t res;
-
- res = src1 - src2;
- if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) {
- SET_QC();
- res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
- }
- return res;
-}
-
-#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
-NEON_VOP(hadd_s8, neon_s8, 4)
-NEON_VOP(hadd_u8, neon_u8, 4)
-NEON_VOP(hadd_s16, neon_s16, 2)
-NEON_VOP(hadd_u16, neon_u16, 2)
-#undef NEON_FN
-
-int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
-{
- int32_t dest;
-
- dest = (src1 >> 1) + (src2 >> 1);
- if (src1 & src2 & 1)
- dest++;
- return dest;
-}
-
-uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
-{
- uint32_t dest;
-
- dest = (src1 >> 1) + (src2 >> 1);
- if (src1 & src2 & 1)
- dest++;
- return dest;
-}
-
-#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
-NEON_VOP(rhadd_s8, neon_s8, 4)
-NEON_VOP(rhadd_u8, neon_u8, 4)
-NEON_VOP(rhadd_s16, neon_s16, 2)
-NEON_VOP(rhadd_u16, neon_u16, 2)
-#undef NEON_FN
-
-int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
-{
- int32_t dest;
-
- dest = (src1 >> 1) + (src2 >> 1);
- if ((src1 | src2) & 1)
- dest++;
- return dest;
-}
-
-uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
-{
- uint32_t dest;
-
- dest = (src1 >> 1) + (src2 >> 1);
- if ((src1 | src2) & 1)
- dest++;
- return dest;
-}
-
-#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
-NEON_VOP(hsub_s8, neon_s8, 4)
-NEON_VOP(hsub_u8, neon_u8, 4)
-NEON_VOP(hsub_s16, neon_s16, 2)
-NEON_VOP(hsub_u16, neon_u16, 2)
-#undef NEON_FN
-
-int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
-{
- int32_t dest;
-
- dest = (src1 >> 1) - (src2 >> 1);
- if ((~src1) & src2 & 1)
- dest--;
- return dest;
-}
-
-uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
-{
- uint32_t dest;
-
- dest = (src1 >> 1) - (src2 >> 1);
- if ((~src1) & src2 & 1)
- dest--;
- return dest;
-}
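A standalone check of the halving-add identities behind these helpers (plain C; function names are illustrative): (a >> 1) + (b >> 1) plus a one-bit carry term equals (a + b) >> 1, or (a + b + 1) >> 1 for the rounding variant, without the intermediate sum ever overflowing.

#include <stdint.h>
#include <stdio.h>

static uint32_t hadd_u32(uint32_t a, uint32_t b)
{
    return (a >> 1) + (b >> 1) + (a & b & 1);     /* == (a + b) >> 1     */
}

static uint32_t rhadd_u32(uint32_t a, uint32_t b)
{
    return (a >> 1) + (b >> 1) + ((a | b) & 1);   /* == (a + b + 1) >> 1 */
}

int main(void)
{
    /* 0xffffffff + 3 would overflow 32 bits, but the halved form is exact */
    printf("%u\n", hadd_u32(0xffffffffu, 3));     /* 2147483649    */
    printf("%u\n", hadd_u32(5, 6));               /* 5: truncated  */
    printf("%u\n", rhadd_u32(5, 6));              /* 6: rounded up */
    return 0;
}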
-
-#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
-NEON_POP(pmin_s8, neon_s8, 4)
-NEON_POP(pmin_u8, neon_u8, 4)
-NEON_POP(pmin_s16, neon_s16, 2)
-NEON_POP(pmin_u16, neon_u16, 2)
-#undef NEON_FN
-
-#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
-NEON_POP(pmax_s8, neon_s8, 4)
-NEON_POP(pmax_u8, neon_u8, 4)
-NEON_POP(pmax_s16, neon_s16, 2)
-NEON_POP(pmax_u16, neon_u16, 2)
-#undef NEON_FN
-
-#define NEON_FN(dest, src1, src2) \
- (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
-NEON_VOP(shl_u16, neon_u16, 2)
-#undef NEON_FN
-
-#define NEON_FN(dest, src1, src2) \
- (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
-NEON_VOP(shl_s16, neon_s16, 2)
-#undef NEON_FN
-
-#define NEON_FN(dest, src1, src2) \
- (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
-NEON_VOP(rshl_s8, neon_s8, 4)
-#undef NEON_FN
-
-#define NEON_FN(dest, src1, src2) \
- (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
-NEON_VOP(rshl_s16, neon_s16, 2)
-#undef NEON_FN
-
-uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
-{
- return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
-}
-
-uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
-{
- return do_sqrshl_d(val, (int8_t)shift, true, NULL);
-}
-
-#define NEON_FN(dest, src1, src2) \
- (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
-NEON_VOP(rshl_u8, neon_u8, 4)
-#undef NEON_FN
-
-#define NEON_FN(dest, src1, src2) \
- (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
-NEON_VOP(rshl_u16, neon_u16, 2)
-#undef NEON_FN
-
-uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
-{
- return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
-}
-
-uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
-{
- return do_uqrshl_d(val, (int8_t)shift, true, NULL);
-}
-
-#define NEON_FN(dest, src1, src2) \
- (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
-NEON_VOP_ENV(qshl_u8, neon_u8, 4)
-#undef NEON_FN
-
-#define NEON_FN(dest, src1, src2) \
- (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
-NEON_VOP_ENV(qshl_u16, neon_u16, 2)
-#undef NEON_FN
-
-uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
-{
- return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
-}
-
-uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
-{
- return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
-}
-
-#define NEON_FN(dest, src1, src2) \
- (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
-NEON_VOP_ENV(qshl_s8, neon_s8, 4)
-#undef NEON_FN
-
-#define NEON_FN(dest, src1, src2) \
- (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
-NEON_VOP_ENV(qshl_s16, neon_s16, 2)
-#undef NEON_FN
-
-uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
-{
- return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
-}
-
-uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
-{
- return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
-}
-
-#define NEON_FN(dest, src1, src2) \
- (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
-NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
-#undef NEON_FN
-
-#define NEON_FN(dest, src1, src2) \
- (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
-NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
-#undef NEON_FN
-
-uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
-{
- return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
-}
-
-uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
-{
- return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
-}
-
-#define NEON_FN(dest, src1, src2) \
- (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
-NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
-#undef NEON_FN
-
-#define NEON_FN(dest, src1, src2) \
- (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
-NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
-#undef NEON_FN
-
-uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
-{
- return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
-}
-
-uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
-{
- return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
-}
-
-#define NEON_FN(dest, src1, src2) \
- (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
-NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
-#undef NEON_FN
-
-#define NEON_FN(dest, src1, src2) \
- (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
-NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
-#undef NEON_FN
-
-uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
-{
- return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
-}
-
-uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
-{
- return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
-}
-
-uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
-{
- uint32_t mask;
- mask = (a ^ b) & 0x80808080u;
- a &= ~0x80808080u;
- b &= ~0x80808080u;
- return (a + b) ^ mask;
-}
-
-uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
-{
- uint32_t mask;
- mask = (a ^ b) & 0x80008000u;
- a &= ~0x80008000u;
- b &= ~0x80008000u;
- return (a + b) ^ mask;
-}
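A standalone check of the lane-wise addition trick used by neon_add_u8 above (plain C; add_u8x4 is an illustrative name): clearing bit 7 of every byte stops the per-lane sums from carrying into the neighbouring lane, and XORing back (a ^ b) & 0x80808080 restores the top bit of each byte.

#include <stdint.h>
#include <stdio.h>

static uint32_t add_u8x4(uint32_t a, uint32_t b)
{
    uint32_t mask = (a ^ b) & 0x80808080u;                   /* carry-less top bits */
    return ((a & ~0x80808080u) + (b & ~0x80808080u)) ^ mask;
}

int main(void)
{
    /* per-byte sums: 0xff+0x01=0x00, 0x7f+0x01=0x80, 0x80+0x80=0x00, 0x10+0x20=0x30 */
    printf("0x%08x\n", add_u8x4(0xff7f8010u, 0x01018020u));   /* 0x00800030 */
    return 0;
}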
-
-#define NEON_FN(dest, src1, src2) dest = src1 + src2
-NEON_POP(padd_u8, neon_u8, 4)
-NEON_POP(padd_u16, neon_u16, 2)
-#undef NEON_FN
-
-#define NEON_FN(dest, src1, src2) dest = src1 - src2
-NEON_VOP(sub_u8, neon_u8, 4)
-NEON_VOP(sub_u16, neon_u16, 2)
-#undef NEON_FN
-
-#define NEON_FN(dest, src1, src2) dest = src1 * src2
-NEON_VOP(mul_u8, neon_u8, 4)
-NEON_VOP(mul_u16, neon_u16, 2)
-#undef NEON_FN
-
-#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
-NEON_VOP(tst_u8, neon_u8, 4)
-NEON_VOP(tst_u16, neon_u16, 2)
-NEON_VOP(tst_u32, neon_u32, 1)
-#undef NEON_FN
-
-/* Count Leading Sign/Zero Bits. */
-static inline int do_clz8(uint8_t x)
-{
- int n;
- for (n = 8; x; n--)
- x >>= 1;
- return n;
-}
-
-static inline int do_clz16(uint16_t x)
-{
- int n;
- for (n = 16; x; n--)
- x >>= 1;
- return n;
-}
-
-#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
-NEON_VOP1(clz_u8, neon_u8, 4)
-#undef NEON_FN
-
-#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
-NEON_VOP1(clz_u16, neon_u16, 2)
-#undef NEON_FN
-
-#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
-NEON_VOP1(cls_s8, neon_s8, 4)
-#undef NEON_FN
-
-#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
-NEON_VOP1(cls_s16, neon_s16, 2)
-#undef NEON_FN
-
-uint32_t HELPER(neon_cls_s32)(uint32_t x)
-{
- int count;
- if ((int32_t)x < 0)
- x = ~x;
- for (count = 32; x; count--)
- x = x >> 1;
- return count - 1;
-}
-
-/* Bit count. */
-uint32_t HELPER(neon_cnt_u8)(uint32_t x)
-{
- x = (x & 0x55555555) + ((x >> 1) & 0x55555555);
- x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
- x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f);
- return x;
-}
-
-/* Reverse bits in each 8 bit word */
-uint32_t HELPER(neon_rbit_u8)(uint32_t x)
-{
- x = ((x & 0xf0f0f0f0) >> 4)
- | ((x & 0x0f0f0f0f) << 4);
- x = ((x & 0x88888888) >> 3)
- | ((x & 0x44444444) >> 1)
- | ((x & 0x22222222) << 1)
- | ((x & 0x11111111) << 3);
- return x;
-}
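A standalone check of the per-byte population count and bit reversal above (plain C; function names are illustrative): the popcount accumulates in fields that never cross a byte boundary, and the reversal first swaps the two nibbles of each byte and then reverses the bit order inside each nibble.

#include <stdint.h>
#include <stdio.h>

static uint32_t cnt_u8x4(uint32_t x)
{
    x = (x & 0x55555555u) + ((x >> 1) & 0x55555555u);   /* 2-bit partial sums */
    x = (x & 0x33333333u) + ((x >> 2) & 0x33333333u);   /* 4-bit partial sums */
    x = (x & 0x0f0f0f0fu) + ((x >> 4) & 0x0f0f0f0fu);   /* one count per byte */
    return x;
}

static uint32_t rbit_u8x4(uint32_t x)
{
    x = ((x & 0xf0f0f0f0u) >> 4) | ((x & 0x0f0f0f0fu) << 4);
    x = ((x & 0x88888888u) >> 3) | ((x & 0x44444444u) >> 1)
      | ((x & 0x22222222u) << 1) | ((x & 0x11111111u) << 3);
    return x;
}

int main(void)
{
    printf("0x%08x\n", cnt_u8x4(0xff0f0301u));    /* 0x08040201 */
    printf("0x%08x\n", rbit_u8x4(0x01020380u));   /* 0x8040c001 */
    return 0;
}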
-
-#define NEON_QDMULH16(dest, src1, src2, round) do { \
- uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
- if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
- SET_QC(); \
- tmp = (tmp >> 31) ^ ~SIGNBIT; \
- } else { \
- tmp <<= 1; \
- } \
- if (round) { \
- int32_t old = tmp; \
- tmp += 1 << 15; \
- if ((int32_t)tmp < old) { \
- SET_QC(); \
- tmp = SIGNBIT - 1; \
- } \
- } \
- dest = tmp >> 16; \
- } while(0)
-#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
-NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
-#undef NEON_FN
-#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
-NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
-#undef NEON_FN
-#undef NEON_QDMULH16
-
-#define NEON_QDMULH32(dest, src1, src2, round) do { \
- uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
- if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
- SET_QC(); \
- tmp = (tmp >> 63) ^ ~SIGNBIT64; \
- } else { \
- tmp <<= 1; \
- } \
- if (round) { \
- int64_t old = tmp; \
- tmp += (int64_t)1 << 31; \
- if ((int64_t)tmp < old) { \
- SET_QC(); \
- tmp = SIGNBIT64 - 1; \
- } \
- } \
- dest = tmp >> 32; \
- } while(0)
-#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
-NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
-#undef NEON_FN
-#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
-NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
-#undef NEON_FN
-#undef NEON_QDMULH32
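A standalone sketch of one 16-bit lane of the saturating doubling multiply returning the high half (plain C; qdmulh_s16 is an illustrative name and QC is omitted). Doubling the product only overflows for (-32768) * (-32768), which is exactly what the (tmp ^ (tmp << 1)) sign-bit test catches; the rounding variant can additionally overflow when the rounding constant is added.

#include <stdint.h>
#include <stdio.h>

#define SIGNBIT 0x80000000u

static int16_t qdmulh_s16(int16_t a, int16_t b, int round)
{
    uint32_t tmp = (uint32_t)((int32_t)a * b);
    if ((tmp ^ (tmp << 1)) & SIGNBIT) {
        tmp = (tmp >> 31) ^ ~SIGNBIT;       /* doubling would overflow: saturate */
    } else {
        tmp <<= 1;                          /* double the product */
    }
    if (round) {
        uint32_t old = tmp;
        tmp += 1u << 15;
        if ((int32_t)tmp < (int32_t)old) {
            tmp = SIGNBIT - 1;              /* the rounding add itself overflowed */
        }
    }
    return (int16_t)(tmp >> 16);
}

int main(void)
{
    printf("%d\n", qdmulh_s16(-32768, -32768, 0));   /* 32767: saturated     */
    printf("%d\n", qdmulh_s16(16384, 16384, 0));     /* 8192: 2 * 2^28 >> 16 */
    printf("%d\n", qdmulh_s16(16385, 16384, 1));     /* 8193: rounded up     */
    return 0;
}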
-
-uint32_t HELPER(neon_narrow_u8)(uint64_t x)
-{
- return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
- | ((x >> 24) & 0xff000000u);
-}
-
-uint32_t HELPER(neon_narrow_u16)(uint64_t x)
-{
- return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
-}
-
-uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
-{
- return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
- | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
-}
-
-uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
-{
- return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
-}
-
-uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
-{
- x &= 0xff80ff80ff80ff80ull;
- x += 0x0080008000800080ull;
- return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
- | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
-}
-
-uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
-{
- x &= 0xffff8000ffff8000ull;
- x += 0x0000800000008000ull;
- return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
-}
-
-uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
-{
- uint16_t s;
- uint8_t d;
- uint32_t res = 0;
-#define SAT8(n) \
- s = x >> n; \
- if (s & 0x8000) { \
- SET_QC(); \
- } else { \
- if (s > 0xff) { \
- d = 0xff; \
- SET_QC(); \
- } else { \
- d = s; \
- } \
- res |= (uint32_t)d << (n / 2); \
- }
-
- SAT8(0);
- SAT8(16);
- SAT8(32);
- SAT8(48);
-#undef SAT8
- return res;
-}
-
-uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
-{
- uint16_t s;
- uint8_t d;
- uint32_t res = 0;
-#define SAT8(n) \
- s = x >> n; \
- if (s > 0xff) { \
- d = 0xff; \
- SET_QC(); \
- } else { \
- d = s; \
- } \
- res |= (uint32_t)d << (n / 2);
-
- SAT8(0);
- SAT8(16);
- SAT8(32);
- SAT8(48);
-#undef SAT8
- return res;
-}
-
-uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
-{
- int16_t s;
- uint8_t d;
- uint32_t res = 0;
-#define SAT8(n) \
- s = x >> n; \
- if (s != (int8_t)s) { \
- d = (s >> 15) ^ 0x7f; \
- SET_QC(); \
- } else { \
- d = s; \
- } \
- res |= (uint32_t)d << (n / 2);
-
- SAT8(0);
- SAT8(16);
- SAT8(32);
- SAT8(48);
-#undef SAT8
- return res;
-}
-
-uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
-{
- uint32_t high;
- uint32_t low;
- low = x;
- if (low & 0x80000000) {
- low = 0;
- SET_QC();
- } else if (low > 0xffff) {
- low = 0xffff;
- SET_QC();
- }
- high = x >> 32;
- if (high & 0x80000000) {
- high = 0;
- SET_QC();
- } else if (high > 0xffff) {
- high = 0xffff;
- SET_QC();
- }
- return low | (high << 16);
-}
-
-uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
-{
- uint32_t high;
- uint32_t low;
- low = x;
- if (low > 0xffff) {
- low = 0xffff;
- SET_QC();
- }
- high = x >> 32;
- if (high > 0xffff) {
- high = 0xffff;
- SET_QC();
- }
- return low | (high << 16);
-}
-
-uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
-{
- int32_t low;
- int32_t high;
- low = x;
- if (low != (int16_t)low) {
- low = (low >> 31) ^ 0x7fff;
- SET_QC();
- }
- high = x >> 32;
- if (high != (int16_t)high) {
- high = (high >> 31) ^ 0x7fff;
- SET_QC();
- }
- return (uint16_t)low | (high << 16);
-}
-
-uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
-{
- if (x & 0x8000000000000000ull) {
- SET_QC();
- return 0;
- }
- if (x > 0xffffffffu) {
- SET_QC();
- return 0xffffffffu;
- }
- return x;
-}
-
-uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
-{
- if (x > 0xffffffffu) {
- SET_QC();
- return 0xffffffffu;
- }
- return x;
-}
-
-uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
-{
- if ((int64_t)x != (int32_t)x) {
- SET_QC();
- return ((int64_t)x >> 63) ^ 0x7fffffff;
- }
- return x;
-}
-
-uint64_t HELPER(neon_widen_u8)(uint32_t x)
-{
- uint64_t tmp;
- uint64_t ret;
- ret = (uint8_t)x;
- tmp = (uint8_t)(x >> 8);
- ret |= tmp << 16;
- tmp = (uint8_t)(x >> 16);
- ret |= tmp << 32;
- tmp = (uint8_t)(x >> 24);
- ret |= tmp << 48;
- return ret;
-}
-
-uint64_t HELPER(neon_widen_s8)(uint32_t x)
-{
- uint64_t tmp;
- uint64_t ret;
- ret = (uint16_t)(int8_t)x;
- tmp = (uint16_t)(int8_t)(x >> 8);
- ret |= tmp << 16;
- tmp = (uint16_t)(int8_t)(x >> 16);
- ret |= tmp << 32;
- tmp = (uint16_t)(int8_t)(x >> 24);
- ret |= tmp << 48;
- return ret;
-}
-
-uint64_t HELPER(neon_widen_u16)(uint32_t x)
-{
- uint64_t high = (uint16_t)(x >> 16);
- return ((uint16_t)x) | (high << 32);
-}
-
-uint64_t HELPER(neon_widen_s16)(uint32_t x)
-{
- uint64_t high = (int16_t)(x >> 16);
- return ((uint32_t)(int16_t)x) | (high << 32);
-}
-
-uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
-{
- uint64_t mask;
- mask = (a ^ b) & 0x8000800080008000ull;
- a &= ~0x8000800080008000ull;
- b &= ~0x8000800080008000ull;
- return (a + b) ^ mask;
-}
-
-uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
-{
- uint64_t mask;
- mask = (a ^ b) & 0x8000000080000000ull;
- a &= ~0x8000000080000000ull;
- b &= ~0x8000000080000000ull;
- return (a + b) ^ mask;
-}
-
-uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
-{
- uint64_t tmp;
- uint64_t tmp2;
-
- tmp = a & 0x0000ffff0000ffffull;
- tmp += (a >> 16) & 0x0000ffff0000ffffull;
- tmp2 = b & 0xffff0000ffff0000ull;
- tmp2 += (b << 16) & 0xffff0000ffff0000ull;
- return ( tmp & 0xffff)
- | ((tmp >> 16) & 0xffff0000ull)
- | ((tmp2 << 16) & 0xffff00000000ull)
- | ( tmp2 & 0xffff000000000000ull);
-}
-
-uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
-{
- uint32_t low = a + (a >> 32);
- uint32_t high = b + (b >> 32);
- return low + ((uint64_t)high << 32);
-}
-
-uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
-{
- uint64_t mask;
- mask = (a ^ ~b) & 0x8000800080008000ull;
- a |= 0x8000800080008000ull;
- b &= ~0x8000800080008000ull;
- return (a - b) ^ mask;
-}
-
-uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
-{
- uint64_t mask;
- mask = (a ^ ~b) & 0x8000000080000000ull;
- a |= 0x8000000080000000ull;
- b &= ~0x8000000080000000ull;
- return (a - b) ^ mask;
-}
-
-uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
-{
- uint32_t x, y;
- uint32_t low, high;
-
- x = a;
- y = b;
- low = x + y;
- if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
- SET_QC();
- low = ((int32_t)x >> 31) ^ ~SIGNBIT;
- }
- x = a >> 32;
- y = b >> 32;
- high = x + y;
- if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
- SET_QC();
- high = ((int32_t)x >> 31) ^ ~SIGNBIT;
- }
- return low | ((uint64_t)high << 32);
-}
-
-uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
-{
- uint64_t result;
-
- result = a + b;
- if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
- SET_QC();
- result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
- }
- return result;
-}
-
-/* We have to do the arithmetic in a larger type than
- * the input type, because for example with a signed 32 bit
- * op the absolute difference can overflow a signed 32 bit value.
- */
-#define DO_ABD(dest, x, y, intype, arithtype) do { \
- arithtype tmp_x = (intype)(x); \
- arithtype tmp_y = (intype)(y); \
- dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
- } while(0)
-
-uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
-{
- uint64_t tmp;
- uint64_t result;
- DO_ABD(result, a, b, uint8_t, uint32_t);
- DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
- result |= tmp << 16;
- DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
- result |= tmp << 32;
- DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
- result |= tmp << 48;
- return result;
-}
-
-uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
-{
- uint64_t tmp;
- uint64_t result;
- DO_ABD(result, a, b, int8_t, int32_t);
- DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
- result |= tmp << 16;
- DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
- result |= tmp << 32;
- DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
- result |= tmp << 48;
- return result;
-}
-
-uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
-{
- uint64_t tmp;
- uint64_t result;
- DO_ABD(result, a, b, uint16_t, uint32_t);
- DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
- return result | (tmp << 32);
-}
-
-uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
-{
- uint64_t tmp;
- uint64_t result;
- DO_ABD(result, a, b, int16_t, int32_t);
- DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
- return result | (tmp << 32);
-}
-
-uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
-{
- uint64_t result;
- DO_ABD(result, a, b, uint32_t, uint64_t);
- return result;
-}
-
-uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
-{
- uint64_t result;
- DO_ABD(result, a, b, int32_t, int64_t);
- return result;
-}
-#undef DO_ABD
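A standalone illustration of why these absolute-difference helpers widen before subtracting (plain C; abd_s32 is an illustrative name): for 32-bit signed inputs the difference can need 33 bits, so it must be formed in a 64-bit type.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t abd_s32(int32_t x, int32_t y)
{
    int64_t tx = x, ty = y;                          /* widen before subtracting */
    return (uint64_t)(tx > ty ? tx - ty : ty - tx);
}

int main(void)
{
    /* 2147483647 - (-2147483648) = 4294967295, which does not fit in int32_t */
    printf("%" PRIu64 "\n", abd_s32(INT32_MAX, INT32_MIN));   /* 4294967295 */
    return 0;
}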
-
-/* Widening multiply. Named type is the source type. */
-#define DO_MULL(dest, x, y, type1, type2) do { \
- type1 tmp_x = x; \
- type1 tmp_y = y; \
- dest = (type2)((type2)tmp_x * (type2)tmp_y); \
- } while(0)
-
-uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
-{
- uint64_t tmp;
- uint64_t result;
-
- DO_MULL(result, a, b, uint8_t, uint16_t);
- DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
- result |= tmp << 16;
- DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
- result |= tmp << 32;
- DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
- result |= tmp << 48;
- return result;
-}
-
-uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
-{
- uint64_t tmp;
- uint64_t result;
-
- DO_MULL(result, a, b, int8_t, uint16_t);
- DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
- result |= tmp << 16;
- DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
- result |= tmp << 32;
- DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
- result |= tmp << 48;
- return result;
-}
-
-uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
-{
- uint64_t tmp;
- uint64_t result;
-
- DO_MULL(result, a, b, uint16_t, uint32_t);
- DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
- return result | (tmp << 32);
-}
-
-uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
-{
- uint64_t tmp;
- uint64_t result;
-
- DO_MULL(result, a, b, int16_t, uint32_t);
- DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
- return result | (tmp << 32);
-}
-
-uint64_t HELPER(neon_negl_u16)(uint64_t x)
-{
- uint16_t tmp;
- uint64_t result;
- result = (uint16_t)-x;
- tmp = -(x >> 16);
- result |= (uint64_t)tmp << 16;
- tmp = -(x >> 32);
- result |= (uint64_t)tmp << 32;
- tmp = -(x >> 48);
- result |= (uint64_t)tmp << 48;
- return result;
-}
-
-uint64_t HELPER(neon_negl_u32)(uint64_t x)
-{
- uint32_t low = -x;
- uint32_t high = -(x >> 32);
- return low | ((uint64_t)high << 32);
-}
-
-/* Saturating sign manipulation. */
-/* ??? Make these use NEON_VOP1 */
-#define DO_QABS8(x) do { \
- if (x == (int8_t)0x80) { \
- x = 0x7f; \
- SET_QC(); \
- } else if (x < 0) { \
- x = -x; \
- }} while (0)
-uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
-{
- neon_s8 vec;
- NEON_UNPACK(neon_s8, vec, x);
- DO_QABS8(vec.v1);
- DO_QABS8(vec.v2);
- DO_QABS8(vec.v3);
- DO_QABS8(vec.v4);
- NEON_PACK(neon_s8, x, vec);
- return x;
-}
-#undef DO_QABS8
-
-#define DO_QNEG8(x) do { \
- if (x == (int8_t)0x80) { \
- x = 0x7f; \
- SET_QC(); \
- } else { \
- x = -x; \
- }} while (0)
-uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
-{
- neon_s8 vec;
- NEON_UNPACK(neon_s8, vec, x);
- DO_QNEG8(vec.v1);
- DO_QNEG8(vec.v2);
- DO_QNEG8(vec.v3);
- DO_QNEG8(vec.v4);
- NEON_PACK(neon_s8, x, vec);
- return x;
-}
-#undef DO_QNEG8
-
-#define DO_QABS16(x) do { \
- if (x == (int16_t)0x8000) { \
- x = 0x7fff; \
- SET_QC(); \
- } else if (x < 0) { \
- x = -x; \
- }} while (0)
-uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
-{
- neon_s16 vec;
- NEON_UNPACK(neon_s16, vec, x);
- DO_QABS16(vec.v1);
- DO_QABS16(vec.v2);
- NEON_PACK(neon_s16, x, vec);
- return x;
-}
-#undef DO_QABS16
-
-#define DO_QNEG16(x) do { \
- if (x == (int16_t)0x8000) { \
- x = 0x7fff; \
- SET_QC(); \
- } else { \
- x = -x; \
- }} while (0)
-uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
-{
- neon_s16 vec;
- NEON_UNPACK(neon_s16, vec, x);
- DO_QNEG16(vec.v1);
- DO_QNEG16(vec.v2);
- NEON_PACK(neon_s16, x, vec);
- return x;
-}
-#undef DO_QNEG16
-
-uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
-{
- if (x == SIGNBIT) {
- SET_QC();
- x = ~SIGNBIT;
- } else if ((int32_t)x < 0) {
- x = -x;
- }
- return x;
-}
-
-uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
-{
- if (x == SIGNBIT) {
- SET_QC();
- x = ~SIGNBIT;
- } else {
- x = -x;
- }
- return x;
-}
-
-uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x)
-{
- if (x == SIGNBIT64) {
- SET_QC();
- x = ~SIGNBIT64;
- } else if ((int64_t)x < 0) {
- x = -x;
- }
- return x;
-}
-
-uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x)
-{
- if (x == SIGNBIT64) {
- SET_QC();
- x = ~SIGNBIT64;
- } else {
- x = -x;
- }
- return x;
-}
-
-/* NEON Float helpers. */
-
-/* Floating point comparisons produce an integer result.
- * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
- * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
- */
-uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp)
-{
- float_status *fpst = fpstp;
- return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
-}
-
-uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp)
-{
- float_status *fpst = fpstp;
- return -float32_le(make_float32(b), make_float32(a), fpst);
-}
-
-uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp)
-{
- float_status *fpst = fpstp;
- return -float32_lt(make_float32(b), make_float32(a), fpst);
-}
-
-uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp)
-{
- float_status *fpst = fpstp;
- float32 f0 = float32_abs(make_float32(a));
- float32 f1 = float32_abs(make_float32(b));
- return -float32_le(f1, f0, fpst);
-}
-
-uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp)
-{
- float_status *fpst = fpstp;
- float32 f0 = float32_abs(make_float32(a));
- float32 f1 = float32_abs(make_float32(b));
- return -float32_lt(f1, f0, fpst);
-}
-
-uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp)
-{
- float_status *fpst = fpstp;
- float64 f0 = float64_abs(make_float64(a));
- float64 f1 = float64_abs(make_float64(b));
- return -float64_le(f1, f0, fpst);
-}
-
-uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp)
-{
- float_status *fpst = fpstp;
- float64 f0 = float64_abs(make_float64(a));
- float64 f1 = float64_abs(make_float64(b));
- return -float64_lt(f1, f0, fpst);
-}
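A standalone note on the 0/-1 convention mentioned above (plain C): negating a 0/1 predicate result yields the all-zeroes or all-ones element value NEON expects.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t t = -(uint32_t)1;    /* predicate true  -> 0xffffffff */
    uint32_t f = -(uint32_t)0;    /* predicate false -> 0x00000000 */
    printf("0x%08x 0x%08x\n", t, f);
    return 0;
}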
-
-#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))
-
-void HELPER(neon_qunzip8)(void *vd, void *vm)
-{
- uint64_t *rd = vd, *rm = vm;
- uint64_t zd0 = rd[0], zd1 = rd[1];
- uint64_t zm0 = rm[0], zm1 = rm[1];
-
- uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
- | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
- | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
- | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
- uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
- | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
- | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
- | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
- uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
- | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
- | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
- | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
- uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
- | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
- | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
- | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
-
- rm[0] = m0;
- rm[1] = m1;
- rd[0] = d0;
- rd[1] = d1;
-}
-
-void HELPER(neon_qunzip16)(void *vd, void *vm)
-{
- uint64_t *rd = vd, *rm = vm;
- uint64_t zd0 = rd[0], zd1 = rd[1];
- uint64_t zm0 = rm[0], zm1 = rm[1];
-
- uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
- | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
- uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
- | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
- uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
- | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
- uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
- | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
-
- rm[0] = m0;
- rm[1] = m1;
- rd[0] = d0;
- rd[1] = d1;
-}
-
-void HELPER(neon_qunzip32)(void *vd, void *vm)
-{
- uint64_t *rd = vd, *rm = vm;
- uint64_t zd0 = rd[0], zd1 = rd[1];
- uint64_t zm0 = rm[0], zm1 = rm[1];
-
- uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
- uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
- uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
- uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);
-
- rm[0] = m0;
- rm[1] = m1;
- rd[0] = d0;
- rd[1] = d1;
-}
-
-void HELPER(neon_unzip8)(void *vd, void *vm)
-{
- uint64_t *rd = vd, *rm = vm;
- uint64_t zd = rd[0], zm = rm[0];
-
- uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
- | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
- | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
- | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
- uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
- | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
- | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
- | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);
-
- rm[0] = m0;
- rd[0] = d0;
-}
-
-void HELPER(neon_unzip16)(void *vd, void *vm)
-{
- uint64_t *rd = vd, *rm = vm;
- uint64_t zd = rd[0], zm = rm[0];
-
- uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
- | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
- uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
- | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);
-
- rm[0] = m0;
- rd[0] = d0;
-}
-
-void HELPER(neon_qzip8)(void *vd, void *vm)
-{
- uint64_t *rd = vd, *rm = vm;
- uint64_t zd0 = rd[0], zd1 = rd[1];
- uint64_t zm0 = rm[0], zm1 = rm[1];
-
- uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
- | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
- | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
- | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
- uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
- | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
- | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
- | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
- uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
- | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
- | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
- | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
- uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
- | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
- | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
- | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
-
- rm[0] = m0;
- rm[1] = m1;
- rd[0] = d0;
- rd[1] = d1;
-}
-
-void HELPER(neon_qzip16)(void *vd, void *vm)
-{
- uint64_t *rd = vd, *rm = vm;
- uint64_t zd0 = rd[0], zd1 = rd[1];
- uint64_t zm0 = rm[0], zm1 = rm[1];
-
- uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
- | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
- uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
- | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
- uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
- | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
- uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
- | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
-
- rm[0] = m0;
- rm[1] = m1;
- rd[0] = d0;
- rd[1] = d1;
-}
-
-void HELPER(neon_qzip32)(void *vd, void *vm)
-{
- uint64_t *rd = vd, *rm = vm;
- uint64_t zd0 = rd[0], zd1 = rd[1];
- uint64_t zm0 = rm[0], zm1 = rm[1];
-
- uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
- uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
- uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
- uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);
-
- rm[0] = m0;
- rm[1] = m1;
- rd[0] = d0;
- rd[1] = d1;
-}
-
-void HELPER(neon_zip8)(void *vd, void *vm)
-{
- uint64_t *rd = vd, *rm = vm;
- uint64_t zd = rd[0], zm = rm[0];
-
- uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8)
- | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24)
- | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40)
- | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56);
- uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8)
- | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24)
- | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40)
- | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56);
-
- rm[0] = m0;
- rd[0] = d0;
-}
-
-void HELPER(neon_zip16)(void *vd, void *vm)
-{
- uint64_t *rd = vd, *rm = vm;
- uint64_t zd = rd[0], zm = rm[0];
-
- uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
- | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);
- uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16)
- | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48);
-
- rm[0] = m0;
- rd[0] = d0;
-}
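A standalone sketch of the ELEM-based interleave these zip helpers perform (plain C; the element values are arbitrary), shown for the low half of a 16-bit zip of two 64-bit inputs.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))

int main(void)
{
    uint64_t zd = 0x0004000300020001ull;   /* elements 1, 2, 3, 4 */
    uint64_t zm = 0x000d000c000b000aull;   /* elements a, b, c, d */

    /* interleave the low two elements of zd and zm, as neon_zip16 does */
    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
                | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);

    printf("0x%016" PRIx64 "\n", d0);      /* 0x000b0002000a0001 */
    return 0;
}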
+++ /dev/null
-/*
- * ARM helper routines
- *
- * Copyright (c) 2005-2007 CodeSourcery, LLC
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, see <http://www.gnu.org/licenses/>.
- */
-#include "qemu/osdep.h"
-#include "qemu/main-loop.h"
-#include "cpu.h"
-#include "exec/helper-proto.h"
-#include "internals.h"
-#include "exec/exec-all.h"
-#include "exec/cpu_ldst.h"
-#include "cpregs.h"
-
-#define SIGNBIT (uint32_t)0x80000000
-#define SIGNBIT64 ((uint64_t)1 << 63)
-
-int exception_target_el(CPUARMState *env)
-{
- int target_el = MAX(1, arm_current_el(env));
-
- /*
- * No such thing as secure EL1 if EL3 is aarch32,
- * so update the target EL to EL3 in this case.
- */
- if (arm_is_secure(env) && !arm_el_is_aa64(env, 3) && target_el == 1) {
- target_el = 3;
- }
-
- return target_el;
-}
-
-void raise_exception(CPUARMState *env, uint32_t excp,
- uint32_t syndrome, uint32_t target_el)
-{
- CPUState *cs = env_cpu(env);
-
- if (target_el == 1 && (arm_hcr_el2_eff(env) & HCR_TGE)) {
- /*
- * Redirect NS EL1 exceptions to NS EL2. These are reported with
- * their original syndrome register value, with the exception of
- * SIMD/FP access traps, which are reported as uncategorized
- * (see DDI0478C.a D1.10.4)
- */
- target_el = 2;
- if (syn_get_ec(syndrome) == EC_ADVSIMDFPACCESSTRAP) {
- syndrome = syn_uncategorized();
- }
- }
-
- assert(!excp_is_internal(excp));
- cs->exception_index = excp;
- env->exception.syndrome = syndrome;
- env->exception.target_el = target_el;
- cpu_loop_exit(cs);
-}
-
-void raise_exception_ra(CPUARMState *env, uint32_t excp, uint32_t syndrome,
- uint32_t target_el, uintptr_t ra)
-{
- CPUState *cs = env_cpu(env);
-
- /*
- * restore_state_to_opc() will set env->exception.syndrome, so
- * we must restore CPU state here before setting the syndrome
- * the caller passed us, and cannot use cpu_loop_exit_restore().
- */
- cpu_restore_state(cs, ra);
- raise_exception(env, excp, syndrome, target_el);
-}
-
-uint64_t HELPER(neon_tbl)(CPUARMState *env, uint32_t desc,
- uint64_t ireg, uint64_t def)
-{
- uint64_t tmp, val = 0;
- uint32_t maxindex = ((desc & 3) + 1) * 8;
- uint32_t base_reg = desc >> 2;
- uint32_t shift, index, reg;
-
- for (shift = 0; shift < 64; shift += 8) {
- index = (ireg >> shift) & 0xff;
- if (index < maxindex) {
- reg = base_reg + (index >> 3);
- tmp = *aa32_vfp_dreg(env, reg);
- tmp = ((tmp >> ((index & 7) << 3)) & 0xff) << shift;
- } else {
- tmp = def & (0xffull << shift);
- }
- val |= tmp;
- }
- return val;
-}
-
-void HELPER(v8m_stackcheck)(CPUARMState *env, uint32_t newvalue)
-{
- /*
- * Perform the v8M stack limit check for SP updates from translated code,
- * raising an exception if the limit is breached.
- */
- if (newvalue < v7m_sp_limit(env)) {
- /*
- * Stack limit exceptions are a rare case, so rather than syncing
- * PC/condbits before the call, we use raise_exception_ra() so
- * that cpu_restore_state() will sort them out.
- */
- raise_exception_ra(env, EXCP_STKOF, 0, 1, GETPC());
- }
-}
-
-uint32_t HELPER(add_setq)(CPUARMState *env, uint32_t a, uint32_t b)
-{
- uint32_t res = a + b;
- if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT))
- env->QF = 1;
- return res;
-}
-
-uint32_t HELPER(add_saturate)(CPUARMState *env, uint32_t a, uint32_t b)
-{
- uint32_t res = a + b;
- if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) {
- env->QF = 1;
- res = ~(((int32_t)a >> 31) ^ SIGNBIT);
- }
- return res;
-}
-
-uint32_t HELPER(sub_saturate)(CPUARMState *env, uint32_t a, uint32_t b)
-{
- uint32_t res = a - b;
- if (((res ^ a) & SIGNBIT) && ((a ^ b) & SIGNBIT)) {
- env->QF = 1;
- res = ~(((int32_t)a >> 31) ^ SIGNBIT);
- }
- return res;
-}
-
-uint32_t HELPER(add_usaturate)(CPUARMState *env, uint32_t a, uint32_t b)
-{
- uint32_t res = a + b;
- if (res < a) {
- env->QF = 1;
- res = ~0;
- }
- return res;
-}
-
-uint32_t HELPER(sub_usaturate)(CPUARMState *env, uint32_t a, uint32_t b)
-{
- uint32_t res = a - b;
- if (res > a) {
- env->QF = 1;
- res = 0;
- }
- return res;
-}
-
-/* Signed saturation. */
-static inline uint32_t do_ssat(CPUARMState *env, int32_t val, int shift)
-{
- int32_t top;
- uint32_t mask;
-
- top = val >> shift;
- mask = (1u << shift) - 1;
- if (top > 0) {
- env->QF = 1;
- return mask;
- } else if (top < -1) {
- env->QF = 1;
- return ~mask;
- }
- return val;
-}
-
-/* Unsigned saturation. */
-static inline uint32_t do_usat(CPUARMState *env, int32_t val, int shift)
-{
- uint32_t max;
-
- max = (1u << shift) - 1;
- if (val < 0) {
- env->QF = 1;
- return 0;
- } else if (val > max) {
- env->QF = 1;
- return max;
- }
- return val;
-}
-
-/* Signed saturate. */
-uint32_t HELPER(ssat)(CPUARMState *env, uint32_t x, uint32_t shift)
-{
- return do_ssat(env, x, shift);
-}
-
-/* Dual halfword signed saturate. */
-uint32_t HELPER(ssat16)(CPUARMState *env, uint32_t x, uint32_t shift)
-{
- uint32_t res;
-
- res = (uint16_t)do_ssat(env, (int16_t)x, shift);
- res |= do_ssat(env, ((int32_t)x) >> 16, shift) << 16;
- return res;
-}
-
-/* Unsigned saturate. */
-uint32_t HELPER(usat)(CPUARMState *env, uint32_t x, uint32_t shift)
-{
- return do_usat(env, x, shift);
-}
-
-/* Dual halfword unsigned saturate. */
-uint32_t HELPER(usat16)(CPUARMState *env, uint32_t x, uint32_t shift)
-{
- uint32_t res;
-
- res = (uint16_t)do_usat(env, (int16_t)x, shift);
- res |= do_usat(env, ((int32_t)x) >> 16, shift) << 16;
- return res;
-}
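A standalone sketch of the saturation rules above (plain C; QF handling omitted): do_ssat with a given 'shift' clamps to the signed (shift+1)-bit range [-2^shift, 2^shift - 1], while do_usat clamps to the unsigned range [0, 2^shift - 1].

#include <stdint.h>
#include <stdio.h>

static int32_t ssat(int32_t val, int shift)     /* clamp to [-2^shift, 2^shift - 1] */
{
    int32_t top = val >> shift;
    int32_t mask = (1 << shift) - 1;
    if (top > 0) {
        return mask;
    } else if (top < -1) {
        return ~mask;
    }
    return val;
}

static uint32_t usat(int32_t val, int shift)    /* clamp to [0, 2^shift - 1] */
{
    uint32_t max = (1u << shift) - 1;
    if (val < 0) {
        return 0;
    } else if ((uint32_t)val > max) {
        return max;
    }
    return val;
}

int main(void)
{
    printf("%d %d\n", ssat(300, 7), ssat(-300, 7));   /* 127 -128 (8-bit SSAT) */
    printf("%u %u\n", usat(300, 8), usat(-5, 8));     /* 255 0    (8-bit USAT) */
    return 0;
}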
-
-void HELPER(setend)(CPUARMState *env)
-{
- env->uncached_cpsr ^= CPSR_E;
- arm_rebuild_hflags(env);
-}
-
-void HELPER(check_bxj_trap)(CPUARMState *env, uint32_t rm)
-{
- /*
- * Only called if in NS EL0 or EL1 for a BXJ for a v7A CPU;
- * check if HSTR.TJDBX means we need to trap to EL2.
- */
- if (env->cp15.hstr_el2 & HSTR_TJDBX) {
- /*
- * We know the condition code check passed, so take the IMPDEF
- * choice to always report CV=1 COND 0xe
- */
- uint32_t syn = syn_bxjtrap(1, 0xe, rm);
- raise_exception_ra(env, EXCP_HYP_TRAP, syn, 2, GETPC());
- }
-}
-
-#ifndef CONFIG_USER_ONLY
-/* Function checks whether WFx (WFI/WFE) instructions are set up to be trapped.
- * The function returns the target EL (1-3) if the instruction is to be trapped;
- * otherwise it returns 0 indicating it is not trapped.
- */
-static inline int check_wfx_trap(CPUARMState *env, bool is_wfe)
-{
- int cur_el = arm_current_el(env);
- uint64_t mask;
-
- if (arm_feature(env, ARM_FEATURE_M)) {
- /* M profile cores can never trap WFI/WFE. */
- return 0;
- }
-
- /* If we are currently in EL0 then we need to check if SCTLR is set up for
- * WFx instructions being trapped to EL1. These trap bits don't exist in v7.
- */
- if (cur_el < 1 && arm_feature(env, ARM_FEATURE_V8)) {
- int target_el;
-
- mask = is_wfe ? SCTLR_nTWE : SCTLR_nTWI;
- if (arm_is_secure_below_el3(env) && !arm_el_is_aa64(env, 3)) {
- /* Secure EL0 and Secure PL1 are at EL3 */
- target_el = 3;
- } else {
- target_el = 1;
- }
-
- if (!(env->cp15.sctlr_el[target_el] & mask)) {
- return target_el;
- }
- }
-
- /* We are not trapping to EL1; trap to EL2 if HCR_EL2 requires it.
- * No need for an ARM_FEATURE check: if HCR_EL2 doesn't exist, the
- * bits will be zero, indicating no trap.
- */
- if (cur_el < 2) {
- mask = is_wfe ? HCR_TWE : HCR_TWI;
- if (arm_hcr_el2_eff(env) & mask) {
- return 2;
- }
- }
-
- /* We are not trapping to EL1 or EL2; trap to EL3 if SCR_EL3 requires it */
- if (cur_el < 3) {
- mask = (is_wfe) ? SCR_TWE : SCR_TWI;
- if (env->cp15.scr_el3 & mask) {
- return 3;
- }
- }
-
- return 0;
-}
-#endif
-
-void HELPER(wfi)(CPUARMState *env, uint32_t insn_len)
-{
-#ifdef CONFIG_USER_ONLY
- /*
- * WFI in the user-mode emulator is technically permitted but not
- * something any real-world code would do. AArch64 Linux kernels
- * trap it via SCTLR_EL1.nTWI and make it an (expensive) NOP;
- * AArch32 kernels don't trap it, so it will delay a bit.
- * For QEMU, make it a NOP here, because trying to raise EXCP_HLT
- * would trigger an abort.
- */
- return;
-#else
- CPUState *cs = env_cpu(env);
- int target_el = check_wfx_trap(env, false);
-
- if (cpu_has_work(cs)) {
- /* Don't bother to go into our "low power state" if
- * we would just wake up immediately.
- */
- return;
- }
-
- if (target_el) {
- if (env->aarch64) {
- env->pc -= insn_len;
- } else {
- env->regs[15] -= insn_len;
- }
-
- raise_exception(env, EXCP_UDEF, syn_wfx(1, 0xe, 0, insn_len == 2),
- target_el);
- }
-
- cs->exception_index = EXCP_HLT;
- cs->halted = 1;
- cpu_loop_exit(cs);
-#endif
-}
-
-void HELPER(wfe)(CPUARMState *env)
-{
- /* This is a hint instruction that is semantically different
- * from YIELD even though we currently implement it identically.
- * Don't actually halt the CPU, just yield back to the top-level
- * loop. This is not going into a "low power state"
- * (ie halting until some event occurs), so we never take
- * a configurable trap to a different exception level.
- */
- HELPER(yield)(env);
-}
-
-void HELPER(yield)(CPUARMState *env)
-{
- CPUState *cs = env_cpu(env);
-
- /* This is a non-trappable hint instruction that generally indicates
- * that the guest is currently busy-looping. Yield control back to the
- * top level loop so that a more deserving VCPU has a chance to run.
- */
- cs->exception_index = EXCP_YIELD;
- cpu_loop_exit(cs);
-}
-
-/* Raise an internal-to-QEMU exception. This is limited to only
- * those EXCP values which are special cases for QEMU to interrupt
- * execution and not to be used for exceptions which are passed to
- * the guest (those must all have syndrome information and thus should
- * use exception_with_syndrome*).
- */
-void HELPER(exception_internal)(CPUARMState *env, uint32_t excp)
-{
- CPUState *cs = env_cpu(env);
-
- assert(excp_is_internal(excp));
- cs->exception_index = excp;
- cpu_loop_exit(cs);
-}
-
-/* Raise an exception with the specified syndrome register value */
-void HELPER(exception_with_syndrome_el)(CPUARMState *env, uint32_t excp,
- uint32_t syndrome, uint32_t target_el)
-{
- raise_exception(env, excp, syndrome, target_el);
-}
-
-/*
- * Raise an exception with the specified syndrome register value
- * to the default target el.
- */
-void HELPER(exception_with_syndrome)(CPUARMState *env, uint32_t excp,
- uint32_t syndrome)
-{
- raise_exception(env, excp, syndrome, exception_target_el(env));
-}
-
-uint32_t HELPER(cpsr_read)(CPUARMState *env)
-{
- return cpsr_read(env) & ~CPSR_EXEC;
-}
-
-void HELPER(cpsr_write)(CPUARMState *env, uint32_t val, uint32_t mask)
-{
- cpsr_write(env, val, mask, CPSRWriteByInstr);
- /* TODO: Not all cpsr bits are relevant to hflags. */
- arm_rebuild_hflags(env);
-}
-
-/* Write the CPSR for a 32-bit exception return */
-void HELPER(cpsr_write_eret)(CPUARMState *env, uint32_t val)
-{
- uint32_t mask;
-
- qemu_mutex_lock_iothread();
- arm_call_pre_el_change_hook(env_archcpu(env));
- qemu_mutex_unlock_iothread();
-
- mask = aarch32_cpsr_valid_mask(env->features, &env_archcpu(env)->isar);
- cpsr_write(env, val, mask, CPSRWriteExceptionReturn);
-
- /* Generated code has already stored the new PC value, but
- * without masking out its low bits, because which bits need
- * masking depends on whether we're returning to Thumb or ARM
- * state. Do the masking now.
- */
- env->regs[15] &= (env->thumb ? ~1 : ~3);
- arm_rebuild_hflags(env);
-
- qemu_mutex_lock_iothread();
- arm_call_el_change_hook(env_archcpu(env));
- qemu_mutex_unlock_iothread();
-}
-
-/* Access to user mode registers from privileged modes. */
-uint32_t HELPER(get_user_reg)(CPUARMState *env, uint32_t regno)
-{
- uint32_t val;
-
- if (regno == 13) {
- val = env->banked_r13[BANK_USRSYS];
- } else if (regno == 14) {
- val = env->banked_r14[BANK_USRSYS];
- } else if (regno >= 8
- && (env->uncached_cpsr & 0x1f) == ARM_CPU_MODE_FIQ) {
- val = env->usr_regs[regno - 8];
- } else {
- val = env->regs[regno];
- }
- return val;
-}
-
-void HELPER(set_user_reg)(CPUARMState *env, uint32_t regno, uint32_t val)
-{
- if (regno == 13) {
- env->banked_r13[BANK_USRSYS] = val;
- } else if (regno == 14) {
- env->banked_r14[BANK_USRSYS] = val;
- } else if (regno >= 8
- && (env->uncached_cpsr & 0x1f) == ARM_CPU_MODE_FIQ) {
- env->usr_regs[regno - 8] = val;
- } else {
- env->regs[regno] = val;
- }
-}
-
-void HELPER(set_r13_banked)(CPUARMState *env, uint32_t mode, uint32_t val)
-{
- if ((env->uncached_cpsr & CPSR_M) == mode) {
- env->regs[13] = val;
- } else {
- env->banked_r13[bank_number(mode)] = val;
- }
-}
-
-uint32_t HELPER(get_r13_banked)(CPUARMState *env, uint32_t mode)
-{
- if ((env->uncached_cpsr & CPSR_M) == ARM_CPU_MODE_SYS) {
- /* SRS instruction is UNPREDICTABLE from System mode; we UNDEF.
- * Other UNPREDICTABLE and UNDEF cases were caught at translate time.
- */
- raise_exception(env, EXCP_UDEF, syn_uncategorized(),
- exception_target_el(env));
- }
-
- if ((env->uncached_cpsr & CPSR_M) == mode) {
- return env->regs[13];
- } else {
- return env->banked_r13[bank_number(mode)];
- }
-}
-
-static void msr_mrs_banked_exc_checks(CPUARMState *env, uint32_t tgtmode,
- uint32_t regno)
-{
- /* Raise an exception if the requested access is one of the UNPREDICTABLE
- * cases; otherwise return. This broadly corresponds to the pseudocode
- * BankedRegisterAccessValid() and SPSRAccessValid(),
- * except that we have already handled some cases at translate time.
- */
- int curmode = env->uncached_cpsr & CPSR_M;
-
- if (regno == 17) {
- /* ELR_Hyp: a special case because access from tgtmode is OK */
- if (curmode != ARM_CPU_MODE_HYP && curmode != ARM_CPU_MODE_MON) {
- goto undef;
- }
- return;
- }
-
- if (curmode == tgtmode) {
- goto undef;
- }
-
- if (tgtmode == ARM_CPU_MODE_USR) {
- switch (regno) {
- case 8 ... 12:
- if (curmode != ARM_CPU_MODE_FIQ) {
- goto undef;
- }
- break;
- case 13:
- if (curmode == ARM_CPU_MODE_SYS) {
- goto undef;
- }
- break;
- case 14:
- if (curmode == ARM_CPU_MODE_HYP || curmode == ARM_CPU_MODE_SYS) {
- goto undef;
- }
- break;
- default:
- break;
- }
- }
-
- if (tgtmode == ARM_CPU_MODE_HYP) {
- /* SPSR_Hyp, r13_hyp: accessible from Monitor mode only */
- if (curmode != ARM_CPU_MODE_MON) {
- goto undef;
- }
- }
-
- return;
-
-undef:
- raise_exception(env, EXCP_UDEF, syn_uncategorized(),
- exception_target_el(env));
-}
-
-void HELPER(msr_banked)(CPUARMState *env, uint32_t value, uint32_t tgtmode,
- uint32_t regno)
-{
- msr_mrs_banked_exc_checks(env, tgtmode, regno);
-
- switch (regno) {
- case 16: /* SPSRs */
- env->banked_spsr[bank_number(tgtmode)] = value;
- break;
- case 17: /* ELR_Hyp */
- env->elr_el[2] = value;
- break;
- case 13:
- env->banked_r13[bank_number(tgtmode)] = value;
- break;
- case 14:
- env->banked_r14[r14_bank_number(tgtmode)] = value;
- break;
- case 8 ... 12:
- switch (tgtmode) {
- case ARM_CPU_MODE_USR:
- env->usr_regs[regno - 8] = value;
- break;
- case ARM_CPU_MODE_FIQ:
- env->fiq_regs[regno - 8] = value;
- break;
- default:
- g_assert_not_reached();
- }
- break;
- default:
- g_assert_not_reached();
- }
-}
-
-uint32_t HELPER(mrs_banked)(CPUARMState *env, uint32_t tgtmode, uint32_t regno)
-{
- msr_mrs_banked_exc_checks(env, tgtmode, regno);
-
- switch (regno) {
- case 16: /* SPSRs */
- return env->banked_spsr[bank_number(tgtmode)];
- case 17: /* ELR_Hyp */
- return env->elr_el[2];
- case 13:
- return env->banked_r13[bank_number(tgtmode)];
- case 14:
- return env->banked_r14[r14_bank_number(tgtmode)];
- case 8 ... 12:
- switch (tgtmode) {
- case ARM_CPU_MODE_USR:
- return env->usr_regs[regno - 8];
- case ARM_CPU_MODE_FIQ:
- return env->fiq_regs[regno - 8];
- default:
- g_assert_not_reached();
- }
- default:
- g_assert_not_reached();
- }
-}
-
-const void *HELPER(access_check_cp_reg)(CPUARMState *env, uint32_t key,
- uint32_t syndrome, uint32_t isread)
-{
- ARMCPU *cpu = env_archcpu(env);
- const ARMCPRegInfo *ri = get_arm_cp_reginfo(cpu->cp_regs, key);
- CPAccessResult res = CP_ACCESS_OK;
- int target_el;
-
- assert(ri != NULL);
-
- if (arm_feature(env, ARM_FEATURE_XSCALE) && ri->cp < 14
- && extract32(env->cp15.c15_cpar, ri->cp, 1) == 0) {
- res = CP_ACCESS_TRAP;
- goto fail;
- }
-
- if (ri->accessfn) {
- res = ri->accessfn(env, ri, isread);
- }
-
- /*
- * If the access function indicates a trap from EL0 to EL1 then
- * that always takes priority over the HSTR_EL2 trap. (If it indicates
- * a trap to EL3, then the HSTR_EL2 trap takes priority; if it indicates
- * a trap to EL2, then the syndrome is the same either way so we don't
- * care whether technically the architecture says that HSTR_EL2 trap or
- * the other trap takes priority. So we take the "check HSTR_EL2" path
- * for all of those cases.)
- */
- if (res != CP_ACCESS_OK && ((res & CP_ACCESS_EL_MASK) == 0) &&
- arm_current_el(env) == 0) {
- goto fail;
- }
-
- /*
- * HSTR_EL2 traps from EL1 are checked earlier, in generated code;
- * we only need to check here for traps from EL0.
- */
- if (!is_a64(env) && arm_current_el(env) == 0 && ri->cp == 15 &&
- arm_is_el2_enabled(env) &&
- (arm_hcr_el2_eff(env) & (HCR_E2H | HCR_TGE)) != (HCR_E2H | HCR_TGE)) {
- uint32_t mask = 1 << ri->crn;
-
- if (ri->type & ARM_CP_64BIT) {
- mask = 1 << ri->crm;
- }
-
- /* T4 and T14 are RES0 */
- mask &= ~((1 << 4) | (1 << 14));
-
- if (env->cp15.hstr_el2 & mask) {
- res = CP_ACCESS_TRAP_EL2;
- goto fail;
- }
- }
-
- /*
- * Fine-grained traps also are lower priority than undef-to-EL1,
- * higher priority than trap-to-EL3, and we don't care about priority
- * order with other EL2 traps because the syndrome value is the same.
- */
- if (arm_fgt_active(env, arm_current_el(env))) {
- uint64_t trapword = 0;
- unsigned int idx = FIELD_EX32(ri->fgt, FGT, IDX);
- unsigned int bitpos = FIELD_EX32(ri->fgt, FGT, BITPOS);
- bool rev = FIELD_EX32(ri->fgt, FGT, REV);
- bool trapbit;
-
- if (ri->fgt & FGT_EXEC) {
- assert(idx < ARRAY_SIZE(env->cp15.fgt_exec));
- trapword = env->cp15.fgt_exec[idx];
- } else if (isread && (ri->fgt & FGT_R)) {
- assert(idx < ARRAY_SIZE(env->cp15.fgt_read));
- trapword = env->cp15.fgt_read[idx];
- } else if (!isread && (ri->fgt & FGT_W)) {
- assert(idx < ARRAY_SIZE(env->cp15.fgt_write));
- trapword = env->cp15.fgt_write[idx];
- }
-
- trapbit = extract64(trapword, bitpos, 1);
- if (trapbit != rev) {
- res = CP_ACCESS_TRAP_EL2;
- goto fail;
- }
- }
-
- if (likely(res == CP_ACCESS_OK)) {
- return ri;
- }
-
- fail:
- switch (res & ~CP_ACCESS_EL_MASK) {
- case CP_ACCESS_TRAP:
- break;
- case CP_ACCESS_TRAP_UNCATEGORIZED:
- /* Only CP_ACCESS_TRAP traps are direct to a specified EL */
- assert((res & CP_ACCESS_EL_MASK) == 0);
- if (cpu_isar_feature(aa64_ids, cpu) && isread &&
- arm_cpreg_in_idspace(ri)) {
- /*
- * FEAT_IDST says this should be reported as EC_SYSTEMREGISTERTRAP,
- * not EC_UNCATEGORIZED
- */
- break;
- }
- syndrome = syn_uncategorized();
- break;
- default:
- g_assert_not_reached();
- }
-
- target_el = res & CP_ACCESS_EL_MASK;
- switch (target_el) {
- case 0:
- target_el = exception_target_el(env);
- break;
- case 2:
- assert(arm_current_el(env) != 3);
- assert(arm_is_el2_enabled(env));
- break;
- case 3:
- assert(arm_feature(env, ARM_FEATURE_EL3));
- break;
- default:
- /* No "direct" traps to EL1 */
- g_assert_not_reached();
- }
-
- raise_exception(env, EXCP_UDEF, syndrome, target_el);
-}
-
-const void *HELPER(lookup_cp_reg)(CPUARMState *env, uint32_t key)
-{
- ARMCPU *cpu = env_archcpu(env);
- const ARMCPRegInfo *ri = get_arm_cp_reginfo(cpu->cp_regs, key);
-
- assert(ri != NULL);
- return ri;
-}
-
-void HELPER(set_cp_reg)(CPUARMState *env, const void *rip, uint32_t value)
-{
- const ARMCPRegInfo *ri = rip;
-
- if (ri->type & ARM_CP_IO) {
- qemu_mutex_lock_iothread();
- ri->writefn(env, ri, value);
- qemu_mutex_unlock_iothread();
- } else {
- ri->writefn(env, ri, value);
- }
-}
-
-uint32_t HELPER(get_cp_reg)(CPUARMState *env, const void *rip)
-{
- const ARMCPRegInfo *ri = rip;
- uint32_t res;
-
- if (ri->type & ARM_CP_IO) {
- qemu_mutex_lock_iothread();
- res = ri->readfn(env, ri);
- qemu_mutex_unlock_iothread();
- } else {
- res = ri->readfn(env, ri);
- }
-
- return res;
-}
-
-void HELPER(set_cp_reg64)(CPUARMState *env, const void *rip, uint64_t value)
-{
- const ARMCPRegInfo *ri = rip;
-
- if (ri->type & ARM_CP_IO) {
- qemu_mutex_lock_iothread();
- ri->writefn(env, ri, value);
- qemu_mutex_unlock_iothread();
- } else {
- ri->writefn(env, ri, value);
- }
-}
-
-uint64_t HELPER(get_cp_reg64)(CPUARMState *env, const void *rip)
-{
- const ARMCPRegInfo *ri = rip;
- uint64_t res;
-
- if (ri->type & ARM_CP_IO) {
- qemu_mutex_lock_iothread();
- res = ri->readfn(env, ri);
- qemu_mutex_unlock_iothread();
- } else {
- res = ri->readfn(env, ri);
- }
-
- return res;
-}
-
-void HELPER(pre_hvc)(CPUARMState *env)
-{
- ARMCPU *cpu = env_archcpu(env);
- int cur_el = arm_current_el(env);
- /* FIXME: Use actual secure state. */
- bool secure = false;
- bool undef;
-
- if (arm_is_psci_call(cpu, EXCP_HVC)) {
- /* If PSCI is enabled and this looks like a valid PSCI call then
- * that overrides the architecturally mandated HVC behaviour.
- */
- return;
- }
-
- if (!arm_feature(env, ARM_FEATURE_EL2)) {
- /* If EL2 doesn't exist, HVC always UNDEFs */
- undef = true;
- } else if (arm_feature(env, ARM_FEATURE_EL3)) {
- /* EL3.HCE has priority over EL2.HCD. */
- undef = !(env->cp15.scr_el3 & SCR_HCE);
- } else {
- undef = env->cp15.hcr_el2 & HCR_HCD;
- }
-
- /* In ARMv7 and ARMv8/AArch32, HVC is undef in secure state.
- * For ARMv8/AArch64, HVC is allowed in EL3.
- * Note that we've already trapped HVC from EL0 at translation
- * time.
- */
- if (secure && (!is_a64(env) || cur_el == 1)) {
- undef = true;
- }
-
- if (undef) {
- raise_exception(env, EXCP_UDEF, syn_uncategorized(),
- exception_target_el(env));
- }
-}
-
-void HELPER(pre_smc)(CPUARMState *env, uint32_t syndrome)
-{
- ARMCPU *cpu = env_archcpu(env);
- int cur_el = arm_current_el(env);
- bool secure = arm_is_secure(env);
- bool smd_flag = env->cp15.scr_el3 & SCR_SMD;
-
- /*
- * SMC behaviour is summarized in the following table.
- * This helper handles the "Trap to EL2" and "Undef insn" cases.
- * The "Trap to EL3" and "PSCI call" cases are handled in the exception
- * helper.
- *
- * -> ARM_FEATURE_EL3 and !SMD
- * HCR_TSC && NS EL1 !HCR_TSC || !NS EL1
- *
- * Conduit SMC, valid call Trap to EL2 PSCI Call
- * Conduit SMC, inval call Trap to EL2 Trap to EL3
- * Conduit not SMC Trap to EL2 Trap to EL3
- *
- *
- * -> ARM_FEATURE_EL3 and SMD
- * HCR_TSC && NS EL1 !HCR_TSC || !NS EL1
- *
- * Conduit SMC, valid call Trap to EL2 PSCI Call
- * Conduit SMC, inval call Trap to EL2 Undef insn
- * Conduit not SMC Trap to EL2 Undef insn
- *
- *
- * -> !ARM_FEATURE_EL3
- * HCR_TSC && NS EL1 !HCR_TSC || !NS EL1
- *
- * Conduit SMC, valid call Trap to EL2 PSCI Call
- * Conduit SMC, inval call Trap to EL2 Undef insn
- * Conduit not SMC Undef insn Undef insn
- */
-
- /* On ARMv8 with EL3 AArch64, SMD applies to both S and NS state.
- * On ARMv8 with EL3 AArch32, or ARMv7 with the Virtualization
- * extensions, SMD only applies to NS state.
- * On ARMv7 without the Virtualization extensions, the SMD bit
- * doesn't exist, but we forbid the guest to set it to 1 in scr_write(),
- * so we need not special case this here.
- */
- bool smd = arm_feature(env, ARM_FEATURE_AARCH64) ? smd_flag
- : smd_flag && !secure;
-
- if (!arm_feature(env, ARM_FEATURE_EL3) &&
- cpu->psci_conduit != QEMU_PSCI_CONDUIT_SMC) {
- /* If we have no EL3 then SMC always UNDEFs and can't be
- * trapped to EL2. PSCI-via-SMC is a sort of ersatz EL3
- * firmware within QEMU, and we want an EL2 guest to be able
- * to forbid its EL1 from making PSCI calls into QEMU's
- * "firmware" via HCR.TSC, so for these purposes treat
- * PSCI-via-SMC as implying an EL3.
- * This handles the very last line of the previous table.
- */
- raise_exception(env, EXCP_UDEF, syn_uncategorized(),
- exception_target_el(env));
- }
-
- if (cur_el == 1 && (arm_hcr_el2_eff(env) & HCR_TSC)) {
- /* In NS EL1, HCR controlled routing to EL2 has priority over SMD.
- * We also want an EL2 guest to be able to forbid its EL1 from
- * making PSCI calls into QEMU's "firmware" via HCR.TSC.
- * This handles all the "Trap to EL2" cases of the previous table.
- */
- raise_exception(env, EXCP_HYP_TRAP, syndrome, 2);
- }
-
- /* Catch the two remaining "Undef insn" cases of the previous table:
- * - PSCI conduit is SMC but we don't have a valid PSCI call,
- * - We don't have EL3 or SMD is set.
- */
- if (!arm_is_psci_call(cpu, EXCP_SMC) &&
- (smd || !arm_feature(env, ARM_FEATURE_EL3))) {
- raise_exception(env, EXCP_UDEF, syn_uncategorized(),
- exception_target_el(env));
- }
-}
-
-/* ??? Flag setting arithmetic is awkward because we need to do comparisons.
- The only way to do that in TCG is a conditional branch, which clobbers
- all our temporaries. For now implement these as helper functions. */
-
-/* Similarly for variable shift instructions. */
-
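-/*
- * shl_cc computes x << (i & 0xff) and updates CF with the last bit
- * shifted out, matching the AArch32 flag-setting LSL-by-register rules:
- * a shift of 0 leaves CF unchanged, a shift of exactly 32 sets CF to
- * bit 0 of x, larger shifts clear CF, and shifts of 32 or more yield 0.
- */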
-uint32_t HELPER(shl_cc)(CPUARMState *env, uint32_t x, uint32_t i)
-{
- int shift = i & 0xff;
- if (shift >= 32) {
- if (shift == 32)
- env->CF = x & 1;
- else
- env->CF = 0;
- return 0;
- } else if (shift != 0) {
- env->CF = (x >> (32 - shift)) & 1;
- return x << shift;
- }
- return x;
-}
-
-uint32_t HELPER(shr_cc)(CPUARMState *env, uint32_t x, uint32_t i)
-{
- int shift = i & 0xff;
- if (shift >= 32) {
- if (shift == 32)
- env->CF = (x >> 31) & 1;
- else
- env->CF = 0;
- return 0;
- } else if (shift != 0) {
- env->CF = (x >> (shift - 1)) & 1;
- return x >> shift;
- }
- return x;
-}
-
-uint32_t HELPER(sar_cc)(CPUARMState *env, uint32_t x, uint32_t i)
-{
- int shift = i & 0xff;
- if (shift >= 32) {
- env->CF = (x >> 31) & 1;
- return (int32_t)x >> 31;
- } else if (shift != 0) {
- env->CF = (x >> (shift - 1)) & 1;
- return (int32_t)x >> shift;
- }
- return x;
-}
-
-uint32_t HELPER(ror_cc)(CPUARMState *env, uint32_t x, uint32_t i)
-{
- int shift1, shift;
- shift1 = i & 0xff;
- shift = shift1 & 0x1f;
- if (shift == 0) {
- if (shift1 != 0)
- env->CF = (x >> 31) & 1;
- return x;
- } else {
- env->CF = (x >> (shift - 1)) & 1;
- return ((uint32_t)x >> shift) | (x << (32 - shift));
- }
-}
-
-void HELPER(probe_access)(CPUARMState *env, target_ulong ptr,
- uint32_t access_type, uint32_t mmu_idx,
- uint32_t size)
-{
- uint32_t in_page = -((uint32_t)ptr | TARGET_PAGE_SIZE);
- uintptr_t ra = GETPC();
-
- if (likely(size <= in_page)) {
- probe_access(env, ptr, size, access_type, mmu_idx, ra);
- } else {
- probe_access(env, ptr, in_page, access_type, mmu_idx, ra);
- probe_access(env, ptr + in_page, size - in_page,
- access_type, mmu_idx, ra);
- }
-}
-
-/*
- * This function corresponds to AArch64.vESBOperation().
- * Note that the AArch32 version is not functionally different.
- */
-void HELPER(vesb)(CPUARMState *env)
-{
- /*
- * The EL2Enabled() check is done inside arm_hcr_el2_eff,
- * and will return HCR_EL2.VSE == 0, so nothing happens.
- */
- uint64_t hcr = arm_hcr_el2_eff(env);
- bool enabled = !(hcr & HCR_TGE) && (hcr & HCR_AMO);
- bool pending = enabled && (hcr & HCR_VSE);
- bool masked = (env->daif & PSTATE_A);
-
- /* If VSE pending and masked, defer the exception. */
- if (pending && masked) {
- uint32_t syndrome;
-
- if (arm_el_is_aa64(env, 1)) {
- /* Copy across IDS and ISS from VSESR. */
- syndrome = env->cp15.vsesr_el2 & 0x1ffffff;
- } else {
- ARMMMUFaultInfo fi = { .type = ARMFault_AsyncExternal };
-
- if (extended_addresses_enabled(env)) {
- syndrome = arm_fi_to_lfsc(&fi);
- } else {
- syndrome = arm_fi_to_sfsc(&fi);
- }
- /* Copy across AET and ExT from VSESR. */
- syndrome |= env->cp15.vsesr_el2 & 0xd000;
- }
-
- /* Set VDISR_EL2.A along with the syndrome. */
- env->cp15.vdisr_el2 = syndrome | (1u << 31);
-
- /* Clear pending virtual SError */
- env->cp15.hcr_el2 &= ~HCR_VSE;
- cpu_reset_interrupt(env_cpu(env), CPU_INTERRUPT_VSERR);
- }
-}
+++ /dev/null
-/*
- * ARM v8.3-PAuth Operations
- *
- * Copyright (c) 2019 Linaro, Ltd.
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "qemu/osdep.h"
-#include "cpu.h"
-#include "internals.h"
-#include "exec/exec-all.h"
-#include "exec/cpu_ldst.h"
-#include "exec/helper-proto.h"
-#include "tcg/tcg-gvec-desc.h"
-#include "qemu/xxhash.h"
-
-
-static uint64_t pac_cell_shuffle(uint64_t i)
-{
- uint64_t o = 0;
-
- o |= extract64(i, 52, 4);
- o |= extract64(i, 24, 4) << 4;
- o |= extract64(i, 44, 4) << 8;
- o |= extract64(i, 0, 4) << 12;
-
- o |= extract64(i, 28, 4) << 16;
- o |= extract64(i, 48, 4) << 20;
- o |= extract64(i, 4, 4) << 24;
- o |= extract64(i, 40, 4) << 28;
-
- o |= extract64(i, 32, 4) << 32;
- o |= extract64(i, 12, 4) << 36;
- o |= extract64(i, 56, 4) << 40;
- o |= extract64(i, 20, 4) << 44;
-
- o |= extract64(i, 8, 4) << 48;
- o |= extract64(i, 36, 4) << 52;
- o |= extract64(i, 16, 4) << 56;
- o |= extract64(i, 60, 4) << 60;
-
- return o;
-}
-
-static uint64_t pac_cell_inv_shuffle(uint64_t i)
-{
- uint64_t o = 0;
-
- o |= extract64(i, 12, 4);
- o |= extract64(i, 24, 4) << 4;
- o |= extract64(i, 48, 4) << 8;
- o |= extract64(i, 36, 4) << 12;
-
- o |= extract64(i, 56, 4) << 16;
- o |= extract64(i, 44, 4) << 20;
- o |= extract64(i, 4, 4) << 24;
- o |= extract64(i, 16, 4) << 28;
-
- o |= i & MAKE_64BIT_MASK(32, 4);
- o |= extract64(i, 52, 4) << 36;
- o |= extract64(i, 28, 4) << 40;
- o |= extract64(i, 8, 4) << 44;
-
- o |= extract64(i, 20, 4) << 48;
- o |= extract64(i, 0, 4) << 52;
- o |= extract64(i, 40, 4) << 56;
- o |= i & MAKE_64BIT_MASK(60, 4);
-
- return o;
-}
-
-static uint64_t pac_sub(uint64_t i)
-{
- static const uint8_t sub[16] = {
- 0xb, 0x6, 0x8, 0xf, 0xc, 0x0, 0x9, 0xe,
- 0x3, 0x7, 0x4, 0x5, 0xd, 0x2, 0x1, 0xa,
- };
- uint64_t o = 0;
- int b;
-
- for (b = 0; b < 64; b += 4) {
- o |= (uint64_t)sub[(i >> b) & 0xf] << b;
- }
- return o;
-}
-
-static uint64_t pac_inv_sub(uint64_t i)
-{
- static const uint8_t inv_sub[16] = {
- 0x5, 0xe, 0xd, 0x8, 0xa, 0xb, 0x1, 0x9,
- 0x2, 0x6, 0xf, 0x0, 0x4, 0xc, 0x7, 0x3,
- };
- uint64_t o = 0;
- int b;
-
- for (b = 0; b < 64; b += 4) {
- o |= (uint64_t)inv_sub[(i >> b) & 0xf] << b;
- }
- return o;
-}
-
-static int rot_cell(int cell, int n)
-{
- /* 4-bit rotate left by n. */
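- /*
- * For example, cell = 0b0011 with n = 1: duplicating gives 0b0011_0011,
- * and extracting 4 bits starting at bit (4 - 1) = 3 yields 0b0110.
- */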
- cell |= cell << 4;
- return extract32(cell, 4 - n, 4);
-}
-
-static uint64_t pac_mult(uint64_t i)
-{
- uint64_t o = 0;
- int b;
-
- for (b = 0; b < 4 * 4; b += 4) {
- int i0, i4, i8, ic, t0, t1, t2, t3;
-
- i0 = extract64(i, b, 4);
- i4 = extract64(i, b + 4 * 4, 4);
- i8 = extract64(i, b + 8 * 4, 4);
- ic = extract64(i, b + 12 * 4, 4);
-
- t0 = rot_cell(i8, 1) ^ rot_cell(i4, 2) ^ rot_cell(i0, 1);
- t1 = rot_cell(ic, 1) ^ rot_cell(i4, 1) ^ rot_cell(i0, 2);
- t2 = rot_cell(ic, 2) ^ rot_cell(i8, 1) ^ rot_cell(i0, 1);
- t3 = rot_cell(ic, 1) ^ rot_cell(i8, 2) ^ rot_cell(i4, 1);
-
- o |= (uint64_t)t3 << b;
- o |= (uint64_t)t2 << (b + 4 * 4);
- o |= (uint64_t)t1 << (b + 8 * 4);
- o |= (uint64_t)t0 << (b + 12 * 4);
- }
- return o;
-}
-
-static uint64_t tweak_cell_rot(uint64_t cell)
-{
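- /* Shift the 4-bit cell right by one, inserting bit0 ^ bit1 as the new top bit. */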
- return (cell >> 1) | (((cell ^ (cell >> 1)) & 1) << 3);
-}
-
-static uint64_t tweak_shuffle(uint64_t i)
-{
- uint64_t o = 0;
-
- o |= extract64(i, 16, 4) << 0;
- o |= extract64(i, 20, 4) << 4;
- o |= tweak_cell_rot(extract64(i, 24, 4)) << 8;
- o |= extract64(i, 28, 4) << 12;
-
- o |= tweak_cell_rot(extract64(i, 44, 4)) << 16;
- o |= extract64(i, 8, 4) << 20;
- o |= extract64(i, 12, 4) << 24;
- o |= tweak_cell_rot(extract64(i, 32, 4)) << 28;
-
- o |= extract64(i, 48, 4) << 32;
- o |= extract64(i, 52, 4) << 36;
- o |= extract64(i, 56, 4) << 40;
- o |= tweak_cell_rot(extract64(i, 60, 4)) << 44;
-
- o |= tweak_cell_rot(extract64(i, 0, 4)) << 48;
- o |= extract64(i, 4, 4) << 52;
- o |= tweak_cell_rot(extract64(i, 40, 4)) << 56;
- o |= tweak_cell_rot(extract64(i, 36, 4)) << 60;
-
- return o;
-}
-
-static uint64_t tweak_cell_inv_rot(uint64_t cell)
-{
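- /* Inverse of tweak_cell_rot: shift left by one, with bit0 ^ bit3 as the new bottom bit. */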
- return ((cell << 1) & 0xf) | ((cell & 1) ^ (cell >> 3));
-}
-
-static uint64_t tweak_inv_shuffle(uint64_t i)
-{
- uint64_t o = 0;
-
- o |= tweak_cell_inv_rot(extract64(i, 48, 4));
- o |= extract64(i, 52, 4) << 4;
- o |= extract64(i, 20, 4) << 8;
- o |= extract64(i, 24, 4) << 12;
-
- o |= extract64(i, 0, 4) << 16;
- o |= extract64(i, 4, 4) << 20;
- o |= tweak_cell_inv_rot(extract64(i, 8, 4)) << 24;
- o |= extract64(i, 12, 4) << 28;
-
- o |= tweak_cell_inv_rot(extract64(i, 28, 4)) << 32;
- o |= tweak_cell_inv_rot(extract64(i, 60, 4)) << 36;
- o |= tweak_cell_inv_rot(extract64(i, 56, 4)) << 40;
- o |= tweak_cell_inv_rot(extract64(i, 16, 4)) << 44;
-
- o |= extract64(i, 32, 4) << 48;
- o |= extract64(i, 36, 4) << 52;
- o |= extract64(i, 40, 4) << 56;
- o |= tweak_cell_inv_rot(extract64(i, 44, 4)) << 60;
-
- return o;
-}
-
-static uint64_t pauth_computepac_architected(uint64_t data, uint64_t modifier,
- ARMPACKey key)
-{
- static const uint64_t RC[5] = {
- 0x0000000000000000ull,
- 0x13198A2E03707344ull,
- 0xA4093822299F31D0ull,
- 0x082EFA98EC4E6C89ull,
- 0x452821E638D01377ull,
- };
- const uint64_t alpha = 0xC0AC29B7C97C50DDull;
- /*
- * Note that in the ARM pseudocode, key0 contains bits <127:64>
- * and key1 contains bits <63:0> of the 128-bit key.
- */
- uint64_t key0 = key.hi, key1 = key.lo;
- uint64_t workingval, runningmod, roundkey, modk0;
- int i;
-
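- /* modk0 is key0 rotated right by one, with key0<63> additionally XORed into bit 0. */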
- modk0 = (key0 << 63) | ((key0 >> 1) ^ (key0 >> 63));
- runningmod = modifier;
- workingval = data ^ key0;
-
- for (i = 0; i <= 4; ++i) {
- roundkey = key1 ^ runningmod;
- workingval ^= roundkey;
- workingval ^= RC[i];
- if (i > 0) {
- workingval = pac_cell_shuffle(workingval);
- workingval = pac_mult(workingval);
- }
- workingval = pac_sub(workingval);
- runningmod = tweak_shuffle(runningmod);
- }
- roundkey = modk0 ^ runningmod;
- workingval ^= roundkey;
- workingval = pac_cell_shuffle(workingval);
- workingval = pac_mult(workingval);
- workingval = pac_sub(workingval);
- workingval = pac_cell_shuffle(workingval);
- workingval = pac_mult(workingval);
- workingval ^= key1;
- workingval = pac_cell_inv_shuffle(workingval);
- workingval = pac_inv_sub(workingval);
- workingval = pac_mult(workingval);
- workingval = pac_cell_inv_shuffle(workingval);
- workingval ^= key0;
- workingval ^= runningmod;
- for (i = 0; i <= 4; ++i) {
- workingval = pac_inv_sub(workingval);
- if (i < 4) {
- workingval = pac_mult(workingval);
- workingval = pac_cell_inv_shuffle(workingval);
- }
- runningmod = tweak_inv_shuffle(runningmod);
- roundkey = key1 ^ runningmod;
- workingval ^= RC[4 - i];
- workingval ^= roundkey;
- workingval ^= alpha;
- }
- workingval ^= modk0;
-
- return workingval;
-}
-
-static uint64_t pauth_computepac_impdef(uint64_t data, uint64_t modifier,
- ARMPACKey key)
-{
- return qemu_xxhash64_4(data, modifier, key.lo, key.hi);
-}
-
-static uint64_t pauth_computepac(CPUARMState *env, uint64_t data,
- uint64_t modifier, ARMPACKey key)
-{
- if (cpu_isar_feature(aa64_pauth_arch, env_archcpu(env))) {
- return pauth_computepac_architected(data, modifier, key);
- } else {
- return pauth_computepac_impdef(data, modifier, key);
- }
-}
-
-static uint64_t pauth_addpac(CPUARMState *env, uint64_t ptr, uint64_t modifier,
- ARMPACKey *key, bool data)
-{
- ARMMMUIdx mmu_idx = arm_stage1_mmu_idx(env);
- ARMVAParameters param = aa64_va_parameters(env, ptr, mmu_idx, data);
- uint64_t pac, ext_ptr, ext, test;
- int bot_bit, top_bit;
-
- /* If tagged pointers are in use, use ptr<55>, otherwise ptr<63>. */
- if (param.tbi) {
- ext = sextract64(ptr, 55, 1);
- } else {
- ext = sextract64(ptr, 63, 1);
- }
-
- /* Build a pointer with known good extension bits. */
- top_bit = 64 - 8 * param.tbi;
- bot_bit = 64 - param.tsz;
- ext_ptr = deposit64(ptr, bot_bit, top_bit - bot_bit, ext);
-
- pac = pauth_computepac(env, ext_ptr, modifier, *key);
-
- /*
- * Check if the ptr has good extension bits and corrupt the
- * pointer authentication code if not.
- */
- test = sextract64(ptr, bot_bit, top_bit - bot_bit);
- if (test != 0 && test != -1) {
- /*
- * Note that our top_bit is one greater than the pseudocode's
- * version, hence "- 2" here.
- */
- pac ^= MAKE_64BIT_MASK(top_bit - 2, 1);
- }
-
- /*
- * Preserve the determination between upper and lower at bit 55,
- * and insert pointer authentication code.
- */
- if (param.tbi) {
- ptr &= ~MAKE_64BIT_MASK(bot_bit, 55 - bot_bit + 1);
- pac &= MAKE_64BIT_MASK(bot_bit, 54 - bot_bit + 1);
- } else {
- ptr &= MAKE_64BIT_MASK(0, bot_bit);
- pac &= ~(MAKE_64BIT_MASK(55, 1) | MAKE_64BIT_MASK(0, bot_bit));
- }
- ext &= MAKE_64BIT_MASK(55, 1);
- return pac | ext | ptr;
-}
-
-static uint64_t pauth_original_ptr(uint64_t ptr, ARMVAParameters param)
-{
- /* Note that bit 55 is used whether or not the regime has 2 ranges. */
- uint64_t extfield = sextract64(ptr, 55, 1);
- int bot_pac_bit = 64 - param.tsz;
- int top_pac_bit = 64 - 8 * param.tbi;
-
- return deposit64(ptr, bot_pac_bit, top_pac_bit - bot_pac_bit, extfield);
-}
-
-static uint64_t pauth_auth(CPUARMState *env, uint64_t ptr, uint64_t modifier,
- ARMPACKey *key, bool data, int keynumber)
-{
- ARMMMUIdx mmu_idx = arm_stage1_mmu_idx(env);
- ARMVAParameters param = aa64_va_parameters(env, ptr, mmu_idx, data);
- int bot_bit, top_bit;
- uint64_t pac, orig_ptr, test;
-
- orig_ptr = pauth_original_ptr(ptr, param);
- pac = pauth_computepac(env, orig_ptr, modifier, *key);
- bot_bit = 64 - param.tsz;
- top_bit = 64 - 8 * param.tbi;
-
- test = (pac ^ ptr) & ~MAKE_64BIT_MASK(55, 1);
- if (unlikely(extract64(test, bot_bit, top_bit - bot_bit))) {
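- /*
- * Authentication failed: the error code is 0b01 for an A key
- * (keynumber 0) and 0b10 for a B key (keynumber 1).
- */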
- int error_code = (keynumber << 1) | (keynumber ^ 1);
- if (param.tbi) {
- return deposit64(orig_ptr, 53, 2, error_code);
- } else {
- return deposit64(orig_ptr, 61, 2, error_code);
- }
- }
- return orig_ptr;
-}
-
-static uint64_t pauth_strip(CPUARMState *env, uint64_t ptr, bool data)
-{
- ARMMMUIdx mmu_idx = arm_stage1_mmu_idx(env);
- ARMVAParameters param = aa64_va_parameters(env, ptr, mmu_idx, data);
-
- return pauth_original_ptr(ptr, param);
-}
-
-static G_NORETURN
-void pauth_trap(CPUARMState *env, int target_el, uintptr_t ra)
-{
- raise_exception_ra(env, EXCP_UDEF, syn_pactrap(), target_el, ra);
-}
-
-static void pauth_check_trap(CPUARMState *env, int el, uintptr_t ra)
-{
- if (el < 2 && arm_is_el2_enabled(env)) {
- uint64_t hcr = arm_hcr_el2_eff(env);
- bool trap = !(hcr & HCR_API);
- if (el == 0) {
- /* Trap only applies to EL1&0 regime. */
- trap &= (hcr & (HCR_E2H | HCR_TGE)) != (HCR_E2H | HCR_TGE);
- }
- /* FIXME: ARMv8.3-NV: HCR_NV trap takes precedence for ERETA[AB]. */
- if (trap) {
- pauth_trap(env, 2, ra);
- }
- }
- if (el < 3 && arm_feature(env, ARM_FEATURE_EL3)) {
- if (!(env->cp15.scr_el3 & SCR_API)) {
- pauth_trap(env, 3, ra);
- }
- }
-}
-
-static bool pauth_key_enabled(CPUARMState *env, int el, uint32_t bit)
-{
- return (arm_sctlr(env, el) & bit) != 0;
-}
-
-uint64_t HELPER(pacia)(CPUARMState *env, uint64_t x, uint64_t y)
-{
- int el = arm_current_el(env);
- if (!pauth_key_enabled(env, el, SCTLR_EnIA)) {
- return x;
- }
- pauth_check_trap(env, el, GETPC());
- return pauth_addpac(env, x, y, &env->keys.apia, false);
-}
-
-uint64_t HELPER(pacib)(CPUARMState *env, uint64_t x, uint64_t y)
-{
- int el = arm_current_el(env);
- if (!pauth_key_enabled(env, el, SCTLR_EnIB)) {
- return x;
- }
- pauth_check_trap(env, el, GETPC());
- return pauth_addpac(env, x, y, &env->keys.apib, false);
-}
-
-uint64_t HELPER(pacda)(CPUARMState *env, uint64_t x, uint64_t y)
-{
- int el = arm_current_el(env);
- if (!pauth_key_enabled(env, el, SCTLR_EnDA)) {
- return x;
- }
- pauth_check_trap(env, el, GETPC());
- return pauth_addpac(env, x, y, &env->keys.apda, true);
-}
-
-uint64_t HELPER(pacdb)(CPUARMState *env, uint64_t x, uint64_t y)
-{
- int el = arm_current_el(env);
- if (!pauth_key_enabled(env, el, SCTLR_EnDB)) {
- return x;
- }
- pauth_check_trap(env, el, GETPC());
- return pauth_addpac(env, x, y, &env->keys.apdb, true);
-}
-
-uint64_t HELPER(pacga)(CPUARMState *env, uint64_t x, uint64_t y)
-{
- uint64_t pac;
-
- pauth_check_trap(env, arm_current_el(env), GETPC());
- pac = pauth_computepac(env, x, y, env->keys.apga);
-
- return pac & 0xffffffff00000000ull;
-}
-
-uint64_t HELPER(autia)(CPUARMState *env, uint64_t x, uint64_t y)
-{
- int el = arm_current_el(env);
- if (!pauth_key_enabled(env, el, SCTLR_EnIA)) {
- return x;
- }
- pauth_check_trap(env, el, GETPC());
- return pauth_auth(env, x, y, &env->keys.apia, false, 0);
-}
-
-uint64_t HELPER(autib)(CPUARMState *env, uint64_t x, uint64_t y)
-{
- int el = arm_current_el(env);
- if (!pauth_key_enabled(env, el, SCTLR_EnIB)) {
- return x;
- }
- pauth_check_trap(env, el, GETPC());
- return pauth_auth(env, x, y, &env->keys.apib, false, 1);
-}
-
-uint64_t HELPER(autda)(CPUARMState *env, uint64_t x, uint64_t y)
-{
- int el = arm_current_el(env);
- if (!pauth_key_enabled(env, el, SCTLR_EnDA)) {
- return x;
- }
- pauth_check_trap(env, el, GETPC());
- return pauth_auth(env, x, y, &env->keys.apda, true, 0);
-}
-
-uint64_t HELPER(autdb)(CPUARMState *env, uint64_t x, uint64_t y)
-{
- int el = arm_current_el(env);
- if (!pauth_key_enabled(env, el, SCTLR_EnDB)) {
- return x;
- }
- pauth_check_trap(env, el, GETPC());
- return pauth_auth(env, x, y, &env->keys.apdb, true, 1);
-}
-
-uint64_t HELPER(xpaci)(CPUARMState *env, uint64_t a)
-{
- return pauth_strip(env, a, false);
-}
-
-uint64_t HELPER(xpacd)(CPUARMState *env, uint64_t a)
-{
- return pauth_strip(env, a, true);
-}
+++ /dev/null
-/*
- * ARM SME Operations
- *
- * Copyright (c) 2022 Linaro, Ltd.
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "qemu/osdep.h"
-#include "cpu.h"
-#include "internals.h"
-#include "tcg/tcg-gvec-desc.h"
-#include "exec/helper-proto.h"
-#include "exec/cpu_ldst.h"
-#include "exec/exec-all.h"
-#include "qemu/int128.h"
-#include "fpu/softfloat.h"
-#include "vec_internal.h"
-#include "sve_ldst_internal.h"
-
-void helper_set_svcr(CPUARMState *env, uint32_t val, uint32_t mask)
-{
- aarch64_set_svcr(env, val, mask);
-}
-
-void helper_sme_zero(CPUARMState *env, uint32_t imm, uint32_t svl)
-{
- uint32_t i;
-
- /*
- * Special case clearing the entire ZA space.
- * This falls into the CONSTRAINED UNPREDICTABLE zeroing of any
- * parts of the ZA storage outside of SVL.
- */
- if (imm == 0xff) {
- memset(env->zarray, 0, sizeof(env->zarray));
- return;
- }
-
- /*
- * Recall that ZAnH.D[m] is spread across ZA[n+8*m],
- * so each row is discontiguous within ZA[].
- */
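- /*
- * For example, with imm bit 3 set the loop below clears rows
- * 3, 11, 19, ... of the storage, i.e. every row of tile ZA3.D.
- */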
- for (i = 0; i < svl; i++) {
- if (imm & (1 << (i % 8))) {
- memset(&env->zarray[i], 0, svl);
- }
- }
-}
-
-
-/*
- * When considering the ZA storage as an array of elements of
- * type T, the index within that array of the Nth element of
- * a vertical slice of a tile can be calculated like this,
- * regardless of the size of type T. This is because the tiles
- * are interleaved, so if type T is size N bytes then row 1 of
- * the tile is N rows away from row 0. The division by N to
- * convert a byte offset into an array index and the multiplication
- * by N to convert from vslice-index-within-the-tile to
- * the index within the ZA storage cancel out.
- */
-#define tile_vslice_index(i) ((i) * sizeof(ARMVectorReg))
-
-/*
- * When doing byte arithmetic on the ZA storage, the element
- * byteoff bytes away in a tile vertical slice is always this
- * many bytes away in the ZA storage, regardless of the
- * size of the tile element, assuming that byteoff is a multiple
- * of the element size. Again this is because of the interleaving
- * of the tiles. For instance if we have 1 byte per element then
- * each row of the ZA storage has one byte of the vslice data,
- * and (counting from 0) byte 8 goes in row 8 of the storage
- * at offset (8 * row-size-in-bytes).
- * If we have 8 bytes per element then each row of the ZA storage
- * has 8 bytes of the data, but there are 8 interleaved tiles and
- * so byte 8 of the data goes into row 1 of the tile,
- * which is again row 8 of the storage, so the offset is still
- * (8 * row-size-in-bytes). Similarly for other element sizes.
- */
-#define tile_vslice_offset(byteoff) ((byteoff) * sizeof(ARMVectorReg))
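-
-/*
- * Worked example: for a .S tile (4-byte elements, so four interleaved
- * tiles), element 3 of a vertical slice is byte offset 12 within the
- * slice, and tile_vslice_offset(12) is 12 * sizeof(ARMVectorReg), i.e.
- * 12 rows down in the ZA storage, which is where row 3 of a .S tile
- * lives (consecutive rows of one .S tile are 4 storage rows apart).
- */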
-
-
-/*
- * Move Zreg vector to ZArray column.
- */
-#define DO_MOVA_C(NAME, TYPE, H) \
-void HELPER(NAME)(void *za, void *vn, void *vg, uint32_t desc) \
-{ \
- int i, oprsz = simd_oprsz(desc); \
- for (i = 0; i < oprsz; ) { \
- uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
- do { \
- if (pg & 1) { \
- *(TYPE *)(za + tile_vslice_offset(i)) = *(TYPE *)(vn + H(i)); \
- } \
- i += sizeof(TYPE); \
- pg >>= sizeof(TYPE); \
- } while (i & 15); \
- } \
-}
-
-DO_MOVA_C(sme_mova_cz_b, uint8_t, H1)
-DO_MOVA_C(sme_mova_cz_h, uint16_t, H1_2)
-DO_MOVA_C(sme_mova_cz_s, uint32_t, H1_4)
-
-void HELPER(sme_mova_cz_d)(void *za, void *vn, void *vg, uint32_t desc)
-{
- int i, oprsz = simd_oprsz(desc) / 8;
- uint8_t *pg = vg;
- uint64_t *n = vn;
- uint64_t *a = za;
-
- for (i = 0; i < oprsz; i++) {
- if (pg[H1(i)] & 1) {
- a[tile_vslice_index(i)] = n[i];
- }
- }
-}
-
-void HELPER(sme_mova_cz_q)(void *za, void *vn, void *vg, uint32_t desc)
-{
- int i, oprsz = simd_oprsz(desc) / 16;
- uint16_t *pg = vg;
- Int128 *n = vn;
- Int128 *a = za;
-
- /*
- * Int128 is used here simply to copy 16 bytes, and to simplify
- * the address arithmetic.
- */
- for (i = 0; i < oprsz; i++) {
- if (pg[H2(i)] & 1) {
- a[tile_vslice_index(i)] = n[i];
- }
- }
-}
-
-#undef DO_MOVA_C
-
-/*
- * Move ZArray column to Zreg vector.
- */
-#define DO_MOVA_Z(NAME, TYPE, H) \
-void HELPER(NAME)(void *vd, void *za, void *vg, uint32_t desc) \
-{ \
- int i, oprsz = simd_oprsz(desc); \
- for (i = 0; i < oprsz; ) { \
- uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
- do { \
- if (pg & 1) { \
- *(TYPE *)(vd + H(i)) = *(TYPE *)(za + tile_vslice_offset(i)); \
- } \
- i += sizeof(TYPE); \
- pg >>= sizeof(TYPE); \
- } while (i & 15); \
- } \
-}
-
-DO_MOVA_Z(sme_mova_zc_b, uint8_t, H1)
-DO_MOVA_Z(sme_mova_zc_h, uint16_t, H1_2)
-DO_MOVA_Z(sme_mova_zc_s, uint32_t, H1_4)
-
-void HELPER(sme_mova_zc_d)(void *vd, void *za, void *vg, uint32_t desc)
-{
- int i, oprsz = simd_oprsz(desc) / 8;
- uint8_t *pg = vg;
- uint64_t *d = vd;
- uint64_t *a = za;
-
- for (i = 0; i < oprsz; i++) {
- if (pg[H1(i)] & 1) {
- d[i] = a[tile_vslice_index(i)];
- }
- }
-}
-
-void HELPER(sme_mova_zc_q)(void *vd, void *za, void *vg, uint32_t desc)
-{
- int i, oprsz = simd_oprsz(desc) / 16;
- uint16_t *pg = vg;
- Int128 *d = vd;
- Int128 *a = za;
-
- /*
- * Int128 is used here simply to copy 16 bytes, and to simplify
- * the address arithmetic.
- */
- for (i = 0; i < oprsz; i++) {
- if (pg[H2(i)] & 1) {
- d[i] = a[tile_vslice_index(i)];
- }
- }
-}
-
-#undef DO_MOVA_Z
-
-/*
- * Clear elements in a tile slice comprising len bytes.
- */
-
-typedef void ClearFn(void *ptr, size_t off, size_t len);
-
-static void clear_horizontal(void *ptr, size_t off, size_t len)
-{
- memset(ptr + off, 0, len);
-}
-
-static void clear_vertical_b(void *vptr, size_t off, size_t len)
-{
- for (size_t i = 0; i < len; ++i) {
- *(uint8_t *)(vptr + tile_vslice_offset(i + off)) = 0;
- }
-}
-
-static void clear_vertical_h(void *vptr, size_t off, size_t len)
-{
- for (size_t i = 0; i < len; i += 2) {
- *(uint16_t *)(vptr + tile_vslice_offset(i + off)) = 0;
- }
-}
-
-static void clear_vertical_s(void *vptr, size_t off, size_t len)
-{
- for (size_t i = 0; i < len; i += 4) {
- *(uint32_t *)(vptr + tile_vslice_offset(i + off)) = 0;
- }
-}
-
-static void clear_vertical_d(void *vptr, size_t off, size_t len)
-{
- for (size_t i = 0; i < len; i += 8) {
- *(uint64_t *)(vptr + tile_vslice_offset(i + off)) = 0;
- }
-}
-
-static void clear_vertical_q(void *vptr, size_t off, size_t len)
-{
- for (size_t i = 0; i < len; i += 16) {
- memset(vptr + tile_vslice_offset(i + off), 0, 16);
- }
-}
-
-/*
- * Copy elements from an array into a tile slice comprising len bytes.
- */
-
-typedef void CopyFn(void *dst, const void *src, size_t len);
-
-static void copy_horizontal(void *dst, const void *src, size_t len)
-{
- memcpy(dst, src, len);
-}
-
-static void copy_vertical_b(void *vdst, const void *vsrc, size_t len)
-{
- const uint8_t *src = vsrc;
- uint8_t *dst = vdst;
- size_t i;
-
- for (i = 0; i < len; ++i) {
- dst[tile_vslice_index(i)] = src[i];
- }
-}
-
-static void copy_vertical_h(void *vdst, const void *vsrc, size_t len)
-{
- const uint16_t *src = vsrc;
- uint16_t *dst = vdst;
- size_t i;
-
- for (i = 0; i < len / 2; ++i) {
- dst[tile_vslice_index(i)] = src[i];
- }
-}
-
-static void copy_vertical_s(void *vdst, const void *vsrc, size_t len)
-{
- const uint32_t *src = vsrc;
- uint32_t *dst = vdst;
- size_t i;
-
- for (i = 0; i < len / 4; ++i) {
- dst[tile_vslice_index(i)] = src[i];
- }
-}
-
-static void copy_vertical_d(void *vdst, const void *vsrc, size_t len)
-{
- const uint64_t *src = vsrc;
- uint64_t *dst = vdst;
- size_t i;
-
- for (i = 0; i < len / 8; ++i) {
- dst[tile_vslice_index(i)] = src[i];
- }
-}
-
-static void copy_vertical_q(void *vdst, const void *vsrc, size_t len)
-{
- for (size_t i = 0; i < len; i += 16) {
- memcpy(vdst + tile_vslice_offset(i), vsrc + i, 16);
- }
-}
-
-/*
- * Host and TLB primitives for vertical tile slice addressing.
- */
-
-#define DO_LD(NAME, TYPE, HOST, TLB) \
-static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host) \
-{ \
- TYPE val = HOST(host); \
- *(TYPE *)(za + tile_vslice_offset(off)) = val; \
-} \
-static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za, \
- intptr_t off, target_ulong addr, uintptr_t ra) \
-{ \
- TYPE val = TLB(env, useronly_clean_ptr(addr), ra); \
- *(TYPE *)(za + tile_vslice_offset(off)) = val; \
-}
-
-#define DO_ST(NAME, TYPE, HOST, TLB) \
-static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host) \
-{ \
- TYPE val = *(TYPE *)(za + tile_vslice_offset(off)); \
- HOST(host, val); \
-} \
-static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za, \
- intptr_t off, target_ulong addr, uintptr_t ra) \
-{ \
- TYPE val = *(TYPE *)(za + tile_vslice_offset(off)); \
- TLB(env, useronly_clean_ptr(addr), val, ra); \
-}
-
-/*
- * The ARMVectorReg elements are stored in host-endian 64-bit units.
- * For 128-bit quantities, the sequence defined by the Elem[] pseudocode
- * corresponds to storing the two 64-bit pieces in little-endian order.
- */
-#define DO_LDQ(HNAME, VNAME, BE, HOST, TLB) \
-static inline void HNAME##_host(void *za, intptr_t off, void *host) \
-{ \
- uint64_t val0 = HOST(host), val1 = HOST(host + 8); \
- uint64_t *ptr = za + off; \
- ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1; \
-} \
-static inline void VNAME##_v_host(void *za, intptr_t off, void *host) \
-{ \
- HNAME##_host(za, tile_vslice_offset(off), host); \
-} \
-static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off, \
- target_ulong addr, uintptr_t ra) \
-{ \
- uint64_t val0 = TLB(env, useronly_clean_ptr(addr), ra); \
- uint64_t val1 = TLB(env, useronly_clean_ptr(addr + 8), ra); \
- uint64_t *ptr = za + off; \
- ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1; \
-} \
-static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off, \
- target_ulong addr, uintptr_t ra) \
-{ \
- HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra); \
-}
-
-#define DO_STQ(HNAME, VNAME, BE, HOST, TLB) \
-static inline void HNAME##_host(void *za, intptr_t off, void *host) \
-{ \
- uint64_t *ptr = za + off; \
- HOST(host, ptr[BE]); \
- HOST(host + 1, ptr[!BE]); \
-} \
-static inline void VNAME##_v_host(void *za, intptr_t off, void *host) \
-{ \
- HNAME##_host(za, tile_vslice_offset(off), host); \
-} \
-static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off, \
- target_ulong addr, uintptr_t ra) \
-{ \
- uint64_t *ptr = za + off; \
- TLB(env, useronly_clean_ptr(addr), ptr[BE], ra); \
- TLB(env, useronly_clean_ptr(addr + 8), ptr[!BE], ra); \
-} \
-static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off, \
- target_ulong addr, uintptr_t ra) \
-{ \
- HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra); \
-}
-
-DO_LD(ld1b, uint8_t, ldub_p, cpu_ldub_data_ra)
-DO_LD(ld1h_be, uint16_t, lduw_be_p, cpu_lduw_be_data_ra)
-DO_LD(ld1h_le, uint16_t, lduw_le_p, cpu_lduw_le_data_ra)
-DO_LD(ld1s_be, uint32_t, ldl_be_p, cpu_ldl_be_data_ra)
-DO_LD(ld1s_le, uint32_t, ldl_le_p, cpu_ldl_le_data_ra)
-DO_LD(ld1d_be, uint64_t, ldq_be_p, cpu_ldq_be_data_ra)
-DO_LD(ld1d_le, uint64_t, ldq_le_p, cpu_ldq_le_data_ra)
-
-DO_LDQ(sve_ld1qq_be, sme_ld1q_be, 1, ldq_be_p, cpu_ldq_be_data_ra)
-DO_LDQ(sve_ld1qq_le, sme_ld1q_le, 0, ldq_le_p, cpu_ldq_le_data_ra)
-
-DO_ST(st1b, uint8_t, stb_p, cpu_stb_data_ra)
-DO_ST(st1h_be, uint16_t, stw_be_p, cpu_stw_be_data_ra)
-DO_ST(st1h_le, uint16_t, stw_le_p, cpu_stw_le_data_ra)
-DO_ST(st1s_be, uint32_t, stl_be_p, cpu_stl_be_data_ra)
-DO_ST(st1s_le, uint32_t, stl_le_p, cpu_stl_le_data_ra)
-DO_ST(st1d_be, uint64_t, stq_be_p, cpu_stq_be_data_ra)
-DO_ST(st1d_le, uint64_t, stq_le_p, cpu_stq_le_data_ra)
-
-DO_STQ(sve_st1qq_be, sme_st1q_be, 1, stq_be_p, cpu_stq_be_data_ra)
-DO_STQ(sve_st1qq_le, sme_st1q_le, 0, stq_le_p, cpu_stq_le_data_ra)
-
-#undef DO_LD
-#undef DO_ST
-#undef DO_LDQ
-#undef DO_STQ
-
-/*
- * Common helper for all contiguous predicated loads.
- */
-
-static inline QEMU_ALWAYS_INLINE
-void sme_ld1(CPUARMState *env, void *za, uint64_t *vg,
- const target_ulong addr, uint32_t desc, const uintptr_t ra,
- const int esz, uint32_t mtedesc, bool vertical,
- sve_ldst1_host_fn *host_fn,
- sve_ldst1_tlb_fn *tlb_fn,
- ClearFn *clr_fn,
- CopyFn *cpy_fn)
-{
- const intptr_t reg_max = simd_oprsz(desc);
- const intptr_t esize = 1 << esz;
- intptr_t reg_off, reg_last;
- SVEContLdSt info;
- void *host;
- int flags;
-
- /* Find the active elements. */
- if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) {
- /* The entire predicate was false; no load occurs. */
- clr_fn(za, 0, reg_max);
- return;
- }
-
- /* Probe the page(s). Exit with exception for any invalid page. */
- sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, ra);
-
- /* Handle watchpoints for all active elements. */
- sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize,
- BP_MEM_READ, ra);
-
- /*
- * Handle mte checks for all active elements.
- * Since TBI must be set for MTE, !mtedesc => !mte_active.
- */
- if (mtedesc) {
- sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize,
- mtedesc, ra);
- }
-
- flags = info.page[0].flags | info.page[1].flags;
- if (unlikely(flags != 0)) {
-#ifdef CONFIG_USER_ONLY
- g_assert_not_reached();
-#else
- /*
- * At least one page includes MMIO.
- * Any bus operation can fail with cpu_transaction_failed,
- * which for ARM will raise SyncExternal. Perform the load
- * into scratch memory to preserve register state until the end.
- */
- ARMVectorReg scratch = { };
-
- reg_off = info.reg_off_first[0];
- reg_last = info.reg_off_last[1];
- if (reg_last < 0) {
- reg_last = info.reg_off_split;
- if (reg_last < 0) {
- reg_last = info.reg_off_last[0];
- }
- }
-
- do {
- uint64_t pg = vg[reg_off >> 6];
- do {
- if ((pg >> (reg_off & 63)) & 1) {
- tlb_fn(env, &scratch, reg_off, addr + reg_off, ra);
- }
- reg_off += esize;
- } while (reg_off & 63);
- } while (reg_off <= reg_last);
-
- cpy_fn(za, &scratch, reg_max);
- return;
-#endif
- }
-
- /* The entire operation is in RAM, on valid pages. */
-
- reg_off = info.reg_off_first[0];
- reg_last = info.reg_off_last[0];
- host = info.page[0].host;
-
- if (!vertical) {
- memset(za, 0, reg_max);
- } else if (reg_off) {
- clr_fn(za, 0, reg_off);
- }
-
- while (reg_off <= reg_last) {
- uint64_t pg = vg[reg_off >> 6];
- do {
- if ((pg >> (reg_off & 63)) & 1) {
- host_fn(za, reg_off, host + reg_off);
- } else if (vertical) {
- clr_fn(za, reg_off, esize);
- }
- reg_off += esize;
- } while (reg_off <= reg_last && (reg_off & 63));
- }
-
- /*
- * Use the slow path to manage the cross-page misalignment.
- * But we know this is RAM and cannot trap.
- */
- reg_off = info.reg_off_split;
- if (unlikely(reg_off >= 0)) {
- tlb_fn(env, za, reg_off, addr + reg_off, ra);
- }
-
- reg_off = info.reg_off_first[1];
- if (unlikely(reg_off >= 0)) {
- reg_last = info.reg_off_last[1];
- host = info.page[1].host;
-
- do {
- uint64_t pg = vg[reg_off >> 6];
- do {
- if ((pg >> (reg_off & 63)) & 1) {
- host_fn(za, reg_off, host + reg_off);
- } else if (vertical) {
- clr_fn(za, reg_off, esize);
- }
- reg_off += esize;
- } while (reg_off & 63);
- } while (reg_off <= reg_last);
- }
-}
-
-static inline QEMU_ALWAYS_INLINE
-void sme_ld1_mte(CPUARMState *env, void *za, uint64_t *vg,
- target_ulong addr, uint32_t desc, uintptr_t ra,
- const int esz, bool vertical,
- sve_ldst1_host_fn *host_fn,
- sve_ldst1_tlb_fn *tlb_fn,
- ClearFn *clr_fn,
- CopyFn *cpy_fn)
-{
- uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
- int bit55 = extract64(addr, 55, 1);
-
- /* Remove mtedesc from the normal sve descriptor. */
- desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
-
- /* Perform gross MTE suppression early. */
- if (!tbi_check(desc, bit55) ||
- tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
- mtedesc = 0;
- }
-
- sme_ld1(env, za, vg, addr, desc, ra, esz, mtedesc, vertical,
- host_fn, tlb_fn, clr_fn, cpy_fn);
-}
-
-#define DO_LD(L, END, ESZ) \
-void HELPER(sme_ld1##L##END##_h)(CPUARMState *env, void *za, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false, \
- sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb, \
- clear_horizontal, copy_horizontal); \
-} \
-void HELPER(sme_ld1##L##END##_v)(CPUARMState *env, void *za, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true, \
- sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb, \
- clear_vertical_##L, copy_vertical_##L); \
-} \
-void HELPER(sme_ld1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false, \
- sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb, \
- clear_horizontal, copy_horizontal); \
-} \
-void HELPER(sme_ld1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true, \
- sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb, \
- clear_vertical_##L, copy_vertical_##L); \
-}
-
-DO_LD(b, , MO_8)
-DO_LD(h, _be, MO_16)
-DO_LD(h, _le, MO_16)
-DO_LD(s, _be, MO_32)
-DO_LD(s, _le, MO_32)
-DO_LD(d, _be, MO_64)
-DO_LD(d, _le, MO_64)
-DO_LD(q, _be, MO_128)
-DO_LD(q, _le, MO_128)
-
-#undef DO_LD
-
-/*
- * Common helper for all contiguous predicated stores.
- */
-
-static inline QEMU_ALWAYS_INLINE
-void sme_st1(CPUARMState *env, void *za, uint64_t *vg,
- const target_ulong addr, uint32_t desc, const uintptr_t ra,
- const int esz, uint32_t mtedesc, bool vertical,
- sve_ldst1_host_fn *host_fn,
- sve_ldst1_tlb_fn *tlb_fn)
-{
- const intptr_t reg_max = simd_oprsz(desc);
- const intptr_t esize = 1 << esz;
- intptr_t reg_off, reg_last;
- SVEContLdSt info;
- void *host;
- int flags;
-
- /* Find the active elements. */
- if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) {
- /* The entire predicate was false; no store occurs. */
- return;
- }
-
- /* Probe the page(s). Exit with exception for any invalid page. */
- sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, ra);
-
- /* Handle watchpoints for all active elements. */
- sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize,
- BP_MEM_WRITE, ra);
-
- /*
- * Handle mte checks for all active elements.
- * Since TBI must be set for MTE, !mtedesc => !mte_active.
- */
- if (mtedesc) {
- sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize,
- mtedesc, ra);
- }
-
- flags = info.page[0].flags | info.page[1].flags;
- if (unlikely(flags != 0)) {
-#ifdef CONFIG_USER_ONLY
- g_assert_not_reached();
-#else
- /*
- * At least one page includes MMIO.
- * Any bus operation can fail with cpu_transaction_failed,
- * which for ARM will raise SyncExternal. We cannot avoid
- * this fault and will leave with the store incomplete.
- */
- reg_off = info.reg_off_first[0];
- reg_last = info.reg_off_last[1];
- if (reg_last < 0) {
- reg_last = info.reg_off_split;
- if (reg_last < 0) {
- reg_last = info.reg_off_last[0];
- }
- }
-
- do {
- uint64_t pg = vg[reg_off >> 6];
- do {
- if ((pg >> (reg_off & 63)) & 1) {
- tlb_fn(env, za, reg_off, addr + reg_off, ra);
- }
- reg_off += esize;
- } while (reg_off & 63);
- } while (reg_off <= reg_last);
- return;
-#endif
- }
-
- reg_off = info.reg_off_first[0];
- reg_last = info.reg_off_last[0];
- host = info.page[0].host;
-
- while (reg_off <= reg_last) {
- uint64_t pg = vg[reg_off >> 6];
- do {
- if ((pg >> (reg_off & 63)) & 1) {
- host_fn(za, reg_off, host + reg_off);
- }
- reg_off += 1 << esz;
- } while (reg_off <= reg_last && (reg_off & 63));
- }
-
- /*
- * Use the slow path to manage the cross-page misalignment.
- * But we know this is RAM and cannot trap.
- */
- reg_off = info.reg_off_split;
- if (unlikely(reg_off >= 0)) {
- tlb_fn(env, za, reg_off, addr + reg_off, ra);
- }
-
- reg_off = info.reg_off_first[1];
- if (unlikely(reg_off >= 0)) {
- reg_last = info.reg_off_last[1];
- host = info.page[1].host;
-
- do {
- uint64_t pg = vg[reg_off >> 6];
- do {
- if ((pg >> (reg_off & 63)) & 1) {
- host_fn(za, reg_off, host + reg_off);
- }
- reg_off += 1 << esz;
- } while (reg_off & 63);
- } while (reg_off <= reg_last);
- }
-}
-
-static inline QEMU_ALWAYS_INLINE
-void sme_st1_mte(CPUARMState *env, void *za, uint64_t *vg, target_ulong addr,
- uint32_t desc, uintptr_t ra, int esz, bool vertical,
- sve_ldst1_host_fn *host_fn,
- sve_ldst1_tlb_fn *tlb_fn)
-{
- uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
- int bit55 = extract64(addr, 55, 1);
-
- /* Remove mtedesc from the normal sve descriptor. */
- desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
-
- /* Perform gross MTE suppression early. */
- if (!tbi_check(desc, bit55) ||
- tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
- mtedesc = 0;
- }
-
- sme_st1(env, za, vg, addr, desc, ra, esz, mtedesc,
- vertical, host_fn, tlb_fn);
-}
-
-#define DO_ST(L, END, ESZ) \
-void HELPER(sme_st1##L##END##_h)(CPUARMState *env, void *za, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false, \
- sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb); \
-} \
-void HELPER(sme_st1##L##END##_v)(CPUARMState *env, void *za, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true, \
- sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb); \
-} \
-void HELPER(sme_st1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false, \
- sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb); \
-} \
-void HELPER(sme_st1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true, \
- sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb); \
-}
-
-DO_ST(b, , MO_8)
-DO_ST(h, _be, MO_16)
-DO_ST(h, _le, MO_16)
-DO_ST(s, _be, MO_32)
-DO_ST(s, _le, MO_32)
-DO_ST(d, _be, MO_64)
-DO_ST(d, _le, MO_64)
-DO_ST(q, _be, MO_128)
-DO_ST(q, _le, MO_128)
-
-#undef DO_ST
-
-void HELPER(sme_addha_s)(void *vzda, void *vzn, void *vpn,
- void *vpm, uint32_t desc)
-{
- intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
- uint64_t *pn = vpn, *pm = vpm;
- uint32_t *zda = vzda, *zn = vzn;
-
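- /*
- * Predicate bits are one per vector byte, so for 32-bit elements the
- * governing bit is every fourth bit: each uint64_t predicate word
- * covers 16 rows (or columns), hence the ">> 4" indexing and the
- * "pa >>= 4" / "pb >>= 4" strides below.
- */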
- for (row = 0; row < oprsz; ) {
- uint64_t pa = pn[row >> 4];
- do {
- if (pa & 1) {
- for (col = 0; col < oprsz; ) {
- uint64_t pb = pm[col >> 4];
- do {
- if (pb & 1) {
- zda[tile_vslice_index(row) + H4(col)] += zn[H4(col)];
- }
- pb >>= 4;
- } while (++col & 15);
- }
- }
- pa >>= 4;
- } while (++row & 15);
- }
-}
-
-void HELPER(sme_addha_d)(void *vzda, void *vzn, void *vpn,
- void *vpm, uint32_t desc)
-{
- intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
- uint8_t *pn = vpn, *pm = vpm;
- uint64_t *zda = vzda, *zn = vzn;
-
- for (row = 0; row < oprsz; ++row) {
- if (pn[H1(row)] & 1) {
- for (col = 0; col < oprsz; ++col) {
- if (pm[H1(col)] & 1) {
- zda[tile_vslice_index(row) + col] += zn[col];
- }
- }
- }
- }
-}
-
-void HELPER(sme_addva_s)(void *vzda, void *vzn, void *vpn,
- void *vpm, uint32_t desc)
-{
- intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
- uint64_t *pn = vpn, *pm = vpm;
- uint32_t *zda = vzda, *zn = vzn;
-
- for (row = 0; row < oprsz; ) {
- uint64_t pa = pn[row >> 4];
- do {
- if (pa & 1) {
- uint32_t zn_row = zn[H4(row)];
- for (col = 0; col < oprsz; ) {
- uint64_t pb = pm[col >> 4];
- do {
- if (pb & 1) {
- zda[tile_vslice_index(row) + H4(col)] += zn_row;
- }
- pb >>= 4;
- } while (++col & 15);
- }
- }
- pa >>= 4;
- } while (++row & 15);
- }
-}
-
-void HELPER(sme_addva_d)(void *vzda, void *vzn, void *vpn,
- void *vpm, uint32_t desc)
-{
- intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
- uint8_t *pn = vpn, *pm = vpm;
- uint64_t *zda = vzda, *zn = vzn;
-
- for (row = 0; row < oprsz; ++row) {
- if (pn[H1(row)] & 1) {
- uint64_t zn_row = zn[row];
- for (col = 0; col < oprsz; ++col) {
- if (pm[H1(col)] & 1) {
- zda[tile_vslice_index(row) + col] += zn_row;
- }
- }
- }
- }
-}
-
-void HELPER(sme_fmopa_s)(void *vza, void *vzn, void *vzm, void *vpn,
- void *vpm, void *vst, uint32_t desc)
-{
- intptr_t row, col, oprsz = simd_maxsz(desc);
- uint32_t neg = simd_data(desc) << 31;
- uint16_t *pn = vpn, *pm = vpm;
- float_status fpst;
-
- /*
- * Make a copy of float_status because this operation does not
- * update the cumulative fp exception status. It also produces
- * default nans.
- */
- fpst = *(float_status *)vst;
- set_default_nan_mode(true, &fpst);
-
- for (row = 0; row < oprsz; ) {
- uint16_t pa = pn[H2(row >> 4)];
- do {
- if (pa & 1) {
- void *vza_row = vza + tile_vslice_offset(row);
- uint32_t n = *(uint32_t *)(vzn + H1_4(row)) ^ neg;
-
- for (col = 0; col < oprsz; ) {
- uint16_t pb = pm[H2(col >> 4)];
- do {
- if (pb & 1) {
- uint32_t *a = vza_row + H1_4(col);
- uint32_t *m = vzm + H1_4(col);
-                            *a = float32_muladd(n, *m, *a, 0, &fpst);
- }
- col += 4;
- pb >>= 4;
- } while (col & 15);
- }
- }
- row += 4;
- pa >>= 4;
- } while (row & 15);
- }
-}
-
-void HELPER(sme_fmopa_d)(void *vza, void *vzn, void *vzm, void *vpn,
- void *vpm, void *vst, uint32_t desc)
-{
- intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
- uint64_t neg = (uint64_t)simd_data(desc) << 63;
- uint64_t *za = vza, *zn = vzn, *zm = vzm;
- uint8_t *pn = vpn, *pm = vpm;
- float_status fpst = *(float_status *)vst;
-
- set_default_nan_mode(true, &fpst);
-
- for (row = 0; row < oprsz; ++row) {
- if (pn[H1(row)] & 1) {
- uint64_t *za_row = &za[tile_vslice_index(row)];
- uint64_t n = zn[row] ^ neg;
-
- for (col = 0; col < oprsz; ++col) {
- if (pm[H1(col)] & 1) {
- uint64_t *a = &za_row[col];
- *a = float64_muladd(n, zm[col], *a, 0, &fpst);
- }
- }
- }
- }
-}
-
-/*
- * Alter PAIR as needed for controlling predicates being false,
- * and for NEG on an enabled row element.
- */
-static inline uint32_t f16mop_adj_pair(uint32_t pair, uint32_t pg, uint32_t neg)
-{
- /*
- * The pseudocode uses a conditional negate after the conditional zero.
- * It is simpler here to unconditionally negate before conditional zero.
- */
- pair ^= neg;
- if (!(pg & 1)) {
- pair &= 0xffff0000u;
- }
- if (!(pg & 4)) {
- pair &= 0x0000ffffu;
- }
- return pair;
-}
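
The equivalence claimed in that comment is easy to check in isolation. The following standalone sketch (hypothetical demo code, not part of this file) compares the order used here, negate unconditionally and then zero the disabled halves, against the pseudocode order of zeroing first and negating only the enabled halves, for every predicate combination:

    #include <assert.h>
    #include <stdint.h>

    /* Order used by f16mop_adj_pair: unconditional negate, then zero. */
    static uint32_t adj_neg_then_zero(uint32_t pair, uint32_t pg, uint32_t neg)
    {
        pair ^= neg;
        if (!(pg & 1)) {
            pair &= 0xffff0000u;
        }
        if (!(pg & 4)) {
            pair &= 0x0000ffffu;
        }
        return pair;
    }

    /* Pseudocode order: zero the disabled halves, then negate the enabled ones. */
    static uint32_t adj_zero_then_neg(uint32_t pair, uint32_t pg, uint32_t neg)
    {
        if (!(pg & 1)) {
            pair &= 0xffff0000u;
        }
        if (!(pg & 4)) {
            pair &= 0x0000ffffu;
        }
        if (pg & 1) {
            pair ^= neg & 0x0000ffffu;
        }
        if (pg & 4) {
            pair ^= neg & 0xffff0000u;
        }
        return pair;
    }

    int main(void)
    {
        uint32_t pair = 0xbc003c00u;   /* fp16 1.0 in the low half, -1.0 in the high */

        for (uint32_t pg = 0; pg < 8; pg++) {
            for (int s = 0; s < 2; s++) {
                uint32_t neg = s ? 0x80008000u : 0;
                assert(adj_neg_then_zero(pair, pg, neg) ==
                       adj_zero_then_neg(pair, pg, neg));
            }
        }
        return 0;
    }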
-
-static float32 f16_dotadd(float32 sum, uint32_t e1, uint32_t e2,
- float_status *s_std, float_status *s_odd)
-{
- float64 e1r = float16_to_float64(e1 & 0xffff, true, s_std);
- float64 e1c = float16_to_float64(e1 >> 16, true, s_std);
- float64 e2r = float16_to_float64(e2 & 0xffff, true, s_std);
- float64 e2c = float16_to_float64(e2 >> 16, true, s_std);
- float64 t64;
- float32 t32;
-
- /*
- * The ARM pseudocode function FPDot performs both multiplies
- * and the add with a single rounding operation. Emulate this
- * by performing the first multiply in round-to-odd, then doing
- * the second multiply as fused multiply-add, and rounding to
- * float32 all in one step.
- */
- t64 = float64_mul(e1r, e2r, s_odd);
- t64 = float64r32_muladd(e1c, e2c, t64, 0, s_std);
-
- /* This conversion is exact, because we've already rounded. */
- t32 = float64_to_float32(t64, s_std);
-
- /* The final accumulation step is not fused. */
- return float32_add(sum, t32, s_std);
-}
-
-void HELPER(sme_fmopa_h)(void *vza, void *vzn, void *vzm, void *vpn,
- void *vpm, void *vst, uint32_t desc)
-{
- intptr_t row, col, oprsz = simd_maxsz(desc);
- uint32_t neg = simd_data(desc) * 0x80008000u;
- uint16_t *pn = vpn, *pm = vpm;
- float_status fpst_odd, fpst_std;
-
- /*
- * Make a copy of float_status because this operation does not
- * update the cumulative fp exception status. It also produces
- * default nans. Make a second copy with round-to-odd -- see above.
- */
- fpst_std = *(float_status *)vst;
- set_default_nan_mode(true, &fpst_std);
- fpst_odd = fpst_std;
- set_float_rounding_mode(float_round_to_odd, &fpst_odd);
-
- for (row = 0; row < oprsz; ) {
- uint16_t prow = pn[H2(row >> 4)];
- do {
- void *vza_row = vza + tile_vslice_offset(row);
- uint32_t n = *(uint32_t *)(vzn + H1_4(row));
-
- n = f16mop_adj_pair(n, prow, neg);
-
- for (col = 0; col < oprsz; ) {
- uint16_t pcol = pm[H2(col >> 4)];
- do {
-                    if (prow & pcol & 0b0101) {
-                        uint32_t *a = vza_row + H1_4(col);
-                        uint32_t m = *(uint32_t *)(vzm + H1_4(col));
-
-                        m = f16mop_adj_pair(m, pcol, 0);
-                        *a = f16_dotadd(*a, n, m, &fpst_std, &fpst_odd);
-                    }
-                    col += 4;
-                    pcol >>= 4;
-                } while (col & 15);
- }
- row += 4;
- prow >>= 4;
- } while (row & 15);
- }
-}
-
-void HELPER(sme_bfmopa)(void *vza, void *vzn, void *vzm, void *vpn,
- void *vpm, uint32_t desc)
-{
- intptr_t row, col, oprsz = simd_maxsz(desc);
- uint32_t neg = simd_data(desc) * 0x80008000u;
- uint16_t *pn = vpn, *pm = vpm;
-
- for (row = 0; row < oprsz; ) {
- uint16_t prow = pn[H2(row >> 4)];
- do {
- void *vza_row = vza + tile_vslice_offset(row);
- uint32_t n = *(uint32_t *)(vzn + H1_4(row));
-
- n = f16mop_adj_pair(n, prow, neg);
-
- for (col = 0; col < oprsz; ) {
- uint16_t pcol = pm[H2(col >> 4)];
- do {
-                    if (prow & pcol & 0b0101) {
-                        uint32_t *a = vza_row + H1_4(col);
-                        uint32_t m = *(uint32_t *)(vzm + H1_4(col));
-
-                        m = f16mop_adj_pair(m, pcol, 0);
-                        *a = bfdotadd(*a, n, m);
-                    }
-                    col += 4;
-                    pcol >>= 4;
-                } while (col & 15);
- }
- row += 4;
- prow >>= 4;
- } while (row & 15);
- }
-}
-
-typedef uint64_t IMOPFn(uint64_t, uint64_t, uint64_t, uint8_t, bool);
-
-static inline void do_imopa(uint64_t *za, uint64_t *zn, uint64_t *zm,
- uint8_t *pn, uint8_t *pm,
- uint32_t desc, IMOPFn *fn)
-{
- intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
- bool neg = simd_data(desc);
-
- for (row = 0; row < oprsz; ++row) {
- uint8_t pa = pn[H1(row)];
- uint64_t *za_row = &za[tile_vslice_index(row)];
- uint64_t n = zn[row];
-
- for (col = 0; col < oprsz; ++col) {
- uint8_t pb = pm[H1(col)];
- uint64_t *a = &za_row[col];
-
- *a = fn(n, zm[col], *a, pa & pb, neg);
- }
- }
-}
-
-#define DEF_IMOP_32(NAME, NTYPE, MTYPE) \
-static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
-{ \
- uint32_t sum0 = 0, sum1 = 0; \
- /* Apply P to N as a mask, making the inactive elements 0. */ \
- n &= expand_pred_b(p); \
- sum0 += (NTYPE)(n >> 0) * (MTYPE)(m >> 0); \
- sum0 += (NTYPE)(n >> 8) * (MTYPE)(m >> 8); \
- sum0 += (NTYPE)(n >> 16) * (MTYPE)(m >> 16); \
- sum0 += (NTYPE)(n >> 24) * (MTYPE)(m >> 24); \
- sum1 += (NTYPE)(n >> 32) * (MTYPE)(m >> 32); \
- sum1 += (NTYPE)(n >> 40) * (MTYPE)(m >> 40); \
- sum1 += (NTYPE)(n >> 48) * (MTYPE)(m >> 48); \
- sum1 += (NTYPE)(n >> 56) * (MTYPE)(m >> 56); \
- if (neg) { \
- sum0 = (uint32_t)a - sum0, sum1 = (uint32_t)(a >> 32) - sum1; \
- } else { \
- sum0 = (uint32_t)a + sum0, sum1 = (uint32_t)(a >> 32) + sum1; \
- } \
- return ((uint64_t)sum1 << 32) | sum0; \
-}
-
-#define DEF_IMOP_64(NAME, NTYPE, MTYPE) \
-static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
-{ \
- uint64_t sum = 0; \
- /* Apply P to N as a mask, making the inactive elements 0. */ \
- n &= expand_pred_h(p); \
- sum += (NTYPE)(n >> 0) * (MTYPE)(m >> 0); \
- sum += (NTYPE)(n >> 16) * (MTYPE)(m >> 16); \
- sum += (NTYPE)(n >> 32) * (MTYPE)(m >> 32); \
- sum += (NTYPE)(n >> 48) * (MTYPE)(m >> 48); \
- return neg ? a - sum : a + sum; \
-}
-
-DEF_IMOP_32(smopa_s, int8_t, int8_t)
-DEF_IMOP_32(umopa_s, uint8_t, uint8_t)
-DEF_IMOP_32(sumopa_s, int8_t, uint8_t)
-DEF_IMOP_32(usmopa_s, uint8_t, int8_t)
-
-DEF_IMOP_64(smopa_d, int16_t, int16_t)
-DEF_IMOP_64(umopa_d, uint16_t, uint16_t)
-DEF_IMOP_64(sumopa_d, int16_t, uint16_t)
-DEF_IMOP_64(usmopa_d, uint16_t, int16_t)
-
-#define DEF_IMOPH(NAME) \
- void HELPER(sme_##NAME)(void *vza, void *vzn, void *vzm, void *vpn, \
- void *vpm, uint32_t desc) \
- { do_imopa(vza, vzn, vzm, vpn, vpm, desc, NAME); }
-
-DEF_IMOPH(smopa_s)
-DEF_IMOPH(umopa_s)
-DEF_IMOPH(sumopa_s)
-DEF_IMOPH(usmopa_s)
-DEF_IMOPH(smopa_d)
-DEF_IMOPH(umopa_d)
-DEF_IMOPH(sumopa_d)
-DEF_IMOPH(usmopa_d)
+++ /dev/null
-/*
- * ARM SVE Operations
- *
- * Copyright (c) 2018 Linaro, Ltd.
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "qemu/osdep.h"
-#include "cpu.h"
-#include "internals.h"
-#include "exec/exec-all.h"
-#include "exec/helper-proto.h"
-#include "tcg/tcg-gvec-desc.h"
-#include "fpu/softfloat.h"
-#include "tcg/tcg.h"
-#include "vec_internal.h"
-#include "sve_ldst_internal.h"
-
-
-/* Return a value for NZCV as per the ARM PredTest pseudofunction.
- *
- * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
- * and bit 0 set if C is set. Compare the definitions of these variables
- * within CPUARMState.
- */
-
-/* For no G bits set, NZCV = C. */
-#define PREDTEST_INIT 1
-
-/* This is an iterative function, called for each Pd and Pg word
- * moving forward.
- */
-static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
-{
- if (likely(g)) {
- /* Compute N from first D & G.
- Use bit 2 to signal first G bit seen. */
- if (!(flags & 4)) {
- flags |= ((d & (g & -g)) != 0) << 31;
- flags |= 4;
- }
-
- /* Accumulate Z from each D & G. */
- flags |= ((d & g) != 0) << 1;
-
- /* Compute C from last !(D & G). Replace previous. */
- flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
- }
- return flags;
-}
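
As a concrete reading of the flag encoding described above, here is a hypothetical standalone sketch (not QEMU code; pow2floor and deposit32 are re-implemented locally just for the demo) that runs one predicate word through the same forward iteration and decodes N, Z and C:

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t pow2floor(uint64_t v)   /* isolate the highest set bit */
    {
        while (v & (v - 1)) {
            v &= v - 1;
        }
        return v;
    }

    static uint32_t deposit32(uint32_t x, int pos, int len, uint32_t val)
    {
        uint32_t mask = ((1u << len) - 1) << pos;
        return (x & ~mask) | ((val << pos) & mask);
    }

    static uint32_t predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
    {
        if (g) {
            if (!(flags & 4)) {
                flags |= (uint32_t)((d & (g & -g)) != 0) << 31; /* N: first active */
                flags |= 4;
            }
            flags |= ((d & g) != 0) << 1;                       /* Z clear if any set */
            flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0); /* C: last active clear */
        }
        return flags;
    }

    int main(void)
    {
        /* Active elements 0..2; element 0 clear, elements 1..2 set. */
        uint32_t f = predtest_fwd(0x6, 0x7, 1 /* PREDTEST_INIT */);

        printf("N=%u Z=%u C=%u\n", f >> 31, (f & 2) ? 0u : 1u, f & 1);
        /* prints N=0 Z=0 C=0 */
        return 0;
    }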
-
-/* This is an iterative function, called for each Pd and Pg word
- * moving backward.
- */
-static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
-{
- if (likely(g)) {
- /* Compute C from first (i.e last) !(D & G).
- Use bit 2 to signal first G bit seen. */
- if (!(flags & 4)) {
- flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
- flags |= (d & pow2floor(g)) == 0;
- }
-
- /* Accumulate Z from each D & G. */
- flags |= ((d & g) != 0) << 1;
-
- /* Compute N from last (i.e first) D & G. Replace previous. */
- flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
- }
- return flags;
-}
-
-/* The same for a single word predicate. */
-uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
-{
- return iter_predtest_fwd(d, g, PREDTEST_INIT);
-}
-
-/* The same for a multi-word predicate. */
-uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
-{
- uint32_t flags = PREDTEST_INIT;
- uint64_t *d = vd, *g = vg;
- uintptr_t i = 0;
-
- do {
- flags = iter_predtest_fwd(d[i], g[i], flags);
- } while (++i < words);
-
- return flags;
-}
-
-/* Similarly for single word elements. */
-static inline uint64_t expand_pred_s(uint8_t byte)
-{
- static const uint64_t word[] = {
- [0x01] = 0x00000000ffffffffull,
- [0x10] = 0xffffffff00000000ull,
- [0x11] = 0xffffffffffffffffull,
- };
- return word[byte & 0x11];
-}
-
-#define LOGICAL_PPPP(NAME, FUNC) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
-{ \
- uintptr_t opr_sz = simd_oprsz(desc); \
- uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
- uintptr_t i; \
- for (i = 0; i < opr_sz / 8; ++i) { \
- d[i] = FUNC(n[i], m[i], g[i]); \
- } \
-}
-
-#define DO_AND(N, M, G) (((N) & (M)) & (G))
-#define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
-#define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
-#define DO_ORR(N, M, G) (((N) | (M)) & (G))
-#define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
-#define DO_NOR(N, M, G) (~((N) | (M)) & (G))
-#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
-#define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
-
-LOGICAL_PPPP(sve_and_pppp, DO_AND)
-LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
-LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
-LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
-LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
-LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
-LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
-LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
-
-#undef DO_AND
-#undef DO_BIC
-#undef DO_EOR
-#undef DO_ORR
-#undef DO_ORN
-#undef DO_NOR
-#undef DO_NAND
-#undef DO_SEL
-#undef LOGICAL_PPPP
-
-/* Fully general three-operand expander, controlled by a predicate.
- * This is complicated by the host-endian storage of the register file.
- */
-/* ??? I don't expect the compiler could ever vectorize this itself.
- * With some tables we can convert bit masks to byte masks, and with
- * extra care wrt byte/word ordering we could use gcc generic vectors
- * and do 16 bytes at a time.
- */
-#define DO_ZPZZ(NAME, TYPE, H, OP) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc); \
- for (i = 0; i < opr_sz; ) { \
- uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
- do { \
- if (pg & 1) { \
- TYPE nn = *(TYPE *)(vn + H(i)); \
- TYPE mm = *(TYPE *)(vm + H(i)); \
- *(TYPE *)(vd + H(i)) = OP(nn, mm); \
- } \
- i += sizeof(TYPE), pg >>= sizeof(TYPE); \
- } while (i & 15); \
- } \
-}
-
-/* Similarly, specialized for 64-bit operands. */
-#define DO_ZPZZ_D(NAME, TYPE, OP) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
- TYPE *d = vd, *n = vn, *m = vm; \
- uint8_t *pg = vg; \
- for (i = 0; i < opr_sz; i += 1) { \
- if (pg[H1(i)] & 1) { \
- TYPE nn = n[i], mm = m[i]; \
- d[i] = OP(nn, mm); \
- } \
- } \
-}
-
-#define DO_AND(N, M) (N & M)
-#define DO_EOR(N, M) (N ^ M)
-#define DO_ORR(N, M) (N | M)
-#define DO_BIC(N, M) (N & ~M)
-#define DO_ADD(N, M) (N + M)
-#define DO_SUB(N, M) (N - M)
-#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
-#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
-#define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
-#define DO_MUL(N, M) (N * M)
-
-
-/*
- * We must avoid the C undefined behaviour cases: division by
- * zero and signed division of INT_MIN by -1. Both of these
- * have architecturally defined required results for Arm.
- * We special case all signed divisions by -1 to avoid having
- * to deduce the minimum integer for the type involved.
- */
-#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
-#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
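
The two special cases produce the results the architecture requires: dividing by zero yields zero, and dividing the minimum integer by -1 wraps back to the minimum integer. A hypothetical standalone check of those results (not part of the helpers, and written without relying on wrapping negation):

    #include <assert.h>
    #include <stdint.h>

    /* Architecturally required Arm SDIV/UDIV results, 32-bit lane. */
    static int32_t arm_sdiv32(int32_t n, int32_t m)
    {
        if (m == 0) {
            return 0;                            /* division by zero -> 0 */
        }
        if (m == -1) {
            return (int32_t)(0u - (uint32_t)n);  /* INT32_MIN / -1 wraps to INT32_MIN */
        }
        return n / m;
    }

    static uint32_t arm_udiv32(uint32_t n, uint32_t m)
    {
        return m == 0 ? 0 : n / m;
    }

    int main(void)
    {
        assert(arm_sdiv32(INT32_MIN, -1) == INT32_MIN);
        assert(arm_sdiv32(7, 0) == 0);
        assert(arm_sdiv32(-7, 2) == -3);         /* C and Arm both round toward zero */
        assert(arm_udiv32(7, 0) == 0);
        return 0;
    }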
-
-DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
-DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
-DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
-DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
-
-DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
-DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
-DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
-DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
-
-DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
-DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
-DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
-DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
-
-DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
-DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
-DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
-DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
-
-DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
-DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
-DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
-DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
-
-DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
-DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
-DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
-DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
-
-DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
-DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
-DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
-DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
-
-DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
-DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
-DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
-DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
-
-DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
-DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
-DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
-DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
-
-DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
-DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
-DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
-DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
-
-DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
-DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
-DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
-DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
-
-DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
-DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
-DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
-DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
-
-/* Because the computation type is at least twice as large as required,
- these work for both signed and unsigned source types. */
-static inline uint8_t do_mulh_b(int32_t n, int32_t m)
-{
- return (n * m) >> 8;
-}
-
-static inline uint16_t do_mulh_h(int32_t n, int32_t m)
-{
- return (n * m) >> 16;
-}
-
-static inline uint32_t do_mulh_s(int64_t n, int64_t m)
-{
- return (n * m) >> 32;
-}
-
-static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
-{
- uint64_t lo, hi;
- muls64(&lo, &hi, n, m);
- return hi;
-}
-
-static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
-{
- uint64_t lo, hi;
- mulu64(&lo, &hi, n, m);
- return hi;
-}
-
-DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
-DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
-DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
-DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
-
-DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
-DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
-DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
-DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
-
-DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
-DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
-DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
-DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
-
-DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
-DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
-
-DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
-DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
-
-/* Note that all bits of the shift are significant
- and not modulo the element size. */
-#define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
-#define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
-#define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
-
-DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
-DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
-DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
-
-DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
-DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
-DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
-
-DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
-DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
-DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
-
-DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
-DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
-DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
-
-static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
-{
- int8_t n1 = n, n2 = n >> 8;
- return m + n1 + n2;
-}
-
-static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
-{
- int16_t n1 = n, n2 = n >> 16;
- return m + n1 + n2;
-}
-
-static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
-{
- int32_t n1 = n, n2 = n >> 32;
- return m + n1 + n2;
-}
-
-DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
-DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
-DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
-
-static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
-{
- uint8_t n1 = n, n2 = n >> 8;
- return m + n1 + n2;
-}
-
-static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
-{
- uint16_t n1 = n, n2 = n >> 16;
- return m + n1 + n2;
-}
-
-static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
-{
- uint32_t n1 = n, n2 = n >> 32;
- return m + n1 + n2;
-}
-
-DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
-DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
-DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
-
-#define do_srshl_b(n, m) do_sqrshl_bhs(n, m, 8, true, NULL)
-#define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL)
-#define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL)
-#define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL)
-
-DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
-DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
-DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
-DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)
-
-#define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
-#define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
-#define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL)
-#define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL)
-
-DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
-DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
-DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
-DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
-
-/*
- * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
- * We pass in a pointer to a dummy saturation field to trigger
- * the saturating arithmetic but discard the information about
- * whether it has occurred.
- */
-#define do_sqshl_b(n, m) \
- ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
-#define do_sqshl_h(n, m) \
- ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
-#define do_sqshl_s(n, m) \
- ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
-#define do_sqshl_d(n, m) \
- ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
-
-DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b)
-DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
-DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
-DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
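
The `({ ... })` construct used in these macros is a GCC/Clang statement expression: the value of the block is its last expression, so a throwaway local can absorb the out-parameter. A minimal sketch of the idiom with an invented helper (not QEMU code; it assumes a compiler that supports statement expressions):

    #include <assert.h>
    #include <stdint.h>

    /* A saturating add that also reports saturation through an out-parameter. */
    static int32_t sat_add8(int32_t n, int32_t m, uint32_t *sat)
    {
        int32_t r = n + m;
        if (r != (int8_t)r) {
            *sat = 1;
            r = r < 0 ? INT8_MIN : INT8_MAX;
        }
        return r;
    }

    /* Statement expression: call the helper, discard the saturation flag. */
    #define do_sat_add8(n, m) \
        ({ uint32_t discard; sat_add8(n, m, &discard); })

    int main(void)
    {
        assert(do_sat_add8(100, 100) == INT8_MAX);   /* saturates, flag discarded */
        assert(do_sat_add8(-5, 3) == -2);
        return 0;
    }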
-
-#define do_uqshl_b(n, m) \
- ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
-#define do_uqshl_h(n, m) \
- ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
-#define do_uqshl_s(n, m) \
- ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
-#define do_uqshl_d(n, m) \
- ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
-
-DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b)
-DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
-DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
-DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
-
-#define do_sqrshl_b(n, m) \
- ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
-#define do_sqrshl_h(n, m) \
- ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
-#define do_sqrshl_s(n, m) \
- ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
-#define do_sqrshl_d(n, m) \
- ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
-
-DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b)
-DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
-DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
-DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
-
-#undef do_sqrshl_d
-
-#define do_uqrshl_b(n, m) \
- ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
-#define do_uqrshl_h(n, m) \
- ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
-#define do_uqrshl_s(n, m) \
- ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
-#define do_uqrshl_d(n, m) \
- ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
-
-DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b)
-DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
-DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
-DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
-
-#undef do_uqrshl_d
-
-#define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1)
-#define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1))
-
-DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
-DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
-DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
-DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)
-
-DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
-DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
-DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
-DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)
-
-#define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1)
-#define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1))
-
-DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
-DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
-DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
-DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)
-
-DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
-DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
-DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
-DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)
-
-#define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1)
-#define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1))
-
-DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
-DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
-DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
-DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)
-
-DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
-DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
-DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
-DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
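
The 64-bit halving forms above avoid a 65-bit intermediate by splitting each operand into its shifted half plus a low-bit correction. A hypothetical standalone check (not part of this file; it assumes a compiler with __int128) compares the three identities against widened arithmetic:

    #include <assert.h>
    #include <stdint.h>

    static void check(int64_t n, int64_t m)
    {
        __int128 wn = n, wm = m;

        /* Halving add: floor((n + m) / 2) without intermediate overflow. */
        assert((n >> 1) + (m >> 1) + (n & m & 1) == (int64_t)((wn + wm) >> 1));
        /* Rounding halving add: floor((n + m + 1) / 2). */
        assert((n >> 1) + (m >> 1) + ((n | m) & 1) == (int64_t)((wn + wm + 1) >> 1));
        /* Halving subtract: floor((n - m) / 2). */
        assert((n >> 1) - (m >> 1) - (~n & m & 1) == (int64_t)((wn - wm) >> 1));
    }

    int main(void)
    {
        check(INT64_MAX, INT64_MAX);
        check(INT64_MIN, INT64_MAX);
        check(INT64_MIN, INT64_MIN);
        check(-3, 5);
        check(7, -8);
        return 0;
    }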
-
-static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
-{
- return val >= max ? max : val <= min ? min : val;
-}
-
-#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
-#define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
-#define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)
-
-static inline int64_t do_sqadd_d(int64_t n, int64_t m)
-{
- int64_t r = n + m;
- if (((r ^ n) & ~(n ^ m)) < 0) {
- /* Signed overflow. */
- return r < 0 ? INT64_MAX : INT64_MIN;
- }
- return r;
-}
-
-DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
-DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
-DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
-DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)
-
-#define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
-#define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
-#define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)
-
-static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
-{
- uint64_t r = n + m;
- return r < n ? UINT64_MAX : r;
-}
-
-DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
-DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
-DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
-DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)
-
-#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
-#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
-#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)
-
-static inline int64_t do_sqsub_d(int64_t n, int64_t m)
-{
- int64_t r = n - m;
- if (((r ^ n) & (n ^ m)) < 0) {
- /* Signed overflow. */
- return r < 0 ? INT64_MAX : INT64_MIN;
- }
- return r;
-}
-
-DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
-DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
-DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
-DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)
-
-#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
-#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
-#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)
-
-static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
-{
- return n > m ? n - m : 0;
-}
-
-DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
-DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
-DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
-DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)
-
-#define DO_SUQADD_B(n, m) \
- do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
-#define DO_SUQADD_H(n, m) \
- do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
-#define DO_SUQADD_S(n, m) \
- do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)
-
-static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
-{
- uint64_t r = n + m;
-
- if (n < 0) {
- /* Note that m - abs(n) cannot underflow. */
- if (r > INT64_MAX) {
- /* Result is either very large positive or negative. */
- if (m > -n) {
- /* m > abs(n), so r is a very large positive. */
- return INT64_MAX;
- }
- /* Result is negative. */
- }
- } else {
- /* Both inputs are positive: check for overflow. */
- if (r < m || r > INT64_MAX) {
- return INT64_MAX;
- }
- }
- return r;
-}
-
-DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
-DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
-DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
-DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)
-
-#define DO_USQADD_B(n, m) \
- do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
-#define DO_USQADD_H(n, m) \
- do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
-#define DO_USQADD_S(n, m) \
- do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)
-
-static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
-{
- uint64_t r = n + m;
-
- if (m < 0) {
- return n < -m ? 0 : r;
- }
- return r < n ? UINT64_MAX : r;
-}
-
-DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
-DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
-DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
-DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)
-
-#undef DO_ZPZZ
-#undef DO_ZPZZ_D
-
-/*
- * Three operand expander, operating on element pairs.
- * If the slot I is even, the elements from from VN {I, I+1}.
- * If the slot I is odd, the elements from from VM {I-1, I}.
- * Load all of the input elements in each pair before overwriting output.
- */
-#define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc); \
- for (i = 0; i < opr_sz; ) { \
- uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
- do { \
- TYPE n0 = *(TYPE *)(vn + H(i)); \
- TYPE m0 = *(TYPE *)(vm + H(i)); \
- TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
- TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
- if (pg & 1) { \
- *(TYPE *)(vd + H(i)) = OP(n0, n1); \
- } \
- i += sizeof(TYPE), pg >>= sizeof(TYPE); \
- if (pg & 1) { \
- *(TYPE *)(vd + H(i)) = OP(m0, m1); \
- } \
- i += sizeof(TYPE), pg >>= sizeof(TYPE); \
- } while (i & 15); \
- } \
-}
-
-/* Similarly, specialized for 64-bit operands. */
-#define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
- TYPE *d = vd, *n = vn, *m = vm; \
- uint8_t *pg = vg; \
- for (i = 0; i < opr_sz; i += 2) { \
- TYPE n0 = n[i], n1 = n[i + 1]; \
- TYPE m0 = m[i], m1 = m[i + 1]; \
- if (pg[H1(i)] & 1) { \
- d[i] = OP(n0, n1); \
- } \
- if (pg[H1(i + 1)] & 1) { \
- d[i + 1] = OP(m0, m1); \
- } \
- } \
-}
-
-DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
-DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
-DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
-DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)
-
-DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
-DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
-DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
-DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)
-
-DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
-DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
-DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
-DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)
-
-DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
-DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
-DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
-DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)
-
-DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
-DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
-DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
-DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)
-
-#undef DO_ZPZZ_PAIR
-#undef DO_ZPZZ_PAIR_D
-
-#define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
- void *status, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc); \
- for (i = 0; i < opr_sz; ) { \
- uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
- do { \
- TYPE n0 = *(TYPE *)(vn + H(i)); \
- TYPE m0 = *(TYPE *)(vm + H(i)); \
- TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
- TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
- if (pg & 1) { \
- *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \
- } \
- i += sizeof(TYPE), pg >>= sizeof(TYPE); \
- if (pg & 1) { \
- *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \
- } \
- i += sizeof(TYPE), pg >>= sizeof(TYPE); \
- } while (i & 15); \
- } \
-}
-
-DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
-DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
-DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)
-
-DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
-DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
-DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)
-
-DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
-DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
-DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)
-
-DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
-DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
-DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)
-
-DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
-DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
-DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)
-
-#undef DO_ZPZZ_PAIR_FP
-
-/* Three-operand expander, controlled by a predicate, in which the
- * third operand is "wide". That is, for D = N op M, the same 64-bit
- * value of M is used with all of the narrower values of N.
- */
-#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc); \
- for (i = 0; i < opr_sz; ) { \
- uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
- TYPEW mm = *(TYPEW *)(vm + i); \
- do { \
- if (pg & 1) { \
- TYPE nn = *(TYPE *)(vn + H(i)); \
- *(TYPE *)(vd + H(i)) = OP(nn, mm); \
- } \
- i += sizeof(TYPE), pg >>= sizeof(TYPE); \
- } while (i & 7); \
- } \
-}
-
-DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
-DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
-DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
-
-DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
-DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
-DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
-
-DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
-DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
-DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
-
-#undef DO_ZPZW
-
-/* Fully general two-operand expander, controlled by a predicate.
- */
-#define DO_ZPZ(NAME, TYPE, H, OP) \
-void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc); \
- for (i = 0; i < opr_sz; ) { \
- uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
- do { \
- if (pg & 1) { \
- TYPE nn = *(TYPE *)(vn + H(i)); \
- *(TYPE *)(vd + H(i)) = OP(nn); \
- } \
- i += sizeof(TYPE), pg >>= sizeof(TYPE); \
- } while (i & 15); \
- } \
-}
-
-/* Similarly, specialized for 64-bit operands. */
-#define DO_ZPZ_D(NAME, TYPE, OP) \
-void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
- TYPE *d = vd, *n = vn; \
- uint8_t *pg = vg; \
- for (i = 0; i < opr_sz; i += 1) { \
- if (pg[H1(i)] & 1) { \
- TYPE nn = n[i]; \
- d[i] = OP(nn); \
- } \
- } \
-}
-
-#define DO_CLS_B(N) (clrsb32(N) - 24)
-#define DO_CLS_H(N) (clrsb32(N) - 16)
-
-DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
-DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
-DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
-DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
-
-#define DO_CLZ_B(N) (clz32(N) - 24)
-#define DO_CLZ_H(N) (clz32(N) - 16)
-
-DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
-DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
-DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
-DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
-
-DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
-DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
-DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
-DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
-
-#define DO_CNOT(N) (N == 0)
-
-DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
-DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
-DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
-DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
-
-#define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
-
-DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
-DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
-DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
-
-#define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
-
-DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
-DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
-DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
-
-#define DO_NOT(N) (~N)
-
-DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
-DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
-DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
-DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
-
-#define DO_SXTB(N) ((int8_t)N)
-#define DO_SXTH(N) ((int16_t)N)
-#define DO_SXTS(N) ((int32_t)N)
-#define DO_UXTB(N) ((uint8_t)N)
-#define DO_UXTH(N) ((uint16_t)N)
-#define DO_UXTS(N) ((uint32_t)N)
-
-DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
-DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
-DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
-DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
-DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
-DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
-
-DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
-DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
-DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
-DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
-DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
-DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
-
-#define DO_ABS(N) (N < 0 ? -N : N)
-
-DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
-DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
-DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
-DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
-
-#define DO_NEG(N) (-N)
-
-DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
-DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
-DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
-DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
-
-DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
-DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
-DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
-
-DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
-DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
-
-DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
-
-void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- uint64_t *d = vd, *n = vn;
- uint8_t *pg = vg;
-
- for (i = 0; i < opr_sz; i += 2) {
- if (pg[H1(i)] & 1) {
- uint64_t n0 = n[i + 0];
- uint64_t n1 = n[i + 1];
- d[i + 0] = n1;
- d[i + 1] = n0;
- }
- }
-}
-
-DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
-DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
-DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
-DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
-
-#define DO_SQABS(X) \
- ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
- x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })
-
-DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
-DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
-DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
-DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)
-
-#define DO_SQNEG(X) \
- ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
- x_ == min_ ? -min_ - 1 : -x_; })
-
-DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
-DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
-DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
-DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)
-
-DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
-DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
-
-/* Three-operand expander, unpredicated, in which the third operand is "wide".
- */
-#define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc); \
- for (i = 0; i < opr_sz; ) { \
- TYPEW mm = *(TYPEW *)(vm + i); \
- do { \
- TYPE nn = *(TYPE *)(vn + H(i)); \
- *(TYPE *)(vd + H(i)) = OP(nn, mm); \
- i += sizeof(TYPE); \
- } while (i & 7); \
- } \
-}
-
-DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
-DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
-DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
-
-DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
-DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
-DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
-
-DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
-DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
-DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
-
-#undef DO_ZZW
-
-#undef DO_CLS_B
-#undef DO_CLS_H
-#undef DO_CLZ_B
-#undef DO_CLZ_H
-#undef DO_CNOT
-#undef DO_FABS
-#undef DO_FNEG
-#undef DO_ABS
-#undef DO_NEG
-#undef DO_ZPZ
-#undef DO_ZPZ_D
-
-/*
- * Three-operand expander, unpredicated, in which the two inputs are
- * selected from the top or bottom half of the wide column.
- */
-#define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc); \
- int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
- int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
- for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
- TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
- TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
- *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
- } \
-}
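
Concretely, sel1/sel2 become a byte offset of 0 or sizeof(TYPEN), so the bottom (B) forms read the even-numbered narrow elements of each wide column and the top (T) forms read the odd-numbered ones. A hypothetical sketch of that indexing for 8-bit sources widening to 16-bit results, ignoring the host-endian H adjustment:

    #include <assert.h>
    #include <stdint.h>

    /* One widening lane: sel = 0 picks the bottom (even) narrow element of a
     * 16-bit column, sel = 1 the top (odd) one, then sign-extends it. */
    static int16_t pick_narrow(const int8_t *zn, int wide_idx, int sel)
    {
        return zn[2 * wide_idx + sel];
    }

    int main(void)
    {
        const int8_t zn[4] = { 1, -2, 3, -4 };      /* two 16-bit columns */

        /* Bottom (e.g. SADDLB-style) selection: elements 0 and 2. */
        assert(pick_narrow(zn, 0, 0) == 1 && pick_narrow(zn, 1, 0) == 3);
        /* Top (e.g. SADDLT-style) selection: elements 1 and 3. */
        assert(pick_narrow(zn, 0, 1) == -2 && pick_narrow(zn, 1, 1) == -4);
        return 0;
    }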
-
-DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
-DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
-DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
-
-DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
-DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
-DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
-
-DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
-DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
-DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
-
-DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
-DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
-DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
-
-DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
-DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
-DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
-
-DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
-DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
-DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
-
-DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
-DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
-DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
-
-DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
-DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
-DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
-
-/* Note that the multiply cannot overflow, but the doubling can. */
-static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
-{
- int16_t val = n * m;
- return DO_SQADD_H(val, val);
-}
-
-static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
-{
- int32_t val = n * m;
- return DO_SQADD_S(val, val);
-}
-
-static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
-{
- int64_t val = n * m;
- return do_sqadd_d(val, val);
-}
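
At the byte-to-halfword size the doubling can only saturate when both inputs are the most negative value, since that is the only product large enough to overflow once doubled. A hypothetical standalone check of that corner case (not the helper itself):

    #include <assert.h>
    #include <stdint.h>

    static int64_t sat(int64_t v, int64_t min, int64_t max)
    {
        return v > max ? max : v < min ? min : v;
    }

    /* SQDMULL on byte elements: widen, multiply, then saturating double. */
    static int16_t sqdmull_b_to_h(int8_t n, int8_t m)
    {
        int32_t prod = (int32_t)n * m;               /* cannot overflow int32 */
        return (int16_t)sat(2 * (int64_t)prod, INT16_MIN, INT16_MAX);
    }

    int main(void)
    {
        assert(sqdmull_b_to_h(-128, -128) == INT16_MAX);  /* 2 * 16384 saturates */
        assert(sqdmull_b_to_h(-128, 127) == -32512);      /* 2 * -16256 fits */
        assert(sqdmull_b_to_h(100, 100) == 20000);
        return 0;
    }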
-
-DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
-DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
-DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
-
-#undef DO_ZZZ_TB
-
-#define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc); \
- int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
- for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
- TYPEW nn = *(TYPEW *)(vn + HW(i)); \
- TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
- *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
- } \
-}
-
-DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
-DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
-DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
-
-DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
-DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
-DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
-
-DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
-DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
-DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
-
-DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
-DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
-DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
-
-#undef DO_ZZZ_WTB
-
-#define DO_ZZZ_NTB(NAME, TYPE, H, OP) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc); \
- intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
- intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
- for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
- TYPE nn = *(TYPE *)(vn + H(i + sel1)); \
- TYPE mm = *(TYPE *)(vm + H(i + sel2)); \
- *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \
- } \
-}
-
-DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
-DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
-DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
-DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)
-
-#undef DO_ZZZ_NTB
-
-#define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc); \
- intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \
- for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
- TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
- TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \
- TYPEW aa = *(TYPEW *)(va + HW(i)); \
- *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \
- } \
-}
-
-DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
-DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
-DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
-
-DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
-DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
-DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
-
-DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
-DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
-DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
-
-DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
-DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
-DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
-
-#define DO_NMUL(N, M) -(N * M)
-
-DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
-DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
-DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)
-
-DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
-DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
-DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)
-
-#undef DO_ZZZW_ACC
-
-#define DO_XTNB(NAME, TYPE, OP) \
-void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc); \
- for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
- TYPE nn = *(TYPE *)(vn + i); \
- nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \
- *(TYPE *)(vd + i) = nn; \
- } \
-}
-
-#define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \
-void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \
- for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
- TYPE nn = *(TYPE *)(vn + i); \
- *(TYPEN *)(vd + i + odd) = OP(nn); \
- } \
-}
-
-#define DO_SQXTN_H(n) do_sat_bhs(n, INT8_MIN, INT8_MAX)
-#define DO_SQXTN_S(n) do_sat_bhs(n, INT16_MIN, INT16_MAX)
-#define DO_SQXTN_D(n) do_sat_bhs(n, INT32_MIN, INT32_MAX)
-
-DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
-DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
-DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)
-
-DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
-DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
-DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)
-
-#define DO_UQXTN_H(n) do_sat_bhs(n, 0, UINT8_MAX)
-#define DO_UQXTN_S(n) do_sat_bhs(n, 0, UINT16_MAX)
-#define DO_UQXTN_D(n) do_sat_bhs(n, 0, UINT32_MAX)
-
-DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
-DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
-DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)
-
-DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
-DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
-DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)
-
-DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
-DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
-DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)
-
-DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
-DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
-DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)
-
-#undef DO_XTNB
-#undef DO_XTNT
-
-void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
- uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
- uint32_t *a = va, *n = vn;
- uint64_t *d = vd, *m = vm;
-
- for (i = 0; i < opr_sz / 8; ++i) {
- uint32_t e1 = a[2 * i + H4(0)];
- uint32_t e2 = n[2 * i + sel] ^ inv;
- uint64_t c = extract64(m[i], 32, 1);
- /* Compute and store the entire 33-bit result at once. */
- d[i] = c + e1 + e2;
- }
-}
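
The "33-bit result" stored here keeps the carry-out in bit 32 of each 64-bit destination lane, which is exactly the bit the next ADCLB/ADCLT extracts as its carry-in. A hypothetical one-lane sketch (not part of the helper):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t e1 = 0xffffffffu;              /* accumulator lane */
        uint32_t e2 = 0x00000001u;              /* (possibly inverted) addend lane */
        uint64_t carry_in = 1;

        /* One 32-bit add-with-carry lane, stored as a 33-bit value. */
        uint64_t d = carry_in + e1 + e2;

        assert((uint32_t)d == 1);               /* low 32 bits: the sum */
        assert(((d >> 32) & 1) == 1);           /* bit 32: carry-out for the next op */
        return 0;
    }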
-
-void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
- uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
- uint64_t *d = vd, *a = va, *n = vn, *m = vm;
-
- for (i = 0; i < opr_sz / 8; i += 2) {
- Int128 e1 = int128_make64(a[i]);
- Int128 e2 = int128_make64(n[i + sel] ^ inv);
- Int128 c = int128_make64(m[i + 1] & 1);
- Int128 r = int128_add(int128_add(e1, e2), c);
- d[i + 0] = int128_getlo(r);
- d[i + 1] = int128_gethi(r);
- }
-}
-
-#define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc); \
- int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
- int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
- for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
- TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
- TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
- TYPEW aa = *(TYPEW *)(va + HW(i)); \
- *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \
- } \
-}
-
-DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
- do_sqdmull_h, DO_SQADD_H)
-DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
- do_sqdmull_s, DO_SQADD_S)
-DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
- do_sqdmull_d, do_sqadd_d)
-
-DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
- do_sqdmull_h, DO_SQSUB_H)
-DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
- do_sqdmull_s, DO_SQSUB_S)
-DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
- do_sqdmull_d, do_sqsub_d)
-
-#undef DO_SQDMLAL
-
-#define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
- int rot = simd_data(desc); \
- int sel_a = rot & 1, sel_b = sel_a ^ 1; \
- bool sub_r = rot == 1 || rot == 2; \
- bool sub_i = rot >= 2; \
- TYPE *d = vd, *n = vn, *m = vm, *a = va; \
- for (i = 0; i < opr_sz; i += 2) { \
- TYPE elt1_a = n[H(i + sel_a)]; \
- TYPE elt2_a = m[H(i + sel_a)]; \
- TYPE elt2_b = m[H(i + sel_b)]; \
- d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r); \
- d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i); \
- } \
-}
-
-#define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))
-
-DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
-DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
-DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
-DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)
-
-#define DO_SQRDMLAH_B(N, M, A, S) \
- do_sqrdmlah_b(N, M, A, S, true)
-#define DO_SQRDMLAH_H(N, M, A, S) \
- ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
-#define DO_SQRDMLAH_S(N, M, A, S) \
- ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
-#define DO_SQRDMLAH_D(N, M, A, S) \
- do_sqrdmlah_d(N, M, A, S, true)
-
-DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
-DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
-DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
-DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)
-
-#define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
-{ \
- intptr_t i, j, oprsz = simd_oprsz(desc); \
- int rot = extract32(desc, SIMD_DATA_SHIFT, 2); \
- int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2; \
- int sel_a = rot & 1, sel_b = sel_a ^ 1; \
- bool sub_r = rot == 1 || rot == 2; \
- bool sub_i = rot >= 2; \
- TYPE *d = vd, *n = vn, *m = vm, *a = va; \
- for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) { \
- TYPE elt2_a = m[H(i + idx + sel_a)]; \
- TYPE elt2_b = m[H(i + idx + sel_b)]; \
- for (j = 0; j < 16 / sizeof(TYPE); j += 2) { \
- TYPE elt1_a = n[H(i + j + sel_a)]; \
- d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r); \
- d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i); \
- } \
- } \
-}
-
-DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
-DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)
-
-DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
-DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
-
-#undef DO_CMLA
-#undef DO_CMLA_FUNC
-#undef DO_CMLA_IDX_FUNC
-#undef DO_SQRDMLAH_B
-#undef DO_SQRDMLAH_H
-#undef DO_SQRDMLAH_S
-#undef DO_SQRDMLAH_D
-
-/* Note N and M are 4 elements bundled into one unit. */
-static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
- int sel_a, int sel_b, int sub_i)
-{
- for (int i = 0; i <= 1; i++) {
- int32_t elt1_r = (int8_t)(n >> (16 * i));
- int32_t elt1_i = (int8_t)(n >> (16 * i + 8));
- int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a));
- int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b));
-
- a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
- }
- return a;
-}
-
-static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
- int sel_a, int sel_b, int sub_i)
-{
- for (int i = 0; i <= 1; i++) {
- int64_t elt1_r = (int16_t)(n >> (32 * i + 0));
- int64_t elt1_i = (int16_t)(n >> (32 * i + 16));
- int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a));
- int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b));
-
- a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
- }
- return a;
-}
-
-void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
- void *va, uint32_t desc)
-{
- int opr_sz = simd_oprsz(desc);
- int rot = simd_data(desc);
- int sel_a = rot & 1;
- int sel_b = sel_a ^ 1;
- int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
- uint32_t *d = vd, *n = vn, *m = vm, *a = va;
-
- for (int e = 0; e < opr_sz / 4; e++) {
- d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
- }
-}
-
-void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
- void *va, uint32_t desc)
-{
- int opr_sz = simd_oprsz(desc);
- int rot = simd_data(desc);
- int sel_a = rot & 1;
- int sel_b = sel_a ^ 1;
- int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
- uint64_t *d = vd, *n = vn, *m = vm, *a = va;
-
- for (int e = 0; e < opr_sz / 8; e++) {
- d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
- }
-}
-
-void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
- void *va, uint32_t desc)
-{
- int opr_sz = simd_oprsz(desc);
- int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
- int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
- int sel_a = rot & 1;
- int sel_b = sel_a ^ 1;
- int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
- uint32_t *d = vd, *n = vn, *m = vm, *a = va;
-
- for (int seg = 0; seg < opr_sz / 4; seg += 4) {
- uint32_t seg_m = m[seg + idx];
- for (int e = 0; e < 4; e++) {
- d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
- sel_a, sel_b, sub_i);
- }
- }
-}
-
-void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
- void *va, uint32_t desc)
-{
- int seg, opr_sz = simd_oprsz(desc);
- int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
- int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
- int sel_a = rot & 1;
- int sel_b = sel_a ^ 1;
- int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
- uint64_t *d = vd, *n = vn, *m = vm, *a = va;
-
- for (seg = 0; seg < opr_sz / 8; seg += 2) {
- uint64_t seg_m = m[seg + idx];
- for (int e = 0; e < 2; e++) {
- d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
- sel_a, sel_b, sub_i);
- }
- }
-}
-
-#define DO_ZZXZ(NAME, TYPE, H, OP) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
-{ \
- intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \
- intptr_t i, j, idx = simd_data(desc); \
- TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx); \
- for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
- TYPE mm = m[i]; \
- for (j = 0; j < segment; j++) { \
- d[i + j] = OP(n[i + j], mm, a[i + j]); \
- } \
- } \
-}
-
-#define DO_SQRDMLAH_H(N, M, A) \
- ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
-#define DO_SQRDMLAH_S(N, M, A) \
- ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
-#define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)
-
-DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
-DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
-DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)
-
-#define DO_SQRDMLSH_H(N, M, A) \
- ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
-#define DO_SQRDMLSH_S(N, M, A) \
- ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
-#define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)
-
-DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
-DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
-DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)
-
-#undef DO_ZZXZ
-
-#define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
-{ \
- intptr_t i, j, oprsz = simd_oprsz(desc); \
- intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
- intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
- for (i = 0; i < oprsz; i += 16) { \
- TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
- for (j = 0; j < 16; j += sizeof(TYPEW)) { \
- TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
- TYPEW aa = *(TYPEW *)(va + HW(i + j)); \
- *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa); \
- } \
- } \
-}
-
-#define DO_MLA(N, M, A) (A + N * M)
-
-DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
-DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
-DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
-DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)
-
-#define DO_MLS(N, M, A) (A - N * M)
-
-DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
-DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
-DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
-DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)
-
-#define DO_SQDMLAL_S(N, M, A) DO_SQADD_S(A, do_sqdmull_s(N, M))
-#define DO_SQDMLAL_D(N, M, A) do_sqadd_d(A, do_sqdmull_d(N, M))
-
-DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
-DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)
-
-#define DO_SQDMLSL_S(N, M, A) DO_SQSUB_S(A, do_sqdmull_s(N, M))
-#define DO_SQDMLSL_D(N, M, A) do_sqsub_d(A, do_sqdmull_d(N, M))
-
-DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
-DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)
-
-#undef DO_MLA
-#undef DO_MLS
-#undef DO_ZZXW
-
-#define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
-{ \
- intptr_t i, j, oprsz = simd_oprsz(desc); \
- intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
- intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
- for (i = 0; i < oprsz; i += 16) { \
- TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
- for (j = 0; j < 16; j += sizeof(TYPEW)) { \
- TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
- *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm); \
- } \
- } \
-}
-
-DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
-DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
-
-DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
-DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
-
-DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
-DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
-
-#undef DO_ZZX
-
-#define DO_BITPERM(NAME, TYPE, OP) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc); \
- for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
- TYPE nn = *(TYPE *)(vn + i); \
- TYPE mm = *(TYPE *)(vm + i); \
- *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \
- } \
-}
-
-static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
-{
- uint64_t res = 0;
- int db, rb = 0;
-
- for (db = 0; db < n; ++db) {
- if ((mask >> db) & 1) {
- res |= ((data >> db) & 1) << rb;
- ++rb;
- }
- }
- return res;
-}
-
-DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
-DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
-DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
-DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
-
-static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
-{
- uint64_t res = 0;
- int rb, db = 0;
-
- for (rb = 0; rb < n; ++rb) {
- if ((mask >> rb) & 1) {
- res |= ((data >> db) & 1) << rb;
- ++db;
- }
- }
- return res;
-}
-
-DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
-DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
-DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
-DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
-
-static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
-{
- uint64_t resm = 0, resu = 0;
- int db, rbm = 0, rbu = 0;
-
- for (db = 0; db < n; ++db) {
- uint64_t val = (data >> db) & 1;
- if ((mask >> db) & 1) {
- resm |= val << rbm++;
- } else {
- resu |= val << rbu++;
- }
- }
-
- return resm | (resu << rbm);
-}
-
-DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
-DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
-DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
-DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
-
-#undef DO_BITPERM
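As a quick illustration of the bit-permute semantics above, here is a standalone 8-bit sketch (hypothetical helper names, not the QEMU functions): BEXT gathers the data bits selected by the mask into the low bits of the result, BDEP scatters low bits back under the mask, so composing the two reproduces data & mask.

#include <assert.h>
#include <stdint.h>

static uint8_t bext8(uint8_t data, uint8_t mask)
{
    uint8_t res = 0;
    int rb = 0;
    for (int db = 0; db < 8; ++db) {
        if ((mask >> db) & 1) {
            res |= ((data >> db) & 1) << rb++;
        }
    }
    return res;
}

static uint8_t bdep8(uint8_t data, uint8_t mask)
{
    uint8_t res = 0;
    int db = 0;
    for (int rb = 0; rb < 8; ++rb) {
        if ((mask >> rb) & 1) {
            res |= ((data >> db++) & 1) << rb;
        }
    }
    return res;
}

int main(void)
{
    uint8_t x = 0xb5, mask = 0x0f;
    assert(bext8(x, mask) == 0x05);              /* low nibble of 0xb5 */
    assert(bdep8(bext8(x, mask), mask) == (x & mask));
    return 0;
}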
-
-#define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc); \
- int sub_r = simd_data(desc); \
- if (sub_r) { \
- for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
- TYPE acc_r = *(TYPE *)(vn + H(i)); \
- TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
- TYPE el2_r = *(TYPE *)(vm + H(i)); \
- TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
- acc_r = ADD_OP(acc_r, el2_i); \
- acc_i = SUB_OP(acc_i, el2_r); \
- *(TYPE *)(vd + H(i)) = acc_r; \
- *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
- } \
- } else { \
- for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
- TYPE acc_r = *(TYPE *)(vn + H(i)); \
- TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
- TYPE el2_r = *(TYPE *)(vm + H(i)); \
- TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
- acc_r = SUB_OP(acc_r, el2_i); \
- acc_i = ADD_OP(acc_i, el2_r); \
- *(TYPE *)(vd + H(i)) = acc_r; \
- *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
- } \
- } \
-}
-
-DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
-DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
-DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
-DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB)
-
-DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
-DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
-DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
-DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d)
-
-#undef DO_CADD
-
-#define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
-void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc); \
- intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \
- int shift = simd_data(desc) >> 1; \
- for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
- TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \
- *(TYPEW *)(vd + HW(i)) = nn << shift; \
- } \
-}
-
-DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
-DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
-DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4)
-
-DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
-DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
-DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4)
-
-#undef DO_ZZI_SHLL
-
-/* Two-operand reduction expander, controlled by a predicate.
- * The difference between TYPERED and TYPERET has to do with
- * sign-extension. E.g. for SMAX, TYPERED must be signed,
- * but TYPERET must be unsigned so that e.g. a 32-bit value
- * is not sign-extended to the ABI uint64_t return type.
- */
-/* ??? If we were to vectorize this by hand the reduction ordering
- * would change. For integer operands, this is perfectly fine.
- */
-#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
-uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc); \
- TYPERED ret = INIT; \
- for (i = 0; i < opr_sz; ) { \
- uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
- do { \
- if (pg & 1) { \
- TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
- ret = OP(ret, nn); \
- } \
- i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
- } while (i & 15); \
- } \
- return (TYPERET)ret; \
-}
-
-#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
-uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
- TYPEE *n = vn; \
- uint8_t *pg = vg; \
- TYPER ret = INIT; \
- for (i = 0; i < opr_sz; i += 1) { \
- if (pg[H1(i)] & 1) { \
- TYPEE nn = n[i]; \
- ret = OP(ret, nn); \
- } \
- } \
- return ret; \
-}
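The TYPERED/TYPERET distinction described in the comment above is easiest to see with a concrete value; a minimal sketch in plain host C, nothing QEMU-specific:

#include <assert.h>
#include <stdint.h>

int main(void)
{
    /* Reduction computed as signed 32-bit, returned through uint64_t. */
    int32_t red = INT32_MIN;

    /* Casting via the unsigned element type zero-extends, which is what
     * the ABI return value needs ... */
    assert((uint64_t)(uint32_t)red == 0x0000000080000000ull);

    /* ... whereas returning the signed value directly would sign-extend
     * into the upper half. */
    assert((uint64_t)red == 0xffffffff80000000ull);
    return 0;
}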
-
-DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
-DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
-DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
-DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
-
-DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
-DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
-DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
-DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
-
-DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
-DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
-DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
-DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
-
-DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
-DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
-DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
-
-DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
-DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
-DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
-DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
-
-DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
-DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
-DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
-DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
-
-DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
-DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
-DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
-DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
-
-DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
-DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
-DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
-DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
-
-DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
-DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
-DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
-DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
-
-#undef DO_VPZ
-#undef DO_VPZ_D
-
-/* Two vector operand, one scalar operand, unpredicated. */
-#define DO_ZZI(NAME, TYPE, OP) \
-void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
- TYPE s = s64, *d = vd, *n = vn; \
- for (i = 0; i < opr_sz; ++i) { \
- d[i] = OP(n[i], s); \
- } \
-}
-
-#define DO_SUBR(X, Y) (Y - X)
-
-DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
-DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
-DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
-DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
-
-DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
-DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
-DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
-DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
-
-DO_ZZI(sve_smini_b, int8_t, DO_MIN)
-DO_ZZI(sve_smini_h, int16_t, DO_MIN)
-DO_ZZI(sve_smini_s, int32_t, DO_MIN)
-DO_ZZI(sve_smini_d, int64_t, DO_MIN)
-
-DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
-DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
-DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
-DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
-
-DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
-DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
-DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
-DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
-
-#undef DO_ZZI
-
-#undef DO_AND
-#undef DO_ORR
-#undef DO_EOR
-#undef DO_BIC
-#undef DO_ADD
-#undef DO_SUB
-#undef DO_MAX
-#undef DO_MIN
-#undef DO_ABD
-#undef DO_MUL
-#undef DO_DIV
-#undef DO_ASR
-#undef DO_LSR
-#undef DO_LSL
-#undef DO_SUBR
-
-/* Similar to the ARM LastActiveElement pseudocode function, except the
- result is multiplied by the element size. This includes the not found
- indication; e.g. not found for esz=3 is -8. */
-static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
-{
- uint64_t mask = pred_esz_masks[esz];
- intptr_t i = words;
-
- do {
- uint64_t this_g = g[--i] & mask;
- if (this_g) {
- return i * 64 + (63 - clz64(this_g));
- }
- } while (i > 0);
- return (intptr_t)-1 << esz;
-}
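A standalone spot check of the scaling convention described above (GCC/Clang __builtin_clzll assumed): the governing predicate bit for element k of size esize sits at bit k * esize, so the returned bit index is already the element's byte offset, and "not found" is -esize.

#include <assert.h>
#include <stdint.h>

int main(void)
{
    int esz = 3;                               /* .d elements, esize = 8   */
    uint64_t mask = 0x0101010101010101ull;     /* one governing bit per
                                                * 8-byte element           */
    uint64_t g = 0x0000000000000101ull;        /* elements 0 and 1 active  */

    int last = 63 - __builtin_clzll(g & mask);
    assert(last == 8);                         /* element index 1 * esize 8 */
    assert(-(1 << esz) == -8);                 /* the not-found indication  */
    return 0;
}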
-
-uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
-{
- intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
- uint32_t flags = PREDTEST_INIT;
- uint64_t *d = vd, *g = vg;
- intptr_t i = 0;
-
- do {
- uint64_t this_d = d[i];
- uint64_t this_g = g[i];
-
- if (this_g) {
- if (!(flags & 4)) {
- /* Set in D the first bit of G. */
- this_d |= this_g & -this_g;
- d[i] = this_d;
- }
- flags = iter_predtest_fwd(this_d, this_g, flags);
- }
- } while (++i < words);
-
- return flags;
-}
-
-uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
-{
- intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
- intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
- uint32_t flags = PREDTEST_INIT;
- uint64_t *d = vd, *g = vg, esz_mask;
- intptr_t i, next;
-
- next = last_active_element(vd, words, esz) + (1 << esz);
- esz_mask = pred_esz_masks[esz];
-
- /* Similar to the pseudocode for pnext, but scaled by ESZ
- so that we find the correct bit. */
- if (next < words * 64) {
- uint64_t mask = -1;
-
- if (next & 63) {
- mask = ~((1ull << (next & 63)) - 1);
- next &= -64;
- }
- do {
- uint64_t this_g = g[next / 64] & esz_mask & mask;
- if (this_g != 0) {
- next = (next & -64) + ctz64(this_g);
- break;
- }
- next += 64;
- mask = -1;
- } while (next < words * 64);
- }
-
- i = 0;
- do {
- uint64_t this_d = 0;
- if (i == next / 64) {
- this_d = 1ull << (next & 63);
- }
- d[i] = this_d;
- flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
- } while (++i < words);
-
- return flags;
-}
-
-/*
- * Copy Zn into Zd, and store zero into inactive elements.
- * If inv, store zeros into the active elements.
- */
-void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
- uint64_t *d = vd, *n = vn;
- uint8_t *pg = vg;
-
- for (i = 0; i < opr_sz; i += 1) {
- d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
- }
-}
-
-void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
- uint64_t *d = vd, *n = vn;
- uint8_t *pg = vg;
-
- for (i = 0; i < opr_sz; i += 1) {
- d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
- }
-}
-
-void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
- uint64_t *d = vd, *n = vn;
- uint8_t *pg = vg;
-
- for (i = 0; i < opr_sz; i += 1) {
- d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
- }
-}
-
-void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- uint64_t *d = vd, *n = vn;
- uint8_t *pg = vg;
- uint8_t inv = simd_data(desc);
-
- for (i = 0; i < opr_sz; i += 1) {
- d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
- }
-}
-
-/* Three-operand expander, immediate operand, controlled by a predicate.
- */
-#define DO_ZPZI(NAME, TYPE, H, OP) \
-void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc); \
- TYPE imm = simd_data(desc); \
- for (i = 0; i < opr_sz; ) { \
- uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
- do { \
- if (pg & 1) { \
- TYPE nn = *(TYPE *)(vn + H(i)); \
- *(TYPE *)(vd + H(i)) = OP(nn, imm); \
- } \
- i += sizeof(TYPE), pg >>= sizeof(TYPE); \
- } while (i & 15); \
- } \
-}
-
-/* Similarly, specialized for 64-bit operands. */
-#define DO_ZPZI_D(NAME, TYPE, OP) \
-void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
- TYPE *d = vd, *n = vn; \
- TYPE imm = simd_data(desc); \
- uint8_t *pg = vg; \
- for (i = 0; i < opr_sz; i += 1) { \
- if (pg[H1(i)] & 1) { \
- TYPE nn = n[i]; \
- d[i] = OP(nn, imm); \
- } \
- } \
-}
-
-#define DO_SHR(N, M) (N >> M)
-#define DO_SHL(N, M) (N << M)
-
-/* Arithmetic shift right for division. This rounds negative numbers
- toward zero as per signed division. Therefore before shifting,
- when N is negative, add 2**M-1. */
-#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
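A standalone worked example of the DO_ASRD bias (the helper name below is local to this sketch, and it assumes the usual arithmetic right shift of negative values on two's-complement hosts): pre-adding 2**M - 1 for negative inputs turns the floor behaviour of >> into truncation toward zero, matching signed division.

#include <assert.h>
#include <stdint.h>

static int8_t asrd8(int8_t n, unsigned m)
{
    return (n + (n < 0 ? (1 << m) - 1 : 0)) >> m;
}

int main(void)
{
    assert(asrd8(-7, 2) == -1);      /* -7 / 4 == -1, truncated toward zero */
    assert((-7 >> 2) == -2);         /* plain arithmetic shift rounds down  */
    assert(asrd8(7, 2) == 1);
    return 0;
}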
-
-static inline uint64_t do_urshr(uint64_t x, unsigned sh)
-{
- if (likely(sh < 64)) {
- return (x >> sh) + ((x >> (sh - 1)) & 1);
- } else if (sh == 64) {
- return x >> 63;
- } else {
- return 0;
- }
-}
-
-static inline int64_t do_srshr(int64_t x, unsigned sh)
-{
- if (likely(sh < 64)) {
- return (x >> sh) + ((x >> (sh - 1)) & 1);
- } else {
- /* Rounding the sign bit always produces 0. */
- return 0;
- }
-}
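do_urshr/do_srshr above implement a rounding right shift by adding back the last bit shifted out; a minimal sketch of the sh < 64 case with a couple of spot checks (local name, not the QEMU helper):

#include <assert.h>
#include <stdint.h>

static uint64_t urshr(uint64_t x, unsigned sh)   /* assumes 0 < sh < 64 */
{
    return (x >> sh) + ((x >> (sh - 1)) & 1);
}

int main(void)
{
    assert(urshr(4, 1) == 2);   /* 2.0  -> 2                          */
    assert(urshr(5, 1) == 3);   /* 2.5  -> 3, rounds away from zero   */
    assert(urshr(6, 2) == 2);   /* 1.5  -> 2                          */
    assert(urshr(7, 2) == 2);   /* 1.75 -> 2                          */
    return 0;
}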
-
-DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
-DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
-DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
-DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
-
-DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
-DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
-DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
-DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
-
-DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
-DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
-DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
-DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
-
-DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
-DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
-DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
-DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
-
-/* SVE2 bitwise shift by immediate */
-DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b)
-DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h)
-DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s)
-DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d)
-
-DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b)
-DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h)
-DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s)
-DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d)
-
-DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr)
-DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr)
-DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr)
-DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr)
-
-DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr)
-DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr)
-DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr)
-DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr)
-
-#define do_suqrshl_b(n, m) \
- ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
-#define do_suqrshl_h(n, m) \
- ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
-#define do_suqrshl_s(n, m) \
- ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); })
-#define do_suqrshl_d(n, m) \
- ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); })
-
-DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b)
-DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h)
-DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s)
-DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d)
-
-#undef DO_ASRD
-#undef DO_ZPZI
-#undef DO_ZPZI_D
-
-#define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \
-void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc); \
- int shift = simd_data(desc); \
- for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
- TYPEW nn = *(TYPEW *)(vn + i); \
- *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \
- } \
-}
-
-#define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \
-void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc); \
- int shift = simd_data(desc); \
- for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
- TYPEW nn = *(TYPEW *)(vn + HW(i)); \
- *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \
- } \
-}
-
-DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
-DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
-DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)
-
-DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
-DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
-DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR)
-
-DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
-DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
-DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)
-
-DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
-DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
-DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)
-
-#define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
-#define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
-#define DO_SQSHRUN_D(x, sh) \
- do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)
-
-DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
-DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
-DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)
-
-DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
-DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
-DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)
-
-#define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
-#define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
-#define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)
-
-DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
-DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
-DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)
-
-DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
-DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
-DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)
-
-#define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
-#define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
-#define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)
-
-DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
-DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
-DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)
-
-DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
-DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
-DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)
-
-#define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
-#define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
-#define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)
-
-DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
-DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
-DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)
-
-DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
-DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
-DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D)
-
-#define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
-#define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
-#define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
-
-DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
-DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
-DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)
-
-DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
-DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
-DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D)
-
-#define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
-#define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
-#define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)
-
-DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
-DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
-DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)
-
-DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
-DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
-DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D)
-
-#undef DO_SHRNB
-#undef DO_SHRNT
-
-#define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc); \
- for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
- TYPEW nn = *(TYPEW *)(vn + i); \
- TYPEW mm = *(TYPEW *)(vm + i); \
- *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT); \
- } \
-}
-
-#define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc); \
- for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
- TYPEW nn = *(TYPEW *)(vn + HW(i)); \
- TYPEW mm = *(TYPEW *)(vm + HW(i)); \
- *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \
- } \
-}
-
-#define DO_ADDHN(N, M, SH) ((N + M) >> SH)
-#define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
-#define DO_SUBHN(N, M, SH) ((N - M) >> SH)
-#define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)
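For reference, a small standalone example of the narrowing high-half arithmetic these macros expand to, for the 16-to-8-bit case (values chosen so the rounded and unrounded results differ):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint16_t n = 0x12b4, m = 0x0101;                     /* sum is 0x13b5  */

    uint8_t addhn  = (uint8_t)((n + m) >> 8);            /* keep high half */
    uint8_t raddhn = (uint8_t)((n + m + 0x80) >> 8);     /* round first    */

    assert(addhn == 0x13);
    assert(raddhn == 0x14);
    return 0;
}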
-
-DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
-DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
-DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)
-
-DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
-DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
-DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN)
-
-DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
-DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
-DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)
-
-DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
-DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
-DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN)
-
-DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
-DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
-DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)
-
-DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
-DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
-DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN)
-
-DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
-DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
-DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)
-
-DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
-DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
-DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN)
-
-#undef DO_RSUBHN
-#undef DO_SUBHN
-#undef DO_RADDHN
-#undef DO_ADDHN
-
-#undef DO_BINOPNB
-
-/* Fully general four-operand expander, controlled by a predicate.
- */
-#define DO_ZPZZZ(NAME, TYPE, H, OP) \
-void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
- void *vg, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc); \
- for (i = 0; i < opr_sz; ) { \
- uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
- do { \
- if (pg & 1) { \
- TYPE nn = *(TYPE *)(vn + H(i)); \
- TYPE mm = *(TYPE *)(vm + H(i)); \
- TYPE aa = *(TYPE *)(va + H(i)); \
- *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
- } \
- i += sizeof(TYPE), pg >>= sizeof(TYPE); \
- } while (i & 15); \
- } \
-}
-
-/* Similarly, specialized for 64-bit operands. */
-#define DO_ZPZZZ_D(NAME, TYPE, OP) \
-void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
- void *vg, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
- TYPE *d = vd, *a = va, *n = vn, *m = vm; \
- uint8_t *pg = vg; \
- for (i = 0; i < opr_sz; i += 1) { \
- if (pg[H1(i)] & 1) { \
- TYPE aa = a[i], nn = n[i], mm = m[i]; \
- d[i] = OP(aa, nn, mm); \
- } \
- } \
-}
-
-#define DO_MLA(A, N, M) (A + N * M)
-#define DO_MLS(A, N, M) (A - N * M)
-
-DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
-DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
-
-DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
-DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
-
-DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
-DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
-
-DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
-DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
-
-#undef DO_MLA
-#undef DO_MLS
-#undef DO_ZPZZZ
-#undef DO_ZPZZZ_D
-
-void HELPER(sve_index_b)(void *vd, uint32_t start,
- uint32_t incr, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- uint8_t *d = vd;
- for (i = 0; i < opr_sz; i += 1) {
- d[H1(i)] = start + i * incr;
- }
-}
-
-void HELPER(sve_index_h)(void *vd, uint32_t start,
- uint32_t incr, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 2;
- uint16_t *d = vd;
- for (i = 0; i < opr_sz; i += 1) {
- d[H2(i)] = start + i * incr;
- }
-}
-
-void HELPER(sve_index_s)(void *vd, uint32_t start,
- uint32_t incr, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 4;
- uint32_t *d = vd;
- for (i = 0; i < opr_sz; i += 1) {
- d[H4(i)] = start + i * incr;
- }
-}
-
-void HELPER(sve_index_d)(void *vd, uint64_t start,
- uint64_t incr, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- uint64_t *d = vd;
- for (i = 0; i < opr_sz; i += 1) {
- d[i] = start + i * incr;
- }
-}
-
-void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 4;
- uint32_t sh = simd_data(desc);
- uint32_t *d = vd, *n = vn, *m = vm;
- for (i = 0; i < opr_sz; i += 1) {
- d[i] = n[i] + (m[i] << sh);
- }
-}
-
-void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- uint64_t sh = simd_data(desc);
- uint64_t *d = vd, *n = vn, *m = vm;
- for (i = 0; i < opr_sz; i += 1) {
- d[i] = n[i] + (m[i] << sh);
- }
-}
-
-void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- uint64_t sh = simd_data(desc);
- uint64_t *d = vd, *n = vn, *m = vm;
- for (i = 0; i < opr_sz; i += 1) {
- d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
- }
-}
-
-void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- uint64_t sh = simd_data(desc);
- uint64_t *d = vd, *n = vn, *m = vm;
- for (i = 0; i < opr_sz; i += 1) {
- d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
- }
-}
-
-void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
-{
- /* These constants are cut-and-paste directly from the ARM pseudocode. */
- static const uint16_t coeff[] = {
- 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
- 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
- 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
- 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
- };
- intptr_t i, opr_sz = simd_oprsz(desc) / 2;
- uint16_t *d = vd, *n = vn;
-
- for (i = 0; i < opr_sz; i++) {
- uint16_t nn = n[i];
- intptr_t idx = extract32(nn, 0, 5);
- uint16_t exp = extract32(nn, 5, 5);
- d[i] = coeff[idx] | (exp << 10);
- }
-}
-
-void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
-{
- /* These constants are cut-and-paste directly from the ARM pseudocode. */
- static const uint32_t coeff[] = {
- 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
- 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
- 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
- 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
- 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
- 0x1ef532, 0x20b051, 0x227043, 0x243516,
- 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
- 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
- 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
- 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
- 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
- 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
- 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
- 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
- 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
- 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
- };
- intptr_t i, opr_sz = simd_oprsz(desc) / 4;
- uint32_t *d = vd, *n = vn;
-
- for (i = 0; i < opr_sz; i++) {
- uint32_t nn = n[i];
- intptr_t idx = extract32(nn, 0, 6);
- uint32_t exp = extract32(nn, 6, 8);
- d[i] = coeff[idx] | (exp << 23);
- }
-}
-
-void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
-{
- /* These constants are cut-and-paste directly from the ARM pseudocode. */
- static const uint64_t coeff[] = {
- 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
- 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
- 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
- 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
- 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
- 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
- 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
- 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
- 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
- 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
- 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
- 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
- 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
- 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
- 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
- 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
- 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
- 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
- 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
- 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
- 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
- 0xFA7C1819E90D8ull,
- };
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- uint64_t *d = vd, *n = vn;
-
- for (i = 0; i < opr_sz; i++) {
- uint64_t nn = n[i];
- intptr_t idx = extract32(nn, 0, 6);
- uint64_t exp = extract32(nn, 6, 11);
- d[i] = coeff[idx] | (exp << 52);
- }
-}
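For the single-precision variant, the helper simply splices a stored fraction under a raw exponent field; a standalone sketch (host C, using memcpy for type punning) showing that table index 0 combined with a biased exponent of 127 assembles exactly 1.0f:

#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
    uint32_t coeff0 = 0x000000;      /* coeff[0]: fraction bits of 2^(0/64) */
    uint32_t exp = 127;              /* biased exponent for 2^0             */
    uint32_t bits = coeff0 | (exp << 23);
    float f;

    memcpy(&f, &bits, sizeof(f));    /* 0x3f800000 */
    assert(f == 1.0f);
    return 0;
}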
-
-void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 2;
- uint16_t *d = vd, *n = vn, *m = vm;
- for (i = 0; i < opr_sz; i += 1) {
- uint16_t nn = n[i];
- uint16_t mm = m[i];
- if (mm & 1) {
- nn = float16_one;
- }
- d[i] = nn ^ (mm & 2) << 14;
- }
-}
-
-void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 4;
- uint32_t *d = vd, *n = vn, *m = vm;
- for (i = 0; i < opr_sz; i += 1) {
- uint32_t nn = n[i];
- uint32_t mm = m[i];
- if (mm & 1) {
- nn = float32_one;
- }
- d[i] = nn ^ (mm & 2) << 30;
- }
-}
-
-void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- uint64_t *d = vd, *n = vn, *m = vm;
- for (i = 0; i < opr_sz; i += 1) {
- uint64_t nn = n[i];
- uint64_t mm = m[i];
- if (mm & 1) {
- nn = float64_one;
- }
- d[i] = nn ^ (mm & 2) << 62;
- }
-}
-
-/*
- * Signed saturating addition with scalar operand.
- */
-
-void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
-{
- intptr_t i, oprsz = simd_oprsz(desc);
-
- for (i = 0; i < oprsz; i += sizeof(int8_t)) {
- *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
- }
-}
-
-void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
-{
- intptr_t i, oprsz = simd_oprsz(desc);
-
- for (i = 0; i < oprsz; i += sizeof(int16_t)) {
- *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
- }
-}
-
-void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
-{
- intptr_t i, oprsz = simd_oprsz(desc);
-
- for (i = 0; i < oprsz; i += sizeof(int32_t)) {
- *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
- }
-}
-
-void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
-{
- intptr_t i, oprsz = simd_oprsz(desc);
-
- for (i = 0; i < oprsz; i += sizeof(int64_t)) {
- *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
- }
-}
-
-/*
- * Unsigned saturating addition with scalar operand.
- */
-
-void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
-{
- intptr_t i, oprsz = simd_oprsz(desc);
-
- for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
- *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
- }
-}
-
-void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
-{
- intptr_t i, oprsz = simd_oprsz(desc);
-
- for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
- *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
- }
-}
-
-void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
-{
- intptr_t i, oprsz = simd_oprsz(desc);
-
- for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
- *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
- }
-}
-
-void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
-{
- intptr_t i, oprsz = simd_oprsz(desc);
-
- for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
- *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
- }
-}
-
-void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
-{
- intptr_t i, oprsz = simd_oprsz(desc);
-
- for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
- *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
- }
-}
-
-/* Two operand predicated copy immediate with merge. All valid immediates
- * can fit within 17 signed bits in the simd_data field.
- */
-void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
- uint64_t mm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- uint64_t *d = vd, *n = vn;
- uint8_t *pg = vg;
-
- mm = dup_const(MO_8, mm);
- for (i = 0; i < opr_sz; i += 1) {
- uint64_t nn = n[i];
- uint64_t pp = expand_pred_b(pg[H1(i)]);
- d[i] = (mm & pp) | (nn & ~pp);
- }
-}
-
-void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
- uint64_t mm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- uint64_t *d = vd, *n = vn;
- uint8_t *pg = vg;
-
- mm = dup_const(MO_16, mm);
- for (i = 0; i < opr_sz; i += 1) {
- uint64_t nn = n[i];
- uint64_t pp = expand_pred_h(pg[H1(i)]);
- d[i] = (mm & pp) | (nn & ~pp);
- }
-}
-
-void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
- uint64_t mm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- uint64_t *d = vd, *n = vn;
- uint8_t *pg = vg;
-
- mm = dup_const(MO_32, mm);
- for (i = 0; i < opr_sz; i += 1) {
- uint64_t nn = n[i];
- uint64_t pp = expand_pred_s(pg[H1(i)]);
- d[i] = (mm & pp) | (nn & ~pp);
- }
-}
-
-void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
- uint64_t mm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- uint64_t *d = vd, *n = vn;
- uint8_t *pg = vg;
-
- for (i = 0; i < opr_sz; i += 1) {
- uint64_t nn = n[i];
- d[i] = (pg[H1(i)] & 1 ? mm : nn);
- }
-}
-
-void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- uint64_t *d = vd;
- uint8_t *pg = vg;
-
- val = dup_const(MO_8, val);
- for (i = 0; i < opr_sz; i += 1) {
- d[i] = val & expand_pred_b(pg[H1(i)]);
- }
-}
-
-void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- uint64_t *d = vd;
- uint8_t *pg = vg;
-
- val = dup_const(MO_16, val);
- for (i = 0; i < opr_sz; i += 1) {
- d[i] = val & expand_pred_h(pg[H1(i)]);
- }
-}
-
-void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- uint64_t *d = vd;
- uint8_t *pg = vg;
-
- val = dup_const(MO_32, val);
- for (i = 0; i < opr_sz; i += 1) {
- d[i] = val & expand_pred_s(pg[H1(i)]);
- }
-}
-
-void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- uint64_t *d = vd;
- uint8_t *pg = vg;
-
- for (i = 0; i < opr_sz; i += 1) {
- d[i] = (pg[H1(i)] & 1 ? val : 0);
- }
-}
-
-/* Big-endian hosts need to frob the byte indices. If the copy
- * happens to be 8-byte aligned, then no frobbing necessary.
- */
-static void swap_memmove(void *vd, void *vs, size_t n)
-{
- uintptr_t d = (uintptr_t)vd;
- uintptr_t s = (uintptr_t)vs;
- uintptr_t o = (d | s | n) & 7;
- size_t i;
-
-#if !HOST_BIG_ENDIAN
- o = 0;
-#endif
- switch (o) {
- case 0:
- memmove(vd, vs, n);
- break;
-
- case 4:
- if (d < s || d >= s + n) {
- for (i = 0; i < n; i += 4) {
- *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
- }
- } else {
- for (i = n; i > 0; ) {
- i -= 4;
- *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
- }
- }
- break;
-
- case 2:
- case 6:
- if (d < s || d >= s + n) {
- for (i = 0; i < n; i += 2) {
- *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
- }
- } else {
- for (i = n; i > 0; ) {
- i -= 2;
- *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
- }
- }
- break;
-
- default:
- if (d < s || d >= s + n) {
- for (i = 0; i < n; i++) {
- *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
- }
- } else {
- for (i = n; i > 0; ) {
- i -= 1;
- *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
- }
- }
- break;
- }
-}
-
-/* Similarly for memset of 0. */
-static void swap_memzero(void *vd, size_t n)
-{
- uintptr_t d = (uintptr_t)vd;
- uintptr_t o = (d | n) & 7;
- size_t i;
-
- /* Usually, the first bit of a predicate is set, so N is 0. */
- if (likely(n == 0)) {
- return;
- }
-
-#if !HOST_BIG_ENDIAN
- o = 0;
-#endif
- switch (o) {
- case 0:
- memset(vd, 0, n);
- break;
-
- case 4:
- for (i = 0; i < n; i += 4) {
- *(uint32_t *)H1_4(d + i) = 0;
- }
- break;
-
- case 2:
- case 6:
- for (i = 0; i < n; i += 2) {
- *(uint16_t *)H1_2(d + i) = 0;
- }
- break;
-
- default:
- for (i = 0; i < n; i++) {
- *(uint8_t *)H1(d + i) = 0;
- }
- break;
- }
-}
-
-void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t opr_sz = simd_oprsz(desc);
- size_t n_ofs = simd_data(desc);
- size_t n_siz = opr_sz - n_ofs;
-
- if (vd != vm) {
- swap_memmove(vd, vn + n_ofs, n_siz);
- swap_memmove(vd + n_siz, vm, n_ofs);
- } else if (vd != vn) {
- swap_memmove(vd + n_siz, vd, n_ofs);
- swap_memmove(vd, vn + n_ofs, n_siz);
- } else {
- /* vd == vn == vm. Need temp space. */
- ARMVectorReg tmp;
- swap_memmove(&tmp, vm, n_ofs);
- swap_memmove(vd, vd + n_ofs, n_siz);
- memcpy(vd + n_siz, &tmp, n_ofs);
- }
-}
-
-#define DO_INSR(NAME, TYPE, H) \
-void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
-{ \
- intptr_t opr_sz = simd_oprsz(desc); \
- swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
- *(TYPE *)(vd + H(0)) = val; \
-}
-
-DO_INSR(sve_insr_b, uint8_t, H1)
-DO_INSR(sve_insr_h, uint16_t, H1_2)
-DO_INSR(sve_insr_s, uint32_t, H1_4)
-DO_INSR(sve_insr_d, uint64_t, H1_8)
-
-#undef DO_INSR
-
-void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
-{
- intptr_t i, j, opr_sz = simd_oprsz(desc);
- for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
- uint64_t f = *(uint64_t *)(vn + i);
- uint64_t b = *(uint64_t *)(vn + j);
- *(uint64_t *)(vd + i) = bswap64(b);
- *(uint64_t *)(vd + j) = bswap64(f);
- }
-}
-
-void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
-{
- intptr_t i, j, opr_sz = simd_oprsz(desc);
- for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
- uint64_t f = *(uint64_t *)(vn + i);
- uint64_t b = *(uint64_t *)(vn + j);
- *(uint64_t *)(vd + i) = hswap64(b);
- *(uint64_t *)(vd + j) = hswap64(f);
- }
-}
-
-void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
-{
- intptr_t i, j, opr_sz = simd_oprsz(desc);
- for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
- uint64_t f = *(uint64_t *)(vn + i);
- uint64_t b = *(uint64_t *)(vn + j);
- *(uint64_t *)(vd + i) = rol64(b, 32);
- *(uint64_t *)(vd + j) = rol64(f, 32);
- }
-}
-
-void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
-{
- intptr_t i, j, opr_sz = simd_oprsz(desc);
- for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
- uint64_t f = *(uint64_t *)(vn + i);
- uint64_t b = *(uint64_t *)(vn + j);
- *(uint64_t *)(vd + i) = b;
- *(uint64_t *)(vd + j) = f;
- }
-}
-
-typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);
-
-static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc,
- bool is_tbx, tb_impl_fn *fn)
-{
- ARMVectorReg scratch;
- uintptr_t oprsz = simd_oprsz(desc);
-
- if (unlikely(vd == vn)) {
- vn = memcpy(&scratch, vn, oprsz);
- }
-
- fn(vd, vn, NULL, vm, oprsz, is_tbx);
-}
-
-static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm,
- uint32_t desc, bool is_tbx, tb_impl_fn *fn)
-{
- ARMVectorReg scratch;
- uintptr_t oprsz = simd_oprsz(desc);
-
- if (unlikely(vd == vn0)) {
- vn0 = memcpy(&scratch, vn0, oprsz);
- if (vd == vn1) {
- vn1 = vn0;
- }
- } else if (unlikely(vd == vn1)) {
- vn1 = memcpy(&scratch, vn1, oprsz);
- }
-
- fn(vd, vn0, vn1, vm, oprsz, is_tbx);
-}
-
-#define DO_TB(SUFF, TYPE, H) \
-static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1, \
- void *vm, uintptr_t oprsz, bool is_tbx) \
-{ \
- TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm; \
- uintptr_t i, nelem = oprsz / sizeof(TYPE); \
- for (i = 0; i < nelem; ++i) { \
- TYPE index = indexes[H1(i)], val = 0; \
- if (index < nelem) { \
- val = tbl0[H(index)]; \
- } else { \
- index -= nelem; \
- if (tbl1 && index < nelem) { \
- val = tbl1[H(index)]; \
- } else if (is_tbx) { \
- continue; \
- } \
- } \
- d[H(i)] = val; \
- } \
-} \
-void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
-{ \
- do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF); \
-} \
-void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1, \
- void *vm, uint32_t desc) \
-{ \
- do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF); \
-} \
-void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
-{ \
- do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF); \
-}
-
-DO_TB(b, uint8_t, H1)
-DO_TB(h, uint16_t, H2)
-DO_TB(s, uint32_t, H4)
-DO_TB(d, uint64_t, H8)
-
-#undef DO_TB
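To make the is_tbx distinction concrete, a standalone sketch of the per-element rule for a 4-entry byte table (local variables only, not the helpers above): an out-of-range index produces 0 for TBL but leaves the destination element untouched for TBX.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

int main(void)
{
    const uint8_t tbl[4] = { 10, 11, 12, 13 };
    uint8_t idx[2] = { 2, 9 };                 /* 9 is out of range       */
    uint8_t d[2]   = { 0xaa, 0xbb };           /* prior destination bytes */
    bool is_tbx = true;

    for (int i = 0; i < 2; i++) {
        if (idx[i] < 4) {
            d[i] = tbl[idx[i]];
        } else if (!is_tbx) {
            d[i] = 0;                          /* TBL zeroes the element  */
        }                                      /* TBX keeps the old value */
    }
    assert(d[0] == 12 && d[1] == 0xbb);
    return 0;
}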
-
-#define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
-void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc); \
- TYPED *d = vd; \
- TYPES *n = vn; \
- ARMVectorReg tmp; \
- if (unlikely(vn - vd < opr_sz)) { \
- n = memcpy(&tmp, n, opr_sz / 2); \
- } \
- for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
- d[HD(i)] = n[HS(i)]; \
- } \
-}
-
-DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
-DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
-DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4)
-
-DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
-DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
-DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4)
-
-#undef DO_UNPK
-
-/* Mask of bits included in the even numbered predicates of width esz.
- * We also use this for expand_bits/compress_bits, and so extend the
- * same pattern out to 16-bit units.
- */
-static const uint64_t even_bit_esz_masks[5] = {
- 0x5555555555555555ull,
- 0x3333333333333333ull,
- 0x0f0f0f0f0f0f0f0full,
- 0x00ff00ff00ff00ffull,
- 0x0000ffff0000ffffull,
-};
-
-/* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
- * For N==0, this corresponds to the operation that in qemu/bitops.h
- * we call half_shuffle64; this algorithm is from Hacker's Delight,
- * section 7-2 Shuffling Bits.
- */
-static uint64_t expand_bits(uint64_t x, int n)
-{
- int i;
-
- x &= 0xffffffffu;
- for (i = 4; i >= n; i--) {
- int sh = 1 << i;
- x = ((x << sh) | x) & even_bit_esz_masks[i];
- }
- return x;
-}
-
-/* Compress units of 2**(N+1) bits to units of 2**N bits.
- * For N==0, this corresponds to the operation that in qemu/bitops.h
- * we call half_unshuffle64; this algorithm is from Hacker's Delight,
- * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
- */
-static uint64_t compress_bits(uint64_t x, int n)
-{
- int i;
-
- for (i = n; i <= 4; i++) {
- int sh = 1 << i;
- x &= even_bit_esz_masks[i];
- x = (x >> sh) | x;
- }
- return x & 0xffffffffu;
-}
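A self-contained copy of the two routines above with a couple of spot checks (illustration only, duplicating the logic rather than calling the static helpers): expanding with N == 0 interleaves a zero above every input bit, and compressing undoes it.

#include <assert.h>
#include <stdint.h>

static const uint64_t masks[5] = {
    0x5555555555555555ull, 0x3333333333333333ull, 0x0f0f0f0f0f0f0f0full,
    0x00ff00ff00ff00ffull, 0x0000ffff0000ffffull,
};

static uint64_t expand_bits_ex(uint64_t x, int n)
{
    x &= 0xffffffffu;
    for (int i = 4; i >= n; i--) {
        int sh = 1 << i;
        x = ((x << sh) | x) & masks[i];
    }
    return x;
}

static uint64_t compress_bits_ex(uint64_t x, int n)
{
    for (int i = n; i <= 4; i++) {
        int sh = 1 << i;
        x &= masks[i];
        x = (x >> sh) | x;
    }
    return x & 0xffffffffu;
}

int main(void)
{
    assert(expand_bits_ex(0xb, 0) == 0x45);      /* 1011 -> 01 00 01 01 */
    assert(expand_bits_ex(0xff, 0) == 0x5555);
    assert(compress_bits_ex(0x45, 0) == 0xb);    /* inverse of the above */
    return 0;
}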
-
-void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
-{
- intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
- int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
- intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
- int esize = 1 << esz;
- uint64_t *d = vd;
- intptr_t i;
-
- if (oprsz <= 8) {
- uint64_t nn = *(uint64_t *)vn;
- uint64_t mm = *(uint64_t *)vm;
- int half = 4 * oprsz;
-
- nn = extract64(nn, high * half, half);
- mm = extract64(mm, high * half, half);
- nn = expand_bits(nn, esz);
- mm = expand_bits(mm, esz);
- d[0] = nn | (mm << esize);
- } else {
- ARMPredicateReg tmp;
-
- /* We produce output faster than we consume input.
- Therefore we must be mindful of possible overlap. */
- if (vd == vn) {
- vn = memcpy(&tmp, vn, oprsz);
- if (vd == vm) {
- vm = vn;
- }
- } else if (vd == vm) {
- vm = memcpy(&tmp, vm, oprsz);
- }
- if (high) {
- high = oprsz >> 1;
- }
-
- if ((oprsz & 7) == 0) {
- uint32_t *n = vn, *m = vm;
- high >>= 2;
-
- for (i = 0; i < oprsz / 8; i++) {
- uint64_t nn = n[H4(high + i)];
- uint64_t mm = m[H4(high + i)];
-
- nn = expand_bits(nn, esz);
- mm = expand_bits(mm, esz);
- d[i] = nn | (mm << esize);
- }
- } else {
- uint8_t *n = vn, *m = vm;
- uint16_t *d16 = vd;
-
- for (i = 0; i < oprsz / 2; i++) {
- uint16_t nn = n[H1(high + i)];
- uint16_t mm = m[H1(high + i)];
-
- nn = expand_bits(nn, esz);
- mm = expand_bits(mm, esz);
- d16[H2(i)] = nn | (mm << esize);
- }
- }
- }
-}
-
-void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
-{
- intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
- int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
- int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
- uint64_t *d = vd, *n = vn, *m = vm;
- uint64_t l, h;
- intptr_t i;
-
- if (oprsz <= 8) {
- l = compress_bits(n[0] >> odd, esz);
- h = compress_bits(m[0] >> odd, esz);
- d[0] = l | (h << (4 * oprsz));
- } else {
- ARMPredicateReg tmp_m;
- intptr_t oprsz_16 = oprsz / 16;
-
- if ((vm - vd) < (uintptr_t)oprsz) {
- m = memcpy(&tmp_m, vm, oprsz);
- }
-
- for (i = 0; i < oprsz_16; i++) {
- l = n[2 * i + 0];
- h = n[2 * i + 1];
- l = compress_bits(l >> odd, esz);
- h = compress_bits(h >> odd, esz);
- d[i] = l | (h << 32);
- }
-
- /*
- * For VL which is not a multiple of 512, the results from M do not
- * align nicely with the uint64_t for D. Put the aligned results
- * from M into TMP_M and then copy it into place afterward.
- */
- if (oprsz & 15) {
- int final_shift = (oprsz & 15) * 2;
-
- l = n[2 * i + 0];
- h = n[2 * i + 1];
- l = compress_bits(l >> odd, esz);
- h = compress_bits(h >> odd, esz);
- d[i] = l | (h << final_shift);
-
- for (i = 0; i < oprsz_16; i++) {
- l = m[2 * i + 0];
- h = m[2 * i + 1];
- l = compress_bits(l >> odd, esz);
- h = compress_bits(h >> odd, esz);
- tmp_m.p[i] = l | (h << 32);
- }
- l = m[2 * i + 0];
- h = m[2 * i + 1];
- l = compress_bits(l >> odd, esz);
- h = compress_bits(h >> odd, esz);
- tmp_m.p[i] = l | (h << final_shift);
-
- swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
- } else {
- for (i = 0; i < oprsz_16; i++) {
- l = m[2 * i + 0];
- h = m[2 * i + 1];
- l = compress_bits(l >> odd, esz);
- h = compress_bits(h >> odd, esz);
- d[oprsz_16 + i] = l | (h << 32);
- }
- }
- }
-}
-
-void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
-{
- intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
- int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
- int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
- uint64_t *d = vd, *n = vn, *m = vm;
- uint64_t mask;
- int shr, shl;
- intptr_t i;
-
- shl = 1 << esz;
- shr = 0;
- mask = even_bit_esz_masks[esz];
- if (odd) {
- mask <<= shl;
- shr = shl;
- shl = 0;
- }
-
- for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
- uint64_t nn = (n[i] & mask) >> shr;
- uint64_t mm = (m[i] & mask) << shl;
- d[i] = nn + mm;
- }
-}
-
-/* Reverse units of 2**N bits. */
-static uint64_t reverse_bits_64(uint64_t x, int n)
-{
- int i, sh;
-
- x = bswap64(x);
- for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
- uint64_t mask = even_bit_esz_masks[i];
- x = ((x & mask) << sh) | ((x >> sh) & mask);
- }
- return x;
-}
-
-static uint8_t reverse_bits_8(uint8_t x, int n)
-{
- static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
- int i, sh;
-
- for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
- x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
- }
- return x;
-}
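The same reversal on a single byte, as a standalone spot check of the masking scheme (local copy for illustration): units of 2**N bits keep their internal order while their positions are reversed, so N == 0 reverses individual bits and N == 2 only swaps the two nibbles.

#include <assert.h>
#include <stdint.h>

static uint8_t rev8(uint8_t x, int n)
{
    static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };

    for (int i = 2, sh = 4; i >= n; i--, sh >>= 1) {
        x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
    }
    return x;
}

int main(void)
{
    assert(rev8(0x01, 0) == 0x80);   /* full bit reversal */
    assert(rev8(0x12, 2) == 0x21);   /* nibble swap only  */
    return 0;
}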
-
-void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
-{
- intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
- int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
- intptr_t i, oprsz_2 = oprsz / 2;
-
- if (oprsz <= 8) {
- uint64_t l = *(uint64_t *)vn;
- l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
- *(uint64_t *)vd = l;
- } else if ((oprsz & 15) == 0) {
- for (i = 0; i < oprsz_2; i += 8) {
- intptr_t ih = oprsz - 8 - i;
- uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
- uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
- *(uint64_t *)(vd + i) = h;
- *(uint64_t *)(vd + ih) = l;
- }
- } else {
- for (i = 0; i < oprsz_2; i += 1) {
- intptr_t il = H1(i);
- intptr_t ih = H1(oprsz - 1 - i);
- uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
- uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
- *(uint8_t *)(vd + il) = h;
- *(uint8_t *)(vd + ih) = l;
- }
- }
-}
-
-void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
-{
- intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
- intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
- uint64_t *d = vd;
- intptr_t i;
-
- if (oprsz <= 8) {
- uint64_t nn = *(uint64_t *)vn;
- int half = 4 * oprsz;
-
- nn = extract64(nn, high * half, half);
- nn = expand_bits(nn, 0);
- d[0] = nn;
- } else {
- ARMPredicateReg tmp_n;
-
- /* We produce output faster than we consume input.
- Therefore we must be mindful of possible overlap. */
- if ((vn - vd) < (uintptr_t)oprsz) {
- vn = memcpy(&tmp_n, vn, oprsz);
- }
- if (high) {
- high = oprsz >> 1;
- }
-
- if ((oprsz & 7) == 0) {
- uint32_t *n = vn;
- high >>= 2;
-
- for (i = 0; i < oprsz / 8; i++) {
- uint64_t nn = n[H4(high + i)];
- d[i] = expand_bits(nn, 0);
- }
- } else {
- uint16_t *d16 = vd;
- uint8_t *n = vn;
-
- for (i = 0; i < oprsz / 2; i++) {
- uint16_t nn = n[H1(high + i)];
- d16[H2(i)] = expand_bits(nn, 0);
- }
- }
- }
-}
-
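-/*
- * ZIP: interleave elements of N and M, taking them from byte offset
- * odd_ofs into each source (0 for the low half, oprsz / 2 for the
- * high half).
- */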
-#define DO_ZIP(NAME, TYPE, H) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
-{ \
- intptr_t oprsz = simd_oprsz(desc); \
- intptr_t odd_ofs = simd_data(desc); \
- intptr_t i, oprsz_2 = oprsz / 2; \
- ARMVectorReg tmp_n, tmp_m; \
- /* We produce output faster than we consume input. \
- Therefore we must be mindful of possible overlap. */ \
- if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
- vn = memcpy(&tmp_n, vn, oprsz); \
- } \
- if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
- vm = memcpy(&tmp_m, vm, oprsz); \
- } \
- for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
- *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \
- *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = \
- *(TYPE *)(vm + odd_ofs + H(i)); \
- } \
- if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \
- memset(vd + oprsz - 16, 0, 16); \
- } \
-}
-
-DO_ZIP(sve_zip_b, uint8_t, H1)
-DO_ZIP(sve_zip_h, uint16_t, H1_2)
-DO_ZIP(sve_zip_s, uint32_t, H1_4)
-DO_ZIP(sve_zip_d, uint64_t, H1_8)
-DO_ZIP(sve2_zip_q, Int128, )
-
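-/*
- * UZP: concatenate the even-numbered (odd_ofs == 0) or odd-numbered
- * elements of N followed by those of M.
- */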
-#define DO_UZP(NAME, TYPE, H) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
-{ \
- intptr_t oprsz = simd_oprsz(desc); \
- intptr_t odd_ofs = simd_data(desc); \
- intptr_t i, p; \
- ARMVectorReg tmp_m; \
- if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
- vm = memcpy(&tmp_m, vm, oprsz); \
- } \
- i = 0, p = odd_ofs; \
- do { \
- *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p)); \
- i += sizeof(TYPE), p += 2 * sizeof(TYPE); \
- } while (p < oprsz); \
- p -= oprsz; \
- do { \
- *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p)); \
- i += sizeof(TYPE), p += 2 * sizeof(TYPE); \
- } while (p < oprsz); \
- tcg_debug_assert(i == oprsz); \
-}
-
-DO_UZP(sve_uzp_b, uint8_t, H1)
-DO_UZP(sve_uzp_h, uint16_t, H1_2)
-DO_UZP(sve_uzp_s, uint32_t, H1_4)
-DO_UZP(sve_uzp_d, uint64_t, H1_8)
-DO_UZP(sve2_uzp_q, Int128, )
-
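-/*
- * TRN: interleave corresponding even-numbered (odd_ofs == 0) or
- * odd-numbered elements of N and M.
- */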
-#define DO_TRN(NAME, TYPE, H) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
-{ \
- intptr_t oprsz = simd_oprsz(desc); \
- intptr_t odd_ofs = simd_data(desc); \
- intptr_t i; \
- for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
- TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
- TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
- *(TYPE *)(vd + H(i + 0)) = ae; \
- *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
- } \
- if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \
- memset(vd + oprsz - 16, 0, 16); \
- } \
-}
-
-DO_TRN(sve_trn_b, uint8_t, H1)
-DO_TRN(sve_trn_h, uint16_t, H1_2)
-DO_TRN(sve_trn_s, uint32_t, H1_4)
-DO_TRN(sve_trn_d, uint64_t, H1_8)
-DO_TRN(sve2_trn_q, Int128, )
-
-#undef DO_ZIP
-#undef DO_UZP
-#undef DO_TRN
-
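-/*
- * COMPACT: copy the active elements of N to the low-numbered elements
- * of D, then zero the remainder of D.
- */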
-void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
-{
- intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
- uint32_t *d = vd, *n = vn;
- uint8_t *pg = vg;
-
- for (i = j = 0; i < opr_sz; i++) {
- if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
- d[H4(j)] = n[H4(i)];
- j++;
- }
- }
- for (; j < opr_sz; j++) {
- d[H4(j)] = 0;
- }
-}
-
-void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
-{
- intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
- uint64_t *d = vd, *n = vn;
- uint8_t *pg = vg;
-
- for (i = j = 0; i < opr_sz; i++) {
- if (pg[H1(i)] & 1) {
- d[j] = n[i];
- j++;
- }
- }
- for (; j < opr_sz; j++) {
- d[j] = 0;
- }
-}
-
-/* Similar to the ARM LastActiveElement pseudocode function, except the
- * result is multiplied by the element size. This includes the not found
- * indication; e.g. not found for esz=3 is -8.
- */
-int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
-{
- intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
- intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
-
- return last_active_element(vg, words, esz);
-}
-
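-/*
- * SPLICE: copy the elements of N from the first through the last active
- * element (inclusive) to the low part of D, then fill the remainder of D
- * with elements from M.
- */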
-void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
-{
- intptr_t opr_sz = simd_oprsz(desc) / 8;
- int esz = simd_data(desc);
- uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
- intptr_t i, first_i, last_i;
- ARMVectorReg tmp;
-
- first_i = last_i = 0;
- first_g = last_g = 0;
-
- /* Find the extent of the active elements within VG. */
- for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
- pg = *(uint64_t *)(vg + i) & mask;
- if (pg) {
- if (last_g == 0) {
- last_g = pg;
- last_i = i;
- }
- first_g = pg;
- first_i = i;
- }
- }
-
- len = 0;
- if (first_g != 0) {
- first_i = first_i * 8 + ctz64(first_g);
- last_i = last_i * 8 + 63 - clz64(last_g);
- len = last_i - first_i + (1 << esz);
- if (vd == vm) {
- vm = memcpy(&tmp, vm, opr_sz * 8);
- }
- swap_memmove(vd, vn + first_i, len);
- }
- swap_memmove(vd + len, vm, opr_sz * 8 - len);
-}
-
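-/* SEL: for each element, select N where the predicate is true, else M. */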
-void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
- void *vg, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- uint64_t *d = vd, *n = vn, *m = vm;
- uint8_t *pg = vg;
-
- for (i = 0; i < opr_sz; i += 1) {
- uint64_t nn = n[i], mm = m[i];
- uint64_t pp = expand_pred_b(pg[H1(i)]);
- d[i] = (nn & pp) | (mm & ~pp);
- }
-}
-
-void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
- void *vg, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- uint64_t *d = vd, *n = vn, *m = vm;
- uint8_t *pg = vg;
-
- for (i = 0; i < opr_sz; i += 1) {
- uint64_t nn = n[i], mm = m[i];
- uint64_t pp = expand_pred_h(pg[H1(i)]);
- d[i] = (nn & pp) | (mm & ~pp);
- }
-}
-
-void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
- void *vg, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- uint64_t *d = vd, *n = vn, *m = vm;
- uint8_t *pg = vg;
-
- for (i = 0; i < opr_sz; i += 1) {
- uint64_t nn = n[i], mm = m[i];
- uint64_t pp = expand_pred_s(pg[H1(i)]);
- d[i] = (nn & pp) | (mm & ~pp);
- }
-}
-
-void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
- void *vg, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- uint64_t *d = vd, *n = vn, *m = vm;
- uint8_t *pg = vg;
-
- for (i = 0; i < opr_sz; i += 1) {
- uint64_t nn = n[i], mm = m[i];
- d[i] = (pg[H1(i)] & 1 ? nn : mm);
- }
-}
-
-void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm,
- void *vg, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 16;
- Int128 *d = vd, *n = vn, *m = vm;
- uint16_t *pg = vg;
-
- for (i = 0; i < opr_sz; i += 1) {
- d[i] = (pg[H2(i)] & 1 ? n : m)[i];
- }
-}
-
-/* Two operand comparison controlled by a predicate.
- * ??? It is very tempting to want to be able to expand this inline
- * with x86 instructions, e.g.
- *
- * vcmpeqw zm, zn, %ymm0
- * vpmovmskb %ymm0, %eax
- * and $0x5555, %eax
- * and pg, %eax
- *
- * or even aarch64, e.g.
- *
- * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
- * cmeq v0.8h, zn, zm
- * and v0.8h, v0.8h, mask
- * addv h0, v0.8h
- * and v0.8b, pg
- *
- * However, coming up with an abstraction that allows vector inputs and
- * a scalar output, and also handles the byte-ordering of sub-uint64_t
- * scalar outputs, is tricky.
- */
-#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
-uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
-{ \
- intptr_t opr_sz = simd_oprsz(desc); \
- uint32_t flags = PREDTEST_INIT; \
- intptr_t i = opr_sz; \
- do { \
- uint64_t out = 0, pg; \
- do { \
- i -= sizeof(TYPE), out <<= sizeof(TYPE); \
- TYPE nn = *(TYPE *)(vn + H(i)); \
- TYPE mm = *(TYPE *)(vm + H(i)); \
- out |= nn OP mm; \
- } while (i & 63); \
- pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
- out &= pg; \
- *(uint64_t *)(vd + (i >> 3)) = out; \
- flags = iter_predtest_bwd(out, pg, flags); \
- } while (i > 0); \
- return flags; \
-}
-
-#define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
- DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
-#define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
- DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
-#define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
- DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
-#define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
- DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
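-
-/*
- * Each element contributes one result bit, placed at the lowest bit of
- * its sizeof(TYPE)-bit field; the MASK above keeps only those bits of
- * the governing predicate, e.g. one bit per 2 bytes (0x5555...) for
- * halfword elements.
- */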
-
-DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
-DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
-DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
-DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
-
-DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
-DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
-DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
-DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
-
-DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
-DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
-DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
-DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
-
-DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
-DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
-DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
-DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
-
-DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
-DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
-DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
-DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
-
-DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
-DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
-DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
-DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
-
-#undef DO_CMP_PPZZ_B
-#undef DO_CMP_PPZZ_H
-#undef DO_CMP_PPZZ_S
-#undef DO_CMP_PPZZ_D
-#undef DO_CMP_PPZZ
-
-/* Similar, but the second source is "wide". */
-#define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
-uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
-{ \
- intptr_t opr_sz = simd_oprsz(desc); \
- uint32_t flags = PREDTEST_INIT; \
- intptr_t i = opr_sz; \
- do { \
- uint64_t out = 0, pg; \
- do { \
- TYPEW mm = *(TYPEW *)(vm + i - 8); \
- do { \
- i -= sizeof(TYPE), out <<= sizeof(TYPE); \
- TYPE nn = *(TYPE *)(vn + H(i)); \
- out |= nn OP mm; \
- } while (i & 7); \
- } while (i & 63); \
- pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
- out &= pg; \
- *(uint64_t *)(vd + (i >> 3)) = out; \
- flags = iter_predtest_bwd(out, pg, flags); \
- } while (i > 0); \
- return flags; \
-}
-
-#define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
- DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
-#define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
- DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
-#define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
- DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
-
-DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
-DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
-DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
-
-DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
-DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
-DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
-
-DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
-DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
-DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
-
-DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
-DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
-DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
-
-DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
-DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
-DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
-
-DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
-DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
-DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
-
-DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
-DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
-DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
-
-DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
-DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
-DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
-
-DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
-DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
-DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
-
-DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
-DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
-DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
-
-#undef DO_CMP_PPZW_B
-#undef DO_CMP_PPZW_H
-#undef DO_CMP_PPZW_S
-#undef DO_CMP_PPZW
-
-/* Similar, but the second source is immediate. */
-#define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
-uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
-{ \
- intptr_t opr_sz = simd_oprsz(desc); \
- uint32_t flags = PREDTEST_INIT; \
- TYPE mm = simd_data(desc); \
- intptr_t i = opr_sz; \
- do { \
- uint64_t out = 0, pg; \
- do { \
- i -= sizeof(TYPE), out <<= sizeof(TYPE); \
- TYPE nn = *(TYPE *)(vn + H(i)); \
- out |= nn OP mm; \
- } while (i & 63); \
- pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
- out &= pg; \
- *(uint64_t *)(vd + (i >> 3)) = out; \
- flags = iter_predtest_bwd(out, pg, flags); \
- } while (i > 0); \
- return flags; \
-}
-
-#define DO_CMP_PPZI_B(NAME, TYPE, OP) \
- DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
-#define DO_CMP_PPZI_H(NAME, TYPE, OP) \
- DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
-#define DO_CMP_PPZI_S(NAME, TYPE, OP) \
- DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
-#define DO_CMP_PPZI_D(NAME, TYPE, OP) \
- DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
-
-DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
-DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
-DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
-DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
-
-DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
-DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
-DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
-DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
-
-DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
-DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
-DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
-DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
-
-DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
-DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
-DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
-DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
-
-DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
-DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
-DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
-DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
-
-DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
-DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
-DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
-DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
-
-DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
-DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
-DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
-DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
-
-DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
-DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
-DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
-DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
-
-DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
-DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
-DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
-DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
-
-DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
-DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
-DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
-DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
-
-#undef DO_CMP_PPZI_B
-#undef DO_CMP_PPZI_H
-#undef DO_CMP_PPZI_S
-#undef DO_CMP_PPZI_D
-#undef DO_CMP_PPZI
-
-/* Similar to the ARM LastActive pseudocode function. */
-static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
-{
- intptr_t i;
-
- for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
- uint64_t pg = *(uint64_t *)(vg + i);
- if (pg) {
- return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
- }
- }
- return 0;
-}
-
-/* Compute a mask into RETB that is true for all G, up to and including
- * (if after) or excluding (if !after) the first G & N.
- * Return true if BRK found.
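- *
- * For example, with G = 0b1111, N = 0b0100 and no prior break:
- * the first active N bit is bit 2, so RETB = 0b0111 when AFTER
- * and RETB = 0b0011 when !AFTER, and a break is reported.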
- */
-static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
- bool brk, bool after)
-{
- uint64_t b;
-
- if (brk) {
- b = 0;
- } else if ((g & n) == 0) {
- /* For all G, no N are set; break not found. */
- b = g;
- } else {
- /* Break somewhere in N. Locate it. */
- b = g & n; /* guard true, pred true */
- b = b & -b; /* first such */
- if (after) {
- b = b | (b - 1); /* break after same */
- } else {
- b = b - 1; /* break before same */
- }
- brk = true;
- }
-
- *retb = b;
- return brk;
-}
-
-/* Compute a zeroing BRK. */
-static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
- intptr_t oprsz, bool after)
-{
- bool brk = false;
- intptr_t i;
-
- for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
- uint64_t this_b, this_g = g[i];
-
- brk = compute_brk(&this_b, n[i], this_g, brk, after);
- d[i] = this_b & this_g;
- }
-}
-
-/* Likewise, but also compute flags. */
-static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
- intptr_t oprsz, bool after)
-{
- uint32_t flags = PREDTEST_INIT;
- bool brk = false;
- intptr_t i;
-
- for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
- uint64_t this_b, this_d, this_g = g[i];
-
- brk = compute_brk(&this_b, n[i], this_g, brk, after);
- d[i] = this_d = this_b & this_g;
- flags = iter_predtest_fwd(this_d, this_g, flags);
- }
- return flags;
-}
-
-/* Compute a merging BRK. */
-static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
- intptr_t oprsz, bool after)
-{
- bool brk = false;
- intptr_t i;
-
- for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
- uint64_t this_b, this_g = g[i];
-
- brk = compute_brk(&this_b, n[i], this_g, brk, after);
- d[i] = (this_b & this_g) | (d[i] & ~this_g);
- }
-}
-
-/* Likewise, but also compute flags. */
-static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
- intptr_t oprsz, bool after)
-{
- uint32_t flags = PREDTEST_INIT;
- bool brk = false;
- intptr_t i;
-
- for (i = 0; i < oprsz / 8; ++i) {
- uint64_t this_b, this_d = d[i], this_g = g[i];
-
- brk = compute_brk(&this_b, n[i], this_g, brk, after);
- d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
- flags = iter_predtest_fwd(this_d, this_g, flags);
- }
- return flags;
-}
-
-static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
-{
- /* It is quicker to zero the whole predicate than loop on OPRSZ.
- * The compiler should turn this into 4 64-bit integer stores.
- */
- memset(d, 0, sizeof(ARMPredicateReg));
- return PREDTEST_INIT;
-}
-
-void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
- uint32_t pred_desc)
-{
- intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
- if (last_active_pred(vn, vg, oprsz)) {
- compute_brk_z(vd, vm, vg, oprsz, true);
- } else {
- do_zero(vd, oprsz);
- }
-}
-
-uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
- uint32_t pred_desc)
-{
- intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
- if (last_active_pred(vn, vg, oprsz)) {
- return compute_brks_z(vd, vm, vg, oprsz, true);
- } else {
- return do_zero(vd, oprsz);
- }
-}
-
-void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
- uint32_t pred_desc)
-{
- intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
- if (last_active_pred(vn, vg, oprsz)) {
- compute_brk_z(vd, vm, vg, oprsz, false);
- } else {
- do_zero(vd, oprsz);
- }
-}
-
-uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
- uint32_t pred_desc)
-{
- intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
- if (last_active_pred(vn, vg, oprsz)) {
- return compute_brks_z(vd, vm, vg, oprsz, false);
- } else {
- return do_zero(vd, oprsz);
- }
-}
-
-void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
-{
- intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
- compute_brk_z(vd, vn, vg, oprsz, true);
-}
-
-uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
-{
- intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
- return compute_brks_z(vd, vn, vg, oprsz, true);
-}
-
-void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
-{
- intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
- compute_brk_z(vd, vn, vg, oprsz, false);
-}
-
-uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
-{
- intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
- return compute_brks_z(vd, vn, vg, oprsz, false);
-}
-
-void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
-{
- intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
- compute_brk_m(vd, vn, vg, oprsz, true);
-}
-
-uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
-{
- intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
- return compute_brks_m(vd, vn, vg, oprsz, true);
-}
-
-void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
-{
- intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
- compute_brk_m(vd, vn, vg, oprsz, false);
-}
-
-uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
-{
- intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
- return compute_brks_m(vd, vn, vg, oprsz, false);
-}
-
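-/*
- * BRKN: if the last active element of N is true, leave D unchanged;
- * otherwise zero D.
- */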
-void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
-{
- intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
- if (!last_active_pred(vn, vg, oprsz)) {
- do_zero(vd, oprsz);
- }
-}
-
-/* As if PredTest(Ones(PL), D, esz). */
-static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
- uint64_t esz_mask)
-{
- uint32_t flags = PREDTEST_INIT;
- intptr_t i;
-
- for (i = 0; i < oprsz / 8; i++) {
- flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
- }
- if (oprsz & 7) {
- uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
- flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
- }
- return flags;
-}
-
-uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
-{
- intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
- if (last_active_pred(vn, vg, oprsz)) {
- return predtest_ones(vd, oprsz, -1);
- } else {
- return do_zero(vd, oprsz);
- }
-}
-
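-/* CNTP: count the active elements of N within the governing predicate G. */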
-uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
-{
- intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
- intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
- uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
- intptr_t i;
-
- for (i = 0; i < words; ++i) {
- uint64_t t = n[i] & g[i] & mask;
- sum += ctpop64(t);
- }
- return sum;
-}
-
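-/*
- * WHILE (incrementing): set the low COUNT predicate bits of D, masked
- * to the element size, and return the PredTest flags for the result.
- */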
-uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
-{
- intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
- intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
- uint64_t esz_mask = pred_esz_masks[esz];
- ARMPredicateReg *d = vd;
- uint32_t flags;
- intptr_t i;
-
- /* Begin with a zero predicate register. */
- flags = do_zero(d, oprsz);
- if (count == 0) {
- return flags;
- }
-
- /* Set all of the requested bits. */
- for (i = 0; i < count / 64; ++i) {
- d->p[i] = esz_mask;
- }
- if (count & 63) {
- d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
- }
-
- return predtest_ones(d, oprsz, esz_mask);
-}
-
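-/*
- * WHILE (decrementing): as above, but fill from the top of the register,
- * setting the high COUNT predicate bits of D instead of the low ones.
- */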
-uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
-{
- intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
- intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
- uint64_t esz_mask = pred_esz_masks[esz];
- ARMPredicateReg *d = vd;
- intptr_t i, invcount, oprbits;
- uint64_t bits;
-
- if (count == 0) {
- return do_zero(d, oprsz);
- }
-
- oprbits = oprsz * 8;
- tcg_debug_assert(count <= oprbits);
-
- bits = esz_mask;
- if (oprbits & 63) {
- bits &= MAKE_64BIT_MASK(0, oprbits & 63);
- }
-
- invcount = oprbits - count;
- for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
- d->p[i] = bits;
- bits = esz_mask;
- }
-
- d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);
-
- while (--i >= 0) {
- d->p[i] = 0;
- }
-
- return predtest_ones(d, oprsz, esz_mask);
-}
-
-/* Recursive reduction on a function;
- * Cf. the ARM ARM function ReducePredicated.
- *
- * While it would be possible to write this without the DATA temporary,
- * it is much simpler to process the predicate register this way.
- * The recursion is bounded to depth 7 (128 fp16 elements), so there's
- * little to gain with a more complex non-recursive form.
- */
-#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
-static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
-{ \
- if (n == 1) { \
- return *data; \
- } else { \
- uintptr_t half = n / 2; \
- TYPE lo = NAME##_reduce(data, status, half); \
- TYPE hi = NAME##_reduce(data + half, status, half); \
- return TYPE##_##FUNC(lo, hi, status); \
- } \
-} \
-uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
-{ \
- uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \
- TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
- for (i = 0; i < oprsz; ) { \
- uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
- do { \
- TYPE nn = *(TYPE *)(vn + H(i)); \
- *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
- i += sizeof(TYPE), pg >>= sizeof(TYPE); \
- } while (i & 15); \
- } \
- for (; i < maxsz; i += sizeof(TYPE)) { \
- *(TYPE *)((void *)data + i) = IDENT; \
- } \
- return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
-}
-
-DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
-DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
-DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero)
-
-/* Identity is floatN_default_nan, without the function call. */
-DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
-DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
-DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL)
-
-DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
-DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
-DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL)
-
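-/* Identity for min is +Inf; for max, -Inf. */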
-DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
-DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
-DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity)
-
-DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
-DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
-DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity))
-
-#undef DO_REDUCE
-
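-/*
- * FADDA: strictly-ordered floating-point add reduction: accumulate the
- * active elements of M, in element order, into the scalar NN.
- */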
-uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
- void *status, uint32_t desc)
-{
- intptr_t i = 0, opr_sz = simd_oprsz(desc);
- float16 result = nn;
-
- do {
- uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
- do {
- if (pg & 1) {
- float16 mm = *(float16 *)(vm + H1_2(i));
- result = float16_add(result, mm, status);
- }
- i += sizeof(float16), pg >>= sizeof(float16);
- } while (i & 15);
- } while (i < opr_sz);
-
- return result;
-}
-
-uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
- void *status, uint32_t desc)
-{
- intptr_t i = 0, opr_sz = simd_oprsz(desc);
- float32 result = nn;
-
- do {
- uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
- do {
- if (pg & 1) {
- float32 mm = *(float32 *)(vm + H1_2(i));
- result = float32_add(result, mm, status);
- }
- i += sizeof(float32), pg >>= sizeof(float32);
- } while (i & 15);
- } while (i < opr_sz);
-
- return result;
-}
-
-uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
- void *status, uint32_t desc)
-{
- intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
- uint64_t *m = vm;
- uint8_t *pg = vg;
-
- for (i = 0; i < opr_sz; i++) {
- if (pg[H1(i)] & 1) {
- nn = float64_add(nn, m[i], status);
- }
- }
-
- return nn;
-}
-
-/* Fully general three-operand expander, controlled by a predicate,
- * with the extra float_status parameter.
- */
-#define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
- void *status, uint32_t desc) \
-{ \
- intptr_t i = simd_oprsz(desc); \
- uint64_t *g = vg; \
- do { \
- uint64_t pg = g[(i - 1) >> 6]; \
- do { \
- i -= sizeof(TYPE); \
- if (likely((pg >> (i & 63)) & 1)) { \
- TYPE nn = *(TYPE *)(vn + H(i)); \
- TYPE mm = *(TYPE *)(vm + H(i)); \
- *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
- } \
- } while (i & 63); \
- } while (i != 0); \
-}
-
-DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
-DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
-DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add)
-
-DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
-DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
-DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub)
-
-DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
-DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
-DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul)
-
-DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
-DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
-DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div)
-
-DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
-DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
-DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min)
-
-DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
-DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
-DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max)
-
-DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
-DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
-DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum)
-
-DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
-DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
-DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum)
-
-static inline float16 abd_h(float16 a, float16 b, float_status *s)
-{
- return float16_abs(float16_sub(a, b, s));
-}
-
-static inline float32 abd_s(float32 a, float32 b, float_status *s)
-{
- return float32_abs(float32_sub(a, b, s));
-}
-
-static inline float64 abd_d(float64 a, float64 b, float_status *s)
-{
- return float64_abs(float64_sub(a, b, s));
-}
-
-DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
-DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
-DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d)
-
-static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
-{
- int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
- return float64_scalbn(a, b_int, s);
-}
-
-DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
-DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
-DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d)
-
-DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
-DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
-DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd)
-
-#undef DO_ZPZZ_FP
-
-/* Three-operand expander, with one scalar operand, controlled by
- * a predicate, with the extra float_status parameter.
- */
-#define DO_ZPZS_FP(NAME, TYPE, H, OP) \
-void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
- void *status, uint32_t desc) \
-{ \
- intptr_t i = simd_oprsz(desc); \
- uint64_t *g = vg; \
- TYPE mm = scalar; \
- do { \
- uint64_t pg = g[(i - 1) >> 6]; \
- do { \
- i -= sizeof(TYPE); \
- if (likely((pg >> (i & 63)) & 1)) { \
- TYPE nn = *(TYPE *)(vn + H(i)); \
- *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
- } \
- } while (i & 63); \
- } while (i != 0); \
-}
-
-DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
-DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
-DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add)
-
-DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
-DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
-DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub)
-
-DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
-DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
-DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul)
-
-static inline float16 subr_h(float16 a, float16 b, float_status *s)
-{
- return float16_sub(b, a, s);
-}
-
-static inline float32 subr_s(float32 a, float32 b, float_status *s)
-{
- return float32_sub(b, a, s);
-}
-
-static inline float64 subr_d(float64 a, float64 b, float_status *s)
-{
- return float64_sub(b, a, s);
-}
-
-DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
-DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
-DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d)
-
-DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
-DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
-DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum)
-
-DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
-DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
-DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum)
-
-DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
-DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
-DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max)
-
-DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
-DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
-DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min)
-
-/* Fully general two-operand expander, controlled by a predicate,
- * with the extra float_status parameter.
- */
-#define DO_ZPZ_FP(NAME, TYPE, H, OP) \
-void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
-{ \
- intptr_t i = simd_oprsz(desc); \
- uint64_t *g = vg; \
- do { \
- uint64_t pg = g[(i - 1) >> 6]; \
- do { \
- i -= sizeof(TYPE); \
- if (likely((pg >> (i & 63)) & 1)) { \
- TYPE nn = *(TYPE *)(vn + H(i)); \
- *(TYPE *)(vd + H(i)) = OP(nn, status); \
- } \
- } while (i & 63); \
- } while (i != 0); \
-}
-
-/* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
- * FZ16. When converting from fp16, this affects flushing input denormals;
- * when converting to fp16, this affects flushing output denormals.
- */
-static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
-{
- bool save = get_flush_inputs_to_zero(fpst);
- float32 ret;
-
- set_flush_inputs_to_zero(false, fpst);
- ret = float16_to_float32(f, true, fpst);
- set_flush_inputs_to_zero(save, fpst);
- return ret;
-}
-
-static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
-{
- bool save = get_flush_inputs_to_zero(fpst);
- float64 ret;
-
- set_flush_inputs_to_zero(false, fpst);
- ret = float16_to_float64(f, true, fpst);
- set_flush_inputs_to_zero(save, fpst);
- return ret;
-}
-
-static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
-{
- bool save = get_flush_to_zero(fpst);
- float16 ret;
-
- set_flush_to_zero(false, fpst);
- ret = float32_to_float16(f, true, fpst);
- set_flush_to_zero(save, fpst);
- return ret;
-}
-
-static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
-{
- bool save = get_flush_to_zero(fpst);
- float16 ret;
-
- set_flush_to_zero(false, fpst);
- ret = float64_to_float16(f, true, fpst);
- set_flush_to_zero(save, fpst);
- return ret;
-}
-
-static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
-{
- if (float16_is_any_nan(f)) {
- float_raise(float_flag_invalid, s);
- return 0;
- }
- return float16_to_int16_round_to_zero(f, s);
-}
-
-static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
-{
- if (float16_is_any_nan(f)) {
- float_raise(float_flag_invalid, s);
- return 0;
- }
- return float16_to_int64_round_to_zero(f, s);
-}
-
-static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
-{
- if (float32_is_any_nan(f)) {
- float_raise(float_flag_invalid, s);
- return 0;
- }
- return float32_to_int64_round_to_zero(f, s);
-}
-
-static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
-{
- if (float64_is_any_nan(f)) {
- float_raise(float_flag_invalid, s);
- return 0;
- }
- return float64_to_int64_round_to_zero(f, s);
-}
-
-static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
-{
- if (float16_is_any_nan(f)) {
- float_raise(float_flag_invalid, s);
- return 0;
- }
- return float16_to_uint16_round_to_zero(f, s);
-}
-
-static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
-{
- if (float16_is_any_nan(f)) {
- float_raise(float_flag_invalid, s);
- return 0;
- }
- return float16_to_uint64_round_to_zero(f, s);
-}
-
-static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
-{
- if (float32_is_any_nan(f)) {
- float_raise(float_flag_invalid, s);
- return 0;
- }
- return float32_to_uint64_round_to_zero(f, s);
-}
-
-static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
-{
- if (float64_is_any_nan(f)) {
- float_raise(float_flag_invalid, s);
- return 0;
- }
- return float64_to_uint64_round_to_zero(f, s);
-}
-
-DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
-DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
-DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16)
-DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16)
-DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64)
-DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32)
-DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64)
-
-DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
-DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
-DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
-DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz)
-DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz)
-DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd)
-DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz)
-
-DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
-DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
-DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
-DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz)
-DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz)
-DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd)
-DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz)
-
-DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
-DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
-DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd)
-
-DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
-DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
-DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int)
-
-DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
-DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
-DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64)
-
-DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
-DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
-DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt)
-
-DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
-DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
-DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
-DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64)
-DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16)
-DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32)
-DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64)
-
-DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
-DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
-DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
-DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64)
-DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16)
-DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32)
-DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64)
-
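-/*
- * FLOGB: return the signed exponent of the operand as an integer.
- * Infinities return INT_MAX of the element type; zero and NaN raise
- * Invalid Operation and return INT_MIN.
- */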
-static int16_t do_float16_logb_as_int(float16 a, float_status *s)
-{
- /* Extract frac to the top of the uint32_t. */
- uint32_t frac = (uint32_t)a << (16 + 6);
- int16_t exp = extract32(a, 10, 5);
-
- if (unlikely(exp == 0)) {
- if (frac != 0) {
- if (!get_flush_inputs_to_zero(s)) {
- /* denormal: bias - fractional_zeros */
- return -15 - clz32(frac);
- }
- /* flush to zero */
- float_raise(float_flag_input_denormal, s);
- }
- } else if (unlikely(exp == 0x1f)) {
- if (frac == 0) {
- return INT16_MAX; /* infinity */
- }
- } else {
- /* normal: exp - bias */
- return exp - 15;
- }
- /* nan or zero */
- float_raise(float_flag_invalid, s);
- return INT16_MIN;
-}
-
-static int32_t do_float32_logb_as_int(float32 a, float_status *s)
-{
- /* Extract frac to the top of the uint32_t. */
- uint32_t frac = a << 9;
- int32_t exp = extract32(a, 23, 8);
-
- if (unlikely(exp == 0)) {
- if (frac != 0) {
- if (!get_flush_inputs_to_zero(s)) {
- /* denormal: bias - fractional_zeros */
- return -127 - clz32(frac);
- }
- /* flush to zero */
- float_raise(float_flag_input_denormal, s);
- }
- } else if (unlikely(exp == 0xff)) {
- if (frac == 0) {
- return INT32_MAX; /* infinity */
- }
- } else {
- /* normal: exp - bias */
- return exp - 127;
- }
- /* nan or zero */
- float_raise(float_flag_invalid, s);
- return INT32_MIN;
-}
-
-static int64_t do_float64_logb_as_int(float64 a, float_status *s)
-{
- /* Extract frac to the top of the uint64_t. */
- uint64_t frac = a << 12;
- int64_t exp = extract64(a, 52, 11);
-
- if (unlikely(exp == 0)) {
- if (frac != 0) {
- if (!get_flush_inputs_to_zero(s)) {
- /* denormal: bias - fractional_zeros */
- return -1023 - clz64(frac);
- }
- /* flush to zero */
- float_raise(float_flag_input_denormal, s);
- }
- } else if (unlikely(exp == 0x7ff)) {
- if (frac == 0) {
- return INT64_MAX; /* infinity */
- }
- } else {
- /* normal: exp - bias */
- return exp - 1023;
- }
- /* nan or zero */
- float_raise(float_flag_invalid, s);
- return INT64_MIN;
-}
-
-DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int)
-DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int)
-DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int)
-
-#undef DO_ZPZ_FP
-
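-/*
- * Predicated FP multiply-add family.  NEG1 and NEG3 are XORed into the
- * sign bits of the first multiplicand and the addend respectively,
- * producing FMLA, FMLS, FNMLA and FNMLS from a single routine.
- */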
-static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
- float_status *status, uint32_t desc,
- uint16_t neg1, uint16_t neg3)
-{
- intptr_t i = simd_oprsz(desc);
- uint64_t *g = vg;
-
- do {
- uint64_t pg = g[(i - 1) >> 6];
- do {
- i -= 2;
- if (likely((pg >> (i & 63)) & 1)) {
- float16 e1, e2, e3, r;
-
- e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
- e2 = *(uint16_t *)(vm + H1_2(i));
- e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
- r = float16_muladd(e1, e2, e3, 0, status);
- *(uint16_t *)(vd + H1_2(i)) = r;
- }
- } while (i & 63);
- } while (i != 0);
-}
-
-void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
- void *vg, void *status, uint32_t desc)
-{
- do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
-}
-
-void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
- void *vg, void *status, uint32_t desc)
-{
- do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
-}
-
-void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
- void *vg, void *status, uint32_t desc)
-{
- do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
-}
-
-void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
- void *vg, void *status, uint32_t desc)
-{
- do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
-}
-
-static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
- float_status *status, uint32_t desc,
- uint32_t neg1, uint32_t neg3)
-{
- intptr_t i = simd_oprsz(desc);
- uint64_t *g = vg;
-
- do {
- uint64_t pg = g[(i - 1) >> 6];
- do {
- i -= 4;
- if (likely((pg >> (i & 63)) & 1)) {
- float32 e1, e2, e3, r;
-
- e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
- e2 = *(uint32_t *)(vm + H1_4(i));
- e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
- r = float32_muladd(e1, e2, e3, 0, status);
- *(uint32_t *)(vd + H1_4(i)) = r;
- }
- } while (i & 63);
- } while (i != 0);
-}
-
-void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
- void *vg, void *status, uint32_t desc)
-{
- do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
-}
-
-void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
- void *vg, void *status, uint32_t desc)
-{
- do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
-}
-
-void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
- void *vg, void *status, uint32_t desc)
-{
- do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
-}
-
-void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
- void *vg, void *status, uint32_t desc)
-{
- do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
-}
-
-static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
- float_status *status, uint32_t desc,
- uint64_t neg1, uint64_t neg3)
-{
- intptr_t i = simd_oprsz(desc);
- uint64_t *g = vg;
-
- do {
- uint64_t pg = g[(i - 1) >> 6];
- do {
- i -= 8;
- if (likely((pg >> (i & 63)) & 1)) {
- float64 e1, e2, e3, r;
-
- e1 = *(uint64_t *)(vn + i) ^ neg1;
- e2 = *(uint64_t *)(vm + i);
- e3 = *(uint64_t *)(va + i) ^ neg3;
- r = float64_muladd(e1, e2, e3, 0, status);
- *(uint64_t *)(vd + i) = r;
- }
- } while (i & 63);
- } while (i != 0);
-}
-
-void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
- void *vg, void *status, uint32_t desc)
-{
- do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
-}
-
-void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
- void *vg, void *status, uint32_t desc)
-{
- do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
-}
-
-void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
- void *vg, void *status, uint32_t desc)
-{
- do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
-}
-
-void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
- void *vg, void *status, uint32_t desc)
-{
- do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
-}
-
-/* Two operand floating-point comparison controlled by a predicate.
- * Unlike the integer version, we are not allowed to optimistically
- * compare operands, since the comparison may have side effects wrt
- * the FPSR.
- */
-#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
- void *status, uint32_t desc) \
-{ \
- intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
- uint64_t *d = vd, *g = vg; \
- do { \
- uint64_t out = 0, pg = g[j]; \
- do { \
- i -= sizeof(TYPE), out <<= sizeof(TYPE); \
- if (likely((pg >> (i & 63)) & 1)) { \
- TYPE nn = *(TYPE *)(vn + H(i)); \
- TYPE mm = *(TYPE *)(vm + H(i)); \
- out |= OP(TYPE, nn, mm, status); \
- } \
- } while (i & 63); \
- d[j--] = out; \
- } while (i > 0); \
-}
-
-#define DO_FPCMP_PPZZ_H(NAME, OP) \
- DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
-#define DO_FPCMP_PPZZ_S(NAME, OP) \
- DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
-#define DO_FPCMP_PPZZ_D(NAME, OP) \
- DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP)
-
-#define DO_FPCMP_PPZZ_ALL(NAME, OP) \
- DO_FPCMP_PPZZ_H(NAME, OP) \
- DO_FPCMP_PPZZ_S(NAME, OP) \
- DO_FPCMP_PPZZ_D(NAME, OP)
-
-#define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
-#define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
-#define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
-#define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
-#define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
-#define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
-#define DO_FCMUO(TYPE, X, Y, ST) \
- TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
-#define DO_FACGE(TYPE, X, Y, ST) \
- TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
-#define DO_FACGT(TYPE, X, Y, ST) \
- TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
-
-DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
-DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
-DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
-DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
-DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
-DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
-DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
-
-#undef DO_FPCMP_PPZZ_ALL
-#undef DO_FPCMP_PPZZ_D
-#undef DO_FPCMP_PPZZ_S
-#undef DO_FPCMP_PPZZ_H
-#undef DO_FPCMP_PPZZ
-
-/* One operand floating-point comparison against zero, controlled
- * by a predicate.
- */
-#define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
-void HELPER(NAME)(void *vd, void *vn, void *vg, \
- void *status, uint32_t desc) \
-{ \
- intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
- uint64_t *d = vd, *g = vg; \
- do { \
- uint64_t out = 0, pg = g[j]; \
- do { \
- i -= sizeof(TYPE), out <<= sizeof(TYPE); \
- if ((pg >> (i & 63)) & 1) { \
- TYPE nn = *(TYPE *)(vn + H(i)); \
- out |= OP(TYPE, nn, 0, status); \
- } \
- } while (i & 63); \
- d[j--] = out; \
- } while (i > 0); \
-}
-
-#define DO_FPCMP_PPZ0_H(NAME, OP) \
- DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
-#define DO_FPCMP_PPZ0_S(NAME, OP) \
- DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
-#define DO_FPCMP_PPZ0_D(NAME, OP) \
- DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP)
-
-#define DO_FPCMP_PPZ0_ALL(NAME, OP) \
- DO_FPCMP_PPZ0_H(NAME, OP) \
- DO_FPCMP_PPZ0_S(NAME, OP) \
- DO_FPCMP_PPZ0_D(NAME, OP)
-
-DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
-DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
-DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
-DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
-DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
-DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
-
-/* FP Trig Multiply-Add. */
-
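-/*
- * A negative M selects the second half of the coefficient table and the
- * multiply uses |M|; the two table halves appear to hold the sine- and
- * cosine-series constants of the architectural FPTrigMAdd operation.
- */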
-void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
-{
- static const float16 coeff[16] = {
- 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
- };
- intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
- intptr_t x = simd_data(desc);
- float16 *d = vd, *n = vn, *m = vm;
- for (i = 0; i < opr_sz; i++) {
- float16 mm = m[i];
- intptr_t xx = x;
- if (float16_is_neg(mm)) {
- mm = float16_abs(mm);
- xx += 8;
- }
- d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
- }
-}
-
-void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
-{
- static const float32 coeff[16] = {
- 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
- 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
- 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
- 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
- };
- intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
- intptr_t x = simd_data(desc);
- float32 *d = vd, *n = vn, *m = vm;
- for (i = 0; i < opr_sz; i++) {
- float32 mm = m[i];
- intptr_t xx = x;
- if (float32_is_neg(mm)) {
- mm = float32_abs(mm);
- xx += 8;
- }
- d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
- }
-}
-
-void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
-{
- static const float64 coeff[16] = {
- 0x3ff0000000000000ull, 0xbfc5555555555543ull,
- 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
- 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
- 0x3de5d8408868552full, 0x0000000000000000ull,
- 0x3ff0000000000000ull, 0xbfe0000000000000ull,
- 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
- 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
- 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
- };
- intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
- intptr_t x = simd_data(desc);
- float64 *d = vd, *n = vn, *m = vm;
- for (i = 0; i < opr_sz; i++) {
- float64 mm = m[i];
- intptr_t xx = x;
- if (float64_is_neg(mm)) {
- mm = float64_abs(mm);
- xx += 8;
- }
- d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
- }
-}
-
-/*
- * FP Complex Add
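- *
- * FCADD adds M rotated by 90 degrees (rot == 0) or 270 degrees (rot == 1)
- * in the complex plane to N; the rotation is applied by crossing the real
- * and imaginary lanes of M and flipping signs via neg_real / neg_imag.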
- */
-
-void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
- void *vs, uint32_t desc)
-{
- intptr_t j, i = simd_oprsz(desc);
- uint64_t *g = vg;
- float16 neg_imag = float16_set_sign(0, simd_data(desc));
- float16 neg_real = float16_chs(neg_imag);
-
- do {
- uint64_t pg = g[(i - 1) >> 6];
- do {
- float16 e0, e1, e2, e3;
-
- /* I holds the real index; J holds the imag index. */
- j = i - sizeof(float16);
- i -= 2 * sizeof(float16);
-
- e0 = *(float16 *)(vn + H1_2(i));
- e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
- e2 = *(float16 *)(vn + H1_2(j));
- e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
-
- if (likely((pg >> (i & 63)) & 1)) {
- *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
- }
- if (likely((pg >> (j & 63)) & 1)) {
- *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
- }
- } while (i & 63);
- } while (i != 0);
-}
-
-void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
- void *vs, uint32_t desc)
-{
- intptr_t j, i = simd_oprsz(desc);
- uint64_t *g = vg;
- float32 neg_imag = float32_set_sign(0, simd_data(desc));
- float32 neg_real = float32_chs(neg_imag);
-
- do {
- uint64_t pg = g[(i - 1) >> 6];
- do {
- float32 e0, e1, e2, e3;
-
- /* I holds the real index; J holds the imag index. */
- j = i - sizeof(float32);
- i -= 2 * sizeof(float32);
-
- e0 = *(float32 *)(vn + H1_2(i));
- e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
- e2 = *(float32 *)(vn + H1_2(j));
- e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
-
- if (likely((pg >> (i & 63)) & 1)) {
- *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
- }
- if (likely((pg >> (j & 63)) & 1)) {
- *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
- }
- } while (i & 63);
- } while (i != 0);
-}
-
-void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
- void *vs, uint32_t desc)
-{
- intptr_t j, i = simd_oprsz(desc);
- uint64_t *g = vg;
- float64 neg_imag = float64_set_sign(0, simd_data(desc));
- float64 neg_real = float64_chs(neg_imag);
-
- do {
- uint64_t pg = g[(i - 1) >> 6];
- do {
- float64 e0, e1, e2, e3;
-
- /* I holds the real index; J holds the imag index. */
- j = i - sizeof(float64);
- i -= 2 * sizeof(float64);
-
- e0 = *(float64 *)(vn + H1_2(i));
- e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
- e2 = *(float64 *)(vn + H1_2(j));
- e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
-
- if (likely((pg >> (i & 63)) & 1)) {
- *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
- }
- if (likely((pg >> (j & 63)) & 1)) {
- *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
- }
- } while (i & 63);
- } while (i != 0);
-}
-
-/*
- * FP Complex Multiply
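- *
- * FCMLA: rot selects a rotation of 0, 90, 180 or 270 degrees.  FLIP
- * chooses whether the real or imaginary part of N feeds both products,
- * and NEG_REAL / NEG_IMAG supply the sign flips applied to M.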
- */
-
-void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
- void *vg, void *status, uint32_t desc)
-{
- intptr_t j, i = simd_oprsz(desc);
- unsigned rot = simd_data(desc);
- bool flip = rot & 1;
- float16 neg_imag, neg_real;
- uint64_t *g = vg;
-
- neg_imag = float16_set_sign(0, (rot & 2) != 0);
- neg_real = float16_set_sign(0, rot == 1 || rot == 2);
-
- do {
- uint64_t pg = g[(i - 1) >> 6];
- do {
- float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
-
- /* I holds the real index; J holds the imag index. */
- j = i - sizeof(float16);
- i -= 2 * sizeof(float16);
-
- nr = *(float16 *)(vn + H1_2(i));
- ni = *(float16 *)(vn + H1_2(j));
- mr = *(float16 *)(vm + H1_2(i));
- mi = *(float16 *)(vm + H1_2(j));
-
- e2 = (flip ? ni : nr);
- e1 = (flip ? mi : mr) ^ neg_real;
- e4 = e2;
- e3 = (flip ? mr : mi) ^ neg_imag;
-
- if (likely((pg >> (i & 63)) & 1)) {
- d = *(float16 *)(va + H1_2(i));
- d = float16_muladd(e2, e1, d, 0, status);
- *(float16 *)(vd + H1_2(i)) = d;
- }
- if (likely((pg >> (j & 63)) & 1)) {
- d = *(float16 *)(va + H1_2(j));
- d = float16_muladd(e4, e3, d, 0, status);
- *(float16 *)(vd + H1_2(j)) = d;
- }
- } while (i & 63);
- } while (i != 0);
-}
-
-void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
- void *vg, void *status, uint32_t desc)
-{
- intptr_t j, i = simd_oprsz(desc);
- unsigned rot = simd_data(desc);
- bool flip = rot & 1;
- float32 neg_imag, neg_real;
- uint64_t *g = vg;
-
- neg_imag = float32_set_sign(0, (rot & 2) != 0);
- neg_real = float32_set_sign(0, rot == 1 || rot == 2);
-
- do {
- uint64_t pg = g[(i - 1) >> 6];
- do {
- float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
-
- /* I holds the real index; J holds the imag index. */
- j = i - sizeof(float32);
- i -= 2 * sizeof(float32);
-
- nr = *(float32 *)(vn + H1_2(i));
- ni = *(float32 *)(vn + H1_2(j));
- mr = *(float32 *)(vm + H1_2(i));
- mi = *(float32 *)(vm + H1_2(j));
-
- e2 = (flip ? ni : nr);
- e1 = (flip ? mi : mr) ^ neg_real;
- e4 = e2;
- e3 = (flip ? mr : mi) ^ neg_imag;
-
- if (likely((pg >> (i & 63)) & 1)) {
- d = *(float32 *)(va + H1_2(i));
- d = float32_muladd(e2, e1, d, 0, status);
- *(float32 *)(vd + H1_2(i)) = d;
- }
- if (likely((pg >> (j & 63)) & 1)) {
- d = *(float32 *)(va + H1_2(j));
- d = float32_muladd(e4, e3, d, 0, status);
- *(float32 *)(vd + H1_2(j)) = d;
- }
- } while (i & 63);
- } while (i != 0);
-}
-
-void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
- void *vg, void *status, uint32_t desc)
-{
- intptr_t j, i = simd_oprsz(desc);
- unsigned rot = simd_data(desc);
- bool flip = rot & 1;
- float64 neg_imag, neg_real;
- uint64_t *g = vg;
-
- neg_imag = float64_set_sign(0, (rot & 2) != 0);
- neg_real = float64_set_sign(0, rot == 1 || rot == 2);
-
- do {
- uint64_t pg = g[(i - 1) >> 6];
- do {
- float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
-
- /* I holds the real index; J holds the imag index. */
- j = i - sizeof(float64);
- i -= 2 * sizeof(float64);
-
- nr = *(float64 *)(vn + H1_2(i));
- ni = *(float64 *)(vn + H1_2(j));
- mr = *(float64 *)(vm + H1_2(i));
- mi = *(float64 *)(vm + H1_2(j));
-
- e2 = (flip ? ni : nr);
- e1 = (flip ? mi : mr) ^ neg_real;
- e4 = e2;
- e3 = (flip ? mr : mi) ^ neg_imag;
-
- if (likely((pg >> (i & 63)) & 1)) {
- d = *(float64 *)(va + H1_2(i));
- d = float64_muladd(e2, e1, d, 0, status);
- *(float64 *)(vd + H1_2(i)) = d;
- }
- if (likely((pg >> (j & 63)) & 1)) {
- d = *(float64 *)(va + H1_2(j));
- d = float64_muladd(e4, e3, d, 0, status);
- *(float64 *)(vd + H1_2(j)) = d;
- }
- } while (i & 63);
- } while (i != 0);
-}
-
-/*
- * Load contiguous data, protected by a governing predicate.
- */
-
-/*
- * Skip through a sequence of inactive elements in the guarding predicate @vg,
- * beginning at @reg_off bounded by @reg_max. Return the offset of the active
- * element >= @reg_off, or @reg_max if there were no active elements at all.
- */
-static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
- intptr_t reg_max, int esz)
-{
- uint64_t pg_mask = pred_esz_masks[esz];
- uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
-
- /* In normal usage, the first element is active. */
- if (likely(pg & 1)) {
- return reg_off;
- }
-
- if (pg == 0) {
- reg_off &= -64;
- do {
- reg_off += 64;
- if (unlikely(reg_off >= reg_max)) {
- /* The entire predicate was false. */
- return reg_max;
- }
- pg = vg[reg_off >> 6] & pg_mask;
- } while (pg == 0);
- }
- reg_off += ctz64(pg);
-
- /* We should never see an out of range predicate bit set. */
- tcg_debug_assert(reg_off < reg_max);
- return reg_off;
-}
-
-/*
- * Resolve the guest virtual address to info->host and info->flags.
- * If @nofault, return false if the page is invalid, otherwise
- * exit via page fault exception.
- */
-
-bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env,
- target_ulong addr, int mem_off, MMUAccessType access_type,
- int mmu_idx, uintptr_t retaddr)
-{
- int flags;
-
- addr += mem_off;
-
- /*
- * User-only currently always issues with TBI. See the comment
- * above useronly_clean_ptr. Usually we clean this top byte away
- * during translation, but we can't do that for e.g. vector + imm
- * addressing modes.
- *
- * We currently always enable TBI for user-only, and do not provide
- * a way to turn it off. So clean the pointer unconditionally here,
- * rather than looking it up here or passing it down from above.
- */
- addr = useronly_clean_ptr(addr);
-
-#ifdef CONFIG_USER_ONLY
- flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault,
- &info->host, retaddr);
-#else
- CPUTLBEntryFull *full;
- flags = probe_access_full(env, addr, access_type, mmu_idx, nofault,
- &info->host, &full, retaddr);
-#endif
- info->flags = flags;
-
- if (flags & TLB_INVALID_MASK) {
- g_assert(nofault);
- return false;
- }
-
-#ifdef CONFIG_USER_ONLY
- memset(&info->attrs, 0, sizeof(info->attrs));
- /* Require both ANON and MTE; see allocation_tag_mem(). */
- info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE);
-#else
- info->attrs = full->attrs;
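- /* Attribute 0xf0 is the MAIR encoding for Tagged Normal memory. */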
- info->tagged = full->pte_attrs == 0xf0;
-#endif
-
- /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
- info->host -= mem_off;
- return true;
-}
-
-/*
- * Find first active element on each page, and a loose bound for the
- * final element on each page. Identify any single element that spans
- * the page boundary. Return true if there are any active elements.
- */
-bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg,
- intptr_t reg_max, int esz, int msize)
-{
- const int esize = 1 << esz;
- const uint64_t pg_mask = pred_esz_masks[esz];
- intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
- intptr_t mem_off_last, mem_off_split;
- intptr_t page_split, elt_split;
- intptr_t i;
-
- /* Set all of the element indices to -1, and the TLB data to 0. */
- memset(info, -1, offsetof(SVEContLdSt, page));
- memset(info->page, 0, sizeof(info->page));
-
- /* Gross scan over the entire predicate to find bounds. */
- i = 0;
- do {
- uint64_t pg = vg[i] & pg_mask;
- if (pg) {
- reg_off_last = i * 64 + 63 - clz64(pg);
- if (reg_off_first < 0) {
- reg_off_first = i * 64 + ctz64(pg);
- }
- }
- } while (++i * 64 < reg_max);
-
- if (unlikely(reg_off_first < 0)) {
- /* No active elements, no pages touched. */
- return false;
- }
- tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
-
- info->reg_off_first[0] = reg_off_first;
- info->mem_off_first[0] = (reg_off_first >> esz) * msize;
- mem_off_last = (reg_off_last >> esz) * msize;
-
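- /* Number of bytes from addr to the end of its page. */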
- page_split = -(addr | TARGET_PAGE_MASK);
- if (likely(mem_off_last + msize <= page_split)) {
- /* The entire operation fits within a single page. */
- info->reg_off_last[0] = reg_off_last;
- return true;
- }
-
- info->page_split = page_split;
- elt_split = page_split / msize;
- reg_off_split = elt_split << esz;
- mem_off_split = elt_split * msize;
-
- /*
- * This is the last full element on the first page, but it is not
- * necessarily active. If there is no full element, i.e. the first
- * active element is the one that's split, this value remains -1.
- * It is useful as an iteration bound.
- */
- if (elt_split != 0) {
- info->reg_off_last[0] = reg_off_split - esize;
- }
-
- /* Determine if an unaligned element spans the pages. */
- if (page_split % msize != 0) {
- /* It is helpful to know if the split element is active. */
- if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
- info->reg_off_split = reg_off_split;
- info->mem_off_split = mem_off_split;
-
- if (reg_off_split == reg_off_last) {
- /* The page crossing element is last. */
- return true;
- }
- }
- reg_off_split += esize;
- mem_off_split += msize;
- }
-
- /*
- * We do want the first active element on the second page, because
- * this may affect the address reported in an exception.
- */
- reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
- tcg_debug_assert(reg_off_split <= reg_off_last);
- info->reg_off_first[1] = reg_off_split;
- info->mem_off_first[1] = (reg_off_split >> esz) * msize;
- info->reg_off_last[1] = reg_off_last;
- return true;
-}
-
-/*
- * Resolve the guest virtual addresses to info->page[].
- * Control the generation of page faults with @fault. Return false if
- * there is no work to do, which can only happen with @fault == FAULT_NO.
- */
-bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
- CPUARMState *env, target_ulong addr,
- MMUAccessType access_type, uintptr_t retaddr)
-{
- int mmu_idx = cpu_mmu_index(env, false);
- int mem_off = info->mem_off_first[0];
- bool nofault = fault == FAULT_NO;
- bool have_work = true;
-
- if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
- access_type, mmu_idx, retaddr)) {
- /* No work to be done. */
- return false;
- }
-
- if (likely(info->page_split < 0)) {
- /* The entire operation was on the one page. */
- return true;
- }
-
- /*
- * If the second page is invalid, then we want the fault address to be
- * the first byte on that page which is accessed.
- */
- if (info->mem_off_split >= 0) {
- /*
- * There is an element split across the pages. The fault address
- * should be the first byte of the second page.
- */
- mem_off = info->page_split;
- /*
- * If the split element is also the first active element
- * of the vector, then: For first-fault we should continue
- * to generate faults for the second page. For no-fault,
- * we have work only if the second page is valid.
- */
- if (info->mem_off_first[0] < info->mem_off_split) {
- nofault = FAULT_FIRST;
- have_work = false;
- }
- } else {
- /*
- * There is no element split across the pages. The fault address
- * should be the first active element on the second page.
- */
- mem_off = info->mem_off_first[1];
- /*
- * There must have been one active element on the first page,
- * so we're out of first-fault territory.
- */
- nofault = fault != FAULT_ALL;
- }
-
- have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
- access_type, mmu_idx, retaddr);
- return have_work;
-}
-
-#ifndef CONFIG_USER_ONLY
-void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
- uint64_t *vg, target_ulong addr,
- int esize, int msize, int wp_access,
- uintptr_t retaddr)
-{
- intptr_t mem_off, reg_off, reg_last;
- int flags0 = info->page[0].flags;
- int flags1 = info->page[1].flags;
-
- if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
- return;
- }
-
- /* Indicate that watchpoints are handled. */
- info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
- info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
-
- if (flags0 & TLB_WATCHPOINT) {
- mem_off = info->mem_off_first[0];
- reg_off = info->reg_off_first[0];
- reg_last = info->reg_off_last[0];
-
- while (reg_off <= reg_last) {
- uint64_t pg = vg[reg_off >> 6];
- do {
- if ((pg >> (reg_off & 63)) & 1) {
- cpu_check_watchpoint(env_cpu(env), addr + mem_off,
- msize, info->page[0].attrs,
- wp_access, retaddr);
- }
- reg_off += esize;
- mem_off += msize;
- } while (reg_off <= reg_last && (reg_off & 63));
- }
- }
-
- mem_off = info->mem_off_split;
- if (mem_off >= 0) {
- cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
- info->page[0].attrs, wp_access, retaddr);
- }
-
- mem_off = info->mem_off_first[1];
- if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
- reg_off = info->reg_off_first[1];
- reg_last = info->reg_off_last[1];
-
- do {
- uint64_t pg = vg[reg_off >> 6];
- do {
- if ((pg >> (reg_off & 63)) & 1) {
- cpu_check_watchpoint(env_cpu(env), addr + mem_off,
- msize, info->page[1].attrs,
- wp_access, retaddr);
- }
- reg_off += esize;
- mem_off += msize;
- } while (reg_off & 63);
- } while (reg_off <= reg_last);
- }
-}
-#endif
-
-void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
- uint64_t *vg, target_ulong addr, int esize,
- int msize, uint32_t mtedesc, uintptr_t ra)
-{
- intptr_t mem_off, reg_off, reg_last;
-
- /* Process the page only if MemAttr == Tagged. */
- if (info->page[0].tagged) {
- mem_off = info->mem_off_first[0];
- reg_off = info->reg_off_first[0];
- reg_last = info->reg_off_split;
- if (reg_last < 0) {
- reg_last = info->reg_off_last[0];
- }
-
- do {
- uint64_t pg = vg[reg_off >> 6];
- do {
- if ((pg >> (reg_off & 63)) & 1) {
- mte_check(env, mtedesc, addr, ra);
- }
- reg_off += esize;
- mem_off += msize;
- } while (reg_off <= reg_last && (reg_off & 63));
- } while (reg_off <= reg_last);
- }
-
- mem_off = info->mem_off_first[1];
- if (mem_off >= 0 && info->page[1].tagged) {
- reg_off = info->reg_off_first[1];
- reg_last = info->reg_off_last[1];
-
- do {
- uint64_t pg = vg[reg_off >> 6];
- do {
- if ((pg >> (reg_off & 63)) & 1) {
- mte_check(env, mtedesc, addr, ra);
- }
- reg_off += esize;
- mem_off += msize;
- } while (reg_off & 63);
- } while (reg_off <= reg_last);
- }
-}
-
-/*
- * Common helper for all contiguous 1,2,3,4-register predicated loads.
- */
-static inline QEMU_ALWAYS_INLINE
-void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
- uint32_t desc, const uintptr_t retaddr,
- const int esz, const int msz, const int N, uint32_t mtedesc,
- sve_ldst1_host_fn *host_fn,
- sve_ldst1_tlb_fn *tlb_fn)
-{
- const unsigned rd = simd_data(desc);
- const intptr_t reg_max = simd_oprsz(desc);
- intptr_t reg_off, reg_last, mem_off;
- SVEContLdSt info;
- void *host;
- int flags, i;
-
- /* Find the active elements. */
- if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
- /* The entire predicate was false; no load occurs. */
- for (i = 0; i < N; ++i) {
- memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
- }
- return;
- }
-
- /* Probe the page(s). Exit with exception for any invalid page. */
- sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
-
- /* Handle watchpoints for all active elements. */
- sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
- BP_MEM_READ, retaddr);
-
- /*
- * Handle MTE checks for all active elements.
- * Since TBI must be set for MTE, !mtedesc => !mte_active.
- */
- if (mtedesc) {
- sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
- mtedesc, retaddr);
- }
-
- flags = info.page[0].flags | info.page[1].flags;
- if (unlikely(flags != 0)) {
-#ifdef CONFIG_USER_ONLY
- g_assert_not_reached();
-#else
- /*
- * At least one page includes MMIO.
- * Any bus operation can fail with cpu_transaction_failed,
- * which for ARM will raise SyncExternal. Perform the load
- * into scratch memory to preserve register state until the end.
- */
- ARMVectorReg scratch[4] = { };
-
- mem_off = info.mem_off_first[0];
- reg_off = info.reg_off_first[0];
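- /*
- * The last active element is on the second page if there is one,
- * else it is the element split across the pages, else it is
- * wholly within the first page.
- */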
- reg_last = info.reg_off_last[1];
- if (reg_last < 0) {
- reg_last = info.reg_off_split;
- if (reg_last < 0) {
- reg_last = info.reg_off_last[0];
- }
- }
-
- do {
- uint64_t pg = vg[reg_off >> 6];
- do {
- if ((pg >> (reg_off & 63)) & 1) {
- for (i = 0; i < N; ++i) {
- tlb_fn(env, &scratch[i], reg_off,
- addr + mem_off + (i << msz), retaddr);
- }
- }
- reg_off += 1 << esz;
- mem_off += N << msz;
- } while (reg_off & 63);
- } while (reg_off <= reg_last);
-
- for (i = 0; i < N; ++i) {
- memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
- }
- return;
-#endif
- }
-
- /* The entire operation is in RAM, on valid pages. */
-
- for (i = 0; i < N; ++i) {
- memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
- }
-
- mem_off = info.mem_off_first[0];
- reg_off = info.reg_off_first[0];
- reg_last = info.reg_off_last[0];
- host = info.page[0].host;
-
- while (reg_off <= reg_last) {
- uint64_t pg = vg[reg_off >> 6];
- do {
- if ((pg >> (reg_off & 63)) & 1) {
- for (i = 0; i < N; ++i) {
- host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
- host + mem_off + (i << msz));
- }
- }
- reg_off += 1 << esz;
- mem_off += N << msz;
- } while (reg_off <= reg_last && (reg_off & 63));
- }
-
- /*
- * Use the slow path to manage the cross-page misalignment.
- * But we know this is RAM and cannot trap.
- */
- mem_off = info.mem_off_split;
- if (unlikely(mem_off >= 0)) {
- reg_off = info.reg_off_split;
- for (i = 0; i < N; ++i) {
- tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
- addr + mem_off + (i << msz), retaddr);
- }
- }
-
- mem_off = info.mem_off_first[1];
- if (unlikely(mem_off >= 0)) {
- reg_off = info.reg_off_first[1];
- reg_last = info.reg_off_last[1];
- host = info.page[1].host;
-
- do {
- uint64_t pg = vg[reg_off >> 6];
- do {
- if ((pg >> (reg_off & 63)) & 1) {
- for (i = 0; i < N; ++i) {
- host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
- host + mem_off + (i << msz));
- }
- }
- reg_off += 1 << esz;
- mem_off += N << msz;
- } while (reg_off & 63);
- } while (reg_off <= reg_last);
- }
-}
-
-static inline QEMU_ALWAYS_INLINE
-void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
- uint32_t desc, const uintptr_t ra,
- const int esz, const int msz, const int N,
- sve_ldst1_host_fn *host_fn,
- sve_ldst1_tlb_fn *tlb_fn)
-{
- uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
- int bit55 = extract64(addr, 55, 1);
-
- /* Remove mtedesc from the normal sve descriptor. */
- desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
-
- /* Perform gross MTE suppression early. */
- if (!tbi_check(desc, bit55) ||
- tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
- mtedesc = 0;
- }
-
- sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
-}
-
-#define DO_LD1_1(NAME, ESZ) \
-void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \
- sve_##NAME##_host, sve_##NAME##_tlb); \
-} \
-void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \
- sve_##NAME##_host, sve_##NAME##_tlb); \
-}
-
-#define DO_LD1_2(NAME, ESZ, MSZ) \
-void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
- sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
-} \
-void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
- sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
-} \
-void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
- sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
-} \
-void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
- sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
-}
-
-DO_LD1_1(ld1bb, MO_8)
-DO_LD1_1(ld1bhu, MO_16)
-DO_LD1_1(ld1bhs, MO_16)
-DO_LD1_1(ld1bsu, MO_32)
-DO_LD1_1(ld1bss, MO_32)
-DO_LD1_1(ld1bdu, MO_64)
-DO_LD1_1(ld1bds, MO_64)
-
-DO_LD1_2(ld1hh, MO_16, MO_16)
-DO_LD1_2(ld1hsu, MO_32, MO_16)
-DO_LD1_2(ld1hss, MO_32, MO_16)
-DO_LD1_2(ld1hdu, MO_64, MO_16)
-DO_LD1_2(ld1hds, MO_64, MO_16)
-
-DO_LD1_2(ld1ss, MO_32, MO_32)
-DO_LD1_2(ld1sdu, MO_64, MO_32)
-DO_LD1_2(ld1sds, MO_64, MO_32)
-
-DO_LD1_2(ld1dd, MO_64, MO_64)
-
-#undef DO_LD1_1
-#undef DO_LD1_2
-
-#define DO_LDN_1(N) \
-void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \
- sve_ld1bb_host, sve_ld1bb_tlb); \
-} \
-void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \
- sve_ld1bb_host, sve_ld1bb_tlb); \
-}
-
-#define DO_LDN_2(N, SUFF, ESZ) \
-void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
- sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
-} \
-void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
- sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
-} \
-void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
- sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
-} \
-void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
- sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
-}
-
-DO_LDN_1(2)
-DO_LDN_1(3)
-DO_LDN_1(4)
-
-DO_LDN_2(2, hh, MO_16)
-DO_LDN_2(3, hh, MO_16)
-DO_LDN_2(4, hh, MO_16)
-
-DO_LDN_2(2, ss, MO_32)
-DO_LDN_2(3, ss, MO_32)
-DO_LDN_2(4, ss, MO_32)
-
-DO_LDN_2(2, dd, MO_64)
-DO_LDN_2(3, dd, MO_64)
-DO_LDN_2(4, dd, MO_64)
-
-#undef DO_LDN_1
-#undef DO_LDN_2
-
-/*
- * Load contiguous data, first-fault and no-fault.
- *
- * For user-only, one could argue that we should hold the mmap_lock during
- * the operation so that there is no race between page_check_range and the
- * load operation. However, unmapping pages out from under a running thread
- * is extraordinarily unlikely. This theoretical race condition also affects
- * linux-user/ in its get_user/put_user macros.
- *
- * TODO: Construct some helpers, written in assembly, that interact with
- * host_signal_handler to produce memory ops which can properly report errors
- * without racing.
- */
-
-/*
- * Fault on byte I. All bits in FFR from I are cleared. The vector
- * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
- * option, which leaves subsequent data unchanged.
- */
-static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
-{
- uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
-
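- /* Clear FFR from bit i within its word, then zero all following words. */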
- if (i & 63) {
- ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
- i = ROUND_UP(i, 64);
- }
- for (; i < oprsz; i += 64) {
- ffr[i / 64] = 0;
- }
-}
-
-/*
- * Common helper for all contiguous no-fault and first-fault loads.
- */
-static inline QEMU_ALWAYS_INLINE
-void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
- uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
- const int esz, const int msz, const SVEContFault fault,
- sve_ldst1_host_fn *host_fn,
- sve_ldst1_tlb_fn *tlb_fn)
-{
- const unsigned rd = simd_data(desc);
- void *vd = &env->vfp.zregs[rd];
- const intptr_t reg_max = simd_oprsz(desc);
- intptr_t reg_off, mem_off, reg_last;
- SVEContLdSt info;
- int flags;
- void *host;
-
- /* Find the active elements. */
- if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
- /* The entire predicate was false; no load occurs. */
- memset(vd, 0, reg_max);
- return;
- }
- reg_off = info.reg_off_first[0];
-
- /* Probe the page(s). */
- if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
- /* Fault on first element. */
- tcg_debug_assert(fault == FAULT_NO);
- memset(vd, 0, reg_max);
- goto do_fault;
- }
-
- mem_off = info.mem_off_first[0];
- flags = info.page[0].flags;
-
- /*
- * Disable MTE checking if the Tagged bit is not set. Since TBI must
- * be set within MTEDESC for MTE, !mtedesc => !mte_active.
- */
- if (!info.page[0].tagged) {
- mtedesc = 0;
- }
-
- if (fault == FAULT_FIRST) {
- /* Trapping mte check for the first-fault element. */
- if (mtedesc) {
- mte_check(env, mtedesc, addr + mem_off, retaddr);
- }
-
- /*
- * Special handling of the first active element,
- * if it crosses a page boundary or is MMIO.
- */
- bool is_split = mem_off == info.mem_off_split;
- if (unlikely(flags != 0) || unlikely(is_split)) {
- /*
- * Use the slow path for cross-page handling.
- * Might trap for MMIO or watchpoints.
- */
- tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
-
- /* After any fault, zero the other elements. */
- swap_memzero(vd, reg_off);
- reg_off += 1 << esz;
- mem_off += 1 << msz;
- swap_memzero(vd + reg_off, reg_max - reg_off);
-
- if (is_split) {
- goto second_page;
- }
- } else {
- memset(vd, 0, reg_max);
- }
- } else {
- memset(vd, 0, reg_max);
- if (unlikely(mem_off == info.mem_off_split)) {
- /* The first active element crosses a page boundary. */
- flags |= info.page[1].flags;
- if (unlikely(flags & TLB_MMIO)) {
- /* Some page is MMIO, see below. */
- goto do_fault;
- }
- if (unlikely(flags & TLB_WATCHPOINT) &&
- (cpu_watchpoint_address_matches
- (env_cpu(env), addr + mem_off, 1 << msz)
- & BP_MEM_READ)) {
- /* Watchpoint hit, see below. */
- goto do_fault;
- }
- if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
- goto do_fault;
- }
- /*
- * Use the slow path for cross-page handling.
- * This is RAM, without a watchpoint, and will not trap.
- */
- tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
- goto second_page;
- }
- }
-
- /*
- * From this point on, all memory operations are MemSingleNF.
- *
- * Per the MemSingleNF pseudocode, a no-fault load from Device memory
- * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
- *
- * Unfortunately we do not have access to the memory attributes from the
- * PTE to tell Device memory from Normal memory. So we make a mostly
- * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
- * This gives the right answer for the common cases of "Normal memory,
- * backed by host RAM" and "Device memory, backed by MMIO".
- * The architecture allows us to suppress an NF load and return
- * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
- * case of "Normal memory, backed by MMIO" is permitted. The case we
- * get wrong is "Device memory, backed by host RAM", for which we
- * should return (UNKNOWN, FAULT) but do not.
- *
- * Similarly, CPU_BP breakpoints would raise exceptions, and so
- * return (UNKNOWN, FAULT). For simplicity, we consider gdb and
- * architectural breakpoints the same.
- */
- if (unlikely(flags & TLB_MMIO)) {
- goto do_fault;
- }
-
- reg_last = info.reg_off_last[0];
- host = info.page[0].host;
-
- do {
- uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
- do {
- if ((pg >> (reg_off & 63)) & 1) {
- if (unlikely(flags & TLB_WATCHPOINT) &&
- (cpu_watchpoint_address_matches
- (env_cpu(env), addr + mem_off, 1 << msz)
- & BP_MEM_READ)) {
- goto do_fault;
- }
- if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
- goto do_fault;
- }
- host_fn(vd, reg_off, host + mem_off);
- }
- reg_off += 1 << esz;
- mem_off += 1 << msz;
- } while (reg_off <= reg_last && (reg_off & 63));
- } while (reg_off <= reg_last);
-
- /*
- * MemSingleNF is allowed to fail for any reason. We have special
- * code above to handle the first element crossing a page boundary.
- * As an implementation choice, decline to handle a cross-page element
- * in any other position.
- */
- reg_off = info.reg_off_split;
- if (reg_off >= 0) {
- goto do_fault;
- }
-
- second_page:
- reg_off = info.reg_off_first[1];
- if (likely(reg_off < 0)) {
- /* No active elements on the second page. All done. */
- return;
- }
-
- /*
- * MemSingleNF is allowed to fail for any reason. As an implementation
- * choice, decline to handle elements on the second page. This should
- * be low frequency as the guest walks through memory -- the next
- * iteration of the guest's loop should be aligned on the page boundary,
- * and then all following iterations will stay aligned.
- */
-
- do_fault:
- record_fault(env, reg_off, reg_max);
-}
-
-static inline QEMU_ALWAYS_INLINE
-void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
- uint32_t desc, const uintptr_t retaddr,
- const int esz, const int msz, const SVEContFault fault,
- sve_ldst1_host_fn *host_fn,
- sve_ldst1_tlb_fn *tlb_fn)
-{
- uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
- int bit55 = extract64(addr, 55, 1);
-
- /* Remove mtedesc from the normal sve descriptor. */
- desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
-
- /* Perform gross MTE suppression early. */
- if (!tbi_check(desc, bit55) ||
- tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
- mtedesc = 0;
- }
-
- sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
- esz, msz, fault, host_fn, tlb_fn);
-}
-
-#define DO_LDFF1_LDNF1_1(PART, ESZ) \
-void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
- sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
-} \
-void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
- sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
-} \
-void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
- sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
-} \
-void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
- sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
-}
-
-#define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
-void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
- sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
-} \
-void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
- sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
-} \
-void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
- sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
-} \
-void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
- sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
-} \
-void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
- sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
-} \
-void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
- sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
-} \
-void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
- sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
-} \
-void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
- sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
-}
-
-DO_LDFF1_LDNF1_1(bb, MO_8)
-DO_LDFF1_LDNF1_1(bhu, MO_16)
-DO_LDFF1_LDNF1_1(bhs, MO_16)
-DO_LDFF1_LDNF1_1(bsu, MO_32)
-DO_LDFF1_LDNF1_1(bss, MO_32)
-DO_LDFF1_LDNF1_1(bdu, MO_64)
-DO_LDFF1_LDNF1_1(bds, MO_64)
-
-DO_LDFF1_LDNF1_2(hh, MO_16, MO_16)
-DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
-DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
-DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
-DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
-
-DO_LDFF1_LDNF1_2(ss, MO_32, MO_32)
-DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
-DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
-
-DO_LDFF1_LDNF1_2(dd, MO_64, MO_64)
-
-#undef DO_LDFF1_LDNF1_1
-#undef DO_LDFF1_LDNF1_2
-
-/*
- * Common helper for all contiguous 1,2,3,4-register predicated stores.
- */
-
-static inline QEMU_ALWAYS_INLINE
-void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
- uint32_t desc, const uintptr_t retaddr,
- const int esz, const int msz, const int N, uint32_t mtedesc,
- sve_ldst1_host_fn *host_fn,
- sve_ldst1_tlb_fn *tlb_fn)
-{
- const unsigned rd = simd_data(desc);
- const intptr_t reg_max = simd_oprsz(desc);
- intptr_t reg_off, reg_last, mem_off;
- SVEContLdSt info;
- void *host;
- int i, flags;
-
- /* Find the active elements. */
- if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
- /* The entire predicate was false; no store occurs. */
- return;
- }
-
- /* Probe the page(s). Exit with exception for any invalid page. */
- sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
-
- /* Handle watchpoints for all active elements. */
- sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
- BP_MEM_WRITE, retaddr);
-
- /*
- * Handle MTE checks for all active elements.
- * Since TBI must be set for MTE, !mtedesc => !mte_active.
- */
- if (mtedesc) {
- sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
- mtedesc, retaddr);
- }
-
- flags = info.page[0].flags | info.page[1].flags;
- if (unlikely(flags != 0)) {
-#ifdef CONFIG_USER_ONLY
- g_assert_not_reached();
-#else
- /*
- * At least one page includes MMIO.
- * Any bus operation can fail with cpu_transaction_failed,
- * which for ARM will raise SyncExternal. We cannot avoid
- * this fault and will leave with the store incomplete.
- */
- mem_off = info.mem_off_first[0];
- reg_off = info.reg_off_first[0];
- reg_last = info.reg_off_last[1];
- if (reg_last < 0) {
- reg_last = info.reg_off_split;
- if (reg_last < 0) {
- reg_last = info.reg_off_last[0];
- }
- }
-
- do {
- uint64_t pg = vg[reg_off >> 6];
- do {
- if ((pg >> (reg_off & 63)) & 1) {
- for (i = 0; i < N; ++i) {
- tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
- addr + mem_off + (i << msz), retaddr);
- }
- }
- reg_off += 1 << esz;
- mem_off += N << msz;
- } while (reg_off & 63);
- } while (reg_off <= reg_last);
- return;
-#endif
- }
-
- mem_off = info.mem_off_first[0];
- reg_off = info.reg_off_first[0];
- reg_last = info.reg_off_last[0];
- host = info.page[0].host;
-
- while (reg_off <= reg_last) {
- uint64_t pg = vg[reg_off >> 6];
- do {
- if ((pg >> (reg_off & 63)) & 1) {
- for (i = 0; i < N; ++i) {
- host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
- host + mem_off + (i << msz));
- }
- }
- reg_off += 1 << esz;
- mem_off += N << msz;
- } while (reg_off <= reg_last && (reg_off & 63));
- }
-
- /*
- * Use the slow path to manage the cross-page misalignment.
- * But we know this is RAM and cannot trap.
- */
- mem_off = info.mem_off_split;
- if (unlikely(mem_off >= 0)) {
- reg_off = info.reg_off_split;
- for (i = 0; i < N; ++i) {
- tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
- addr + mem_off + (i << msz), retaddr);
- }
- }
-
- mem_off = info.mem_off_first[1];
- if (unlikely(mem_off >= 0)) {
- reg_off = info.reg_off_first[1];
- reg_last = info.reg_off_last[1];
- host = info.page[1].host;
-
- do {
- uint64_t pg = vg[reg_off >> 6];
- do {
- if ((pg >> (reg_off & 63)) & 1) {
- for (i = 0; i < N; ++i) {
- host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
- host + mem_off + (i << msz));
- }
- }
- reg_off += 1 << esz;
- mem_off += N << msz;
- } while (reg_off & 63);
- } while (reg_off <= reg_last);
- }
-}
-
-static inline QEMU_ALWAYS_INLINE
-void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
- uint32_t desc, const uintptr_t ra,
- const int esz, const int msz, const int N,
- sve_ldst1_host_fn *host_fn,
- sve_ldst1_tlb_fn *tlb_fn)
-{
- uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
- int bit55 = extract64(addr, 55, 1);
-
- /* Remove mtedesc from the normal sve descriptor. */
- desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
-
- /* Perform gross MTE suppression early. */
- if (!tbi_check(desc, bit55) ||
- tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
- mtedesc = 0;
- }
-
- sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
-}
-
-#define DO_STN_1(N, NAME, ESZ) \
-void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \
- sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
-} \
-void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \
- sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
-}
-
-#define DO_STN_2(N, NAME, ESZ, MSZ) \
-void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
- sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
-} \
-void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
- sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
-} \
-void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
- sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
-} \
-void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
- sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
-}
-
-DO_STN_1(1, bb, MO_8)
-DO_STN_1(1, bh, MO_16)
-DO_STN_1(1, bs, MO_32)
-DO_STN_1(1, bd, MO_64)
-DO_STN_1(2, bb, MO_8)
-DO_STN_1(3, bb, MO_8)
-DO_STN_1(4, bb, MO_8)
-
-DO_STN_2(1, hh, MO_16, MO_16)
-DO_STN_2(1, hs, MO_32, MO_16)
-DO_STN_2(1, hd, MO_64, MO_16)
-DO_STN_2(2, hh, MO_16, MO_16)
-DO_STN_2(3, hh, MO_16, MO_16)
-DO_STN_2(4, hh, MO_16, MO_16)
-
-DO_STN_2(1, ss, MO_32, MO_32)
-DO_STN_2(1, sd, MO_64, MO_32)
-DO_STN_2(2, ss, MO_32, MO_32)
-DO_STN_2(3, ss, MO_32, MO_32)
-DO_STN_2(4, ss, MO_32, MO_32)
-
-DO_STN_2(1, dd, MO_64, MO_64)
-DO_STN_2(2, dd, MO_64, MO_64)
-DO_STN_2(3, dd, MO_64, MO_64)
-DO_STN_2(4, dd, MO_64, MO_64)
-
-#undef DO_STN_1
-#undef DO_STN_2
-
-/*
- * Loads with a vector index.
- */
-
-/*
- * Load the element at @reg + @reg_ofs, sign- or zero-extending as needed.
- */
-typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
-
-static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
-{
- return *(uint32_t *)(reg + H1_4(reg_ofs));
-}
-
-static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
-{
- return *(int32_t *)(reg + H1_4(reg_ofs));
-}
-
-static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
-{
- return (uint32_t)*(uint64_t *)(reg + reg_ofs);
-}
-
-static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
-{
- return (int32_t)*(uint64_t *)(reg + reg_ofs);
-}
-
-static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
-{
- return *(uint64_t *)(reg + reg_ofs);
-}
-
-static inline QEMU_ALWAYS_INLINE
-void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
- target_ulong base, uint32_t desc, uintptr_t retaddr,
- uint32_t mtedesc, int esize, int msize,
- zreg_off_fn *off_fn,
- sve_ldst1_host_fn *host_fn,
- sve_ldst1_tlb_fn *tlb_fn)
-{
- const int mmu_idx = cpu_mmu_index(env, false);
- const intptr_t reg_max = simd_oprsz(desc);
- const int scale = simd_data(desc);
- ARMVectorReg scratch;
- intptr_t reg_off;
- SVEHostPage info, info2;
-
- memset(&scratch, 0, reg_max);
- reg_off = 0;
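- /* One predicate bit per vector byte: consume esize bits per element. */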
- do {
- uint64_t pg = vg[reg_off >> 6];
- do {
- if (likely(pg & 1)) {
- target_ulong addr = base + (off_fn(vm, reg_off) << scale);
- target_ulong in_page = -(addr | TARGET_PAGE_MASK);
-
- sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
- mmu_idx, retaddr);
-
- if (likely(in_page >= msize)) {
- if (unlikely(info.flags & TLB_WATCHPOINT)) {
- cpu_check_watchpoint(env_cpu(env), addr, msize,
- info.attrs, BP_MEM_READ, retaddr);
- }
- if (mtedesc && info.tagged) {
- mte_check(env, mtedesc, addr, retaddr);
- }
- if (unlikely(info.flags & TLB_MMIO)) {
- tlb_fn(env, &scratch, reg_off, addr, retaddr);
- } else {
- host_fn(&scratch, reg_off, info.host);
- }
- } else {
- /* Element crosses the page boundary. */
- sve_probe_page(&info2, false, env, addr + in_page, 0,
- MMU_DATA_LOAD, mmu_idx, retaddr);
- if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
- cpu_check_watchpoint(env_cpu(env), addr,
- msize, info.attrs,
- BP_MEM_READ, retaddr);
- }
- if (mtedesc && info.tagged) {
- mte_check(env, mtedesc, addr, retaddr);
- }
- tlb_fn(env, &scratch, reg_off, addr, retaddr);
- }
- }
- reg_off += esize;
- pg >>= esize;
- } while (reg_off & 63);
- } while (reg_off < reg_max);
-
- /* Wait until all exceptions have been raised to write back. */
- memcpy(vd, &scratch, reg_max);
-}
-
-static inline QEMU_ALWAYS_INLINE
-void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
- target_ulong base, uint32_t desc, uintptr_t retaddr,
- int esize, int msize, zreg_off_fn *off_fn,
- sve_ldst1_host_fn *host_fn,
- sve_ldst1_tlb_fn *tlb_fn)
-{
- uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
- /* Remove mtedesc from the normal sve descriptor. */
- desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
-
- /*
- * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
- * offset base entirely over the address space hole to change the
- * pointer tag, or change the bit55 selector. So we could examine
- * TBI + TCMA here, as we do for sve_ldN_r_mte().
- */
- sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
- esize, msize, off_fn, host_fn, tlb_fn);
-}
-
-#define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
-void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
- void *vm, target_ulong base, uint32_t desc) \
-{ \
- sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
- off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
-} \
-void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
- void *vm, target_ulong base, uint32_t desc) \
-{ \
- sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
- off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
-}
-
-#define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
-void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
- void *vm, target_ulong base, uint32_t desc) \
-{ \
- sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
- off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
-} \
-void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
- void *vm, target_ulong base, uint32_t desc) \
-{ \
- sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
- off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
-}
-
-DO_LD1_ZPZ_S(bsu, zsu, MO_8)
-DO_LD1_ZPZ_S(bsu, zss, MO_8)
-DO_LD1_ZPZ_D(bdu, zsu, MO_8)
-DO_LD1_ZPZ_D(bdu, zss, MO_8)
-DO_LD1_ZPZ_D(bdu, zd, MO_8)
-
-DO_LD1_ZPZ_S(bss, zsu, MO_8)
-DO_LD1_ZPZ_S(bss, zss, MO_8)
-DO_LD1_ZPZ_D(bds, zsu, MO_8)
-DO_LD1_ZPZ_D(bds, zss, MO_8)
-DO_LD1_ZPZ_D(bds, zd, MO_8)
-
-DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
-DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
-DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
-DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
-DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
-
-DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
-DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
-DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
-DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
-DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
-
-DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
-DO_LD1_ZPZ_S(hss_le, zss, MO_16)
-DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
-DO_LD1_ZPZ_D(hds_le, zss, MO_16)
-DO_LD1_ZPZ_D(hds_le, zd, MO_16)
-
-DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
-DO_LD1_ZPZ_S(hss_be, zss, MO_16)
-DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
-DO_LD1_ZPZ_D(hds_be, zss, MO_16)
-DO_LD1_ZPZ_D(hds_be, zd, MO_16)
-
-DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
-DO_LD1_ZPZ_S(ss_le, zss, MO_32)
-DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
-DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
-DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
-
-DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
-DO_LD1_ZPZ_S(ss_be, zss, MO_32)
-DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
-DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
-DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
-
-DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
-DO_LD1_ZPZ_D(sds_le, zss, MO_32)
-DO_LD1_ZPZ_D(sds_le, zd, MO_32)
-
-DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
-DO_LD1_ZPZ_D(sds_be, zss, MO_32)
-DO_LD1_ZPZ_D(sds_be, zd, MO_32)
-
-DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
-DO_LD1_ZPZ_D(dd_le, zss, MO_64)
-DO_LD1_ZPZ_D(dd_le, zd, MO_64)
-
-DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
-DO_LD1_ZPZ_D(dd_be, zss, MO_64)
-DO_LD1_ZPZ_D(dd_be, zd, MO_64)
-
-#undef DO_LD1_ZPZ_S
-#undef DO_LD1_ZPZ_D
-
-/* First-fault loads with a vector index. */
-
-/*
- * Common helpers for all gather first-faulting loads.
- */
-
-static inline QEMU_ALWAYS_INLINE
-void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
- target_ulong base, uint32_t desc, uintptr_t retaddr,
- uint32_t mtedesc, const int esz, const int msz,
- zreg_off_fn *off_fn,
- sve_ldst1_host_fn *host_fn,
- sve_ldst1_tlb_fn *tlb_fn)
-{
- const int mmu_idx = cpu_mmu_index(env, false);
- const intptr_t reg_max = simd_oprsz(desc);
- const int scale = simd_data(desc);
- const int esize = 1 << esz;
- const int msize = 1 << msz;
- intptr_t reg_off;
- SVEHostPage info;
- target_ulong addr, in_page;
-
- /* Skip to the first true predicate. */
- reg_off = find_next_active(vg, 0, reg_max, esz);
- if (unlikely(reg_off >= reg_max)) {
- /* The entire predicate was false; no load occurs. */
- memset(vd, 0, reg_max);
- return;
- }
-
- /*
- * Probe the first element, allowing faults.
- */
- addr = base + (off_fn(vm, reg_off) << scale);
- if (mtedesc) {
- mte_check(env, mtedesc, addr, retaddr);
- }
- tlb_fn(env, vd, reg_off, addr, retaddr);
-
- /* After any fault, zero the other elements. */
- swap_memzero(vd, reg_off);
- reg_off += esize;
- swap_memzero(vd + reg_off, reg_max - reg_off);
-
- /*
- * Probe the remaining elements, not allowing faults.
- */
- while (reg_off < reg_max) {
- uint64_t pg = vg[reg_off >> 6];
- do {
- if (likely((pg >> (reg_off & 63)) & 1)) {
- addr = base + (off_fn(vm, reg_off) << scale);
- in_page = -(addr | TARGET_PAGE_MASK);
-
- if (unlikely(in_page < msize)) {
- /* Stop if the element crosses a page boundary. */
- goto fault;
- }
-
- sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
- mmu_idx, retaddr);
- if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
- goto fault;
- }
- if (unlikely(info.flags & TLB_WATCHPOINT) &&
- (cpu_watchpoint_address_matches
- (env_cpu(env), addr, msize) & BP_MEM_READ)) {
- goto fault;
- }
- if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) {
- goto fault;
- }
-
- host_fn(vd, reg_off, info.host);
- }
- reg_off += esize;
- } while (reg_off & 63);
- }
- return;
-
- fault:
- record_fault(env, reg_off, reg_max);
-}
-
-static inline QEMU_ALWAYS_INLINE
-void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
- target_ulong base, uint32_t desc, uintptr_t retaddr,
- const int esz, const int msz,
- zreg_off_fn *off_fn,
- sve_ldst1_host_fn *host_fn,
- sve_ldst1_tlb_fn *tlb_fn)
-{
- uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
- /* Remove mtedesc from the normal sve descriptor. */
- desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
-
- /*
- * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
- * offset base entirely over the address space hole to change the
- * pointer tag, or change the bit55 selector. So we could examine
- * TBI + TCMA here, as we do for sve_ldN_r_mte().
- */
- sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
- esz, msz, off_fn, host_fn, tlb_fn);
-}
-
-#define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \
-void HELPER(sve_ldff##MEM##_##OFS) \
- (CPUARMState *env, void *vd, void *vg, \
- void *vm, target_ulong base, uint32_t desc) \
-{ \
- sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \
- off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
-} \
-void HELPER(sve_ldff##MEM##_##OFS##_mte) \
- (CPUARMState *env, void *vd, void *vg, \
- void *vm, target_ulong base, uint32_t desc) \
-{ \
- sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \
- off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
-}
-
-#define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \
-void HELPER(sve_ldff##MEM##_##OFS) \
- (CPUARMState *env, void *vd, void *vg, \
- void *vm, target_ulong base, uint32_t desc) \
-{ \
- sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \
- off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
-} \
-void HELPER(sve_ldff##MEM##_##OFS##_mte) \
- (CPUARMState *env, void *vd, void *vg, \
- void *vm, target_ulong base, uint32_t desc) \
-{ \
- sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \
- off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
-}
-
-DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
-DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
-DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
-DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
-DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
-
-DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
-DO_LDFF1_ZPZ_S(bss, zss, MO_8)
-DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
-DO_LDFF1_ZPZ_D(bds, zss, MO_8)
-DO_LDFF1_ZPZ_D(bds, zd, MO_8)
-
-DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
-DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
-DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
-DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
-DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
-
-DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
-DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
-DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
-DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
-DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
-
-DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
-DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
-DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
-DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
-DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
-
-DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
-DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
-DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
-DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
-DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
-
-DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
-DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
-DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
-DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
-DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
-
-DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
-DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
-DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
-DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
-DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
-
-DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
-DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
-DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
-
-DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
-DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
-DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
-
-DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
-DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
-DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
-
-DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
-DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
-DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
-
-/* Stores with a vector index. */
-
-static inline QEMU_ALWAYS_INLINE
-void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
- target_ulong base, uint32_t desc, uintptr_t retaddr,
- uint32_t mtedesc, int esize, int msize,
- zreg_off_fn *off_fn,
- sve_ldst1_host_fn *host_fn,
- sve_ldst1_tlb_fn *tlb_fn)
-{
- const int mmu_idx = cpu_mmu_index(env, false);
- const intptr_t reg_max = simd_oprsz(desc);
- const int scale = simd_data(desc);
- void *host[ARM_MAX_VQ * 4];
- intptr_t reg_off, i;
- SVEHostPage info, info2;
-
- /*
- * Probe all of the elements for host addresses and flags.
- */
- i = reg_off = 0;
- do {
- uint64_t pg = vg[reg_off >> 6];
- do {
- target_ulong addr = base + (off_fn(vm, reg_off) << scale);
- target_ulong in_page = -(addr | TARGET_PAGE_MASK);
-
- host[i] = NULL;
- if (likely((pg >> (reg_off & 63)) & 1)) {
- if (likely(in_page >= msize)) {
- sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
- mmu_idx, retaddr);
- if (!(info.flags & TLB_MMIO)) {
- host[i] = info.host;
- }
- } else {
- /*
- * Element crosses the page boundary.
- * Probe both pages, but do not record the host address,
- * so that we use the slow path.
- */
- sve_probe_page(&info, false, env, addr, 0,
- MMU_DATA_STORE, mmu_idx, retaddr);
- sve_probe_page(&info2, false, env, addr + in_page, 0,
- MMU_DATA_STORE, mmu_idx, retaddr);
- info.flags |= info2.flags;
- }
-
- if (unlikely(info.flags & TLB_WATCHPOINT)) {
- cpu_check_watchpoint(env_cpu(env), addr, msize,
- info.attrs, BP_MEM_WRITE, retaddr);
- }
-
- if (mtedesc && info.tagged) {
- mte_check(env, mtedesc, addr, retaddr);
- }
- }
- i += 1;
- reg_off += esize;
- } while (reg_off & 63);
- } while (reg_off < reg_max);
-
- /*
- * Now that we have recognized all exceptions except SyncExternal
- * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
- *
- * Note for the common case of an element in RAM, not crossing a page
- * boundary, we have stored the host address in host[]. This doubles
- * as a first-level check against the predicate, since only enabled
- * elements have non-null host addresses.
- */
- i = reg_off = 0;
- do {
- void *h = host[i];
- if (likely(h != NULL)) {
- host_fn(vd, reg_off, h);
- } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
- target_ulong addr = base + (off_fn(vm, reg_off) << scale);
- tlb_fn(env, vd, reg_off, addr, retaddr);
- }
- i += 1;
- reg_off += esize;
- } while (reg_off < reg_max);
-}
-
-static inline QEMU_ALWAYS_INLINE
-void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
- target_ulong base, uint32_t desc, uintptr_t retaddr,
- int esize, int msize, zreg_off_fn *off_fn,
- sve_ldst1_host_fn *host_fn,
- sve_ldst1_tlb_fn *tlb_fn)
-{
- uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
- /* Remove mtedesc from the normal sve descriptor. */
- desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
-
- /*
- * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
- * offset base entirely over the address space hole to change the
- * pointer tag, or change the bit55 selector. So we could examine
- * TBI + TCMA here, as we do for sve_ldN_r_mte().
- */
- sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
- esize, msize, off_fn, host_fn, tlb_fn);
-}
-
-#define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \
-void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
- void *vm, target_ulong base, uint32_t desc) \
-{ \
- sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
- off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
-} \
-void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
- void *vm, target_ulong base, uint32_t desc) \
-{ \
- sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
- off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
-}
-
-#define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \
-void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
- void *vm, target_ulong base, uint32_t desc) \
-{ \
- sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
- off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
-} \
-void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
- void *vm, target_ulong base, uint32_t desc) \
-{ \
- sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
- off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
-}
-
-DO_ST1_ZPZ_S(bs, zsu, MO_8)
-DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
-DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
-DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
-DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
-
-DO_ST1_ZPZ_S(bs, zss, MO_8)
-DO_ST1_ZPZ_S(hs_le, zss, MO_16)
-DO_ST1_ZPZ_S(hs_be, zss, MO_16)
-DO_ST1_ZPZ_S(ss_le, zss, MO_32)
-DO_ST1_ZPZ_S(ss_be, zss, MO_32)
-
-DO_ST1_ZPZ_D(bd, zsu, MO_8)
-DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
-DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
-DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
-DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
-DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
-DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
-
-DO_ST1_ZPZ_D(bd, zss, MO_8)
-DO_ST1_ZPZ_D(hd_le, zss, MO_16)
-DO_ST1_ZPZ_D(hd_be, zss, MO_16)
-DO_ST1_ZPZ_D(sd_le, zss, MO_32)
-DO_ST1_ZPZ_D(sd_be, zss, MO_32)
-DO_ST1_ZPZ_D(dd_le, zss, MO_64)
-DO_ST1_ZPZ_D(dd_be, zss, MO_64)
-
-DO_ST1_ZPZ_D(bd, zd, MO_8)
-DO_ST1_ZPZ_D(hd_le, zd, MO_16)
-DO_ST1_ZPZ_D(hd_be, zd, MO_16)
-DO_ST1_ZPZ_D(sd_le, zd, MO_32)
-DO_ST1_ZPZ_D(sd_be, zd, MO_32)
-DO_ST1_ZPZ_D(dd_le, zd, MO_64)
-DO_ST1_ZPZ_D(dd_be, zd, MO_64)
-
-#undef DO_ST1_ZPZ_S
-#undef DO_ST1_ZPZ_D
-
-void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
-
- for (i = 0; i < opr_sz; ++i) {
- d[i] = n[i] ^ m[i] ^ k[i];
- }
-}
-
-void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
-
- for (i = 0; i < opr_sz; ++i) {
- d[i] = n[i] ^ (m[i] & ~k[i]);
- }
-}
-
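-/*
- * For the BSL family below, k is the select mask: bits set in k choose
- * from the (possibly inverted) n operand, clear bits choose from the
- * (possibly inverted) m operand; NBSL additionally inverts the result.
- */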
-void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
-
- for (i = 0; i < opr_sz; ++i) {
- d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
- }
-}
-
-void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
-
- for (i = 0; i < opr_sz; ++i) {
- d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
- }
-}
-
-void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
-
- for (i = 0; i < opr_sz; ++i) {
- d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
- }
-}
-
-/*
- * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
- * See hasless(v,1) from
- * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
- */
-static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
-{
- int bits = 8 << esz;
- uint64_t ones = dup_const(esz, 1);
- uint64_t signs = ones << (bits - 1);
- uint64_t cmp0, cmp1;
-
- cmp1 = dup_const(esz, n);
- cmp0 = cmp1 ^ m0;
- cmp1 = cmp1 ^ m1;
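- /* A lane of cmpN is zero exactly when n matches that lane of mN. */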
- cmp0 = (cmp0 - ones) & ~cmp0;
- cmp1 = (cmp1 - ones) & ~cmp1;
- return (cmp0 | cmp1) & signs;
-}
-
-static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
- uint32_t desc, int esz, bool nmatch)
-{
- uint16_t esz_mask = pred_esz_masks[esz];
- intptr_t opr_sz = simd_oprsz(desc);
- uint32_t flags = PREDTEST_INIT;
- intptr_t i, j, k;
-
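- /*
- * Process one 16-byte segment at a time: each element of vn is matched
- * against every element of the corresponding 16-byte segment of vm.
- */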
- for (i = 0; i < opr_sz; i += 16) {
- uint64_t m0 = *(uint64_t *)(vm + i);
- uint64_t m1 = *(uint64_t *)(vm + i + 8);
- uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
- uint16_t out = 0;
-
- for (j = 0; j < 16; j += 8) {
- uint64_t n = *(uint64_t *)(vn + i + j);
-
- for (k = 0; k < 8; k += 1 << esz) {
- if (pg & (1 << (j + k))) {
- bool o = do_match2(n >> (k * 8), m0, m1, esz);
- out |= (o ^ nmatch) << (j + k);
- }
- }
- }
- *(uint16_t *)(vd + H1_2(i >> 3)) = out;
- flags = iter_predtest_fwd(out, pg, flags);
- }
- return flags;
-}
-
-#define DO_PPZZ_MATCH(NAME, ESZ, INV) \
-uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
-{ \
- return do_match(vd, vn, vm, vg, desc, ESZ, INV); \
-}
-
-DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
-DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)
-
-DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
-DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)
-
-#undef DO_PPZZ_MATCH
-
-void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
- uint32_t desc)
-{
- ARMVectorReg scratch;
- intptr_t i, j;
- intptr_t opr_sz = simd_oprsz(desc);
- uint32_t *d = vd, *n = vn, *m = vm;
- uint8_t *pg = vg;
-
- if (d == n) {
- n = memcpy(&scratch, n, opr_sz);
- if (d == m) {
- m = n;
- }
- } else if (d == m) {
- m = memcpy(&scratch, m, opr_sz);
- }
-
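- /*
- * For each active element i, count the active elements j <= i for
- * which m[j] == n[i]; inactive destination elements are set to zero.
- */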
- for (i = 0; i < opr_sz; i += 4) {
- uint64_t count = 0;
- uint8_t pred;
-
- pred = pg[H1(i >> 3)] >> (i & 7);
- if (pred & 1) {
- uint32_t nn = n[H4(i >> 2)];
-
- for (j = 0; j <= i; j += 4) {
- pred = pg[H1(j >> 3)] >> (j & 7);
- if ((pred & 1) && nn == m[H4(j >> 2)]) {
- ++count;
- }
- }
- }
- d[H4(i >> 2)] = count;
- }
-}
-
-void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
- uint32_t desc)
-{
- ARMVectorReg scratch;
- intptr_t i, j;
- intptr_t opr_sz = simd_oprsz(desc);
- uint64_t *d = vd, *n = vn, *m = vm;
- uint8_t *pg = vg;
-
- if (d == n) {
- n = memcpy(&scratch, n, opr_sz);
- if (d == m) {
- m = n;
- }
- } else if (d == m) {
- m = memcpy(&scratch, m, opr_sz);
- }
-
- for (i = 0; i < opr_sz / 8; ++i) {
- uint64_t count = 0;
- if (pg[H1(i)] & 1) {
- uint64_t nn = n[i];
- for (j = 0; j <= i; ++j) {
- if ((pg[H1(j)] & 1) && nn == m[j]) {
- ++count;
- }
- }
- }
- d[i] = count;
- }
-}
-
-/*
- * Returns the number of bytes in m0 and m1 that match n.
- * Unlike do_match2 we don't just need true/false, we need an exact count.
- * This requires two extra logical operations.
- */
-static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
-{
- const uint64_t mask = dup_const(MO_8, 0x7f);
- uint64_t cmp0, cmp1;
-
- cmp1 = dup_const(MO_8, n);
- cmp0 = cmp1 ^ m0;
- cmp1 = cmp1 ^ m1;
-
- /*
- * 1: clear msb of each byte to avoid carry to next byte (& mask)
- * 2: carry in to msb if byte != 0 (+ mask)
- * 3: set msb if cmp has msb set (| cmp)
- * 4: set ~msb to ignore them (| mask)
- * We now have 0xff for byte != 0 or 0x7f for byte == 0.
- * 5: invert, resulting in 0x80 if and only if byte == 0.
- */
- cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
- cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);
-
- /*
- * Combine the two compares in a way that the bits do
- * not overlap, and so preserves the count of set bits.
- * If the host has an efficient instruction for ctpop,
- * then ctpop(x) + ctpop(y) has the same number of
- * operations as ctpop(x | (y >> 1)). If the host does
- * not have an efficient ctpop, then we only want to
- * use it once.
- */
- return ctpop64(cmp0 | (cmp1 >> 1));
-}
-
-void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, j;
- intptr_t opr_sz = simd_oprsz(desc);
-
- for (i = 0; i < opr_sz; i += 16) {
- uint64_t n0 = *(uint64_t *)(vn + i);
- uint64_t m0 = *(uint64_t *)(vm + i);
- uint64_t n1 = *(uint64_t *)(vn + i + 8);
- uint64_t m1 = *(uint64_t *)(vm + i + 8);
- uint64_t out0 = 0;
- uint64_t out1 = 0;
-
- for (j = 0; j < 64; j += 8) {
- uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
- uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
- out0 |= cnt0 << j;
- out1 |= cnt1 << j;
- }
-
- *(uint64_t *)(vd + i) = out0;
- *(uint64_t *)(vd + i + 8) = out1;
- }
-}
-
-void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- int shr = simd_data(desc);
- int shl = 8 - shr;
- uint64_t mask = dup_const(MO_8, 0xff >> shr);
- uint64_t *d = vd, *n = vn, *m = vm;
-
- for (i = 0; i < opr_sz; ++i) {
- uint64_t t = n[i] ^ m[i];
- d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
- }
-}
-
-void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- int shr = simd_data(desc);
- int shl = 16 - shr;
- uint64_t mask = dup_const(MO_16, 0xffff >> shr);
- uint64_t *d = vd, *n = vn, *m = vm;
-
- for (i = 0; i < opr_sz; ++i) {
- uint64_t t = n[i] ^ m[i];
- d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
- }
-}
-
-void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 4;
- int shr = simd_data(desc);
- uint32_t *d = vd, *n = vn, *m = vm;
-
- for (i = 0; i < opr_sz; ++i) {
- d[i] = ror32(n[i] ^ m[i], shr);
- }
-}
-
-void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
- void *status, uint32_t desc)
-{
- intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);
-
- for (s = 0; s < opr_sz; ++s) {
- float32 *n = vn + s * sizeof(float32) * 4;
- float32 *m = vm + s * sizeof(float32) * 4;
- float32 *a = va + s * sizeof(float32) * 4;
- float32 *d = vd + s * sizeof(float32) * 4;
- float32 n00 = n[H4(0)], n01 = n[H4(1)];
- float32 n10 = n[H4(2)], n11 = n[H4(3)];
- float32 m00 = m[H4(0)], m01 = m[H4(1)];
- float32 m10 = m[H4(2)], m11 = m[H4(3)];
- float32 p0, p1;
-
- /* i = 0, j = 0 */
- p0 = float32_mul(n00, m00, status);
- p1 = float32_mul(n01, m01, status);
- d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);
-
- /* i = 0, j = 1 */
- p0 = float32_mul(n00, m10, status);
- p1 = float32_mul(n01, m11, status);
- d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);
-
- /* i = 1, j = 0 */
- p0 = float32_mul(n10, m00, status);
- p1 = float32_mul(n11, m01, status);
- d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);
-
- /* i = 1, j = 1 */
- p0 = float32_mul(n10, m10, status);
- p1 = float32_mul(n11, m11, status);
- d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
- }
-}
-
-void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
- void *status, uint32_t desc)
-{
- intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);
-
- for (s = 0; s < opr_sz; ++s) {
- float64 *n = vn + s * sizeof(float64) * 4;
- float64 *m = vm + s * sizeof(float64) * 4;
- float64 *a = va + s * sizeof(float64) * 4;
- float64 *d = vd + s * sizeof(float64) * 4;
- float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
- float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
- float64 p0, p1;
-
- /* i = 0, j = 0 */
- p0 = float64_mul(n00, m00, status);
- p1 = float64_mul(n01, m01, status);
- d[0] = float64_add(a[0], float64_add(p0, p1, status), status);
-
- /* i = 0, j = 1 */
- p0 = float64_mul(n00, m10, status);
- p1 = float64_mul(n01, m11, status);
- d[1] = float64_add(a[1], float64_add(p0, p1, status), status);
-
- /* i = 1, j = 0 */
- p0 = float64_mul(n10, m00, status);
- p1 = float64_mul(n11, m01, status);
- d[2] = float64_add(a[2], float64_add(p0, p1, status), status);
-
- /* i = 1, j = 1 */
- p0 = float64_mul(n10, m10, status);
- p1 = float64_mul(n11, m11, status);
- d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
- }
-}
-
-#define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP) \
-void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
-{ \
- intptr_t i = simd_oprsz(desc); \
- uint64_t *g = vg; \
- do { \
- uint64_t pg = g[(i - 1) >> 6]; \
- do { \
- i -= sizeof(TYPEW); \
- if (likely((pg >> (i & 63)) & 1)) { \
- TYPEW nn = *(TYPEW *)(vn + HW(i)); \
- *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status); \
- } \
- } while (i & 63); \
- } while (i != 0); \
-}
-
-DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
-DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
-DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)
-
-#define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP) \
-void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
-{ \
- intptr_t i = simd_oprsz(desc); \
- uint64_t *g = vg; \
- do { \
- uint64_t pg = g[(i - 1) >> 6]; \
- do { \
- i -= sizeof(TYPEW); \
- if (likely((pg >> (i & 63)) & 1)) { \
- TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN))); \
- *(TYPEW *)(vd + HW(i)) = OP(nn, status); \
- } \
- } while (i & 63); \
- } while (i != 0); \
-}
-
-DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
-DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)
-
-#undef DO_FCVTLT
-#undef DO_FCVTNT
--- /dev/null
+/*
+ * QEMU ARM stubs for some TCG helper functions
+ *
+ * Copyright 2021 SUSE LLC
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "cpu.h"
+#include "internals.h"
+
+void write_v7m_exception(CPUARMState *env, uint32_t new_exc)
+{
+ g_assert_not_reached();
+}
+
+void raise_exception_ra(CPUARMState *env, uint32_t excp, uint32_t syndrome,
+ uint32_t target_el, uintptr_t ra)
+{
+ g_assert_not_reached();
+}
--- /dev/null
+/*
+ * crypto_helper.c - emulate v8 Crypto Extensions instructions
+ *
+ * Copyright (C) 2013 - 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+
+#include "cpu.h"
+#include "exec/helper-proto.h"
+#include "tcg/tcg-gvec-desc.h"
+#include "crypto/aes.h"
+#include "crypto/sm4.h"
+#include "vec_internal.h"
+
+union CRYPTO_STATE {
+ uint8_t bytes[16];
+ uint32_t words[4];
+ uint64_t l[2];
+};
+
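+/*
+ * Access the state union in architectural (AArch64 vector) byte and word
+ * order.  Each 64-bit lane is stored in host order, so on a big-endian
+ * host architectural byte i maps to bytes[(15 - i) ^ 8]: byte 0, the least
+ * significant byte of lane 0, lives at bytes[7], and byte 8, the least
+ * significant byte of lane 1, at bytes[15].  Words use the same trick
+ * with (3 - i) ^ 2.
+ */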
+#if HOST_BIG_ENDIAN
+#define CR_ST_BYTE(state, i) ((state).bytes[(15 - (i)) ^ 8])
+#define CR_ST_WORD(state, i) ((state).words[(3 - (i)) ^ 2])
+#else
+#define CR_ST_BYTE(state, i) ((state).bytes[i])
+#define CR_ST_WORD(state, i) ((state).words[i])
+#endif
+
+/*
+ * The caller has not been converted to full gvec, and so only
+ * modifies the low 16 bytes of the vector register.
+ */
+static void clear_tail_16(void *vd, uint32_t desc)
+{
+ int opr_sz = simd_oprsz(desc);
+ int max_sz = simd_maxsz(desc);
+
+ assert(opr_sz == 16);
+ clear_tail(vd, opr_sz, max_sz);
+}
+
+static void do_crypto_aese(uint64_t *rd, uint64_t *rn,
+ uint64_t *rm, bool decrypt)
+{
+ static uint8_t const * const sbox[2] = { AES_sbox, AES_isbox };
+ static uint8_t const * const shift[2] = { AES_shifts, AES_ishifts };
+ union CRYPTO_STATE rk = { .l = { rm[0], rm[1] } };
+ union CRYPTO_STATE st = { .l = { rn[0], rn[1] } };
+ int i;
+
+ /* xor state vector with round key */
+ rk.l[0] ^= st.l[0];
+ rk.l[1] ^= st.l[1];
+
+ /* combine ShiftRows operation and sbox substitution */
+ for (i = 0; i < 16; i++) {
+ CR_ST_BYTE(st, i) = sbox[decrypt][CR_ST_BYTE(rk, shift[decrypt][i])];
+ }
+
+ rd[0] = st.l[0];
+ rd[1] = st.l[1];
+}
+
+void HELPER(crypto_aese)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ bool decrypt = simd_data(desc);
+
+ for (i = 0; i < opr_sz; i += 16) {
+ do_crypto_aese(vd + i, vn + i, vm + i, decrypt);
+ }
+ clear_tail(vd, opr_sz, simd_maxsz(desc));
+}
+
+static void do_crypto_aesmc(uint64_t *rd, uint64_t *rm, bool decrypt)
+{
+ static uint32_t const mc[][256] = { {
+ /* MixColumns lookup table */
+ 0x00000000, 0x03010102, 0x06020204, 0x05030306,
+ 0x0c040408, 0x0f05050a, 0x0a06060c, 0x0907070e,
+ 0x18080810, 0x1b090912, 0x1e0a0a14, 0x1d0b0b16,
+ 0x140c0c18, 0x170d0d1a, 0x120e0e1c, 0x110f0f1e,
+ 0x30101020, 0x33111122, 0x36121224, 0x35131326,
+ 0x3c141428, 0x3f15152a, 0x3a16162c, 0x3917172e,
+ 0x28181830, 0x2b191932, 0x2e1a1a34, 0x2d1b1b36,
+ 0x241c1c38, 0x271d1d3a, 0x221e1e3c, 0x211f1f3e,
+ 0x60202040, 0x63212142, 0x66222244, 0x65232346,
+ 0x6c242448, 0x6f25254a, 0x6a26264c, 0x6927274e,
+ 0x78282850, 0x7b292952, 0x7e2a2a54, 0x7d2b2b56,
+ 0x742c2c58, 0x772d2d5a, 0x722e2e5c, 0x712f2f5e,
+ 0x50303060, 0x53313162, 0x56323264, 0x55333366,
+ 0x5c343468, 0x5f35356a, 0x5a36366c, 0x5937376e,
+ 0x48383870, 0x4b393972, 0x4e3a3a74, 0x4d3b3b76,
+ 0x443c3c78, 0x473d3d7a, 0x423e3e7c, 0x413f3f7e,
+ 0xc0404080, 0xc3414182, 0xc6424284, 0xc5434386,
+ 0xcc444488, 0xcf45458a, 0xca46468c, 0xc947478e,
+ 0xd8484890, 0xdb494992, 0xde4a4a94, 0xdd4b4b96,
+ 0xd44c4c98, 0xd74d4d9a, 0xd24e4e9c, 0xd14f4f9e,
+ 0xf05050a0, 0xf35151a2, 0xf65252a4, 0xf55353a6,
+ 0xfc5454a8, 0xff5555aa, 0xfa5656ac, 0xf95757ae,
+ 0xe85858b0, 0xeb5959b2, 0xee5a5ab4, 0xed5b5bb6,
+ 0xe45c5cb8, 0xe75d5dba, 0xe25e5ebc, 0xe15f5fbe,
+ 0xa06060c0, 0xa36161c2, 0xa66262c4, 0xa56363c6,
+ 0xac6464c8, 0xaf6565ca, 0xaa6666cc, 0xa96767ce,
+ 0xb86868d0, 0xbb6969d2, 0xbe6a6ad4, 0xbd6b6bd6,
+ 0xb46c6cd8, 0xb76d6dda, 0xb26e6edc, 0xb16f6fde,
+ 0x907070e0, 0x937171e2, 0x967272e4, 0x957373e6,
+ 0x9c7474e8, 0x9f7575ea, 0x9a7676ec, 0x997777ee,
+ 0x887878f0, 0x8b7979f2, 0x8e7a7af4, 0x8d7b7bf6,
+ 0x847c7cf8, 0x877d7dfa, 0x827e7efc, 0x817f7ffe,
+ 0x9b80801b, 0x98818119, 0x9d82821f, 0x9e83831d,
+ 0x97848413, 0x94858511, 0x91868617, 0x92878715,
+ 0x8388880b, 0x80898909, 0x858a8a0f, 0x868b8b0d,
+ 0x8f8c8c03, 0x8c8d8d01, 0x898e8e07, 0x8a8f8f05,
+ 0xab90903b, 0xa8919139, 0xad92923f, 0xae93933d,
+ 0xa7949433, 0xa4959531, 0xa1969637, 0xa2979735,
+ 0xb398982b, 0xb0999929, 0xb59a9a2f, 0xb69b9b2d,
+ 0xbf9c9c23, 0xbc9d9d21, 0xb99e9e27, 0xba9f9f25,
+ 0xfba0a05b, 0xf8a1a159, 0xfda2a25f, 0xfea3a35d,
+ 0xf7a4a453, 0xf4a5a551, 0xf1a6a657, 0xf2a7a755,
+ 0xe3a8a84b, 0xe0a9a949, 0xe5aaaa4f, 0xe6abab4d,
+ 0xefacac43, 0xecadad41, 0xe9aeae47, 0xeaafaf45,
+ 0xcbb0b07b, 0xc8b1b179, 0xcdb2b27f, 0xceb3b37d,
+ 0xc7b4b473, 0xc4b5b571, 0xc1b6b677, 0xc2b7b775,
+ 0xd3b8b86b, 0xd0b9b969, 0xd5baba6f, 0xd6bbbb6d,
+ 0xdfbcbc63, 0xdcbdbd61, 0xd9bebe67, 0xdabfbf65,
+ 0x5bc0c09b, 0x58c1c199, 0x5dc2c29f, 0x5ec3c39d,
+ 0x57c4c493, 0x54c5c591, 0x51c6c697, 0x52c7c795,
+ 0x43c8c88b, 0x40c9c989, 0x45caca8f, 0x46cbcb8d,
+ 0x4fcccc83, 0x4ccdcd81, 0x49cece87, 0x4acfcf85,
+ 0x6bd0d0bb, 0x68d1d1b9, 0x6dd2d2bf, 0x6ed3d3bd,
+ 0x67d4d4b3, 0x64d5d5b1, 0x61d6d6b7, 0x62d7d7b5,
+ 0x73d8d8ab, 0x70d9d9a9, 0x75dadaaf, 0x76dbdbad,
+ 0x7fdcdca3, 0x7cdddda1, 0x79dedea7, 0x7adfdfa5,
+ 0x3be0e0db, 0x38e1e1d9, 0x3de2e2df, 0x3ee3e3dd,
+ 0x37e4e4d3, 0x34e5e5d1, 0x31e6e6d7, 0x32e7e7d5,
+ 0x23e8e8cb, 0x20e9e9c9, 0x25eaeacf, 0x26ebebcd,
+ 0x2fececc3, 0x2cededc1, 0x29eeeec7, 0x2aefefc5,
+ 0x0bf0f0fb, 0x08f1f1f9, 0x0df2f2ff, 0x0ef3f3fd,
+ 0x07f4f4f3, 0x04f5f5f1, 0x01f6f6f7, 0x02f7f7f5,
+ 0x13f8f8eb, 0x10f9f9e9, 0x15fafaef, 0x16fbfbed,
+ 0x1ffcfce3, 0x1cfdfde1, 0x19fefee7, 0x1affffe5,
+ }, {
+ /* Inverse MixColumns lookup table */
+ 0x00000000, 0x0b0d090e, 0x161a121c, 0x1d171b12,
+ 0x2c342438, 0x27392d36, 0x3a2e3624, 0x31233f2a,
+ 0x58684870, 0x5365417e, 0x4e725a6c, 0x457f5362,
+ 0x745c6c48, 0x7f516546, 0x62467e54, 0x694b775a,
+ 0xb0d090e0, 0xbbdd99ee, 0xa6ca82fc, 0xadc78bf2,
+ 0x9ce4b4d8, 0x97e9bdd6, 0x8afea6c4, 0x81f3afca,
+ 0xe8b8d890, 0xe3b5d19e, 0xfea2ca8c, 0xf5afc382,
+ 0xc48cfca8, 0xcf81f5a6, 0xd296eeb4, 0xd99be7ba,
+ 0x7bbb3bdb, 0x70b632d5, 0x6da129c7, 0x66ac20c9,
+ 0x578f1fe3, 0x5c8216ed, 0x41950dff, 0x4a9804f1,
+ 0x23d373ab, 0x28de7aa5, 0x35c961b7, 0x3ec468b9,
+ 0x0fe75793, 0x04ea5e9d, 0x19fd458f, 0x12f04c81,
+ 0xcb6bab3b, 0xc066a235, 0xdd71b927, 0xd67cb029,
+ 0xe75f8f03, 0xec52860d, 0xf1459d1f, 0xfa489411,
+ 0x9303e34b, 0x980eea45, 0x8519f157, 0x8e14f859,
+ 0xbf37c773, 0xb43ace7d, 0xa92dd56f, 0xa220dc61,
+ 0xf66d76ad, 0xfd607fa3, 0xe07764b1, 0xeb7a6dbf,
+ 0xda595295, 0xd1545b9b, 0xcc434089, 0xc74e4987,
+ 0xae053edd, 0xa50837d3, 0xb81f2cc1, 0xb31225cf,
+ 0x82311ae5, 0x893c13eb, 0x942b08f9, 0x9f2601f7,
+ 0x46bde64d, 0x4db0ef43, 0x50a7f451, 0x5baafd5f,
+ 0x6a89c275, 0x6184cb7b, 0x7c93d069, 0x779ed967,
+ 0x1ed5ae3d, 0x15d8a733, 0x08cfbc21, 0x03c2b52f,
+ 0x32e18a05, 0x39ec830b, 0x24fb9819, 0x2ff69117,
+ 0x8dd64d76, 0x86db4478, 0x9bcc5f6a, 0x90c15664,
+ 0xa1e2694e, 0xaaef6040, 0xb7f87b52, 0xbcf5725c,
+ 0xd5be0506, 0xdeb30c08, 0xc3a4171a, 0xc8a91e14,
+ 0xf98a213e, 0xf2872830, 0xef903322, 0xe49d3a2c,
+ 0x3d06dd96, 0x360bd498, 0x2b1ccf8a, 0x2011c684,
+ 0x1132f9ae, 0x1a3ff0a0, 0x0728ebb2, 0x0c25e2bc,
+ 0x656e95e6, 0x6e639ce8, 0x737487fa, 0x78798ef4,
+ 0x495ab1de, 0x4257b8d0, 0x5f40a3c2, 0x544daacc,
+ 0xf7daec41, 0xfcd7e54f, 0xe1c0fe5d, 0xeacdf753,
+ 0xdbeec879, 0xd0e3c177, 0xcdf4da65, 0xc6f9d36b,
+ 0xafb2a431, 0xa4bfad3f, 0xb9a8b62d, 0xb2a5bf23,
+ 0x83868009, 0x888b8907, 0x959c9215, 0x9e919b1b,
+ 0x470a7ca1, 0x4c0775af, 0x51106ebd, 0x5a1d67b3,
+ 0x6b3e5899, 0x60335197, 0x7d244a85, 0x7629438b,
+ 0x1f6234d1, 0x146f3ddf, 0x097826cd, 0x02752fc3,
+ 0x335610e9, 0x385b19e7, 0x254c02f5, 0x2e410bfb,
+ 0x8c61d79a, 0x876cde94, 0x9a7bc586, 0x9176cc88,
+ 0xa055f3a2, 0xab58faac, 0xb64fe1be, 0xbd42e8b0,
+ 0xd4099fea, 0xdf0496e4, 0xc2138df6, 0xc91e84f8,
+ 0xf83dbbd2, 0xf330b2dc, 0xee27a9ce, 0xe52aa0c0,
+ 0x3cb1477a, 0x37bc4e74, 0x2aab5566, 0x21a65c68,
+ 0x10856342, 0x1b886a4c, 0x069f715e, 0x0d927850,
+ 0x64d90f0a, 0x6fd40604, 0x72c31d16, 0x79ce1418,
+ 0x48ed2b32, 0x43e0223c, 0x5ef7392e, 0x55fa3020,
+ 0x01b79aec, 0x0aba93e2, 0x17ad88f0, 0x1ca081fe,
+ 0x2d83bed4, 0x268eb7da, 0x3b99acc8, 0x3094a5c6,
+ 0x59dfd29c, 0x52d2db92, 0x4fc5c080, 0x44c8c98e,
+ 0x75ebf6a4, 0x7ee6ffaa, 0x63f1e4b8, 0x68fcedb6,
+ 0xb1670a0c, 0xba6a0302, 0xa77d1810, 0xac70111e,
+ 0x9d532e34, 0x965e273a, 0x8b493c28, 0x80443526,
+ 0xe90f427c, 0xe2024b72, 0xff155060, 0xf418596e,
+ 0xc53b6644, 0xce366f4a, 0xd3217458, 0xd82c7d56,
+ 0x7a0ca137, 0x7101a839, 0x6c16b32b, 0x671bba25,
+ 0x5638850f, 0x5d358c01, 0x40229713, 0x4b2f9e1d,
+ 0x2264e947, 0x2969e049, 0x347efb5b, 0x3f73f255,
+ 0x0e50cd7f, 0x055dc471, 0x184adf63, 0x1347d66d,
+ 0xcadc31d7, 0xc1d138d9, 0xdcc623cb, 0xd7cb2ac5,
+ 0xe6e815ef, 0xede51ce1, 0xf0f207f3, 0xfbff0efd,
+ 0x92b479a7, 0x99b970a9, 0x84ae6bbb, 0x8fa362b5,
+ 0xbe805d9f, 0xb58d5491, 0xa89a4f83, 0xa397468d,
+ } };
+
+ union CRYPTO_STATE st = { .l = { rm[0], rm[1] } };
+ int i;
+
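+ /*
+ * Each table entry is the (Inv)MixColumns product of a single byte: the
+ * four packed bytes are that byte multiplied by the first matrix column
+ * (02 01 01 03 for encryption, 0e 09 0d 0b for decryption).  Rotating the
+ * lookups for the remaining three bytes of a column by 8, 16 and 24 bits
+ * lines up their contributions, so the XOR of the four terms is the
+ * transformed column.
+ */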
+ for (i = 0; i < 16; i += 4) {
+ CR_ST_WORD(st, i >> 2) =
+ mc[decrypt][CR_ST_BYTE(st, i)] ^
+ rol32(mc[decrypt][CR_ST_BYTE(st, i + 1)], 8) ^
+ rol32(mc[decrypt][CR_ST_BYTE(st, i + 2)], 16) ^
+ rol32(mc[decrypt][CR_ST_BYTE(st, i + 3)], 24);
+ }
+
+ rd[0] = st.l[0];
+ rd[1] = st.l[1];
+}
+
+void HELPER(crypto_aesmc)(void *vd, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ bool decrypt = simd_data(desc);
+
+ for (i = 0; i < opr_sz; i += 16) {
+ do_crypto_aesmc(vd + i, vm + i, decrypt);
+ }
+ clear_tail(vd, opr_sz, simd_maxsz(desc));
+}
+
+/*
+ * SHA-1 logical functions
+ */
+
+static uint32_t cho(uint32_t x, uint32_t y, uint32_t z)
+{
+ return (x & (y ^ z)) ^ z;
+}
+
+static uint32_t par(uint32_t x, uint32_t y, uint32_t z)
+{
+ return x ^ y ^ z;
+}
+
+static uint32_t maj(uint32_t x, uint32_t y, uint32_t z)
+{
+ return (x & y) | ((x | y) & z);
+}
+
+void HELPER(crypto_sha1su0)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ uint64_t *d = vd, *n = vn, *m = vm;
+ uint64_t d0, d1;
+
+ d0 = d[1] ^ d[0] ^ m[0];
+ d1 = n[0] ^ d[1] ^ m[1];
+ d[0] = d0;
+ d[1] = d1;
+
+ clear_tail_16(vd, desc);
+}
+
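+/*
+ * Common helper for SHA1C, SHA1P and SHA1M: run four SHA-1 rounds on the
+ * 128-bit state in rd.  Word 0 of rn provides the initial fifth working
+ * value, rm supplies one schedule word per round, and fn selects the
+ * per-round logical function (choose, parity or majority).
+ */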
+static inline void crypto_sha1_3reg(uint64_t *rd, uint64_t *rn,
+ uint64_t *rm, uint32_t desc,
+ uint32_t (*fn)(union CRYPTO_STATE *d))
+{
+ union CRYPTO_STATE d = { .l = { rd[0], rd[1] } };
+ union CRYPTO_STATE n = { .l = { rn[0], rn[1] } };
+ union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
+ int i;
+
+ for (i = 0; i < 4; i++) {
+ uint32_t t = fn(&d);
+
+ t += rol32(CR_ST_WORD(d, 0), 5) + CR_ST_WORD(n, 0)
+ + CR_ST_WORD(m, i);
+
+ CR_ST_WORD(n, 0) = CR_ST_WORD(d, 3);
+ CR_ST_WORD(d, 3) = CR_ST_WORD(d, 2);
+ CR_ST_WORD(d, 2) = ror32(CR_ST_WORD(d, 1), 2);
+ CR_ST_WORD(d, 1) = CR_ST_WORD(d, 0);
+ CR_ST_WORD(d, 0) = t;
+ }
+ rd[0] = d.l[0];
+ rd[1] = d.l[1];
+
+ clear_tail_16(rd, desc);
+}
+
+static uint32_t do_sha1c(union CRYPTO_STATE *d)
+{
+ return cho(CR_ST_WORD(*d, 1), CR_ST_WORD(*d, 2), CR_ST_WORD(*d, 3));
+}
+
+void HELPER(crypto_sha1c)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ crypto_sha1_3reg(vd, vn, vm, desc, do_sha1c);
+}
+
+static uint32_t do_sha1p(union CRYPTO_STATE *d)
+{
+ return par(CR_ST_WORD(*d, 1), CR_ST_WORD(*d, 2), CR_ST_WORD(*d, 3));
+}
+
+void HELPER(crypto_sha1p)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ crypto_sha1_3reg(vd, vn, vm, desc, do_sha1p);
+}
+
+static uint32_t do_sha1m(union CRYPTO_STATE *d)
+{
+ return maj(CR_ST_WORD(*d, 1), CR_ST_WORD(*d, 2), CR_ST_WORD(*d, 3));
+}
+
+void HELPER(crypto_sha1m)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ crypto_sha1_3reg(vd, vn, vm, desc, do_sha1m);
+}
+
+void HELPER(crypto_sha1h)(void *vd, void *vm, uint32_t desc)
+{
+ uint64_t *rd = vd;
+ uint64_t *rm = vm;
+ union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
+
+ CR_ST_WORD(m, 0) = ror32(CR_ST_WORD(m, 0), 2);
+ CR_ST_WORD(m, 1) = CR_ST_WORD(m, 2) = CR_ST_WORD(m, 3) = 0;
+
+ rd[0] = m.l[0];
+ rd[1] = m.l[1];
+
+ clear_tail_16(vd, desc);
+}
+
+void HELPER(crypto_sha1su1)(void *vd, void *vm, uint32_t desc)
+{
+ uint64_t *rd = vd;
+ uint64_t *rm = vm;
+ union CRYPTO_STATE d = { .l = { rd[0], rd[1] } };
+ union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
+
+ CR_ST_WORD(d, 0) = rol32(CR_ST_WORD(d, 0) ^ CR_ST_WORD(m, 1), 1);
+ CR_ST_WORD(d, 1) = rol32(CR_ST_WORD(d, 1) ^ CR_ST_WORD(m, 2), 1);
+ CR_ST_WORD(d, 2) = rol32(CR_ST_WORD(d, 2) ^ CR_ST_WORD(m, 3), 1);
+ CR_ST_WORD(d, 3) = rol32(CR_ST_WORD(d, 3) ^ CR_ST_WORD(d, 0), 1);
+
+ rd[0] = d.l[0];
+ rd[1] = d.l[1];
+
+ clear_tail_16(vd, desc);
+}
+
+/*
+ * The SHA-256 logical functions, according to
+ * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf
+ */
+
+static uint32_t S0(uint32_t x)
+{
+ return ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22);
+}
+
+static uint32_t S1(uint32_t x)
+{
+ return ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25);
+}
+
+static uint32_t s0(uint32_t x)
+{
+ return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3);
+}
+
+static uint32_t s1(uint32_t x)
+{
+ return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10);
+}
+
+void HELPER(crypto_sha256h)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ uint64_t *rd = vd;
+ uint64_t *rn = vn;
+ uint64_t *rm = vm;
+ union CRYPTO_STATE d = { .l = { rd[0], rd[1] } };
+ union CRYPTO_STATE n = { .l = { rn[0], rn[1] } };
+ union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
+ int i;
+
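+ /*
+ * Four rounds of the SHA-256 compression function: d holds the {a,b,c,d}
+ * working variables (word 0 = a) and n holds {e,f,g,h}; each word of m
+ * is the per-round additive input (message schedule word plus round
+ * constant).  Only the updated {a,b,c,d} half is written back.
+ */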
+ for (i = 0; i < 4; i++) {
+ uint32_t t = cho(CR_ST_WORD(n, 0), CR_ST_WORD(n, 1), CR_ST_WORD(n, 2))
+ + CR_ST_WORD(n, 3) + S1(CR_ST_WORD(n, 0))
+ + CR_ST_WORD(m, i);
+
+ CR_ST_WORD(n, 3) = CR_ST_WORD(n, 2);
+ CR_ST_WORD(n, 2) = CR_ST_WORD(n, 1);
+ CR_ST_WORD(n, 1) = CR_ST_WORD(n, 0);
+ CR_ST_WORD(n, 0) = CR_ST_WORD(d, 3) + t;
+
+ t += maj(CR_ST_WORD(d, 0), CR_ST_WORD(d, 1), CR_ST_WORD(d, 2))
+ + S0(CR_ST_WORD(d, 0));
+
+ CR_ST_WORD(d, 3) = CR_ST_WORD(d, 2);
+ CR_ST_WORD(d, 2) = CR_ST_WORD(d, 1);
+ CR_ST_WORD(d, 1) = CR_ST_WORD(d, 0);
+ CR_ST_WORD(d, 0) = t;
+ }
+
+ rd[0] = d.l[0];
+ rd[1] = d.l[1];
+
+ clear_tail_16(vd, desc);
+}
+
+void HELPER(crypto_sha256h2)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ uint64_t *rd = vd;
+ uint64_t *rn = vn;
+ uint64_t *rm = vm;
+ union CRYPTO_STATE d = { .l = { rd[0], rd[1] } };
+ union CRYPTO_STATE n = { .l = { rn[0], rn[1] } };
+ union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
+ int i;
+
+ for (i = 0; i < 4; i++) {
+ uint32_t t = cho(CR_ST_WORD(d, 0), CR_ST_WORD(d, 1), CR_ST_WORD(d, 2))
+ + CR_ST_WORD(d, 3) + S1(CR_ST_WORD(d, 0))
+ + CR_ST_WORD(m, i);
+
+ CR_ST_WORD(d, 3) = CR_ST_WORD(d, 2);
+ CR_ST_WORD(d, 2) = CR_ST_WORD(d, 1);
+ CR_ST_WORD(d, 1) = CR_ST_WORD(d, 0);
+ CR_ST_WORD(d, 0) = CR_ST_WORD(n, 3 - i) + t;
+ }
+
+ rd[0] = d.l[0];
+ rd[1] = d.l[1];
+
+ clear_tail_16(vd, desc);
+}
+
+void HELPER(crypto_sha256su0)(void *vd, void *vm, uint32_t desc)
+{
+ uint64_t *rd = vd;
+ uint64_t *rm = vm;
+ union CRYPTO_STATE d = { .l = { rd[0], rd[1] } };
+ union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
+
+ CR_ST_WORD(d, 0) += s0(CR_ST_WORD(d, 1));
+ CR_ST_WORD(d, 1) += s0(CR_ST_WORD(d, 2));
+ CR_ST_WORD(d, 2) += s0(CR_ST_WORD(d, 3));
+ CR_ST_WORD(d, 3) += s0(CR_ST_WORD(m, 0));
+
+ rd[0] = d.l[0];
+ rd[1] = d.l[1];
+
+ clear_tail_16(vd, desc);
+}
+
+void HELPER(crypto_sha256su1)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ uint64_t *rd = vd;
+ uint64_t *rn = vn;
+ uint64_t *rm = vm;
+ union CRYPTO_STATE d = { .l = { rd[0], rd[1] } };
+ union CRYPTO_STATE n = { .l = { rn[0], rn[1] } };
+ union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
+
+ CR_ST_WORD(d, 0) += s1(CR_ST_WORD(m, 2)) + CR_ST_WORD(n, 1);
+ CR_ST_WORD(d, 1) += s1(CR_ST_WORD(m, 3)) + CR_ST_WORD(n, 2);
+ CR_ST_WORD(d, 2) += s1(CR_ST_WORD(d, 0)) + CR_ST_WORD(n, 3);
+ CR_ST_WORD(d, 3) += s1(CR_ST_WORD(d, 1)) + CR_ST_WORD(m, 0);
+
+ rd[0] = d.l[0];
+ rd[1] = d.l[1];
+
+ clear_tail_16(vd, desc);
+}
+
+/*
+ * The SHA-512 logical functions (same as above but using 64-bit operands)
+ */
+
+static uint64_t cho512(uint64_t x, uint64_t y, uint64_t z)
+{
+ return (x & (y ^ z)) ^ z;
+}
+
+static uint64_t maj512(uint64_t x, uint64_t y, uint64_t z)
+{
+ return (x & y) | ((x | y) & z);
+}
+
+static uint64_t S0_512(uint64_t x)
+{
+ return ror64(x, 28) ^ ror64(x, 34) ^ ror64(x, 39);
+}
+
+static uint64_t S1_512(uint64_t x)
+{
+ return ror64(x, 14) ^ ror64(x, 18) ^ ror64(x, 41);
+}
+
+static uint64_t s0_512(uint64_t x)
+{
+ return ror64(x, 1) ^ ror64(x, 8) ^ (x >> 7);
+}
+
+static uint64_t s1_512(uint64_t x)
+{
+ return ror64(x, 19) ^ ror64(x, 61) ^ (x >> 6);
+}
+
+void HELPER(crypto_sha512h)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ uint64_t *rd = vd;
+ uint64_t *rn = vn;
+ uint64_t *rm = vm;
+ uint64_t d0 = rd[0];
+ uint64_t d1 = rd[1];
+
+ d1 += S1_512(rm[1]) + cho512(rm[1], rn[0], rn[1]);
+ d0 += S1_512(d1 + rm[0]) + cho512(d1 + rm[0], rm[1], rn[0]);
+
+ rd[0] = d0;
+ rd[1] = d1;
+
+ clear_tail_16(vd, desc);
+}
+
+void HELPER(crypto_sha512h2)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ uint64_t *rd = vd;
+ uint64_t *rn = vn;
+ uint64_t *rm = vm;
+ uint64_t d0 = rd[0];
+ uint64_t d1 = rd[1];
+
+ d1 += S0_512(rm[0]) + maj512(rn[0], rm[1], rm[0]);
+ d0 += S0_512(d1) + maj512(d1, rm[0], rm[1]);
+
+ rd[0] = d0;
+ rd[1] = d1;
+
+ clear_tail_16(vd, desc);
+}
+
+void HELPER(crypto_sha512su0)(void *vd, void *vn, uint32_t desc)
+{
+ uint64_t *rd = vd;
+ uint64_t *rn = vn;
+ uint64_t d0 = rd[0];
+ uint64_t d1 = rd[1];
+
+ d0 += s0_512(rd[1]);
+ d1 += s0_512(rn[0]);
+
+ rd[0] = d0;
+ rd[1] = d1;
+
+ clear_tail_16(vd, desc);
+}
+
+void HELPER(crypto_sha512su1)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ uint64_t *rd = vd;
+ uint64_t *rn = vn;
+ uint64_t *rm = vm;
+
+ rd[0] += s1_512(rn[0]) + rm[0];
+ rd[1] += s1_512(rn[1]) + rm[1];
+
+ clear_tail_16(vd, desc);
+}
+
+void HELPER(crypto_sm3partw1)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ uint64_t *rd = vd;
+ uint64_t *rn = vn;
+ uint64_t *rm = vm;
+ union CRYPTO_STATE d = { .l = { rd[0], rd[1] } };
+ union CRYPTO_STATE n = { .l = { rn[0], rn[1] } };
+ union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
+ uint32_t t;
+
+ t = CR_ST_WORD(d, 0) ^ CR_ST_WORD(n, 0) ^ ror32(CR_ST_WORD(m, 1), 17);
+ CR_ST_WORD(d, 0) = t ^ ror32(t, 17) ^ ror32(t, 9);
+
+ t = CR_ST_WORD(d, 1) ^ CR_ST_WORD(n, 1) ^ ror32(CR_ST_WORD(m, 2), 17);
+ CR_ST_WORD(d, 1) = t ^ ror32(t, 17) ^ ror32(t, 9);
+
+ t = CR_ST_WORD(d, 2) ^ CR_ST_WORD(n, 2) ^ ror32(CR_ST_WORD(m, 3), 17);
+ CR_ST_WORD(d, 2) = t ^ ror32(t, 17) ^ ror32(t, 9);
+
+ t = CR_ST_WORD(d, 3) ^ CR_ST_WORD(n, 3) ^ ror32(CR_ST_WORD(d, 0), 17);
+ CR_ST_WORD(d, 3) = t ^ ror32(t, 17) ^ ror32(t, 9);
+
+ rd[0] = d.l[0];
+ rd[1] = d.l[1];
+
+ clear_tail_16(vd, desc);
+}
+
+void HELPER(crypto_sm3partw2)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ uint64_t *rd = vd;
+ uint64_t *rn = vn;
+ uint64_t *rm = vm;
+ union CRYPTO_STATE d = { .l = { rd[0], rd[1] } };
+ union CRYPTO_STATE n = { .l = { rn[0], rn[1] } };
+ union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
+ uint32_t t = CR_ST_WORD(n, 0) ^ ror32(CR_ST_WORD(m, 0), 25);
+
+ CR_ST_WORD(d, 0) ^= t;
+ CR_ST_WORD(d, 1) ^= CR_ST_WORD(n, 1) ^ ror32(CR_ST_WORD(m, 1), 25);
+ CR_ST_WORD(d, 2) ^= CR_ST_WORD(n, 2) ^ ror32(CR_ST_WORD(m, 2), 25);
+ CR_ST_WORD(d, 3) ^= CR_ST_WORD(n, 3) ^ ror32(CR_ST_WORD(m, 3), 25) ^
+ ror32(t, 17) ^ ror32(t, 2) ^ ror32(t, 26);
+
+ rd[0] = d.l[0];
+ rd[1] = d.l[1];
+
+ clear_tail_16(vd, desc);
+}
+
+static inline void QEMU_ALWAYS_INLINE
+crypto_sm3tt(uint64_t *rd, uint64_t *rn, uint64_t *rm,
+ uint32_t desc, uint32_t opcode)
+{
+ union CRYPTO_STATE d = { .l = { rd[0], rd[1] } };
+ union CRYPTO_STATE n = { .l = { rn[0], rn[1] } };
+ union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
+ uint32_t imm2 = simd_data(desc);
+ uint32_t t;
+
+ assert(imm2 < 4);
+
+ if (opcode == 0 || opcode == 2) {
+ /* SM3TT1A, SM3TT2A */
+ t = par(CR_ST_WORD(d, 3), CR_ST_WORD(d, 2), CR_ST_WORD(d, 1));
+ } else if (opcode == 1) {
+ /* SM3TT1B */
+ t = maj(CR_ST_WORD(d, 3), CR_ST_WORD(d, 2), CR_ST_WORD(d, 1));
+ } else if (opcode == 3) {
+ /* SM3TT2B */
+ t = cho(CR_ST_WORD(d, 3), CR_ST_WORD(d, 2), CR_ST_WORD(d, 1));
+ } else {
+ qemu_build_not_reached();
+ }
+
+ t += CR_ST_WORD(d, 0) + CR_ST_WORD(m, imm2);
+
+ CR_ST_WORD(d, 0) = CR_ST_WORD(d, 1);
+
+ if (opcode < 2) {
+ /* SM3TT1A, SM3TT1B */
+ t += CR_ST_WORD(n, 3) ^ ror32(CR_ST_WORD(d, 3), 20);
+
+ CR_ST_WORD(d, 1) = ror32(CR_ST_WORD(d, 2), 23);
+ } else {
+ /* SM3TT2A, SM3TT2B */
+ t += CR_ST_WORD(n, 3);
+ t ^= rol32(t, 9) ^ rol32(t, 17);
+
+ CR_ST_WORD(d, 1) = ror32(CR_ST_WORD(d, 2), 13);
+ }
+
+ CR_ST_WORD(d, 2) = CR_ST_WORD(d, 3);
+ CR_ST_WORD(d, 3) = t;
+
+ rd[0] = d.l[0];
+ rd[1] = d.l[1];
+
+ clear_tail_16(rd, desc);
+}
+
+#define DO_SM3TT(NAME, OPCODE) \
+ void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+ { crypto_sm3tt(vd, vn, vm, desc, OPCODE); }
+
+DO_SM3TT(crypto_sm3tt1a, 0)
+DO_SM3TT(crypto_sm3tt1b, 1)
+DO_SM3TT(crypto_sm3tt2a, 2)
+DO_SM3TT(crypto_sm3tt2b, 3)
+
+#undef DO_SM3TT
+
+static void do_crypto_sm4e(uint64_t *rd, uint64_t *rn, uint64_t *rm)
+{
+ union CRYPTO_STATE d = { .l = { rn[0], rn[1] } };
+ union CRYPTO_STATE n = { .l = { rm[0], rm[1] } };
+ uint32_t t, i;
+
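+ /*
+ * One SM4 round per iteration: XOR the other three state words with the
+ * round-key word, put each byte of the result through the SM4 S-box, then
+ * apply the linear transform L(B) = B ^ rol2(B) ^ rol10(B) ^ rol18(B) ^
+ * rol24(B) and fold it into the word being updated.
+ */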
+ for (i = 0; i < 4; i++) {
+ t = CR_ST_WORD(d, (i + 1) % 4) ^
+ CR_ST_WORD(d, (i + 2) % 4) ^
+ CR_ST_WORD(d, (i + 3) % 4) ^
+ CR_ST_WORD(n, i);
+
+ t = sm4_sbox[t & 0xff] |
+ sm4_sbox[(t >> 8) & 0xff] << 8 |
+ sm4_sbox[(t >> 16) & 0xff] << 16 |
+ sm4_sbox[(t >> 24) & 0xff] << 24;
+
+ CR_ST_WORD(d, i) ^= t ^ rol32(t, 2) ^ rol32(t, 10) ^ rol32(t, 18) ^
+ rol32(t, 24);
+ }
+
+ rd[0] = d.l[0];
+ rd[1] = d.l[1];
+}
+
+void HELPER(crypto_sm4e)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+
+ for (i = 0; i < opr_sz; i += 16) {
+ do_crypto_sm4e(vd + i, vn + i, vm + i);
+ }
+ clear_tail(vd, opr_sz, simd_maxsz(desc));
+}
+
+static void do_crypto_sm4ekey(uint64_t *rd, uint64_t *rn, uint64_t *rm)
+{
+ union CRYPTO_STATE d;
+ union CRYPTO_STATE n = { .l = { rn[0], rn[1] } };
+ union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
+ uint32_t t, i;
+
+ d = n;
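+ /*
+ * The key schedule uses the lighter linear transform
+ * L'(B) = B ^ rol13(B) ^ rol23(B).
+ */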
+ for (i = 0; i < 4; i++) {
+ t = CR_ST_WORD(d, (i + 1) % 4) ^
+ CR_ST_WORD(d, (i + 2) % 4) ^
+ CR_ST_WORD(d, (i + 3) % 4) ^
+ CR_ST_WORD(m, i);
+
+ t = sm4_sbox[t & 0xff] |
+ sm4_sbox[(t >> 8) & 0xff] << 8 |
+ sm4_sbox[(t >> 16) & 0xff] << 16 |
+ sm4_sbox[(t >> 24) & 0xff] << 24;
+
+ CR_ST_WORD(d, i) ^= t ^ rol32(t, 13) ^ rol32(t, 23);
+ }
+
+ rd[0] = d.l[0];
+ rd[1] = d.l[1];
+}
+
+void HELPER(crypto_sm4ekey)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+
+ for (i = 0; i < opr_sz; i += 16) {
+ do_crypto_sm4ekey(vd + i, vn + i, vm + i);
+ }
+ clear_tail(vd, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(crypto_rax1)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ uint64_t *d = vd, *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz / 8; ++i) {
+ d[i] = n[i] ^ rol64(m[i], 1);
+ }
+ clear_tail(vd, opr_sz, simd_maxsz(desc));
+}
--- /dev/null
+/*
+ * AArch64 specific helpers
+ *
+ * Copyright (c) 2013 Alexander Graf <agraf@suse.de>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/units.h"
+#include "cpu.h"
+#include "exec/gdbstub.h"
+#include "exec/helper-proto.h"
+#include "qemu/host-utils.h"
+#include "qemu/log.h"
+#include "qemu/main-loop.h"
+#include "qemu/bitops.h"
+#include "internals.h"
+#include "qemu/crc32c.h"
+#include "exec/exec-all.h"
+#include "exec/cpu_ldst.h"
+#include "qemu/int128.h"
+#include "qemu/atomic128.h"
+#include "fpu/softfloat.h"
+#include <zlib.h> /* For crc32 */
+
+/* C2.4.7 Multiply and divide */
+/* special cases for 0 and LLONG_MIN are mandated by the standard */
+uint64_t HELPER(udiv64)(uint64_t num, uint64_t den)
+{
+ if (den == 0) {
+ return 0;
+ }
+ return num / den;
+}
+
+int64_t HELPER(sdiv64)(int64_t num, int64_t den)
+{
+ if (den == 0) {
+ return 0;
+ }
+ if (num == LLONG_MIN && den == -1) {
+ return LLONG_MIN;
+ }
+ return num / den;
+}
+
+uint64_t HELPER(rbit64)(uint64_t x)
+{
+ return revbit64(x);
+}
+
+void HELPER(msr_i_spsel)(CPUARMState *env, uint32_t imm)
+{
+ update_spsel(env, imm);
+}
+
+static void daif_check(CPUARMState *env, uint32_t op,
+ uint32_t imm, uintptr_t ra)
+{
+ /* DAIF update to PSTATE. This is OK from EL0 only if UMA is set. */
+ if (arm_current_el(env) == 0 && !(arm_sctlr(env, 0) & SCTLR_UMA)) {
+ raise_exception_ra(env, EXCP_UDEF,
+ syn_aa64_sysregtrap(0, extract32(op, 0, 3),
+ extract32(op, 3, 3), 4,
+ imm, 0x1f, 0),
+ exception_target_el(env), ra);
+ }
+}
+
+void HELPER(msr_i_daifset)(CPUARMState *env, uint32_t imm)
+{
+ daif_check(env, 0x1e, imm, GETPC());
+ env->daif |= (imm << 6) & PSTATE_DAIF;
+ arm_rebuild_hflags(env);
+}
+
+void HELPER(msr_i_daifclear)(CPUARMState *env, uint32_t imm)
+{
+ daif_check(env, 0x1f, imm, GETPC());
+ env->daif &= ~((imm << 6) & PSTATE_DAIF);
+ arm_rebuild_hflags(env);
+}
+
+/* Convert a softfloat float_relation_ (as returned by
+ * the float*_compare functions) to the correct ARM
+ * NZCV flag state.
+ */
+static inline uint32_t float_rel_to_flags(int res)
+{
+ uint64_t flags;
+ switch (res) {
+ case float_relation_equal:
+ flags = PSTATE_Z | PSTATE_C;
+ break;
+ case float_relation_less:
+ flags = PSTATE_N;
+ break;
+ case float_relation_greater:
+ flags = PSTATE_C;
+ break;
+ case float_relation_unordered:
+ default:
+ flags = PSTATE_C | PSTATE_V;
+ break;
+ }
+ return flags;
+}
+
+uint64_t HELPER(vfp_cmph_a64)(uint32_t x, uint32_t y, void *fp_status)
+{
+ return float_rel_to_flags(float16_compare_quiet(x, y, fp_status));
+}
+
+uint64_t HELPER(vfp_cmpeh_a64)(uint32_t x, uint32_t y, void *fp_status)
+{
+ return float_rel_to_flags(float16_compare(x, y, fp_status));
+}
+
+uint64_t HELPER(vfp_cmps_a64)(float32 x, float32 y, void *fp_status)
+{
+ return float_rel_to_flags(float32_compare_quiet(x, y, fp_status));
+}
+
+uint64_t HELPER(vfp_cmpes_a64)(float32 x, float32 y, void *fp_status)
+{
+ return float_rel_to_flags(float32_compare(x, y, fp_status));
+}
+
+uint64_t HELPER(vfp_cmpd_a64)(float64 x, float64 y, void *fp_status)
+{
+ return float_rel_to_flags(float64_compare_quiet(x, y, fp_status));
+}
+
+uint64_t HELPER(vfp_cmped_a64)(float64 x, float64 y, void *fp_status)
+{
+ return float_rel_to_flags(float64_compare(x, y, fp_status));
+}
+
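+/*
+ * FMULX: as FMUL, except that 0 * infinity (in either order) returns 2.0
+ * rather than the default NaN, with the sign set to the XOR of the
+ * operand signs.
+ */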
+float32 HELPER(vfp_mulxs)(float32 a, float32 b, void *fpstp)
+{
+ float_status *fpst = fpstp;
+
+ a = float32_squash_input_denormal(a, fpst);
+ b = float32_squash_input_denormal(b, fpst);
+
+ if ((float32_is_zero(a) && float32_is_infinity(b)) ||
+ (float32_is_infinity(a) && float32_is_zero(b))) {
+ /* 2.0 with the sign bit set to sign(A) XOR sign(B) */
+ return make_float32((1U << 30) |
+ ((float32_val(a) ^ float32_val(b)) & (1U << 31)));
+ }
+ return float32_mul(a, b, fpst);
+}
+
+float64 HELPER(vfp_mulxd)(float64 a, float64 b, void *fpstp)
+{
+ float_status *fpst = fpstp;
+
+ a = float64_squash_input_denormal(a, fpst);
+ b = float64_squash_input_denormal(b, fpst);
+
+ if ((float64_is_zero(a) && float64_is_infinity(b)) ||
+ (float64_is_infinity(a) && float64_is_zero(b))) {
+ /* 2.0 with the sign bit set to sign(A) XOR sign(B) */
+ return make_float64((1ULL << 62) |
+ ((float64_val(a) ^ float64_val(b)) & (1ULL << 63)));
+ }
+ return float64_mul(a, b, fpst);
+}
+
+/* 64bit/double versions of the neon float compare functions */
+uint64_t HELPER(neon_ceq_f64)(float64 a, float64 b, void *fpstp)
+{
+ float_status *fpst = fpstp;
+ return -float64_eq_quiet(a, b, fpst);
+}
+
+uint64_t HELPER(neon_cge_f64)(float64 a, float64 b, void *fpstp)
+{
+ float_status *fpst = fpstp;
+ return -float64_le(b, a, fpst);
+}
+
+uint64_t HELPER(neon_cgt_f64)(float64 a, float64 b, void *fpstp)
+{
+ float_status *fpst = fpstp;
+ return -float64_lt(b, a, fpst);
+}
+
+/* Reciprocal step and sqrt step. Note that unlike the A32/T32
+ * versions, these do a fully fused multiply-add or
+ * multiply-add-and-halve.
+ */
+
+uint32_t HELPER(recpsf_f16)(uint32_t a, uint32_t b, void *fpstp)
+{
+ float_status *fpst = fpstp;
+
+ a = float16_squash_input_denormal(a, fpst);
+ b = float16_squash_input_denormal(b, fpst);
+
+ a = float16_chs(a);
+ if ((float16_is_infinity(a) && float16_is_zero(b)) ||
+ (float16_is_infinity(b) && float16_is_zero(a))) {
+ return float16_two;
+ }
+ return float16_muladd(a, b, float16_two, 0, fpst);
+}
+
+float32 HELPER(recpsf_f32)(float32 a, float32 b, void *fpstp)
+{
+ float_status *fpst = fpstp;
+
+ a = float32_squash_input_denormal(a, fpst);
+ b = float32_squash_input_denormal(b, fpst);
+
+ a = float32_chs(a);
+ if ((float32_is_infinity(a) && float32_is_zero(b)) ||
+ (float32_is_infinity(b) && float32_is_zero(a))) {
+ return float32_two;
+ }
+ return float32_muladd(a, b, float32_two, 0, fpst);
+}
+
+float64 HELPER(recpsf_f64)(float64 a, float64 b, void *fpstp)
+{
+ float_status *fpst = fpstp;
+
+ a = float64_squash_input_denormal(a, fpst);
+ b = float64_squash_input_denormal(b, fpst);
+
+ a = float64_chs(a);
+ if ((float64_is_infinity(a) && float64_is_zero(b)) ||
+ (float64_is_infinity(b) && float64_is_zero(a))) {
+ return float64_two;
+ }
+ return float64_muladd(a, b, float64_two, 0, fpst);
+}
+
+uint32_t HELPER(rsqrtsf_f16)(uint32_t a, uint32_t b, void *fpstp)
+{
+ float_status *fpst = fpstp;
+
+ a = float16_squash_input_denormal(a, fpst);
+ b = float16_squash_input_denormal(b, fpst);
+
+ a = float16_chs(a);
+ if ((float16_is_infinity(a) && float16_is_zero(b)) ||
+ (float16_is_infinity(b) && float16_is_zero(a))) {
+ return float16_one_point_five;
+ }
+ return float16_muladd(a, b, float16_three, float_muladd_halve_result, fpst);
+}
+
+float32 HELPER(rsqrtsf_f32)(float32 a, float32 b, void *fpstp)
+{
+ float_status *fpst = fpstp;
+
+ a = float32_squash_input_denormal(a, fpst);
+ b = float32_squash_input_denormal(b, fpst);
+
+ a = float32_chs(a);
+ if ((float32_is_infinity(a) && float32_is_zero(b)) ||
+ (float32_is_infinity(b) && float32_is_zero(a))) {
+ return float32_one_point_five;
+ }
+ return float32_muladd(a, b, float32_three, float_muladd_halve_result, fpst);
+}
+
+float64 HELPER(rsqrtsf_f64)(float64 a, float64 b, void *fpstp)
+{
+ float_status *fpst = fpstp;
+
+ a = float64_squash_input_denormal(a, fpst);
+ b = float64_squash_input_denormal(b, fpst);
+
+ a = float64_chs(a);
+ if ((float64_is_infinity(a) && float64_is_zero(b)) ||
+ (float64_is_infinity(b) && float64_is_zero(a))) {
+ return float64_one_point_five;
+ }
+ return float64_muladd(a, b, float64_three, float_muladd_halve_result, fpst);
+}
+
+/* Pairwise long add: add pairs of adjacent elements into
+ * double-width elements in the result (eg _s8 is an 8x8->16 op)
+ */
+uint64_t HELPER(neon_addlp_s8)(uint64_t a)
+{
+ uint64_t nsignmask = 0x0080008000800080ULL;
+ uint64_t wsignmask = 0x8000800080008000ULL;
+ uint64_t elementmask = 0x00ff00ff00ff00ffULL;
+ uint64_t tmp1, tmp2;
+ uint64_t res, signres;
+
+ /* Extract odd elements, sign extend each to a 16 bit field */
+ tmp1 = a & elementmask;
+ tmp1 ^= nsignmask;
+ tmp1 |= wsignmask;
+ tmp1 = (tmp1 - nsignmask) ^ wsignmask;
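+ /* e.g. an element of 0x80 (-128): 0x0080 ^ 0x0080 = 0x0000,
+ * | 0x8000 = 0x8000, - 0x0080 = 0x7f80, ^ 0x8000 = 0xff80,
+ * i.e. -128 as a 16 bit value.
+ */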
+ /* Ditto for the even elements */
+ tmp2 = (a >> 8) & elementmask;
+ tmp2 ^= nsignmask;
+ tmp2 |= wsignmask;
+ tmp2 = (tmp2 - nsignmask) ^ wsignmask;
+
+ /* calculate the result by summing bits 0..14, 16..30, etc,
+ * and then adjusting the sign bits 15, 31, etc manually.
+ * This ensures the addition can't overflow the 16 bit field.
+ */
+ signres = (tmp1 ^ tmp2) & wsignmask;
+ res = (tmp1 & ~wsignmask) + (tmp2 & ~wsignmask);
+ res ^= signres;
+
+ return res;
+}
+
+uint64_t HELPER(neon_addlp_u8)(uint64_t a)
+{
+ uint64_t tmp;
+
+ tmp = a & 0x00ff00ff00ff00ffULL;
+ tmp += (a >> 8) & 0x00ff00ff00ff00ffULL;
+ return tmp;
+}
+
+uint64_t HELPER(neon_addlp_s16)(uint64_t a)
+{
+ int32_t reslo, reshi;
+
+ reslo = (int32_t)(int16_t)a + (int32_t)(int16_t)(a >> 16);
+ reshi = (int32_t)(int16_t)(a >> 32) + (int32_t)(int16_t)(a >> 48);
+
+ return (uint32_t)reslo | (((uint64_t)reshi) << 32);
+}
+
+uint64_t HELPER(neon_addlp_u16)(uint64_t a)
+{
+ uint64_t tmp;
+
+ tmp = a & 0x0000ffff0000ffffULL;
+ tmp += (a >> 16) & 0x0000ffff0000ffffULL;
+ return tmp;
+}
+
+/* Floating-point reciprocal exponent - see FPRecpX in ARM ARM */
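+/*
+ * The result keeps the sign of the input, replaces the exponent field with
+ * its bitwise complement (or with the maximum finite exponent if the input
+ * exponent is zero) and clears the fraction.
+ */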
+uint32_t HELPER(frecpx_f16)(uint32_t a, void *fpstp)
+{
+ float_status *fpst = fpstp;
+ uint16_t val16, sbit;
+ int16_t exp;
+
+ if (float16_is_any_nan(a)) {
+ float16 nan = a;
+ if (float16_is_signaling_nan(a, fpst)) {
+ float_raise(float_flag_invalid, fpst);
+ if (!fpst->default_nan_mode) {
+ nan = float16_silence_nan(a, fpst);
+ }
+ }
+ if (fpst->default_nan_mode) {
+ nan = float16_default_nan(fpst);
+ }
+ return nan;
+ }
+
+ a = float16_squash_input_denormal(a, fpst);
+
+ val16 = float16_val(a);
+ sbit = 0x8000 & val16;
+ exp = extract32(val16, 10, 5);
+
+ if (exp == 0) {
+ return make_float16(deposit32(sbit, 10, 5, 0x1e));
+ } else {
+ return make_float16(deposit32(sbit, 10, 5, ~exp));
+ }
+}
+
+float32 HELPER(frecpx_f32)(float32 a, void *fpstp)
+{
+ float_status *fpst = fpstp;
+ uint32_t val32, sbit;
+ int32_t exp;
+
+ if (float32_is_any_nan(a)) {
+ float32 nan = a;
+ if (float32_is_signaling_nan(a, fpst)) {
+ float_raise(float_flag_invalid, fpst);
+ if (!fpst->default_nan_mode) {
+ nan = float32_silence_nan(a, fpst);
+ }
+ }
+ if (fpst->default_nan_mode) {
+ nan = float32_default_nan(fpst);
+ }
+ return nan;
+ }
+
+ a = float32_squash_input_denormal(a, fpst);
+
+ val32 = float32_val(a);
+ sbit = 0x80000000ULL & val32;
+ exp = extract32(val32, 23, 8);
+
+ if (exp == 0) {
+ return make_float32(sbit | (0xfe << 23));
+ } else {
+ return make_float32(sbit | (~exp & 0xff) << 23);
+ }
+}
+
+float64 HELPER(frecpx_f64)(float64 a, void *fpstp)
+{
+ float_status *fpst = fpstp;
+ uint64_t val64, sbit;
+ int64_t exp;
+
+ if (float64_is_any_nan(a)) {
+ float64 nan = a;
+ if (float64_is_signaling_nan(a, fpst)) {
+ float_raise(float_flag_invalid, fpst);
+ if (!fpst->default_nan_mode) {
+ nan = float64_silence_nan(a, fpst);
+ }
+ }
+ if (fpst->default_nan_mode) {
+ nan = float64_default_nan(fpst);
+ }
+ return nan;
+ }
+
+ a = float64_squash_input_denormal(a, fpst);
+
+ val64 = float64_val(a);
+ sbit = 0x8000000000000000ULL & val64;
+ exp = extract64(float64_val(a), 52, 11);
+
+ if (exp == 0) {
+ return make_float64(sbit | (0x7feULL << 52));
+ } else {
+ return make_float64(sbit | (~exp & 0x7ffULL) << 52);
+ }
+}
+
+float32 HELPER(fcvtx_f64_to_f32)(float64 a, CPUARMState *env)
+{
+ /* Von Neumann rounding is implemented by using round-to-zero
+ * and then setting the LSB of the result if Inexact was raised.
+ */
+ float32 r;
+ float_status *fpst = &env->vfp.fp_status;
+ float_status tstat = *fpst;
+ int exflags;
+
+ set_float_rounding_mode(float_round_to_zero, &tstat);
+ set_float_exception_flags(0, &tstat);
+ r = float64_to_float32(a, &tstat);
+ exflags = get_float_exception_flags(&tstat);
+ if (exflags & float_flag_inexact) {
+ r = make_float32(float32_val(r) | 1);
+ }
+ exflags |= get_float_exception_flags(fpst);
+ set_float_exception_flags(exflags, fpst);
+ return r;
+}
+
+/* 64-bit versions of the CRC helpers. Note that although the operation
+ * (and the prototypes of crc32c() and crc32()) means that only the bottom
+ * 32 bits of the accumulator and result are used, we pass and return
+ * uint64_t for convenience of the generated code. Unlike the 32-bit
+ * instruction set versions, val may genuinely have 64 bits of data in it.
+ * The upper bytes of val (above the number specified by 'bytes') must have
+ * been zeroed out by the caller.
+ */
+uint64_t HELPER(crc32_64)(uint64_t acc, uint64_t val, uint32_t bytes)
+{
+ uint8_t buf[8];
+
+ stq_le_p(buf, val);
+
+ /* zlib crc32 converts the accumulator and output to one's complement. */
+ return crc32(acc ^ 0xffffffff, buf, bytes) ^ 0xffffffff;
+}
+
+uint64_t HELPER(crc32c_64)(uint64_t acc, uint64_t val, uint32_t bytes)
+{
+ uint8_t buf[8];
+
+ stq_le_p(buf, val);
+
+ /* Linux crc32c converts the output to one's complement. */
+ return crc32c(acc, buf, bytes) ^ 0xffffffff;
+}
+
+/*
+ * AdvSIMD half-precision
+ */
+
+#define ADVSIMD_HELPER(name, suffix) HELPER(glue(glue(advsimd_, name), suffix))
+
+#define ADVSIMD_HALFOP(name) \
+uint32_t ADVSIMD_HELPER(name, h)(uint32_t a, uint32_t b, void *fpstp) \
+{ \
+ float_status *fpst = fpstp; \
+ return float16_ ## name(a, b, fpst); \
+}
+
+ADVSIMD_HALFOP(add)
+ADVSIMD_HALFOP(sub)
+ADVSIMD_HALFOP(mul)
+ADVSIMD_HALFOP(div)
+ADVSIMD_HALFOP(min)
+ADVSIMD_HALFOP(max)
+ADVSIMD_HALFOP(minnum)
+ADVSIMD_HALFOP(maxnum)
+
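+/*
+ * The "2h" variants operate on two half-precision values packed into each
+ * 32-bit input and return the two 16-bit results packed the same way.
+ */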
+#define ADVSIMD_TWOHALFOP(name) \
+uint32_t ADVSIMD_HELPER(name, 2h)(uint32_t two_a, uint32_t two_b, void *fpstp) \
+{ \
+ float16 a1, a2, b1, b2; \
+ uint32_t r1, r2; \
+ float_status *fpst = fpstp; \
+ a1 = extract32(two_a, 0, 16); \
+ a2 = extract32(two_a, 16, 16); \
+ b1 = extract32(two_b, 0, 16); \
+ b2 = extract32(two_b, 16, 16); \
+ r1 = float16_ ## name(a1, b1, fpst); \
+ r2 = float16_ ## name(a2, b2, fpst); \
+ return deposit32(r1, 16, 16, r2); \
+}
+
+ADVSIMD_TWOHALFOP(add)
+ADVSIMD_TWOHALFOP(sub)
+ADVSIMD_TWOHALFOP(mul)
+ADVSIMD_TWOHALFOP(div)
+ADVSIMD_TWOHALFOP(min)
+ADVSIMD_TWOHALFOP(max)
+ADVSIMD_TWOHALFOP(minnum)
+ADVSIMD_TWOHALFOP(maxnum)
+
+/* Data processing - scalar floating-point and advanced SIMD */
+static float16 float16_mulx(float16 a, float16 b, void *fpstp)
+{
+ float_status *fpst = fpstp;
+
+ a = float16_squash_input_denormal(a, fpst);
+ b = float16_squash_input_denormal(b, fpst);
+
+ if ((float16_is_zero(a) && float16_is_infinity(b)) ||
+ (float16_is_infinity(a) && float16_is_zero(b))) {
+ /* 2.0 with the sign bit set to sign(A) XOR sign(B) */
+ return make_float16((1U << 14) |
+ ((float16_val(a) ^ float16_val(b)) & (1U << 15)));
+ }
+ return float16_mul(a, b, fpst);
+}
+
+ADVSIMD_HALFOP(mulx)
+ADVSIMD_TWOHALFOP(mulx)
+
+/* fused multiply-accumulate */
+uint32_t HELPER(advsimd_muladdh)(uint32_t a, uint32_t b, uint32_t c,
+ void *fpstp)
+{
+ float_status *fpst = fpstp;
+ return float16_muladd(a, b, c, 0, fpst);
+}
+
+uint32_t HELPER(advsimd_muladd2h)(uint32_t two_a, uint32_t two_b,
+ uint32_t two_c, void *fpstp)
+{
+ float_status *fpst = fpstp;
+ float16 a1, a2, b1, b2, c1, c2;
+ uint32_t r1, r2;
+ a1 = extract32(two_a, 0, 16);
+ a2 = extract32(two_a, 16, 16);
+ b1 = extract32(two_b, 0, 16);
+ b2 = extract32(two_b, 16, 16);
+ c1 = extract32(two_c, 0, 16);
+ c2 = extract32(two_c, 16, 16);
+ r1 = float16_muladd(a1, b1, c1, 0, fpst);
+ r2 = float16_muladd(a2, b2, c2, 0, fpst);
+ return deposit32(r1, 16, 16, r2);
+}
+
+/*
+ * Floating point comparisons produce an integer result. Softfloat
+ * routines return float_relation types which we convert to the 0/-1
+ * Neon requires.
+ */
+
+#define ADVSIMD_CMPRES(test) ((test) ? 0xffff : 0)
+
+uint32_t HELPER(advsimd_ceq_f16)(uint32_t a, uint32_t b, void *fpstp)
+{
+ float_status *fpst = fpstp;
+ int compare = float16_compare_quiet(a, b, fpst);
+ return ADVSIMD_CMPRES(compare == float_relation_equal);
+}
+
+uint32_t HELPER(advsimd_cge_f16)(uint32_t a, uint32_t b, void *fpstp)
+{
+ float_status *fpst = fpstp;
+ int compare = float16_compare(a, b, fpst);
+ return ADVSIMD_CMPRES(compare == float_relation_greater ||
+ compare == float_relation_equal);
+}
+
+uint32_t HELPER(advsimd_cgt_f16)(uint32_t a, uint32_t b, void *fpstp)
+{
+ float_status *fpst = fpstp;
+ int compare = float16_compare(a, b, fpst);
+ return ADVSIMD_CMPRES(compare == float_relation_greater);
+}
+
+uint32_t HELPER(advsimd_acge_f16)(uint32_t a, uint32_t b, void *fpstp)
+{
+ float_status *fpst = fpstp;
+ float16 f0 = float16_abs(a);
+ float16 f1 = float16_abs(b);
+ int compare = float16_compare(f0, f1, fpst);
+ return ADVSIMD_CMPRES(compare == float_relation_greater ||
+ compare == float_relation_equal);
+}
+
+uint32_t HELPER(advsimd_acgt_f16)(uint32_t a, uint32_t b, void *fpstp)
+{
+ float_status *fpst = fpstp;
+ float16 f0 = float16_abs(a);
+ float16 f1 = float16_abs(b);
+ int compare = float16_compare(f0, f1, fpst);
+ return ADVSIMD_CMPRES(compare == float_relation_greater);
+}
+
+/* round to integral */
+uint32_t HELPER(advsimd_rinth_exact)(uint32_t x, void *fp_status)
+{
+ return float16_round_to_int(x, fp_status);
+}
+
+uint32_t HELPER(advsimd_rinth)(uint32_t x, void *fp_status)
+{
+ int old_flags = get_float_exception_flags(fp_status), new_flags;
+ float16 ret;
+
+ ret = float16_round_to_int(x, fp_status);
+
+ /* Suppress any inexact exceptions the conversion produced */
+ if (!(old_flags & float_flag_inexact)) {
+ new_flags = get_float_exception_flags(fp_status);
+ set_float_exception_flags(new_flags & ~float_flag_inexact, fp_status);
+ }
+
+ return ret;
+}
+
+/*
+ * Half-precision floating point conversion functions
+ *
+ * There are a multitude of conversion functions with various
+ * different rounding modes. This is dealt with by the calling code
+ * setting the mode appropriately before calling the helper.
+ */
+
+uint32_t HELPER(advsimd_f16tosinth)(uint32_t a, void *fpstp)
+{
+ float_status *fpst = fpstp;
+
+ /* Invalid if we are passed a NaN */
+ if (float16_is_any_nan(a)) {
+ float_raise(float_flag_invalid, fpst);
+ return 0;
+ }
+ return float16_to_int16(a, fpst);
+}
+
+uint32_t HELPER(advsimd_f16touinth)(uint32_t a, void *fpstp)
+{
+ float_status *fpst = fpstp;
+
+ /* Invalid if we are passed a NaN */
+ if (float16_is_any_nan(a)) {
+ float_raise(float_flag_invalid, fpst);
+ return 0;
+ }
+ return float16_to_uint16(a, fpst);
+}
+
+static int el_from_spsr(uint32_t spsr)
+{
+ /* Return the exception level that this SPSR is requesting a return to,
+ * or -1 if it is invalid (an illegal return)
+ */
+ if (spsr & PSTATE_nRW) {
+ switch (spsr & CPSR_M) {
+ case ARM_CPU_MODE_USR:
+ return 0;
+ case ARM_CPU_MODE_HYP:
+ return 2;
+ case ARM_CPU_MODE_FIQ:
+ case ARM_CPU_MODE_IRQ:
+ case ARM_CPU_MODE_SVC:
+ case ARM_CPU_MODE_ABT:
+ case ARM_CPU_MODE_UND:
+ case ARM_CPU_MODE_SYS:
+ return 1;
+ case ARM_CPU_MODE_MON:
+ /* Returning to Mon from AArch64 is never possible,
+ * so this is an illegal return.
+ */
+ default:
+ return -1;
+ }
+ } else {
+ if (extract32(spsr, 1, 1)) {
+ /* Return with reserved M[1] bit set */
+ return -1;
+ }
+ if (extract32(spsr, 0, 4) == 1) {
+ /* return to EL0 with M[0] bit set */
+ return -1;
+ }
+ return extract32(spsr, 2, 2);
+ }
+}
+
+static void cpsr_write_from_spsr_elx(CPUARMState *env,
+ uint32_t val)
+{
+ uint32_t mask;
+
+ /* Save SPSR_ELx.SS into PSTATE. */
+ env->pstate = (env->pstate & ~PSTATE_SS) | (val & PSTATE_SS);
+ val &= ~PSTATE_SS;
+
+ /* Move DIT to the correct location for CPSR */
+ if (val & PSTATE_DIT) {
+ val &= ~PSTATE_DIT;
+ val |= CPSR_DIT;
+ }
+
+ mask = aarch32_cpsr_valid_mask(env->features,
+ &env_archcpu(env)->isar);
+ cpsr_write(env, val, mask, CPSRWriteRaw);
+}
+
+void HELPER(exception_return)(CPUARMState *env, uint64_t new_pc)
+{
+ int cur_el = arm_current_el(env);
+ unsigned int spsr_idx = aarch64_banked_spsr_index(cur_el);
+ uint32_t spsr = env->banked_spsr[spsr_idx];
+ int new_el;
+ bool return_to_aa64 = (spsr & PSTATE_nRW) == 0;
+
+ aarch64_save_sp(env, cur_el);
+
+ arm_clear_exclusive(env);
+
+ /* We must squash the PSTATE.SS bit to zero unless both of the
+ * following hold:
+ * 1. debug exceptions are currently disabled
+ * 2. singlestep will be active in the EL we return to
+ * We check 1 here and 2 after we've done the pstate/cpsr write() to
+ * transition to the EL we're going to.
+ */
+ if (arm_generate_debug_exceptions(env)) {
+ spsr &= ~PSTATE_SS;
+ }
+
+ new_el = el_from_spsr(spsr);
+ if (new_el == -1) {
+ goto illegal_return;
+ }
+ if (new_el > cur_el || (new_el == 2 && !arm_is_el2_enabled(env))) {
+ /* Disallow return to an EL which is unimplemented or higher
+ * than the current one.
+ */
+ goto illegal_return;
+ }
+
+ if (new_el != 0 && arm_el_is_aa64(env, new_el) != return_to_aa64) {
+ /* Return to an EL which is configured for a different register width */
+ goto illegal_return;
+ }
+
+ if (new_el == 1 && (arm_hcr_el2_eff(env) & HCR_TGE)) {
+ goto illegal_return;
+ }
+
+ qemu_mutex_lock_iothread();
+ arm_call_pre_el_change_hook(env_archcpu(env));
+ qemu_mutex_unlock_iothread();
+
+ if (!return_to_aa64) {
+ env->aarch64 = false;
+ /* We do a raw CPSR write because aarch64_sync_64_to_32()
+ * will sort the register banks out for us, and we've already
+ * caught all the bad-mode cases in el_from_spsr().
+ */
+ cpsr_write_from_spsr_elx(env, spsr);
+ if (!arm_singlestep_active(env)) {
+ env->pstate &= ~PSTATE_SS;
+ }
+ aarch64_sync_64_to_32(env);
+
+ if (spsr & CPSR_T) {
+ env->regs[15] = new_pc & ~0x1;
+ } else {
+ env->regs[15] = new_pc & ~0x3;
+ }
+ helper_rebuild_hflags_a32(env, new_el);
+ qemu_log_mask(CPU_LOG_INT, "Exception return from AArch64 EL%d to "
+ "AArch32 EL%d PC 0x%" PRIx32 "\n",
+ cur_el, new_el, env->regs[15]);
+ } else {
+ int tbii;
+
+ env->aarch64 = true;
+ spsr &= aarch64_pstate_valid_mask(&env_archcpu(env)->isar);
+ pstate_write(env, spsr);
+ if (!arm_singlestep_active(env)) {
+ env->pstate &= ~PSTATE_SS;
+ }
+ aarch64_restore_sp(env, new_el);
+ helper_rebuild_hflags_a64(env, new_el);
+
+ /*
+ * Apply TBI to the exception return address. We had to delay this
+ * until after we selected the new EL, so that we could select the
+ * correct TBI+TBID bits. This is made easier by waiting until after
+ * the hflags rebuild, since we can pull the composite TBII field
+ * from there.
+ */
+ tbii = EX_TBFLAG_A64(env->hflags, TBII);
+ if ((tbii >> extract64(new_pc, 55, 1)) & 1) {
+ /* TBI is enabled. */
+ int core_mmu_idx = cpu_mmu_index(env, false);
+ if (regime_has_2_ranges(core_to_aa64_mmu_idx(core_mmu_idx))) {
+ new_pc = sextract64(new_pc, 0, 56);
+ } else {
+ new_pc = extract64(new_pc, 0, 56);
+ }
+ }
+ env->pc = new_pc;
+
+ qemu_log_mask(CPU_LOG_INT, "Exception return from AArch64 EL%d to "
+ "AArch64 EL%d PC 0x%" PRIx64 "\n",
+ cur_el, new_el, env->pc);
+ }
+
+ /*
+ * Note that cur_el can never be 0. If new_el is 0, then
+ * el0_a64 is return_to_aa64, else el0_a64 is ignored.
+ */
+ aarch64_sve_change_el(env, cur_el, new_el, return_to_aa64);
+
+ qemu_mutex_lock_iothread();
+ arm_call_el_change_hook(env_archcpu(env));
+ qemu_mutex_unlock_iothread();
+
+ return;
+
+illegal_return:
+ /* Illegal return events of various kinds have architecturally
+ * mandated behaviour:
+ * restore NZCV and DAIF from SPSR_ELx
+ * set PSTATE.IL
+ * restore PC from ELR_ELx
+ * no change to exception level, execution state or stack pointer
+ */
+ env->pstate |= PSTATE_IL;
+ env->pc = new_pc;
+ spsr &= PSTATE_NZCV | PSTATE_DAIF;
+ spsr |= pstate_read(env) & ~(PSTATE_NZCV | PSTATE_DAIF);
+ pstate_write(env, spsr);
+ if (!arm_singlestep_active(env)) {
+ env->pstate &= ~PSTATE_SS;
+ }
+ helper_rebuild_hflags_a64(env, cur_el);
+ qemu_log_mask(LOG_GUEST_ERROR, "Illegal exception return at EL%d: "
+ "resuming execution at 0x%" PRIx64 "\n", cur_el, env->pc);
+}
+
+/*
+ * Square Root and Reciprocal square root
+ */
+
+uint32_t HELPER(sqrt_f16)(uint32_t a, void *fpstp)
+{
+ float_status *s = fpstp;
+
+ return float16_sqrt(a, s);
+}
+
+void HELPER(dc_zva)(CPUARMState *env, uint64_t vaddr_in)
+{
+ /*
+ * Implement DC ZVA, which zeroes a fixed-length block of memory.
+ * Note that we do not implement the (architecturally mandated)
+ * alignment fault for attempts to use this on Device memory
+ * (which matches the usual QEMU behaviour of not implementing either
+ * alignment faults or any memory attribute handling).
+ */
+ int blocklen = 4 << env_archcpu(env)->dcz_blocksize;
+ uint64_t vaddr = vaddr_in & ~(blocklen - 1);
+ int mmu_idx = cpu_mmu_index(env, false);
+ void *mem;
+
+ /*
+ * Trapless lookup. In addition to an actual invalid page, this may
+ * return NULL for I/O, watchpoints, clean pages, etc.
+ */
+ mem = tlb_vaddr_to_host(env, vaddr, MMU_DATA_STORE, mmu_idx);
+
+#ifndef CONFIG_USER_ONLY
+ if (unlikely(!mem)) {
+ uintptr_t ra = GETPC();
+
+ /*
+ * Trap if accessing an invalid page. DC_ZVA requires that we supply
+ * the original pointer for an invalid page. But watchpoints require
+ * that we probe the actual space. So do both.
+ */
+ (void) probe_write(env, vaddr_in, 1, mmu_idx, ra);
+ mem = probe_write(env, vaddr, blocklen, mmu_idx, ra);
+
+ if (unlikely(!mem)) {
+ /*
+ * The only remaining reason for mem == NULL is I/O.
+ * Just do a series of byte writes as the architecture demands.
+ */
+ for (int i = 0; i < blocklen; i++) {
+ cpu_stb_mmuidx_ra(env, vaddr + i, 0, mmu_idx, ra);
+ }
+ return;
+ }
+ }
+#endif
+
+ memset(mem, 0, blocklen);
+}
--- /dev/null
+/*
+ * iwMMXt micro operations for XScale.
+ *
+ * Copyright (c) 2007 OpenedHand, Ltd.
+ * Written by Andrzej Zaborowski <andrew@openedhand.com>
+ * Copyright (c) 2008 CodeSourcery
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+
+#include "cpu.h"
+#include "exec/helper-proto.h"
+
+/* iwMMXt macros extracted from GNU gdb. */
+
+/* Set the SIMD wCASF flags for 8, 16, 32 or 64-bit operations. */
+#define SIMD8_SET(v, n, b) ((v != 0) << ((((b) + 1) * 4) + (n)))
+#define SIMD16_SET(v, n, h) ((v != 0) << ((((h) + 1) * 8) + (n)))
+#define SIMD32_SET(v, n, w) ((v != 0) << ((((w) + 1) * 16) + (n)))
+#define SIMD64_SET(v, n) ((v != 0) << (32 + (n)))
+/* Flags to pass as "n" above. */
+#define SIMD_NBIT -1
+#define SIMD_ZBIT -2
+#define SIMD_CBIT -3
+#define SIMD_VBIT -4
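+/*
+ * For example, SIMD8_SET(v, SIMD_ZBIT, 3) places the Z flag of byte lane 3
+ * at bit (3 + 1) * 4 - 2 = 14, and SIMD16_SET(v, SIMD_NBIT, 0) places the
+ * N flag of halfword lane 0 at bit 7.
+ */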
+/* Various status bit macros. */
+#define NBIT8(x) ((x) & 0x80)
+#define NBIT16(x) ((x) & 0x8000)
+#define NBIT32(x) ((x) & 0x80000000)
+#define NBIT64(x) ((x) & 0x8000000000000000ULL)
+#define ZBIT8(x) (((x) & 0xff) == 0)
+#define ZBIT16(x) (((x) & 0xffff) == 0)
+#define ZBIT32(x) (((x) & 0xffffffff) == 0)
+#define ZBIT64(x) ((x) == 0)
+/* Sign extension macros. */
+#define EXTEND8H(a) ((uint16_t) (int8_t) (a))
+#define EXTEND8(a) ((uint32_t) (int8_t) (a))
+#define EXTEND16(a) ((uint32_t) (int16_t) (a))
+#define EXTEND16S(a) ((int32_t) (int16_t) (a))
+#define EXTEND32(a) ((uint64_t) (int32_t) (a))
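+/* e.g. EXTEND8H(0x80) == 0xff80, EXTEND16S(0xfffe) == -2 */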
+
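+/*
+ * Pairwise 16 x 16 -> 32 multiply-accumulate: each 32-bit half of the
+ * result is the sum of the products of the corresponding pairs of 16-bit
+ * lanes of a and b (signed for maddsq, unsigned for madduq).
+ */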
+uint64_t HELPER(iwmmxt_maddsq)(uint64_t a, uint64_t b)
+{
+ a = ((
+ EXTEND16S((a >> 0) & 0xffff) * EXTEND16S((b >> 0) & 0xffff) +
+ EXTEND16S((a >> 16) & 0xffff) * EXTEND16S((b >> 16) & 0xffff)
+ ) & 0xffffffff) | ((uint64_t) (
+ EXTEND16S((a >> 32) & 0xffff) * EXTEND16S((b >> 32) & 0xffff) +
+ EXTEND16S((a >> 48) & 0xffff) * EXTEND16S((b >> 48) & 0xffff)
+ ) << 32);
+ return a;
+}
+
+uint64_t HELPER(iwmmxt_madduq)(uint64_t a, uint64_t b)
+{
+ a = ((
+ ((a >> 0) & 0xffff) * ((b >> 0) & 0xffff) +
+ ((a >> 16) & 0xffff) * ((b >> 16) & 0xffff)
+ ) & 0xffffffff) | ((
+ ((a >> 32) & 0xffff) * ((b >> 32) & 0xffff) +
+ ((a >> 48) & 0xffff) * ((b >> 48) & 0xffff)
+ ) << 32);
+ return a;
+}
+
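+/* Sum of absolute differences across all byte (sadb) or halfword (sadw) lanes. */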
+uint64_t HELPER(iwmmxt_sadb)(uint64_t a, uint64_t b)
+{
+#define abs(x) (((x) >= 0) ? (x) : -(x))
+#define SADB(SHR) abs((int) ((a >> SHR) & 0xff) - (int) ((b >> SHR) & 0xff))
+ return
+ SADB(0) + SADB(8) + SADB(16) + SADB(24) +
+ SADB(32) + SADB(40) + SADB(48) + SADB(56);
+#undef SADB
+}
+
+uint64_t HELPER(iwmmxt_sadw)(uint64_t a, uint64_t b)
+{
+#define SADW(SHR) \
+ abs((int) ((a >> SHR) & 0xffff) - (int) ((b >> SHR) & 0xffff))
+ return SADW(0) + SADW(16) + SADW(32) + SADW(48);
+#undef SADW
+}
+
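+/*
+ * Per-lane 16 x 16 multiplies: mulsl/mulul keep the low 16 bits of each
+ * (signed/unsigned) product, mulsh/muluh keep the high 16 bits.
+ */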
+uint64_t HELPER(iwmmxt_mulslw)(uint64_t a, uint64_t b)
+{
+#define MULS(SHR) ((uint64_t) ((( \
+ EXTEND16S((a >> SHR) & 0xffff) * EXTEND16S((b >> SHR) & 0xffff) \
+ ) >> 0) & 0xffff) << SHR)
+ return MULS(0) | MULS(16) | MULS(32) | MULS(48);
+#undef MULS
+}
+
+uint64_t HELPER(iwmmxt_mulshw)(uint64_t a, uint64_t b)
+{
+#define MULS(SHR) ((uint64_t) ((( \
+ EXTEND16S((a >> SHR) & 0xffff) * EXTEND16S((b >> SHR) & 0xffff) \
+ ) >> 16) & 0xffff) << SHR)
+ return MULS(0) | MULS(16) | MULS(32) | MULS(48);
+#undef MULS
+}
+
+uint64_t HELPER(iwmmxt_mululw)(uint64_t a, uint64_t b)
+{
+#define MULU(SHR) ((uint64_t) ((( \
+ ((a >> SHR) & 0xffff) * ((b >> SHR) & 0xffff) \
+ ) >> 0) & 0xffff) << SHR)
+ return MULU(0) | MULU(16) | MULU(32) | MULU(48);
+#undef MULU
+}
+
+uint64_t HELPER(iwmmxt_muluhw)(uint64_t a, uint64_t b)
+{
+#define MULU(SHR) ((uint64_t) ((( \
+ ((a >> SHR) & 0xffff) * ((b >> SHR) & 0xffff) \
+ ) >> 16) & 0xffff) << SHR)
+ return MULU(0) | MULU(16) | MULU(32) | MULU(48);
+#undef MULU
+}
+
+uint64_t HELPER(iwmmxt_macsw)(uint64_t a, uint64_t b)
+{
+#define MACS(SHR) ( \
+ EXTEND16((a >> SHR) & 0xffff) * EXTEND16S((b >> SHR) & 0xffff))
+ return (int64_t) (MACS(0) + MACS(16) + MACS(32) + MACS(48));
+#undef MACS
+}
+
+uint64_t HELPER(iwmmxt_macuw)(uint64_t a, uint64_t b)
+{
+#define MACU(SHR) ( \
+ (uint32_t) ((a >> SHR) & 0xffff) * \
+ (uint32_t) ((b >> SHR) & 0xffff))
+ return MACU(0) + MACU(16) + MACU(32) + MACU(48);
+#undef MACU
+}
+
+#define NZBIT8(x, i) \
+ SIMD8_SET(NBIT8((x) & 0xff), SIMD_NBIT, i) | \
+ SIMD8_SET(ZBIT8((x) & 0xff), SIMD_ZBIT, i)
+#define NZBIT16(x, i) \
+ SIMD16_SET(NBIT16((x) & 0xffff), SIMD_NBIT, i) | \
+ SIMD16_SET(ZBIT16((x) & 0xffff), SIMD_ZBIT, i)
+#define NZBIT32(x, i) \
+ SIMD32_SET(NBIT32((x) & 0xffffffff), SIMD_NBIT, i) | \
+ SIMD32_SET(ZBIT32((x) & 0xffffffff), SIMD_ZBIT, i)
+#define NZBIT64(x) \
+ SIMD64_SET(NBIT64(x), SIMD_NBIT) | \
+ SIMD64_SET(ZBIT64(x), SIMD_ZBIT)
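+/* e.g. NZBIT16(0x8000, 1) yields only bit 15 set: the N flag of halfword lane 1 */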
+#define IWMMXT_OP_UNPACK(S, SH0, SH1, SH2, SH3) \
+uint64_t HELPER(glue(iwmmxt_unpack, glue(S, b)))(CPUARMState *env, \
+ uint64_t a, uint64_t b) \
+{ \
+ a = \
+ (((a >> SH0) & 0xff) << 0) | (((b >> SH0) & 0xff) << 8) | \
+ (((a >> SH1) & 0xff) << 16) | (((b >> SH1) & 0xff) << 24) | \
+ (((a >> SH2) & 0xff) << 32) | (((b >> SH2) & 0xff) << 40) | \
+ (((a >> SH3) & 0xff) << 48) | (((b >> SH3) & 0xff) << 56); \
+ env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \
+ NZBIT8(a >> 0, 0) | NZBIT8(a >> 8, 1) | \
+ NZBIT8(a >> 16, 2) | NZBIT8(a >> 24, 3) | \
+ NZBIT8(a >> 32, 4) | NZBIT8(a >> 40, 5) | \
+ NZBIT8(a >> 48, 6) | NZBIT8(a >> 56, 7); \
+ return a; \
+} \
+uint64_t HELPER(glue(iwmmxt_unpack, glue(S, w)))(CPUARMState *env, \
+ uint64_t a, uint64_t b) \
+{ \
+ a = \
+ (((a >> SH0) & 0xffff) << 0) | \
+ (((b >> SH0) & 0xffff) << 16) | \
+ (((a >> SH2) & 0xffff) << 32) | \
+ (((b >> SH2) & 0xffff) << 48); \
+ env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \
+        NZBIT16(a >> 0, 0) | NZBIT16(a >> 16, 1) | \
+        NZBIT16(a >> 32, 2) | NZBIT16(a >> 48, 3); \
+ return a; \
+} \
+uint64_t HELPER(glue(iwmmxt_unpack, glue(S, l)))(CPUARMState *env, \
+ uint64_t a, uint64_t b) \
+{ \
+ a = \
+ (((a >> SH0) & 0xffffffff) << 0) | \
+ (((b >> SH0) & 0xffffffff) << 32); \
+ env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \
+ NZBIT32(a >> 0, 0) | NZBIT32(a >> 32, 1); \
+ return a; \
+} \
+uint64_t HELPER(glue(iwmmxt_unpack, glue(S, ub)))(CPUARMState *env, \
+ uint64_t x) \
+{ \
+ x = \
+ (((x >> SH0) & 0xff) << 0) | \
+ (((x >> SH1) & 0xff) << 16) | \
+ (((x >> SH2) & 0xff) << 32) | \
+ (((x >> SH3) & 0xff) << 48); \
+ env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \
+ NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) | \
+ NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3); \
+ return x; \
+} \
+uint64_t HELPER(glue(iwmmxt_unpack, glue(S, uw)))(CPUARMState *env, \
+ uint64_t x) \
+{ \
+ x = \
+ (((x >> SH0) & 0xffff) << 0) | \
+ (((x >> SH2) & 0xffff) << 32); \
+ env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \
+ NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1); \
+ return x; \
+} \
+uint64_t HELPER(glue(iwmmxt_unpack, glue(S, ul)))(CPUARMState *env, \
+ uint64_t x) \
+{ \
+ x = (((x >> SH0) & 0xffffffff) << 0); \
+ env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x >> 0); \
+ return x; \
+} \
+uint64_t HELPER(glue(iwmmxt_unpack, glue(S, sb)))(CPUARMState *env, \
+ uint64_t x) \
+{ \
+ x = \
+ ((uint64_t) EXTEND8H((x >> SH0) & 0xff) << 0) | \
+ ((uint64_t) EXTEND8H((x >> SH1) & 0xff) << 16) | \
+ ((uint64_t) EXTEND8H((x >> SH2) & 0xff) << 32) | \
+ ((uint64_t) EXTEND8H((x >> SH3) & 0xff) << 48); \
+ env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \
+ NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) | \
+ NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3); \
+ return x; \
+} \
+uint64_t HELPER(glue(iwmmxt_unpack, glue(S, sw)))(CPUARMState *env, \
+ uint64_t x) \
+{ \
+ x = \
+ ((uint64_t) EXTEND16((x >> SH0) & 0xffff) << 0) | \
+ ((uint64_t) EXTEND16((x >> SH2) & 0xffff) << 32); \
+ env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \
+ NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1); \
+ return x; \
+} \
+uint64_t HELPER(glue(iwmmxt_unpack, glue(S, sl)))(CPUARMState *env, \
+ uint64_t x) \
+{ \
+ x = EXTEND32((x >> SH0) & 0xffffffff); \
+ env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x >> 0); \
+ return x; \
+}
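+/* 'l' interleaves/extends the low halves of the sources, 'h' the high halves. */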
+IWMMXT_OP_UNPACK(l, 0, 8, 16, 24)
+IWMMXT_OP_UNPACK(h, 32, 40, 48, 56)
+
+#define IWMMXT_OP_CMP(SUFF, Tb, Tw, Tl, O) \
+uint64_t HELPER(glue(iwmmxt_, glue(SUFF, b)))(CPUARMState *env, \
+ uint64_t a, uint64_t b) \
+{ \
+ a = \
+ CMP(0, Tb, O, 0xff) | CMP(8, Tb, O, 0xff) | \
+ CMP(16, Tb, O, 0xff) | CMP(24, Tb, O, 0xff) | \
+ CMP(32, Tb, O, 0xff) | CMP(40, Tb, O, 0xff) | \
+ CMP(48, Tb, O, 0xff) | CMP(56, Tb, O, 0xff); \
+ env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \
+ NZBIT8(a >> 0, 0) | NZBIT8(a >> 8, 1) | \
+ NZBIT8(a >> 16, 2) | NZBIT8(a >> 24, 3) | \
+ NZBIT8(a >> 32, 4) | NZBIT8(a >> 40, 5) | \
+ NZBIT8(a >> 48, 6) | NZBIT8(a >> 56, 7); \
+ return a; \
+} \
+uint64_t HELPER(glue(iwmmxt_, glue(SUFF, w)))(CPUARMState *env, \
+ uint64_t a, uint64_t b) \
+{ \
+ a = CMP(0, Tw, O, 0xffff) | CMP(16, Tw, O, 0xffff) | \
+ CMP(32, Tw, O, 0xffff) | CMP(48, Tw, O, 0xffff); \
+ env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \
+ NZBIT16(a >> 0, 0) | NZBIT16(a >> 16, 1) | \
+ NZBIT16(a >> 32, 2) | NZBIT16(a >> 48, 3); \
+ return a; \
+} \
+uint64_t HELPER(glue(iwmmxt_, glue(SUFF, l)))(CPUARMState *env, \
+ uint64_t a, uint64_t b) \
+{ \
+ a = CMP(0, Tl, O, 0xffffffff) | \
+ CMP(32, Tl, O, 0xffffffff); \
+ env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \
+ NZBIT32(a >> 0, 0) | NZBIT32(a >> 32, 1); \
+ return a; \
+}
+#define CMP(SHR, TYPE, OPER, MASK) ((((TYPE) ((a >> SHR) & MASK) OPER \
+ (TYPE) ((b >> SHR) & MASK)) ? (uint64_t) MASK : 0) << SHR)
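+/*
+ * With this form of CMP, each lane of the result is all-ones when the
+ * comparison holds for that lane and zero otherwise (cmpeq/cmpgts/cmpgtu).
+ */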
+IWMMXT_OP_CMP(cmpeq, uint8_t, uint16_t, uint32_t, ==)
+IWMMXT_OP_CMP(cmpgts, int8_t, int16_t, int32_t, >)
+IWMMXT_OP_CMP(cmpgtu, uint8_t, uint16_t, uint32_t, >)
+#undef CMP
+#define CMP(SHR, TYPE, OPER, MASK) ((((TYPE) ((a >> SHR) & MASK) OPER \
+ (TYPE) ((b >> SHR) & MASK)) ? a : b) & ((uint64_t) MASK << SHR))
+IWMMXT_OP_CMP(mins, int8_t, int16_t, int32_t, <)
+IWMMXT_OP_CMP(minu, uint8_t, uint16_t, uint32_t, <)
+IWMMXT_OP_CMP(maxs, int8_t, int16_t, int32_t, >)
+IWMMXT_OP_CMP(maxu, uint8_t, uint16_t, uint32_t, >)
+#undef CMP
+#define CMP(SHR, TYPE, OPER, MASK) ((uint64_t) (((TYPE) ((a >> SHR) & MASK) \
+ OPER (TYPE) ((b >> SHR) & MASK)) & MASK) << SHR)
+IWMMXT_OP_CMP(subn, uint8_t, uint16_t, uint32_t, -)
+IWMMXT_OP_CMP(addn, uint8_t, uint16_t, uint32_t, +)
+#undef CMP
+/* TODO Signed- and Unsigned-Saturation */
+#define CMP(SHR, TYPE, OPER, MASK) ((uint64_t) (((TYPE) ((a >> SHR) & MASK) \
+ OPER (TYPE) ((b >> SHR) & MASK)) & MASK) << SHR)
+IWMMXT_OP_CMP(subu, uint8_t, uint16_t, uint32_t, -)
+IWMMXT_OP_CMP(addu, uint8_t, uint16_t, uint32_t, +)
+IWMMXT_OP_CMP(subs, int8_t, int16_t, int32_t, -)
+IWMMXT_OP_CMP(adds, int8_t, int16_t, int32_t, +)
+#undef CMP
+#undef IWMMXT_OP_CMP
+
+#define AVGB(SHR) ((( \
+ ((a >> SHR) & 0xff) + ((b >> SHR) & 0xff) + round) >> 1) << SHR)
+#define IWMMXT_OP_AVGB(r) \
+uint64_t HELPER(iwmmxt_avgb##r)(CPUARMState *env, uint64_t a, uint64_t b) \
+{ \
+ const int round = r; \
+ a = AVGB(0) | AVGB(8) | AVGB(16) | AVGB(24) | \
+ AVGB(32) | AVGB(40) | AVGB(48) | AVGB(56); \
+ env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \
+ SIMD8_SET(ZBIT8((a >> 0) & 0xff), SIMD_ZBIT, 0) | \
+ SIMD8_SET(ZBIT8((a >> 8) & 0xff), SIMD_ZBIT, 1) | \
+ SIMD8_SET(ZBIT8((a >> 16) & 0xff), SIMD_ZBIT, 2) | \
+ SIMD8_SET(ZBIT8((a >> 24) & 0xff), SIMD_ZBIT, 3) | \
+ SIMD8_SET(ZBIT8((a >> 32) & 0xff), SIMD_ZBIT, 4) | \
+ SIMD8_SET(ZBIT8((a >> 40) & 0xff), SIMD_ZBIT, 5) | \
+ SIMD8_SET(ZBIT8((a >> 48) & 0xff), SIMD_ZBIT, 6) | \
+ SIMD8_SET(ZBIT8((a >> 56) & 0xff), SIMD_ZBIT, 7); \
+ return a; \
+}
+IWMMXT_OP_AVGB(0)
+IWMMXT_OP_AVGB(1)
+#undef IWMMXT_OP_AVGB
+#undef AVGB
+
+#define AVGW(SHR) ((( \
+ ((a >> SHR) & 0xffff) + ((b >> SHR) & 0xffff) + round) >> 1) << SHR)
+#define IWMMXT_OP_AVGW(r) \
+uint64_t HELPER(iwmmxt_avgw##r)(CPUARMState *env, uint64_t a, uint64_t b) \
+{ \
+ const int round = r; \
+ a = AVGW(0) | AVGW(16) | AVGW(32) | AVGW(48); \
+ env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = \
+ SIMD16_SET(ZBIT16((a >> 0) & 0xffff), SIMD_ZBIT, 0) | \
+ SIMD16_SET(ZBIT16((a >> 16) & 0xffff), SIMD_ZBIT, 1) | \
+ SIMD16_SET(ZBIT16((a >> 32) & 0xffff), SIMD_ZBIT, 2) | \
+ SIMD16_SET(ZBIT16((a >> 48) & 0xffff), SIMD_ZBIT, 3); \
+ return a; \
+}
+IWMMXT_OP_AVGW(0)
+IWMMXT_OP_AVGW(1)
+#undef IWMMXT_OP_AVGW
+#undef AVGW
+
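+/*
+ * Extract 8 bytes from the 16-byte concatenation {b:a} starting at byte
+ * offset n; e.g. n == 3 returns bytes 3..10 of {b:a}.
+ */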
+uint64_t HELPER(iwmmxt_align)(uint64_t a, uint64_t b, uint32_t n)
+{
+ a >>= n << 3;
+ a |= b << (64 - (n << 3));
+ return a;
+}
+
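+/*
+ * Clear the field selected by mask b at bit offset n in x, then insert
+ * (a & b) there; b is the mask giving the width of the inserted element.
+ */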
+uint64_t HELPER(iwmmxt_insr)(uint64_t x, uint32_t a, uint32_t b, uint32_t n)
+{
+ x &= ~((uint64_t) b << n);
+ x |= (uint64_t) (a & b) << n;
+ return x;
+}
+
+uint32_t HELPER(iwmmxt_setpsr_nz)(uint64_t x)
+{
+ return SIMD64_SET((x == 0), SIMD_ZBIT) |
+ SIMD64_SET((x & (1ULL << 63)), SIMD_NBIT);
+}
+
+uint64_t HELPER(iwmmxt_bcstb)(uint32_t arg)
+{
+ arg &= 0xff;
+ return
+ ((uint64_t) arg << 0 ) | ((uint64_t) arg << 8 ) |
+ ((uint64_t) arg << 16) | ((uint64_t) arg << 24) |
+ ((uint64_t) arg << 32) | ((uint64_t) arg << 40) |
+ ((uint64_t) arg << 48) | ((uint64_t) arg << 56);
+}
+
+uint64_t HELPER(iwmmxt_bcstw)(uint32_t arg)
+{
+ arg &= 0xffff;
+ return
+ ((uint64_t) arg << 0 ) | ((uint64_t) arg << 16) |
+ ((uint64_t) arg << 32) | ((uint64_t) arg << 48);
+}
+
+uint64_t HELPER(iwmmxt_bcstl)(uint32_t arg)
+{
+ return arg | ((uint64_t) arg << 32);
+}
+
+uint64_t HELPER(iwmmxt_addcb)(uint64_t x)
+{
+ return
+ ((x >> 0) & 0xff) + ((x >> 8) & 0xff) +
+ ((x >> 16) & 0xff) + ((x >> 24) & 0xff) +
+ ((x >> 32) & 0xff) + ((x >> 40) & 0xff) +
+ ((x >> 48) & 0xff) + ((x >> 56) & 0xff);
+}
+
+uint64_t HELPER(iwmmxt_addcw)(uint64_t x)
+{
+ return
+ ((x >> 0) & 0xffff) + ((x >> 16) & 0xffff) +
+ ((x >> 32) & 0xffff) + ((x >> 48) & 0xffff);
+}
+
+uint64_t HELPER(iwmmxt_addcl)(uint64_t x)
+{
+ return (x & 0xffffffff) + (x >> 32);
+}
+
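+/* Gather the most significant bit of each 8/16/32-bit lane into a bitmask. */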
+uint32_t HELPER(iwmmxt_msbb)(uint64_t x)
+{
+ return
+ ((x >> 7) & 0x01) | ((x >> 14) & 0x02) |
+ ((x >> 21) & 0x04) | ((x >> 28) & 0x08) |
+ ((x >> 35) & 0x10) | ((x >> 42) & 0x20) |
+ ((x >> 49) & 0x40) | ((x >> 56) & 0x80);
+}
+
+uint32_t HELPER(iwmmxt_msbw)(uint64_t x)
+{
+ return
+ ((x >> 15) & 0x01) | ((x >> 30) & 0x02) |
+        ((x >> 45) & 0x04) | ((x >> 60) & 0x08);
+}
+
+uint32_t HELPER(iwmmxt_msbl)(uint64_t x)
+{
+ return ((x >> 31) & 0x01) | ((x >> 62) & 0x02);
+}
+
+/* FIXME: Split wCASF setting into a separate op to avoid env use. */
+uint64_t HELPER(iwmmxt_srlw)(CPUARMState *env, uint64_t x, uint32_t n)
+{
+ x = (((x & (0xffffll << 0)) >> n) & (0xffffll << 0)) |
+ (((x & (0xffffll << 16)) >> n) & (0xffffll << 16)) |
+ (((x & (0xffffll << 32)) >> n) & (0xffffll << 32)) |
+ (((x & (0xffffll << 48)) >> n) & (0xffffll << 48));
+ env->iwmmxt.cregs[ARM_IWMMXT_wCASF] =
+ NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) |
+ NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3);
+ return x;
+}
+
+uint64_t HELPER(iwmmxt_srll)(CPUARMState *env, uint64_t x, uint32_t n)
+{
+ x = ((x & (0xffffffffll << 0)) >> n) |
+ ((x >> n) & (0xffffffffll << 32));
+ env->iwmmxt.cregs[ARM_IWMMXT_wCASF] =
+ NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1);
+ return x;
+}
+
+uint64_t HELPER(iwmmxt_srlq)(CPUARMState *env, uint64_t x, uint32_t n)
+{
+ x >>= n;
+ env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x);
+ return x;
+}
+
+uint64_t HELPER(iwmmxt_sllw)(CPUARMState *env, uint64_t x, uint32_t n)
+{
+ x = (((x & (0xffffll << 0)) << n) & (0xffffll << 0)) |
+ (((x & (0xffffll << 16)) << n) & (0xffffll << 16)) |
+ (((x & (0xffffll << 32)) << n) & (0xffffll << 32)) |
+ (((x & (0xffffll << 48)) << n) & (0xffffll << 48));
+ env->iwmmxt.cregs[ARM_IWMMXT_wCASF] =
+ NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) |
+ NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3);
+ return x;
+}
+
+uint64_t HELPER(iwmmxt_slll)(CPUARMState *env, uint64_t x, uint32_t n)
+{
+ x = ((x << n) & (0xffffffffll << 0)) |
+ ((x & (0xffffffffll << 32)) << n);
+ env->iwmmxt.cregs[ARM_IWMMXT_wCASF] =
+ NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1);
+ return x;
+}
+
+uint64_t HELPER(iwmmxt_sllq)(CPUARMState *env, uint64_t x, uint32_t n)
+{
+ x <<= n;
+ env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x);
+ return x;
+}
+
+uint64_t HELPER(iwmmxt_sraw)(CPUARMState *env, uint64_t x, uint32_t n)
+{
+ x = ((uint64_t) ((EXTEND16(x >> 0) >> n) & 0xffff) << 0) |
+ ((uint64_t) ((EXTEND16(x >> 16) >> n) & 0xffff) << 16) |
+ ((uint64_t) ((EXTEND16(x >> 32) >> n) & 0xffff) << 32) |
+ ((uint64_t) ((EXTEND16(x >> 48) >> n) & 0xffff) << 48);
+ env->iwmmxt.cregs[ARM_IWMMXT_wCASF] =
+ NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) |
+ NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3);
+ return x;
+}
+
+uint64_t HELPER(iwmmxt_sral)(CPUARMState *env, uint64_t x, uint32_t n)
+{
+ x = (((EXTEND32(x >> 0) >> n) & 0xffffffff) << 0) |
+ (((EXTEND32(x >> 32) >> n) & 0xffffffff) << 32);
+ env->iwmmxt.cregs[ARM_IWMMXT_wCASF] =
+ NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1);
+ return x;
+}
+
+uint64_t HELPER(iwmmxt_sraq)(CPUARMState *env, uint64_t x, uint32_t n)
+{
+ x = (int64_t) x >> n;
+ env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x);
+ return x;
+}
+
+uint64_t HELPER(iwmmxt_rorw)(CPUARMState *env, uint64_t x, uint32_t n)
+{
+ x = ((((x & (0xffffll << 0)) >> n) |
+ ((x & (0xffffll << 0)) << (16 - n))) & (0xffffll << 0)) |
+ ((((x & (0xffffll << 16)) >> n) |
+ ((x & (0xffffll << 16)) << (16 - n))) & (0xffffll << 16)) |
+ ((((x & (0xffffll << 32)) >> n) |
+ ((x & (0xffffll << 32)) << (16 - n))) & (0xffffll << 32)) |
+ ((((x & (0xffffll << 48)) >> n) |
+ ((x & (0xffffll << 48)) << (16 - n))) & (0xffffll << 48));
+ env->iwmmxt.cregs[ARM_IWMMXT_wCASF] =
+ NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) |
+ NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3);
+ return x;
+}
+
+uint64_t HELPER(iwmmxt_rorl)(CPUARMState *env, uint64_t x, uint32_t n)
+{
+ x = ((x & (0xffffffffll << 0)) >> n) |
+ ((x >> n) & (0xffffffffll << 32)) |
+ ((x << (32 - n)) & (0xffffffffll << 0)) |
+ ((x & (0xffffffffll << 32)) << (32 - n));
+ env->iwmmxt.cregs[ARM_IWMMXT_wCASF] =
+ NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1);
+ return x;
+}
+
+uint64_t HELPER(iwmmxt_rorq)(CPUARMState *env, uint64_t x, uint32_t n)
+{
+ x = ror64(x, n);
+ env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x);
+ return x;
+}
+
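+/* Halfword shuffle: result halfword i is source halfword (n >> (2 * i)) & 3. */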
+uint64_t HELPER(iwmmxt_shufh)(CPUARMState *env, uint64_t x, uint32_t n)
+{
+ x = (((x >> ((n << 4) & 0x30)) & 0xffff) << 0) |
+ (((x >> ((n << 2) & 0x30)) & 0xffff) << 16) |
+ (((x >> ((n << 0) & 0x30)) & 0xffff) << 32) |
+ (((x >> ((n >> 2) & 0x30)) & 0xffff) << 48);
+ env->iwmmxt.cregs[ARM_IWMMXT_wCASF] =
+ NZBIT16(x >> 0, 0) | NZBIT16(x >> 16, 1) |
+ NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3);
+ return x;
+}
+
+/* TODO: Unsigned-Saturation */
+uint64_t HELPER(iwmmxt_packuw)(CPUARMState *env, uint64_t a, uint64_t b)
+{
+ a = (((a >> 0) & 0xff) << 0) | (((a >> 16) & 0xff) << 8) |
+ (((a >> 32) & 0xff) << 16) | (((a >> 48) & 0xff) << 24) |
+ (((b >> 0) & 0xff) << 32) | (((b >> 16) & 0xff) << 40) |
+ (((b >> 32) & 0xff) << 48) | (((b >> 48) & 0xff) << 56);
+ env->iwmmxt.cregs[ARM_IWMMXT_wCASF] =
+ NZBIT8(a >> 0, 0) | NZBIT8(a >> 8, 1) |
+ NZBIT8(a >> 16, 2) | NZBIT8(a >> 24, 3) |
+ NZBIT8(a >> 32, 4) | NZBIT8(a >> 40, 5) |
+ NZBIT8(a >> 48, 6) | NZBIT8(a >> 56, 7);
+ return a;
+}
+
+uint64_t HELPER(iwmmxt_packul)(CPUARMState *env, uint64_t a, uint64_t b)
+{
+ a = (((a >> 0) & 0xffff) << 0) | (((a >> 32) & 0xffff) << 16) |
+ (((b >> 0) & 0xffff) << 32) | (((b >> 32) & 0xffff) << 48);
+ env->iwmmxt.cregs[ARM_IWMMXT_wCASF] =
+ NZBIT16(a >> 0, 0) | NZBIT16(a >> 16, 1) |
+ NZBIT16(a >> 32, 2) | NZBIT16(a >> 48, 3);
+ return a;
+}
+
+uint64_t HELPER(iwmmxt_packuq)(CPUARMState *env, uint64_t a, uint64_t b)
+{
+ a = (a & 0xffffffff) | ((b & 0xffffffff) << 32);
+ env->iwmmxt.cregs[ARM_IWMMXT_wCASF] =
+ NZBIT32(a >> 0, 0) | NZBIT32(a >> 32, 1);
+ return a;
+}
+
+/* TODO: Signed-Saturation */
+uint64_t HELPER(iwmmxt_packsw)(CPUARMState *env, uint64_t a, uint64_t b)
+{
+ a = (((a >> 0) & 0xff) << 0) | (((a >> 16) & 0xff) << 8) |
+ (((a >> 32) & 0xff) << 16) | (((a >> 48) & 0xff) << 24) |
+ (((b >> 0) & 0xff) << 32) | (((b >> 16) & 0xff) << 40) |
+ (((b >> 32) & 0xff) << 48) | (((b >> 48) & 0xff) << 56);
+ env->iwmmxt.cregs[ARM_IWMMXT_wCASF] =
+ NZBIT8(a >> 0, 0) | NZBIT8(a >> 8, 1) |
+ NZBIT8(a >> 16, 2) | NZBIT8(a >> 24, 3) |
+ NZBIT8(a >> 32, 4) | NZBIT8(a >> 40, 5) |
+ NZBIT8(a >> 48, 6) | NZBIT8(a >> 56, 7);
+ return a;
+}
+
+uint64_t HELPER(iwmmxt_packsl)(CPUARMState *env, uint64_t a, uint64_t b)
+{
+ a = (((a >> 0) & 0xffff) << 0) | (((a >> 32) & 0xffff) << 16) |
+ (((b >> 0) & 0xffff) << 32) | (((b >> 32) & 0xffff) << 48);
+ env->iwmmxt.cregs[ARM_IWMMXT_wCASF] =
+ NZBIT16(a >> 0, 0) | NZBIT16(a >> 16, 1) |
+ NZBIT16(a >> 32, 2) | NZBIT16(a >> 48, 3);
+ return a;
+}
+
+uint64_t HELPER(iwmmxt_packsq)(CPUARMState *env, uint64_t a, uint64_t b)
+{
+ a = (a & 0xffffffff) | ((b & 0xffffffff) << 32);
+ env->iwmmxt.cregs[ARM_IWMMXT_wCASF] =
+ NZBIT32(a >> 0, 0) | NZBIT32(a >> 32, 1);
+ return a;
+}
+
+uint64_t HELPER(iwmmxt_muladdsl)(uint64_t c, uint32_t a, uint32_t b)
+{
+ return c + ((int32_t) EXTEND32(a) * (int32_t) EXTEND32(b));
+}
+
+uint64_t HELPER(iwmmxt_muladdsw)(uint64_t c, uint32_t a, uint32_t b)
+{
+ c += EXTEND32(EXTEND16S((a >> 0) & 0xffff) *
+ EXTEND16S((b >> 0) & 0xffff));
+ c += EXTEND32(EXTEND16S((a >> 16) & 0xffff) *
+ EXTEND16S((b >> 16) & 0xffff));
+ return c;
+}
+
+uint64_t HELPER(iwmmxt_muladdswl)(uint64_t c, uint32_t a, uint32_t b)
+{
+ return c + (EXTEND32(EXTEND16S(a & 0xffff) *
+ EXTEND16S(b & 0xffff)));
+}
--- /dev/null
+/*
+ * ARM generic helpers.
+ *
+ * This code is licensed under the GNU GPL v2 or later.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "cpu.h"
+#include "internals.h"
+#include "exec/helper-proto.h"
+#include "qemu/main-loop.h"
+#include "qemu/bitops.h"
+#include "qemu/log.h"
+#include "exec/exec-all.h"
+#ifdef CONFIG_TCG
+#include "exec/cpu_ldst.h"
+#include "semihosting/common-semi.h"
+#endif
+#if !defined(CONFIG_USER_ONLY)
+#include "hw/intc/armv7m_nvic.h"
+#endif
+
+static void v7m_msr_xpsr(CPUARMState *env, uint32_t mask,
+ uint32_t reg, uint32_t val)
+{
+ /* Only APSR is actually writable */
+ if (!(reg & 4)) {
+ uint32_t apsrmask = 0;
+
+ if (mask & 8) {
+ apsrmask |= XPSR_NZCV | XPSR_Q;
+ }
+ if ((mask & 4) && arm_feature(env, ARM_FEATURE_THUMB_DSP)) {
+ apsrmask |= XPSR_GE;
+ }
+ xpsr_write(env, val, apsrmask);
+ }
+}
+
+static uint32_t v7m_mrs_xpsr(CPUARMState *env, uint32_t reg, unsigned el)
+{
+ uint32_t mask = 0;
+
+ if ((reg & 1) && el) {
+ mask |= XPSR_EXCP; /* IPSR (unpriv. reads as zero) */
+ }
+ if (!(reg & 4)) {
+ mask |= XPSR_NZCV | XPSR_Q; /* APSR */
+ if (arm_feature(env, ARM_FEATURE_THUMB_DSP)) {
+ mask |= XPSR_GE;
+ }
+ }
+ /* EPSR reads as zero */
+ return xpsr_read(env) & mask;
+}
+
+static uint32_t v7m_mrs_control(CPUARMState *env, uint32_t secure)
+{
+ uint32_t value = env->v7m.control[secure];
+
+ if (!secure) {
+ /* SFPA is RAZ/WI from NS; FPCA is stored in the M_REG_S bank */
+ value |= env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK;
+ }
+ return value;
+}
+
+#ifdef CONFIG_USER_ONLY
+
+void HELPER(v7m_msr)(CPUARMState *env, uint32_t maskreg, uint32_t val)
+{
+ uint32_t mask = extract32(maskreg, 8, 4);
+ uint32_t reg = extract32(maskreg, 0, 8);
+
+ switch (reg) {
+ case 0 ... 7: /* xPSR sub-fields */
+ v7m_msr_xpsr(env, mask, reg, val);
+ break;
+ case 20: /* CONTROL */
+ /* There are no sub-fields that are actually writable from EL0. */
+ break;
+ default:
+ /* Unprivileged writes to other registers are ignored */
+ break;
+ }
+}
+
+uint32_t HELPER(v7m_mrs)(CPUARMState *env, uint32_t reg)
+{
+ switch (reg) {
+ case 0 ... 7: /* xPSR sub-fields */
+ return v7m_mrs_xpsr(env, reg, 0);
+ case 20: /* CONTROL */
+ return v7m_mrs_control(env, 0);
+ default:
+ /* Unprivileged reads others as zero. */
+ return 0;
+ }
+}
+
+void HELPER(v7m_bxns)(CPUARMState *env, uint32_t dest)
+{
+ /* translate.c should never generate calls here in user-only mode */
+ g_assert_not_reached();
+}
+
+void HELPER(v7m_blxns)(CPUARMState *env, uint32_t dest)
+{
+ /* translate.c should never generate calls here in user-only mode */
+ g_assert_not_reached();
+}
+
+void HELPER(v7m_preserve_fp_state)(CPUARMState *env)
+{
+ /* translate.c should never generate calls here in user-only mode */
+ g_assert_not_reached();
+}
+
+void HELPER(v7m_vlstm)(CPUARMState *env, uint32_t fptr)
+{
+ /* translate.c should never generate calls here in user-only mode */
+ g_assert_not_reached();
+}
+
+void HELPER(v7m_vlldm)(CPUARMState *env, uint32_t fptr)
+{
+ /* translate.c should never generate calls here in user-only mode */
+ g_assert_not_reached();
+}
+
+uint32_t HELPER(v7m_tt)(CPUARMState *env, uint32_t addr, uint32_t op)
+{
+ /*
+ * The TT instructions can be used by unprivileged code, but in
+ * user-only emulation we don't have the MPU.
+ * Luckily since we know we are NonSecure unprivileged (and that in
+ * turn means that the A flag wasn't specified), all the bits in the
+ * register must be zero:
+ * IREGION: 0 because IRVALID is 0
+ * IRVALID: 0 because NS
+ * S: 0 because NS
+ * NSRW: 0 because NS
+ * NSR: 0 because NS
+ * RW: 0 because unpriv and A flag not set
+ * R: 0 because unpriv and A flag not set
+ * SRVALID: 0 because NS
+ * MRVALID: 0 because unpriv and A flag not set
+     * SREGION: 0 because SRVALID is 0
+ * MREGION: 0 because MRVALID is 0
+ */
+ return 0;
+}
+
+ARMMMUIdx arm_v7m_mmu_idx_for_secstate(CPUARMState *env, bool secstate)
+{
+ return ARMMMUIdx_MUser;
+}
+
+#else /* !CONFIG_USER_ONLY */
+
+static ARMMMUIdx arm_v7m_mmu_idx_all(CPUARMState *env,
+ bool secstate, bool priv, bool negpri)
+{
+ ARMMMUIdx mmu_idx = ARM_MMU_IDX_M;
+
+ if (priv) {
+ mmu_idx |= ARM_MMU_IDX_M_PRIV;
+ }
+
+ if (negpri) {
+ mmu_idx |= ARM_MMU_IDX_M_NEGPRI;
+ }
+
+ if (secstate) {
+ mmu_idx |= ARM_MMU_IDX_M_S;
+ }
+
+ return mmu_idx;
+}
+
+static ARMMMUIdx arm_v7m_mmu_idx_for_secstate_and_priv(CPUARMState *env,
+ bool secstate, bool priv)
+{
+ bool negpri = armv7m_nvic_neg_prio_requested(env->nvic, secstate);
+
+ return arm_v7m_mmu_idx_all(env, secstate, priv, negpri);
+}
+
+/* Return the MMU index for a v7M CPU in the specified security state */
+ARMMMUIdx arm_v7m_mmu_idx_for_secstate(CPUARMState *env, bool secstate)
+{
+ bool priv = arm_v7m_is_handler_mode(env) ||
+ !(env->v7m.control[secstate] & 1);
+
+ return arm_v7m_mmu_idx_for_secstate_and_priv(env, secstate, priv);
+}
+
+/*
+ * What kind of stack write are we doing? This affects how exceptions
+ * generated during the stacking are treated.
+ */
+typedef enum StackingMode {
+ STACK_NORMAL,
+ STACK_IGNFAULTS,
+ STACK_LAZYFP,
+} StackingMode;
+
+static bool v7m_stack_write(ARMCPU *cpu, uint32_t addr, uint32_t value,
+ ARMMMUIdx mmu_idx, StackingMode mode)
+{
+ CPUState *cs = CPU(cpu);
+ CPUARMState *env = &cpu->env;
+ MemTxResult txres;
+ GetPhysAddrResult res = {};
+ ARMMMUFaultInfo fi = {};
+ bool secure = mmu_idx & ARM_MMU_IDX_M_S;
+ int exc;
+ bool exc_secure;
+
+ if (get_phys_addr(env, addr, MMU_DATA_STORE, mmu_idx, &res, &fi)) {
+ /* MPU/SAU lookup failed */
+ if (fi.type == ARMFault_QEMU_SFault) {
+ if (mode == STACK_LAZYFP) {
+ qemu_log_mask(CPU_LOG_INT,
+ "...SecureFault with SFSR.LSPERR "
+ "during lazy stacking\n");
+ env->v7m.sfsr |= R_V7M_SFSR_LSPERR_MASK;
+ } else {
+ qemu_log_mask(CPU_LOG_INT,
+ "...SecureFault with SFSR.AUVIOL "
+ "during stacking\n");
+ env->v7m.sfsr |= R_V7M_SFSR_AUVIOL_MASK;
+ }
+ env->v7m.sfsr |= R_V7M_SFSR_SFARVALID_MASK;
+ env->v7m.sfar = addr;
+ exc = ARMV7M_EXCP_SECURE;
+ exc_secure = false;
+ } else {
+ if (mode == STACK_LAZYFP) {
+ qemu_log_mask(CPU_LOG_INT,
+ "...MemManageFault with CFSR.MLSPERR\n");
+ env->v7m.cfsr[secure] |= R_V7M_CFSR_MLSPERR_MASK;
+ } else {
+ qemu_log_mask(CPU_LOG_INT,
+ "...MemManageFault with CFSR.MSTKERR\n");
+ env->v7m.cfsr[secure] |= R_V7M_CFSR_MSTKERR_MASK;
+ }
+ exc = ARMV7M_EXCP_MEM;
+ exc_secure = secure;
+ }
+ goto pend_fault;
+ }
+ address_space_stl_le(arm_addressspace(cs, res.f.attrs), res.f.phys_addr,
+ value, res.f.attrs, &txres);
+ if (txres != MEMTX_OK) {
+ /* BusFault trying to write the data */
+ if (mode == STACK_LAZYFP) {
+ qemu_log_mask(CPU_LOG_INT, "...BusFault with BFSR.LSPERR\n");
+ env->v7m.cfsr[M_REG_NS] |= R_V7M_CFSR_LSPERR_MASK;
+ } else {
+ qemu_log_mask(CPU_LOG_INT, "...BusFault with BFSR.STKERR\n");
+ env->v7m.cfsr[M_REG_NS] |= R_V7M_CFSR_STKERR_MASK;
+ }
+ exc = ARMV7M_EXCP_BUS;
+ exc_secure = false;
+ goto pend_fault;
+ }
+ return true;
+
+pend_fault:
+ /*
+ * By pending the exception at this point we are making
+ * the IMPDEF choice "overridden exceptions pended" (see the
+ * MergeExcInfo() pseudocode). The other choice would be to not
+ * pend them now and then make a choice about which to throw away
+ * later if we have two derived exceptions.
+ * The only case when we must not pend the exception but instead
+ * throw it away is if we are doing the push of the callee registers
+ * and we've already generated a derived exception (this is indicated
+ * by the caller passing STACK_IGNFAULTS). Even in this case we will
+ * still update the fault status registers.
+ */
+ switch (mode) {
+ case STACK_NORMAL:
+ armv7m_nvic_set_pending_derived(env->nvic, exc, exc_secure);
+ break;
+ case STACK_LAZYFP:
+ armv7m_nvic_set_pending_lazyfp(env->nvic, exc, exc_secure);
+ break;
+ case STACK_IGNFAULTS:
+ break;
+ }
+ return false;
+}
+
+static bool v7m_stack_read(ARMCPU *cpu, uint32_t *dest, uint32_t addr,
+ ARMMMUIdx mmu_idx)
+{
+ CPUState *cs = CPU(cpu);
+ CPUARMState *env = &cpu->env;
+ MemTxResult txres;
+ GetPhysAddrResult res = {};
+ ARMMMUFaultInfo fi = {};
+ bool secure = mmu_idx & ARM_MMU_IDX_M_S;
+ int exc;
+ bool exc_secure;
+ uint32_t value;
+
+ if (get_phys_addr(env, addr, MMU_DATA_LOAD, mmu_idx, &res, &fi)) {
+ /* MPU/SAU lookup failed */
+ if (fi.type == ARMFault_QEMU_SFault) {
+ qemu_log_mask(CPU_LOG_INT,
+ "...SecureFault with SFSR.AUVIOL during unstack\n");
+ env->v7m.sfsr |= R_V7M_SFSR_AUVIOL_MASK | R_V7M_SFSR_SFARVALID_MASK;
+ env->v7m.sfar = addr;
+ exc = ARMV7M_EXCP_SECURE;
+ exc_secure = false;
+ } else {
+ qemu_log_mask(CPU_LOG_INT,
+ "...MemManageFault with CFSR.MUNSTKERR\n");
+ env->v7m.cfsr[secure] |= R_V7M_CFSR_MUNSTKERR_MASK;
+ exc = ARMV7M_EXCP_MEM;
+ exc_secure = secure;
+ }
+ goto pend_fault;
+ }
+
+ value = address_space_ldl(arm_addressspace(cs, res.f.attrs),
+ res.f.phys_addr, res.f.attrs, &txres);
+ if (txres != MEMTX_OK) {
+ /* BusFault trying to read the data */
+ qemu_log_mask(CPU_LOG_INT, "...BusFault with BFSR.UNSTKERR\n");
+ env->v7m.cfsr[M_REG_NS] |= R_V7M_CFSR_UNSTKERR_MASK;
+ exc = ARMV7M_EXCP_BUS;
+ exc_secure = false;
+ goto pend_fault;
+ }
+
+ *dest = value;
+ return true;
+
+pend_fault:
+ /*
+ * By pending the exception at this point we are making
+ * the IMPDEF choice "overridden exceptions pended" (see the
+ * MergeExcInfo() pseudocode). The other choice would be to not
+ * pend them now and then make a choice about which to throw away
+ * later if we have two derived exceptions.
+ */
+ armv7m_nvic_set_pending(env->nvic, exc, exc_secure);
+ return false;
+}
+
+void HELPER(v7m_preserve_fp_state)(CPUARMState *env)
+{
+ /*
+ * Preserve FP state (because LSPACT was set and we are about
+ * to execute an FP instruction). This corresponds to the
+ * PreserveFPState() pseudocode.
+ * We may throw an exception if the stacking fails.
+ */
+ ARMCPU *cpu = env_archcpu(env);
+ bool is_secure = env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_S_MASK;
+ bool negpri = !(env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_HFRDY_MASK);
+ bool is_priv = !(env->v7m.fpccr[is_secure] & R_V7M_FPCCR_USER_MASK);
+ bool splimviol = env->v7m.fpccr[is_secure] & R_V7M_FPCCR_SPLIMVIOL_MASK;
+ uint32_t fpcar = env->v7m.fpcar[is_secure];
+ bool stacked_ok = true;
+ bool ts = is_secure && (env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_TS_MASK);
+ bool take_exception;
+
+ /* Take the iothread lock as we are going to touch the NVIC */
+ qemu_mutex_lock_iothread();
+
+ /* Check the background context had access to the FPU */
+ if (!v7m_cpacr_pass(env, is_secure, is_priv)) {
+ armv7m_nvic_set_pending_lazyfp(env->nvic, ARMV7M_EXCP_USAGE, is_secure);
+ env->v7m.cfsr[is_secure] |= R_V7M_CFSR_NOCP_MASK;
+ stacked_ok = false;
+ } else if (!is_secure && !extract32(env->v7m.nsacr, 10, 1)) {
+ armv7m_nvic_set_pending_lazyfp(env->nvic, ARMV7M_EXCP_USAGE, M_REG_S);
+ env->v7m.cfsr[M_REG_S] |= R_V7M_CFSR_NOCP_MASK;
+ stacked_ok = false;
+ }
+
+ if (!splimviol && stacked_ok) {
+ /* We only stack if the stack limit wasn't violated */
+ int i;
+ ARMMMUIdx mmu_idx;
+
+ mmu_idx = arm_v7m_mmu_idx_all(env, is_secure, is_priv, negpri);
+ for (i = 0; i < (ts ? 32 : 16); i += 2) {
+ uint64_t dn = *aa32_vfp_dreg(env, i / 2);
+ uint32_t faddr = fpcar + 4 * i;
+ uint32_t slo = extract64(dn, 0, 32);
+ uint32_t shi = extract64(dn, 32, 32);
+
+ if (i >= 16) {
+ faddr += 8; /* skip the slot for the FPSCR/VPR */
+ }
+ stacked_ok = stacked_ok &&
+ v7m_stack_write(cpu, faddr, slo, mmu_idx, STACK_LAZYFP) &&
+ v7m_stack_write(cpu, faddr + 4, shi, mmu_idx, STACK_LAZYFP);
+ }
+
+ stacked_ok = stacked_ok &&
+ v7m_stack_write(cpu, fpcar + 0x40,
+ vfp_get_fpscr(env), mmu_idx, STACK_LAZYFP);
+ if (cpu_isar_feature(aa32_mve, cpu)) {
+ stacked_ok = stacked_ok &&
+ v7m_stack_write(cpu, fpcar + 0x44,
+ env->v7m.vpr, mmu_idx, STACK_LAZYFP);
+ }
+ }
+
+ /*
+ * We definitely pended an exception, but it's possible that it
+ * might not be able to be taken now. If its priority permits us
+ * to take it now, then we must not update the LSPACT or FP regs,
+ * but instead jump out to take the exception immediately.
+ * If it's just pending and won't be taken until the current
+ * handler exits, then we do update LSPACT and the FP regs.
+ */
+ take_exception = !stacked_ok &&
+ armv7m_nvic_can_take_pending_exception(env->nvic);
+
+ qemu_mutex_unlock_iothread();
+
+ if (take_exception) {
+ raise_exception_ra(env, EXCP_LAZYFP, 0, 1, GETPC());
+ }
+
+ env->v7m.fpccr[is_secure] &= ~R_V7M_FPCCR_LSPACT_MASK;
+
+ if (ts) {
+ /* Clear s0 to s31 and the FPSCR and VPR */
+ int i;
+
+ for (i = 0; i < 32; i += 2) {
+ *aa32_vfp_dreg(env, i / 2) = 0;
+ }
+ vfp_set_fpscr(env, 0);
+ if (cpu_isar_feature(aa32_mve, cpu)) {
+ env->v7m.vpr = 0;
+ }
+ }
+ /*
+ * Otherwise s0 to s15, FPSCR and VPR are UNKNOWN; we choose to leave them
+ * unchanged.
+ */
+}
+
+/*
+ * Write to v7M CONTROL.SPSEL bit for the specified security bank.
+ * This may change the current stack pointer between Main and Process
+ * stack pointers if it is done for the CONTROL register for the current
+ * security state.
+ */
+static void write_v7m_control_spsel_for_secstate(CPUARMState *env,
+ bool new_spsel,
+ bool secstate)
+{
+ bool old_is_psp = v7m_using_psp(env);
+
+ env->v7m.control[secstate] =
+ deposit32(env->v7m.control[secstate],
+ R_V7M_CONTROL_SPSEL_SHIFT,
+ R_V7M_CONTROL_SPSEL_LENGTH, new_spsel);
+
+ if (secstate == env->v7m.secure) {
+ bool new_is_psp = v7m_using_psp(env);
+ uint32_t tmp;
+
+ if (old_is_psp != new_is_psp) {
+ tmp = env->v7m.other_sp;
+ env->v7m.other_sp = env->regs[13];
+ env->regs[13] = tmp;
+ }
+ }
+}
+
+/*
+ * Write to v7M CONTROL.SPSEL bit. This may change the current
+ * stack pointer between Main and Process stack pointers.
+ */
+static void write_v7m_control_spsel(CPUARMState *env, bool new_spsel)
+{
+ write_v7m_control_spsel_for_secstate(env, new_spsel, env->v7m.secure);
+}
+
+void write_v7m_exception(CPUARMState *env, uint32_t new_exc)
+{
+ /*
+ * Write a new value to v7m.exception, thus transitioning into or out
+ * of Handler mode; this may result in a change of active stack pointer.
+ */
+ bool new_is_psp, old_is_psp = v7m_using_psp(env);
+ uint32_t tmp;
+
+ env->v7m.exception = new_exc;
+
+ new_is_psp = v7m_using_psp(env);
+
+ if (old_is_psp != new_is_psp) {
+ tmp = env->v7m.other_sp;
+ env->v7m.other_sp = env->regs[13];
+ env->regs[13] = tmp;
+ }
+}
+
+/* Switch M profile security state between NS and S */
+static void switch_v7m_security_state(CPUARMState *env, bool new_secstate)
+{
+ uint32_t new_ss_msp, new_ss_psp;
+
+ if (env->v7m.secure == new_secstate) {
+ return;
+ }
+
+ /*
+ * All the banked state is accessed by looking at env->v7m.secure
+ * except for the stack pointer; rearrange the SP appropriately.
+ */
+ new_ss_msp = env->v7m.other_ss_msp;
+ new_ss_psp = env->v7m.other_ss_psp;
+
+ if (v7m_using_psp(env)) {
+ env->v7m.other_ss_psp = env->regs[13];
+ env->v7m.other_ss_msp = env->v7m.other_sp;
+ } else {
+ env->v7m.other_ss_msp = env->regs[13];
+ env->v7m.other_ss_psp = env->v7m.other_sp;
+ }
+
+ env->v7m.secure = new_secstate;
+
+ if (v7m_using_psp(env)) {
+ env->regs[13] = new_ss_psp;
+ env->v7m.other_sp = new_ss_msp;
+ } else {
+ env->regs[13] = new_ss_msp;
+ env->v7m.other_sp = new_ss_psp;
+ }
+}
+
+void HELPER(v7m_bxns)(CPUARMState *env, uint32_t dest)
+{
+ /*
+ * Handle v7M BXNS:
+ * - if the return value is a magic value, do exception return (like BX)
+ * - otherwise bit 0 of the return value is the target security state
+ */
+ uint32_t min_magic;
+
+ if (arm_feature(env, ARM_FEATURE_M_SECURITY)) {
+ /* Covers FNC_RETURN and EXC_RETURN magic */
+ min_magic = FNC_RETURN_MIN_MAGIC;
+ } else {
+ /* EXC_RETURN magic only */
+ min_magic = EXC_RETURN_MIN_MAGIC;
+ }
+
+ if (dest >= min_magic) {
+ /*
+ * This is an exception return magic value; put it where
+ * do_v7m_exception_exit() expects and raise EXCEPTION_EXIT.
+ * Note that if we ever add gen_ss_advance() singlestep support to
+ * M profile this should count as an "instruction execution complete"
+ * event (compare gen_bx_excret_final_code()).
+ */
+ env->regs[15] = dest & ~1;
+ env->thumb = dest & 1;
+ HELPER(exception_internal)(env, EXCP_EXCEPTION_EXIT);
+ /* notreached */
+ }
+
+ /* translate.c should have made BXNS UNDEF unless we're secure */
+ assert(env->v7m.secure);
+
+ if (!(dest & 1)) {
+ env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_SFPA_MASK;
+ }
+ switch_v7m_security_state(env, dest & 1);
+ env->thumb = true;
+ env->regs[15] = dest & ~1;
+ arm_rebuild_hflags(env);
+}
+
+void HELPER(v7m_blxns)(CPUARMState *env, uint32_t dest)
+{
+ /*
+ * Handle v7M BLXNS:
+ * - bit 0 of the destination address is the target security state
+ */
+
+ /* At this point regs[15] is the address just after the BLXNS */
+ uint32_t nextinst = env->regs[15] | 1;
+ uint32_t sp = env->regs[13] - 8;
+ uint32_t saved_psr;
+
+ /* translate.c will have made BLXNS UNDEF unless we're secure */
+ assert(env->v7m.secure);
+
+ if (dest & 1) {
+ /*
+ * Target is Secure, so this is just a normal BLX,
+ * except that the low bit doesn't indicate Thumb/not.
+ */
+ env->regs[14] = nextinst;
+ env->thumb = true;
+ env->regs[15] = dest & ~1;
+ return;
+ }
+
+ /* Target is non-secure: first push a stack frame */
+ if (!QEMU_IS_ALIGNED(sp, 8)) {
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "BLXNS with misaligned SP is UNPREDICTABLE\n");
+ }
+
+ if (sp < v7m_sp_limit(env)) {
+ raise_exception(env, EXCP_STKOF, 0, 1);
+ }
+
+ saved_psr = env->v7m.exception;
+ if (env->v7m.control[M_REG_S] & R_V7M_CONTROL_SFPA_MASK) {
+ saved_psr |= XPSR_SFPA;
+ }
+
+ /* Note that these stores can throw exceptions on MPU faults */
+ cpu_stl_data_ra(env, sp, nextinst, GETPC());
+ cpu_stl_data_ra(env, sp + 4, saved_psr, GETPC());
+
+ env->regs[13] = sp;
+ env->regs[14] = 0xfeffffff;
+ if (arm_v7m_is_handler_mode(env)) {
+ /*
+ * Write a dummy value to IPSR, to avoid leaking the current secure
+ * exception number to non-secure code. This is guaranteed not
+ * to cause write_v7m_exception() to actually change stacks.
+ */
+ write_v7m_exception(env, 1);
+ }
+ env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_SFPA_MASK;
+ switch_v7m_security_state(env, 0);
+ env->thumb = true;
+ env->regs[15] = dest;
+ arm_rebuild_hflags(env);
+}
+
+static uint32_t *get_v7m_sp_ptr(CPUARMState *env, bool secure, bool threadmode,
+ bool spsel)
+{
+ /*
+ * Return a pointer to the location where we currently store the
+ * stack pointer for the requested security state and thread mode.
+ * This pointer will become invalid if the CPU state is updated
+ * such that the stack pointers are switched around (eg changing
+ * the SPSEL control bit).
+ * Compare the v8M ARM ARM pseudocode LookUpSP_with_security_mode().
+ * Unlike that pseudocode, we require the caller to pass us in the
+ * SPSEL control bit value; this is because we also use this
+ * function in handling of pushing of the callee-saves registers
+ * part of the v8M stack frame (pseudocode PushCalleeStack()),
+ * and in the tailchain codepath the SPSEL bit comes from the exception
+ * return magic LR value from the previous exception. The pseudocode
+ * opencodes the stack-selection in PushCalleeStack(), but we prefer
+ * to make this utility function generic enough to do the job.
+ */
+ bool want_psp = threadmode && spsel;
+
+ if (secure == env->v7m.secure) {
+ if (want_psp == v7m_using_psp(env)) {
+ return &env->regs[13];
+ } else {
+ return &env->v7m.other_sp;
+ }
+ } else {
+ if (want_psp) {
+ return &env->v7m.other_ss_psp;
+ } else {
+ return &env->v7m.other_ss_msp;
+ }
+ }
+}
+
+static bool arm_v7m_load_vector(ARMCPU *cpu, int exc, bool targets_secure,
+ uint32_t *pvec)
+{
+ CPUState *cs = CPU(cpu);
+ CPUARMState *env = &cpu->env;
+ MemTxResult result;
+ uint32_t addr = env->v7m.vecbase[targets_secure] + exc * 4;
+ uint32_t vector_entry;
+ MemTxAttrs attrs = {};
+ ARMMMUIdx mmu_idx;
+ bool exc_secure;
+
+ qemu_log_mask(CPU_LOG_INT,
+ "...loading from element %d of %s vector table at 0x%x\n",
+ exc, targets_secure ? "secure" : "non-secure", addr);
+
+ mmu_idx = arm_v7m_mmu_idx_for_secstate_and_priv(env, targets_secure, true);
+
+ /*
+ * We don't do a get_phys_addr() here because the rules for vector
+ * loads are special: they always use the default memory map, and
+ * the default memory map permits reads from all addresses.
+     * Since there's no easy way to tell pmsav8_mpu_lookup() that we want
+     * this special case (one which would always say "yes"), we just do the
+     * SAU lookup here followed by a direct physical load.
+ */
+ attrs.secure = targets_secure;
+ attrs.user = false;
+
+ if (arm_feature(env, ARM_FEATURE_M_SECURITY)) {
+ V8M_SAttributes sattrs = {};
+
+ v8m_security_lookup(env, addr, MMU_DATA_LOAD, mmu_idx,
+ targets_secure, &sattrs);
+ if (sattrs.ns) {
+ attrs.secure = false;
+ } else if (!targets_secure) {
+ /*
+ * NS access to S memory: the underlying exception which we escalate
+ * to HardFault is SecureFault, which always targets Secure.
+ */
+ exc_secure = true;
+ goto load_fail;
+ }
+ }
+
+ vector_entry = address_space_ldl(arm_addressspace(cs, attrs), addr,
+ attrs, &result);
+ if (result != MEMTX_OK) {
+ /*
+ * Underlying exception is BusFault: its target security state
+ * depends on BFHFNMINS.
+ */
+ exc_secure = !(cpu->env.v7m.aircr & R_V7M_AIRCR_BFHFNMINS_MASK);
+ goto load_fail;
+ }
+ *pvec = vector_entry;
+ qemu_log_mask(CPU_LOG_INT, "...loaded new PC 0x%x\n", *pvec);
+ return true;
+
+load_fail:
+ /*
+ * All vector table fetch fails are reported as HardFault, with
+ * HFSR.VECTTBL and .FORCED set. (FORCED is set because
+ * technically the underlying exception is a SecureFault or BusFault
+ * that is escalated to HardFault.) This is a terminal exception,
+ * so we will either take the HardFault immediately or else enter
+ * lockup (the latter case is handled in armv7m_nvic_set_pending_derived()).
+ * The HardFault is Secure if BFHFNMINS is 0 (meaning that all HFs are
+ * secure); otherwise it targets the same security state as the
+ * underlying exception.
+ * In v8.1M HardFaults from vector table fetch fails don't set FORCED.
+ */
+ if (!(cpu->env.v7m.aircr & R_V7M_AIRCR_BFHFNMINS_MASK)) {
+ exc_secure = true;
+ }
+ env->v7m.hfsr |= R_V7M_HFSR_VECTTBL_MASK;
+ if (!arm_feature(env, ARM_FEATURE_V8_1M)) {
+ env->v7m.hfsr |= R_V7M_HFSR_FORCED_MASK;
+ }
+ armv7m_nvic_set_pending_derived(env->nvic, ARMV7M_EXCP_HARD, exc_secure);
+ return false;
+}
+
+static uint32_t v7m_integrity_sig(CPUARMState *env, uint32_t lr)
+{
+ /*
+ * Return the integrity signature value for the callee-saves
+ * stack frame section. @lr is the exception return payload/LR value
+ * whose FType bit forms bit 0 of the signature if FP is present.
+ */
+ uint32_t sig = 0xfefa125a;
+
+ if (!cpu_isar_feature(aa32_vfp_simd, env_archcpu(env))
+ || (lr & R_V7M_EXCRET_FTYPE_MASK)) {
+ sig |= 1;
+ }
+ return sig;
+}
+
+static bool v7m_push_callee_stack(ARMCPU *cpu, uint32_t lr, bool dotailchain,
+ bool ignore_faults)
+{
+ /*
+ * For v8M, push the callee-saves register part of the stack frame.
+ * Compare the v8M pseudocode PushCalleeStack().
+ * In the tailchaining case this may not be the current stack.
+ */
+ CPUARMState *env = &cpu->env;
+ uint32_t *frame_sp_p;
+ uint32_t frameptr;
+ ARMMMUIdx mmu_idx;
+ bool stacked_ok;
+ uint32_t limit;
+ bool want_psp;
+ uint32_t sig;
+ StackingMode smode = ignore_faults ? STACK_IGNFAULTS : STACK_NORMAL;
+
+ if (dotailchain) {
+ bool mode = lr & R_V7M_EXCRET_MODE_MASK;
+ bool priv = !(env->v7m.control[M_REG_S] & R_V7M_CONTROL_NPRIV_MASK) ||
+ !mode;
+
+ mmu_idx = arm_v7m_mmu_idx_for_secstate_and_priv(env, M_REG_S, priv);
+ frame_sp_p = get_v7m_sp_ptr(env, M_REG_S, mode,
+ lr & R_V7M_EXCRET_SPSEL_MASK);
+ want_psp = mode && (lr & R_V7M_EXCRET_SPSEL_MASK);
+ if (want_psp) {
+ limit = env->v7m.psplim[M_REG_S];
+ } else {
+ limit = env->v7m.msplim[M_REG_S];
+ }
+ } else {
+ mmu_idx = arm_mmu_idx(env);
+ frame_sp_p = &env->regs[13];
+ limit = v7m_sp_limit(env);
+ }
+
+ frameptr = *frame_sp_p - 0x28;
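+    /*
+     * Layout of the 0x28-byte callee-saves frame written below:
+     * [0x00] integrity signature, [0x04] reserved (not written here),
+     * [0x08..0x24] r4-r11.
+     */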
+ if (frameptr < limit) {
+ /*
+ * Stack limit failure: set SP to the limit value, and generate
+ * STKOF UsageFault. Stack pushes below the limit must not be
+ * performed. It is IMPDEF whether pushes above the limit are
+ * performed; we choose not to.
+ */
+ qemu_log_mask(CPU_LOG_INT,
+ "...STKOF during callee-saves register stacking\n");
+ env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_STKOF_MASK;
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE,
+ env->v7m.secure);
+ *frame_sp_p = limit;
+ return true;
+ }
+
+ /*
+ * Write as much of the stack frame as we can. A write failure may
+ * cause us to pend a derived exception.
+ */
+ sig = v7m_integrity_sig(env, lr);
+ stacked_ok =
+ v7m_stack_write(cpu, frameptr, sig, mmu_idx, smode) &&
+ v7m_stack_write(cpu, frameptr + 0x8, env->regs[4], mmu_idx, smode) &&
+ v7m_stack_write(cpu, frameptr + 0xc, env->regs[5], mmu_idx, smode) &&
+ v7m_stack_write(cpu, frameptr + 0x10, env->regs[6], mmu_idx, smode) &&
+ v7m_stack_write(cpu, frameptr + 0x14, env->regs[7], mmu_idx, smode) &&
+ v7m_stack_write(cpu, frameptr + 0x18, env->regs[8], mmu_idx, smode) &&
+ v7m_stack_write(cpu, frameptr + 0x1c, env->regs[9], mmu_idx, smode) &&
+ v7m_stack_write(cpu, frameptr + 0x20, env->regs[10], mmu_idx, smode) &&
+ v7m_stack_write(cpu, frameptr + 0x24, env->regs[11], mmu_idx, smode);
+
+ /* Update SP regardless of whether any of the stack accesses failed. */
+ *frame_sp_p = frameptr;
+
+ return !stacked_ok;
+}
+
+static void v7m_exception_taken(ARMCPU *cpu, uint32_t lr, bool dotailchain,
+ bool ignore_stackfaults)
+{
+ /*
+ * Do the "take the exception" parts of exception entry,
+ * but not the pushing of state to the stack. This is
+ * similar to the pseudocode ExceptionTaken() function.
+ */
+ CPUARMState *env = &cpu->env;
+ uint32_t addr;
+ bool targets_secure;
+ int exc;
+ bool push_failed = false;
+
+ armv7m_nvic_get_pending_irq_info(env->nvic, &exc, &targets_secure);
+ qemu_log_mask(CPU_LOG_INT, "...taking pending %s exception %d\n",
+ targets_secure ? "secure" : "nonsecure", exc);
+
+ if (dotailchain) {
+ /* Sanitize LR FType and PREFIX bits */
+ if (!cpu_isar_feature(aa32_vfp_simd, cpu)) {
+ lr |= R_V7M_EXCRET_FTYPE_MASK;
+ }
+ lr = deposit32(lr, 24, 8, 0xff);
+ }
+
+ if (arm_feature(env, ARM_FEATURE_V8)) {
+ if (arm_feature(env, ARM_FEATURE_M_SECURITY) &&
+ (lr & R_V7M_EXCRET_S_MASK)) {
+ /*
+ * The background code (the owner of the registers in the
+ * exception frame) is Secure. This means it may either already
+ * have or now needs to push callee-saves registers.
+ */
+ if (targets_secure) {
+ if (dotailchain && !(lr & R_V7M_EXCRET_ES_MASK)) {
+ /*
+ * We took an exception from Secure to NonSecure
+ * (which means the callee-saved registers got stacked)
+ * and are now tailchaining to a Secure exception.
+ * Clear DCRS so eventual return from this Secure
+ * exception unstacks the callee-saved registers.
+ */
+ lr &= ~R_V7M_EXCRET_DCRS_MASK;
+ }
+ } else {
+ /*
+ * We're going to a non-secure exception; push the
+ * callee-saves registers to the stack now, if they're
+ * not already saved.
+ */
+ if (lr & R_V7M_EXCRET_DCRS_MASK &&
+ !(dotailchain && !(lr & R_V7M_EXCRET_ES_MASK))) {
+ push_failed = v7m_push_callee_stack(cpu, lr, dotailchain,
+ ignore_stackfaults);
+ }
+ lr |= R_V7M_EXCRET_DCRS_MASK;
+ }
+ }
+
+ lr &= ~R_V7M_EXCRET_ES_MASK;
+ if (targets_secure) {
+ lr |= R_V7M_EXCRET_ES_MASK;
+ }
+ lr &= ~R_V7M_EXCRET_SPSEL_MASK;
+ if (env->v7m.control[targets_secure] & R_V7M_CONTROL_SPSEL_MASK) {
+ lr |= R_V7M_EXCRET_SPSEL_MASK;
+ }
+
+ /*
+ * Clear registers if necessary to prevent non-secure exception
+ * code being able to see register values from secure code.
+ * Where register values become architecturally UNKNOWN we leave
+ * them with their previous values. v8.1M is tighter than v8.0M
+ * here and always zeroes the caller-saved registers regardless
+ * of the security state the exception is targeting.
+ */
+ if (arm_feature(env, ARM_FEATURE_M_SECURITY)) {
+ if (!targets_secure || arm_feature(env, ARM_FEATURE_V8_1M)) {
+ /*
+ * Always clear the caller-saved registers (they have been
+ * pushed to the stack earlier in v7m_push_stack()).
+ * Clear callee-saved registers if the background code is
+ * Secure (in which case these regs were saved in
+ * v7m_push_callee_stack()).
+ */
+ int i;
+ /*
+ * r4..r11 are callee-saves, zero only if background
+ * state was Secure (EXCRET.S == 1) and exception
+ * targets Non-secure state
+ */
+ bool zero_callee_saves = !targets_secure &&
+ (lr & R_V7M_EXCRET_S_MASK);
+
+ for (i = 0; i < 13; i++) {
+ if (i < 4 || i > 11 || zero_callee_saves) {
+ env->regs[i] = 0;
+ }
+ }
+ /* Clear EAPSR */
+ xpsr_write(env, 0, XPSR_NZCV | XPSR_Q | XPSR_GE | XPSR_IT);
+ }
+ }
+ }
+
+ if (push_failed && !ignore_stackfaults) {
+ /*
+ * Derived exception on callee-saves register stacking:
+ * we might now want to take a different exception which
+ * targets a different security state, so try again from the top.
+ */
+ qemu_log_mask(CPU_LOG_INT,
+ "...derived exception on callee-saves register stacking");
+ v7m_exception_taken(cpu, lr, true, true);
+ return;
+ }
+
+ if (!arm_v7m_load_vector(cpu, exc, targets_secure, &addr)) {
+ /* Vector load failed: derived exception */
+ qemu_log_mask(CPU_LOG_INT, "...derived exception on vector table load");
+ v7m_exception_taken(cpu, lr, true, true);
+ return;
+ }
+
+ /*
+ * Now we've done everything that might cause a derived exception
+ * we can go ahead and activate whichever exception we're going to
+ * take (which might now be the derived exception).
+ */
+ armv7m_nvic_acknowledge_irq(env->nvic);
+
+ /* Switch to target security state -- must do this before writing SPSEL */
+ switch_v7m_security_state(env, targets_secure);
+ write_v7m_control_spsel(env, 0);
+ arm_clear_exclusive(env);
+ /* Clear SFPA and FPCA (has no effect if no FPU) */
+ env->v7m.control[M_REG_S] &=
+ ~(R_V7M_CONTROL_FPCA_MASK | R_V7M_CONTROL_SFPA_MASK);
+ /* Clear IT bits */
+ env->condexec_bits = 0;
+ env->regs[14] = lr;
+ env->regs[15] = addr & 0xfffffffe;
+ env->thumb = addr & 1;
+ arm_rebuild_hflags(env);
+}
+
+static void v7m_update_fpccr(CPUARMState *env, uint32_t frameptr,
+ bool apply_splim)
+{
+ /*
+ * Like the pseudocode UpdateFPCCR: save state in FPCAR and FPCCR
+ * that we will need later in order to do lazy FP reg stacking.
+ */
+ bool is_secure = env->v7m.secure;
+ NVICState *nvic = env->nvic;
+ /*
+ * Some bits are unbanked and live always in fpccr[M_REG_S]; some bits
+ * are banked and we want to update the bit in the bank for the
+ * current security state; and in one case we want to specifically
+ * update the NS banked version of a bit even if we are secure.
+ */
+ uint32_t *fpccr_s = &env->v7m.fpccr[M_REG_S];
+ uint32_t *fpccr_ns = &env->v7m.fpccr[M_REG_NS];
+ uint32_t *fpccr = &env->v7m.fpccr[is_secure];
+ bool hfrdy, bfrdy, mmrdy, ns_ufrdy, s_ufrdy, sfrdy, monrdy;
+
+ env->v7m.fpcar[is_secure] = frameptr & ~0x7;
+
+ if (apply_splim && arm_feature(env, ARM_FEATURE_V8)) {
+ bool splimviol;
+ uint32_t splim = v7m_sp_limit(env);
+ bool ign = armv7m_nvic_neg_prio_requested(nvic, is_secure) &&
+ (env->v7m.ccr[is_secure] & R_V7M_CCR_STKOFHFNMIGN_MASK);
+
+ splimviol = !ign && frameptr < splim;
+ *fpccr = FIELD_DP32(*fpccr, V7M_FPCCR, SPLIMVIOL, splimviol);
+ }
+
+ *fpccr = FIELD_DP32(*fpccr, V7M_FPCCR, LSPACT, 1);
+
+ *fpccr_s = FIELD_DP32(*fpccr_s, V7M_FPCCR, S, is_secure);
+
+ *fpccr = FIELD_DP32(*fpccr, V7M_FPCCR, USER, arm_current_el(env) == 0);
+
+ *fpccr = FIELD_DP32(*fpccr, V7M_FPCCR, THREAD,
+ !arm_v7m_is_handler_mode(env));
+
+ hfrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_HARD, false);
+ *fpccr_s = FIELD_DP32(*fpccr_s, V7M_FPCCR, HFRDY, hfrdy);
+
+ bfrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_BUS, false);
+ *fpccr_s = FIELD_DP32(*fpccr_s, V7M_FPCCR, BFRDY, bfrdy);
+
+ mmrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_MEM, is_secure);
+ *fpccr = FIELD_DP32(*fpccr, V7M_FPCCR, MMRDY, mmrdy);
+
+ ns_ufrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_USAGE, false);
+ *fpccr_ns = FIELD_DP32(*fpccr_ns, V7M_FPCCR, UFRDY, ns_ufrdy);
+
+ monrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_DEBUG, false);
+ *fpccr_s = FIELD_DP32(*fpccr_s, V7M_FPCCR, MONRDY, monrdy);
+
+ if (arm_feature(env, ARM_FEATURE_M_SECURITY)) {
+ s_ufrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_USAGE, true);
+ *fpccr_s = FIELD_DP32(*fpccr_s, V7M_FPCCR, UFRDY, s_ufrdy);
+
+ sfrdy = armv7m_nvic_get_ready_status(nvic, ARMV7M_EXCP_SECURE, false);
+ *fpccr_s = FIELD_DP32(*fpccr_s, V7M_FPCCR, SFRDY, sfrdy);
+ }
+}
+
+void HELPER(v7m_vlstm)(CPUARMState *env, uint32_t fptr)
+{
+ /* fptr is the value of Rn, the frame pointer we store the FP regs to */
+ ARMCPU *cpu = env_archcpu(env);
+ bool s = env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_S_MASK;
+ bool lspact = env->v7m.fpccr[s] & R_V7M_FPCCR_LSPACT_MASK;
+ uintptr_t ra = GETPC();
+
+ assert(env->v7m.secure);
+
+ if (!(env->v7m.control[M_REG_S] & R_V7M_CONTROL_SFPA_MASK)) {
+ return;
+ }
+
+ /* Check access to the coprocessor is permitted */
+ if (!v7m_cpacr_pass(env, true, arm_current_el(env) != 0)) {
+ raise_exception_ra(env, EXCP_NOCP, 0, 1, GETPC());
+ }
+
+ if (lspact) {
+ /* LSPACT should not be active when there is active FP state */
+ raise_exception_ra(env, EXCP_LSERR, 0, 1, GETPC());
+ }
+
+ if (fptr & 7) {
+ raise_exception_ra(env, EXCP_UNALIGNED, 0, 1, GETPC());
+ }
+
+ /*
+ * Note that we do not use v7m_stack_write() here, because the
+ * accesses should not set the FSR bits for stacking errors if they
+ * fail. (In pseudocode terms, they are AccType_NORMAL, not AccType_STACK
+ * or AccType_LAZYFP). Faults in cpu_stl_data_ra() will throw exceptions
+ * and longjmp out.
+ */
+ if (!(env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_LSPEN_MASK)) {
+ bool ts = env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_TS_MASK;
+ int i;
+
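+        /*
+         * Frame layout as written below: s0-s15 at fptr+0x00..0x3c, FPSCR
+         * at +0x40, VPR (if MVE) at +0x44, then s16-s31 (if TS) at
+         * +0x48..0x84.
+         */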
+ for (i = 0; i < (ts ? 32 : 16); i += 2) {
+ uint64_t dn = *aa32_vfp_dreg(env, i / 2);
+ uint32_t faddr = fptr + 4 * i;
+ uint32_t slo = extract64(dn, 0, 32);
+ uint32_t shi = extract64(dn, 32, 32);
+
+ if (i >= 16) {
+ faddr += 8; /* skip the slot for the FPSCR */
+ }
+ cpu_stl_data_ra(env, faddr, slo, ra);
+ cpu_stl_data_ra(env, faddr + 4, shi, ra);
+ }
+ cpu_stl_data_ra(env, fptr + 0x40, vfp_get_fpscr(env), ra);
+ if (cpu_isar_feature(aa32_mve, cpu)) {
+ cpu_stl_data_ra(env, fptr + 0x44, env->v7m.vpr, ra);
+ }
+
+ /*
+ * If TS is 0 then s0 to s15, FPSCR and VPR are UNKNOWN; we choose to
+ * leave them unchanged, matching our choice in v7m_preserve_fp_state.
+ */
+ if (ts) {
+ for (i = 0; i < 32; i += 2) {
+ *aa32_vfp_dreg(env, i / 2) = 0;
+ }
+ vfp_set_fpscr(env, 0);
+ if (cpu_isar_feature(aa32_mve, cpu)) {
+ env->v7m.vpr = 0;
+ }
+ }
+ } else {
+ v7m_update_fpccr(env, fptr, false);
+ }
+
+ env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_FPCA_MASK;
+}
+
+void HELPER(v7m_vlldm)(CPUARMState *env, uint32_t fptr)
+{
+ ARMCPU *cpu = env_archcpu(env);
+ uintptr_t ra = GETPC();
+
+ /* fptr is the value of Rn, the frame pointer we load the FP regs from */
+ assert(env->v7m.secure);
+
+ if (!(env->v7m.control[M_REG_S] & R_V7M_CONTROL_SFPA_MASK)) {
+ return;
+ }
+
+ /* Check access to the coprocessor is permitted */
+ if (!v7m_cpacr_pass(env, true, arm_current_el(env) != 0)) {
+ raise_exception_ra(env, EXCP_NOCP, 0, 1, GETPC());
+ }
+
+ if (env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_LSPACT_MASK) {
+ /* State in FP is still valid */
+ env->v7m.fpccr[M_REG_S] &= ~R_V7M_FPCCR_LSPACT_MASK;
+ } else {
+ bool ts = env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_TS_MASK;
+ int i;
+ uint32_t fpscr;
+
+ if (fptr & 7) {
+ raise_exception_ra(env, EXCP_UNALIGNED, 0, 1, GETPC());
+ }
+
+ for (i = 0; i < (ts ? 32 : 16); i += 2) {
+ uint32_t slo, shi;
+ uint64_t dn;
+ uint32_t faddr = fptr + 4 * i;
+
+ if (i >= 16) {
+ faddr += 8; /* skip the slot for the FPSCR and VPR */
+ }
+
+ slo = cpu_ldl_data_ra(env, faddr, ra);
+ shi = cpu_ldl_data_ra(env, faddr + 4, ra);
+
+ dn = (uint64_t) shi << 32 | slo;
+ *aa32_vfp_dreg(env, i / 2) = dn;
+ }
+ fpscr = cpu_ldl_data_ra(env, fptr + 0x40, ra);
+ vfp_set_fpscr(env, fpscr);
+ if (cpu_isar_feature(aa32_mve, cpu)) {
+ env->v7m.vpr = cpu_ldl_data_ra(env, fptr + 0x44, ra);
+ }
+ }
+
+ env->v7m.control[M_REG_S] |= R_V7M_CONTROL_FPCA_MASK;
+}
+
+static bool v7m_push_stack(ARMCPU *cpu)
+{
+ /*
+ * Do the "set up stack frame" part of exception entry,
+ * similar to pseudocode PushStack().
+ * Return true if we generate a derived exception (and so
+ * should ignore further stack faults trying to process
+ * that derived exception.)
+ */
+ bool stacked_ok = true, limitviol = false;
+ CPUARMState *env = &cpu->env;
+ uint32_t xpsr = xpsr_read(env);
+ uint32_t frameptr = env->regs[13];
+ ARMMMUIdx mmu_idx = arm_mmu_idx(env);
+ uint32_t framesize;
+ bool nsacr_cp10 = extract32(env->v7m.nsacr, 10, 1);
+
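+ /*
+ * Frame sizes: 0x20 is the basic integer frame; 0x68 additionally holds
+ * s0-s15 plus the FPSCR/VPR slot; 0xa8 also covers s16-s31.
+ */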
+ if ((env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK) &&
+ (env->v7m.secure || nsacr_cp10)) {
+ if (env->v7m.secure &&
+ env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_TS_MASK) {
+ framesize = 0xa8;
+ } else {
+ framesize = 0x68;
+ }
+ } else {
+ framesize = 0x20;
+ }
+
+ /* Align stack pointer if the guest wants that */
+ if ((frameptr & 4) &&
+ (env->v7m.ccr[env->v7m.secure] & R_V7M_CCR_STKALIGN_MASK)) {
+ frameptr -= 4;
+ xpsr |= XPSR_SPREALIGN;
+ }
+
+ xpsr &= ~XPSR_SFPA;
+ if (env->v7m.secure &&
+ (env->v7m.control[M_REG_S] & R_V7M_CONTROL_SFPA_MASK)) {
+ xpsr |= XPSR_SFPA;
+ }
+
+ frameptr -= framesize;
+
+ if (arm_feature(env, ARM_FEATURE_V8)) {
+ uint32_t limit = v7m_sp_limit(env);
+
+ if (frameptr < limit) {
+ /*
+ * Stack limit failure: set SP to the limit value, and generate
+ * STKOF UsageFault. Stack pushes below the limit must not be
+ * performed. It is IMPDEF whether pushes above the limit are
+ * performed; we choose not to.
+ */
+ qemu_log_mask(CPU_LOG_INT,
+ "...STKOF during stacking\n");
+ env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_STKOF_MASK;
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE,
+ env->v7m.secure);
+ env->regs[13] = limit;
+ /*
+ * We won't try to perform any further memory accesses but
+ * we must continue through the following code to check for
+ * permission faults during FPU state preservation, and we
+ * must update FPCCR if lazy stacking is enabled.
+ */
+ limitviol = true;
+ stacked_ok = false;
+ }
+ }
+
+ /*
+ * Write as much of the stack frame as we can. If we fail a stack
+ * write this will result in a derived exception being pended
+ * (which may be taken in preference to the one we started with
+ * if it has higher priority).
+ */
+ stacked_ok = stacked_ok &&
+ v7m_stack_write(cpu, frameptr, env->regs[0], mmu_idx, STACK_NORMAL) &&
+ v7m_stack_write(cpu, frameptr + 4, env->regs[1],
+ mmu_idx, STACK_NORMAL) &&
+ v7m_stack_write(cpu, frameptr + 8, env->regs[2],
+ mmu_idx, STACK_NORMAL) &&
+ v7m_stack_write(cpu, frameptr + 12, env->regs[3],
+ mmu_idx, STACK_NORMAL) &&
+ v7m_stack_write(cpu, frameptr + 16, env->regs[12],
+ mmu_idx, STACK_NORMAL) &&
+ v7m_stack_write(cpu, frameptr + 20, env->regs[14],
+ mmu_idx, STACK_NORMAL) &&
+ v7m_stack_write(cpu, frameptr + 24, env->regs[15],
+ mmu_idx, STACK_NORMAL) &&
+ v7m_stack_write(cpu, frameptr + 28, xpsr, mmu_idx, STACK_NORMAL);
+
+ if (env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK) {
+ /* FPU is active, try to save its registers */
+ bool fpccr_s = env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_S_MASK;
+ bool lspact = env->v7m.fpccr[fpccr_s] & R_V7M_FPCCR_LSPACT_MASK;
+
+ if (lspact && arm_feature(env, ARM_FEATURE_M_SECURITY)) {
+ qemu_log_mask(CPU_LOG_INT,
+ "...SecureFault because LSPACT and FPCA both set\n");
+ env->v7m.sfsr |= R_V7M_SFSR_LSERR_MASK;
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false);
+ } else if (!env->v7m.secure && !nsacr_cp10) {
+ qemu_log_mask(CPU_LOG_INT,
+ "...Secure UsageFault with CFSR.NOCP because "
+ "NSACR.CP10 prevents stacking FP regs\n");
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, M_REG_S);
+ env->v7m.cfsr[M_REG_S] |= R_V7M_CFSR_NOCP_MASK;
+ } else {
+ if (!(env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_LSPEN_MASK)) {
+ /* Lazy stacking disabled, save registers now */
+ int i;
+ bool cpacr_pass = v7m_cpacr_pass(env, env->v7m.secure,
+ arm_current_el(env) != 0);
+
+ if (stacked_ok && !cpacr_pass) {
+ /*
+ * Take UsageFault if CPACR forbids access. The pseudocode
+ * here does a full CheckCPEnabled() but we know the NSACR
+ * check can never fail as we have already handled that.
+ */
+ qemu_log_mask(CPU_LOG_INT,
+ "...UsageFault with CFSR.NOCP because "
+ "CPACR.CP10 prevents stacking FP regs\n");
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE,
+ env->v7m.secure);
+ env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_NOCP_MASK;
+ stacked_ok = false;
+ }
+
+ for (i = 0; i < ((framesize == 0xa8) ? 32 : 16); i += 2) {
+ uint64_t dn = *aa32_vfp_dreg(env, i / 2);
+ uint32_t faddr = frameptr + 0x20 + 4 * i;
+ uint32_t slo = extract64(dn, 0, 32);
+ uint32_t shi = extract64(dn, 32, 32);
+
+ if (i >= 16) {
+ faddr += 8; /* skip the slot for the FPSCR and VPR */
+ }
+ stacked_ok = stacked_ok &&
+ v7m_stack_write(cpu, faddr, slo,
+ mmu_idx, STACK_NORMAL) &&
+ v7m_stack_write(cpu, faddr + 4, shi,
+ mmu_idx, STACK_NORMAL);
+ }
+ stacked_ok = stacked_ok &&
+ v7m_stack_write(cpu, frameptr + 0x60,
+ vfp_get_fpscr(env), mmu_idx, STACK_NORMAL);
+ if (cpu_isar_feature(aa32_mve, cpu)) {
+ stacked_ok = stacked_ok &&
+ v7m_stack_write(cpu, frameptr + 0x64,
+ env->v7m.vpr, mmu_idx, STACK_NORMAL);
+ }
+ if (cpacr_pass) {
+ for (i = 0; i < ((framesize == 0xa8) ? 32 : 16); i += 2) {
+ *aa32_vfp_dreg(env, i / 2) = 0;
+ }
+ vfp_set_fpscr(env, 0);
+ if (cpu_isar_feature(aa32_mve, cpu)) {
+ env->v7m.vpr = 0;
+ }
+ }
+ } else {
+ /* Lazy stacking enabled, save necessary info to stack later */
+ v7m_update_fpccr(env, frameptr + 0x20, true);
+ }
+ }
+ }
+
+ /*
+ * If we broke a stack limit then SP was already updated earlier;
+ * otherwise we update SP regardless of whether any of the stack
+ * accesses failed or we took some other kind of fault.
+ */
+ if (!limitviol) {
+ env->regs[13] = frameptr;
+ }
+
+ return !stacked_ok;
+}
+
+static void do_v7m_exception_exit(ARMCPU *cpu)
+{
+ CPUARMState *env = &cpu->env;
+ uint32_t excret;
+ uint32_t xpsr, xpsr_mask;
+ bool ufault = false;
+ bool sfault = false;
+ bool return_to_sp_process;
+ bool return_to_handler;
+ bool rettobase = false;
+ bool exc_secure = false;
+ bool return_to_secure;
+ bool ftype;
+ bool restore_s16_s31 = false;
+
+ /*
+ * If we're not in Handler mode then jumps to magic exception-exit
+ * addresses don't have magic behaviour. However for the v8M
+ * security extensions the magic secure-function-return has to
+ * work in thread mode too, so to avoid doing an extra check in
+ * the generated code we allow exception-exit magic to also cause the
+ * internal exception and bring us here in thread mode. Correct code
+ * will never try to do this (the following insn fetch will always
+ * fault), so the overhead of having taken an unnecessary exception
+ * doesn't matter.
+ */
+ if (!arm_v7m_is_handler_mode(env)) {
+ return;
+ }
+
+ /*
+ * In the spec pseudocode ExceptionReturn() is called directly
+ * from BXWritePC() and gets the full target PC value including
+ * bit zero. In QEMU's implementation we treat it as a normal
+ * jump-to-register (which is then caught later on), and so split
+ * the target value up between env->regs[15] and env->thumb in
+ * gen_bx(). Reconstitute it.
+ */
+ excret = env->regs[15];
+ if (env->thumb) {
+ excret |= 1;
+ }
+
+ qemu_log_mask(CPU_LOG_INT, "Exception return: magic PC %" PRIx32
+ " previous exception %d\n",
+ excret, env->v7m.exception);
+
+ if ((excret & R_V7M_EXCRET_RES1_MASK) != R_V7M_EXCRET_RES1_MASK) {
+ qemu_log_mask(LOG_GUEST_ERROR, "M profile: zero high bits in exception "
+ "exit PC value 0x%" PRIx32 " are UNPREDICTABLE\n",
+ excret);
+ }
+
+ ftype = excret & R_V7M_EXCRET_FTYPE_MASK;
+
+ if (!ftype && !cpu_isar_feature(aa32_vfp_simd, cpu)) {
+ qemu_log_mask(LOG_GUEST_ERROR, "M profile: zero FTYPE in exception "
+ "exit PC value 0x%" PRIx32 " is UNPREDICTABLE "
+ "if FPU not present\n",
+ excret);
+ ftype = true;
+ }
+
+ if (arm_feature(env, ARM_FEATURE_M_SECURITY)) {
+ /*
+ * EXC_RETURN.ES validation check (R_SMFL). We must do this before
+ * we pick which FAULTMASK to clear.
+ */
+ if (!env->v7m.secure &&
+ ((excret & R_V7M_EXCRET_ES_MASK) ||
+ !(excret & R_V7M_EXCRET_DCRS_MASK))) {
+ sfault = 1;
+ /* For all other purposes, treat ES as 0 (R_HXSR) */
+ excret &= ~R_V7M_EXCRET_ES_MASK;
+ }
+ exc_secure = excret & R_V7M_EXCRET_ES_MASK;
+ }
+
+ if (env->v7m.exception != ARMV7M_EXCP_NMI) {
+ /*
+ * Auto-clear FAULTMASK on return from other than NMI.
+ * If the security extension is implemented then this only
+ * happens if the raw execution priority is >= 0; the
+ * value of the ES bit in the exception return value indicates
+ * which security state's faultmask to clear. (v8M ARM ARM R_KBNF.)
+ */
+ if (arm_feature(env, ARM_FEATURE_M_SECURITY)) {
+ if (armv7m_nvic_raw_execution_priority(env->nvic) >= 0) {
+ env->v7m.faultmask[exc_secure] = 0;
+ }
+ } else {
+ env->v7m.faultmask[M_REG_NS] = 0;
+ }
+ }
+
+ switch (armv7m_nvic_complete_irq(env->nvic, env->v7m.exception,
+ exc_secure)) {
+ case -1:
+ /* attempt to exit an exception that isn't active */
+ ufault = true;
+ break;
+ case 0:
+ /* still an irq active now */
+ break;
+ case 1:
+ /*
+ * We returned to base exception level, no nesting.
+ * (In the pseudocode this is written using "NestedActivation != 1"
+ * where we have 'rettobase == false'.)
+ */
+ rettobase = true;
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ return_to_handler = !(excret & R_V7M_EXCRET_MODE_MASK);
+ return_to_sp_process = excret & R_V7M_EXCRET_SPSEL_MASK;
+ return_to_secure = arm_feature(env, ARM_FEATURE_M_SECURITY) &&
+ (excret & R_V7M_EXCRET_S_MASK);
+
+ if (arm_feature(env, ARM_FEATURE_V8)) {
+ if (!arm_feature(env, ARM_FEATURE_M_SECURITY)) {
+ /*
+ * UNPREDICTABLE if S == 1 or DCRS == 0 or ES == 1 (R_XLCP);
+ * we choose to take the UsageFault.
+ */
+ if ((excret & R_V7M_EXCRET_S_MASK) ||
+ (excret & R_V7M_EXCRET_ES_MASK) ||
+ !(excret & R_V7M_EXCRET_DCRS_MASK)) {
+ ufault = true;
+ }
+ }
+ if (excret & R_V7M_EXCRET_RES0_MASK) {
+ ufault = true;
+ }
+ } else {
+ /* For v7M we only recognize certain combinations of the low bits */
+ switch (excret & 0xf) {
+ case 1: /* Return to Handler */
+ break;
+ case 13: /* Return to Thread using Process stack */
+ case 9: /* Return to Thread using Main stack */
+ /*
+ * We only need to check NONBASETHRDENA for v7M, because in
+ * v8M this bit does not exist (it is RES1).
+ */
+ if (!rettobase &&
+ !(env->v7m.ccr[env->v7m.secure] &
+ R_V7M_CCR_NONBASETHRDENA_MASK)) {
+ ufault = true;
+ }
+ break;
+ default:
+ ufault = true;
+ }
+ }
+
+ /*
+ * Set CONTROL.SPSEL from excret.SPSEL. Since we're still in
+ * Handler mode (and will be until we write the new XPSR.Interrupt
+ * field) this does not switch around the current stack pointer.
+ * We must do this before we do any kind of tailchaining, including
+ * for the derived exceptions on integrity check failures, or we will
+ * give the guest an incorrect EXCRET.SPSEL value on exception entry.
+ */
+ write_v7m_control_spsel_for_secstate(env, return_to_sp_process, exc_secure);
+
+ /*
+ * Clear scratch FP values left in caller saved registers; this
+ * must happen before any kind of tail chaining.
+ */
+ if ((env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_CLRONRET_MASK) &&
+ (env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK)) {
+ if (env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_LSPACT_MASK) {
+ env->v7m.sfsr |= R_V7M_SFSR_LSERR_MASK;
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false);
+ qemu_log_mask(CPU_LOG_INT, "...taking SecureFault on existing "
+ "stackframe: error during lazy state deactivation\n");
+ v7m_exception_taken(cpu, excret, true, false);
+ return;
+ } else {
+ if (arm_feature(env, ARM_FEATURE_V8_1M)) {
+ /* v8.1M adds this NOCP check */
+ bool nsacr_pass = exc_secure ||
+ extract32(env->v7m.nsacr, 10, 1);
+ bool cpacr_pass = v7m_cpacr_pass(env, exc_secure, true);
+ if (!nsacr_pass) {
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, true);
+ env->v7m.cfsr[M_REG_S] |= R_V7M_CFSR_NOCP_MASK;
+ qemu_log_mask(CPU_LOG_INT, "...taking UsageFault on existing "
+ "stackframe: NSACR prevents clearing FPU registers\n");
+ v7m_exception_taken(cpu, excret, true, false);
+ return;
+ } else if (!cpacr_pass) {
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE,
+ exc_secure);
+ env->v7m.cfsr[exc_secure] |= R_V7M_CFSR_NOCP_MASK;
+ qemu_log_mask(CPU_LOG_INT, "...taking UsageFault on existing "
+ "stackframe: CPACR prevents clearing FPU registers\n");
+ v7m_exception_taken(cpu, excret, true, false);
+ return;
+ }
+ }
+ /* Clear s0..s15, FPSCR and VPR */
+ int i;
+
+ for (i = 0; i < 16; i += 2) {
+ *aa32_vfp_dreg(env, i / 2) = 0;
+ }
+ vfp_set_fpscr(env, 0);
+ if (cpu_isar_feature(aa32_mve, cpu)) {
+ env->v7m.vpr = 0;
+ }
+ }
+ }
+
+ if (sfault) {
+ env->v7m.sfsr |= R_V7M_SFSR_INVER_MASK;
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false);
+ qemu_log_mask(CPU_LOG_INT, "...taking SecureFault on existing "
+ "stackframe: failed EXC_RETURN.ES validity check\n");
+ v7m_exception_taken(cpu, excret, true, false);
+ return;
+ }
+
+ if (ufault) {
+ /*
+ * Bad exception return: instead of popping the exception
+ * stack, directly take a usage fault on the current stack.
+ */
+ env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_INVPC_MASK;
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure);
+ qemu_log_mask(CPU_LOG_INT, "...taking UsageFault on existing "
+ "stackframe: failed exception return integrity check\n");
+ v7m_exception_taken(cpu, excret, true, false);
+ return;
+ }
+
+ /*
+ * Tailchaining: if there is currently a pending exception that
+ * is high enough priority to preempt execution at the level we're
+ * about to return to, then just directly take that exception now,
+ * avoiding an unstack-and-then-stack. Note that now that we have
+ * deactivated the previous exception by calling armv7m_nvic_complete_irq(),
+ * our current execution priority is already the execution priority we are
+ * returning to -- none of the state we would unstack or set based on
+ * the EXCRET value affects it.
+ */
+ if (armv7m_nvic_can_take_pending_exception(env->nvic)) {
+ qemu_log_mask(CPU_LOG_INT, "...tailchaining to pending exception\n");
+ v7m_exception_taken(cpu, excret, true, false);
+ return;
+ }
+
+ switch_v7m_security_state(env, return_to_secure);
+
+ {
+ /*
+ * The stack pointer we should be reading the exception frame from
+ * depends on bits in the magic exception return type value (and
+ * for v8M isn't necessarily the stack pointer we will eventually
+ * end up resuming execution with). Get a pointer to the location
+ * in the CPU state struct where the SP we need is currently being
+ * stored; we will use and modify it in place.
+ * We use this limited C variable scope so we don't accidentally
+ * use 'frame_sp_p' after we do something that makes it invalid.
+ */
+ bool spsel = env->v7m.control[return_to_secure] & R_V7M_CONTROL_SPSEL_MASK;
+ uint32_t *frame_sp_p = get_v7m_sp_ptr(env,
+ return_to_secure,
+ !return_to_handler,
+ spsel);
+ uint32_t frameptr = *frame_sp_p;
+ bool pop_ok = true;
+ ARMMMUIdx mmu_idx;
+ bool return_to_priv = return_to_handler ||
+ !(env->v7m.control[return_to_secure] & R_V7M_CONTROL_NPRIV_MASK);
+
+ mmu_idx = arm_v7m_mmu_idx_for_secstate_and_priv(env, return_to_secure,
+ return_to_priv);
+
+ if (!QEMU_IS_ALIGNED(frameptr, 8) &&
+ arm_feature(env, ARM_FEATURE_V8)) {
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "M profile exception return with non-8-aligned SP "
+ "for destination state is UNPREDICTABLE\n");
+ }
+
+ /* Do we need to pop callee-saved registers? */
+ if (return_to_secure &&
+ ((excret & R_V7M_EXCRET_ES_MASK) == 0 ||
+ (excret & R_V7M_EXCRET_DCRS_MASK) == 0)) {
+ uint32_t actual_sig;
+
+ pop_ok = v7m_stack_read(cpu, &actual_sig, frameptr, mmu_idx);
+
+ if (pop_ok && v7m_integrity_sig(env, excret) != actual_sig) {
+ /* Take a SecureFault on the current stack */
+ env->v7m.sfsr |= R_V7M_SFSR_INVIS_MASK;
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false);
+ qemu_log_mask(CPU_LOG_INT, "...taking SecureFault on existing "
+ "stackframe: failed exception return integrity "
+ "signature check\n");
+ v7m_exception_taken(cpu, excret, true, false);
+ return;
+ }
+
+ pop_ok = pop_ok &&
+ v7m_stack_read(cpu, &env->regs[4], frameptr + 0x8, mmu_idx) &&
+ v7m_stack_read(cpu, &env->regs[5], frameptr + 0xc, mmu_idx) &&
+ v7m_stack_read(cpu, &env->regs[6], frameptr + 0x10, mmu_idx) &&
+ v7m_stack_read(cpu, &env->regs[7], frameptr + 0x14, mmu_idx) &&
+ v7m_stack_read(cpu, &env->regs[8], frameptr + 0x18, mmu_idx) &&
+ v7m_stack_read(cpu, &env->regs[9], frameptr + 0x1c, mmu_idx) &&
+ v7m_stack_read(cpu, &env->regs[10], frameptr + 0x20, mmu_idx) &&
+ v7m_stack_read(cpu, &env->regs[11], frameptr + 0x24, mmu_idx);
+
+ frameptr += 0x28;
+ }
+
+ /* Pop registers */
+ pop_ok = pop_ok &&
+ v7m_stack_read(cpu, &env->regs[0], frameptr, mmu_idx) &&
+ v7m_stack_read(cpu, &env->regs[1], frameptr + 0x4, mmu_idx) &&
+ v7m_stack_read(cpu, &env->regs[2], frameptr + 0x8, mmu_idx) &&
+ v7m_stack_read(cpu, &env->regs[3], frameptr + 0xc, mmu_idx) &&
+ v7m_stack_read(cpu, &env->regs[12], frameptr + 0x10, mmu_idx) &&
+ v7m_stack_read(cpu, &env->regs[14], frameptr + 0x14, mmu_idx) &&
+ v7m_stack_read(cpu, &env->regs[15], frameptr + 0x18, mmu_idx) &&
+ v7m_stack_read(cpu, &xpsr, frameptr + 0x1c, mmu_idx);
+
+ if (!pop_ok) {
+ /*
+ * v7m_stack_read() pended a fault, so take it (as a tail
+ * chained exception on the same stack frame)
+ */
+ qemu_log_mask(CPU_LOG_INT, "...derived exception on unstacking\n");
+ v7m_exception_taken(cpu, excret, true, false);
+ return;
+ }
+
+ /*
+ * Returning from an exception with a PC with bit 0 set is defined
+ * behaviour on v8M (bit 0 is ignored), but for v7M it was specified
+ * to be UNPREDICTABLE. In practice actual v7M hardware seems to ignore
+ * the lsbit, and there are several RTOSes out there which incorrectly
+ * assume the r15 in the stack frame should be a Thumb-style "lsbit
+ * indicates ARM/Thumb" value, so ignore the bit on v7M as well, but
+ * complain about the badly behaved guest.
+ */
+ if (env->regs[15] & 1) {
+ env->regs[15] &= ~1U;
+ if (!arm_feature(env, ARM_FEATURE_V8)) {
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "M profile return from interrupt with misaligned "
+ "PC is UNPREDICTABLE on v7M\n");
+ }
+ }
+
+ if (arm_feature(env, ARM_FEATURE_V8)) {
+ /*
+ * For v8M we have to check whether the xPSR exception field
+ * matches the EXCRET value for return to handler/thread
+ * before we commit to changing the SP and xPSR.
+ */
+ bool will_be_handler = (xpsr & XPSR_EXCP) != 0;
+ if (return_to_handler != will_be_handler) {
+ /*
+ * Take an INVPC UsageFault on the current stack.
+ * By this point we will have switched to the security state
+ * for the background state, so this UsageFault will target
+ * that state.
+ */
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE,
+ env->v7m.secure);
+ env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_INVPC_MASK;
+ qemu_log_mask(CPU_LOG_INT, "...taking UsageFault on existing "
+ "stackframe: failed exception return integrity "
+ "check\n");
+ v7m_exception_taken(cpu, excret, true, false);
+ return;
+ }
+ }
+
+ if (!ftype) {
+ /* FP present and we need to handle it */
+ if (!return_to_secure &&
+ (env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_LSPACT_MASK)) {
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false);
+ env->v7m.sfsr |= R_V7M_SFSR_LSERR_MASK;
+ qemu_log_mask(CPU_LOG_INT,
+ "...taking SecureFault on existing stackframe: "
+ "Secure LSPACT set but exception return is "
+ "not to secure state\n");
+ v7m_exception_taken(cpu, excret, true, false);
+ return;
+ }
+
+ restore_s16_s31 = return_to_secure &&
+ (env->v7m.fpccr[M_REG_S] & R_V7M_FPCCR_TS_MASK);
+
+ if (env->v7m.fpccr[return_to_secure] & R_V7M_FPCCR_LSPACT_MASK) {
+ /* State in FPU is still valid, just clear LSPACT */
+ env->v7m.fpccr[return_to_secure] &= ~R_V7M_FPCCR_LSPACT_MASK;
+ } else {
+ int i;
+ uint32_t fpscr;
+ bool cpacr_pass, nsacr_pass;
+
+ cpacr_pass = v7m_cpacr_pass(env, return_to_secure,
+ return_to_priv);
+ nsacr_pass = return_to_secure ||
+ extract32(env->v7m.nsacr, 10, 1);
+
+ if (!cpacr_pass) {
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE,
+ return_to_secure);
+ env->v7m.cfsr[return_to_secure] |= R_V7M_CFSR_NOCP_MASK;
+ qemu_log_mask(CPU_LOG_INT,
+ "...taking UsageFault on existing "
+ "stackframe: CPACR.CP10 prevents unstacking "
+ "FP regs\n");
+ v7m_exception_taken(cpu, excret, true, false);
+ return;
+ } else if (!nsacr_pass) {
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, true);
+ env->v7m.cfsr[M_REG_S] |= R_V7M_CFSR_INVPC_MASK;
+ qemu_log_mask(CPU_LOG_INT,
+ "...taking Secure UsageFault on existing "
+ "stackframe: NSACR.CP10 prevents unstacking "
+ "FP regs\n");
+ v7m_exception_taken(cpu, excret, true, false);
+ return;
+ }
+
+ for (i = 0; i < (restore_s16_s31 ? 32 : 16); i += 2) {
+ uint32_t slo, shi;
+ uint64_t dn;
+ uint32_t faddr = frameptr + 0x20 + 4 * i;
+
+ if (i >= 16) {
+ faddr += 8; /* Skip the slot for the FPSCR and VPR */
+ }
+
+ pop_ok = pop_ok &&
+ v7m_stack_read(cpu, &slo, faddr, mmu_idx) &&
+ v7m_stack_read(cpu, &shi, faddr + 4, mmu_idx);
+
+ if (!pop_ok) {
+ break;
+ }
+
+ dn = (uint64_t)shi << 32 | slo;
+ *aa32_vfp_dreg(env, i / 2) = dn;
+ }
+ pop_ok = pop_ok &&
+ v7m_stack_read(cpu, &fpscr, frameptr + 0x60, mmu_idx);
+ if (pop_ok) {
+ vfp_set_fpscr(env, fpscr);
+ }
+ if (cpu_isar_feature(aa32_mve, cpu)) {
+ pop_ok = pop_ok &&
+ v7m_stack_read(cpu, &env->v7m.vpr,
+ frameptr + 0x64, mmu_idx);
+ }
+ if (!pop_ok) {
+ /*
+ * These regs are 0 if security extension present;
+ * otherwise merely UNKNOWN. We zero always.
+ */
+ for (i = 0; i < (restore_s16_s31 ? 32 : 16); i += 2) {
+ *aa32_vfp_dreg(env, i / 2) = 0;
+ }
+ vfp_set_fpscr(env, 0);
+ if (cpu_isar_feature(aa32_mve, cpu)) {
+ env->v7m.vpr = 0;
+ }
+ }
+ }
+ }
+ env->v7m.control[M_REG_S] = FIELD_DP32(env->v7m.control[M_REG_S],
+ V7M_CONTROL, FPCA, !ftype);
+
+ /* Commit to consuming the stack frame */
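+ /* 0x20 basic frame, plus 0x48 for s0-s15/FPSCR/VPR, plus 0x40 for s16-s31 */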
+ frameptr += 0x20;
+ if (!ftype) {
+ frameptr += 0x48;
+ if (restore_s16_s31) {
+ frameptr += 0x40;
+ }
+ }
+ /*
+ * Undo stack alignment (the SPREALIGN bit indicates that the original
+ * pre-exception SP was not 8-aligned and we added a padding word to
+ * align it, so we undo this by ORing in the bit that increases it
+ * from the current 8-aligned value to the 8-unaligned value. (Adding 4
+ * would work too but a logical OR is how the pseudocode specifies it.)
+ */
+ if (xpsr & XPSR_SPREALIGN) {
+ frameptr |= 4;
+ }
+ *frame_sp_p = frameptr;
+ }
+
+ xpsr_mask = ~(XPSR_SPREALIGN | XPSR_SFPA);
+ if (!arm_feature(env, ARM_FEATURE_THUMB_DSP)) {
+ xpsr_mask &= ~XPSR_GE;
+ }
+ /* This xpsr_write() will invalidate frame_sp_p as it may switch stack */
+ xpsr_write(env, xpsr, xpsr_mask);
+
+ if (env->v7m.secure) {
+ bool sfpa = xpsr & XPSR_SFPA;
+
+ env->v7m.control[M_REG_S] = FIELD_DP32(env->v7m.control[M_REG_S],
+ V7M_CONTROL, SFPA, sfpa);
+ }
+
+ /*
+ * The restored xPSR exception field will be zero if we're
+ * resuming in Thread mode. If that doesn't match what the
+ * exception return excret specified then this is a UsageFault.
+ * v7M requires we make this check here; v8M did it earlier.
+ */
+ if (return_to_handler != arm_v7m_is_handler_mode(env)) {
+ /*
+ * Take an INVPC UsageFault by pushing the stack again;
+ * we know we're v7M so this is never a Secure UsageFault.
+ */
+ bool ignore_stackfaults;
+
+ assert(!arm_feature(env, ARM_FEATURE_V8));
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, false);
+ env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_INVPC_MASK;
+ ignore_stackfaults = v7m_push_stack(cpu);
+ qemu_log_mask(CPU_LOG_INT, "...taking UsageFault on new stackframe: "
+ "failed exception return integrity check\n");
+ v7m_exception_taken(cpu, excret, false, ignore_stackfaults);
+ return;
+ }
+
+ /* Otherwise, we have a successful exception exit. */
+ arm_clear_exclusive(env);
+ arm_rebuild_hflags(env);
+ qemu_log_mask(CPU_LOG_INT, "...successful exception return\n");
+}
+
+static bool do_v7m_function_return(ARMCPU *cpu)
+{
+ /*
+ * v8M security extensions magic function return.
+ * We may either:
+ * (1) throw an exception (longjump)
+ * (2) return true if we successfully handled the function return
+ * (3) return false if we failed a consistency check and have
+ * pended a UsageFault that needs to be taken now
+ *
+ * At this point the magic return value is split between env->regs[15]
+ * and env->thumb. We don't bother to reconstitute it because we don't
+ * need it (all values are handled the same way).
+ */
+ CPUARMState *env = &cpu->env;
+ uint32_t newpc, newpsr, newpsr_exc;
+
+ qemu_log_mask(CPU_LOG_INT, "...really v7M secure function return\n");
+
+ {
+ bool threadmode, spsel;
+ MemOpIdx oi;
+ ARMMMUIdx mmu_idx;
+ uint32_t *frame_sp_p;
+ uint32_t frameptr;
+
+ /* Pull the return address and IPSR from the Secure stack */
+ threadmode = !arm_v7m_is_handler_mode(env);
+ spsel = env->v7m.control[M_REG_S] & R_V7M_CONTROL_SPSEL_MASK;
+
+ frame_sp_p = get_v7m_sp_ptr(env, true, threadmode, spsel);
+ frameptr = *frame_sp_p;
+
+ /*
+ * These loads may throw an exception (for MPU faults). We want to
+ * do them as secure, so work out what MMU index that is.
+ */
+ mmu_idx = arm_v7m_mmu_idx_for_secstate(env, true);
+ oi = make_memop_idx(MO_LEUL, arm_to_core_mmu_idx(mmu_idx));
+ newpc = cpu_ldl_le_mmu(env, frameptr, oi, 0);
+ newpsr = cpu_ldl_le_mmu(env, frameptr + 4, oi, 0);
+
+ /* Consistency checks on new IPSR */
+ newpsr_exc = newpsr & XPSR_EXCP;
+ if (!((env->v7m.exception == 0 && newpsr_exc == 0) ||
+ (env->v7m.exception == 1 && newpsr_exc != 0))) {
+ /* Pend the fault and tell our caller to take it */
+ env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_INVPC_MASK;
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE,
+ env->v7m.secure);
+ qemu_log_mask(CPU_LOG_INT,
+ "...taking INVPC UsageFault: "
+ "IPSR consistency check failed\n");
+ return false;
+ }
+
+ *frame_sp_p = frameptr + 8;
+ }
+
+ /* This invalidates frame_sp_p */
+ switch_v7m_security_state(env, true);
+ env->v7m.exception = newpsr_exc;
+ env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_SFPA_MASK;
+ if (newpsr & XPSR_SFPA) {
+ env->v7m.control[M_REG_S] |= R_V7M_CONTROL_SFPA_MASK;
+ }
+ xpsr_write(env, 0, XPSR_IT);
+ env->thumb = newpc & 1;
+ env->regs[15] = newpc & ~1;
+ arm_rebuild_hflags(env);
+
+ qemu_log_mask(CPU_LOG_INT, "...function return successful\n");
+ return true;
+}
+
+static bool v7m_read_half_insn(ARMCPU *cpu, ARMMMUIdx mmu_idx, bool secure,
+ uint32_t addr, uint16_t *insn)
+{
+ /*
+ * Load a 16-bit portion of a v7M instruction, returning true on success,
+ * or false on failure (in which case we will have pended the appropriate
+ * exception).
+ * We need to do the instruction fetch's MPU and SAU checks
+ * like this because there is no MMU index that would allow
+ * doing the load with a single function call. Instead we must
+ * first check that the security attributes permit the load
+ * and that they don't mismatch on the two halves of the instruction,
+ * and then we do the load as a secure load (ie using the security
+ * attributes of the address, not the CPU, as architecturally required).
+ */
+ CPUState *cs = CPU(cpu);
+ CPUARMState *env = &cpu->env;
+ V8M_SAttributes sattrs = {};
+ GetPhysAddrResult res = {};
+ ARMMMUFaultInfo fi = {};
+ MemTxResult txres;
+
+ v8m_security_lookup(env, addr, MMU_INST_FETCH, mmu_idx, secure, &sattrs);
+ if (!sattrs.nsc || sattrs.ns) {
+ /*
+ * This must be the second half of the insn, and it straddles a
+ * region boundary with the second half not being S&NSC.
+ */
+ env->v7m.sfsr |= R_V7M_SFSR_INVEP_MASK;
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false);
+ qemu_log_mask(CPU_LOG_INT,
+ "...really SecureFault with SFSR.INVEP\n");
+ return false;
+ }
+ if (get_phys_addr(env, addr, MMU_INST_FETCH, mmu_idx, &res, &fi)) {
+ /* the MPU lookup failed */
+ env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_IACCVIOL_MASK;
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_MEM, env->v7m.secure);
+ qemu_log_mask(CPU_LOG_INT, "...really MemManage with CFSR.IACCVIOL\n");
+ return false;
+ }
+ *insn = address_space_lduw_le(arm_addressspace(cs, res.f.attrs),
+ res.f.phys_addr, res.f.attrs, &txres);
+ if (txres != MEMTX_OK) {
+ env->v7m.cfsr[M_REG_NS] |= R_V7M_CFSR_IBUSERR_MASK;
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_BUS, false);
+ qemu_log_mask(CPU_LOG_INT, "...really BusFault with CFSR.IBUSERR\n");
+ return false;
+ }
+ return true;
+}
+
+static bool v7m_read_sg_stack_word(ARMCPU *cpu, ARMMMUIdx mmu_idx,
+ uint32_t addr, uint32_t *spdata)
+{
+ /*
+ * Read a word of data from the stack for the SG instruction,
+ * writing the value into *spdata. If the load succeeds, return
+ * true; otherwise pend an appropriate exception and return false.
+ * (We can't use data load helpers here that throw an exception
+ * because of the context we're called in, which is halfway through
+ * arm_v7m_cpu_do_interrupt().)
+ */
+ CPUState *cs = CPU(cpu);
+ CPUARMState *env = &cpu->env;
+ MemTxResult txres;
+ GetPhysAddrResult res = {};
+ ARMMMUFaultInfo fi = {};
+ uint32_t value;
+
+ if (get_phys_addr(env, addr, MMU_DATA_LOAD, mmu_idx, &res, &fi)) {
+ /* MPU/SAU lookup failed */
+ if (fi.type == ARMFault_QEMU_SFault) {
+ qemu_log_mask(CPU_LOG_INT,
+ "...SecureFault during stack word read\n");
+ env->v7m.sfsr |= R_V7M_SFSR_AUVIOL_MASK | R_V7M_SFSR_SFARVALID_MASK;
+ env->v7m.sfar = addr;
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false);
+ } else {
+ qemu_log_mask(CPU_LOG_INT,
+ "...MemManageFault during stack word read\n");
+ env->v7m.cfsr[M_REG_S] |= R_V7M_CFSR_DACCVIOL_MASK |
+ R_V7M_CFSR_MMARVALID_MASK;
+ env->v7m.mmfar[M_REG_S] = addr;
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_MEM, false);
+ }
+ return false;
+ }
+ value = address_space_ldl(arm_addressspace(cs, res.f.attrs),
+ res.f.phys_addr, res.f.attrs, &txres);
+ if (txres != MEMTX_OK) {
+ /* BusFault trying to read the data */
+ qemu_log_mask(CPU_LOG_INT,
+ "...BusFault during stack word read\n");
+ env->v7m.cfsr[M_REG_NS] |=
+ (R_V7M_CFSR_PRECISERR_MASK | R_V7M_CFSR_BFARVALID_MASK);
+ env->v7m.bfar = addr;
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_BUS, false);
+ return false;
+ }
+
+ *spdata = value;
+ return true;
+}
+
+static bool v7m_handle_execute_nsc(ARMCPU *cpu)
+{
+ /*
+ * Check whether this attempt to execute code in a Secure & NS-Callable
+ * memory region is for an SG instruction; if so, then emulate the
+ * effect of the SG instruction and return true. Otherwise pend
+ * the correct kind of exception and return false.
+ */
+ CPUARMState *env = &cpu->env;
+ ARMMMUIdx mmu_idx;
+ uint16_t insn;
+
+ /*
+ * We should never get here unless get_phys_addr_pmsav8() caused
+ * an exception for NS executing in S&NSC memory.
+ */
+ assert(!env->v7m.secure);
+ assert(arm_feature(env, ARM_FEATURE_M_SECURITY));
+
+ /* We want to do the MPU lookup as secure; work out what mmu_idx that is */
+ mmu_idx = arm_v7m_mmu_idx_for_secstate(env, true);
+
+ if (!v7m_read_half_insn(cpu, mmu_idx, true, env->regs[15], &insn)) {
+ return false;
+ }
+
+ if (!env->thumb) {
+ goto gen_invep;
+ }
+
+ if (insn != 0xe97f) {
+ /*
+ * Not an SG instruction first half (we choose the IMPDEF
+ * early-SG-check option).
+ */
+ goto gen_invep;
+ }
+
+ if (!v7m_read_half_insn(cpu, mmu_idx, true, env->regs[15] + 2, &insn)) {
+ return false;
+ }
+
+ if (insn != 0xe97f) {
+ /*
+ * Not an SG instruction second half (yes, both halves of the SG
+ * insn have the same hex value)
+ */
+ goto gen_invep;
+ }
+
+ /*
+ * OK, we have confirmed that we really have an SG instruction.
+ * We know we're NS in S memory so don't need to repeat those checks.
+ */
+ qemu_log_mask(CPU_LOG_INT, "...really an SG instruction at 0x%08" PRIx32
+ ", executing it\n", env->regs[15]);
+
+ if (cpu_isar_feature(aa32_m_sec_state, cpu) &&
+ !arm_v7m_is_handler_mode(env)) {
+ /*
+ * v8.1M exception stack frame integrity check. Note that we
+ * must perform the memory access even if CCR_S.TRD is zero
+ * and we aren't going to check what the data loaded is.
+ */
+ uint32_t spdata, sp;
+
+ /*
+ * We know we are currently NS, so the S stack pointers must be
+ * in other_ss_{psp,msp}, not in regs[13]/other_sp.
+ */
+ sp = v7m_using_psp(env) ? env->v7m.other_ss_psp : env->v7m.other_ss_msp;
+ if (!v7m_read_sg_stack_word(cpu, mmu_idx, sp, &spdata)) {
+ /* Stack access failed and an exception has been pended */
+ return false;
+ }
+
+ if (env->v7m.ccr[M_REG_S] & R_V7M_CCR_TRD_MASK) {
+ if (((spdata & ~1) == 0xfefa125a) ||
+ !(env->v7m.control[M_REG_S] & 1)) {
+ goto gen_invep;
+ }
+ }
+ }
+
+ env->regs[14] &= ~1;
+ env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_SFPA_MASK;
+ switch_v7m_security_state(env, true);
+ xpsr_write(env, 0, XPSR_IT);
+ env->regs[15] += 4;
+ arm_rebuild_hflags(env);
+ return true;
+
+gen_invep:
+ env->v7m.sfsr |= R_V7M_SFSR_INVEP_MASK;
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false);
+ qemu_log_mask(CPU_LOG_INT,
+ "...really SecureFault with SFSR.INVEP\n");
+ return false;
+}
+
+void arm_v7m_cpu_do_interrupt(CPUState *cs)
+{
+ ARMCPU *cpu = ARM_CPU(cs);
+ CPUARMState *env = &cpu->env;
+ uint32_t lr;
+ bool ignore_stackfaults;
+
+ arm_log_exception(cs);
+
+ /*
+ * For exceptions we just mark as pending on the NVIC, and let that
+ * handle it.
+ */
+ switch (cs->exception_index) {
+ case EXCP_UDEF:
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure);
+ env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_UNDEFINSTR_MASK;
+ break;
+ case EXCP_NOCP:
+ {
+ /*
+ * NOCP might be directed to something other than the current
+ * security state if this fault is because of NSACR; we indicate
+ * the target security state using exception.target_el.
+ */
+ int target_secstate;
+
+ if (env->exception.target_el == 3) {
+ target_secstate = M_REG_S;
+ } else {
+ target_secstate = env->v7m.secure;
+ }
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, target_secstate);
+ env->v7m.cfsr[target_secstate] |= R_V7M_CFSR_NOCP_MASK;
+ break;
+ }
+ case EXCP_INVSTATE:
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure);
+ env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_INVSTATE_MASK;
+ break;
+ case EXCP_STKOF:
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure);
+ env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_STKOF_MASK;
+ break;
+ case EXCP_LSERR:
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false);
+ env->v7m.sfsr |= R_V7M_SFSR_LSERR_MASK;
+ break;
+ case EXCP_UNALIGNED:
+ /* Unaligned faults reported by M-profile aware code */
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure);
+ env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_UNALIGNED_MASK;
+ break;
+ case EXCP_DIVBYZERO:
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure);
+ env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_DIVBYZERO_MASK;
+ break;
+ case EXCP_SWI:
+ /* The PC already points to the next instruction. */
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SVC, env->v7m.secure);
+ break;
+ case EXCP_PREFETCH_ABORT:
+ case EXCP_DATA_ABORT:
+ /*
+ * Note that for M profile we don't have a guest facing FSR, but
+ * the env->exception.fsr will be populated by the code that
+ * raises the fault, in the A profile short-descriptor format.
+ *
+ * Log the exception.vaddress now regardless of subtype, because
+ * logging below only logs it when it goes into a guest visible
+ * register.
+ */
+ qemu_log_mask(CPU_LOG_INT, "...at fault address 0x%x\n",
+ (uint32_t)env->exception.vaddress);
+ switch (env->exception.fsr & 0xf) {
+ case M_FAKE_FSR_NSC_EXEC:
+ /*
+ * Exception generated when we try to execute code at an address
+ * which is marked as Secure & Non-Secure Callable and the CPU
+ * is in the Non-Secure state. The only instruction which can
+ * be executed like this is SG (and that only if both halves of
+ * the SG instruction have the same security attributes.)
+ * Everything else must generate an INVEP SecureFault, so we
+ * emulate the SG instruction here.
+ */
+ if (v7m_handle_execute_nsc(cpu)) {
+ return;
+ }
+ break;
+ case M_FAKE_FSR_SFAULT:
+ /*
+ * Various flavours of SecureFault for attempts to execute or
+ * access data in the wrong security state.
+ */
+ switch (cs->exception_index) {
+ case EXCP_PREFETCH_ABORT:
+ if (env->v7m.secure) {
+ env->v7m.sfsr |= R_V7M_SFSR_INVTRAN_MASK;
+ qemu_log_mask(CPU_LOG_INT,
+ "...really SecureFault with SFSR.INVTRAN\n");
+ } else {
+ env->v7m.sfsr |= R_V7M_SFSR_INVEP_MASK;
+ qemu_log_mask(CPU_LOG_INT,
+ "...really SecureFault with SFSR.INVEP\n");
+ }
+ break;
+ case EXCP_DATA_ABORT:
+ /* This must be an NS access to S memory */
+ env->v7m.sfsr |= R_V7M_SFSR_AUVIOL_MASK;
+ qemu_log_mask(CPU_LOG_INT,
+ "...really SecureFault with SFSR.AUVIOL\n");
+ break;
+ }
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false);
+ break;
+ case 0x8: /* External Abort */
+ switch (cs->exception_index) {
+ case EXCP_PREFETCH_ABORT:
+ env->v7m.cfsr[M_REG_NS] |= R_V7M_CFSR_IBUSERR_MASK;
+ qemu_log_mask(CPU_LOG_INT, "...with CFSR.IBUSERR\n");
+ break;
+ case EXCP_DATA_ABORT:
+ env->v7m.cfsr[M_REG_NS] |=
+ (R_V7M_CFSR_PRECISERR_MASK | R_V7M_CFSR_BFARVALID_MASK);
+ env->v7m.bfar = env->exception.vaddress;
+ qemu_log_mask(CPU_LOG_INT,
+ "...with CFSR.PRECISERR and BFAR 0x%x\n",
+ env->v7m.bfar);
+ break;
+ }
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_BUS, false);
+ break;
+ case 0x1: /* Alignment fault reported by generic code */
+ qemu_log_mask(CPU_LOG_INT,
+ "...really UsageFault with UFSR.UNALIGNED\n");
+ env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_UNALIGNED_MASK;
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE,
+ env->v7m.secure);
+ break;
+ default:
+ /*
+ * All other FSR values are either MPU faults or "can't happen
+ * for M profile" cases.
+ */
+ switch (cs->exception_index) {
+ case EXCP_PREFETCH_ABORT:
+ env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_IACCVIOL_MASK;
+ qemu_log_mask(CPU_LOG_INT, "...with CFSR.IACCVIOL\n");
+ break;
+ case EXCP_DATA_ABORT:
+ env->v7m.cfsr[env->v7m.secure] |=
+ (R_V7M_CFSR_DACCVIOL_MASK | R_V7M_CFSR_MMARVALID_MASK);
+ env->v7m.mmfar[env->v7m.secure] = env->exception.vaddress;
+ qemu_log_mask(CPU_LOG_INT,
+ "...with CFSR.DACCVIOL and MMFAR 0x%x\n",
+ env->v7m.mmfar[env->v7m.secure]);
+ break;
+ }
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_MEM,
+ env->v7m.secure);
+ break;
+ }
+ break;
+ case EXCP_SEMIHOST:
+ qemu_log_mask(CPU_LOG_INT,
+ "...handling as semihosting call 0x%x\n",
+ env->regs[0]);
+#ifdef CONFIG_TCG
+ do_common_semihosting(cs);
+#else
+ g_assert_not_reached();
+#endif
+ env->regs[15] += env->thumb ? 2 : 4;
+ return;
+ case EXCP_BKPT:
+ armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_DEBUG, false);
+ break;
+ case EXCP_IRQ:
+ break;
+ case EXCP_EXCEPTION_EXIT:
+ if (env->regs[15] < EXC_RETURN_MIN_MAGIC) {
+ /* Must be v8M security extension function return */
+ assert(env->regs[15] >= FNC_RETURN_MIN_MAGIC);
+ assert(arm_feature(env, ARM_FEATURE_M_SECURITY));
+ if (do_v7m_function_return(cpu)) {
+ return;
+ }
+ } else {
+ do_v7m_exception_exit(cpu);
+ return;
+ }
+ break;
+ case EXCP_LAZYFP:
+ /*
+ * We already pended the specific exception in the NVIC in the
+ * v7m_preserve_fp_state() helper function.
+ */
+ break;
+ default:
+ cpu_abort(cs, "Unhandled exception 0x%x\n", cs->exception_index);
+ return; /* Never happens. Keep compiler happy. */
+ }
+
+ if (arm_feature(env, ARM_FEATURE_V8)) {
+ lr = R_V7M_EXCRET_RES1_MASK |
+ R_V7M_EXCRET_DCRS_MASK;
+ /*
+ * The S bit indicates whether we should return to Secure
+ * or NonSecure (ie our current state).
+ * The ES bit indicates whether we're taking this exception
+ * to Secure or NonSecure (ie our target state). We set it
+ * later, in v7m_exception_taken().
+ * The SPSEL bit is also set in v7m_exception_taken() for v8M.
+ * This corresponds to the ARM ARM pseudocode for v8M setting
+ * some LR bits in PushStack() and some in ExceptionTaken();
+ * the distinction matters for the tailchain cases where we
+ * can take an exception without pushing the stack.
+ */
+ if (env->v7m.secure) {
+ lr |= R_V7M_EXCRET_S_MASK;
+ }
+ } else {
+ lr = R_V7M_EXCRET_RES1_MASK |
+ R_V7M_EXCRET_S_MASK |
+ R_V7M_EXCRET_DCRS_MASK |
+ R_V7M_EXCRET_ES_MASK;
+ if (env->v7m.control[M_REG_NS] & R_V7M_CONTROL_SPSEL_MASK) {
+ lr |= R_V7M_EXCRET_SPSEL_MASK;
+ }
+ }
+ if (!(env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK)) {
+ lr |= R_V7M_EXCRET_FTYPE_MASK;
+ }
+ if (!arm_v7m_is_handler_mode(env)) {
+ lr |= R_V7M_EXCRET_MODE_MASK;
+ }
+
+ ignore_stackfaults = v7m_push_stack(cpu);
+ v7m_exception_taken(cpu, lr, false, ignore_stackfaults);
+}
+
+uint32_t HELPER(v7m_mrs)(CPUARMState *env, uint32_t reg)
+{
+ unsigned el = arm_current_el(env);
+
+ /* First handle registers which unprivileged can read */
+ switch (reg) {
+ case 0 ... 7: /* xPSR sub-fields */
+ return v7m_mrs_xpsr(env, reg, el);
+ case 20: /* CONTROL */
+ return v7m_mrs_control(env, env->v7m.secure);
+ case 0x94: /* CONTROL_NS */
+ /*
+ * We have to handle this here because unprivileged Secure code
+ * can read the NS CONTROL register.
+ */
+ if (!env->v7m.secure) {
+ return 0;
+ }
+ return env->v7m.control[M_REG_NS] |
+ (env->v7m.control[M_REG_S] & R_V7M_CONTROL_FPCA_MASK);
+ }
+
+ if (el == 0) {
+ return 0; /* unprivileged reads others as zero */
+ }
+
+ if (arm_feature(env, ARM_FEATURE_M_SECURITY)) {
+ switch (reg) {
+ case 0x88: /* MSP_NS */
+ if (!env->v7m.secure) {
+ return 0;
+ }
+ return env->v7m.other_ss_msp;
+ case 0x89: /* PSP_NS */
+ if (!env->v7m.secure) {
+ return 0;
+ }
+ return env->v7m.other_ss_psp;
+ case 0x8a: /* MSPLIM_NS */
+ if (!env->v7m.secure) {
+ return 0;
+ }
+ return env->v7m.msplim[M_REG_NS];
+ case 0x8b: /* PSPLIM_NS */
+ if (!env->v7m.secure) {
+ return 0;
+ }
+ return env->v7m.psplim[M_REG_NS];
+ case 0x90: /* PRIMASK_NS */
+ if (!env->v7m.secure) {
+ return 0;
+ }
+ return env->v7m.primask[M_REG_NS];
+ case 0x91: /* BASEPRI_NS */
+ if (!arm_feature(env, ARM_FEATURE_M_MAIN)) {
+ goto bad_reg;
+ }
+ if (!env->v7m.secure) {
+ return 0;
+ }
+ return env->v7m.basepri[M_REG_NS];
+ case 0x93: /* FAULTMASK_NS */
+ if (!arm_feature(env, ARM_FEATURE_M_MAIN)) {
+ goto bad_reg;
+ }
+ if (!env->v7m.secure) {
+ return 0;
+ }
+ return env->v7m.faultmask[M_REG_NS];
+ case 0x98: /* SP_NS */
+ {
+ /*
+ * This gives the non-secure SP selected based on whether we're
+ * currently in handler mode or not, using the NS CONTROL.SPSEL.
+ */
+ bool spsel = env->v7m.control[M_REG_NS] & R_V7M_CONTROL_SPSEL_MASK;
+
+ if (!env->v7m.secure) {
+ return 0;
+ }
+ if (!arm_v7m_is_handler_mode(env) && spsel) {
+ return env->v7m.other_ss_psp;
+ } else {
+ return env->v7m.other_ss_msp;
+ }
+ }
+ default:
+ break;
+ }
+ }
+
+ switch (reg) {
+ case 8: /* MSP */
+ return v7m_using_psp(env) ? env->v7m.other_sp : env->regs[13];
+ case 9: /* PSP */
+ return v7m_using_psp(env) ? env->regs[13] : env->v7m.other_sp;
+ case 10: /* MSPLIM */
+ if (!arm_feature(env, ARM_FEATURE_V8)) {
+ goto bad_reg;
+ }
+ return env->v7m.msplim[env->v7m.secure];
+ case 11: /* PSPLIM */
+ if (!arm_feature(env, ARM_FEATURE_V8)) {
+ goto bad_reg;
+ }
+ return env->v7m.psplim[env->v7m.secure];
+ case 16: /* PRIMASK */
+ return env->v7m.primask[env->v7m.secure];
+ case 17: /* BASEPRI */
+ case 18: /* BASEPRI_MAX */
+ if (!arm_feature(env, ARM_FEATURE_M_MAIN)) {
+ goto bad_reg;
+ }
+ return env->v7m.basepri[env->v7m.secure];
+ case 19: /* FAULTMASK */
+ if (!arm_feature(env, ARM_FEATURE_M_MAIN)) {
+ goto bad_reg;
+ }
+ return env->v7m.faultmask[env->v7m.secure];
+ default:
+ bad_reg:
+ qemu_log_mask(LOG_GUEST_ERROR, "Attempt to read unknown special"
+ " register %d\n", reg);
+ return 0;
+ }
+}
+
+void HELPER(v7m_msr)(CPUARMState *env, uint32_t maskreg, uint32_t val)
+{
+ /*
+ * We're passed bits [11..0] of the instruction; extract
+ * SYSm and the mask bits.
+ * Invalid combinations of SYSm and mask are UNPREDICTABLE;
+ * we choose to treat them as if the mask bits were valid.
+ * NB that the pseudocode 'mask' variable is bits [11..10],
+ * whereas ours is [11..8].
+ */
+ uint32_t mask = extract32(maskreg, 8, 4);
+ uint32_t reg = extract32(maskreg, 0, 8);
+ int cur_el = arm_current_el(env);
+
+ if (cur_el == 0 && reg > 7 && reg != 20) {
+ /*
+ * only xPSR sub-fields and CONTROL.SFPA may be written by
+ * unprivileged code
+ */
+ return;
+ }
+
+ if (arm_feature(env, ARM_FEATURE_M_SECURITY)) {
+ switch (reg) {
+ case 0x88: /* MSP_NS */
+ if (!env->v7m.secure) {
+ return;
+ }
+ env->v7m.other_ss_msp = val & ~3;
+ return;
+ case 0x89: /* PSP_NS */
+ if (!env->v7m.secure) {
+ return;
+ }
+ env->v7m.other_ss_psp = val & ~3;
+ return;
+ case 0x8a: /* MSPLIM_NS */
+ if (!env->v7m.secure) {
+ return;
+ }
+ env->v7m.msplim[M_REG_NS] = val & ~7;
+ return;
+ case 0x8b: /* PSPLIM_NS */
+ if (!env->v7m.secure) {
+ return;
+ }
+ env->v7m.psplim[M_REG_NS] = val & ~7;
+ return;
+ case 0x90: /* PRIMASK_NS */
+ if (!env->v7m.secure) {
+ return;
+ }
+ env->v7m.primask[M_REG_NS] = val & 1;
+ return;
+ case 0x91: /* BASEPRI_NS */
+ if (!arm_feature(env, ARM_FEATURE_M_MAIN)) {
+ goto bad_reg;
+ }
+ if (!env->v7m.secure) {
+ return;
+ }
+ env->v7m.basepri[M_REG_NS] = val & 0xff;
+ return;
+ case 0x93: /* FAULTMASK_NS */
+ if (!arm_feature(env, ARM_FEATURE_M_MAIN)) {
+ goto bad_reg;
+ }
+ if (!env->v7m.secure) {
+ return;
+ }
+ env->v7m.faultmask[M_REG_NS] = val & 1;
+ return;
+ case 0x94: /* CONTROL_NS */
+ if (!env->v7m.secure) {
+ return;
+ }
+ write_v7m_control_spsel_for_secstate(env,
+ val & R_V7M_CONTROL_SPSEL_MASK,
+ M_REG_NS);
+ if (arm_feature(env, ARM_FEATURE_M_MAIN)) {
+ env->v7m.control[M_REG_NS] &= ~R_V7M_CONTROL_NPRIV_MASK;
+ env->v7m.control[M_REG_NS] |= val & R_V7M_CONTROL_NPRIV_MASK;
+ }
+ /*
+ * SFPA is RAZ/WI from NS. FPCA is RO if NSACR.CP10 == 0,
+ * RES0 if the FPU is not present, and is stored in the S bank
+ */
+ if (cpu_isar_feature(aa32_vfp_simd, env_archcpu(env)) &&
+ extract32(env->v7m.nsacr, 10, 1)) {
+ env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_FPCA_MASK;
+ env->v7m.control[M_REG_S] |= val & R_V7M_CONTROL_FPCA_MASK;
+ }
+ return;
+ case 0x98: /* SP_NS */
+ {
+ /*
+ * This gives the non-secure SP selected based on whether we're
+ * currently in handler mode or not, using the NS CONTROL.SPSEL.
+ */
+ bool spsel = env->v7m.control[M_REG_NS] & R_V7M_CONTROL_SPSEL_MASK;
+ bool is_psp = !arm_v7m_is_handler_mode(env) && spsel;
+ uint32_t limit;
+
+ if (!env->v7m.secure) {
+ return;
+ }
+
+ limit = is_psp ? env->v7m.psplim[false] : env->v7m.msplim[false];
+
+ val &= ~0x3;
+
+ if (val < limit) {
+ raise_exception_ra(env, EXCP_STKOF, 0, 1, GETPC());
+ }
+
+ if (is_psp) {
+ env->v7m.other_ss_psp = val;
+ } else {
+ env->v7m.other_ss_msp = val;
+ }
+ return;
+ }
+ default:
+ break;
+ }
+ }
+
+ switch (reg) {
+ case 0 ... 7: /* xPSR sub-fields */
+ v7m_msr_xpsr(env, mask, reg, val);
+ break;
+ case 8: /* MSP */
+ if (v7m_using_psp(env)) {
+ env->v7m.other_sp = val & ~3;
+ } else {
+ env->regs[13] = val & ~3;
+ }
+ break;
+ case 9: /* PSP */
+ if (v7m_using_psp(env)) {
+ env->regs[13] = val & ~3;
+ } else {
+ env->v7m.other_sp = val & ~3;
+ }
+ break;
+ case 10: /* MSPLIM */
+ if (!arm_feature(env, ARM_FEATURE_V8)) {
+ goto bad_reg;
+ }
+ env->v7m.msplim[env->v7m.secure] = val & ~7;
+ break;
+ case 11: /* PSPLIM */
+ if (!arm_feature(env, ARM_FEATURE_V8)) {
+ goto bad_reg;
+ }
+ env->v7m.psplim[env->v7m.secure] = val & ~7;
+ break;
+ case 16: /* PRIMASK */
+ env->v7m.primask[env->v7m.secure] = val & 1;
+ break;
+ case 17: /* BASEPRI */
+ if (!arm_feature(env, ARM_FEATURE_M_MAIN)) {
+ goto bad_reg;
+ }
+ env->v7m.basepri[env->v7m.secure] = val & 0xff;
+ break;
+ case 18: /* BASEPRI_MAX */
+ if (!arm_feature(env, ARM_FEATURE_M_MAIN)) {
+ goto bad_reg;
+ }
+ val &= 0xff;
+ if (val != 0 && (val < env->v7m.basepri[env->v7m.secure]
+ || env->v7m.basepri[env->v7m.secure] == 0)) {
+ env->v7m.basepri[env->v7m.secure] = val;
+ }
+ break;
+ case 19: /* FAULTMASK */
+ if (!arm_feature(env, ARM_FEATURE_M_MAIN)) {
+ goto bad_reg;
+ }
+ env->v7m.faultmask[env->v7m.secure] = val & 1;
+ break;
+ case 20: /* CONTROL */
+ /*
+ * Writing to the SPSEL bit only has an effect if we are in
+ * thread mode; other bits can be updated by any privileged code.
+ * write_v7m_control_spsel() deals with updating the SPSEL bit in
+ * env->v7m.control, so we only need update the others.
+ * For v7M, we must just ignore explicit writes to SPSEL in handler
+ * mode; for v8M the write is permitted but will have no effect.
+ * All these bits are writes-ignored from non-privileged code,
+ * except for SFPA.
+ */
+ if (cur_el > 0 && (arm_feature(env, ARM_FEATURE_V8) ||
+ !arm_v7m_is_handler_mode(env))) {
+ write_v7m_control_spsel(env, (val & R_V7M_CONTROL_SPSEL_MASK) != 0);
+ }
+ if (cur_el > 0 && arm_feature(env, ARM_FEATURE_M_MAIN)) {
+ env->v7m.control[env->v7m.secure] &= ~R_V7M_CONTROL_NPRIV_MASK;
+ env->v7m.control[env->v7m.secure] |= val & R_V7M_CONTROL_NPRIV_MASK;
+ }
+ if (cpu_isar_feature(aa32_vfp_simd, env_archcpu(env))) {
+ /*
+ * SFPA is RAZ/WI from NS or if no FPU.
+ * FPCA is RO if NSACR.CP10 == 0, RES0 if the FPU is not present.
+ * Both are stored in the S bank.
+ */
+ if (env->v7m.secure) {
+ env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_SFPA_MASK;
+ env->v7m.control[M_REG_S] |= val & R_V7M_CONTROL_SFPA_MASK;
+ }
+ if (cur_el > 0 &&
+ (env->v7m.secure || !arm_feature(env, ARM_FEATURE_M_SECURITY) ||
+ extract32(env->v7m.nsacr, 10, 1))) {
+ env->v7m.control[M_REG_S] &= ~R_V7M_CONTROL_FPCA_MASK;
+ env->v7m.control[M_REG_S] |= val & R_V7M_CONTROL_FPCA_MASK;
+ }
+ }
+ break;
+ default:
+ bad_reg:
+ qemu_log_mask(LOG_GUEST_ERROR, "Attempt to write unknown special"
+ " register %d\n", reg);
+ return;
+ }
+}
+
+uint32_t HELPER(v7m_tt)(CPUARMState *env, uint32_t addr, uint32_t op)
+{
+ /* Implement the TT instruction. op is bits [7:6] of the insn. */
+ bool forceunpriv = op & 1;
+ bool alt = op & 2;
+ V8M_SAttributes sattrs = {};
+ uint32_t tt_resp;
+ bool r, rw, nsr, nsrw, mrvalid;
+ ARMMMUIdx mmu_idx;
+ uint32_t mregion;
+ bool targetpriv;
+ bool targetsec = env->v7m.secure;
+
+ /*
+ * Work out which security state and privilege level we're
+ * interested in...
+ */
+ if (alt) {
+ targetsec = !targetsec;
+ }
+
+ if (forceunpriv) {
+ targetpriv = false;
+ } else {
+ targetpriv = arm_v7m_is_handler_mode(env) ||
+ !(env->v7m.control[targetsec] & R_V7M_CONTROL_NPRIV_MASK);
+ }
+
+ /* ...and then figure out which MMU index this is */
+ mmu_idx = arm_v7m_mmu_idx_for_secstate_and_priv(env, targetsec, targetpriv);
+
+ /*
+ * We know that the MPU and SAU don't care about the access type
+ * for our purposes beyond that we don't want to claim to be
+ * an insn fetch, so we arbitrarily call this a read.
+ */
+
+ /*
+ * MPU region info only available for privileged or if
+ * inspecting the other MPU state.
+ */
+ if (arm_current_el(env) != 0 || alt) {
+ GetPhysAddrResult res = {};
+ ARMMMUFaultInfo fi = {};
+
+ /* We can ignore the return value as prot is always set */
+ pmsav8_mpu_lookup(env, addr, MMU_DATA_LOAD, mmu_idx, targetsec,
+ &res, &fi, &mregion);
+ if (mregion == -1) {
+ mrvalid = false;
+ mregion = 0;
+ } else {
+ mrvalid = true;
+ }
+ r = res.f.prot & PAGE_READ;
+ rw = res.f.prot & PAGE_WRITE;
+ } else {
+ r = false;
+ rw = false;
+ mrvalid = false;
+ mregion = 0;
+ }
+
+ if (env->v7m.secure) {
+ v8m_security_lookup(env, addr, MMU_DATA_LOAD, mmu_idx,
+ targetsec, &sattrs);
+ nsr = sattrs.ns && r;
+ nsrw = sattrs.ns && rw;
+ } else {
+ sattrs.ns = true;
+ nsr = false;
+ nsrw = false;
+ }
+
+ tt_resp = (sattrs.iregion << 24) |
+ (sattrs.irvalid << 23) |
+ ((!sattrs.ns) << 22) |
+ (nsrw << 21) |
+ (nsr << 20) |
+ (rw << 19) |
+ (r << 18) |
+ (sattrs.srvalid << 17) |
+ (mrvalid << 16) |
+ (sattrs.sregion << 8) |
+ mregion;
+
+ return tt_resp;
+}
+
+#endif /* !CONFIG_USER_ONLY */
'translate-mve.c',
'translate-neon.c',
'translate-vfp.c',
+ 'crypto_helper.c',
+ 'iwmmxt_helper.c',
+ 'm_helper.c',
+ 'mve_helper.c',
+ 'neon_helper.c',
+ 'op_helper.c',
+ 'tlb_helper.c',
+ 'vec_helper.c',
))
arm_ss.add(when: 'TARGET_AARCH64', if_true: files(
'translate-a64.c',
'translate-sve.c',
'translate-sme.c',
+ 'helper-a64.c',
+ 'mte_helper.c',
+ 'pauth_helper.c',
+ 'sme_helper.c',
+ 'sve_helper.c',
))
--- /dev/null
+/*
+ * ARM v8.5-MemTag Operations
+ *
+ * Copyright (c) 2020 Linaro, Ltd.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/log.h"
+#include "cpu.h"
+#include "internals.h"
+#include "exec/exec-all.h"
+#include "exec/ram_addr.h"
+#include "exec/cpu_ldst.h"
+#include "exec/helper-proto.h"
+#include "qapi/error.h"
+#include "qemu/guest-random.h"
+
+
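+/*
+ * Pick an allocation tag that is not in @exclude: with @offset == 0 this is
+ * the first non-excluded value at or after @tag (mod 16); otherwise we step
+ * through @offset further non-excluded values. If all sixteen tags are
+ * excluded, return 0.
+ */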
+static int choose_nonexcluded_tag(int tag, int offset, uint16_t exclude)
+{
+ if (exclude == 0xffff) {
+ return 0;
+ }
+ if (offset == 0) {
+ while (exclude & (1 << tag)) {
+ tag = (tag + 1) & 15;
+ }
+ } else {
+ do {
+ do {
+ tag = (tag + 1) & 15;
+ } while (exclude & (1 << tag));
+ } while (--offset > 0);
+ }
+ return tag;
+}
+
+/**
+ * allocation_tag_mem:
+ * @env: the cpu environment
+ * @ptr_mmu_idx: the addressing regime to use for the virtual address
+ * @ptr: the virtual address for which to look up tag memory
+ * @ptr_access: the access to use for the virtual address
+ * @ptr_size: the number of bytes in the normal memory access
+ * @tag_access: the access to use for the tag memory
+ * @tag_size: the number of bytes in the tag memory access
+ * @ra: the return address for exception handling
+ *
+ * Our tag memory is formatted as a sequence of little-endian nibbles.
+ * That is, the byte at (addr >> (LOG2_TAG_GRANULE + 1)) contains two
+ * tags, with the tag at [3:0] for the lower addr and the tag at [7:4]
+ * for the higher addr.
+ *
+ * Here, resolve the physical address from the virtual address, and return
+ * a pointer to the corresponding tag byte. Exit with exception if the
+ * virtual address is not accessible for @ptr_access.
+ *
+ * The @ptr_size and @tag_size values may not have an obvious relation
+ * due to the alignment of @ptr, and the number of tag checks required.
+ *
+ * If there is no tag storage corresponding to @ptr, return NULL.
+ */
+static uint8_t *allocation_tag_mem(CPUARMState *env, int ptr_mmu_idx,
+ uint64_t ptr, MMUAccessType ptr_access,
+ int ptr_size, MMUAccessType tag_access,
+ int tag_size, uintptr_t ra)
+{
+#ifdef CONFIG_USER_ONLY
+ uint64_t clean_ptr = useronly_clean_ptr(ptr);
+ int flags = page_get_flags(clean_ptr);
+ uint8_t *tags;
+ uintptr_t index;
+
+ if (!(flags & (ptr_access == MMU_DATA_STORE ? PAGE_WRITE_ORG : PAGE_READ))) {
+ cpu_loop_exit_sigsegv(env_cpu(env), ptr, ptr_access,
+ !(flags & PAGE_VALID), ra);
+ }
+
+ /* Require both MAP_ANON and PROT_MTE for the page. */
+ if (!(flags & PAGE_ANON) || !(flags & PAGE_MTE)) {
+ return NULL;
+ }
+
+ tags = page_get_target_data(clean_ptr);
+
+ index = extract32(ptr, LOG2_TAG_GRANULE + 1,
+ TARGET_PAGE_BITS - LOG2_TAG_GRANULE - 1);
+ return tags + index;
+#else
+ CPUTLBEntryFull *full;
+ MemTxAttrs attrs;
+ int in_page, flags;
+ hwaddr ptr_paddr, tag_paddr, xlat;
+ MemoryRegion *mr;
+ ARMASIdx tag_asi;
+ AddressSpace *tag_as;
+ void *host;
+
+ /*
+ * Probe the first byte of the virtual address. This raises an
+ * exception for inaccessible pages, and resolves the virtual address
+ * into the softmmu tlb.
+ *
+ * When RA == 0, this is for mte_probe. The page is expected to be
+ * valid. Indicate to probe_access_full no-fault, then assert that
+ * we received a valid page.
+ */
+ flags = probe_access_full(env, ptr, ptr_access, ptr_mmu_idx,
+ ra == 0, &host, &full, ra);
+ assert(!(flags & TLB_INVALID_MASK));
+
+ /* If the virtual page MemAttr != Tagged, access unchecked. */
+ if (full->pte_attrs != 0xf0) {
+ return NULL;
+ }
+
+ /*
+ * If not backed by host ram, there is no tag storage: access unchecked.
+ * This is probably a guest os bug though, so log it.
+ */
+ if (unlikely(flags & TLB_MMIO)) {
+ qemu_log_mask(LOG_GUEST_ERROR,
+ "Page @ 0x%" PRIx64 " indicates Tagged Normal memory "
+ "but is not backed by host ram\n", ptr);
+ return NULL;
+ }
+
+ /*
+ * Remember these values across the second lookup below,
+ * which may invalidate this pointer via tlb resize.
+ */
+ ptr_paddr = full->phys_addr | (ptr & ~TARGET_PAGE_MASK);
+ attrs = full->attrs;
+ full = NULL;
+
+ /*
+ * The Normal memory access can extend to the next page. E.g. a single
+ * 8-byte access to the last byte of a page will check only the last
+ * tag on the first page.
+ * Any page access exception has priority over tag check exception.
+ */
+ in_page = -(ptr | TARGET_PAGE_MASK);
+ if (unlikely(ptr_size > in_page)) {
+ flags |= probe_access_full(env, ptr + in_page, ptr_access,
+ ptr_mmu_idx, ra == 0, &host, &full, ra);
+ assert(!(flags & TLB_INVALID_MASK));
+ }
+
+ /* Any debug exception has priority over a tag check exception. */
+ if (unlikely(flags & TLB_WATCHPOINT)) {
+ int wp = ptr_access == MMU_DATA_LOAD ? BP_MEM_READ : BP_MEM_WRITE;
+ assert(ra != 0);
+ cpu_check_watchpoint(env_cpu(env), ptr, ptr_size, attrs, wp, ra);
+ }
+
+ /* Convert to the physical address in tag space. */
+ tag_paddr = ptr_paddr >> (LOG2_TAG_GRANULE + 1);
+
+ /* Look up the address in tag space. */
+ tag_asi = attrs.secure ? ARMASIdx_TagS : ARMASIdx_TagNS;
+ tag_as = cpu_get_address_space(env_cpu(env), tag_asi);
+ mr = address_space_translate(tag_as, tag_paddr, &xlat, NULL,
+ tag_access == MMU_DATA_STORE, attrs);
+
+ /*
+ * Note that @mr will never be NULL. If there is nothing in the address
+ * space at @tag_paddr, the translation will return the unallocated memory
+ * region. For our purposes, the result must be ram.
+ */
+ if (unlikely(!memory_region_is_ram(mr))) {
+ /* ??? Failure is a board configuration error. */
+ qemu_log_mask(LOG_UNIMP,
+ "Tag Memory @ 0x%" HWADDR_PRIx " not found for "
+ "Normal Memory @ 0x%" HWADDR_PRIx "\n",
+ tag_paddr, ptr_paddr);
+ return NULL;
+ }
+
+ /*
+ * Ensure the tag memory is dirty on write, for migration.
+ * Tag memory can never contain code or display memory (vga).
+ */
+ if (tag_access == MMU_DATA_STORE) {
+ ram_addr_t tag_ra = memory_region_get_ram_addr(mr) + xlat;
+ cpu_physical_memory_set_dirty_flag(tag_ra, DIRTY_MEMORY_MIGRATION);
+ }
+
+ return memory_region_get_ram_ptr(mr) + xlat;
+#endif
+}
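+
+/*
+ * Concretely, with the 16-byte tag granule the tag byte for an address
+ * is at (addr >> 5) within tag storage: the granules at 0x00..0x0f and
+ * 0x10..0x1f share tag byte 0 (low and high nibble respectively), the
+ * granules at 0x20..0x3f share tag byte 1, and so on.
+ */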
+
+uint64_t HELPER(irg)(CPUARMState *env, uint64_t rn, uint64_t rm)
+{
+ uint16_t exclude = extract32(rm | env->cp15.gcr_el1, 0, 16);
+ int rrnd = extract32(env->cp15.gcr_el1, 16, 1);
+ int start = extract32(env->cp15.rgsr_el1, 0, 4);
+ int seed = extract32(env->cp15.rgsr_el1, 8, 16);
+ int offset, i, rtag;
+
+ /*
+ * Our IMPDEF choice for GCR_EL1.RRND==1 is to continue to use the
+ * deterministic algorithm. Except that with RRND==1 the kernel is
+ * not required to have set RGSR_EL1.SEED != 0, which is required for
+ * the deterministic algorithm to function. So we force a non-zero
+ * SEED for that case.
+ */
+ if (unlikely(seed == 0) && rrnd) {
+ do {
+ Error *err = NULL;
+ uint16_t two;
+
+ if (qemu_guest_getrandom(&two, sizeof(two), &err) < 0) {
+ /*
+ * Failed, for unknown reasons in the crypto subsystem.
+ * Best we can do is log the reason and use a constant seed.
+ */
+ qemu_log_mask(LOG_UNIMP, "IRG: Crypto failure: %s\n",
+ error_get_pretty(err));
+ error_free(err);
+ two = 1;
+ }
+ seed = two;
+ } while (seed == 0);
+ }
+
+ /* RandomTag */
+ for (i = offset = 0; i < 4; ++i) {
+ /* NextRandomTagBit */
+ int top = (extract32(seed, 5, 1) ^ extract32(seed, 3, 1) ^
+ extract32(seed, 2, 1) ^ extract32(seed, 0, 1));
+ seed = (top << 15) | (seed >> 1);
+ offset |= top << i;
+ }
+ rtag = choose_nonexcluded_tag(start, offset, exclude);
+ env->cp15.rgsr_el1 = rtag | (seed << 8);
+
+ return address_with_allocation_tag(rn, rtag);
+}
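+
+/*
+ * The RandomTag loop above follows the pseudocode NextRandomTagBit
+ * LFSR: each iteration XORs seed bits 5, 3, 2 and 0, shifts the seed
+ * right by one and inserts the new bit at the top, and the four bits
+ * produced form the offset into the sequence of non-excluded tags.
+ * The advanced seed and the chosen tag are then written back to
+ * RGSR_EL1, so successive IRGs continue the same sequence.
+ */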
+
+uint64_t HELPER(addsubg)(CPUARMState *env, uint64_t ptr,
+ int32_t offset, uint32_t tag_offset)
+{
+ int start_tag = allocation_tag_from_addr(ptr);
+ uint16_t exclude = extract32(env->cp15.gcr_el1, 0, 16);
+ int rtag = choose_nonexcluded_tag(start_tag, tag_offset, exclude);
+
+ return address_with_allocation_tag(ptr + offset, rtag);
+}
+
+static int load_tag1(uint64_t ptr, uint8_t *mem)
+{
+ int ofs = extract32(ptr, LOG2_TAG_GRANULE, 1) * 4;
+ return extract32(*mem, ofs, 4);
+}
+
+uint64_t HELPER(ldg)(CPUARMState *env, uint64_t ptr, uint64_t xt)
+{
+ int mmu_idx = cpu_mmu_index(env, false);
+ uint8_t *mem;
+ int rtag = 0;
+
+ /* Trap if accessing an invalid page. */
+ mem = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_LOAD, 1,
+ MMU_DATA_LOAD, 1, GETPC());
+
+ /* Load if page supports tags. */
+ if (mem) {
+ rtag = load_tag1(ptr, mem);
+ }
+
+ return address_with_allocation_tag(xt, rtag);
+}
+
+static void check_tag_aligned(CPUARMState *env, uint64_t ptr, uintptr_t ra)
+{
+ if (unlikely(!QEMU_IS_ALIGNED(ptr, TAG_GRANULE))) {
+ arm_cpu_do_unaligned_access(env_cpu(env), ptr, MMU_DATA_STORE,
+ cpu_mmu_index(env, false), ra);
+ g_assert_not_reached();
+ }
+}
+
+/* For use in a non-parallel context, store to the given nibble. */
+static void store_tag1(uint64_t ptr, uint8_t *mem, int tag)
+{
+ int ofs = extract32(ptr, LOG2_TAG_GRANULE, 1) * 4;
+ *mem = deposit32(*mem, ofs, 4, tag);
+}
+
+/* For use in a parallel context, atomically store to the given nibble. */
+static void store_tag1_parallel(uint64_t ptr, uint8_t *mem, int tag)
+{
+ int ofs = extract32(ptr, LOG2_TAG_GRANULE, 1) * 4;
+ uint8_t old = qatomic_read(mem);
+
+ while (1) {
+ uint8_t new = deposit32(old, ofs, 4, tag);
+ uint8_t cmp = qatomic_cmpxchg(mem, old, new);
+ if (likely(cmp == old)) {
+ return;
+ }
+ old = cmp;
+ }
+}
+
+typedef void stg_store1(uint64_t, uint8_t *, int);
+
+static inline void do_stg(CPUARMState *env, uint64_t ptr, uint64_t xt,
+ uintptr_t ra, stg_store1 store1)
+{
+ int mmu_idx = cpu_mmu_index(env, false);
+ uint8_t *mem;
+
+ check_tag_aligned(env, ptr, ra);
+
+ /* Trap if accessing an invalid page. */
+ mem = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_STORE, TAG_GRANULE,
+ MMU_DATA_STORE, 1, ra);
+
+ /* Store if page supports tags. */
+ if (mem) {
+ store1(ptr, mem, allocation_tag_from_addr(xt));
+ }
+}
+
+void HELPER(stg)(CPUARMState *env, uint64_t ptr, uint64_t xt)
+{
+ do_stg(env, ptr, xt, GETPC(), store_tag1);
+}
+
+void HELPER(stg_parallel)(CPUARMState *env, uint64_t ptr, uint64_t xt)
+{
+ do_stg(env, ptr, xt, GETPC(), store_tag1_parallel);
+}
+
+void HELPER(stg_stub)(CPUARMState *env, uint64_t ptr)
+{
+ int mmu_idx = cpu_mmu_index(env, false);
+ uintptr_t ra = GETPC();
+
+ check_tag_aligned(env, ptr, ra);
+ probe_write(env, ptr, TAG_GRANULE, mmu_idx, ra);
+}
+
+static inline void do_st2g(CPUARMState *env, uint64_t ptr, uint64_t xt,
+ uintptr_t ra, stg_store1 store1)
+{
+ int mmu_idx = cpu_mmu_index(env, false);
+ int tag = allocation_tag_from_addr(xt);
+ uint8_t *mem1, *mem2;
+
+ check_tag_aligned(env, ptr, ra);
+
+ /*
+ * Trap if accessing invalid page(s).
+ * This takes priority over !allocation_tag_access_enabled.
+ */
+ if (ptr & TAG_GRANULE) {
+ /* Two stores unaligned mod TAG_GRANULE*2 -- modify two bytes. */
+ mem1 = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_STORE,
+ TAG_GRANULE, MMU_DATA_STORE, 1, ra);
+ mem2 = allocation_tag_mem(env, mmu_idx, ptr + TAG_GRANULE,
+ MMU_DATA_STORE, TAG_GRANULE,
+ MMU_DATA_STORE, 1, ra);
+
+ /* Store if page(s) support tags. */
+ if (mem1) {
+ store1(TAG_GRANULE, mem1, tag);
+ }
+ if (mem2) {
+ store1(0, mem2, tag);
+ }
+ } else {
+ /* Two stores aligned mod TAG_GRANULE*2 -- modify one byte. */
+ mem1 = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_STORE,
+ 2 * TAG_GRANULE, MMU_DATA_STORE, 1, ra);
+ if (mem1) {
+ tag |= tag << 4;
+ qatomic_set(mem1, tag);
+ }
+ }
+}
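+
+/*
+ * Note that in the aligned case above both nibbles of the single tag
+ * byte receive the same tag value, so a plain byte store is enough and
+ * the parallel and non-parallel variants behave identically on that
+ * path; only the unaligned case needs the per-nibble store1 routine.
+ */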
+
+void HELPER(st2g)(CPUARMState *env, uint64_t ptr, uint64_t xt)
+{
+ do_st2g(env, ptr, xt, GETPC(), store_tag1);
+}
+
+void HELPER(st2g_parallel)(CPUARMState *env, uint64_t ptr, uint64_t xt)
+{
+ do_st2g(env, ptr, xt, GETPC(), store_tag1_parallel);
+}
+
+void HELPER(st2g_stub)(CPUARMState *env, uint64_t ptr)
+{
+ int mmu_idx = cpu_mmu_index(env, false);
+ uintptr_t ra = GETPC();
+ int in_page = -(ptr | TARGET_PAGE_MASK);
+
+ check_tag_aligned(env, ptr, ra);
+
+ if (likely(in_page >= 2 * TAG_GRANULE)) {
+ probe_write(env, ptr, 2 * TAG_GRANULE, mmu_idx, ra);
+ } else {
+ probe_write(env, ptr, TAG_GRANULE, mmu_idx, ra);
+ probe_write(env, ptr + TAG_GRANULE, TAG_GRANULE, mmu_idx, ra);
+ }
+}
+
+#define LDGM_STGM_SIZE (4 << GMID_EL1_BS)
+
+uint64_t HELPER(ldgm)(CPUARMState *env, uint64_t ptr)
+{
+ int mmu_idx = cpu_mmu_index(env, false);
+ uintptr_t ra = GETPC();
+ void *tag_mem;
+
+ ptr = QEMU_ALIGN_DOWN(ptr, LDGM_STGM_SIZE);
+
+ /* Trap if accessing an invalid page. */
+ tag_mem = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_LOAD,
+ LDGM_STGM_SIZE, MMU_DATA_LOAD,
+ LDGM_STGM_SIZE / (2 * TAG_GRANULE), ra);
+
+ /* The tag is squashed to zero if the page does not support tags. */
+ if (!tag_mem) {
+ return 0;
+ }
+
+ QEMU_BUILD_BUG_ON(GMID_EL1_BS != 6);
+ /*
+ * We are loading 64-bits worth of tags. The ordering of elements
+ * within the word corresponds to a 64-bit little-endian operation.
+ */
+ return ldq_le_p(tag_mem);
+}
+
+void HELPER(stgm)(CPUARMState *env, uint64_t ptr, uint64_t val)
+{
+ int mmu_idx = cpu_mmu_index(env, false);
+ uintptr_t ra = GETPC();
+ void *tag_mem;
+
+ ptr = QEMU_ALIGN_DOWN(ptr, LDGM_STGM_SIZE);
+
+ /* Trap if accessing an invalid page. */
+ tag_mem = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_STORE,
+ LDGM_STGM_SIZE, MMU_DATA_LOAD,
+ LDGM_STGM_SIZE / (2 * TAG_GRANULE), ra);
+
+ /*
+ * Tag store only happens if the page supports tags,
+ * and if the OS has enabled access to the tags.
+ */
+ if (!tag_mem) {
+ return;
+ }
+
+ QEMU_BUILD_BUG_ON(GMID_EL1_BS != 6);
+ /*
+ * We are storing 64-bits worth of tags. The ordering of elements
+ * within the word corresponds to a 64-bit little-endian operation.
+ */
+ stq_le_p(tag_mem, val);
+}
+
+void HELPER(stzgm_tags)(CPUARMState *env, uint64_t ptr, uint64_t val)
+{
+ uintptr_t ra = GETPC();
+ int mmu_idx = cpu_mmu_index(env, false);
+ int log2_dcz_bytes, log2_tag_bytes;
+ intptr_t dcz_bytes, tag_bytes;
+ uint8_t *mem;
+
+ /*
+ * In arm_cpu_realizefn, we assert that dcz > LOG2_TAG_GRANULE+1,
+ * i.e. 32 bytes, which is an unreasonably small dcz anyway,
+ * to make sure that we can access one complete tag byte here.
+ */
+ log2_dcz_bytes = env_archcpu(env)->dcz_blocksize + 2;
+ log2_tag_bytes = log2_dcz_bytes - (LOG2_TAG_GRANULE + 1);
+ dcz_bytes = (intptr_t)1 << log2_dcz_bytes;
+ tag_bytes = (intptr_t)1 << log2_tag_bytes;
+ ptr &= -dcz_bytes;
+
+ mem = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_STORE, dcz_bytes,
+ MMU_DATA_STORE, tag_bytes, ra);
+ if (mem) {
+ int tag_pair = (val & 0xf) * 0x11;
+ memset(mem, tag_pair, tag_bytes);
+ }
+}
+
+static void mte_sync_check_fail(CPUARMState *env, uint32_t desc,
+ uint64_t dirty_ptr, uintptr_t ra)
+{
+ int is_write, syn;
+
+ env->exception.vaddress = dirty_ptr;
+
+ is_write = FIELD_EX32(desc, MTEDESC, WRITE);
+ syn = syn_data_abort_no_iss(arm_current_el(env) != 0, 0, 0, 0, 0, is_write,
+ 0x11);
+ raise_exception_ra(env, EXCP_DATA_ABORT, syn, exception_target_el(env), ra);
+ g_assert_not_reached();
+}
+
+static void mte_async_check_fail(CPUARMState *env, uint64_t dirty_ptr,
+ uintptr_t ra, ARMMMUIdx arm_mmu_idx, int el)
+{
+ int select;
+
+ if (regime_has_2_ranges(arm_mmu_idx)) {
+ select = extract64(dirty_ptr, 55, 1);
+ } else {
+ select = 0;
+ }
+ env->cp15.tfsr_el[el] |= 1 << select;
+#ifdef CONFIG_USER_ONLY
+ /*
+ * Stand in for a timer irq, setting _TIF_MTE_ASYNC_FAULT,
+ * which then sends a SIGSEGV when the thread is next scheduled.
+ * This cpu will return to the main loop at the end of the TB,
+ * which is rather sooner than "normal". But the alternative
+ * is waiting until the next syscall.
+ */
+ qemu_cpu_kick(env_cpu(env));
+#endif
+}
+
+/* Record a tag check failure. */
+static void mte_check_fail(CPUARMState *env, uint32_t desc,
+ uint64_t dirty_ptr, uintptr_t ra)
+{
+ int mmu_idx = FIELD_EX32(desc, MTEDESC, MIDX);
+ ARMMMUIdx arm_mmu_idx = core_to_aa64_mmu_idx(mmu_idx);
+ int el, reg_el, tcf;
+ uint64_t sctlr;
+
+ reg_el = regime_el(env, arm_mmu_idx);
+ sctlr = env->cp15.sctlr_el[reg_el];
+
+ switch (arm_mmu_idx) {
+ case ARMMMUIdx_E10_0:
+ case ARMMMUIdx_E20_0:
+ el = 0;
+ tcf = extract64(sctlr, 38, 2);
+ break;
+ default:
+ el = reg_el;
+ tcf = extract64(sctlr, 40, 2);
+ }
+
+ switch (tcf) {
+ case 1:
+ /* Tag check fail causes a synchronous exception. */
+ mte_sync_check_fail(env, desc, dirty_ptr, ra);
+ break;
+
+ case 0:
+ /*
+ * Tag check fail does not affect the PE.
+ * We eliminate this case by not setting MTE_ACTIVE
+ * in tb_flags, so that we never make this runtime call.
+ */
+ g_assert_not_reached();
+
+ case 2:
+ /* Tag check fail causes asynchronous flag set. */
+ mte_async_check_fail(env, dirty_ptr, ra, arm_mmu_idx, el);
+ break;
+
+ case 3:
+ /*
+ * Tag check fail causes asynchronous flag set for stores, or
+ * a synchronous exception for loads.
+ */
+ if (FIELD_EX32(desc, MTEDESC, WRITE)) {
+ mte_async_check_fail(env, dirty_ptr, ra, arm_mmu_idx, el);
+ } else {
+ mte_sync_check_fail(env, desc, dirty_ptr, ra);
+ }
+ break;
+ }
+}
+
+/**
+ * checkN:
+ * @tag: tag memory to test
+ * @odd: true to begin testing at the odd nibble
+ * @cmp: the tag to compare against
+ * @count: number of tags to test
+ *
+ * Return the number of successful tests.
+ * Thus a return value < @count indicates a failure.
+ *
+ * A note about sizes: count is expected to be small.
+ *
+ * The most common use will be LDP/STP of two integer registers,
+ * which means 16 bytes of memory touching at most 2 tags, but
+ * often the access is aligned and thus just 1 tag.
+ *
+ * Using AdvSIMD LD/ST (multiple), one can access 64 bytes of memory,
+ * touching at most 5 tags. SVE LDR/STR (vector) with the default
+ * vector length is also 64 bytes; the maximum architectural length
+ * is 256 bytes touching at most 9 tags.
+ *
+ * The loop below uses 7 logical operations and 1 memory operation
+ * per tag pair. An implementation that loads an aligned word and
+ * uses masking to ignore adjacent tags requires 18 logical operations
+ * and thus does not begin to pay off until 6 tags.
+ * Which, according to the survey above, is unlikely to be common.
+ */
+static int checkN(uint8_t *mem, int odd, int cmp, int count)
+{
+ int n = 0, diff;
+
+ /* Replicate the test tag and compare. */
+ cmp *= 0x11;
+ diff = *mem++ ^ cmp;
+
+ if (odd) {
+ goto start_odd;
+ }
+
+ while (1) {
+ /* Test even tag. */
+ if (unlikely((diff) & 0x0f)) {
+ break;
+ }
+ if (++n == count) {
+ break;
+ }
+
+ start_odd:
+ /* Test odd tag. */
+ if (unlikely((diff) & 0xf0)) {
+ break;
+ }
+ if (++n == count) {
+ break;
+ }
+
+ diff = *mem++ ^ cmp;
+ }
+ return n;
+}
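+
+/*
+ * For example, a 16-byte access whose first granule is odd (bit 4 of
+ * the address set) with ptr_tag 3 gives cmp = 0x33 and count = 2: the
+ * high nibble of the first tag byte is tested first, then the low
+ * nibble of the following byte, matching the little-endian nibble
+ * layout described for allocation_tag_mem().
+ */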
+
+/**
+ * mte_probe_int() - helper for mte_probe and mte_check
+ * @env: CPU environment
+ * @desc: MTEDESC descriptor
+ * @ptr: virtual address of the base of the access
+ * @fault: return virtual address of the first check failure
+ *
+ * Internal routine for both mte_probe and mte_check.
+ * Return zero on failure, filling in *fault.
+ * Return negative on trivial success for tbi disabled.
+ * Return positive on success with tbi enabled.
+ */
+static int mte_probe_int(CPUARMState *env, uint32_t desc, uint64_t ptr,
+ uintptr_t ra, uint64_t *fault)
+{
+ int mmu_idx, ptr_tag, bit55;
+ uint64_t ptr_last, prev_page, next_page;
+ uint64_t tag_first, tag_last;
+ uint64_t tag_byte_first, tag_byte_last;
+ uint32_t sizem1, tag_count, tag_size, n, c;
+ uint8_t *mem1, *mem2;
+ MMUAccessType type;
+
+ bit55 = extract64(ptr, 55, 1);
+ *fault = ptr;
+
+ /* If TBI is disabled, the access is unchecked, and ptr is not dirty. */
+ if (unlikely(!tbi_check(desc, bit55))) {
+ return -1;
+ }
+
+ ptr_tag = allocation_tag_from_addr(ptr);
+
+ if (tcma_check(desc, bit55, ptr_tag)) {
+ return 1;
+ }
+
+ mmu_idx = FIELD_EX32(desc, MTEDESC, MIDX);
+ type = FIELD_EX32(desc, MTEDESC, WRITE) ? MMU_DATA_STORE : MMU_DATA_LOAD;
+ sizem1 = FIELD_EX32(desc, MTEDESC, SIZEM1);
+
+ /* Find the addr of the end of the access */
+ ptr_last = ptr + sizem1;
+
+ /* Round the bounds to the tag granule, and compute the number of tags. */
+ tag_first = QEMU_ALIGN_DOWN(ptr, TAG_GRANULE);
+ tag_last = QEMU_ALIGN_DOWN(ptr_last, TAG_GRANULE);
+ tag_count = ((tag_last - tag_first) / TAG_GRANULE) + 1;
+
+ /* Round the bounds to twice the tag granule, and compute the bytes. */
+ tag_byte_first = QEMU_ALIGN_DOWN(ptr, 2 * TAG_GRANULE);
+ tag_byte_last = QEMU_ALIGN_DOWN(ptr_last, 2 * TAG_GRANULE);
+
+ /* Locate the page boundaries. */
+ prev_page = ptr & TARGET_PAGE_MASK;
+ next_page = prev_page + TARGET_PAGE_SIZE;
+
+ if (likely(tag_last - prev_page < TARGET_PAGE_SIZE)) {
+ /* Memory access stays on one page. */
+ tag_size = ((tag_byte_last - tag_byte_first) / (2 * TAG_GRANULE)) + 1;
+ mem1 = allocation_tag_mem(env, mmu_idx, ptr, type, sizem1 + 1,
+ MMU_DATA_LOAD, tag_size, ra);
+ if (!mem1) {
+ return 1;
+ }
+ /* Perform all of the comparisons. */
+ n = checkN(mem1, ptr & TAG_GRANULE, ptr_tag, tag_count);
+ } else {
+ /* Memory access crosses to next page. */
+ tag_size = (next_page - tag_byte_first) / (2 * TAG_GRANULE);
+ mem1 = allocation_tag_mem(env, mmu_idx, ptr, type, next_page - ptr,
+ MMU_DATA_LOAD, tag_size, ra);
+
+ tag_size = ((tag_byte_last - next_page) / (2 * TAG_GRANULE)) + 1;
+ mem2 = allocation_tag_mem(env, mmu_idx, next_page, type,
+ ptr_last - next_page + 1,
+ MMU_DATA_LOAD, tag_size, ra);
+
+ /*
+ * Perform all of the comparisons.
+ * Note the possible but unlikely case of the operation spanning
+ * two pages that do not both have tagging enabled.
+ */
+ n = c = (next_page - tag_first) / TAG_GRANULE;
+ if (mem1) {
+ n = checkN(mem1, ptr & TAG_GRANULE, ptr_tag, c);
+ }
+ if (n == c) {
+ if (!mem2) {
+ return 1;
+ }
+ n += checkN(mem2, 0, ptr_tag, tag_count - c);
+ }
+ }
+
+ if (likely(n == tag_count)) {
+ return 1;
+ }
+
+ /*
+ * If we failed, we know which granule. For the first granule, the
+ * failure address is @ptr, the first byte accessed. Otherwise the
+ * failure address is the first byte of the nth granule.
+ */
+ if (n > 0) {
+ *fault = tag_first + n * TAG_GRANULE;
+ }
+ return 0;
+}
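+
+/*
+ * As a worked example of the granule arithmetic above: a 16-byte
+ * access at ptr = 0x1007 has ptr_last = 0x1016, so tag_first = 0x1000,
+ * tag_last = 0x1010 and tag_count = 2, while tag_byte_first and
+ * tag_byte_last are both 0x1000, giving tag_size = 1 -- two tags to
+ * check, both held in a single tag byte.
+ */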
+
+uint64_t mte_check(CPUARMState *env, uint32_t desc, uint64_t ptr, uintptr_t ra)
+{
+ uint64_t fault;
+ int ret = mte_probe_int(env, desc, ptr, ra, &fault);
+
+ if (unlikely(ret == 0)) {
+ mte_check_fail(env, desc, fault, ra);
+ } else if (ret < 0) {
+ return ptr;
+ }
+ return useronly_clean_ptr(ptr);
+}
+
+uint64_t HELPER(mte_check)(CPUARMState *env, uint32_t desc, uint64_t ptr)
+{
+ return mte_check(env, desc, ptr, GETPC());
+}
+
+/*
+ * No-fault version of mte_check, to be used by SVE for MemSingleNF.
+ * Returns false if the access is Checked and the check failed. This
+ * is only intended to probe the tag -- the validity of the page must
+ * be checked beforehand.
+ */
+bool mte_probe(CPUARMState *env, uint32_t desc, uint64_t ptr)
+{
+ uint64_t fault;
+ int ret = mte_probe_int(env, desc, ptr, 0, &fault);
+
+ return ret != 0;
+}
+
+/*
+ * Perform an MTE checked access for DC_ZVA.
+ */
+uint64_t HELPER(mte_check_zva)(CPUARMState *env, uint32_t desc, uint64_t ptr)
+{
+ uintptr_t ra = GETPC();
+ int log2_dcz_bytes, log2_tag_bytes;
+ int mmu_idx, bit55;
+ intptr_t dcz_bytes, tag_bytes, i;
+ void *mem;
+ uint64_t ptr_tag, mem_tag, align_ptr;
+
+ bit55 = extract64(ptr, 55, 1);
+
+ /* If TBI is disabled, the access is unchecked, and ptr is not dirty. */
+ if (unlikely(!tbi_check(desc, bit55))) {
+ return ptr;
+ }
+
+ ptr_tag = allocation_tag_from_addr(ptr);
+
+ if (tcma_check(desc, bit55, ptr_tag)) {
+ goto done;
+ }
+
+ /*
+ * In arm_cpu_realizefn, we asserted that dcz > LOG2_TAG_GRANULE+1,
+ * i.e. 32 bytes, which is an unreasonably small dcz anyway, to make
+ * sure that we can access one complete tag byte here.
+ */
+ log2_dcz_bytes = env_archcpu(env)->dcz_blocksize + 2;
+ log2_tag_bytes = log2_dcz_bytes - (LOG2_TAG_GRANULE + 1);
+ dcz_bytes = (intptr_t)1 << log2_dcz_bytes;
+ tag_bytes = (intptr_t)1 << log2_tag_bytes;
+ align_ptr = ptr & -dcz_bytes;
+
+ /*
+ * Trap if accessing an invalid page. DC_ZVA requires that we supply
+ * the original pointer for an invalid page. But watchpoints require
+ * that we probe the actual space. So do both.
+ */
+ mmu_idx = FIELD_EX32(desc, MTEDESC, MIDX);
+ (void) probe_write(env, ptr, 1, mmu_idx, ra);
+ mem = allocation_tag_mem(env, mmu_idx, align_ptr, MMU_DATA_STORE,
+ dcz_bytes, MMU_DATA_LOAD, tag_bytes, ra);
+ if (!mem) {
+ goto done;
+ }
+
+ /*
+ * Unlike the reasoning for checkN, DC_ZVA is always aligned, and thus
+ * it is quite easy to perform all of the comparisons at once without
+ * any extra masking.
+ *
+ * The most common zva block size is 64; some of the thunderx cpus use
+ * a block size of 128. For user-only, aarch64_max_initfn will set the
+ * block size to 512. Fill out the other cases for future-proofing.
+ *
+ * In order to be able to find the first miscompare later, we want the
+ * tag bytes to be in little-endian order.
+ */
+ switch (log2_tag_bytes) {
+ case 0: /* zva_blocksize 32 */
+ mem_tag = *(uint8_t *)mem;
+ ptr_tag *= 0x11u;
+ break;
+ case 1: /* zva_blocksize 64 */
+ mem_tag = cpu_to_le16(*(uint16_t *)mem);
+ ptr_tag *= 0x1111u;
+ break;
+ case 2: /* zva_blocksize 128 */
+ mem_tag = cpu_to_le32(*(uint32_t *)mem);
+ ptr_tag *= 0x11111111u;
+ break;
+ case 3: /* zva_blocksize 256 */
+ mem_tag = cpu_to_le64(*(uint64_t *)mem);
+ ptr_tag *= 0x1111111111111111ull;
+ break;
+
+ default: /* zva_blocksize 512, 1024, 2048 */
+ ptr_tag *= 0x1111111111111111ull;
+ i = 0;
+ do {
+ mem_tag = cpu_to_le64(*(uint64_t *)(mem + i));
+ if (unlikely(mem_tag != ptr_tag)) {
+ goto fail;
+ }
+ i += 8;
+ align_ptr += 16 * TAG_GRANULE;
+ } while (i < tag_bytes);
+ goto done;
+ }
+
+ if (likely(mem_tag == ptr_tag)) {
+ goto done;
+ }
+
+ fail:
+ /* Locate the first nibble that differs. */
+ i = ctz64(mem_tag ^ ptr_tag) >> 4;
+ mte_check_fail(env, desc, align_ptr + i * TAG_GRANULE, ra);
+
+ done:
+ return useronly_clean_ptr(ptr);
+}
--- /dev/null
+/*
+ * M-profile MVE Operations
+ *
+ * Copyright (c) 2021 Linaro, Ltd.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "cpu.h"
+#include "internals.h"
+#include "vec_internal.h"
+#include "exec/helper-proto.h"
+#include "exec/cpu_ldst.h"
+#include "exec/exec-all.h"
+#include "tcg/tcg.h"
+#include "fpu/softfloat.h"
+
+static uint16_t mve_eci_mask(CPUARMState *env)
+{
+ /*
+ * Return the mask of which elements in the MVE vector correspond
+ * to beats being executed. The mask has 1 bits for executed lanes
+ * and 0 bits where ECI says this beat was already executed.
+ */
+ int eci;
+
+ if ((env->condexec_bits & 0xf) != 0) {
+ return 0xffff;
+ }
+
+ eci = env->condexec_bits >> 4;
+ switch (eci) {
+ case ECI_NONE:
+ return 0xffff;
+ case ECI_A0:
+ return 0xfff0;
+ case ECI_A0A1:
+ return 0xff00;
+ case ECI_A0A1A2:
+ case ECI_A0A1A2B0:
+ return 0xf000;
+ default:
+ g_assert_not_reached();
+ }
+}
+
+static uint16_t mve_element_mask(CPUARMState *env)
+{
+ /*
+ * Return the mask of which elements in the MVE vector should be
+ * updated. This is a combination of multiple things:
+ * (1) by default, we update every lane in the vector
+ * (2) VPT predication stores its state in the VPR register;
+ * (3) low-overhead-branch tail predication will mask out part of
+ * the vector on the final iteration of the loop
+ * (4) if EPSR.ECI is set then we must execute only some beats
+ * of the insn
+ * We combine all these into a 16-bit result with the same semantics
+ * as VPR.P0: 0 to mask the lane, 1 if it is active.
+ * 8-bit vector ops will look at all bits of the result;
+ * 16-bit ops will look at bits 0, 2, 4, ...;
+ * 32-bit ops will look at bits 0, 4, 8 and 12.
+ * Compare pseudocode GetCurInstrBeat(), though that only returns
+ * the 4-bit slice of the mask corresponding to a single beat.
+ */
+ uint16_t mask = FIELD_EX32(env->v7m.vpr, V7M_VPR, P0);
+
+ if (!(env->v7m.vpr & R_V7M_VPR_MASK01_MASK)) {
+ mask |= 0xff;
+ }
+ if (!(env->v7m.vpr & R_V7M_VPR_MASK23_MASK)) {
+ mask |= 0xff00;
+ }
+
+ if (env->v7m.ltpsize < 4 &&
+ env->regs[14] <= (1 << (4 - env->v7m.ltpsize))) {
+ /*
+ * Tail predication active, and this is the last loop iteration.
+ * The element size is (1 << ltpsize), and we only want to process
+ * loopcount elements, so we want to retain the least significant
+ * (loopcount * esize) predicate bits and zero out bits above that.
+ */
+ int masklen = env->regs[14] << env->v7m.ltpsize;
+ assert(masklen <= 16);
+ uint16_t ltpmask = masklen ? MAKE_64BIT_MASK(0, masklen) : 0;
+ mask &= ltpmask;
+ }
+
+ /*
+ * ECI bits indicate which beats are already executed;
+ * we handle this by effectively predicating them out.
+ */
+ mask &= mve_eci_mask(env);
+ return mask;
+}
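+
+/*
+ * For example, with VPT predication inactive, LTPSIZE = 2 (32-bit
+ * elements) and LR = 2 on the final low-overhead-loop iteration,
+ * masklen is 2 << 2 = 8, so the mask becomes 0x00ff and a 32-bit op
+ * (which looks at bits 0, 4, 8 and 12) updates only elements 0 and 1.
+ */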
+
+static void mve_advance_vpt(CPUARMState *env)
+{
+ /* Advance the VPT and ECI state if necessary */
+ uint32_t vpr = env->v7m.vpr;
+ unsigned mask01, mask23;
+ uint16_t inv_mask;
+ uint16_t eci_mask = mve_eci_mask(env);
+
+ if ((env->condexec_bits & 0xf) == 0) {
+ env->condexec_bits = (env->condexec_bits == (ECI_A0A1A2B0 << 4)) ?
+ (ECI_A0 << 4) : (ECI_NONE << 4);
+ }
+
+ if (!(vpr & (R_V7M_VPR_MASK01_MASK | R_V7M_VPR_MASK23_MASK))) {
+ /* VPT not enabled, nothing to do */
+ return;
+ }
+
+ /* Invert P0 bits if needed, but only for beats we actually executed */
+ mask01 = FIELD_EX32(vpr, V7M_VPR, MASK01);
+ mask23 = FIELD_EX32(vpr, V7M_VPR, MASK23);
+ /* Start by assuming we invert all bits corresponding to executed beats */
+ inv_mask = eci_mask;
+ if (mask01 <= 8) {
+ /* MASK01 says don't invert low half of P0 */
+ inv_mask &= ~0xff;
+ }
+ if (mask23 <= 8) {
+ /* MASK23 says don't invert high half of P0 */
+ inv_mask &= ~0xff00;
+ }
+ vpr ^= inv_mask;
+ /* Only update MASK01 if beat 1 executed */
+ if (eci_mask & 0xf0) {
+ vpr = FIELD_DP32(vpr, V7M_VPR, MASK01, mask01 << 1);
+ }
+ /* Beat 3 always executes, so update MASK23 */
+ vpr = FIELD_DP32(vpr, V7M_VPR, MASK23, mask23 << 1);
+ env->v7m.vpr = vpr;
+}
+
+/* For loads, predicated lanes are zeroed instead of keeping their old values */
+#define DO_VLDR(OP, MSIZE, LDTYPE, ESIZE, TYPE) \
+ void HELPER(mve_##OP)(CPUARMState *env, void *vd, uint32_t addr) \
+ { \
+ TYPE *d = vd; \
+ uint16_t mask = mve_element_mask(env); \
+ uint16_t eci_mask = mve_eci_mask(env); \
+ unsigned b, e; \
+ /* \
+ * R_SXTM allows the dest reg to become UNKNOWN for abandoned \
+ * beats so we don't care if we update part of the dest and \
+ * then take an exception. \
+ */ \
+ for (b = 0, e = 0; b < 16; b += ESIZE, e++) { \
+ if (eci_mask & (1 << b)) { \
+ d[H##ESIZE(e)] = (mask & (1 << b)) ? \
+ cpu_##LDTYPE##_data_ra(env, addr, GETPC()) : 0; \
+ } \
+ addr += MSIZE; \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+#define DO_VSTR(OP, MSIZE, STTYPE, ESIZE, TYPE) \
+ void HELPER(mve_##OP)(CPUARMState *env, void *vd, uint32_t addr) \
+ { \
+ TYPE *d = vd; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned b, e; \
+ for (b = 0, e = 0; b < 16; b += ESIZE, e++) { \
+ if (mask & (1 << b)) { \
+ cpu_##STTYPE##_data_ra(env, addr, d[H##ESIZE(e)], GETPC()); \
+ } \
+ addr += MSIZE; \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+DO_VLDR(vldrb, 1, ldub, 1, uint8_t)
+DO_VLDR(vldrh, 2, lduw, 2, uint16_t)
+DO_VLDR(vldrw, 4, ldl, 4, uint32_t)
+
+DO_VSTR(vstrb, 1, stb, 1, uint8_t)
+DO_VSTR(vstrh, 2, stw, 2, uint16_t)
+DO_VSTR(vstrw, 4, stl, 4, uint32_t)
+
+DO_VLDR(vldrb_sh, 1, ldsb, 2, int16_t)
+DO_VLDR(vldrb_sw, 1, ldsb, 4, int32_t)
+DO_VLDR(vldrb_uh, 1, ldub, 2, uint16_t)
+DO_VLDR(vldrb_uw, 1, ldub, 4, uint32_t)
+DO_VLDR(vldrh_sw, 2, ldsw, 4, int32_t)
+DO_VLDR(vldrh_uw, 2, lduw, 4, uint32_t)
+
+DO_VSTR(vstrb_h, 1, stb, 2, int16_t)
+DO_VSTR(vstrb_w, 1, stb, 4, int32_t)
+DO_VSTR(vstrh_w, 2, stw, 4, int32_t)
+
+#undef DO_VLDR
+#undef DO_VSTR
+
+/*
+ * Gather loads/scatter stores. Here each element of Qm specifies
+ * an offset to use from the base register Rm. In the _os_ versions
+ * that offset is scaled by the element size.
+ * For loads, predicated lanes are zeroed instead of retaining
+ * their previous values.
+ */
+#define DO_VLDR_SG(OP, LDTYPE, ESIZE, TYPE, OFFTYPE, ADDRFN, WB) \
+ void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \
+ uint32_t base) \
+ { \
+ TYPE *d = vd; \
+ OFFTYPE *m = vm; \
+ uint16_t mask = mve_element_mask(env); \
+ uint16_t eci_mask = mve_eci_mask(env); \
+ unsigned e; \
+ uint32_t addr; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE, eci_mask >>= ESIZE) { \
+ if (!(eci_mask & 1)) { \
+ continue; \
+ } \
+ addr = ADDRFN(base, m[H##ESIZE(e)]); \
+ d[H##ESIZE(e)] = (mask & 1) ? \
+ cpu_##LDTYPE##_data_ra(env, addr, GETPC()) : 0; \
+ if (WB) { \
+ m[H##ESIZE(e)] = addr; \
+ } \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+/* Here TYPE is unsigned, so it is always the same as the offset type */
+#define DO_VSTR_SG(OP, STTYPE, ESIZE, TYPE, ADDRFN, WB) \
+ void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \
+ uint32_t base) \
+ { \
+ TYPE *d = vd; \
+ TYPE *m = vm; \
+ uint16_t mask = mve_element_mask(env); \
+ uint16_t eci_mask = mve_eci_mask(env); \
+ unsigned e; \
+ uint32_t addr; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE, eci_mask >>= ESIZE) { \
+ if (!(eci_mask & 1)) { \
+ continue; \
+ } \
+ addr = ADDRFN(base, m[H##ESIZE(e)]); \
+ if (mask & 1) { \
+ cpu_##STTYPE##_data_ra(env, addr, d[H##ESIZE(e)], GETPC()); \
+ } \
+ if (WB) { \
+ m[H##ESIZE(e)] = addr; \
+ } \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+/*
+ * 64-bit accesses are slightly different: they are done as two 32-bit
+ * accesses, controlled by the predicate mask for the relevant beat,
+ * and with a single 32-bit offset in the first of the two Qm elements.
+ * Note that for QEMU our IMPDEF AIRCR.ENDIANNESS is always 0 (little).
+ * Address writeback happens on the odd beats and updates the address
+ * stored in the even-beat element.
+ */
+#define DO_VLDR64_SG(OP, ADDRFN, WB) \
+ void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \
+ uint32_t base) \
+ { \
+ uint32_t *d = vd; \
+ uint32_t *m = vm; \
+ uint16_t mask = mve_element_mask(env); \
+ uint16_t eci_mask = mve_eci_mask(env); \
+ unsigned e; \
+ uint32_t addr; \
+ for (e = 0; e < 16 / 4; e++, mask >>= 4, eci_mask >>= 4) { \
+ if (!(eci_mask & 1)) { \
+ continue; \
+ } \
+ addr = ADDRFN(base, m[H4(e & ~1)]); \
+ addr += 4 * (e & 1); \
+ d[H4(e)] = (mask & 1) ? cpu_ldl_data_ra(env, addr, GETPC()) : 0; \
+ if (WB && (e & 1)) { \
+ m[H4(e & ~1)] = addr - 4; \
+ } \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+#define DO_VSTR64_SG(OP, ADDRFN, WB) \
+ void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \
+ uint32_t base) \
+ { \
+ uint32_t *d = vd; \
+ uint32_t *m = vm; \
+ uint16_t mask = mve_element_mask(env); \
+ uint16_t eci_mask = mve_eci_mask(env); \
+ unsigned e; \
+ uint32_t addr; \
+ for (e = 0; e < 16 / 4; e++, mask >>= 4, eci_mask >>= 4) { \
+ if (!(eci_mask & 1)) { \
+ continue; \
+ } \
+ addr = ADDRFN(base, m[H4(e & ~1)]); \
+ addr += 4 * (e & 1); \
+ if (mask & 1) { \
+ cpu_stl_data_ra(env, addr, d[H4(e)], GETPC()); \
+ } \
+ if (WB && (e & 1)) { \
+ m[H4(e & ~1)] = addr - 4; \
+ } \
+ } \
+ mve_advance_vpt(env); \
+ }
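+
+/*
+ * For example, for element pair {0,1} of a 64-bit gather load both
+ * beats use the 32-bit offset held in Qm element 0: beat 0 loads the
+ * low word from base+offset, beat 1 loads the high word from
+ * base+offset+4, and for the writeback forms beat 1 then stores
+ * base+offset back into Qm element 0.
+ */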
+
+#define ADDR_ADD(BASE, OFFSET) ((BASE) + (OFFSET))
+#define ADDR_ADD_OSH(BASE, OFFSET) ((BASE) + ((OFFSET) << 1))
+#define ADDR_ADD_OSW(BASE, OFFSET) ((BASE) + ((OFFSET) << 2))
+#define ADDR_ADD_OSD(BASE, OFFSET) ((BASE) + ((OFFSET) << 3))
+
+DO_VLDR_SG(vldrb_sg_sh, ldsb, 2, int16_t, uint16_t, ADDR_ADD, false)
+DO_VLDR_SG(vldrb_sg_sw, ldsb, 4, int32_t, uint32_t, ADDR_ADD, false)
+DO_VLDR_SG(vldrh_sg_sw, ldsw, 4, int32_t, uint32_t, ADDR_ADD, false)
+
+DO_VLDR_SG(vldrb_sg_ub, ldub, 1, uint8_t, uint8_t, ADDR_ADD, false)
+DO_VLDR_SG(vldrb_sg_uh, ldub, 2, uint16_t, uint16_t, ADDR_ADD, false)
+DO_VLDR_SG(vldrb_sg_uw, ldub, 4, uint32_t, uint32_t, ADDR_ADD, false)
+DO_VLDR_SG(vldrh_sg_uh, lduw, 2, uint16_t, uint16_t, ADDR_ADD, false)
+DO_VLDR_SG(vldrh_sg_uw, lduw, 4, uint32_t, uint32_t, ADDR_ADD, false)
+DO_VLDR_SG(vldrw_sg_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD, false)
+DO_VLDR64_SG(vldrd_sg_ud, ADDR_ADD, false)
+
+DO_VLDR_SG(vldrh_sg_os_sw, ldsw, 4, int32_t, uint32_t, ADDR_ADD_OSH, false)
+DO_VLDR_SG(vldrh_sg_os_uh, lduw, 2, uint16_t, uint16_t, ADDR_ADD_OSH, false)
+DO_VLDR_SG(vldrh_sg_os_uw, lduw, 4, uint32_t, uint32_t, ADDR_ADD_OSH, false)
+DO_VLDR_SG(vldrw_sg_os_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD_OSW, false)
+DO_VLDR64_SG(vldrd_sg_os_ud, ADDR_ADD_OSD, false)
+
+DO_VSTR_SG(vstrb_sg_ub, stb, 1, uint8_t, ADDR_ADD, false)
+DO_VSTR_SG(vstrb_sg_uh, stb, 2, uint16_t, ADDR_ADD, false)
+DO_VSTR_SG(vstrb_sg_uw, stb, 4, uint32_t, ADDR_ADD, false)
+DO_VSTR_SG(vstrh_sg_uh, stw, 2, uint16_t, ADDR_ADD, false)
+DO_VSTR_SG(vstrh_sg_uw, stw, 4, uint32_t, ADDR_ADD, false)
+DO_VSTR_SG(vstrw_sg_uw, stl, 4, uint32_t, ADDR_ADD, false)
+DO_VSTR64_SG(vstrd_sg_ud, ADDR_ADD, false)
+
+DO_VSTR_SG(vstrh_sg_os_uh, stw, 2, uint16_t, ADDR_ADD_OSH, false)
+DO_VSTR_SG(vstrh_sg_os_uw, stw, 4, uint32_t, ADDR_ADD_OSH, false)
+DO_VSTR_SG(vstrw_sg_os_uw, stl, 4, uint32_t, ADDR_ADD_OSW, false)
+DO_VSTR64_SG(vstrd_sg_os_ud, ADDR_ADD_OSD, false)
+
+DO_VLDR_SG(vldrw_sg_wb_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD, true)
+DO_VLDR64_SG(vldrd_sg_wb_ud, ADDR_ADD, true)
+DO_VSTR_SG(vstrw_sg_wb_uw, stl, 4, uint32_t, ADDR_ADD, true)
+DO_VSTR64_SG(vstrd_sg_wb_ud, ADDR_ADD, true)
+
+/*
+ * Deinterleaving loads/interleaving stores.
+ *
+ * For these helpers we are passed the index of the first Qreg
+ * (VLD2/VST2 will also access Qn+1, VLD4/VST4 access Qn .. Qn+3)
+ * and the value of the base address register Rn.
+ * The helpers are specialized for pattern and element size, so
+ * for instance vld42h is VLD4 with pattern 2, element size MO_16.
+ *
+ * These insns are beatwise but not predicated, so we must honour ECI,
+ * but need not look at mve_element_mask().
+ *
+ * The pseudocode implements these insns with multiple memory accesses
+ * of the element size, but rules R_VVVG and R_FXDM permit us to make
+ * one 32-bit memory access per beat.
+ */
+#define DO_VLD4B(OP, O1, O2, O3, O4) \
+ void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \
+ uint32_t base) \
+ { \
+ int beat, e; \
+ uint16_t mask = mve_eci_mask(env); \
+ static const uint8_t off[4] = { O1, O2, O3, O4 }; \
+ uint32_t addr, data; \
+ for (beat = 0; beat < 4; beat++, mask >>= 4) { \
+ if ((mask & 1) == 0) { \
+ /* ECI says skip this beat */ \
+ continue; \
+ } \
+ addr = base + off[beat] * 4; \
+ data = cpu_ldl_le_data_ra(env, addr, GETPC()); \
+ for (e = 0; e < 4; e++, data >>= 8) { \
+ uint8_t *qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + e); \
+ qd[H1(off[beat])] = data; \
+ } \
+ } \
+ }
+
+#define DO_VLD4H(OP, O1, O2) \
+ void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \
+ uint32_t base) \
+ { \
+ int beat; \
+ uint16_t mask = mve_eci_mask(env); \
+ static const uint8_t off[4] = { O1, O1, O2, O2 }; \
+ uint32_t addr, data; \
+ int y; /* y counts 0 2 0 2 */ \
+ uint16_t *qd; \
+ for (beat = 0, y = 0; beat < 4; beat++, mask >>= 4, y ^= 2) { \
+ if ((mask & 1) == 0) { \
+ /* ECI says skip this beat */ \
+ continue; \
+ } \
+ addr = base + off[beat] * 8 + (beat & 1) * 4; \
+ data = cpu_ldl_le_data_ra(env, addr, GETPC()); \
+ qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + y); \
+ qd[H2(off[beat])] = data; \
+ data >>= 16; \
+ qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + y + 1); \
+ qd[H2(off[beat])] = data; \
+ } \
+ }
+
+#define DO_VLD4W(OP, O1, O2, O3, O4) \
+ void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \
+ uint32_t base) \
+ { \
+ int beat; \
+ uint16_t mask = mve_eci_mask(env); \
+ static const uint8_t off[4] = { O1, O2, O3, O4 }; \
+ uint32_t addr, data; \
+ uint32_t *qd; \
+ int y; \
+ for (beat = 0; beat < 4; beat++, mask >>= 4) { \
+ if ((mask & 1) == 0) { \
+ /* ECI says skip this beat */ \
+ continue; \
+ } \
+ addr = base + off[beat] * 4; \
+ data = cpu_ldl_le_data_ra(env, addr, GETPC()); \
+ y = (beat + (O1 & 2)) & 3; \
+ qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + y); \
+ qd[H4(off[beat] >> 2)] = data; \
+ } \
+ }
+
+DO_VLD4B(vld40b, 0, 1, 10, 11)
+DO_VLD4B(vld41b, 2, 3, 12, 13)
+DO_VLD4B(vld42b, 4, 5, 14, 15)
+DO_VLD4B(vld43b, 6, 7, 8, 9)
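+
+/*
+ * To see how the offset patterns deinterleave: for vld40b, beat 0
+ * loads the 4 bytes at base+0 and scatters them to byte 0 of Qn,
+ * Qn+1, Qn+2 and Qn+3; beat 1 does the same for base+4 into byte 1,
+ * and beats 2 and 3 handle bytes 10 and 11.  The vld41b/vld42b/vld43b
+ * patterns cover the remaining byte positions.
+ */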
+
+DO_VLD4H(vld40h, 0, 5)
+DO_VLD4H(vld41h, 1, 6)
+DO_VLD4H(vld42h, 2, 7)
+DO_VLD4H(vld43h, 3, 4)
+
+DO_VLD4W(vld40w, 0, 1, 10, 11)
+DO_VLD4W(vld41w, 2, 3, 12, 13)
+DO_VLD4W(vld42w, 4, 5, 14, 15)
+DO_VLD4W(vld43w, 6, 7, 8, 9)
+
+#define DO_VLD2B(OP, O1, O2, O3, O4) \
+ void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \
+ uint32_t base) \
+ { \
+ int beat, e; \
+ uint16_t mask = mve_eci_mask(env); \
+ static const uint8_t off[4] = { O1, O2, O3, O4 }; \
+ uint32_t addr, data; \
+ uint8_t *qd; \
+ for (beat = 0; beat < 4; beat++, mask >>= 4) { \
+ if ((mask & 1) == 0) { \
+ /* ECI says skip this beat */ \
+ continue; \
+ } \
+ addr = base + off[beat] * 2; \
+ data = cpu_ldl_le_data_ra(env, addr, GETPC()); \
+ for (e = 0; e < 4; e++, data >>= 8) { \
+ qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + (e & 1)); \
+ qd[H1(off[beat] + (e >> 1))] = data; \
+ } \
+ } \
+ }
+
+#define DO_VLD2H(OP, O1, O2, O3, O4) \
+ void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \
+ uint32_t base) \
+ { \
+ int beat; \
+ uint16_t mask = mve_eci_mask(env); \
+ static const uint8_t off[4] = { O1, O2, O3, O4 }; \
+ uint32_t addr, data; \
+ int e; \
+ uint16_t *qd; \
+ for (beat = 0; beat < 4; beat++, mask >>= 4) { \
+ if ((mask & 1) == 0) { \
+ /* ECI says skip this beat */ \
+ continue; \
+ } \
+ addr = base + off[beat] * 4; \
+ data = cpu_ldl_le_data_ra(env, addr, GETPC()); \
+ for (e = 0; e < 2; e++, data >>= 16) { \
+ qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + e); \
+ qd[H2(off[beat])] = data; \
+ } \
+ } \
+ }
+
+#define DO_VLD2W(OP, O1, O2, O3, O4) \
+ void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \
+ uint32_t base) \
+ { \
+ int beat; \
+ uint16_t mask = mve_eci_mask(env); \
+ static const uint8_t off[4] = { O1, O2, O3, O4 }; \
+ uint32_t addr, data; \
+ uint32_t *qd; \
+ for (beat = 0; beat < 4; beat++, mask >>= 4) { \
+ if ((mask & 1) == 0) { \
+ /* ECI says skip this beat */ \
+ continue; \
+ } \
+ addr = base + off[beat]; \
+ data = cpu_ldl_le_data_ra(env, addr, GETPC()); \
+ qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + (beat & 1)); \
+ qd[H4(off[beat] >> 3)] = data; \
+ } \
+ }
+
+DO_VLD2B(vld20b, 0, 2, 12, 14)
+DO_VLD2B(vld21b, 4, 6, 8, 10)
+
+DO_VLD2H(vld20h, 0, 1, 6, 7)
+DO_VLD2H(vld21h, 2, 3, 4, 5)
+
+DO_VLD2W(vld20w, 0, 4, 24, 28)
+DO_VLD2W(vld21w, 8, 12, 16, 20)
+
+#define DO_VST4B(OP, O1, O2, O3, O4) \
+ void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \
+ uint32_t base) \
+ { \
+ int beat, e; \
+ uint16_t mask = mve_eci_mask(env); \
+ static const uint8_t off[4] = { O1, O2, O3, O4 }; \
+ uint32_t addr, data; \
+ for (beat = 0; beat < 4; beat++, mask >>= 4) { \
+ if ((mask & 1) == 0) { \
+ /* ECI says skip this beat */ \
+ continue; \
+ } \
+ addr = base + off[beat] * 4; \
+ data = 0; \
+ for (e = 3; e >= 0; e--) { \
+ uint8_t *qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + e); \
+ data = (data << 8) | qd[H1(off[beat])]; \
+ } \
+ cpu_stl_le_data_ra(env, addr, data, GETPC()); \
+ } \
+ }
+
+#define DO_VST4H(OP, O1, O2) \
+ void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \
+ uint32_t base) \
+ { \
+ int beat; \
+ uint16_t mask = mve_eci_mask(env); \
+ static const uint8_t off[4] = { O1, O1, O2, O2 }; \
+ uint32_t addr, data; \
+ int y; /* y counts 0 2 0 2 */ \
+ uint16_t *qd; \
+ for (beat = 0, y = 0; beat < 4; beat++, mask >>= 4, y ^= 2) { \
+ if ((mask & 1) == 0) { \
+ /* ECI says skip this beat */ \
+ continue; \
+ } \
+ addr = base + off[beat] * 8 + (beat & 1) * 4; \
+ qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + y); \
+ data = qd[H2(off[beat])]; \
+ qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + y + 1); \
+ data |= qd[H2(off[beat])] << 16; \
+ cpu_stl_le_data_ra(env, addr, data, GETPC()); \
+ } \
+ }
+
+#define DO_VST4W(OP, O1, O2, O3, O4) \
+ void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \
+ uint32_t base) \
+ { \
+ int beat; \
+ uint16_t mask = mve_eci_mask(env); \
+ static const uint8_t off[4] = { O1, O2, O3, O4 }; \
+ uint32_t addr, data; \
+ uint32_t *qd; \
+ int y; \
+ for (beat = 0; beat < 4; beat++, mask >>= 4) { \
+ if ((mask & 1) == 0) { \
+ /* ECI says skip this beat */ \
+ continue; \
+ } \
+ addr = base + off[beat] * 4; \
+ y = (beat + (O1 & 2)) & 3; \
+ qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + y); \
+ data = qd[H4(off[beat] >> 2)]; \
+ cpu_stl_le_data_ra(env, addr, data, GETPC()); \
+ } \
+ }
+
+DO_VST4B(vst40b, 0, 1, 10, 11)
+DO_VST4B(vst41b, 2, 3, 12, 13)
+DO_VST4B(vst42b, 4, 5, 14, 15)
+DO_VST4B(vst43b, 6, 7, 8, 9)
+
+DO_VST4H(vst40h, 0, 5)
+DO_VST4H(vst41h, 1, 6)
+DO_VST4H(vst42h, 2, 7)
+DO_VST4H(vst43h, 3, 4)
+
+DO_VST4W(vst40w, 0, 1, 10, 11)
+DO_VST4W(vst41w, 2, 3, 12, 13)
+DO_VST4W(vst42w, 4, 5, 14, 15)
+DO_VST4W(vst43w, 6, 7, 8, 9)
+
+#define DO_VST2B(OP, O1, O2, O3, O4) \
+ void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \
+ uint32_t base) \
+ { \
+ int beat, e; \
+ uint16_t mask = mve_eci_mask(env); \
+ static const uint8_t off[4] = { O1, O2, O3, O4 }; \
+ uint32_t addr, data; \
+ uint8_t *qd; \
+ for (beat = 0; beat < 4; beat++, mask >>= 4) { \
+ if ((mask & 1) == 0) { \
+ /* ECI says skip this beat */ \
+ continue; \
+ } \
+ addr = base + off[beat] * 2; \
+ data = 0; \
+ for (e = 3; e >= 0; e--) { \
+ qd = (uint8_t *)aa32_vfp_qreg(env, qnidx + (e & 1)); \
+ data = (data << 8) | qd[H1(off[beat] + (e >> 1))]; \
+ } \
+ cpu_stl_le_data_ra(env, addr, data, GETPC()); \
+ } \
+ }
+
+#define DO_VST2H(OP, O1, O2, O3, O4) \
+ void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \
+ uint32_t base) \
+ { \
+ int beat; \
+ uint16_t mask = mve_eci_mask(env); \
+ static const uint8_t off[4] = { O1, O2, O3, O4 }; \
+ uint32_t addr, data; \
+ int e; \
+ uint16_t *qd; \
+ for (beat = 0; beat < 4; beat++, mask >>= 4) { \
+ if ((mask & 1) == 0) { \
+ /* ECI says skip this beat */ \
+ continue; \
+ } \
+ addr = base + off[beat] * 4; \
+ data = 0; \
+ for (e = 1; e >= 0; e--) { \
+ qd = (uint16_t *)aa32_vfp_qreg(env, qnidx + e); \
+ data = (data << 16) | qd[H2(off[beat])]; \
+ } \
+ cpu_stl_le_data_ra(env, addr, data, GETPC()); \
+ } \
+ }
+
+#define DO_VST2W(OP, O1, O2, O3, O4) \
+ void HELPER(mve_##OP)(CPUARMState *env, uint32_t qnidx, \
+ uint32_t base) \
+ { \
+ int beat; \
+ uint16_t mask = mve_eci_mask(env); \
+ static const uint8_t off[4] = { O1, O2, O3, O4 }; \
+ uint32_t addr, data; \
+ uint32_t *qd; \
+ for (beat = 0; beat < 4; beat++, mask >>= 4) { \
+ if ((mask & 1) == 0) { \
+ /* ECI says skip this beat */ \
+ continue; \
+ } \
+ addr = base + off[beat]; \
+ qd = (uint32_t *)aa32_vfp_qreg(env, qnidx + (beat & 1)); \
+ data = qd[H4(off[beat] >> 3)]; \
+ cpu_stl_le_data_ra(env, addr, data, GETPC()); \
+ } \
+ }
+
+DO_VST2B(vst20b, 0, 2, 12, 14)
+DO_VST2B(vst21b, 4, 6, 8, 10)
+
+DO_VST2H(vst20h, 0, 1, 6, 7)
+DO_VST2H(vst21h, 2, 3, 4, 5)
+
+DO_VST2W(vst20w, 0, 4, 24, 28)
+DO_VST2W(vst21w, 8, 12, 16, 20)
+
+/*
+ * The mergemask(D, R, M) macro performs the operation "*D = R" but
+ * storing only the bytes which correspond to 1 bits in M,
+ * leaving other bytes in *D unchanged. We use _Generic
+ * to select the correct implementation based on the type of D.
+ */
+
+static void mergemask_ub(uint8_t *d, uint8_t r, uint16_t mask)
+{
+ if (mask & 1) {
+ *d = r;
+ }
+}
+
+static void mergemask_sb(int8_t *d, int8_t r, uint16_t mask)
+{
+ mergemask_ub((uint8_t *)d, r, mask);
+}
+
+static void mergemask_uh(uint16_t *d, uint16_t r, uint16_t mask)
+{
+ uint16_t bmask = expand_pred_b(mask);
+ *d = (*d & ~bmask) | (r & bmask);
+}
+
+static void mergemask_sh(int16_t *d, int16_t r, uint16_t mask)
+{
+ mergemask_uh((uint16_t *)d, r, mask);
+}
+
+static void mergemask_uw(uint32_t *d, uint32_t r, uint16_t mask)
+{
+ uint32_t bmask = expand_pred_b(mask);
+ *d = (*d & ~bmask) | (r & bmask);
+}
+
+static void mergemask_sw(int32_t *d, int32_t r, uint16_t mask)
+{
+ mergemask_uw((uint32_t *)d, r, mask);
+}
+
+static void mergemask_uq(uint64_t *d, uint64_t r, uint16_t mask)
+{
+ uint64_t bmask = expand_pred_b(mask);
+ *d = (*d & ~bmask) | (r & bmask);
+}
+
+static void mergemask_sq(int64_t *d, int64_t r, uint16_t mask)
+{
+ mergemask_uq((uint64_t *)d, r, mask);
+}
+
+#define mergemask(D, R, M) \
+ _Generic(D, \
+ uint8_t *: mergemask_ub, \
+ int8_t *: mergemask_sb, \
+ uint16_t *: mergemask_uh, \
+ int16_t *: mergemask_sh, \
+ uint32_t *: mergemask_uw, \
+ int32_t *: mergemask_sw, \
+ uint64_t *: mergemask_uq, \
+ int64_t *: mergemask_sq)(D, R, M)
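+
+/*
+ * Note that the merge is per predicate bit, i.e. per byte of the
+ * element: for the 16/32/64-bit variants expand_pred_b() turns each
+ * mask bit into a 0x00/0xff byte mask, so e.g. mergemask_uw() with
+ * mask 0x3 updates only the low two bytes of the destination word.
+ */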
+
+void HELPER(mve_vdup)(CPUARMState *env, void *vd, uint32_t val)
+{
+ /*
+ * The generated code already replicated an 8 or 16 bit constant
+ * into the 32-bit value, so we only need to write the 32-bit
+ * value to all elements of the Qreg, allowing for predication.
+ */
+ uint32_t *d = vd;
+ uint16_t mask = mve_element_mask(env);
+ unsigned e;
+ for (e = 0; e < 16 / 4; e++, mask >>= 4) {
+ mergemask(&d[H4(e)], val, mask);
+ }
+ mve_advance_vpt(env);
+}
+
+#define DO_1OP(OP, ESIZE, TYPE, FN) \
+ void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm) \
+ { \
+ TYPE *d = vd, *m = vm; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ mergemask(&d[H##ESIZE(e)], FN(m[H##ESIZE(e)]), mask); \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+#define DO_CLS_B(N) (clrsb32(N) - 24)
+#define DO_CLS_H(N) (clrsb32(N) - 16)
+
+DO_1OP(vclsb, 1, int8_t, DO_CLS_B)
+DO_1OP(vclsh, 2, int16_t, DO_CLS_H)
+DO_1OP(vclsw, 4, int32_t, clrsb32)
+
+#define DO_CLZ_B(N) (clz32(N) - 24)
+#define DO_CLZ_H(N) (clz32(N) - 16)
+
+DO_1OP(vclzb, 1, uint8_t, DO_CLZ_B)
+DO_1OP(vclzh, 2, uint16_t, DO_CLZ_H)
+DO_1OP(vclzw, 4, uint32_t, clz32)
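+
+/*
+ * clrsb32() and clz32() operate on 32-bit values, so for the byte and
+ * halfword variants the 24 or 16 extra leading bits contributed by
+ * promotion to 32 bits must be subtracted to get the per-element
+ * count, hence the DO_CLS_B/H and DO_CLZ_B/H adjustments above.
+ */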
+
+DO_1OP(vrev16b, 2, uint16_t, bswap16)
+DO_1OP(vrev32b, 4, uint32_t, bswap32)
+DO_1OP(vrev32h, 4, uint32_t, hswap32)
+DO_1OP(vrev64b, 8, uint64_t, bswap64)
+DO_1OP(vrev64h, 8, uint64_t, hswap64)
+DO_1OP(vrev64w, 8, uint64_t, wswap64)
+
+#define DO_NOT(N) (~(N))
+
+DO_1OP(vmvn, 8, uint64_t, DO_NOT)
+
+#define DO_ABS(N) ((N) < 0 ? -(N) : (N))
+#define DO_FABSH(N) ((N) & dup_const(MO_16, 0x7fff))
+#define DO_FABSS(N) ((N) & dup_const(MO_32, 0x7fffffff))
+
+DO_1OP(vabsb, 1, int8_t, DO_ABS)
+DO_1OP(vabsh, 2, int16_t, DO_ABS)
+DO_1OP(vabsw, 4, int32_t, DO_ABS)
+
+/* We can do these 64 bits at a time */
+DO_1OP(vfabsh, 8, uint64_t, DO_FABSH)
+DO_1OP(vfabss, 8, uint64_t, DO_FABSS)
+
+#define DO_NEG(N) (-(N))
+#define DO_FNEGH(N) ((N) ^ dup_const(MO_16, 0x8000))
+#define DO_FNEGS(N) ((N) ^ dup_const(MO_32, 0x80000000))
+
+DO_1OP(vnegb, 1, int8_t, DO_NEG)
+DO_1OP(vnegh, 2, int16_t, DO_NEG)
+DO_1OP(vnegw, 4, int32_t, DO_NEG)
+
+/* We can do these 64 bits at a time */
+DO_1OP(vfnegh, 8, uint64_t, DO_FNEGH)
+DO_1OP(vfnegs, 8, uint64_t, DO_FNEGS)
+
+/*
+ * 1 operand immediates: Vda is destination and possibly also one source.
+ * All these insns work at 64-bit widths.
+ */
+#define DO_1OP_IMM(OP, FN) \
+ void HELPER(mve_##OP)(CPUARMState *env, void *vda, uint64_t imm) \
+ { \
+ uint64_t *da = vda; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ for (e = 0; e < 16 / 8; e++, mask >>= 8) { \
+ mergemask(&da[H8(e)], FN(da[H8(e)], imm), mask); \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+#define DO_MOVI(N, I) (I)
+#define DO_ANDI(N, I) ((N) & (I))
+#define DO_ORRI(N, I) ((N) | (I))
+
+DO_1OP_IMM(vmovi, DO_MOVI)
+DO_1OP_IMM(vandi, DO_ANDI)
+DO_1OP_IMM(vorri, DO_ORRI)
+
+#define DO_2OP(OP, ESIZE, TYPE, FN) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, \
+ void *vd, void *vn, void *vm) \
+ { \
+ TYPE *d = vd, *n = vn, *m = vm; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ mergemask(&d[H##ESIZE(e)], \
+ FN(n[H##ESIZE(e)], m[H##ESIZE(e)]), mask); \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+/* provide unsigned 2-op helpers for all sizes */
+#define DO_2OP_U(OP, FN) \
+ DO_2OP(OP##b, 1, uint8_t, FN) \
+ DO_2OP(OP##h, 2, uint16_t, FN) \
+ DO_2OP(OP##w, 4, uint32_t, FN)
+
+/* provide signed 2-op helpers for all sizes */
+#define DO_2OP_S(OP, FN) \
+ DO_2OP(OP##b, 1, int8_t, FN) \
+ DO_2OP(OP##h, 2, int16_t, FN) \
+ DO_2OP(OP##w, 4, int32_t, FN)
+
+/*
+ * "Long" operations where two half-sized inputs (taken from either the
+ * top or the bottom of the input vector) produce a double-width result.
+ * Here ESIZE, TYPE are for the input, and LESIZE, LTYPE for the output.
+ */
+#define DO_2OP_L(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, void *vm) \
+ { \
+ LTYPE *d = vd; \
+ TYPE *n = vn, *m = vm; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned le; \
+ for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \
+ LTYPE r = FN((LTYPE)n[H##ESIZE(le * 2 + TOP)], \
+ m[H##ESIZE(le * 2 + TOP)]); \
+ mergemask(&d[H##LESIZE(le)], r, mask); \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+#define DO_2OP_SAT(OP, ESIZE, TYPE, FN) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, void *vm) \
+ { \
+ TYPE *d = vd, *n = vn, *m = vm; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ bool qc = false; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ bool sat = false; \
+ TYPE r = FN(n[H##ESIZE(e)], m[H##ESIZE(e)], &sat); \
+ mergemask(&d[H##ESIZE(e)], r, mask); \
+ qc |= sat & mask & 1; \
+ } \
+ if (qc) { \
+ env->vfp.qc[0] = qc; \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+/* provide unsigned 2-op helpers for all sizes */
+#define DO_2OP_SAT_U(OP, FN) \
+ DO_2OP_SAT(OP##b, 1, uint8_t, FN) \
+ DO_2OP_SAT(OP##h, 2, uint16_t, FN) \
+ DO_2OP_SAT(OP##w, 4, uint32_t, FN)
+
+/* provide signed 2-op helpers for all sizes */
+#define DO_2OP_SAT_S(OP, FN) \
+ DO_2OP_SAT(OP##b, 1, int8_t, FN) \
+ DO_2OP_SAT(OP##h, 2, int16_t, FN) \
+ DO_2OP_SAT(OP##w, 4, int32_t, FN)
+
+#define DO_AND(N, M) ((N) & (M))
+#define DO_BIC(N, M) ((N) & ~(M))
+#define DO_ORR(N, M) ((N) | (M))
+#define DO_ORN(N, M) ((N) | ~(M))
+#define DO_EOR(N, M) ((N) ^ (M))
+
+DO_2OP(vand, 8, uint64_t, DO_AND)
+DO_2OP(vbic, 8, uint64_t, DO_BIC)
+DO_2OP(vorr, 8, uint64_t, DO_ORR)
+DO_2OP(vorn, 8, uint64_t, DO_ORN)
+DO_2OP(veor, 8, uint64_t, DO_EOR)
+
+#define DO_ADD(N, M) ((N) + (M))
+#define DO_SUB(N, M) ((N) - (M))
+#define DO_MUL(N, M) ((N) * (M))
+
+DO_2OP_U(vadd, DO_ADD)
+DO_2OP_U(vsub, DO_SUB)
+DO_2OP_U(vmul, DO_MUL)
+
+DO_2OP_L(vmullbsb, 0, 1, int8_t, 2, int16_t, DO_MUL)
+DO_2OP_L(vmullbsh, 0, 2, int16_t, 4, int32_t, DO_MUL)
+DO_2OP_L(vmullbsw, 0, 4, int32_t, 8, int64_t, DO_MUL)
+DO_2OP_L(vmullbub, 0, 1, uint8_t, 2, uint16_t, DO_MUL)
+DO_2OP_L(vmullbuh, 0, 2, uint16_t, 4, uint32_t, DO_MUL)
+DO_2OP_L(vmullbuw, 0, 4, uint32_t, 8, uint64_t, DO_MUL)
+
+DO_2OP_L(vmulltsb, 1, 1, int8_t, 2, int16_t, DO_MUL)
+DO_2OP_L(vmulltsh, 1, 2, int16_t, 4, int32_t, DO_MUL)
+DO_2OP_L(vmulltsw, 1, 4, int32_t, 8, int64_t, DO_MUL)
+DO_2OP_L(vmulltub, 1, 1, uint8_t, 2, uint16_t, DO_MUL)
+DO_2OP_L(vmulltuh, 1, 2, uint16_t, 4, uint32_t, DO_MUL)
+DO_2OP_L(vmulltuw, 1, 4, uint32_t, 8, uint64_t, DO_MUL)
+
+/*
+ * Polynomial multiply. We can always do this by generating 64 bits
+ * of the result at a time, so we don't need to use DO_2OP_L.
+ */
+#define VMULLPH_MASK 0x00ff00ff00ff00ffULL
+#define VMULLPW_MASK 0x0000ffff0000ffffULL
+#define DO_VMULLPBH(N, M) pmull_h((N) & VMULLPH_MASK, (M) & VMULLPH_MASK)
+#define DO_VMULLPTH(N, M) DO_VMULLPBH((N) >> 8, (M) >> 8)
+#define DO_VMULLPBW(N, M) pmull_w((N) & VMULLPW_MASK, (M) & VMULLPW_MASK)
+#define DO_VMULLPTW(N, M) DO_VMULLPBW((N) >> 16, (M) >> 16)
+
+DO_2OP(vmullpbh, 8, uint64_t, DO_VMULLPBH)
+DO_2OP(vmullpth, 8, uint64_t, DO_VMULLPTH)
+DO_2OP(vmullpbw, 8, uint64_t, DO_VMULLPBW)
+DO_2OP(vmullptw, 8, uint64_t, DO_VMULLPTW)
+
+/*
+ * Because the computation type is at least twice as large as required,
+ * these work for both signed and unsigned source types.
+ */
+static inline uint8_t do_mulh_b(int32_t n, int32_t m)
+{
+ return (n * m) >> 8;
+}
+
+static inline uint16_t do_mulh_h(int32_t n, int32_t m)
+{
+ return (n * m) >> 16;
+}
+
+static inline uint32_t do_mulh_w(int64_t n, int64_t m)
+{
+ return (n * m) >> 32;
+}
+
+static inline uint8_t do_rmulh_b(int32_t n, int32_t m)
+{
+ return (n * m + (1U << 7)) >> 8;
+}
+
+static inline uint16_t do_rmulh_h(int32_t n, int32_t m)
+{
+ return (n * m + (1U << 15)) >> 16;
+}
+
+static inline uint32_t do_rmulh_w(int64_t n, int64_t m)
+{
+ return (n * m + (1U << 31)) >> 32;
+}
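+
+/*
+ * For example, with uint8_t inputs 0xff and 0xff the promoted product
+ * is 65025 (0xfe01) and do_mulh_b returns 0xfe, the unsigned high
+ * byte; with int8_t inputs -1 and -1 the product is 1 and the result
+ * is 0, the signed high byte -- both correct without separate helpers.
+ */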
+
+DO_2OP(vmulhsb, 1, int8_t, do_mulh_b)
+DO_2OP(vmulhsh, 2, int16_t, do_mulh_h)
+DO_2OP(vmulhsw, 4, int32_t, do_mulh_w)
+DO_2OP(vmulhub, 1, uint8_t, do_mulh_b)
+DO_2OP(vmulhuh, 2, uint16_t, do_mulh_h)
+DO_2OP(vmulhuw, 4, uint32_t, do_mulh_w)
+
+DO_2OP(vrmulhsb, 1, int8_t, do_rmulh_b)
+DO_2OP(vrmulhsh, 2, int16_t, do_rmulh_h)
+DO_2OP(vrmulhsw, 4, int32_t, do_rmulh_w)
+DO_2OP(vrmulhub, 1, uint8_t, do_rmulh_b)
+DO_2OP(vrmulhuh, 2, uint16_t, do_rmulh_h)
+DO_2OP(vrmulhuw, 4, uint32_t, do_rmulh_w)
+
+#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
+#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
+
+DO_2OP_S(vmaxs, DO_MAX)
+DO_2OP_U(vmaxu, DO_MAX)
+DO_2OP_S(vmins, DO_MIN)
+DO_2OP_U(vminu, DO_MIN)
+
+#define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
+
+DO_2OP_S(vabds, DO_ABD)
+DO_2OP_U(vabdu, DO_ABD)
+
+static inline uint32_t do_vhadd_u(uint32_t n, uint32_t m)
+{
+ return ((uint64_t)n + m) >> 1;
+}
+
+static inline int32_t do_vhadd_s(int32_t n, int32_t m)
+{
+ return ((int64_t)n + m) >> 1;
+}
+
+static inline uint32_t do_vhsub_u(uint32_t n, uint32_t m)
+{
+ return ((uint64_t)n - m) >> 1;
+}
+
+static inline int32_t do_vhsub_s(int32_t n, int32_t m)
+{
+ return ((int64_t)n - m) >> 1;
+}
+
+DO_2OP_S(vhadds, do_vhadd_s)
+DO_2OP_U(vhaddu, do_vhadd_u)
+DO_2OP_S(vhsubs, do_vhsub_s)
+DO_2OP_U(vhsubu, do_vhsub_u)
+
+#define DO_VSHLS(N, M) do_sqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, false, NULL)
+#define DO_VSHLU(N, M) do_uqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, false, NULL)
+#define DO_VRSHLS(N, M) do_sqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, true, NULL)
+#define DO_VRSHLU(N, M) do_uqrshl_bhs(N, (int8_t)(M), sizeof(N) * 8, true, NULL)
+
+DO_2OP_S(vshls, DO_VSHLS)
+DO_2OP_U(vshlu, DO_VSHLU)
+DO_2OP_S(vrshls, DO_VRSHLS)
+DO_2OP_U(vrshlu, DO_VRSHLU)
+
+#define DO_RHADD_S(N, M) (((int64_t)(N) + (M) + 1) >> 1)
+#define DO_RHADD_U(N, M) (((uint64_t)(N) + (M) + 1) >> 1)
+
+DO_2OP_S(vrhadds, DO_RHADD_S)
+DO_2OP_U(vrhaddu, DO_RHADD_U)
+
+static void do_vadc(CPUARMState *env, uint32_t *d, uint32_t *n, uint32_t *m,
+ uint32_t inv, uint32_t carry_in, bool update_flags)
+{
+ uint16_t mask = mve_element_mask(env);
+ unsigned e;
+
+ /* If any additions trigger, we will update flags. */
+ if (mask & 0x1111) {
+ update_flags = true;
+ }
+
+ for (e = 0; e < 16 / 4; e++, mask >>= 4) {
+ uint64_t r = carry_in;
+ r += n[H4(e)];
+ r += m[H4(e)] ^ inv;
+ if (mask & 1) {
+ carry_in = r >> 32;
+ }
+ mergemask(&d[H4(e)], r, mask);
+ }
+
+ if (update_flags) {
+ /* Store C, clear NZV. */
+ env->vfp.xregs[ARM_VFP_FPSCR] &= ~FPCR_NZCV_MASK;
+ env->vfp.xregs[ARM_VFP_FPSCR] |= carry_in * FPCR_C;
+ }
+ mve_advance_vpt(env);
+}
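+
+/*
+ * Note how the carry chains only through active words above: carry_in
+ * is advanced only when the beat's predicate bit is set, so a masked
+ * word leaves the carry unchanged for the next word.  For VADC/VSBC
+ * the flags are still written back if at least one word was active
+ * (mask & 0x1111); for VADCI/VSBCI they are always written.
+ */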
+
+void HELPER(mve_vadc)(CPUARMState *env, void *vd, void *vn, void *vm)
+{
+ bool carry_in = env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_C;
+ do_vadc(env, vd, vn, vm, 0, carry_in, false);
+}
+
+void HELPER(mve_vsbc)(CPUARMState *env, void *vd, void *vn, void *vm)
+{
+ bool carry_in = env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_C;
+ do_vadc(env, vd, vn, vm, -1, carry_in, false);
+}
+
+
+void HELPER(mve_vadci)(CPUARMState *env, void *vd, void *vn, void *vm)
+{
+ do_vadc(env, vd, vn, vm, 0, 0, true);
+}
+
+void HELPER(mve_vsbci)(CPUARMState *env, void *vd, void *vn, void *vm)
+{
+ do_vadc(env, vd, vn, vm, -1, 1, true);
+}
+
+#define DO_VCADD(OP, ESIZE, TYPE, FN0, FN1) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, void *vm) \
+ { \
+ TYPE *d = vd, *n = vn, *m = vm; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ TYPE r[16 / ESIZE]; \
+ /* Calculate all results first to avoid overwriting inputs */ \
+ for (e = 0; e < 16 / ESIZE; e++) { \
+ if (!(e & 1)) { \
+ r[e] = FN0(n[H##ESIZE(e)], m[H##ESIZE(e + 1)]); \
+ } else { \
+ r[e] = FN1(n[H##ESIZE(e)], m[H##ESIZE(e - 1)]); \
+ } \
+ } \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ mergemask(&d[H##ESIZE(e)], r[e], mask); \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+#define DO_VCADD_ALL(OP, FN0, FN1) \
+ DO_VCADD(OP##b, 1, int8_t, FN0, FN1) \
+ DO_VCADD(OP##h, 2, int16_t, FN0, FN1) \
+ DO_VCADD(OP##w, 4, int32_t, FN0, FN1)
+
+DO_VCADD_ALL(vcadd90, DO_SUB, DO_ADD)
+DO_VCADD_ALL(vcadd270, DO_ADD, DO_SUB)
+DO_VCADD_ALL(vhcadd90, do_vhsub_s, do_vhadd_s)
+DO_VCADD_ALL(vhcadd270, do_vhadd_s, do_vhsub_s)
+
+static inline int32_t do_sat_bhw(int64_t val, int64_t min, int64_t max, bool *s)
+{
+ if (val > max) {
+ *s = true;
+ return max;
+ } else if (val < min) {
+ *s = true;
+ return min;
+ }
+ return val;
+}
+
+#define DO_SQADD_B(n, m, s) do_sat_bhw((int64_t)n + m, INT8_MIN, INT8_MAX, s)
+#define DO_SQADD_H(n, m, s) do_sat_bhw((int64_t)n + m, INT16_MIN, INT16_MAX, s)
+#define DO_SQADD_W(n, m, s) do_sat_bhw((int64_t)n + m, INT32_MIN, INT32_MAX, s)
+
+#define DO_UQADD_B(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT8_MAX, s)
+#define DO_UQADD_H(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT16_MAX, s)
+#define DO_UQADD_W(n, m, s) do_sat_bhw((int64_t)n + m, 0, UINT32_MAX, s)
+
+#define DO_SQSUB_B(n, m, s) do_sat_bhw((int64_t)n - m, INT8_MIN, INT8_MAX, s)
+#define DO_SQSUB_H(n, m, s) do_sat_bhw((int64_t)n - m, INT16_MIN, INT16_MAX, s)
+#define DO_SQSUB_W(n, m, s) do_sat_bhw((int64_t)n - m, INT32_MIN, INT32_MAX, s)
+
+#define DO_UQSUB_B(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT8_MAX, s)
+#define DO_UQSUB_H(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT16_MAX, s)
+#define DO_UQSUB_W(n, m, s) do_sat_bhw((int64_t)n - m, 0, UINT32_MAX, s)
+
+/*
+ * For QDMULH and QRDMULH we simplify "double and shift by esize" into
+ * "shift by esize-1", adjusting the QRDMULH rounding constant to match.
+ */
+#define DO_QDMULH_B(n, m, s) do_sat_bhw(((int64_t)n * m) >> 7, \
+ INT8_MIN, INT8_MAX, s)
+#define DO_QDMULH_H(n, m, s) do_sat_bhw(((int64_t)n * m) >> 15, \
+ INT16_MIN, INT16_MAX, s)
+#define DO_QDMULH_W(n, m, s) do_sat_bhw(((int64_t)n * m) >> 31, \
+ INT32_MIN, INT32_MAX, s)
+
+#define DO_QRDMULH_B(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 6)) >> 7, \
+ INT8_MIN, INT8_MAX, s)
+#define DO_QRDMULH_H(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 14)) >> 15, \
+ INT16_MIN, INT16_MAX, s)
+#define DO_QRDMULH_W(n, m, s) do_sat_bhw(((int64_t)n * m + (1 << 30)) >> 31, \
+ INT32_MIN, INT32_MAX, s)
+
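+/*
+ * Example: 16-bit VQDMULH of 0x4000 with itself is (0x4000 * 0x4000) >> 15
+ * = 0x2000, while VQDMULH of 0x8000 (INT16_MIN) with itself overflows
+ * INT16_MAX and so saturates to 0x7fff, setting QC.
+ */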
+DO_2OP_SAT(vqdmulhb, 1, int8_t, DO_QDMULH_B)
+DO_2OP_SAT(vqdmulhh, 2, int16_t, DO_QDMULH_H)
+DO_2OP_SAT(vqdmulhw, 4, int32_t, DO_QDMULH_W)
+
+DO_2OP_SAT(vqrdmulhb, 1, int8_t, DO_QRDMULH_B)
+DO_2OP_SAT(vqrdmulhh, 2, int16_t, DO_QRDMULH_H)
+DO_2OP_SAT(vqrdmulhw, 4, int32_t, DO_QRDMULH_W)
+
+DO_2OP_SAT(vqaddub, 1, uint8_t, DO_UQADD_B)
+DO_2OP_SAT(vqadduh, 2, uint16_t, DO_UQADD_H)
+DO_2OP_SAT(vqadduw, 4, uint32_t, DO_UQADD_W)
+DO_2OP_SAT(vqaddsb, 1, int8_t, DO_SQADD_B)
+DO_2OP_SAT(vqaddsh, 2, int16_t, DO_SQADD_H)
+DO_2OP_SAT(vqaddsw, 4, int32_t, DO_SQADD_W)
+
+DO_2OP_SAT(vqsubub, 1, uint8_t, DO_UQSUB_B)
+DO_2OP_SAT(vqsubuh, 2, uint16_t, DO_UQSUB_H)
+DO_2OP_SAT(vqsubuw, 4, uint32_t, DO_UQSUB_W)
+DO_2OP_SAT(vqsubsb, 1, int8_t, DO_SQSUB_B)
+DO_2OP_SAT(vqsubsh, 2, int16_t, DO_SQSUB_H)
+DO_2OP_SAT(vqsubsw, 4, int32_t, DO_SQSUB_W)
+
+/*
+ * This wrapper fixes up the impedance mismatch between do_sqrshl_bhs()
+ * and friends wanting a uint32_t* sat and our needing a bool*.
+ */
+#define WRAP_QRSHL_HELPER(FN, N, M, ROUND, satp) \
+ ({ \
+ uint32_t su32 = 0; \
+ typeof(N) r = FN(N, (int8_t)(M), sizeof(N) * 8, ROUND, &su32); \
+ if (su32) { \
+ *satp = true; \
+ } \
+ r; \
+ })
+
+#define DO_SQSHL_OP(N, M, satp) \
+ WRAP_QRSHL_HELPER(do_sqrshl_bhs, N, M, false, satp)
+#define DO_UQSHL_OP(N, M, satp) \
+ WRAP_QRSHL_HELPER(do_uqrshl_bhs, N, M, false, satp)
+#define DO_SQRSHL_OP(N, M, satp) \
+ WRAP_QRSHL_HELPER(do_sqrshl_bhs, N, M, true, satp)
+#define DO_UQRSHL_OP(N, M, satp) \
+ WRAP_QRSHL_HELPER(do_uqrshl_bhs, N, M, true, satp)
+#define DO_SUQSHL_OP(N, M, satp) \
+ WRAP_QRSHL_HELPER(do_suqrshl_bhs, N, M, false, satp)
+
+DO_2OP_SAT_S(vqshls, DO_SQSHL_OP)
+DO_2OP_SAT_U(vqshlu, DO_UQSHL_OP)
+DO_2OP_SAT_S(vqrshls, DO_SQRSHL_OP)
+DO_2OP_SAT_U(vqrshlu, DO_UQRSHL_OP)
+
+/*
+ * Multiply add dual returning high half
+ * The 'FN' here takes four inputs A, B, C, D, a 0/1 indicator of
+ * whether to add the rounding constant, and the pointer to the
+ * saturation flag, and should do "(A * B + C * D) * 2 + rounding constant",
+ * saturate to twice the input size and return the high half; or
+ * (A * B - C * D) etc for VQDMLSDH.
+ */
+#define DO_VQDMLADH_OP(OP, ESIZE, TYPE, XCHG, ROUND, FN) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \
+ void *vm) \
+ { \
+ TYPE *d = vd, *n = vn, *m = vm; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ bool qc = false; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ bool sat = false; \
+ if ((e & 1) == XCHG) { \
+ TYPE r = FN(n[H##ESIZE(e)], \
+ m[H##ESIZE(e - XCHG)], \
+ n[H##ESIZE(e + (1 - 2 * XCHG))], \
+ m[H##ESIZE(e + (1 - XCHG))], \
+ ROUND, &sat); \
+ mergemask(&d[H##ESIZE(e)], r, mask); \
+ qc |= sat & mask & 1; \
+ } \
+ } \
+ if (qc) { \
+ env->vfp.qc[0] = qc; \
+ } \
+ mve_advance_vpt(env); \
+ }
+
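+/*
+ * For the byte and halfword cases the whole calculation fits in 64 bits:
+ * compute the doubled sum of products plus rounding, saturate to twice the
+ * element width, and return the high half by shifting. Only the word case
+ * (below) needs the overflow-checked 64-bit additions.
+ */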
+static int8_t do_vqdmladh_b(int8_t a, int8_t b, int8_t c, int8_t d,
+ int round, bool *sat)
+{
+ int64_t r = ((int64_t)a * b + (int64_t)c * d) * 2 + (round << 7);
+ return do_sat_bhw(r, INT16_MIN, INT16_MAX, sat) >> 8;
+}
+
+static int16_t do_vqdmladh_h(int16_t a, int16_t b, int16_t c, int16_t d,
+ int round, bool *sat)
+{
+ int64_t r = ((int64_t)a * b + (int64_t)c * d) * 2 + (round << 15);
+ return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat) >> 16;
+}
+
+static int32_t do_vqdmladh_w(int32_t a, int32_t b, int32_t c, int32_t d,
+ int round, bool *sat)
+{
+ int64_t m1 = (int64_t)a * b;
+ int64_t m2 = (int64_t)c * d;
+ int64_t r;
+ /*
+ * Architecturally we should do the entire add, double, round
+ * and then check for saturation. We do three saturating adds,
+ * but we need to be careful about the order. If the first
+ * m1 + m2 saturates then it's impossible for the *2+rc to
+ * bring it back into the non-saturated range. However, if
+ * m1 + m2 is negative then it's possible that doing the doubling
+ * would take the intermediate result below INT64_MIN and the
+ * addition of the rounding constant then brings it back in range.
+ * So we add half the rounding constant before doubling rather
+ * than adding the rounding constant after the doubling.
+ */
+ if (sadd64_overflow(m1, m2, &r) ||
+ sadd64_overflow(r, (round << 30), &r) ||
+ sadd64_overflow(r, r, &r)) {
+ *sat = true;
+ return r < 0 ? INT32_MAX : INT32_MIN;
+ }
+ return r >> 32;
+}
+
+static int8_t do_vqdmlsdh_b(int8_t a, int8_t b, int8_t c, int8_t d,
+ int round, bool *sat)
+{
+ int64_t r = ((int64_t)a * b - (int64_t)c * d) * 2 + (round << 7);
+ return do_sat_bhw(r, INT16_MIN, INT16_MAX, sat) >> 8;
+}
+
+static int16_t do_vqdmlsdh_h(int16_t a, int16_t b, int16_t c, int16_t d,
+ int round, bool *sat)
+{
+ int64_t r = ((int64_t)a * b - (int64_t)c * d) * 2 + (round << 15);
+ return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat) >> 16;
+}
+
+static int32_t do_vqdmlsdh_w(int32_t a, int32_t b, int32_t c, int32_t d,
+ int round, bool *sat)
+{
+ int64_t m1 = (int64_t)a * b;
+ int64_t m2 = (int64_t)c * d;
+ int64_t r;
+ /* The same ordering issue as in do_vqdmladh_w applies here too */
+ if (ssub64_overflow(m1, m2, &r) ||
+ sadd64_overflow(r, (round << 30), &r) ||
+ sadd64_overflow(r, r, &r)) {
+ *sat = true;
+ return r < 0 ? INT32_MAX : INT32_MIN;
+ }
+ return r >> 32;
+}
+
+DO_VQDMLADH_OP(vqdmladhb, 1, int8_t, 0, 0, do_vqdmladh_b)
+DO_VQDMLADH_OP(vqdmladhh, 2, int16_t, 0, 0, do_vqdmladh_h)
+DO_VQDMLADH_OP(vqdmladhw, 4, int32_t, 0, 0, do_vqdmladh_w)
+DO_VQDMLADH_OP(vqdmladhxb, 1, int8_t, 1, 0, do_vqdmladh_b)
+DO_VQDMLADH_OP(vqdmladhxh, 2, int16_t, 1, 0, do_vqdmladh_h)
+DO_VQDMLADH_OP(vqdmladhxw, 4, int32_t, 1, 0, do_vqdmladh_w)
+
+DO_VQDMLADH_OP(vqrdmladhb, 1, int8_t, 0, 1, do_vqdmladh_b)
+DO_VQDMLADH_OP(vqrdmladhh, 2, int16_t, 0, 1, do_vqdmladh_h)
+DO_VQDMLADH_OP(vqrdmladhw, 4, int32_t, 0, 1, do_vqdmladh_w)
+DO_VQDMLADH_OP(vqrdmladhxb, 1, int8_t, 1, 1, do_vqdmladh_b)
+DO_VQDMLADH_OP(vqrdmladhxh, 2, int16_t, 1, 1, do_vqdmladh_h)
+DO_VQDMLADH_OP(vqrdmladhxw, 4, int32_t, 1, 1, do_vqdmladh_w)
+
+DO_VQDMLADH_OP(vqdmlsdhb, 1, int8_t, 0, 0, do_vqdmlsdh_b)
+DO_VQDMLADH_OP(vqdmlsdhh, 2, int16_t, 0, 0, do_vqdmlsdh_h)
+DO_VQDMLADH_OP(vqdmlsdhw, 4, int32_t, 0, 0, do_vqdmlsdh_w)
+DO_VQDMLADH_OP(vqdmlsdhxb, 1, int8_t, 1, 0, do_vqdmlsdh_b)
+DO_VQDMLADH_OP(vqdmlsdhxh, 2, int16_t, 1, 0, do_vqdmlsdh_h)
+DO_VQDMLADH_OP(vqdmlsdhxw, 4, int32_t, 1, 0, do_vqdmlsdh_w)
+
+DO_VQDMLADH_OP(vqrdmlsdhb, 1, int8_t, 0, 1, do_vqdmlsdh_b)
+DO_VQDMLADH_OP(vqrdmlsdhh, 2, int16_t, 0, 1, do_vqdmlsdh_h)
+DO_VQDMLADH_OP(vqrdmlsdhw, 4, int32_t, 0, 1, do_vqdmlsdh_w)
+DO_VQDMLADH_OP(vqrdmlsdhxb, 1, int8_t, 1, 1, do_vqdmlsdh_b)
+DO_VQDMLADH_OP(vqrdmlsdhxh, 2, int16_t, 1, 1, do_vqdmlsdh_h)
+DO_VQDMLADH_OP(vqrdmlsdhxw, 4, int32_t, 1, 1, do_vqdmlsdh_w)
+
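+/*
+ * 2-operand vector-by-scalar operations: the scalar comes from a
+ * general-purpose register and is truncated to the element type, then
+ * used as the second operand for every active lane.
+ */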
+#define DO_2OP_SCALAR(OP, ESIZE, TYPE, FN) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \
+ uint32_t rm) \
+ { \
+ TYPE *d = vd, *n = vn; \
+ TYPE m = rm; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ mergemask(&d[H##ESIZE(e)], FN(n[H##ESIZE(e)], m), mask); \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+#define DO_2OP_SAT_SCALAR(OP, ESIZE, TYPE, FN) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \
+ uint32_t rm) \
+ { \
+ TYPE *d = vd, *n = vn; \
+ TYPE m = rm; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ bool qc = false; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ bool sat = false; \
+ mergemask(&d[H##ESIZE(e)], FN(n[H##ESIZE(e)], m, &sat), \
+ mask); \
+ qc |= sat & mask & 1; \
+ } \
+ if (qc) { \
+ env->vfp.qc[0] = qc; \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+/* "accumulating" version where FN takes d as well as n and m */
+#define DO_2OP_ACC_SCALAR(OP, ESIZE, TYPE, FN) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \
+ uint32_t rm) \
+ { \
+ TYPE *d = vd, *n = vn; \
+ TYPE m = rm; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ mergemask(&d[H##ESIZE(e)], \
+ FN(d[H##ESIZE(e)], n[H##ESIZE(e)], m), mask); \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+#define DO_2OP_SAT_ACC_SCALAR(OP, ESIZE, TYPE, FN) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \
+ uint32_t rm) \
+ { \
+ TYPE *d = vd, *n = vn; \
+ TYPE m = rm; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ bool qc = false; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ bool sat = false; \
+ mergemask(&d[H##ESIZE(e)], \
+ FN(d[H##ESIZE(e)], n[H##ESIZE(e)], m, &sat), \
+ mask); \
+ qc |= sat & mask & 1; \
+ } \
+ if (qc) { \
+ env->vfp.qc[0] = qc; \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+/* provide unsigned and signed 2-op scalar helpers for all sizes */
+#define DO_2OP_SCALAR_U(OP, FN) \
+ DO_2OP_SCALAR(OP##b, 1, uint8_t, FN) \
+ DO_2OP_SCALAR(OP##h, 2, uint16_t, FN) \
+ DO_2OP_SCALAR(OP##w, 4, uint32_t, FN)
+#define DO_2OP_SCALAR_S(OP, FN) \
+ DO_2OP_SCALAR(OP##b, 1, int8_t, FN) \
+ DO_2OP_SCALAR(OP##h, 2, int16_t, FN) \
+ DO_2OP_SCALAR(OP##w, 4, int32_t, FN)
+
+#define DO_2OP_ACC_SCALAR_U(OP, FN) \
+ DO_2OP_ACC_SCALAR(OP##b, 1, uint8_t, FN) \
+ DO_2OP_ACC_SCALAR(OP##h, 2, uint16_t, FN) \
+ DO_2OP_ACC_SCALAR(OP##w, 4, uint32_t, FN)
+
+DO_2OP_SCALAR_U(vadd_scalar, DO_ADD)
+DO_2OP_SCALAR_U(vsub_scalar, DO_SUB)
+DO_2OP_SCALAR_U(vmul_scalar, DO_MUL)
+DO_2OP_SCALAR_S(vhadds_scalar, do_vhadd_s)
+DO_2OP_SCALAR_U(vhaddu_scalar, do_vhadd_u)
+DO_2OP_SCALAR_S(vhsubs_scalar, do_vhsub_s)
+DO_2OP_SCALAR_U(vhsubu_scalar, do_vhsub_u)
+
+DO_2OP_SAT_SCALAR(vqaddu_scalarb, 1, uint8_t, DO_UQADD_B)
+DO_2OP_SAT_SCALAR(vqaddu_scalarh, 2, uint16_t, DO_UQADD_H)
+DO_2OP_SAT_SCALAR(vqaddu_scalarw, 4, uint32_t, DO_UQADD_W)
+DO_2OP_SAT_SCALAR(vqadds_scalarb, 1, int8_t, DO_SQADD_B)
+DO_2OP_SAT_SCALAR(vqadds_scalarh, 2, int16_t, DO_SQADD_H)
+DO_2OP_SAT_SCALAR(vqadds_scalarw, 4, int32_t, DO_SQADD_W)
+
+DO_2OP_SAT_SCALAR(vqsubu_scalarb, 1, uint8_t, DO_UQSUB_B)
+DO_2OP_SAT_SCALAR(vqsubu_scalarh, 2, uint16_t, DO_UQSUB_H)
+DO_2OP_SAT_SCALAR(vqsubu_scalarw, 4, uint32_t, DO_UQSUB_W)
+DO_2OP_SAT_SCALAR(vqsubs_scalarb, 1, int8_t, DO_SQSUB_B)
+DO_2OP_SAT_SCALAR(vqsubs_scalarh, 2, int16_t, DO_SQSUB_H)
+DO_2OP_SAT_SCALAR(vqsubs_scalarw, 4, int32_t, DO_SQSUB_W)
+
+DO_2OP_SAT_SCALAR(vqdmulh_scalarb, 1, int8_t, DO_QDMULH_B)
+DO_2OP_SAT_SCALAR(vqdmulh_scalarh, 2, int16_t, DO_QDMULH_H)
+DO_2OP_SAT_SCALAR(vqdmulh_scalarw, 4, int32_t, DO_QDMULH_W)
+DO_2OP_SAT_SCALAR(vqrdmulh_scalarb, 1, int8_t, DO_QRDMULH_B)
+DO_2OP_SAT_SCALAR(vqrdmulh_scalarh, 2, int16_t, DO_QRDMULH_H)
+DO_2OP_SAT_SCALAR(vqrdmulh_scalarw, 4, int32_t, DO_QRDMULH_W)
+
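+/*
+ * VQDMLAH-style helpers: compute 2 * a * b + (c << esize) + rounding,
+ * saturate to twice the element width, and return the high half, so the
+ * accumulator c lines up with the high half of the doubled-width product.
+ */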
+static int8_t do_vqdmlah_b(int8_t a, int8_t b, int8_t c, int round, bool *sat)
+{
+ int64_t r = (int64_t)a * b * 2 + ((int64_t)c << 8) + (round << 7);
+ return do_sat_bhw(r, INT16_MIN, INT16_MAX, sat) >> 8;
+}
+
+static int16_t do_vqdmlah_h(int16_t a, int16_t b, int16_t c,
+ int round, bool *sat)
+{
+ int64_t r = (int64_t)a * b * 2 + ((int64_t)c << 16) + (round << 15);
+ return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat) >> 16;
+}
+
+static int32_t do_vqdmlah_w(int32_t a, int32_t b, int32_t c,
+ int round, bool *sat)
+{
+ /*
+ * Architecturally we should do the entire add, double, round
+ * and then check for saturation. We do three saturating adds,
+ * but we need to be careful about the order. If the first
+ * m1 + m2 saturates then it's impossible for the *2+rc to
+ * bring it back into the non-saturated range. However, if
+ * m1 + m2 is negative then it's possible that doing the doubling
+ * would take the intermediate result below INT64_MIN and the
+ * addition of the rounding constant then brings it back in range.
+ * So we add half the rounding constant and half the "c << esize"
+ * before doubling rather than adding the rounding constant after
+ * the doubling.
+ */
+ int64_t m1 = (int64_t)a * b;
+ int64_t m2 = (int64_t)c << 31;
+ int64_t r;
+ if (sadd64_overflow(m1, m2, &r) ||
+ sadd64_overflow(r, (round << 30), &r) ||
+ sadd64_overflow(r, r, &r)) {
+ *sat = true;
+ return r < 0 ? INT32_MAX : INT32_MIN;
+ }
+ return r >> 32;
+}
+
+/*
+ * The *MLAH insns are vector * scalar + vector;
+ * the *MLASH insns are vector * vector + scalar
+ */
+#define DO_VQDMLAH_B(D, N, M, S) do_vqdmlah_b(N, M, D, 0, S)
+#define DO_VQDMLAH_H(D, N, M, S) do_vqdmlah_h(N, M, D, 0, S)
+#define DO_VQDMLAH_W(D, N, M, S) do_vqdmlah_w(N, M, D, 0, S)
+#define DO_VQRDMLAH_B(D, N, M, S) do_vqdmlah_b(N, M, D, 1, S)
+#define DO_VQRDMLAH_H(D, N, M, S) do_vqdmlah_h(N, M, D, 1, S)
+#define DO_VQRDMLAH_W(D, N, M, S) do_vqdmlah_w(N, M, D, 1, S)
+
+#define DO_VQDMLASH_B(D, N, M, S) do_vqdmlah_b(N, D, M, 0, S)
+#define DO_VQDMLASH_H(D, N, M, S) do_vqdmlah_h(N, D, M, 0, S)
+#define DO_VQDMLASH_W(D, N, M, S) do_vqdmlah_w(N, D, M, 0, S)
+#define DO_VQRDMLASH_B(D, N, M, S) do_vqdmlah_b(N, D, M, 1, S)
+#define DO_VQRDMLASH_H(D, N, M, S) do_vqdmlah_h(N, D, M, 1, S)
+#define DO_VQRDMLASH_W(D, N, M, S) do_vqdmlah_w(N, D, M, 1, S)
+
+DO_2OP_SAT_ACC_SCALAR(vqdmlahb, 1, int8_t, DO_VQDMLAH_B)
+DO_2OP_SAT_ACC_SCALAR(vqdmlahh, 2, int16_t, DO_VQDMLAH_H)
+DO_2OP_SAT_ACC_SCALAR(vqdmlahw, 4, int32_t, DO_VQDMLAH_W)
+DO_2OP_SAT_ACC_SCALAR(vqrdmlahb, 1, int8_t, DO_VQRDMLAH_B)
+DO_2OP_SAT_ACC_SCALAR(vqrdmlahh, 2, int16_t, DO_VQRDMLAH_H)
+DO_2OP_SAT_ACC_SCALAR(vqrdmlahw, 4, int32_t, DO_VQRDMLAH_W)
+
+DO_2OP_SAT_ACC_SCALAR(vqdmlashb, 1, int8_t, DO_VQDMLASH_B)
+DO_2OP_SAT_ACC_SCALAR(vqdmlashh, 2, int16_t, DO_VQDMLASH_H)
+DO_2OP_SAT_ACC_SCALAR(vqdmlashw, 4, int32_t, DO_VQDMLASH_W)
+DO_2OP_SAT_ACC_SCALAR(vqrdmlashb, 1, int8_t, DO_VQRDMLASH_B)
+DO_2OP_SAT_ACC_SCALAR(vqrdmlashh, 2, int16_t, DO_VQRDMLASH_H)
+DO_2OP_SAT_ACC_SCALAR(vqrdmlashw, 4, int32_t, DO_VQRDMLASH_W)
+
+/* Vector by scalar plus vector */
+#define DO_VMLA(D, N, M) ((N) * (M) + (D))
+
+DO_2OP_ACC_SCALAR_U(vmla, DO_VMLA)
+
+/* Vector by vector plus scalar */
+#define DO_VMLAS(D, N, M) ((N) * (D) + (M))
+
+DO_2OP_ACC_SCALAR_U(vmlas, DO_VMLAS)
+
+/*
+ * Long saturating scalar ops. As with DO_2OP_L, TYPE and H are for the
+ * input (smaller) type and LESIZE, LTYPE, LH for the output (long) type.
+ * SATMASK specifies which bits of the predicate mask matter for determining
+ * whether to propagate a saturation indication into FPSCR.QC -- for
+ * the 16x16->32 case we must check only the bit corresponding to the T or B
+ * half that we used, but for the 32x32->64 case we propagate if the mask
+ * bit is set for either half.
+ */
+#define DO_2OP_SAT_SCALAR_L(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN, SATMASK) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \
+ uint32_t rm) \
+ { \
+ LTYPE *d = vd; \
+ TYPE *n = vn; \
+ TYPE m = rm; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned le; \
+ bool qc = false; \
+ for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \
+ bool sat = false; \
+ LTYPE r = FN((LTYPE)n[H##ESIZE(le * 2 + TOP)], m, &sat); \
+ mergemask(&d[H##LESIZE(le)], r, mask); \
+ qc |= sat && (mask & SATMASK); \
+ } \
+ if (qc) { \
+ env->vfp.qc[0] = qc; \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+static inline int32_t do_qdmullh(int16_t n, int16_t m, bool *sat)
+{
+ int64_t r = ((int64_t)n * m) * 2;
+ return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat);
+}
+
+static inline int64_t do_qdmullw(int32_t n, int32_t m, bool *sat)
+{
+ /* The multiply can't overflow, but the doubling might */
+ int64_t r = (int64_t)n * m;
+ if (r > INT64_MAX / 2) {
+ *sat = true;
+ return INT64_MAX;
+ } else if (r < INT64_MIN / 2) {
+ *sat = true;
+ return INT64_MIN;
+ } else {
+ return r * 2;
+ }
+}
+
+#define SATMASK16B 1
+#define SATMASK16T (1 << 2)
+#define SATMASK32 ((1 << 4) | 1)
+
+DO_2OP_SAT_SCALAR_L(vqdmullb_scalarh, 0, 2, int16_t, 4, int32_t, \
+ do_qdmullh, SATMASK16B)
+DO_2OP_SAT_SCALAR_L(vqdmullb_scalarw, 0, 4, int32_t, 8, int64_t, \
+ do_qdmullw, SATMASK32)
+DO_2OP_SAT_SCALAR_L(vqdmullt_scalarh, 1, 2, int16_t, 4, int32_t, \
+ do_qdmullh, SATMASK16T)
+DO_2OP_SAT_SCALAR_L(vqdmullt_scalarw, 1, 4, int32_t, 8, int64_t, \
+ do_qdmullw, SATMASK32)
+
+/*
+ * Long saturating ops
+ */
+#define DO_2OP_SAT_L(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN, SATMASK) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \
+ void *vm) \
+ { \
+ LTYPE *d = vd; \
+ TYPE *n = vn, *m = vm; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned le; \
+ bool qc = false; \
+ for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \
+ bool sat = false; \
+ LTYPE op1 = n[H##ESIZE(le * 2 + TOP)]; \
+ LTYPE op2 = m[H##ESIZE(le * 2 + TOP)]; \
+ mergemask(&d[H##LESIZE(le)], FN(op1, op2, &sat), mask); \
+ qc |= sat && (mask & SATMASK); \
+ } \
+ if (qc) { \
+ env->vfp.qc[0] = qc; \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+DO_2OP_SAT_L(vqdmullbh, 0, 2, int16_t, 4, int32_t, do_qdmullh, SATMASK16B)
+DO_2OP_SAT_L(vqdmullbw, 0, 4, int32_t, 8, int64_t, do_qdmullw, SATMASK32)
+DO_2OP_SAT_L(vqdmullth, 1, 2, int16_t, 4, int32_t, do_qdmullh, SATMASK16T)
+DO_2OP_SAT_L(vqdmulltw, 1, 4, int32_t, 8, int64_t, do_qdmullw, SATMASK32)
+
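+/*
+ * VBRSR: reverse the low 'm' bits of each element and zero the rest,
+ * where m is the low byte of the scalar operand; m == 0 produces 0 and
+ * m >= the element size reverses the whole element.
+ */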
+static inline uint32_t do_vbrsrb(uint32_t n, uint32_t m)
+{
+ m &= 0xff;
+ if (m == 0) {
+ return 0;
+ }
+ n = revbit8(n);
+ if (m < 8) {
+ n >>= 8 - m;
+ }
+ return n;
+}
+
+static inline uint32_t do_vbrsrh(uint32_t n, uint32_t m)
+{
+ m &= 0xff;
+ if (m == 0) {
+ return 0;
+ }
+ n = revbit16(n);
+ if (m < 16) {
+ n >>= 16 - m;
+ }
+ return n;
+}
+
+static inline uint32_t do_vbrsrw(uint32_t n, uint32_t m)
+{
+ m &= 0xff;
+ if (m == 0) {
+ return 0;
+ }
+ n = revbit32(n);
+ if (m < 32) {
+ n >>= 32 - m;
+ }
+ return n;
+}
+
+DO_2OP_SCALAR(vbrsrb, 1, uint8_t, do_vbrsrb)
+DO_2OP_SCALAR(vbrsrh, 2, uint16_t, do_vbrsrh)
+DO_2OP_SCALAR(vbrsrw, 4, uint32_t, do_vbrsrw)
+
+/*
+ * Multiply add long dual accumulate ops.
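+ * EVENACC and ODDACC are the operators (+= or -=) used to accumulate the
+ * products of even- and odd-numbered elements, and XCHG selects the
+ * "exchange" forms, which pair each element of M with the other element
+ * of its N pair.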
+ */
+#define DO_LDAV(OP, ESIZE, TYPE, XCHG, EVENACC, ODDACC) \
+ uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \
+ void *vm, uint64_t a) \
+ { \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ TYPE *n = vn, *m = vm; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ if (mask & 1) { \
+ if (e & 1) { \
+ a ODDACC \
+ (int64_t)n[H##ESIZE(e - 1 * XCHG)] * m[H##ESIZE(e)]; \
+ } else { \
+ a EVENACC \
+ (int64_t)n[H##ESIZE(e + 1 * XCHG)] * m[H##ESIZE(e)]; \
+ } \
+ } \
+ } \
+ mve_advance_vpt(env); \
+ return a; \
+ }
+
+DO_LDAV(vmlaldavsh, 2, int16_t, false, +=, +=)
+DO_LDAV(vmlaldavxsh, 2, int16_t, true, +=, +=)
+DO_LDAV(vmlaldavsw, 4, int32_t, false, +=, +=)
+DO_LDAV(vmlaldavxsw, 4, int32_t, true, +=, +=)
+
+DO_LDAV(vmlaldavuh, 2, uint16_t, false, +=, +=)
+DO_LDAV(vmlaldavuw, 4, uint32_t, false, +=, +=)
+
+DO_LDAV(vmlsldavsh, 2, int16_t, false, +=, -=)
+DO_LDAV(vmlsldavxsh, 2, int16_t, true, +=, -=)
+DO_LDAV(vmlsldavsw, 4, int32_t, false, +=, -=)
+DO_LDAV(vmlsldavxsw, 4, int32_t, true, +=, -=)
+
+/*
+ * Multiply add dual accumulate ops
+ */
+#define DO_DAV(OP, ESIZE, TYPE, XCHG, EVENACC, ODDACC) \
+ uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \
+ void *vm, uint32_t a) \
+ { \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ TYPE *n = vn, *m = vm; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ if (mask & 1) { \
+ if (e & 1) { \
+ a ODDACC \
+ n[H##ESIZE(e - 1 * XCHG)] * m[H##ESIZE(e)]; \
+ } else { \
+ a EVENACC \
+ n[H##ESIZE(e + 1 * XCHG)] * m[H##ESIZE(e)]; \
+ } \
+ } \
+ } \
+ mve_advance_vpt(env); \
+ return a; \
+ }
+
+#define DO_DAV_S(INSN, XCHG, EVENACC, ODDACC) \
+ DO_DAV(INSN##b, 1, int8_t, XCHG, EVENACC, ODDACC) \
+ DO_DAV(INSN##h, 2, int16_t, XCHG, EVENACC, ODDACC) \
+ DO_DAV(INSN##w, 4, int32_t, XCHG, EVENACC, ODDACC)
+
+#define DO_DAV_U(INSN, XCHG, EVENACC, ODDACC) \
+ DO_DAV(INSN##b, 1, uint8_t, XCHG, EVENACC, ODDACC) \
+ DO_DAV(INSN##h, 2, uint16_t, XCHG, EVENACC, ODDACC) \
+ DO_DAV(INSN##w, 4, uint32_t, XCHG, EVENACC, ODDACC)
+
+DO_DAV_S(vmladavs, false, +=, +=)
+DO_DAV_U(vmladavu, false, +=, +=)
+DO_DAV_S(vmlsdav, false, +=, -=)
+DO_DAV_S(vmladavsx, true, +=, +=)
+DO_DAV_S(vmlsdavx, true, +=, -=)
+
+/*
+ * Rounding multiply add long dual accumulate high. In the pseudocode
+ * this is implemented with a 72-bit internal accumulator value of which
+ * the top 64 bits are returned. We optimize this to avoid having to
+ * use 128-bit arithmetic -- we can do this because the 72-bit accumulator
+ * is squashed back into 64-bits after each beat.
+ */
+#define DO_LDAVH(OP, TYPE, LTYPE, XCHG, SUB) \
+ uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \
+ void *vm, uint64_t a) \
+ { \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ TYPE *n = vn, *m = vm; \
+ for (e = 0; e < 16 / 4; e++, mask >>= 4) { \
+ if (mask & 1) { \
+ LTYPE mul; \
+ if (e & 1) { \
+ mul = (LTYPE)n[H4(e - 1 * XCHG)] * m[H4(e)]; \
+ if (SUB) { \
+ mul = -mul; \
+ } \
+ } else { \
+ mul = (LTYPE)n[H4(e + 1 * XCHG)] * m[H4(e)]; \
+ } \
+ mul = (mul >> 8) + ((mul >> 7) & 1); \
+ a += mul; \
+ } \
+ } \
+ mve_advance_vpt(env); \
+ return a; \
+ }
+
+DO_LDAVH(vrmlaldavhsw, int32_t, int64_t, false, false)
+DO_LDAVH(vrmlaldavhxsw, int32_t, int64_t, true, false)
+
+DO_LDAVH(vrmlaldavhuw, uint32_t, uint64_t, false, false)
+
+DO_LDAVH(vrmlsldavhsw, int32_t, int64_t, false, true)
+DO_LDAVH(vrmlsldavhxsw, int32_t, int64_t, true, true)
+
+/* Vector add across vector */
+#define DO_VADDV(OP, ESIZE, TYPE) \
+ uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \
+ uint32_t ra) \
+ { \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ TYPE *m = vm; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ if (mask & 1) { \
+ ra += m[H##ESIZE(e)]; \
+ } \
+ } \
+ mve_advance_vpt(env); \
+ return ra; \
+ } \
+
+DO_VADDV(vaddvsb, 1, int8_t)
+DO_VADDV(vaddvsh, 2, int16_t)
+DO_VADDV(vaddvsw, 4, int32_t)
+DO_VADDV(vaddvub, 1, uint8_t)
+DO_VADDV(vaddvuh, 2, uint16_t)
+DO_VADDV(vaddvuw, 4, uint32_t)
+
+/*
+ * Vector max/min across vector. Unlike VADDV, we must
+ * treat the incoming ra value as being only element-sized, not its
+ * full 32-bit width.
+ * We work with int64_t internally for simplicity.
+ */
+#define DO_VMAXMINV(OP, ESIZE, TYPE, RATYPE, FN) \
+ uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \
+ uint32_t ra_in) \
+ { \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ TYPE *m = vm; \
+ int64_t ra = (RATYPE)ra_in; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ if (mask & 1) { \
+ ra = FN(ra, m[H##ESIZE(e)]); \
+ } \
+ } \
+ mve_advance_vpt(env); \
+ return ra; \
+ } \
+
+#define DO_VMAXMINV_U(INSN, FN) \
+ DO_VMAXMINV(INSN##b, 1, uint8_t, uint8_t, FN) \
+ DO_VMAXMINV(INSN##h, 2, uint16_t, uint16_t, FN) \
+ DO_VMAXMINV(INSN##w, 4, uint32_t, uint32_t, FN)
+#define DO_VMAXMINV_S(INSN, FN) \
+ DO_VMAXMINV(INSN##b, 1, int8_t, int8_t, FN) \
+ DO_VMAXMINV(INSN##h, 2, int16_t, int16_t, FN) \
+ DO_VMAXMINV(INSN##w, 4, int32_t, int32_t, FN)
+
+/*
+ * Helpers for max and min of absolute values across vector:
+ * note that we only take the absolute value of 'm', not 'n'
+ */
+static int64_t do_maxa(int64_t n, int64_t m)
+{
+ if (m < 0) {
+ m = -m;
+ }
+ return MAX(n, m);
+}
+
+static int64_t do_mina(int64_t n, int64_t m)
+{
+ if (m < 0) {
+ m = -m;
+ }
+ return MIN(n, m);
+}
+
+DO_VMAXMINV_S(vmaxvs, DO_MAX)
+DO_VMAXMINV_U(vmaxvu, DO_MAX)
+DO_VMAXMINV_S(vminvs, DO_MIN)
+DO_VMAXMINV_U(vminvu, DO_MIN)
+/*
+ * VMAXAV, VMINAV treat the general purpose input as unsigned
+ * and the vector elements as signed.
+ */
+DO_VMAXMINV(vmaxavb, 1, int8_t, uint8_t, do_maxa)
+DO_VMAXMINV(vmaxavh, 2, int16_t, uint16_t, do_maxa)
+DO_VMAXMINV(vmaxavw, 4, int32_t, uint32_t, do_maxa)
+DO_VMAXMINV(vminavb, 1, int8_t, uint8_t, do_mina)
+DO_VMAXMINV(vminavh, 2, int16_t, uint16_t, do_mina)
+DO_VMAXMINV(vminavw, 4, int32_t, uint32_t, do_mina)
+
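+/*
+ * VABAV: accumulate the absolute differences of the two vectors into a
+ * general-purpose register. The operands are widened to int64_t so the
+ * subtraction cannot overflow before the absolute value is taken.
+ */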
+#define DO_VABAV(OP, ESIZE, TYPE) \
+ uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \
+ void *vm, uint32_t ra) \
+ { \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ TYPE *m = vm, *n = vn; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ if (mask & 1) { \
+ int64_t n0 = n[H##ESIZE(e)]; \
+ int64_t m0 = m[H##ESIZE(e)]; \
+ uint32_t r = n0 >= m0 ? (n0 - m0) : (m0 - n0); \
+ ra += r; \
+ } \
+ } \
+ mve_advance_vpt(env); \
+ return ra; \
+ }
+
+DO_VABAV(vabavsb, 1, int8_t)
+DO_VABAV(vabavsh, 2, int16_t)
+DO_VABAV(vabavsw, 4, int32_t)
+DO_VABAV(vabavub, 1, uint8_t)
+DO_VABAV(vabavuh, 2, uint16_t)
+DO_VABAV(vabavuw, 4, uint32_t)
+
+#define DO_VADDLV(OP, TYPE, LTYPE) \
+ uint64_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \
+ uint64_t ra) \
+ { \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ TYPE *m = vm; \
+ for (e = 0; e < 16 / 4; e++, mask >>= 4) { \
+ if (mask & 1) { \
+ ra += (LTYPE)m[H4(e)]; \
+ } \
+ } \
+ mve_advance_vpt(env); \
+ return ra; \
+ } \
+
+DO_VADDLV(vaddlv_s, int32_t, int64_t)
+DO_VADDLV(vaddlv_u, uint32_t, uint64_t)
+
+/* Shifts by immediate */
+#define DO_2SHIFT(OP, ESIZE, TYPE, FN) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \
+ void *vm, uint32_t shift) \
+ { \
+ TYPE *d = vd, *m = vm; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ mergemask(&d[H##ESIZE(e)], \
+ FN(m[H##ESIZE(e)], shift), mask); \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+#define DO_2SHIFT_SAT(OP, ESIZE, TYPE, FN) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \
+ void *vm, uint32_t shift) \
+ { \
+ TYPE *d = vd, *m = vm; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ bool qc = false; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ bool sat = false; \
+ mergemask(&d[H##ESIZE(e)], \
+ FN(m[H##ESIZE(e)], shift, &sat), mask); \
+ qc |= sat & mask & 1; \
+ } \
+ if (qc) { \
+ env->vfp.qc[0] = qc; \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+/* provide unsigned and signed 2-op shift helpers for all sizes */
+#define DO_2SHIFT_U(OP, FN) \
+ DO_2SHIFT(OP##b, 1, uint8_t, FN) \
+ DO_2SHIFT(OP##h, 2, uint16_t, FN) \
+ DO_2SHIFT(OP##w, 4, uint32_t, FN)
+#define DO_2SHIFT_S(OP, FN) \
+ DO_2SHIFT(OP##b, 1, int8_t, FN) \
+ DO_2SHIFT(OP##h, 2, int16_t, FN) \
+ DO_2SHIFT(OP##w, 4, int32_t, FN)
+
+#define DO_2SHIFT_SAT_U(OP, FN) \
+ DO_2SHIFT_SAT(OP##b, 1, uint8_t, FN) \
+ DO_2SHIFT_SAT(OP##h, 2, uint16_t, FN) \
+ DO_2SHIFT_SAT(OP##w, 4, uint32_t, FN)
+#define DO_2SHIFT_SAT_S(OP, FN) \
+ DO_2SHIFT_SAT(OP##b, 1, int8_t, FN) \
+ DO_2SHIFT_SAT(OP##h, 2, int16_t, FN) \
+ DO_2SHIFT_SAT(OP##w, 4, int32_t, FN)
+
+DO_2SHIFT_U(vshli_u, DO_VSHLU)
+DO_2SHIFT_S(vshli_s, DO_VSHLS)
+DO_2SHIFT_SAT_U(vqshli_u, DO_UQSHL_OP)
+DO_2SHIFT_SAT_S(vqshli_s, DO_SQSHL_OP)
+DO_2SHIFT_SAT_S(vqshlui_s, DO_SUQSHL_OP)
+DO_2SHIFT_U(vrshli_u, DO_VRSHLU)
+DO_2SHIFT_S(vrshli_s, DO_VRSHLS)
+DO_2SHIFT_SAT_U(vqrshli_u, DO_UQRSHL_OP)
+DO_2SHIFT_SAT_S(vqrshli_s, DO_SQRSHL_OP)
+
+/* Shift-and-insert; we always work with 64 bits at a time */
+#define DO_2SHIFT_INSERT(OP, ESIZE, SHIFTFN, MASKFN) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \
+ void *vm, uint32_t shift) \
+ { \
+ uint64_t *d = vd, *m = vm; \
+ uint16_t mask; \
+ uint64_t shiftmask; \
+ unsigned e; \
+ if (shift == ESIZE * 8) { \
+ /* \
+ * Only VSRI can shift by <dt>; it should mean "don't \
+ * update the destination". The generic logic can't handle \
+ * this because it would try to shift by an out-of-range \
+ * amount, so special case it here. \
+ */ \
+ goto done; \
+ } \
+ assert(shift < ESIZE * 8); \
+ mask = mve_element_mask(env); \
+ /* ESIZE / 2 gives the MO_* value if ESIZE is in [1,2,4] */ \
+ shiftmask = dup_const(ESIZE / 2, MASKFN(ESIZE * 8, shift)); \
+ for (e = 0; e < 16 / 8; e++, mask >>= 8) { \
+ uint64_t r = (SHIFTFN(m[H8(e)], shift) & shiftmask) | \
+ (d[H8(e)] & ~shiftmask); \
+ mergemask(&d[H8(e)], r, mask); \
+ } \
+done: \
+ mve_advance_vpt(env); \
+ }
+
+#define DO_SHL(N, SHIFT) ((N) << (SHIFT))
+#define DO_SHR(N, SHIFT) ((N) >> (SHIFT))
+#define SHL_MASK(EBITS, SHIFT) MAKE_64BIT_MASK((SHIFT), (EBITS) - (SHIFT))
+#define SHR_MASK(EBITS, SHIFT) MAKE_64BIT_MASK(0, (EBITS) - (SHIFT))
+
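+/*
+ * Example: VSRI.16 with a shift of 4 uses SHR_MASK(16, 4) = 0x0fff, so each
+ * 16-bit lane keeps the top 4 bits of the destination and inserts the
+ * shifted source into the low 12 bits.
+ */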
+DO_2SHIFT_INSERT(vsrib, 1, DO_SHR, SHR_MASK)
+DO_2SHIFT_INSERT(vsrih, 2, DO_SHR, SHR_MASK)
+DO_2SHIFT_INSERT(vsriw, 4, DO_SHR, SHR_MASK)
+DO_2SHIFT_INSERT(vslib, 1, DO_SHL, SHL_MASK)
+DO_2SHIFT_INSERT(vslih, 2, DO_SHL, SHL_MASK)
+DO_2SHIFT_INSERT(vsliw, 4, DO_SHL, SHL_MASK)
+
+/*
+ * Long shifts taking half-sized inputs from top or bottom of the input
+ * vector and producing a double-width result. ESIZE, TYPE are for
+ * the input, and LESIZE, LTYPE for the output.
+ * Unlike the normal shift helpers, we do not handle negative shift counts,
+ * because the long shift is strictly left-only.
+ */
+#define DO_VSHLL(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \
+ void *vm, uint32_t shift) \
+ { \
+ LTYPE *d = vd; \
+ TYPE *m = vm; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned le; \
+ assert(shift <= 16); \
+ for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \
+ LTYPE r = (LTYPE)m[H##ESIZE(le * 2 + TOP)] << shift; \
+ mergemask(&d[H##LESIZE(le)], r, mask); \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+#define DO_VSHLL_ALL(OP, TOP) \
+ DO_VSHLL(OP##sb, TOP, 1, int8_t, 2, int16_t) \
+ DO_VSHLL(OP##ub, TOP, 1, uint8_t, 2, uint16_t) \
+ DO_VSHLL(OP##sh, TOP, 2, int16_t, 4, int32_t) \
+ DO_VSHLL(OP##uh, TOP, 2, uint16_t, 4, uint32_t) \
+
+DO_VSHLL_ALL(vshllb, false)
+DO_VSHLL_ALL(vshllt, true)
+
+/*
+ * Narrowing right shifts, taking a double sized input, shifting it
+ * and putting the result in either the top or bottom half of the output.
+ * ESIZE, TYPE are the output, and LESIZE, LTYPE the input.
+ */
+#define DO_VSHRN(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \
+ void *vm, uint32_t shift) \
+ { \
+ LTYPE *m = vm; \
+ TYPE *d = vd; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned le; \
+ mask >>= ESIZE * TOP; \
+ for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \
+ TYPE r = FN(m[H##LESIZE(le)], shift); \
+ mergemask(&d[H##ESIZE(le * 2 + TOP)], r, mask); \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+#define DO_VSHRN_ALL(OP, FN) \
+ DO_VSHRN(OP##bb, false, 1, uint8_t, 2, uint16_t, FN) \
+ DO_VSHRN(OP##bh, false, 2, uint16_t, 4, uint32_t, FN) \
+ DO_VSHRN(OP##tb, true, 1, uint8_t, 2, uint16_t, FN) \
+ DO_VSHRN(OP##th, true, 2, uint16_t, 4, uint32_t, FN)
+
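+/*
+ * Rounding right shift: add back the last bit shifted out (bit sh - 1) to
+ * round to nearest, ties upwards. A shift of exactly 64 cannot be written
+ * as a C shift, so it is special-cased to return just the rounding bit.
+ */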
+static inline uint64_t do_urshr(uint64_t x, unsigned sh)
+{
+ if (likely(sh < 64)) {
+ return (x >> sh) + ((x >> (sh - 1)) & 1);
+ } else if (sh == 64) {
+ return x >> 63;
+ } else {
+ return 0;
+ }
+}
+
+static inline int64_t do_srshr(int64_t x, unsigned sh)
+{
+ if (likely(sh < 64)) {
+ return (x >> sh) + ((x >> (sh - 1)) & 1);
+ } else {
+ /* Rounding the sign bit always produces 0. */
+ return 0;
+ }
+}
+
+DO_VSHRN_ALL(vshrn, DO_SHR)
+DO_VSHRN_ALL(vrshrn, do_urshr)
+
+static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max,
+ bool *satp)
+{
+ if (val > max) {
+ *satp = true;
+ return max;
+ } else if (val < min) {
+ *satp = true;
+ return min;
+ } else {
+ return val;
+ }
+}
+
+/* Saturating narrowing right shifts */
+#define DO_VSHRN_SAT(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, \
+ void *vm, uint32_t shift) \
+ { \
+ LTYPE *m = vm; \
+ TYPE *d = vd; \
+ uint16_t mask = mve_element_mask(env); \
+ bool qc = false; \
+ unsigned le; \
+ mask >>= ESIZE * TOP; \
+ for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \
+ bool sat = false; \
+ TYPE r = FN(m[H##LESIZE(le)], shift, &sat); \
+ mergemask(&d[H##ESIZE(le * 2 + TOP)], r, mask); \
+ qc |= sat & mask & 1; \
+ } \
+ if (qc) { \
+ env->vfp.qc[0] = qc; \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+#define DO_VSHRN_SAT_UB(BOP, TOP, FN) \
+ DO_VSHRN_SAT(BOP, false, 1, uint8_t, 2, uint16_t, FN) \
+ DO_VSHRN_SAT(TOP, true, 1, uint8_t, 2, uint16_t, FN)
+
+#define DO_VSHRN_SAT_UH(BOP, TOP, FN) \
+ DO_VSHRN_SAT(BOP, false, 2, uint16_t, 4, uint32_t, FN) \
+ DO_VSHRN_SAT(TOP, true, 2, uint16_t, 4, uint32_t, FN)
+
+#define DO_VSHRN_SAT_SB(BOP, TOP, FN) \
+ DO_VSHRN_SAT(BOP, false, 1, int8_t, 2, int16_t, FN) \
+ DO_VSHRN_SAT(TOP, true, 1, int8_t, 2, int16_t, FN)
+
+#define DO_VSHRN_SAT_SH(BOP, TOP, FN) \
+ DO_VSHRN_SAT(BOP, false, 2, int16_t, 4, int32_t, FN) \
+ DO_VSHRN_SAT(TOP, true, 2, int16_t, 4, int32_t, FN)
+
+#define DO_SHRN_SB(N, M, SATP) \
+ do_sat_bhs((int64_t)(N) >> (M), INT8_MIN, INT8_MAX, SATP)
+#define DO_SHRN_UB(N, M, SATP) \
+ do_sat_bhs((uint64_t)(N) >> (M), 0, UINT8_MAX, SATP)
+#define DO_SHRUN_B(N, M, SATP) \
+ do_sat_bhs((int64_t)(N) >> (M), 0, UINT8_MAX, SATP)
+
+#define DO_SHRN_SH(N, M, SATP) \
+ do_sat_bhs((int64_t)(N) >> (M), INT16_MIN, INT16_MAX, SATP)
+#define DO_SHRN_UH(N, M, SATP) \
+ do_sat_bhs((uint64_t)(N) >> (M), 0, UINT16_MAX, SATP)
+#define DO_SHRUN_H(N, M, SATP) \
+ do_sat_bhs((int64_t)(N) >> (M), 0, UINT16_MAX, SATP)
+
+#define DO_RSHRN_SB(N, M, SATP) \
+ do_sat_bhs(do_srshr(N, M), INT8_MIN, INT8_MAX, SATP)
+#define DO_RSHRN_UB(N, M, SATP) \
+ do_sat_bhs(do_urshr(N, M), 0, UINT8_MAX, SATP)
+#define DO_RSHRUN_B(N, M, SATP) \
+ do_sat_bhs(do_srshr(N, M), 0, UINT8_MAX, SATP)
+
+#define DO_RSHRN_SH(N, M, SATP) \
+ do_sat_bhs(do_srshr(N, M), INT16_MIN, INT16_MAX, SATP)
+#define DO_RSHRN_UH(N, M, SATP) \
+ do_sat_bhs(do_urshr(N, M), 0, UINT16_MAX, SATP)
+#define DO_RSHRUN_H(N, M, SATP) \
+ do_sat_bhs(do_srshr(N, M), 0, UINT16_MAX, SATP)
+
+DO_VSHRN_SAT_SB(vqshrnb_sb, vqshrnt_sb, DO_SHRN_SB)
+DO_VSHRN_SAT_SH(vqshrnb_sh, vqshrnt_sh, DO_SHRN_SH)
+DO_VSHRN_SAT_UB(vqshrnb_ub, vqshrnt_ub, DO_SHRN_UB)
+DO_VSHRN_SAT_UH(vqshrnb_uh, vqshrnt_uh, DO_SHRN_UH)
+DO_VSHRN_SAT_SB(vqshrunbb, vqshruntb, DO_SHRUN_B)
+DO_VSHRN_SAT_SH(vqshrunbh, vqshrunth, DO_SHRUN_H)
+
+DO_VSHRN_SAT_SB(vqrshrnb_sb, vqrshrnt_sb, DO_RSHRN_SB)
+DO_VSHRN_SAT_SH(vqrshrnb_sh, vqrshrnt_sh, DO_RSHRN_SH)
+DO_VSHRN_SAT_UB(vqrshrnb_ub, vqrshrnt_ub, DO_RSHRN_UB)
+DO_VSHRN_SAT_UH(vqrshrnb_uh, vqrshrnt_uh, DO_RSHRN_UH)
+DO_VSHRN_SAT_SB(vqrshrunbb, vqrshruntb, DO_RSHRUN_B)
+DO_VSHRN_SAT_SH(vqrshrunbh, vqrshrunth, DO_RSHRUN_H)
+
+#define DO_VMOVN(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE) \
+ void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm) \
+ { \
+ LTYPE *m = vm; \
+ TYPE *d = vd; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned le; \
+ mask >>= ESIZE * TOP; \
+ for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \
+ mergemask(&d[H##ESIZE(le * 2 + TOP)], \
+ m[H##LESIZE(le)], mask); \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+DO_VMOVN(vmovnbb, false, 1, uint8_t, 2, uint16_t)
+DO_VMOVN(vmovnbh, false, 2, uint16_t, 4, uint32_t)
+DO_VMOVN(vmovntb, true, 1, uint8_t, 2, uint16_t)
+DO_VMOVN(vmovnth, true, 2, uint16_t, 4, uint32_t)
+
+#define DO_VMOVN_SAT(OP, TOP, ESIZE, TYPE, LESIZE, LTYPE, FN) \
+ void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm) \
+ { \
+ LTYPE *m = vm; \
+ TYPE *d = vd; \
+ uint16_t mask = mve_element_mask(env); \
+ bool qc = false; \
+ unsigned le; \
+ mask >>= ESIZE * TOP; \
+ for (le = 0; le < 16 / LESIZE; le++, mask >>= LESIZE) { \
+ bool sat = false; \
+ TYPE r = FN(m[H##LESIZE(le)], &sat); \
+ mergemask(&d[H##ESIZE(le * 2 + TOP)], r, mask); \
+ qc |= sat & mask & 1; \
+ } \
+ if (qc) { \
+ env->vfp.qc[0] = qc; \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+#define DO_VMOVN_SAT_UB(BOP, TOP, FN) \
+ DO_VMOVN_SAT(BOP, false, 1, uint8_t, 2, uint16_t, FN) \
+ DO_VMOVN_SAT(TOP, true, 1, uint8_t, 2, uint16_t, FN)
+
+#define DO_VMOVN_SAT_UH(BOP, TOP, FN) \
+ DO_VMOVN_SAT(BOP, false, 2, uint16_t, 4, uint32_t, FN) \
+ DO_VMOVN_SAT(TOP, true, 2, uint16_t, 4, uint32_t, FN)
+
+#define DO_VMOVN_SAT_SB(BOP, TOP, FN) \
+ DO_VMOVN_SAT(BOP, false, 1, int8_t, 2, int16_t, FN) \
+ DO_VMOVN_SAT(TOP, true, 1, int8_t, 2, int16_t, FN)
+
+#define DO_VMOVN_SAT_SH(BOP, TOP, FN) \
+ DO_VMOVN_SAT(BOP, false, 2, int16_t, 4, int32_t, FN) \
+ DO_VMOVN_SAT(TOP, true, 2, int16_t, 4, int32_t, FN)
+
+#define DO_VQMOVN_SB(N, SATP) \
+ do_sat_bhs((int64_t)(N), INT8_MIN, INT8_MAX, SATP)
+#define DO_VQMOVN_UB(N, SATP) \
+ do_sat_bhs((uint64_t)(N), 0, UINT8_MAX, SATP)
+#define DO_VQMOVUN_B(N, SATP) \
+ do_sat_bhs((int64_t)(N), 0, UINT8_MAX, SATP)
+
+#define DO_VQMOVN_SH(N, SATP) \
+ do_sat_bhs((int64_t)(N), INT16_MIN, INT16_MAX, SATP)
+#define DO_VQMOVN_UH(N, SATP) \
+ do_sat_bhs((uint64_t)(N), 0, UINT16_MAX, SATP)
+#define DO_VQMOVUN_H(N, SATP) \
+ do_sat_bhs((int64_t)(N), 0, UINT16_MAX, SATP)
+
+DO_VMOVN_SAT_SB(vqmovnbsb, vqmovntsb, DO_VQMOVN_SB)
+DO_VMOVN_SAT_SH(vqmovnbsh, vqmovntsh, DO_VQMOVN_SH)
+DO_VMOVN_SAT_UB(vqmovnbub, vqmovntub, DO_VQMOVN_UB)
+DO_VMOVN_SAT_UH(vqmovnbuh, vqmovntuh, DO_VQMOVN_UH)
+DO_VMOVN_SAT_SB(vqmovunbb, vqmovuntb, DO_VQMOVUN_B)
+DO_VMOVN_SAT_SH(vqmovunbh, vqmovunth, DO_VQMOVUN_H)
+
+uint32_t HELPER(mve_vshlc)(CPUARMState *env, void *vd, uint32_t rdm,
+ uint32_t shift)
+{
+ uint32_t *d = vd;
+ uint16_t mask = mve_element_mask(env);
+ unsigned e;
+ uint32_t r;
+
+ /*
+ * For each 32-bit element, we shift it left, bringing in the
+ * low 'shift' bits of rdm at the bottom. Bits shifted out at
+ * the top become the new rdm, if the predicate mask permits.
+ * The final rdm value is returned to update the register.
+ * shift == 0 here means "shift by 32 bits".
+ */
+ if (shift == 0) {
+ for (e = 0; e < 16 / 4; e++, mask >>= 4) {
+ r = rdm;
+ if (mask & 1) {
+ rdm = d[H4(e)];
+ }
+ mergemask(&d[H4(e)], r, mask);
+ }
+ } else {
+ uint32_t shiftmask = MAKE_64BIT_MASK(0, shift);
+
+ for (e = 0; e < 16 / 4; e++, mask >>= 4) {
+ r = (d[H4(e)] << shift) | (rdm & shiftmask);
+ if (mask & 1) {
+ rdm = d[H4(e)] >> (32 - shift);
+ }
+ mergemask(&d[H4(e)], r, mask);
+ }
+ }
+ mve_advance_vpt(env);
+ return rdm;
+}
+
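+/*
+ * Scalar long shifts: these operate on a 64-bit scalar value. Right shifts
+ * are implemented by negating the shift count and reusing the left-shift
+ * primitives; the saturating forms record saturation in env->QF.
+ */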
+uint64_t HELPER(mve_sshrl)(CPUARMState *env, uint64_t n, uint32_t shift)
+{
+ return do_sqrshl_d(n, -(int8_t)shift, false, NULL);
+}
+
+uint64_t HELPER(mve_ushll)(CPUARMState *env, uint64_t n, uint32_t shift)
+{
+ return do_uqrshl_d(n, (int8_t)shift, false, NULL);
+}
+
+uint64_t HELPER(mve_sqshll)(CPUARMState *env, uint64_t n, uint32_t shift)
+{
+ return do_sqrshl_d(n, (int8_t)shift, false, &env->QF);
+}
+
+uint64_t HELPER(mve_uqshll)(CPUARMState *env, uint64_t n, uint32_t shift)
+{
+ return do_uqrshl_d(n, (int8_t)shift, false, &env->QF);
+}
+
+uint64_t HELPER(mve_sqrshrl)(CPUARMState *env, uint64_t n, uint32_t shift)
+{
+ return do_sqrshl_d(n, -(int8_t)shift, true, &env->QF);
+}
+
+uint64_t HELPER(mve_uqrshll)(CPUARMState *env, uint64_t n, uint32_t shift)
+{
+ return do_uqrshl_d(n, (int8_t)shift, true, &env->QF);
+}
+
+/* Operate on 64-bit values, but saturate at 48 bits */
+static inline int64_t do_sqrshl48_d(int64_t src, int64_t shift,
+ bool round, uint32_t *sat)
+{
+ int64_t val, extval;
+
+ if (shift <= -48) {
+ /* Rounding the sign bit always produces 0. */
+ if (round) {
+ return 0;
+ }
+ return src >> 63;
+ } else if (shift < 0) {
+ if (round) {
+ src >>= -shift - 1;
+ val = (src >> 1) + (src & 1);
+ } else {
+ val = src >> -shift;
+ }
+ extval = sextract64(val, 0, 48);
+ if (!sat || val == extval) {
+ return extval;
+ }
+ } else if (shift < 48) {
+ int64_t extval = sextract64(src << shift, 0, 48);
+ if (!sat || src == (extval >> shift)) {
+ return extval;
+ }
+ } else if (!sat || src == 0) {
+ return 0;
+ }
+
+ *sat = 1;
+ return src >= 0 ? MAKE_64BIT_MASK(0, 47) : MAKE_64BIT_MASK(47, 17);
+}
+
+/* Operate on 64-bit values, but saturate at 48 bits */
+static inline uint64_t do_uqrshl48_d(uint64_t src, int64_t shift,
+ bool round, uint32_t *sat)
+{
+ uint64_t val, extval;
+
+ if (shift <= -(48 + round)) {
+ return 0;
+ } else if (shift < 0) {
+ if (round) {
+ val = src >> (-shift - 1);
+ val = (val >> 1) + (val & 1);
+ } else {
+ val = src >> -shift;
+ }
+ extval = extract64(val, 0, 48);
+ if (!sat || val == extval) {
+ return extval;
+ }
+ } else if (shift < 48) {
+ uint64_t extval = extract64(src << shift, 0, 48);
+ if (!sat || src == (extval >> shift)) {
+ return extval;
+ }
+ } else if (!sat || src == 0) {
+ return 0;
+ }
+
+ *sat = 1;
+ return MAKE_64BIT_MASK(0, 48);
+}
+
+uint64_t HELPER(mve_sqrshrl48)(CPUARMState *env, uint64_t n, uint32_t shift)
+{
+ return do_sqrshl48_d(n, -(int8_t)shift, true, &env->QF);
+}
+
+uint64_t HELPER(mve_uqrshll48)(CPUARMState *env, uint64_t n, uint32_t shift)
+{
+ return do_uqrshl48_d(n, (int8_t)shift, true, &env->QF);
+}
+
+uint32_t HELPER(mve_uqshl)(CPUARMState *env, uint32_t n, uint32_t shift)
+{
+ return do_uqrshl_bhs(n, (int8_t)shift, 32, false, &env->QF);
+}
+
+uint32_t HELPER(mve_sqshl)(CPUARMState *env, uint32_t n, uint32_t shift)
+{
+ return do_sqrshl_bhs(n, (int8_t)shift, 32, false, &env->QF);
+}
+
+uint32_t HELPER(mve_uqrshl)(CPUARMState *env, uint32_t n, uint32_t shift)
+{
+ return do_uqrshl_bhs(n, (int8_t)shift, 32, true, &env->QF);
+}
+
+uint32_t HELPER(mve_sqrshr)(CPUARMState *env, uint32_t n, uint32_t shift)
+{
+ return do_sqrshl_bhs(n, -(int8_t)shift, 32, true, &env->QF);
+}
+
+#define DO_VIDUP(OP, ESIZE, TYPE, FN) \
+ uint32_t HELPER(mve_##OP)(CPUARMState *env, void *vd, \
+ uint32_t offset, uint32_t imm) \
+ { \
+ TYPE *d = vd; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ mergemask(&d[H##ESIZE(e)], offset, mask); \
+ offset = FN(offset, imm); \
+ } \
+ mve_advance_vpt(env); \
+ return offset; \
+ }
+
+#define DO_VIWDUP(OP, ESIZE, TYPE, FN) \
+ uint32_t HELPER(mve_##OP)(CPUARMState *env, void *vd, \
+ uint32_t offset, uint32_t wrap, \
+ uint32_t imm) \
+ { \
+ TYPE *d = vd; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ mergemask(&d[H##ESIZE(e)], offset, mask); \
+ offset = FN(offset, wrap, imm); \
+ } \
+ mve_advance_vpt(env); \
+ return offset; \
+ }
+
+#define DO_VIDUP_ALL(OP, FN) \
+ DO_VIDUP(OP##b, 1, int8_t, FN) \
+ DO_VIDUP(OP##h, 2, int16_t, FN) \
+ DO_VIDUP(OP##w, 4, int32_t, FN)
+
+#define DO_VIWDUP_ALL(OP, FN) \
+ DO_VIWDUP(OP##b, 1, int8_t, FN) \
+ DO_VIWDUP(OP##h, 2, int16_t, FN) \
+ DO_VIWDUP(OP##w, 4, int32_t, FN)
+
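+/*
+ * Wrapping helpers for VIWDUP/VDWDUP: the incrementing form resets the
+ * offset to zero once it reaches the wrap value, and the decrementing form
+ * reloads the wrap value when the offset reaches zero before subtracting.
+ */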
+static uint32_t do_add_wrap(uint32_t offset, uint32_t wrap, uint32_t imm)
+{
+ offset += imm;
+ if (offset == wrap) {
+ offset = 0;
+ }
+ return offset;
+}
+
+static uint32_t do_sub_wrap(uint32_t offset, uint32_t wrap, uint32_t imm)
+{
+ if (offset == 0) {
+ offset = wrap;
+ }
+ offset -= imm;
+ return offset;
+}
+
+DO_VIDUP_ALL(vidup, DO_ADD)
+DO_VIWDUP_ALL(viwdup, do_add_wrap)
+DO_VIWDUP_ALL(vdwdup, do_sub_wrap)
+
+/*
+ * Vector comparison.
+ * P0 bits for non-executed beats (where eci_mask is 0) are unchanged.
+ * P0 bits for predicated lanes in executed beats (where mask is 0) are 0.
+ * P0 bits otherwise are updated with the results of the comparisons.
+ * We must also keep unchanged the MASK fields at the top of v7m.vpr.
+ */
+#define DO_VCMP(OP, ESIZE, TYPE, FN) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, void *vm) \
+ { \
+ TYPE *n = vn, *m = vm; \
+ uint16_t mask = mve_element_mask(env); \
+ uint16_t eci_mask = mve_eci_mask(env); \
+ uint16_t beatpred = 0; \
+ uint16_t emask = MAKE_64BIT_MASK(0, ESIZE); \
+ unsigned e; \
+ for (e = 0; e < 16 / ESIZE; e++) { \
+ bool r = FN(n[H##ESIZE(e)], m[H##ESIZE(e)]); \
+ /* Comparison sets 0/1 bits for each byte in the element */ \
+ beatpred |= r * emask; \
+ emask <<= ESIZE; \
+ } \
+ beatpred &= mask; \
+ env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | \
+ (beatpred & eci_mask); \
+ mve_advance_vpt(env); \
+ }
+
+#define DO_VCMP_SCALAR(OP, ESIZE, TYPE, FN) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \
+ uint32_t rm) \
+ { \
+ TYPE *n = vn; \
+ uint16_t mask = mve_element_mask(env); \
+ uint16_t eci_mask = mve_eci_mask(env); \
+ uint16_t beatpred = 0; \
+ uint16_t emask = MAKE_64BIT_MASK(0, ESIZE); \
+ unsigned e; \
+ for (e = 0; e < 16 / ESIZE; e++) { \
+ bool r = FN(n[H##ESIZE(e)], (TYPE)rm); \
+ /* Comparison sets 0/1 bits for each byte in the element */ \
+ beatpred |= r * emask; \
+ emask <<= ESIZE; \
+ } \
+ beatpred &= mask; \
+ env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | \
+ (beatpred & eci_mask); \
+ mve_advance_vpt(env); \
+ }
+
+#define DO_VCMP_S(OP, FN) \
+ DO_VCMP(OP##b, 1, int8_t, FN) \
+ DO_VCMP(OP##h, 2, int16_t, FN) \
+ DO_VCMP(OP##w, 4, int32_t, FN) \
+ DO_VCMP_SCALAR(OP##_scalarb, 1, int8_t, FN) \
+ DO_VCMP_SCALAR(OP##_scalarh, 2, int16_t, FN) \
+ DO_VCMP_SCALAR(OP##_scalarw, 4, int32_t, FN)
+
+#define DO_VCMP_U(OP, FN) \
+ DO_VCMP(OP##b, 1, uint8_t, FN) \
+ DO_VCMP(OP##h, 2, uint16_t, FN) \
+ DO_VCMP(OP##w, 4, uint32_t, FN) \
+ DO_VCMP_SCALAR(OP##_scalarb, 1, uint8_t, FN) \
+ DO_VCMP_SCALAR(OP##_scalarh, 2, uint16_t, FN) \
+ DO_VCMP_SCALAR(OP##_scalarw, 4, uint32_t, FN)
+
+#define DO_EQ(N, M) ((N) == (M))
+#define DO_NE(N, M) ((N) != (M))
+#define DO_GE(N, M) ((N) >= (M))
+#define DO_LT(N, M) ((N) < (M))
+#define DO_GT(N, M) ((N) > (M))
+#define DO_LE(N, M) ((N) <= (M))
+
+DO_VCMP_U(vcmpeq, DO_EQ)
+DO_VCMP_U(vcmpne, DO_NE)
+DO_VCMP_U(vcmpcs, DO_GE)
+DO_VCMP_U(vcmphi, DO_GT)
+DO_VCMP_S(vcmpge, DO_GE)
+DO_VCMP_S(vcmplt, DO_LT)
+DO_VCMP_S(vcmpgt, DO_GT)
+DO_VCMP_S(vcmple, DO_LE)
+
+void HELPER(mve_vpsel)(CPUARMState *env, void *vd, void *vn, void *vm)
+{
+ /*
+ * Qd[n] = VPR.P0[n] ? Qn[n] : Qm[n]
+ * but note that whether bytes are written to Qd is still subject
+ * to (all forms of) predication in the usual way.
+ */
+ uint64_t *d = vd, *n = vn, *m = vm;
+ uint16_t mask = mve_element_mask(env);
+ uint16_t p0 = FIELD_EX32(env->v7m.vpr, V7M_VPR, P0);
+ unsigned e;
+ for (e = 0; e < 16 / 8; e++, mask >>= 8, p0 >>= 8) {
+ uint64_t r = m[H8(e)];
+ mergemask(&r, n[H8(e)], p0);
+ mergemask(&d[H8(e)], r, mask);
+ }
+ mve_advance_vpt(env);
+}
+
+void HELPER(mve_vpnot)(CPUARMState *env)
+{
+ /*
+ * P0 bits for unexecuted beats (where eci_mask is 0) are unchanged.
+ * P0 bits for predicated lanes in executed beats (where mask is 0) are 0.
+ * P0 bits otherwise are inverted.
+ * (This is the same logic as VCMP.)
+ * This insn is itself subject to predication and to beat-wise execution,
+ * and after it executes VPT state advances in the usual way.
+ */
+ uint16_t mask = mve_element_mask(env);
+ uint16_t eci_mask = mve_eci_mask(env);
+ uint16_t beatpred = ~env->v7m.vpr & mask;
+ env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | (beatpred & eci_mask);
+ mve_advance_vpt(env);
+}
+
+/*
+ * VCTP: P0 unexecuted bits unchanged, predicated bits zeroed,
+ * otherwise set according to value of Rn. The calculation of
+ * newmask here works in the same way as the calculation of the
+ * ltpmask in mve_element_mask(), but we have pre-calculated
+ * the masklen in the generated code.
+ */
+void HELPER(mve_vctp)(CPUARMState *env, uint32_t masklen)
+{
+ uint16_t mask = mve_element_mask(env);
+ uint16_t eci_mask = mve_eci_mask(env);
+ uint16_t newmask;
+
+ assert(masklen <= 16);
+ newmask = masklen ? MAKE_64BIT_MASK(0, masklen) : 0;
+ newmask &= mask;
+ env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | (newmask & eci_mask);
+ mve_advance_vpt(env);
+}
+
+#define DO_1OP_SAT(OP, ESIZE, TYPE, FN) \
+ void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm) \
+ { \
+ TYPE *d = vd, *m = vm; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ bool qc = false; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ bool sat = false; \
+ mergemask(&d[H##ESIZE(e)], FN(m[H##ESIZE(e)], &sat), mask); \
+ qc |= sat & mask & 1; \
+ } \
+ if (qc) { \
+ env->vfp.qc[0] = qc; \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+#define DO_VQABS_B(N, SATP) \
+ do_sat_bhs(DO_ABS((int64_t)N), INT8_MIN, INT8_MAX, SATP)
+#define DO_VQABS_H(N, SATP) \
+ do_sat_bhs(DO_ABS((int64_t)N), INT16_MIN, INT16_MAX, SATP)
+#define DO_VQABS_W(N, SATP) \
+ do_sat_bhs(DO_ABS((int64_t)N), INT32_MIN, INT32_MAX, SATP)
+
+#define DO_VQNEG_B(N, SATP) do_sat_bhs(-(int64_t)N, INT8_MIN, INT8_MAX, SATP)
+#define DO_VQNEG_H(N, SATP) do_sat_bhs(-(int64_t)N, INT16_MIN, INT16_MAX, SATP)
+#define DO_VQNEG_W(N, SATP) do_sat_bhs(-(int64_t)N, INT32_MIN, INT32_MAX, SATP)
+
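+/*
+ * Example: VQABS.S8 and VQNEG.S8 of -128 both saturate to 127 and set QC,
+ * since +128 is not representable in 8 bits; all other inputs are exact.
+ */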
+DO_1OP_SAT(vqabsb, 1, int8_t, DO_VQABS_B)
+DO_1OP_SAT(vqabsh, 2, int16_t, DO_VQABS_H)
+DO_1OP_SAT(vqabsw, 4, int32_t, DO_VQABS_W)
+
+DO_1OP_SAT(vqnegb, 1, int8_t, DO_VQNEG_B)
+DO_1OP_SAT(vqnegh, 2, int16_t, DO_VQNEG_H)
+DO_1OP_SAT(vqnegw, 4, int32_t, DO_VQNEG_W)
+
+/*
+ * VMAXA, VMINA: vd is unsigned; vm is signed, and we take its
+ * absolute value; we then do an unsigned comparison.
+ */
+#define DO_VMAXMINA(OP, ESIZE, STYPE, UTYPE, FN) \
+ void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm) \
+ { \
+ UTYPE *d = vd; \
+ STYPE *m = vm; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ UTYPE r = DO_ABS(m[H##ESIZE(e)]); \
+ r = FN(d[H##ESIZE(e)], r); \
+ mergemask(&d[H##ESIZE(e)], r, mask); \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+DO_VMAXMINA(vmaxab, 1, int8_t, uint8_t, DO_MAX)
+DO_VMAXMINA(vmaxah, 2, int16_t, uint16_t, DO_MAX)
+DO_VMAXMINA(vmaxaw, 4, int32_t, uint32_t, DO_MAX)
+DO_VMAXMINA(vminab, 1, int8_t, uint8_t, DO_MIN)
+DO_VMAXMINA(vminah, 2, int16_t, uint16_t, DO_MIN)
+DO_VMAXMINA(vminaw, 4, int32_t, uint32_t, DO_MIN)
+
+/*
+ * 2-operand floating point. Note that if an element is partially
+ * predicated we must do the FP operation to update the non-predicated
+ * bytes, but we must be careful to avoid updating the FP exception
+ * state unless byte 0 of the element was unpredicated.
+ */
+#define DO_2OP_FP(OP, ESIZE, TYPE, FN) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, \
+ void *vd, void *vn, void *vm) \
+ { \
+ TYPE *d = vd, *n = vn, *m = vm; \
+ TYPE r; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ float_status *fpst; \
+ float_status scratch_fpst; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \
+ continue; \
+ } \
+ fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \
+ &env->vfp.standard_fp_status; \
+ if (!(mask & 1)) { \
+ /* We need the result but without updating flags */ \
+ scratch_fpst = *fpst; \
+ fpst = &scratch_fpst; \
+ } \
+ r = FN(n[H##ESIZE(e)], m[H##ESIZE(e)], fpst); \
+ mergemask(&d[H##ESIZE(e)], r, mask); \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+#define DO_2OP_FP_ALL(OP, FN) \
+ DO_2OP_FP(OP##h, 2, float16, float16_##FN) \
+ DO_2OP_FP(OP##s, 4, float32, float32_##FN)
+
+DO_2OP_FP_ALL(vfadd, add)
+DO_2OP_FP_ALL(vfsub, sub)
+DO_2OP_FP_ALL(vfmul, mul)
+
+static inline float16 float16_abd(float16 a, float16 b, float_status *s)
+{
+ return float16_abs(float16_sub(a, b, s));
+}
+
+static inline float32 float32_abd(float32 a, float32 b, float_status *s)
+{
+ return float32_abs(float32_sub(a, b, s));
+}
+
+DO_2OP_FP_ALL(vfabd, abd)
+DO_2OP_FP_ALL(vmaxnm, maxnum)
+DO_2OP_FP_ALL(vminnm, minnum)
+
+static inline float16 float16_maxnuma(float16 a, float16 b, float_status *s)
+{
+ return float16_maxnum(float16_abs(a), float16_abs(b), s);
+}
+
+static inline float32 float32_maxnuma(float32 a, float32 b, float_status *s)
+{
+ return float32_maxnum(float32_abs(a), float32_abs(b), s);
+}
+
+static inline float16 float16_minnuma(float16 a, float16 b, float_status *s)
+{
+ return float16_minnum(float16_abs(a), float16_abs(b), s);
+}
+
+static inline float32 float32_minnuma(float32 a, float32 b, float_status *s)
+{
+ return float32_minnum(float32_abs(a), float32_abs(b), s);
+}
+
+DO_2OP_FP_ALL(vmaxnma, maxnuma)
+DO_2OP_FP_ALL(vminnma, minnuma)
+
+#define DO_VCADD_FP(OP, ESIZE, TYPE, FN0, FN1) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, \
+ void *vd, void *vn, void *vm) \
+ { \
+ TYPE *d = vd, *n = vn, *m = vm; \
+ TYPE r[16 / ESIZE]; \
+ uint16_t tm, mask = mve_element_mask(env); \
+ unsigned e; \
+ float_status *fpst; \
+ float_status scratch_fpst; \
+ /* Calculate all results first to avoid overwriting inputs */ \
+ for (e = 0, tm = mask; e < 16 / ESIZE; e++, tm >>= ESIZE) { \
+ if ((tm & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \
+ r[e] = 0; \
+ continue; \
+ } \
+ fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \
+ &env->vfp.standard_fp_status; \
+ if (!(tm & 1)) { \
+ /* We need the result but without updating flags */ \
+ scratch_fpst = *fpst; \
+ fpst = &scratch_fpst; \
+ } \
+ if (!(e & 1)) { \
+ r[e] = FN0(n[H##ESIZE(e)], m[H##ESIZE(e + 1)], fpst); \
+ } else { \
+ r[e] = FN1(n[H##ESIZE(e)], m[H##ESIZE(e - 1)], fpst); \
+ } \
+ } \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ mergemask(&d[H##ESIZE(e)], r[e], mask); \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+DO_VCADD_FP(vfcadd90h, 2, float16, float16_sub, float16_add)
+DO_VCADD_FP(vfcadd90s, 4, float32, float32_sub, float32_add)
+DO_VCADD_FP(vfcadd270h, 2, float16, float16_add, float16_sub)
+DO_VCADD_FP(vfcadd270s, 4, float32, float32_add, float32_sub)
+
+#define DO_VFMA(OP, ESIZE, TYPE, CHS) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, \
+ void *vd, void *vn, void *vm) \
+ { \
+ TYPE *d = vd, *n = vn, *m = vm; \
+ TYPE r; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ float_status *fpst; \
+ float_status scratch_fpst; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \
+ continue; \
+ } \
+ fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \
+ &env->vfp.standard_fp_status; \
+ if (!(mask & 1)) { \
+ /* We need the result but without updating flags */ \
+ scratch_fpst = *fpst; \
+ fpst = &scratch_fpst; \
+ } \
+ r = n[H##ESIZE(e)]; \
+ if (CHS) { \
+ r = TYPE##_chs(r); \
+ } \
+ r = TYPE##_muladd(r, m[H##ESIZE(e)], d[H##ESIZE(e)], \
+ 0, fpst); \
+ mergemask(&d[H##ESIZE(e)], r, mask); \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+DO_VFMA(vfmah, 2, float16, false)
+DO_VFMA(vfmas, 4, float32, false)
+DO_VFMA(vfmsh, 2, float16, true)
+DO_VFMA(vfmss, 4, float32, true)
+
+#define DO_VCMLA(OP, ESIZE, TYPE, ROT, FN) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, \
+ void *vd, void *vn, void *vm) \
+ { \
+ TYPE *d = vd, *n = vn, *m = vm; \
+ TYPE r0, r1, e1, e2, e3, e4; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ float_status *fpst0, *fpst1; \
+ float_status scratch_fpst; \
+ /* We loop through pairs of elements at a time */ \
+ for (e = 0; e < 16 / ESIZE; e += 2, mask >>= ESIZE * 2) { \
+ if ((mask & MAKE_64BIT_MASK(0, ESIZE * 2)) == 0) { \
+ continue; \
+ } \
+ fpst0 = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \
+ &env->vfp.standard_fp_status; \
+ fpst1 = fpst0; \
+ if (!(mask & 1)) { \
+ scratch_fpst = *fpst0; \
+ fpst0 = &scratch_fpst; \
+ } \
+ if (!(mask & (1 << ESIZE))) { \
+ scratch_fpst = *fpst1; \
+ fpst1 = &scratch_fpst; \
+ } \
+ switch (ROT) { \
+ case 0: \
+ e1 = m[H##ESIZE(e)]; \
+ e2 = n[H##ESIZE(e)]; \
+ e3 = m[H##ESIZE(e + 1)]; \
+ e4 = n[H##ESIZE(e)]; \
+ break; \
+ case 1: \
+ e1 = TYPE##_chs(m[H##ESIZE(e + 1)]); \
+ e2 = n[H##ESIZE(e + 1)]; \
+ e3 = m[H##ESIZE(e)]; \
+ e4 = n[H##ESIZE(e + 1)]; \
+ break; \
+ case 2: \
+ e1 = TYPE##_chs(m[H##ESIZE(e)]); \
+ e2 = n[H##ESIZE(e)]; \
+ e3 = TYPE##_chs(m[H##ESIZE(e + 1)]); \
+ e4 = n[H##ESIZE(e)]; \
+ break; \
+ case 3: \
+ e1 = m[H##ESIZE(e + 1)]; \
+ e2 = n[H##ESIZE(e + 1)]; \
+ e3 = TYPE##_chs(m[H##ESIZE(e)]); \
+ e4 = n[H##ESIZE(e + 1)]; \
+ break; \
+ default: \
+ g_assert_not_reached(); \
+ } \
+ r0 = FN(e2, e1, d[H##ESIZE(e)], fpst0); \
+ r1 = FN(e4, e3, d[H##ESIZE(e + 1)], fpst1); \
+ mergemask(&d[H##ESIZE(e)], r0, mask); \
+ mergemask(&d[H##ESIZE(e + 1)], r1, mask >> ESIZE); \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+#define DO_VCMULH(N, M, D, S) float16_mul(N, M, S)
+#define DO_VCMULS(N, M, D, S) float32_mul(N, M, S)
+
+#define DO_VCMLAH(N, M, D, S) float16_muladd(N, M, D, 0, S)
+#define DO_VCMLAS(N, M, D, S) float32_muladd(N, M, D, 0, S)
+
+DO_VCMLA(vcmul0h, 2, float16, 0, DO_VCMULH)
+DO_VCMLA(vcmul0s, 4, float32, 0, DO_VCMULS)
+DO_VCMLA(vcmul90h, 2, float16, 1, DO_VCMULH)
+DO_VCMLA(vcmul90s, 4, float32, 1, DO_VCMULS)
+DO_VCMLA(vcmul180h, 2, float16, 2, DO_VCMULH)
+DO_VCMLA(vcmul180s, 4, float32, 2, DO_VCMULS)
+DO_VCMLA(vcmul270h, 2, float16, 3, DO_VCMULH)
+DO_VCMLA(vcmul270s, 4, float32, 3, DO_VCMULS)
+
+DO_VCMLA(vcmla0h, 2, float16, 0, DO_VCMLAH)
+DO_VCMLA(vcmla0s, 4, float32, 0, DO_VCMLAS)
+DO_VCMLA(vcmla90h, 2, float16, 1, DO_VCMLAH)
+DO_VCMLA(vcmla90s, 4, float32, 1, DO_VCMLAS)
+DO_VCMLA(vcmla180h, 2, float16, 2, DO_VCMLAH)
+DO_VCMLA(vcmla180s, 4, float32, 2, DO_VCMLAS)
+DO_VCMLA(vcmla270h, 2, float16, 3, DO_VCMLAH)
+DO_VCMLA(vcmla270s, 4, float32, 3, DO_VCMLAS)
+
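+/* 2-operand FP ops with a scalar operand broadcast to every element */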
+#define DO_2OP_FP_SCALAR(OP, ESIZE, TYPE, FN) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, \
+ void *vd, void *vn, uint32_t rm) \
+ { \
+ TYPE *d = vd, *n = vn; \
+ TYPE r, m = rm; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ float_status *fpst; \
+ float_status scratch_fpst; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \
+ continue; \
+ } \
+ fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \
+ &env->vfp.standard_fp_status; \
+ if (!(mask & 1)) { \
+ /* We need the result but without updating flags */ \
+ scratch_fpst = *fpst; \
+ fpst = &scratch_fpst; \
+ } \
+ r = FN(n[H##ESIZE(e)], m, fpst); \
+ mergemask(&d[H##ESIZE(e)], r, mask); \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+#define DO_2OP_FP_SCALAR_ALL(OP, FN) \
+ DO_2OP_FP_SCALAR(OP##h, 2, float16, float16_##FN) \
+ DO_2OP_FP_SCALAR(OP##s, 4, float32, float32_##FN)
+
+DO_2OP_FP_SCALAR_ALL(vfadd_scalar, add)
+DO_2OP_FP_SCALAR_ALL(vfsub_scalar, sub)
+DO_2OP_FP_SCALAR_ALL(vfmul_scalar, mul)
+
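+/* FP multiply-accumulate with a scalar operand (VFMA/VFMAS scalar forms) */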
+#define DO_2OP_FP_ACC_SCALAR(OP, ESIZE, TYPE, FN) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, \
+ void *vd, void *vn, uint32_t rm) \
+ { \
+ TYPE *d = vd, *n = vn; \
+ TYPE r, m = rm; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ float_status *fpst; \
+ float_status scratch_fpst; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \
+ continue; \
+ } \
+ fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \
+ &env->vfp.standard_fp_status; \
+ if (!(mask & 1)) { \
+ /* We need the result but without updating flags */ \
+ scratch_fpst = *fpst; \
+ fpst = &scratch_fpst; \
+ } \
+ r = FN(n[H##ESIZE(e)], m, d[H##ESIZE(e)], 0, fpst); \
+ mergemask(&d[H##ESIZE(e)], r, mask); \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+/* VFMAS is vector * vector + scalar, so swap op2 and op3 */
+#define DO_VFMAS_SCALARH(N, M, D, F, S) float16_muladd(N, D, M, F, S)
+#define DO_VFMAS_SCALARS(N, M, D, F, S) float32_muladd(N, D, M, F, S)
+
+/* VFMA is vector * scalar + vector */
+DO_2OP_FP_ACC_SCALAR(vfma_scalarh, 2, float16, float16_muladd)
+DO_2OP_FP_ACC_SCALAR(vfma_scalars, 4, float32, float32_muladd)
+DO_2OP_FP_ACC_SCALAR(vfmas_scalarh, 2, float16, DO_VFMAS_SCALARH)
+DO_2OP_FP_ACC_SCALAR(vfmas_scalars, 4, float32, DO_VFMAS_SCALARS)
+
+/* Floating point max/min across vector. */
+#define DO_FP_VMAXMINV(OP, ESIZE, TYPE, ABS, FN) \
+ uint32_t HELPER(glue(mve_, OP))(CPUARMState *env, void *vm, \
+ uint32_t ra_in) \
+ { \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ TYPE *m = vm; \
+ TYPE ra = (TYPE)ra_in; \
+ float_status *fpst = (ESIZE == 2) ? \
+ &env->vfp.standard_fp_status_f16 : \
+ &env->vfp.standard_fp_status; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ if (mask & 1) { \
+ TYPE v = m[H##ESIZE(e)]; \
+ if (TYPE##_is_signaling_nan(ra, fpst)) { \
+ ra = TYPE##_silence_nan(ra, fpst); \
+ float_raise(float_flag_invalid, fpst); \
+ } \
+ if (TYPE##_is_signaling_nan(v, fpst)) { \
+ v = TYPE##_silence_nan(v, fpst); \
+ float_raise(float_flag_invalid, fpst); \
+ } \
+ if (ABS) { \
+ v = TYPE##_abs(v); \
+ } \
+ ra = FN(ra, v, fpst); \
+ } \
+ } \
+ mve_advance_vpt(env); \
+ return ra; \
+    }
+
+#define NOP(X) (X)
+
+DO_FP_VMAXMINV(vmaxnmvh, 2, float16, false, float16_maxnum)
+DO_FP_VMAXMINV(vmaxnmvs, 4, float32, false, float32_maxnum)
+DO_FP_VMAXMINV(vminnmvh, 2, float16, false, float16_minnum)
+DO_FP_VMAXMINV(vminnmvs, 4, float32, false, float32_minnum)
+DO_FP_VMAXMINV(vmaxnmavh, 2, float16, true, float16_maxnum)
+DO_FP_VMAXMINV(vmaxnmavs, 4, float32, true, float32_maxnum)
+DO_FP_VMAXMINV(vminnmavh, 2, float16, true, float16_minnum)
+DO_FP_VMAXMINV(vminnmavs, 4, float32, true, float32_minnum)
+
+/* FP compares; note that all comparisons signal InvalidOp for QNaNs */
+#define DO_VCMP_FP(OP, ESIZE, TYPE, FN) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, void *vm) \
+ { \
+ TYPE *n = vn, *m = vm; \
+ uint16_t mask = mve_element_mask(env); \
+ uint16_t eci_mask = mve_eci_mask(env); \
+ uint16_t beatpred = 0; \
+ uint16_t emask = MAKE_64BIT_MASK(0, ESIZE); \
+ unsigned e; \
+ float_status *fpst; \
+ float_status scratch_fpst; \
+ bool r; \
+ for (e = 0; e < 16 / ESIZE; e++, emask <<= ESIZE) { \
+ if ((mask & emask) == 0) { \
+ continue; \
+ } \
+ fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \
+ &env->vfp.standard_fp_status; \
+ if (!(mask & (1 << (e * ESIZE)))) { \
+ /* We need the result but without updating flags */ \
+ scratch_fpst = *fpst; \
+ fpst = &scratch_fpst; \
+ } \
+ r = FN(n[H##ESIZE(e)], m[H##ESIZE(e)], fpst); \
+ /* Comparison sets 0/1 bits for each byte in the element */ \
+ beatpred |= r * emask; \
+ } \
+ beatpred &= mask; \
+ env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | \
+ (beatpred & eci_mask); \
+ mve_advance_vpt(env); \
+ }
+
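+/* FP compares against a scalar duplicated across the whole vector */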
+#define DO_VCMP_FP_SCALAR(OP, ESIZE, TYPE, FN) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vn, \
+ uint32_t rm) \
+ { \
+ TYPE *n = vn; \
+ uint16_t mask = mve_element_mask(env); \
+ uint16_t eci_mask = mve_eci_mask(env); \
+ uint16_t beatpred = 0; \
+ uint16_t emask = MAKE_64BIT_MASK(0, ESIZE); \
+ unsigned e; \
+ float_status *fpst; \
+ float_status scratch_fpst; \
+ bool r; \
+ for (e = 0; e < 16 / ESIZE; e++, emask <<= ESIZE) { \
+ if ((mask & emask) == 0) { \
+ continue; \
+ } \
+ fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \
+ &env->vfp.standard_fp_status; \
+ if (!(mask & (1 << (e * ESIZE)))) { \
+ /* We need the result but without updating flags */ \
+ scratch_fpst = *fpst; \
+ fpst = &scratch_fpst; \
+ } \
+ r = FN(n[H##ESIZE(e)], (TYPE)rm, fpst); \
+ /* Comparison sets 0/1 bits for each byte in the element */ \
+ beatpred |= r * emask; \
+ } \
+ beatpred &= mask; \
+ env->v7m.vpr = (env->v7m.vpr & ~(uint32_t)eci_mask) | \
+ (beatpred & eci_mask); \
+ mve_advance_vpt(env); \
+ }
+
+#define DO_VCMP_FP_BOTH(VOP, SOP, ESIZE, TYPE, FN) \
+ DO_VCMP_FP(VOP, ESIZE, TYPE, FN) \
+ DO_VCMP_FP_SCALAR(SOP, ESIZE, TYPE, FN)
+
+/*
+ * Some care is needed here to get the correct result for the unordered case.
+ * Architecturally EQ, GE and GT are defined to be false for unordered, but
+ * the NE, LT and LE comparisons are defined as simple logical inverses of
+ * EQ, GE and GT and so they must return true for unordered. The softfloat
+ * comparison functions float*_{eq,le,lt} all return false for unordered.
+ */
+#define DO_GE16(X, Y, S) float16_le(Y, X, S)
+#define DO_GE32(X, Y, S) float32_le(Y, X, S)
+#define DO_GT16(X, Y, S) float16_lt(Y, X, S)
+#define DO_GT32(X, Y, S) float32_lt(Y, X, S)
+
+DO_VCMP_FP_BOTH(vfcmpeqh, vfcmpeq_scalarh, 2, float16, float16_eq)
+DO_VCMP_FP_BOTH(vfcmpeqs, vfcmpeq_scalars, 4, float32, float32_eq)
+
+DO_VCMP_FP_BOTH(vfcmpneh, vfcmpne_scalarh, 2, float16, !float16_eq)
+DO_VCMP_FP_BOTH(vfcmpnes, vfcmpne_scalars, 4, float32, !float32_eq)
+
+DO_VCMP_FP_BOTH(vfcmpgeh, vfcmpge_scalarh, 2, float16, DO_GE16)
+DO_VCMP_FP_BOTH(vfcmpges, vfcmpge_scalars, 4, float32, DO_GE32)
+
+DO_VCMP_FP_BOTH(vfcmplth, vfcmplt_scalarh, 2, float16, !DO_GE16)
+DO_VCMP_FP_BOTH(vfcmplts, vfcmplt_scalars, 4, float32, !DO_GE32)
+
+DO_VCMP_FP_BOTH(vfcmpgth, vfcmpgt_scalarh, 2, float16, DO_GT16)
+DO_VCMP_FP_BOTH(vfcmpgts, vfcmpgt_scalars, 4, float32, DO_GT32)
+
+DO_VCMP_FP_BOTH(vfcmpleh, vfcmple_scalarh, 2, float16, !DO_GT16)
+DO_VCMP_FP_BOTH(vfcmples, vfcmple_scalars, 4, float32, !DO_GT32)
+
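+/*
+ * VCVT between floating point and fixed point, with 'shift' fraction bits;
+ * the float-to-integer directions use the round-to-zero helpers.
+ */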
+#define DO_VCVT_FIXED(OP, ESIZE, TYPE, FN) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vm, \
+ uint32_t shift) \
+ { \
+ TYPE *d = vd, *m = vm; \
+ TYPE r; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ float_status *fpst; \
+ float_status scratch_fpst; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \
+ continue; \
+ } \
+ fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \
+ &env->vfp.standard_fp_status; \
+ if (!(mask & 1)) { \
+ /* We need the result but without updating flags */ \
+ scratch_fpst = *fpst; \
+ fpst = &scratch_fpst; \
+ } \
+ r = FN(m[H##ESIZE(e)], shift, fpst); \
+ mergemask(&d[H##ESIZE(e)], r, mask); \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+DO_VCVT_FIXED(vcvt_sh, 2, int16_t, helper_vfp_shtoh)
+DO_VCVT_FIXED(vcvt_uh, 2, uint16_t, helper_vfp_uhtoh)
+DO_VCVT_FIXED(vcvt_hs, 2, int16_t, helper_vfp_toshh_round_to_zero)
+DO_VCVT_FIXED(vcvt_hu, 2, uint16_t, helper_vfp_touhh_round_to_zero)
+DO_VCVT_FIXED(vcvt_sf, 4, int32_t, helper_vfp_sltos)
+DO_VCVT_FIXED(vcvt_uf, 4, uint32_t, helper_vfp_ultos)
+DO_VCVT_FIXED(vcvt_fs, 4, int32_t, helper_vfp_tosls_round_to_zero)
+DO_VCVT_FIXED(vcvt_fu, 4, uint32_t, helper_vfp_touls_round_to_zero)
+
+/* VCVT with specified rmode */
+#define DO_VCVT_RMODE(OP, ESIZE, TYPE, FN) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, \
+ void *vd, void *vm, uint32_t rmode) \
+ { \
+ TYPE *d = vd, *m = vm; \
+ TYPE r; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ float_status *fpst; \
+ float_status scratch_fpst; \
+ float_status *base_fpst = (ESIZE == 2) ? \
+ &env->vfp.standard_fp_status_f16 : \
+ &env->vfp.standard_fp_status; \
+ uint32_t prev_rmode = get_float_rounding_mode(base_fpst); \
+ set_float_rounding_mode(rmode, base_fpst); \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \
+ continue; \
+ } \
+ fpst = base_fpst; \
+ if (!(mask & 1)) { \
+ /* We need the result but without updating flags */ \
+ scratch_fpst = *fpst; \
+ fpst = &scratch_fpst; \
+ } \
+ r = FN(m[H##ESIZE(e)], 0, fpst); \
+ mergemask(&d[H##ESIZE(e)], r, mask); \
+ } \
+ set_float_rounding_mode(prev_rmode, base_fpst); \
+ mve_advance_vpt(env); \
+ }
+
+DO_VCVT_RMODE(vcvt_rm_sh, 2, uint16_t, helper_vfp_toshh)
+DO_VCVT_RMODE(vcvt_rm_uh, 2, uint16_t, helper_vfp_touhh)
+DO_VCVT_RMODE(vcvt_rm_ss, 4, uint32_t, helper_vfp_tosls)
+DO_VCVT_RMODE(vcvt_rm_us, 4, uint32_t, helper_vfp_touls)
+
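+/*
+ * VRINT with a specified rounding mode reuses the DO_VCVT_RMODE pattern;
+ * the middle macro argument is the unused shift slot, always passed as 0.
+ */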
+#define DO_VRINT_RM_H(M, F, S) helper_rinth(M, S)
+#define DO_VRINT_RM_S(M, F, S) helper_rints(M, S)
+
+DO_VCVT_RMODE(vrint_rm_h, 2, uint16_t, DO_VRINT_RM_H)
+DO_VCVT_RMODE(vrint_rm_s, 4, uint32_t, DO_VRINT_RM_S)
+
+/*
+ * VCVT between halfprec and singleprec. As usual for halfprec
+ * conversions, FZ16 is ignored and AHP is observed.
+ */
+static void do_vcvt_sh(CPUARMState *env, void *vd, void *vm, int top)
+{
+ uint16_t *d = vd;
+ uint32_t *m = vm;
+ uint16_t r;
+ uint16_t mask = mve_element_mask(env);
+ bool ieee = !(env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_AHP);
+ unsigned e;
+ float_status *fpst;
+ float_status scratch_fpst;
+ float_status *base_fpst = &env->vfp.standard_fp_status;
+ bool old_fz = get_flush_to_zero(base_fpst);
+ set_flush_to_zero(false, base_fpst);
+ for (e = 0; e < 16 / 4; e++, mask >>= 4) {
+ if ((mask & MAKE_64BIT_MASK(0, 4)) == 0) {
+ continue;
+ }
+ fpst = base_fpst;
+ if (!(mask & 1)) {
+ /* We need the result but without updating flags */
+ scratch_fpst = *fpst;
+ fpst = &scratch_fpst;
+ }
+ r = float32_to_float16(m[H4(e)], ieee, fpst);
+ mergemask(&d[H2(e * 2 + top)], r, mask >> (top * 2));
+ }
+ set_flush_to_zero(old_fz, base_fpst);
+ mve_advance_vpt(env);
+}
+
+static void do_vcvt_hs(CPUARMState *env, void *vd, void *vm, int top)
+{
+ uint32_t *d = vd;
+ uint16_t *m = vm;
+ uint32_t r;
+ uint16_t mask = mve_element_mask(env);
+ bool ieee = !(env->vfp.xregs[ARM_VFP_FPSCR] & FPCR_AHP);
+ unsigned e;
+ float_status *fpst;
+ float_status scratch_fpst;
+ float_status *base_fpst = &env->vfp.standard_fp_status;
+ bool old_fiz = get_flush_inputs_to_zero(base_fpst);
+ set_flush_inputs_to_zero(false, base_fpst);
+ for (e = 0; e < 16 / 4; e++, mask >>= 4) {
+ if ((mask & MAKE_64BIT_MASK(0, 4)) == 0) {
+ continue;
+ }
+ fpst = base_fpst;
+ if (!(mask & (1 << (top * 2)))) {
+ /* We need the result but without updating flags */
+ scratch_fpst = *fpst;
+ fpst = &scratch_fpst;
+ }
+ r = float16_to_float32(m[H2(e * 2 + top)], ieee, fpst);
+ mergemask(&d[H4(e)], r, mask);
+ }
+ set_flush_inputs_to_zero(old_fiz, base_fpst);
+ mve_advance_vpt(env);
+}
+
+void HELPER(mve_vcvtb_sh)(CPUARMState *env, void *vd, void *vm)
+{
+ do_vcvt_sh(env, vd, vm, 0);
+}
+void HELPER(mve_vcvtt_sh)(CPUARMState *env, void *vd, void *vm)
+{
+ do_vcvt_sh(env, vd, vm, 1);
+}
+void HELPER(mve_vcvtb_hs)(CPUARMState *env, void *vd, void *vm)
+{
+ do_vcvt_hs(env, vd, vm, 0);
+}
+void HELPER(mve_vcvtt_hs)(CPUARMState *env, void *vd, void *vm)
+{
+ do_vcvt_hs(env, vd, vm, 1);
+}
+
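+/* 1-operand FP ops; currently only VRINTX (round to integral) */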
+#define DO_1OP_FP(OP, ESIZE, TYPE, FN) \
+ void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vm) \
+ { \
+ TYPE *d = vd, *m = vm; \
+ TYPE r; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ float_status *fpst; \
+ float_status scratch_fpst; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ if ((mask & MAKE_64BIT_MASK(0, ESIZE)) == 0) { \
+ continue; \
+ } \
+ fpst = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 : \
+ &env->vfp.standard_fp_status; \
+ if (!(mask & 1)) { \
+ /* We need the result but without updating flags */ \
+ scratch_fpst = *fpst; \
+ fpst = &scratch_fpst; \
+ } \
+ r = FN(m[H##ESIZE(e)], fpst); \
+ mergemask(&d[H##ESIZE(e)], r, mask); \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+DO_1OP_FP(vrintx_h, 2, float16, float16_round_to_int)
+DO_1OP_FP(vrintx_s, 4, float32, float32_round_to_int)
--- /dev/null
+/*
+ * ARM NEON vector operations.
+ *
+ * Copyright (c) 2007, 2008 CodeSourcery.
+ * Written by Paul Brook
+ *
+ * This code is licensed under the GNU GPL v2.
+ */
+#include "qemu/osdep.h"
+
+#include "cpu.h"
+#include "exec/helper-proto.h"
+#include "fpu/softfloat.h"
+#include "vec_internal.h"
+
+#define SIGNBIT (uint32_t)0x80000000
+#define SIGNBIT64 ((uint64_t)1 << 63)
+
+#define SET_QC() env->vfp.qc[0] = 1
+
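+/*
+ * Structure types holding 1, 2 or 4 lanes of a packed 32-bit value.
+ * On big-endian hosts the member order is reversed so that v1 always
+ * maps to the least significant lane of the uint32_t.
+ */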
+#define NEON_TYPE1(name, type) \
+typedef struct \
+{ \
+ type v1; \
+} neon_##name;
+#if HOST_BIG_ENDIAN
+#define NEON_TYPE2(name, type) \
+typedef struct \
+{ \
+ type v2; \
+ type v1; \
+} neon_##name;
+#define NEON_TYPE4(name, type) \
+typedef struct \
+{ \
+ type v4; \
+ type v3; \
+ type v2; \
+ type v1; \
+} neon_##name;
+#else
+#define NEON_TYPE2(name, type) \
+typedef struct \
+{ \
+ type v1; \
+ type v2; \
+} neon_##name;
+#define NEON_TYPE4(name, type) \
+typedef struct \
+{ \
+ type v1; \
+ type v2; \
+ type v3; \
+ type v4; \
+} neon_##name;
+#endif
+
+NEON_TYPE4(s8, int8_t)
+NEON_TYPE4(u8, uint8_t)
+NEON_TYPE2(s16, int16_t)
+NEON_TYPE2(u16, uint16_t)
+NEON_TYPE1(s32, int32_t)
+NEON_TYPE1(u32, uint32_t)
+#undef NEON_TYPE4
+#undef NEON_TYPE2
+#undef NEON_TYPE1
+
+/* Copy from a uint32_t to a vector structure type. */
+#define NEON_UNPACK(vtype, dest, val) do { \
+ union { \
+ vtype v; \
+ uint32_t i; \
+ } conv_u; \
+ conv_u.i = (val); \
+ dest = conv_u.v; \
+ } while(0)
+
+/* Copy from a vector structure type to a uint32_t. */
+#define NEON_PACK(vtype, dest, val) do { \
+ union { \
+ vtype v; \
+ uint32_t i; \
+ } conv_u; \
+ conv_u.v = (val); \
+ dest = conv_u.i; \
+ } while(0)
+
+#define NEON_DO1 \
+ NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
+#define NEON_DO2 \
+ NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
+ NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
+#define NEON_DO4 \
+ NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
+ NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
+ NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
+ NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);
+
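+/*
+ * NEON_VOP and friends expand to helpers that unpack the packed 32-bit
+ * arguments into lanes, apply NEON_FN to each lane, and repack the result.
+ */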
+#define NEON_VOP_BODY(vtype, n) \
+{ \
+ uint32_t res; \
+ vtype vsrc1; \
+ vtype vsrc2; \
+ vtype vdest; \
+ NEON_UNPACK(vtype, vsrc1, arg1); \
+ NEON_UNPACK(vtype, vsrc2, arg2); \
+ NEON_DO##n; \
+ NEON_PACK(vtype, res, vdest); \
+ return res; \
+}
+
+#define NEON_VOP(name, vtype, n) \
+uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
+NEON_VOP_BODY(vtype, n)
+
+#define NEON_VOP_ENV(name, vtype, n) \
+uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
+NEON_VOP_BODY(vtype, n)
+
+/*
+ * Pairwise operations.
+ * For 32-bit elements each segment contains only a single element, so
+ * the elementwise and pairwise operations are the same.
+ */
+#define NEON_PDO2 \
+ NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
+ NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
+#define NEON_PDO4 \
+ NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
+ NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
+ NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
+ NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \
+
+#define NEON_POP(name, vtype, n) \
+uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
+{ \
+ uint32_t res; \
+ vtype vsrc1; \
+ vtype vsrc2; \
+ vtype vdest; \
+ NEON_UNPACK(vtype, vsrc1, arg1); \
+ NEON_UNPACK(vtype, vsrc2, arg2); \
+ NEON_PDO##n; \
+ NEON_PACK(vtype, res, vdest); \
+ return res; \
+}
+
+/* Unary operators. */
+#define NEON_VOP1(name, vtype, n) \
+uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
+{ \
+ vtype vsrc1; \
+ vtype vdest; \
+ NEON_UNPACK(vtype, vsrc1, arg); \
+ NEON_DO##n; \
+ NEON_PACK(vtype, arg, vdest); \
+ return arg; \
+}
+
+
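+/*
+ * Unsigned saturating add: if the widened sum does not fit back into the
+ * element type, set QC and saturate to all-ones.
+ */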
+#define NEON_USAT(dest, src1, src2, type) do { \
+ uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
+ if (tmp != (type)tmp) { \
+ SET_QC(); \
+ dest = ~0; \
+ } else { \
+ dest = tmp; \
+ }} while(0)
+#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
+NEON_VOP_ENV(qadd_u8, neon_u8, 4)
+#undef NEON_FN
+#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
+NEON_VOP_ENV(qadd_u16, neon_u16, 2)
+#undef NEON_FN
+#undef NEON_USAT
+
+uint32_t HELPER(neon_qadd_u32)(CPUARMState *env, uint32_t a, uint32_t b)
+{
+ uint32_t res = a + b;
+ if (res < a) {
+ SET_QC();
+ res = ~0;
+ }
+ return res;
+}
+
+uint64_t HELPER(neon_qadd_u64)(CPUARMState *env, uint64_t src1, uint64_t src2)
+{
+ uint64_t res;
+
+ res = src1 + src2;
+ if (res < src1) {
+ SET_QC();
+ res = ~(uint64_t)0;
+ }
+ return res;
+}
+
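+/*
+ * Signed saturating add: on overflow set QC and saturate towards the
+ * appropriate extreme, chosen by the sign of the second operand.
+ */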
+#define NEON_SSAT(dest, src1, src2, type) do { \
+ int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
+ if (tmp != (type)tmp) { \
+ SET_QC(); \
+ if (src2 > 0) { \
+ tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
+ } else { \
+ tmp = 1 << (sizeof(type) * 8 - 1); \
+ } \
+ } \
+ dest = tmp; \
+ } while(0)
+#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
+NEON_VOP_ENV(qadd_s8, neon_s8, 4)
+#undef NEON_FN
+#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
+NEON_VOP_ENV(qadd_s16, neon_s16, 2)
+#undef NEON_FN
+#undef NEON_SSAT
+
+uint32_t HELPER(neon_qadd_s32)(CPUARMState *env, uint32_t a, uint32_t b)
+{
+ uint32_t res = a + b;
+ if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) {
+ SET_QC();
+ res = ~(((int32_t)a >> 31) ^ SIGNBIT);
+ }
+ return res;
+}
+
+uint64_t HELPER(neon_qadd_s64)(CPUARMState *env, uint64_t src1, uint64_t src2)
+{
+ uint64_t res;
+
+ res = src1 + src2;
+ if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) {
+ SET_QC();
+ res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
+ }
+ return res;
+}
+
+/* Unsigned saturating accumulate of a signed value
+ *
+ * Op1/Rn is treated as signed
+ * Op2/Rd is treated as unsigned
+ *
+ * Explicit casting is used to ensure the correct sign extension of
+ * inputs. The result is treated as an unsigned value and saturated as such.
+ *
+ * We use a macro for the 8/16 bit cases which expects signed integers va,
+ * vb and vr for the interim calculation and an unsigned 32 bit result
+ * value r.
+ */
+
+#define USATACC(bits, shift) \
+ do { \
+ va = sextract32(a, shift, bits); \
+ vb = extract32(b, shift, bits); \
+ vr = va + vb; \
+ if (vr > UINT##bits##_MAX) { \
+ SET_QC(); \
+ vr = UINT##bits##_MAX; \
+ } else if (vr < 0) { \
+ SET_QC(); \
+ vr = 0; \
+ } \
+ r = deposit32(r, shift, bits, vr); \
+ } while (0)
+
+uint32_t HELPER(neon_uqadd_s8)(CPUARMState *env, uint32_t a, uint32_t b)
+{
+ int16_t va, vb, vr;
+ uint32_t r = 0;
+
+ USATACC(8, 0);
+ USATACC(8, 8);
+ USATACC(8, 16);
+ USATACC(8, 24);
+ return r;
+}
+
+uint32_t HELPER(neon_uqadd_s16)(CPUARMState *env, uint32_t a, uint32_t b)
+{
+ int32_t va, vb, vr;
+ uint64_t r = 0;
+
+ USATACC(16, 0);
+ USATACC(16, 16);
+ return r;
+}
+
+#undef USATACC
+
+uint32_t HELPER(neon_uqadd_s32)(CPUARMState *env, uint32_t a, uint32_t b)
+{
+ int64_t va = (int32_t)a;
+ int64_t vb = (uint32_t)b;
+ int64_t vr = va + vb;
+ if (vr > UINT32_MAX) {
+ SET_QC();
+ vr = UINT32_MAX;
+ } else if (vr < 0) {
+ SET_QC();
+ vr = 0;
+ }
+ return vr;
+}
+
+uint64_t HELPER(neon_uqadd_s64)(CPUARMState *env, uint64_t a, uint64_t b)
+{
+ uint64_t res;
+ res = a + b;
+ /* We only need to look at the pattern of SIGN bits to detect
+ * +ve/-ve saturation
+ */
+ if (~a & b & ~res & SIGNBIT64) {
+ SET_QC();
+ res = UINT64_MAX;
+ } else if (a & ~b & res & SIGNBIT64) {
+ SET_QC();
+ res = 0;
+ }
+ return res;
+}
+
+/* Signed saturating accumulate of an unsigned value
+ *
+ * Op1/Rn is treated as unsigned
+ * Op2/Rd is treated as signed
+ *
+ * The result is treated as a signed value and saturated as such.
+ *
+ * We use a macro for the 8/16 bit cases which expects signed integers va,
+ * vb and vr for the interim calculation and an unsigned 32 bit result
+ * value r.
+ */
+
+#define SSATACC(bits, shift) \
+ do { \
+ va = extract32(a, shift, bits); \
+ vb = sextract32(b, shift, bits); \
+ vr = va + vb; \
+ if (vr > INT##bits##_MAX) { \
+ SET_QC(); \
+ vr = INT##bits##_MAX; \
+ } else if (vr < INT##bits##_MIN) { \
+ SET_QC(); \
+ vr = INT##bits##_MIN; \
+ } \
+ r = deposit32(r, shift, bits, vr); \
+ } while (0)
+
+uint32_t HELPER(neon_sqadd_u8)(CPUARMState *env, uint32_t a, uint32_t b)
+{
+ int16_t va, vb, vr;
+ uint32_t r = 0;
+
+ SSATACC(8, 0);
+ SSATACC(8, 8);
+ SSATACC(8, 16);
+ SSATACC(8, 24);
+ return r;
+}
+
+uint32_t HELPER(neon_sqadd_u16)(CPUARMState *env, uint32_t a, uint32_t b)
+{
+ int32_t va, vb, vr;
+ uint32_t r = 0;
+
+ SSATACC(16, 0);
+ SSATACC(16, 16);
+
+ return r;
+}
+
+#undef SSATACC
+
+uint32_t HELPER(neon_sqadd_u32)(CPUARMState *env, uint32_t a, uint32_t b)
+{
+ int64_t res;
+ int64_t op1 = (uint32_t)a;
+ int64_t op2 = (int32_t)b;
+ res = op1 + op2;
+ if (res > INT32_MAX) {
+ SET_QC();
+ res = INT32_MAX;
+ } else if (res < INT32_MIN) {
+ SET_QC();
+ res = INT32_MIN;
+ }
+ return res;
+}
+
+uint64_t HELPER(neon_sqadd_u64)(CPUARMState *env, uint64_t a, uint64_t b)
+{
+ uint64_t res;
+ res = a + b;
+ /* We only need to look at the pattern of SIGN bits to detect an overflow */
+ if (((a & res)
+ | (~b & res)
+ | (a & ~b)) & SIGNBIT64) {
+ SET_QC();
+ res = INT64_MAX;
+ }
+ return res;
+}
+
+
+#define NEON_USAT(dest, src1, src2, type) do { \
+ uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
+ if (tmp != (type)tmp) { \
+ SET_QC(); \
+ dest = 0; \
+ } else { \
+ dest = tmp; \
+ }} while(0)
+#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
+NEON_VOP_ENV(qsub_u8, neon_u8, 4)
+#undef NEON_FN
+#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
+NEON_VOP_ENV(qsub_u16, neon_u16, 2)
+#undef NEON_FN
+#undef NEON_USAT
+
+uint32_t HELPER(neon_qsub_u32)(CPUARMState *env, uint32_t a, uint32_t b)
+{
+ uint32_t res = a - b;
+ if (res > a) {
+ SET_QC();
+ res = 0;
+ }
+ return res;
+}
+
+uint64_t HELPER(neon_qsub_u64)(CPUARMState *env, uint64_t src1, uint64_t src2)
+{
+ uint64_t res;
+
+ if (src1 < src2) {
+ SET_QC();
+ res = 0;
+ } else {
+ res = src1 - src2;
+ }
+ return res;
+}
+
+#define NEON_SSAT(dest, src1, src2, type) do { \
+ int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
+ if (tmp != (type)tmp) { \
+ SET_QC(); \
+ if (src2 < 0) { \
+ tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
+ } else { \
+ tmp = 1 << (sizeof(type) * 8 - 1); \
+ } \
+ } \
+ dest = tmp; \
+ } while(0)
+#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
+NEON_VOP_ENV(qsub_s8, neon_s8, 4)
+#undef NEON_FN
+#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
+NEON_VOP_ENV(qsub_s16, neon_s16, 2)
+#undef NEON_FN
+#undef NEON_SSAT
+
+uint32_t HELPER(neon_qsub_s32)(CPUARMState *env, uint32_t a, uint32_t b)
+{
+ uint32_t res = a - b;
+ if (((res ^ a) & SIGNBIT) && ((a ^ b) & SIGNBIT)) {
+ SET_QC();
+ res = ~(((int32_t)a >> 31) ^ SIGNBIT);
+ }
+ return res;
+}
+
+uint64_t HELPER(neon_qsub_s64)(CPUARMState *env, uint64_t src1, uint64_t src2)
+{
+ uint64_t res;
+
+ res = src1 - src2;
+ if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) {
+ SET_QC();
+ res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
+ }
+ return res;
+}
+
+#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
+NEON_VOP(hadd_s8, neon_s8, 4)
+NEON_VOP(hadd_u8, neon_u8, 4)
+NEON_VOP(hadd_s16, neon_s16, 2)
+NEON_VOP(hadd_u16, neon_u16, 2)
+#undef NEON_FN
+
+int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
+{
+ int32_t dest;
+
+ dest = (src1 >> 1) + (src2 >> 1);
+ if (src1 & src2 & 1)
+ dest++;
+ return dest;
+}
+
+uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
+{
+ uint32_t dest;
+
+ dest = (src1 >> 1) + (src2 >> 1);
+ if (src1 & src2 & 1)
+ dest++;
+ return dest;
+}
+
+#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
+NEON_VOP(rhadd_s8, neon_s8, 4)
+NEON_VOP(rhadd_u8, neon_u8, 4)
+NEON_VOP(rhadd_s16, neon_s16, 2)
+NEON_VOP(rhadd_u16, neon_u16, 2)
+#undef NEON_FN
+
+int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
+{
+ int32_t dest;
+
+ dest = (src1 >> 1) + (src2 >> 1);
+ if ((src1 | src2) & 1)
+ dest++;
+ return dest;
+}
+
+uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
+{
+ uint32_t dest;
+
+ dest = (src1 >> 1) + (src2 >> 1);
+ if ((src1 | src2) & 1)
+ dest++;
+ return dest;
+}
+
+#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
+NEON_VOP(hsub_s8, neon_s8, 4)
+NEON_VOP(hsub_u8, neon_u8, 4)
+NEON_VOP(hsub_s16, neon_s16, 2)
+NEON_VOP(hsub_u16, neon_u16, 2)
+#undef NEON_FN
+
+int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
+{
+ int32_t dest;
+
+ dest = (src1 >> 1) - (src2 >> 1);
+ if ((~src1) & src2 & 1)
+ dest--;
+ return dest;
+}
+
+uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
+{
+ uint32_t dest;
+
+ dest = (src1 >> 1) - (src2 >> 1);
+ if ((~src1) & src2 & 1)
+ dest--;
+ return dest;
+}
+
+#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
+NEON_POP(pmin_s8, neon_s8, 4)
+NEON_POP(pmin_u8, neon_u8, 4)
+NEON_POP(pmin_s16, neon_s16, 2)
+NEON_POP(pmin_u16, neon_u16, 2)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
+NEON_POP(pmax_s8, neon_s8, 4)
+NEON_POP(pmax_u8, neon_u8, 4)
+NEON_POP(pmax_s16, neon_s16, 2)
+NEON_POP(pmax_u16, neon_u16, 2)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
+NEON_VOP(shl_u16, neon_u16, 2)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
+NEON_VOP(shl_s16, neon_s16, 2)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
+NEON_VOP(rshl_s8, neon_s8, 4)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
+NEON_VOP(rshl_s16, neon_s16, 2)
+#undef NEON_FN
+
+uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
+{
+ return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
+}
+
+uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
+{
+ return do_sqrshl_d(val, (int8_t)shift, true, NULL);
+}
+
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
+NEON_VOP(rshl_u8, neon_u8, 4)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
+NEON_VOP(rshl_u16, neon_u16, 2)
+#undef NEON_FN
+
+uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
+{
+ return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
+}
+
+uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
+{
+ return do_uqrshl_d(val, (int8_t)shift, true, NULL);
+}
+
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
+NEON_VOP_ENV(qshl_u8, neon_u8, 4)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
+NEON_VOP_ENV(qshl_u16, neon_u16, 2)
+#undef NEON_FN
+
+uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
+{
+ return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
+}
+
+uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
+{
+ return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
+}
+
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
+NEON_VOP_ENV(qshl_s8, neon_s8, 4)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
+NEON_VOP_ENV(qshl_s16, neon_s16, 2)
+#undef NEON_FN
+
+uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
+{
+ return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
+}
+
+uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
+{
+ return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
+}
+
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
+NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
+NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
+#undef NEON_FN
+
+uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
+{
+ return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
+}
+
+uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
+{
+ return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
+}
+
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
+NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
+NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
+#undef NEON_FN
+
+uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
+{
+ return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
+}
+
+uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
+{
+ return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
+}
+
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
+NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
+NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
+#undef NEON_FN
+
+uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
+{
+ return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
+}
+
+uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
+{
+ return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
+}
+
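+/*
+ * Lane-wise add of packed 8-bit or 16-bit values: mask off each lane's top
+ * bit so carries cannot cross lane boundaries, add, then recompute the top
+ * bits with XOR.
+ */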
+uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
+{
+ uint32_t mask;
+ mask = (a ^ b) & 0x80808080u;
+ a &= ~0x80808080u;
+ b &= ~0x80808080u;
+ return (a + b) ^ mask;
+}
+
+uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
+{
+ uint32_t mask;
+ mask = (a ^ b) & 0x80008000u;
+ a &= ~0x80008000u;
+ b &= ~0x80008000u;
+ return (a + b) ^ mask;
+}
+
+#define NEON_FN(dest, src1, src2) dest = src1 + src2
+NEON_POP(padd_u8, neon_u8, 4)
+NEON_POP(padd_u16, neon_u16, 2)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) dest = src1 - src2
+NEON_VOP(sub_u8, neon_u8, 4)
+NEON_VOP(sub_u16, neon_u16, 2)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) dest = src1 * src2
+NEON_VOP(mul_u8, neon_u8, 4)
+NEON_VOP(mul_u16, neon_u16, 2)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
+NEON_VOP(tst_u8, neon_u8, 4)
+NEON_VOP(tst_u16, neon_u16, 2)
+NEON_VOP(tst_u32, neon_u32, 1)
+#undef NEON_FN
+
+/* Count Leading Sign/Zero Bits. */
+static inline int do_clz8(uint8_t x)
+{
+ int n;
+ for (n = 8; x; n--)
+ x >>= 1;
+ return n;
+}
+
+static inline int do_clz16(uint16_t x)
+{
+ int n;
+ for (n = 16; x; n--)
+ x >>= 1;
+ return n;
+}
+
+#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
+NEON_VOP1(clz_u8, neon_u8, 4)
+#undef NEON_FN
+
+#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
+NEON_VOP1(clz_u16, neon_u16, 2)
+#undef NEON_FN
+
+#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
+NEON_VOP1(cls_s8, neon_s8, 4)
+#undef NEON_FN
+
+#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
+NEON_VOP1(cls_s16, neon_s16, 2)
+#undef NEON_FN
+
+uint32_t HELPER(neon_cls_s32)(uint32_t x)
+{
+ int count;
+ if ((int32_t)x < 0)
+ x = ~x;
+ for (count = 32; x; count--)
+ x = x >> 1;
+ return count - 1;
+}
+
+/* Bit count. */
+uint32_t HELPER(neon_cnt_u8)(uint32_t x)
+{
+ x = (x & 0x55555555) + ((x >> 1) & 0x55555555);
+ x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
+ x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f);
+ return x;
+}
+
+/* Reverse bits in each 8 bit word */
+uint32_t HELPER(neon_rbit_u8)(uint32_t x)
+{
+ x = ((x & 0xf0f0f0f0) >> 4)
+ | ((x & 0x0f0f0f0f) << 4);
+ x = ((x & 0x88888888) >> 3)
+ | ((x & 0x44444444) >> 1)
+ | ((x & 0x22222222) << 1)
+ | ((x & 0x11111111) << 3);
+ return x;
+}
+
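+/*
+ * Signed saturating doubling multiply returning the high half (VQDMULH);
+ * the rounding form (VQRDMULH) adds 1 << (esize - 1) before taking the
+ * high half. Overflow in the doubling or rounding step sets QC and
+ * saturates.
+ */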
+#define NEON_QDMULH16(dest, src1, src2, round) do { \
+ uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
+ if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
+ SET_QC(); \
+ tmp = (tmp >> 31) ^ ~SIGNBIT; \
+ } else { \
+ tmp <<= 1; \
+ } \
+ if (round) { \
+ int32_t old = tmp; \
+ tmp += 1 << 15; \
+ if ((int32_t)tmp < old) { \
+ SET_QC(); \
+ tmp = SIGNBIT - 1; \
+ } \
+ } \
+ dest = tmp >> 16; \
+ } while(0)
+#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
+NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
+#undef NEON_FN
+#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
+NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
+#undef NEON_FN
+#undef NEON_QDMULH16
+
+#define NEON_QDMULH32(dest, src1, src2, round) do { \
+ uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
+ if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
+ SET_QC(); \
+ tmp = (tmp >> 63) ^ ~SIGNBIT64; \
+ } else { \
+ tmp <<= 1; \
+ } \
+ if (round) { \
+ int64_t old = tmp; \
+ tmp += (int64_t)1 << 31; \
+ if ((int64_t)tmp < old) { \
+ SET_QC(); \
+ tmp = SIGNBIT64 - 1; \
+ } \
+ } \
+ dest = tmp >> 32; \
+ } while(0)
+#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
+NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
+#undef NEON_FN
+#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
+NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
+#undef NEON_FN
+#undef NEON_QDMULH32
+
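+/*
+ * Narrowing ops: pack half-width results from each element of a 64-bit
+ * vector into 32 bits. The _sat variants saturate and set QC; 'unarrow'
+ * narrows a signed input to an unsigned saturated result.
+ */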
+uint32_t HELPER(neon_narrow_u8)(uint64_t x)
+{
+ return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
+ | ((x >> 24) & 0xff000000u);
+}
+
+uint32_t HELPER(neon_narrow_u16)(uint64_t x)
+{
+ return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
+}
+
+uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
+{
+ return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
+ | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
+}
+
+uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
+{
+ return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
+}
+
+uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
+{
+ x &= 0xff80ff80ff80ff80ull;
+ x += 0x0080008000800080ull;
+ return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
+ | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
+}
+
+uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
+{
+ x &= 0xffff8000ffff8000ull;
+ x += 0x0000800000008000ull;
+ return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
+}
+
+uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
+{
+ uint16_t s;
+ uint8_t d;
+ uint32_t res = 0;
+#define SAT8(n) \
+ s = x >> n; \
+ if (s & 0x8000) { \
+ SET_QC(); \
+ } else { \
+ if (s > 0xff) { \
+ d = 0xff; \
+ SET_QC(); \
+ } else { \
+ d = s; \
+ } \
+ res |= (uint32_t)d << (n / 2); \
+ }
+
+ SAT8(0);
+ SAT8(16);
+ SAT8(32);
+ SAT8(48);
+#undef SAT8
+ return res;
+}
+
+uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
+{
+ uint16_t s;
+ uint8_t d;
+ uint32_t res = 0;
+#define SAT8(n) \
+ s = x >> n; \
+ if (s > 0xff) { \
+ d = 0xff; \
+ SET_QC(); \
+ } else { \
+ d = s; \
+ } \
+ res |= (uint32_t)d << (n / 2);
+
+ SAT8(0);
+ SAT8(16);
+ SAT8(32);
+ SAT8(48);
+#undef SAT8
+ return res;
+}
+
+uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
+{
+ int16_t s;
+ uint8_t d;
+ uint32_t res = 0;
+#define SAT8(n) \
+ s = x >> n; \
+ if (s != (int8_t)s) { \
+ d = (s >> 15) ^ 0x7f; \
+ SET_QC(); \
+ } else { \
+ d = s; \
+ } \
+ res |= (uint32_t)d << (n / 2);
+
+ SAT8(0);
+ SAT8(16);
+ SAT8(32);
+ SAT8(48);
+#undef SAT8
+ return res;
+}
+
+uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
+{
+ uint32_t high;
+ uint32_t low;
+ low = x;
+ if (low & 0x80000000) {
+ low = 0;
+ SET_QC();
+ } else if (low > 0xffff) {
+ low = 0xffff;
+ SET_QC();
+ }
+ high = x >> 32;
+ if (high & 0x80000000) {
+ high = 0;
+ SET_QC();
+ } else if (high > 0xffff) {
+ high = 0xffff;
+ SET_QC();
+ }
+ return low | (high << 16);
+}
+
+uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
+{
+ uint32_t high;
+ uint32_t low;
+ low = x;
+ if (low > 0xffff) {
+ low = 0xffff;
+ SET_QC();
+ }
+ high = x >> 32;
+ if (high > 0xffff) {
+ high = 0xffff;
+ SET_QC();
+ }
+ return low | (high << 16);
+}
+
+uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
+{
+ int32_t low;
+ int32_t high;
+ low = x;
+ if (low != (int16_t)low) {
+ low = (low >> 31) ^ 0x7fff;
+ SET_QC();
+ }
+ high = x >> 32;
+ if (high != (int16_t)high) {
+ high = (high >> 31) ^ 0x7fff;
+ SET_QC();
+ }
+ return (uint16_t)low | (high << 16);
+}
+
+uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
+{
+ if (x & 0x8000000000000000ull) {
+ SET_QC();
+ return 0;
+ }
+ if (x > 0xffffffffu) {
+ SET_QC();
+ return 0xffffffffu;
+ }
+ return x;
+}
+
+uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
+{
+ if (x > 0xffffffffu) {
+ SET_QC();
+ return 0xffffffffu;
+ }
+ return x;
+}
+
+uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
+{
+ if ((int64_t)x != (int32_t)x) {
+ SET_QC();
+ return ((int64_t)x >> 63) ^ 0x7fffffff;
+ }
+ return x;
+}
+
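+/*
+ * Widening ops: zero- or sign-extend each element of a packed 32-bit
+ * value into a 64-bit vector.
+ */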
+uint64_t HELPER(neon_widen_u8)(uint32_t x)
+{
+ uint64_t tmp;
+ uint64_t ret;
+ ret = (uint8_t)x;
+ tmp = (uint8_t)(x >> 8);
+ ret |= tmp << 16;
+ tmp = (uint8_t)(x >> 16);
+ ret |= tmp << 32;
+ tmp = (uint8_t)(x >> 24);
+ ret |= tmp << 48;
+ return ret;
+}
+
+uint64_t HELPER(neon_widen_s8)(uint32_t x)
+{
+ uint64_t tmp;
+ uint64_t ret;
+ ret = (uint16_t)(int8_t)x;
+ tmp = (uint16_t)(int8_t)(x >> 8);
+ ret |= tmp << 16;
+ tmp = (uint16_t)(int8_t)(x >> 16);
+ ret |= tmp << 32;
+ tmp = (uint16_t)(int8_t)(x >> 24);
+ ret |= tmp << 48;
+ return ret;
+}
+
+uint64_t HELPER(neon_widen_u16)(uint32_t x)
+{
+ uint64_t high = (uint16_t)(x >> 16);
+ return ((uint16_t)x) | (high << 32);
+}
+
+uint64_t HELPER(neon_widen_s16)(uint32_t x)
+{
+ uint64_t high = (int16_t)(x >> 16);
+ return ((uint32_t)(int16_t)x) | (high << 32);
+}
+
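+/*
+ * Add/subtract across packed 16-bit or 32-bit lanes of a 64-bit value,
+ * keeping carries and borrows confined to each lane via the sign-bit
+ * masking trick.
+ */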
+uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
+{
+ uint64_t mask;
+ mask = (a ^ b) & 0x8000800080008000ull;
+ a &= ~0x8000800080008000ull;
+ b &= ~0x8000800080008000ull;
+ return (a + b) ^ mask;
+}
+
+uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
+{
+ uint64_t mask;
+ mask = (a ^ b) & 0x8000000080000000ull;
+ a &= ~0x8000000080000000ull;
+ b &= ~0x8000000080000000ull;
+ return (a + b) ^ mask;
+}
+
+uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
+{
+ uint64_t tmp;
+ uint64_t tmp2;
+
+ tmp = a & 0x0000ffff0000ffffull;
+ tmp += (a >> 16) & 0x0000ffff0000ffffull;
+ tmp2 = b & 0xffff0000ffff0000ull;
+ tmp2 += (b << 16) & 0xffff0000ffff0000ull;
+ return ( tmp & 0xffff)
+ | ((tmp >> 16) & 0xffff0000ull)
+ | ((tmp2 << 16) & 0xffff00000000ull)
+ | ( tmp2 & 0xffff000000000000ull);
+}
+
+uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
+{
+ uint32_t low = a + (a >> 32);
+ uint32_t high = b + (b >> 32);
+ return low + ((uint64_t)high << 32);
+}
+
+uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
+{
+ uint64_t mask;
+ mask = (a ^ ~b) & 0x8000800080008000ull;
+ a |= 0x8000800080008000ull;
+ b &= ~0x8000800080008000ull;
+ return (a - b) ^ mask;
+}
+
+uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
+{
+ uint64_t mask;
+ mask = (a ^ ~b) & 0x8000000080000000ull;
+ a |= 0x8000000080000000ull;
+ b &= ~0x8000000080000000ull;
+ return (a - b) ^ mask;
+}
+
+uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
+{
+ uint32_t x, y;
+ uint32_t low, high;
+
+ x = a;
+ y = b;
+ low = x + y;
+ if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
+ SET_QC();
+ low = ((int32_t)x >> 31) ^ ~SIGNBIT;
+ }
+ x = a >> 32;
+ y = b >> 32;
+ high = x + y;
+ if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
+ SET_QC();
+ high = ((int32_t)x >> 31) ^ ~SIGNBIT;
+ }
+ return low | ((uint64_t)high << 32);
+}
+
+uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
+{
+ uint64_t result;
+
+ result = a + b;
+ if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
+ SET_QC();
+ result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
+ }
+ return result;
+}
+
+/* We have to do the arithmetic in a larger type than
+ * the input type, because for example with a signed 32 bit
+ * op the absolute difference can overflow a signed 32 bit value.
+ */
+#define DO_ABD(dest, x, y, intype, arithtype) do { \
+ arithtype tmp_x = (intype)(x); \
+ arithtype tmp_y = (intype)(y); \
+ dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
+ } while(0)
+
+uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
+{
+ uint64_t tmp;
+ uint64_t result;
+ DO_ABD(result, a, b, uint8_t, uint32_t);
+ DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
+ result |= tmp << 16;
+ DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
+ result |= tmp << 32;
+ DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
+ result |= tmp << 48;
+ return result;
+}
+
+uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
+{
+ uint64_t tmp;
+ uint64_t result;
+ DO_ABD(result, a, b, int8_t, int32_t);
+ DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
+ result |= tmp << 16;
+ DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
+ result |= tmp << 32;
+ DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
+ result |= tmp << 48;
+ return result;
+}
+
+uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
+{
+ uint64_t tmp;
+ uint64_t result;
+ DO_ABD(result, a, b, uint16_t, uint32_t);
+ DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
+ return result | (tmp << 32);
+}
+
+uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
+{
+ uint64_t tmp;
+ uint64_t result;
+ DO_ABD(result, a, b, int16_t, int32_t);
+ DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
+ return result | (tmp << 32);
+}
+
+uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
+{
+ uint64_t result;
+ DO_ABD(result, a, b, uint32_t, uint64_t);
+ return result;
+}
+
+uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
+{
+ uint64_t result;
+ DO_ABD(result, a, b, int32_t, int64_t);
+ return result;
+}
+#undef DO_ABD
+
+/* Widening multiply. Named type is the source type. */
+#define DO_MULL(dest, x, y, type1, type2) do { \
+ type1 tmp_x = x; \
+ type1 tmp_y = y; \
+ dest = (type2)((type2)tmp_x * (type2)tmp_y); \
+ } while(0)
+
+uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
+{
+ uint64_t tmp;
+ uint64_t result;
+
+ DO_MULL(result, a, b, uint8_t, uint16_t);
+ DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
+ result |= tmp << 16;
+ DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
+ result |= tmp << 32;
+ DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
+ result |= tmp << 48;
+ return result;
+}
+
+uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
+{
+ uint64_t tmp;
+ uint64_t result;
+
+ DO_MULL(result, a, b, int8_t, uint16_t);
+ DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
+ result |= tmp << 16;
+ DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
+ result |= tmp << 32;
+ DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
+ result |= tmp << 48;
+ return result;
+}
+
+uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
+{
+ uint64_t tmp;
+ uint64_t result;
+
+ DO_MULL(result, a, b, uint16_t, uint32_t);
+ DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
+ return result | (tmp << 32);
+}
+
+uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
+{
+ uint64_t tmp;
+ uint64_t result;
+
+ DO_MULL(result, a, b, int16_t, uint32_t);
+ DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
+ return result | (tmp << 32);
+}
+
+uint64_t HELPER(neon_negl_u16)(uint64_t x)
+{
+ uint16_t tmp;
+ uint64_t result;
+ result = (uint16_t)-x;
+ tmp = -(x >> 16);
+ result |= (uint64_t)tmp << 16;
+ tmp = -(x >> 32);
+ result |= (uint64_t)tmp << 32;
+ tmp = -(x >> 48);
+ result |= (uint64_t)tmp << 48;
+ return result;
+}
+
+uint64_t HELPER(neon_negl_u32)(uint64_t x)
+{
+ uint32_t low = -x;
+ uint32_t high = -(x >> 32);
+ return low | ((uint64_t)high << 32);
+}
+
+/* Saturating sign manipulation. */
+/* ??? Make these use NEON_VOP1 */
+#define DO_QABS8(x) do { \
+ if (x == (int8_t)0x80) { \
+ x = 0x7f; \
+ SET_QC(); \
+ } else if (x < 0) { \
+ x = -x; \
+ }} while (0)
+uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
+{
+ neon_s8 vec;
+ NEON_UNPACK(neon_s8, vec, x);
+ DO_QABS8(vec.v1);
+ DO_QABS8(vec.v2);
+ DO_QABS8(vec.v3);
+ DO_QABS8(vec.v4);
+ NEON_PACK(neon_s8, x, vec);
+ return x;
+}
+#undef DO_QABS8
+
+#define DO_QNEG8(x) do { \
+ if (x == (int8_t)0x80) { \
+ x = 0x7f; \
+ SET_QC(); \
+ } else { \
+ x = -x; \
+ }} while (0)
+uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
+{
+ neon_s8 vec;
+ NEON_UNPACK(neon_s8, vec, x);
+ DO_QNEG8(vec.v1);
+ DO_QNEG8(vec.v2);
+ DO_QNEG8(vec.v3);
+ DO_QNEG8(vec.v4);
+ NEON_PACK(neon_s8, x, vec);
+ return x;
+}
+#undef DO_QNEG8
+
+#define DO_QABS16(x) do { \
+ if (x == (int16_t)0x8000) { \
+ x = 0x7fff; \
+ SET_QC(); \
+ } else if (x < 0) { \
+ x = -x; \
+ }} while (0)
+uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
+{
+ neon_s16 vec;
+ NEON_UNPACK(neon_s16, vec, x);
+ DO_QABS16(vec.v1);
+ DO_QABS16(vec.v2);
+ NEON_PACK(neon_s16, x, vec);
+ return x;
+}
+#undef DO_QABS16
+
+#define DO_QNEG16(x) do { \
+ if (x == (int16_t)0x8000) { \
+ x = 0x7fff; \
+ SET_QC(); \
+ } else { \
+ x = -x; \
+ }} while (0)
+uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
+{
+ neon_s16 vec;
+ NEON_UNPACK(neon_s16, vec, x);
+ DO_QNEG16(vec.v1);
+ DO_QNEG16(vec.v2);
+ NEON_PACK(neon_s16, x, vec);
+ return x;
+}
+#undef DO_QNEG16
+
+uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
+{
+ if (x == SIGNBIT) {
+ SET_QC();
+ x = ~SIGNBIT;
+ } else if ((int32_t)x < 0) {
+ x = -x;
+ }
+ return x;
+}
+
+uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
+{
+ if (x == SIGNBIT) {
+ SET_QC();
+ x = ~SIGNBIT;
+ } else {
+ x = -x;
+ }
+ return x;
+}
+
+uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x)
+{
+ if (x == SIGNBIT64) {
+ SET_QC();
+ x = ~SIGNBIT64;
+ } else if ((int64_t)x < 0) {
+ x = -x;
+ }
+ return x;
+}
+
+uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x)
+{
+ if (x == SIGNBIT64) {
+ SET_QC();
+ x = ~SIGNBIT64;
+ } else {
+ x = -x;
+ }
+ return x;
+}
+
+/* NEON Float helpers. */
+
+/* Floating point comparisons produce an integer result.
+ * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
+ * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
+ */
+uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp)
+{
+ float_status *fpst = fpstp;
+ return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
+}
+
+uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp)
+{
+ float_status *fpst = fpstp;
+ return -float32_le(make_float32(b), make_float32(a), fpst);
+}
+
+uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp)
+{
+ float_status *fpst = fpstp;
+ return -float32_lt(make_float32(b), make_float32(a), fpst);
+}
+
+uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp)
+{
+ float_status *fpst = fpstp;
+ float32 f0 = float32_abs(make_float32(a));
+ float32 f1 = float32_abs(make_float32(b));
+ return -float32_le(f1, f0, fpst);
+}
+
+uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp)
+{
+ float_status *fpst = fpstp;
+ float32 f0 = float32_abs(make_float32(a));
+ float32 f1 = float32_abs(make_float32(b));
+ return -float32_lt(f1, f0, fpst);
+}
+
+uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp)
+{
+ float_status *fpst = fpstp;
+ float64 f0 = float64_abs(make_float64(a));
+ float64 f1 = float64_abs(make_float64(b));
+ return -float64_le(f1, f0, fpst);
+}
+
+uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp)
+{
+ float_status *fpst = fpstp;
+ float64 f0 = float64_abs(make_float64(a));
+ float64 f1 = float64_abs(make_float64(b));
+ return -float64_lt(f1, f0, fpst);
+}
+
+#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))
+
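+/*
+ * VZIP/VUZP write results to both source registers, so each helper
+ * updates rd[] and rm[] in place.
+ */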
+void HELPER(neon_qunzip8)(void *vd, void *vm)
+{
+ uint64_t *rd = vd, *rm = vm;
+ uint64_t zd0 = rd[0], zd1 = rd[1];
+ uint64_t zm0 = rm[0], zm1 = rm[1];
+
+ uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
+ | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
+ | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
+ | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
+ uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
+ | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
+ | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
+ | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
+ uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
+ | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
+ | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
+ | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
+ uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
+ | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
+ | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
+ | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
+
+ rm[0] = m0;
+ rm[1] = m1;
+ rd[0] = d0;
+ rd[1] = d1;
+}
+
+void HELPER(neon_qunzip16)(void *vd, void *vm)
+{
+ uint64_t *rd = vd, *rm = vm;
+ uint64_t zd0 = rd[0], zd1 = rd[1];
+ uint64_t zm0 = rm[0], zm1 = rm[1];
+
+ uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
+ | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
+ uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
+ | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
+ uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
+ | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
+ uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
+ | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
+
+ rm[0] = m0;
+ rm[1] = m1;
+ rd[0] = d0;
+ rd[1] = d1;
+}
+
+void HELPER(neon_qunzip32)(void *vd, void *vm)
+{
+ uint64_t *rd = vd, *rm = vm;
+ uint64_t zd0 = rd[0], zd1 = rd[1];
+ uint64_t zm0 = rm[0], zm1 = rm[1];
+
+ uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
+ uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
+ uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
+ uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);
+
+ rm[0] = m0;
+ rm[1] = m1;
+ rd[0] = d0;
+ rd[1] = d1;
+}
+
+void HELPER(neon_unzip8)(void *vd, void *vm)
+{
+ uint64_t *rd = vd, *rm = vm;
+ uint64_t zd = rd[0], zm = rm[0];
+
+ uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
+ | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
+ | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
+ | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
+ uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
+ | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
+ | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
+ | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);
+
+ rm[0] = m0;
+ rd[0] = d0;
+}
+
+void HELPER(neon_unzip16)(void *vd, void *vm)
+{
+ uint64_t *rd = vd, *rm = vm;
+ uint64_t zd = rd[0], zm = rm[0];
+
+ uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
+ | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
+ uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
+ | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);
+
+ rm[0] = m0;
+ rd[0] = d0;
+}
+
+void HELPER(neon_qzip8)(void *vd, void *vm)
+{
+ uint64_t *rd = vd, *rm = vm;
+ uint64_t zd0 = rd[0], zd1 = rd[1];
+ uint64_t zm0 = rm[0], zm1 = rm[1];
+
+ uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
+ | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
+ | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
+ | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
+ uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
+ | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
+ | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
+ | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
+ uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
+ | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
+ | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
+ | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
+ uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
+ | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
+ | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
+ | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
+
+ rm[0] = m0;
+ rm[1] = m1;
+ rd[0] = d0;
+ rd[1] = d1;
+}
+
+void HELPER(neon_qzip16)(void *vd, void *vm)
+{
+ uint64_t *rd = vd, *rm = vm;
+ uint64_t zd0 = rd[0], zd1 = rd[1];
+ uint64_t zm0 = rm[0], zm1 = rm[1];
+
+ uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
+ | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
+ uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
+ | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
+ uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
+ | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
+ uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
+ | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
+
+ rm[0] = m0;
+ rm[1] = m1;
+ rd[0] = d0;
+ rd[1] = d1;
+}
+
+void HELPER(neon_qzip32)(void *vd, void *vm)
+{
+ uint64_t *rd = vd, *rm = vm;
+ uint64_t zd0 = rd[0], zd1 = rd[1];
+ uint64_t zm0 = rm[0], zm1 = rm[1];
+
+ uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
+ uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
+ uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
+ uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);
+
+ rm[0] = m0;
+ rm[1] = m1;
+ rd[0] = d0;
+ rd[1] = d1;
+}
+
+void HELPER(neon_zip8)(void *vd, void *vm)
+{
+ uint64_t *rd = vd, *rm = vm;
+ uint64_t zd = rd[0], zm = rm[0];
+
+ uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8)
+ | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24)
+ | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40)
+ | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56);
+ uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8)
+ | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24)
+ | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40)
+ | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56);
+
+ rm[0] = m0;
+ rd[0] = d0;
+}
+
+void HELPER(neon_zip16)(void *vd, void *vm)
+{
+ uint64_t *rd = vd, *rm = vm;
+ uint64_t zd = rd[0], zm = rm[0];
+
+ uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
+ | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);
+ uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16)
+ | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48);
+
+ rm[0] = m0;
+ rd[0] = d0;
+}
--- /dev/null
+/*
+ * ARM helper routines
+ *
+ * Copyright (c) 2005-2007 CodeSourcery, LLC
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+#include "qemu/osdep.h"
+#include "qemu/main-loop.h"
+#include "cpu.h"
+#include "exec/helper-proto.h"
+#include "internals.h"
+#include "exec/exec-all.h"
+#include "exec/cpu_ldst.h"
+#include "cpregs.h"
+
+#define SIGNBIT (uint32_t)0x80000000
+#define SIGNBIT64 ((uint64_t)1 << 63)
+
+int exception_target_el(CPUARMState *env)
+{
+ int target_el = MAX(1, arm_current_el(env));
+
+ /*
+ * No such thing as secure EL1 if EL3 is aarch32,
+ * so update the target EL to EL3 in this case.
+ */
+ if (arm_is_secure(env) && !arm_el_is_aa64(env, 3) && target_el == 1) {
+ target_el = 3;
+ }
+
+ return target_el;
+}
+
+void raise_exception(CPUARMState *env, uint32_t excp,
+ uint32_t syndrome, uint32_t target_el)
+{
+ CPUState *cs = env_cpu(env);
+
+ if (target_el == 1 && (arm_hcr_el2_eff(env) & HCR_TGE)) {
+ /*
+ * Redirect NS EL1 exceptions to NS EL2. These are reported with
+ * their original syndrome register value, with the exception of
+ * SIMD/FP access traps, which are reported as uncategorized
+ * (see DDI0478C.a D1.10.4)
+ */
+ target_el = 2;
+ if (syn_get_ec(syndrome) == EC_ADVSIMDFPACCESSTRAP) {
+ syndrome = syn_uncategorized();
+ }
+ }
+
+ assert(!excp_is_internal(excp));
+ cs->exception_index = excp;
+ env->exception.syndrome = syndrome;
+ env->exception.target_el = target_el;
+ cpu_loop_exit(cs);
+}
+
+void raise_exception_ra(CPUARMState *env, uint32_t excp, uint32_t syndrome,
+ uint32_t target_el, uintptr_t ra)
+{
+ CPUState *cs = env_cpu(env);
+
+ /*
+ * restore_state_to_opc() will set env->exception.syndrome, so
+ * we must restore CPU state here before setting the syndrome
+ * the caller passed us, and cannot use cpu_loop_exit_restore().
+ */
+ cpu_restore_state(cs, ra);
+ raise_exception(env, excp, syndrome, target_el);
+}
+
+uint64_t HELPER(neon_tbl)(CPUARMState *env, uint32_t desc,
+ uint64_t ireg, uint64_t def)
+{
+ uint64_t tmp, val = 0;
+ uint32_t maxindex = ((desc & 3) + 1) * 8;
+ uint32_t base_reg = desc >> 2;
+ uint32_t shift, index, reg;
+
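+ /*
+ * desc describes the table: bits [1:0] hold the number of 64-bit
+ * table registers minus 1 and the remaining bits hold the first
+ * D register, so e.g. desc == 0x9 is a two-register table in d2/d3.
+ */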
+ for (shift = 0; shift < 64; shift += 8) {
+ index = (ireg >> shift) & 0xff;
+ if (index < maxindex) {
+ reg = base_reg + (index >> 3);
+ tmp = *aa32_vfp_dreg(env, reg);
+ tmp = ((tmp >> ((index & 7) << 3)) & 0xff) << shift;
+ } else {
+ tmp = def & (0xffull << shift);
+ }
+ val |= tmp;
+ }
+ return val;
+}
+
+void HELPER(v8m_stackcheck)(CPUARMState *env, uint32_t newvalue)
+{
+ /*
+ * Perform the v8M stack limit check for SP updates from translated code,
+ * raising an exception if the limit is breached.
+ */
+ if (newvalue < v7m_sp_limit(env)) {
+ /*
+ * Stack limit exceptions are a rare case, so rather than syncing
+ * PC/condbits before the call, we use raise_exception_ra() so
+ * that cpu_restore_state() will sort them out.
+ */
+ raise_exception_ra(env, EXCP_STKOF, 0, 1, GETPC());
+ }
+}
+
+uint32_t HELPER(add_setq)(CPUARMState *env, uint32_t a, uint32_t b)
+{
+ uint32_t res = a + b;
+ if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) {
+ env->QF = 1;
+ }
+ return res;
+}
+
+uint32_t HELPER(add_saturate)(CPUARMState *env, uint32_t a, uint32_t b)
+{
+ uint32_t res = a + b;
+ if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) {
+ env->QF = 1;
+ res = ~(((int32_t)a >> 31) ^ SIGNBIT);
+ }
+ return res;
+}
+
+uint32_t HELPER(sub_saturate)(CPUARMState *env, uint32_t a, uint32_t b)
+{
+ uint32_t res = a - b;
+ if (((res ^ a) & SIGNBIT) && ((a ^ b) & SIGNBIT)) {
+ env->QF = 1;
+ res = ~(((int32_t)a >> 31) ^ SIGNBIT);
+ }
+ return res;
+}
+
+uint32_t HELPER(add_usaturate)(CPUARMState *env, uint32_t a, uint32_t b)
+{
+ uint32_t res = a + b;
+ if (res < a) {
+ env->QF = 1;
+ res = ~0;
+ }
+ return res;
+}
+
+uint32_t HELPER(sub_usaturate)(CPUARMState *env, uint32_t a, uint32_t b)
+{
+ uint32_t res = a - b;
+ if (res > a) {
+ env->QF = 1;
+ res = 0;
+ }
+ return res;
+}
+
+/* Signed saturation. */
+static inline uint32_t do_ssat(CPUARMState *env, int32_t val, int shift)
+{
+ int32_t top;
+ uint32_t mask;
+
+ top = val >> shift;
+ mask = (1u << shift) - 1;
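+ /*
+ * val fits in (shift + 1) signed bits iff top is 0 or -1; for example
+ * do_ssat(env, 200, 7) sees top == 1, sets QF and returns 127, the
+ * maximum of the signed 8-bit range.
+ */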
+ if (top > 0) {
+ env->QF = 1;
+ return mask;
+ } else if (top < -1) {
+ env->QF = 1;
+ return ~mask;
+ }
+ return val;
+}
+
+/* Unsigned saturation. */
+static inline uint32_t do_usat(CPUARMState *env, int32_t val, int shift)
+{
+ uint32_t max;
+
+ max = (1u << shift) - 1;
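+ /* E.g. do_usat(env, 300, 8) sets QF and returns the 8-bit maximum 255. */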
+ if (val < 0) {
+ env->QF = 1;
+ return 0;
+ } else if (val > max) {
+ env->QF = 1;
+ return max;
+ }
+ return val;
+}
+
+/* Signed saturate. */
+uint32_t HELPER(ssat)(CPUARMState *env, uint32_t x, uint32_t shift)
+{
+ return do_ssat(env, x, shift);
+}
+
+/* Dual halfword signed saturate. */
+uint32_t HELPER(ssat16)(CPUARMState *env, uint32_t x, uint32_t shift)
+{
+ uint32_t res;
+
+ res = (uint16_t)do_ssat(env, (int16_t)x, shift);
+ res |= do_ssat(env, ((int32_t)x) >> 16, shift) << 16;
+ return res;
+}
+
+/* Unsigned saturate. */
+uint32_t HELPER(usat)(CPUARMState *env, uint32_t x, uint32_t shift)
+{
+ return do_usat(env, x, shift);
+}
+
+/* Dual halfword unsigned saturate. */
+uint32_t HELPER(usat16)(CPUARMState *env, uint32_t x, uint32_t shift)
+{
+ uint32_t res;
+
+ res = (uint16_t)do_usat(env, (int16_t)x, shift);
+ res |= do_usat(env, ((int32_t)x) >> 16, shift) << 16;
+ return res;
+}
+
+void HELPER(setend)(CPUARMState *env)
+{
+ env->uncached_cpsr ^= CPSR_E;
+ arm_rebuild_hflags(env);
+}
+
+void HELPER(check_bxj_trap)(CPUARMState *env, uint32_t rm)
+{
+ /*
+ * Only called if in NS EL0 or EL1 for a BXJ for a v7A CPU;
+ * check if HSTR.TJDBX means we need to trap to EL2.
+ */
+ if (env->cp15.hstr_el2 & HSTR_TJDBX) {
+ /*
+ * We know the condition code check passed, so take the IMPDEF
+ * choice to always report CV=1 COND 0xe
+ */
+ uint32_t syn = syn_bxjtrap(1, 0xe, rm);
+ raise_exception_ra(env, EXCP_HYP_TRAP, syn, 2, GETPC());
+ }
+}
+
+#ifndef CONFIG_USER_ONLY
+/*
+ * Check whether WFx (WFI/WFE) instructions are set up to be trapped.
+ * Returns the target EL (1-3) if the instruction is to be trapped;
+ * otherwise returns 0, indicating it is not trapped.
+ */
+static inline int check_wfx_trap(CPUARMState *env, bool is_wfe)
+{
+ int cur_el = arm_current_el(env);
+ uint64_t mask;
+
+ if (arm_feature(env, ARM_FEATURE_M)) {
+ /* M profile cores can never trap WFI/WFE. */
+ return 0;
+ }
+
+ /*
+ * If we are currently in EL0 then we need to check whether SCTLR is
+ * set up to trap WFx instructions to EL1. These trap bits don't exist in v7.
+ */
+ if (cur_el < 1 && arm_feature(env, ARM_FEATURE_V8)) {
+ int target_el;
+
+ mask = is_wfe ? SCTLR_nTWE : SCTLR_nTWI;
+ if (arm_is_secure_below_el3(env) && !arm_el_is_aa64(env, 3)) {
+ /* Secure EL0 with an AArch32 EL3: Secure PL1 is at EL3, so trap there */
+ target_el = 3;
+ } else {
+ target_el = 1;
+ }
+
+ if (!(env->cp15.sctlr_el[target_el] & mask)) {
+ return target_el;
+ }
+ }
+
+ /* We are not trapping to EL1; trap to EL2 if HCR_EL2 requires it.
+ * No need for an ARM_FEATURE check: if HCR_EL2 doesn't exist the
+ * bits will be zero, indicating no trap.
+ */
+ if (cur_el < 2) {
+ mask = is_wfe ? HCR_TWE : HCR_TWI;
+ if (arm_hcr_el2_eff(env) & mask) {
+ return 2;
+ }
+ }
+
+ /* We are not trapping to EL1 or EL2; trap to EL3 if SCR_EL3 requires it */
+ if (cur_el < 3) {
+ mask = (is_wfe) ? SCR_TWE : SCR_TWI;
+ if (env->cp15.scr_el3 & mask) {
+ return 3;
+ }
+ }
+
+ return 0;
+}
+#endif
+
+void HELPER(wfi)(CPUARMState *env, uint32_t insn_len)
+{
+#ifdef CONFIG_USER_ONLY
+ /*
+ * WFI in the user-mode emulator is technically permitted but not
+ * something any real-world code would do. AArch64 Linux kernels
+ * trap it via SCTLR_EL1.nTWI and make it an (expensive) NOP;
+ * AArch32 kernels don't trap it, so it will delay a bit.
+ * For QEMU, make it a NOP here, because trying to raise EXCP_HLT
+ * would trigger an abort.
+ */
+ return;
+#else
+ CPUState *cs = env_cpu(env);
+ int target_el = check_wfx_trap(env, false);
+
+ if (cpu_has_work(cs)) {
+ /* Don't bother to go into our "low power state" if
+ * we would just wake up immediately.
+ */
+ return;
+ }
+
+ if (target_el) {
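+ /*
+ * The translated code has already advanced the PC past the WFI;
+ * wind it back so the trap is taken with the return address
+ * pointing at the WFI instruction itself.
+ */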
+ if (env->aarch64) {
+ env->pc -= insn_len;
+ } else {
+ env->regs[15] -= insn_len;
+ }
+
+ raise_exception(env, EXCP_UDEF, syn_wfx(1, 0xe, 0, insn_len == 2),
+ target_el);
+ }
+
+ cs->exception_index = EXCP_HLT;
+ cs->halted = 1;
+ cpu_loop_exit(cs);
+#endif
+}
+
+void HELPER(wfe)(CPUARMState *env)
+{
+ /* This is a hint instruction that is semantically different
+ * from YIELD even though we currently implement it identically.
+ * Don't actually halt the CPU, just yield back to top
+ * level loop. This is not going into a "low power state"
+ * (ie halting until some event occurs), so we never take
+ * a configurable trap to a different exception level.
+ */
+ HELPER(yield)(env);
+}
+
+void HELPER(yield)(CPUARMState *env)
+{
+ CPUState *cs = env_cpu(env);
+
+ /* This is a non-trappable hint instruction that generally indicates
+ * that the guest is currently busy-looping. Yield control back to the
+ * top level loop so that a more deserving VCPU has a chance to run.
+ */
+ cs->exception_index = EXCP_YIELD;
+ cpu_loop_exit(cs);
+}
+
+/* Raise an internal-to-QEMU exception. This is limited to only
+ * those EXCP values which are special cases for QEMU to interrupt
+ * execution and not to be used for exceptions which are passed to
+ * the guest (those must all have syndrome information and thus should
+ * use exception_with_syndrome*).
+ */
+void HELPER(exception_internal)(CPUARMState *env, uint32_t excp)
+{
+ CPUState *cs = env_cpu(env);
+
+ assert(excp_is_internal(excp));
+ cs->exception_index = excp;
+ cpu_loop_exit(cs);
+}
+
+/* Raise an exception with the specified syndrome register value */
+void HELPER(exception_with_syndrome_el)(CPUARMState *env, uint32_t excp,
+ uint32_t syndrome, uint32_t target_el)
+{
+ raise_exception(env, excp, syndrome, target_el);
+}
+
+/*
+ * Raise an exception with the specified syndrome register value
+ * to the default target el.
+ */
+void HELPER(exception_with_syndrome)(CPUARMState *env, uint32_t excp,
+ uint32_t syndrome)
+{
+ raise_exception(env, excp, syndrome, exception_target_el(env));
+}
+
+uint32_t HELPER(cpsr_read)(CPUARMState *env)
+{
+ return cpsr_read(env) & ~CPSR_EXEC;
+}
+
+void HELPER(cpsr_write)(CPUARMState *env, uint32_t val, uint32_t mask)
+{
+ cpsr_write(env, val, mask, CPSRWriteByInstr);
+ /* TODO: Not all cpsr bits are relevant to hflags. */
+ arm_rebuild_hflags(env);
+}
+
+/* Write the CPSR for a 32-bit exception return */
+void HELPER(cpsr_write_eret)(CPUARMState *env, uint32_t val)
+{
+ uint32_t mask;
+
+ qemu_mutex_lock_iothread();
+ arm_call_pre_el_change_hook(env_archcpu(env));
+ qemu_mutex_unlock_iothread();
+
+ mask = aarch32_cpsr_valid_mask(env->features, &env_archcpu(env)->isar);
+ cpsr_write(env, val, mask, CPSRWriteExceptionReturn);
+
+ /* Generated code has already stored the new PC value, but
+ * without masking out its low bits, because which bits need
+ * masking depends on whether we're returning to Thumb or ARM
+ * state. Do the masking now.
+ */
+ env->regs[15] &= (env->thumb ? ~1 : ~3);
+ arm_rebuild_hflags(env);
+
+ qemu_mutex_lock_iothread();
+ arm_call_el_change_hook(env_archcpu(env));
+ qemu_mutex_unlock_iothread();
+}
+
+/* Access to user mode registers from privileged modes. */
+uint32_t HELPER(get_user_reg)(CPUARMState *env, uint32_t regno)
+{
+ uint32_t val;
+
+ if (regno == 13) {
+ val = env->banked_r13[BANK_USRSYS];
+ } else if (regno == 14) {
+ val = env->banked_r14[BANK_USRSYS];
+ } else if (regno >= 8
+ && (env->uncached_cpsr & 0x1f) == ARM_CPU_MODE_FIQ) {
+ val = env->usr_regs[regno - 8];
+ } else {
+ val = env->regs[regno];
+ }
+ return val;
+}
+
+void HELPER(set_user_reg)(CPUARMState *env, uint32_t regno, uint32_t val)
+{
+ if (regno == 13) {
+ env->banked_r13[BANK_USRSYS] = val;
+ } else if (regno == 14) {
+ env->banked_r14[BANK_USRSYS] = val;
+ } else if (regno >= 8
+ && (env->uncached_cpsr & 0x1f) == ARM_CPU_MODE_FIQ) {
+ env->usr_regs[regno - 8] = val;
+ } else {
+ env->regs[regno] = val;
+ }
+}
+
+void HELPER(set_r13_banked)(CPUARMState *env, uint32_t mode, uint32_t val)
+{
+ if ((env->uncached_cpsr & CPSR_M) == mode) {
+ env->regs[13] = val;
+ } else {
+ env->banked_r13[bank_number(mode)] = val;
+ }
+}
+
+uint32_t HELPER(get_r13_banked)(CPUARMState *env, uint32_t mode)
+{
+ if ((env->uncached_cpsr & CPSR_M) == ARM_CPU_MODE_SYS) {
+ /* SRS instruction is UNPREDICTABLE from System mode; we UNDEF.
+ * Other UNPREDICTABLE and UNDEF cases were caught at translate time.
+ */
+ raise_exception(env, EXCP_UDEF, syn_uncategorized(),
+ exception_target_el(env));
+ }
+
+ if ((env->uncached_cpsr & CPSR_M) == mode) {
+ return env->regs[13];
+ } else {
+ return env->banked_r13[bank_number(mode)];
+ }
+}
+
+static void msr_mrs_banked_exc_checks(CPUARMState *env, uint32_t tgtmode,
+ uint32_t regno)
+{
+ /* Raise an exception if the requested access is one of the UNPREDICTABLE
+ * cases; otherwise return. This broadly corresponds to the pseudocode
+ * BankedRegisterAccessValid() and SPSRAccessValid(),
+ * except that we have already handled some cases at translate time.
+ */
+ int curmode = env->uncached_cpsr & CPSR_M;
+
+ if (regno == 17) {
+ /* ELR_Hyp: a special case because access from tgtmode is OK */
+ if (curmode != ARM_CPU_MODE_HYP && curmode != ARM_CPU_MODE_MON) {
+ goto undef;
+ }
+ return;
+ }
+
+ if (curmode == tgtmode) {
+ goto undef;
+ }
+
+ if (tgtmode == ARM_CPU_MODE_USR) {
+ switch (regno) {
+ case 8 ... 12:
+ if (curmode != ARM_CPU_MODE_FIQ) {
+ goto undef;
+ }
+ break;
+ case 13:
+ if (curmode == ARM_CPU_MODE_SYS) {
+ goto undef;
+ }
+ break;
+ case 14:
+ if (curmode == ARM_CPU_MODE_HYP || curmode == ARM_CPU_MODE_SYS) {
+ goto undef;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (tgtmode == ARM_CPU_MODE_HYP) {
+ /* SPSR_Hyp, r13_hyp: accessible from Monitor mode only */
+ if (curmode != ARM_CPU_MODE_MON) {
+ goto undef;
+ }
+ }
+
+ return;
+
+undef:
+ raise_exception(env, EXCP_UDEF, syn_uncategorized(),
+ exception_target_el(env));
+}
+
+void HELPER(msr_banked)(CPUARMState *env, uint32_t value, uint32_t tgtmode,
+ uint32_t regno)
+{
+ msr_mrs_banked_exc_checks(env, tgtmode, regno);
+
+ switch (regno) {
+ case 16: /* SPSRs */
+ env->banked_spsr[bank_number(tgtmode)] = value;
+ break;
+ case 17: /* ELR_Hyp */
+ env->elr_el[2] = value;
+ break;
+ case 13:
+ env->banked_r13[bank_number(tgtmode)] = value;
+ break;
+ case 14:
+ env->banked_r14[r14_bank_number(tgtmode)] = value;
+ break;
+ case 8 ... 12:
+ switch (tgtmode) {
+ case ARM_CPU_MODE_USR:
+ env->usr_regs[regno - 8] = value;
+ break;
+ case ARM_CPU_MODE_FIQ:
+ env->fiq_regs[regno - 8] = value;
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ break;
+ default:
+ g_assert_not_reached();
+ }
+}
+
+uint32_t HELPER(mrs_banked)(CPUARMState *env, uint32_t tgtmode, uint32_t regno)
+{
+ msr_mrs_banked_exc_checks(env, tgtmode, regno);
+
+ switch (regno) {
+ case 16: /* SPSRs */
+ return env->banked_spsr[bank_number(tgtmode)];
+ case 17: /* ELR_Hyp */
+ return env->elr_el[2];
+ case 13:
+ return env->banked_r13[bank_number(tgtmode)];
+ case 14:
+ return env->banked_r14[r14_bank_number(tgtmode)];
+ case 8 ... 12:
+ switch (tgtmode) {
+ case ARM_CPU_MODE_USR:
+ return env->usr_regs[regno - 8];
+ case ARM_CPU_MODE_FIQ:
+ return env->fiq_regs[regno - 8];
+ default:
+ g_assert_not_reached();
+ }
+ default:
+ g_assert_not_reached();
+ }
+}
+
+const void *HELPER(access_check_cp_reg)(CPUARMState *env, uint32_t key,
+ uint32_t syndrome, uint32_t isread)
+{
+ ARMCPU *cpu = env_archcpu(env);
+ const ARMCPRegInfo *ri = get_arm_cp_reginfo(cpu->cp_regs, key);
+ CPAccessResult res = CP_ACCESS_OK;
+ int target_el;
+
+ assert(ri != NULL);
+
+ if (arm_feature(env, ARM_FEATURE_XSCALE) && ri->cp < 14
+ && extract32(env->cp15.c15_cpar, ri->cp, 1) == 0) {
+ res = CP_ACCESS_TRAP;
+ goto fail;
+ }
+
+ if (ri->accessfn) {
+ res = ri->accessfn(env, ri, isread);
+ }
+
+ /*
+ * If the access function indicates a trap from EL0 to EL1 then
+ * that always takes priority over the HSTR_EL2 trap. (If it indicates
+ * a trap to EL3, then the HSTR_EL2 trap takes priority; if it indicates
+ * a trap to EL2, then the syndrome is the same either way so we don't
+ * care whether technically the architecture says that HSTR_EL2 trap or
+ * the other trap takes priority. So we take the "check HSTR_EL2" path
+ * for all of those cases.)
+ */
+ if (res != CP_ACCESS_OK && ((res & CP_ACCESS_EL_MASK) == 0) &&
+ arm_current_el(env) == 0) {
+ goto fail;
+ }
+
+ /*
+ * HSTR_EL2 traps from EL1 are checked earlier, in generated code;
+ * we only need to check here for traps from EL0.
+ */
+ if (!is_a64(env) && arm_current_el(env) == 0 && ri->cp == 15 &&
+ arm_is_el2_enabled(env) &&
+ (arm_hcr_el2_eff(env) & (HCR_E2H | HCR_TGE)) != (HCR_E2H | HCR_TGE)) {
+ uint32_t mask = 1 << ri->crn;
+
+ if (ri->type & ARM_CP_64BIT) {
+ mask = 1 << ri->crm;
+ }
+
+ /* T4 and T14 are RES0 */
+ mask &= ~((1 << 4) | (1 << 14));
+
+ if (env->cp15.hstr_el2 & mask) {
+ res = CP_ACCESS_TRAP_EL2;
+ goto fail;
+ }
+ }
+
+ /*
+ * Fine-grained traps also are lower priority than undef-to-EL1,
+ * higher priority than trap-to-EL3, and we don't care about priority
+ * order with other EL2 traps because the syndrome value is the same.
+ */
+ if (arm_fgt_active(env, arm_current_el(env))) {
+ uint64_t trapword = 0;
+ unsigned int idx = FIELD_EX32(ri->fgt, FGT, IDX);
+ unsigned int bitpos = FIELD_EX32(ri->fgt, FGT, BITPOS);
+ bool rev = FIELD_EX32(ri->fgt, FGT, REV);
+ bool trapbit;
+
+ if (ri->fgt & FGT_EXEC) {
+ assert(idx < ARRAY_SIZE(env->cp15.fgt_exec));
+ trapword = env->cp15.fgt_exec[idx];
+ } else if (isread && (ri->fgt & FGT_R)) {
+ assert(idx < ARRAY_SIZE(env->cp15.fgt_read));
+ trapword = env->cp15.fgt_read[idx];
+ } else if (!isread && (ri->fgt & FGT_W)) {
+ assert(idx < ARRAY_SIZE(env->cp15.fgt_write));
+ trapword = env->cp15.fgt_write[idx];
+ }
+
+ trapbit = extract64(trapword, bitpos, 1);
+ if (trapbit != rev) {
+ res = CP_ACCESS_TRAP_EL2;
+ goto fail;
+ }
+ }
+
+ if (likely(res == CP_ACCESS_OK)) {
+ return ri;
+ }
+
+ fail:
+ switch (res & ~CP_ACCESS_EL_MASK) {
+ case CP_ACCESS_TRAP:
+ break;
+ case CP_ACCESS_TRAP_UNCATEGORIZED:
+ /* Only CP_ACCESS_TRAP traps are direct to a specified EL */
+ assert((res & CP_ACCESS_EL_MASK) == 0);
+ if (cpu_isar_feature(aa64_ids, cpu) && isread &&
+ arm_cpreg_in_idspace(ri)) {
+ /*
+ * FEAT_IDST says this should be reported as EC_SYSTEMREGISTERTRAP,
+ * not EC_UNCATEGORIZED
+ */
+ break;
+ }
+ syndrome = syn_uncategorized();
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ target_el = res & CP_ACCESS_EL_MASK;
+ switch (target_el) {
+ case 0:
+ target_el = exception_target_el(env);
+ break;
+ case 2:
+ assert(arm_current_el(env) != 3);
+ assert(arm_is_el2_enabled(env));
+ break;
+ case 3:
+ assert(arm_feature(env, ARM_FEATURE_EL3));
+ break;
+ default:
+ /* No "direct" traps to EL1 */
+ g_assert_not_reached();
+ }
+
+ raise_exception(env, EXCP_UDEF, syndrome, target_el);
+}
+
+const void *HELPER(lookup_cp_reg)(CPUARMState *env, uint32_t key)
+{
+ ARMCPU *cpu = env_archcpu(env);
+ const ARMCPRegInfo *ri = get_arm_cp_reginfo(cpu->cp_regs, key);
+
+ assert(ri != NULL);
+ return ri;
+}
+
+void HELPER(set_cp_reg)(CPUARMState *env, const void *rip, uint32_t value)
+{
+ const ARMCPRegInfo *ri = rip;
+
+ if (ri->type & ARM_CP_IO) {
+ qemu_mutex_lock_iothread();
+ ri->writefn(env, ri, value);
+ qemu_mutex_unlock_iothread();
+ } else {
+ ri->writefn(env, ri, value);
+ }
+}
+
+uint32_t HELPER(get_cp_reg)(CPUARMState *env, const void *rip)
+{
+ const ARMCPRegInfo *ri = rip;
+ uint32_t res;
+
+ if (ri->type & ARM_CP_IO) {
+ qemu_mutex_lock_iothread();
+ res = ri->readfn(env, ri);
+ qemu_mutex_unlock_iothread();
+ } else {
+ res = ri->readfn(env, ri);
+ }
+
+ return res;
+}
+
+void HELPER(set_cp_reg64)(CPUARMState *env, const void *rip, uint64_t value)
+{
+ const ARMCPRegInfo *ri = rip;
+
+ if (ri->type & ARM_CP_IO) {
+ qemu_mutex_lock_iothread();
+ ri->writefn(env, ri, value);
+ qemu_mutex_unlock_iothread();
+ } else {
+ ri->writefn(env, ri, value);
+ }
+}
+
+uint64_t HELPER(get_cp_reg64)(CPUARMState *env, const void *rip)
+{
+ const ARMCPRegInfo *ri = rip;
+ uint64_t res;
+
+ if (ri->type & ARM_CP_IO) {
+ qemu_mutex_lock_iothread();
+ res = ri->readfn(env, ri);
+ qemu_mutex_unlock_iothread();
+ } else {
+ res = ri->readfn(env, ri);
+ }
+
+ return res;
+}
+
+void HELPER(pre_hvc)(CPUARMState *env)
+{
+ ARMCPU *cpu = env_archcpu(env);
+ int cur_el = arm_current_el(env);
+ /* FIXME: Use actual secure state. */
+ bool secure = false;
+ bool undef;
+
+ if (arm_is_psci_call(cpu, EXCP_HVC)) {
+ /* If PSCI is enabled and this looks like a valid PSCI call then
+ * that overrides the architecturally mandated HVC behaviour.
+ */
+ return;
+ }
+
+ if (!arm_feature(env, ARM_FEATURE_EL2)) {
+ /* If EL2 doesn't exist, HVC always UNDEFs */
+ undef = true;
+ } else if (arm_feature(env, ARM_FEATURE_EL3)) {
+ /* EL3.HCE has priority over EL2.HCD. */
+ undef = !(env->cp15.scr_el3 & SCR_HCE);
+ } else {
+ undef = env->cp15.hcr_el2 & HCR_HCD;
+ }
+
+ /* In ARMv7 and ARMv8/AArch32, HVC is undef in secure state.
+ * For ARMv8/AArch64, HVC is allowed in EL3.
+ * Note that we've already trapped HVC from EL0 at translation
+ * time.
+ */
+ if (secure && (!is_a64(env) || cur_el == 1)) {
+ undef = true;
+ }
+
+ if (undef) {
+ raise_exception(env, EXCP_UDEF, syn_uncategorized(),
+ exception_target_el(env));
+ }
+}
+
+void HELPER(pre_smc)(CPUARMState *env, uint32_t syndrome)
+{
+ ARMCPU *cpu = env_archcpu(env);
+ int cur_el = arm_current_el(env);
+ bool secure = arm_is_secure(env);
+ bool smd_flag = env->cp15.scr_el3 & SCR_SMD;
+
+ /*
+ * SMC behaviour is summarized in the following table.
+ * This helper handles the "Trap to EL2" and "Undef insn" cases.
+ * The "Trap to EL3" and "PSCI call" cases are handled in the exception
+ * helper.
+ *
+ * -> ARM_FEATURE_EL3 and !SMD
+ * HCR_TSC && NS EL1 !HCR_TSC || !NS EL1
+ *
+ * Conduit SMC, valid call Trap to EL2 PSCI Call
+ * Conduit SMC, inval call Trap to EL2 Trap to EL3
+ * Conduit not SMC Trap to EL2 Trap to EL3
+ *
+ *
+ * -> ARM_FEATURE_EL3 and SMD
+ * HCR_TSC && NS EL1 !HCR_TSC || !NS EL1
+ *
+ * Conduit SMC, valid call Trap to EL2 PSCI Call
+ * Conduit SMC, inval call Trap to EL2 Undef insn
+ * Conduit not SMC Trap to EL2 Undef insn
+ *
+ *
+ * -> !ARM_FEATURE_EL3
+ * HCR_TSC && NS EL1 !HCR_TSC || !NS EL1
+ *
+ * Conduit SMC, valid call Trap to EL2 PSCI Call
+ * Conduit SMC, inval call Trap to EL2 Undef insn
+ * Conduit not SMC Undef insn Undef insn
+ */
+
+ /* On ARMv8 with EL3 AArch64, SMD applies to both S and NS state.
+ * On ARMv8 with EL3 AArch32, or ARMv7 with the Virtualization
+ * extensions, SMD only applies to NS state.
+ * On ARMv7 without the Virtualization extensions, the SMD bit
+ * doesn't exist, but we forbid the guest to set it to 1 in scr_write(),
+ * so we need not special case this here.
+ */
+ bool smd = arm_feature(env, ARM_FEATURE_AARCH64) ? smd_flag
+ : smd_flag && !secure;
+
+ if (!arm_feature(env, ARM_FEATURE_EL3) &&
+ cpu->psci_conduit != QEMU_PSCI_CONDUIT_SMC) {
+ /* If we have no EL3 then SMC always UNDEFs and can't be
+ * trapped to EL2. PSCI-via-SMC is a sort of ersatz EL3
+ * firmware within QEMU, and we want an EL2 guest to be able
+ * to forbid its EL1 from making PSCI calls into QEMU's
+ * "firmware" via HCR.TSC, so for these purposes treat
+ * PSCI-via-SMC as implying an EL3.
+ * This handles the very last line of the previous table.
+ */
+ raise_exception(env, EXCP_UDEF, syn_uncategorized(),
+ exception_target_el(env));
+ }
+
+ if (cur_el == 1 && (arm_hcr_el2_eff(env) & HCR_TSC)) {
+ /* In NS EL1, HCR controlled routing to EL2 has priority over SMD.
+ * We also want an EL2 guest to be able to forbid its EL1 from
+ * making PSCI calls into QEMU's "firmware" via HCR.TSC.
+ * This handles all the "Trap to EL2" cases of the previous table.
+ */
+ raise_exception(env, EXCP_HYP_TRAP, syndrome, 2);
+ }
+
+ /* Catch the two remaining "Undef insn" cases of the previous table:
+ * - PSCI conduit is SMC but we don't have a valid PSCI call,
+ * - We don't have EL3 or SMD is set.
+ */
+ if (!arm_is_psci_call(cpu, EXCP_SMC) &&
+ (smd || !arm_feature(env, ARM_FEATURE_EL3))) {
+ raise_exception(env, EXCP_UDEF, syn_uncategorized(),
+ exception_target_el(env));
+ }
+}
+
+/*
+ * ??? Flag setting arithmetic is awkward because we need to do comparisons.
+ * The only way to do that in TCG is a conditional branch, which clobbers
+ * all our temporaries. For now implement these as helper functions.
+ */
+
+/* Similarly for variable shift instructions. */
+
+uint32_t HELPER(shl_cc)(CPUARMState *env, uint32_t x, uint32_t i)
+{
+ int shift = i & 0xff;
+ if (shift >= 32) {
+ if (shift == 32) {
+ env->CF = x & 1;
+ } else {
+ env->CF = 0;
+ }
+ return 0;
+ } else if (shift != 0) {
+ env->CF = (x >> (32 - shift)) & 1;
+ return x << shift;
+ }
+ return x;
+}
+
+uint32_t HELPER(shr_cc)(CPUARMState *env, uint32_t x, uint32_t i)
+{
+ int shift = i & 0xff;
+ if (shift >= 32) {
+ if (shift == 32) {
+ env->CF = (x >> 31) & 1;
+ } else {
+ env->CF = 0;
+ }
+ return 0;
+ } else if (shift != 0) {
+ env->CF = (x >> (shift - 1)) & 1;
+ return x >> shift;
+ }
+ return x;
+}
+
+uint32_t HELPER(sar_cc)(CPUARMState *env, uint32_t x, uint32_t i)
+{
+ int shift = i & 0xff;
+ if (shift >= 32) {
+ env->CF = (x >> 31) & 1;
+ return (int32_t)x >> 31;
+ } else if (shift != 0) {
+ env->CF = (x >> (shift - 1)) & 1;
+ return (int32_t)x >> shift;
+ }
+ return x;
+}
+
+uint32_t HELPER(ror_cc)(CPUARMState *env, uint32_t x, uint32_t i)
+{
+ int shift1, shift;
+ shift1 = i & 0xff;
+ shift = shift1 & 0x1f;
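+ /*
+ * shift1 == 0 is a plain move and leaves CF untouched; a nonzero
+ * amount that is a multiple of 32 rotates by a whole word, leaving
+ * the value unchanged but setting CF from bit 31.
+ */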
+ if (shift == 0) {
+ if (shift1 != 0) {
+ env->CF = (x >> 31) & 1;
+ }
+ return x;
+ } else {
+ env->CF = (x >> (shift - 1)) & 1;
+ return ((uint32_t)x >> shift) | (x << (32 - shift));
+ }
+}
+
+void HELPER(probe_access)(CPUARMState *env, target_ulong ptr,
+ uint32_t access_type, uint32_t mmu_idx,
+ uint32_t size)
+{
+ uint32_t in_page = -((uint32_t)ptr | TARGET_PAGE_SIZE);
+ uintptr_t ra = GETPC();
+
+ if (likely(size <= in_page)) {
+ probe_access(env, ptr, size, access_type, mmu_idx, ra);
+ } else {
+ probe_access(env, ptr, in_page, access_type, mmu_idx, ra);
+ probe_access(env, ptr + in_page, size - in_page,
+ access_type, mmu_idx, ra);
+ }
+}
+
+/*
+ * This function corresponds to AArch64.vESBOperation().
+ * Note that the AArch32 version is not functionally different.
+ */
+void HELPER(vesb)(CPUARMState *env)
+{
+ /*
+ * The EL2Enabled() check is done inside arm_hcr_el2_eff,
+ * and will return HCR_EL2.VSE == 0, so nothing happens.
+ */
+ uint64_t hcr = arm_hcr_el2_eff(env);
+ bool enabled = !(hcr & HCR_TGE) && (hcr & HCR_AMO);
+ bool pending = enabled && (hcr & HCR_VSE);
+ bool masked = (env->daif & PSTATE_A);
+
+ /* If VSE pending and masked, defer the exception. */
+ if (pending && masked) {
+ uint32_t syndrome;
+
+ if (arm_el_is_aa64(env, 1)) {
+ /* Copy across IDS and ISS from VSESR. */
+ syndrome = env->cp15.vsesr_el2 & 0x1ffffff;
+ } else {
+ ARMMMUFaultInfo fi = { .type = ARMFault_AsyncExternal };
+
+ if (extended_addresses_enabled(env)) {
+ syndrome = arm_fi_to_lfsc(&fi);
+ } else {
+ syndrome = arm_fi_to_sfsc(&fi);
+ }
+ /* Copy across AET and ExT from VSESR. */
+ syndrome |= env->cp15.vsesr_el2 & 0xd000;
+ }
+
+ /* Set VDISR_EL2.A along with the syndrome. */
+ env->cp15.vdisr_el2 = syndrome | (1u << 31);
+
+ /* Clear pending virtual SError */
+ env->cp15.hcr_el2 &= ~HCR_VSE;
+ cpu_reset_interrupt(env_cpu(env), CPU_INTERRUPT_VSERR);
+ }
+}
--- /dev/null
+/*
+ * ARM v8.3-PAuth Operations
+ *
+ * Copyright (c) 2019 Linaro, Ltd.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "cpu.h"
+#include "internals.h"
+#include "exec/exec-all.h"
+#include "exec/cpu_ldst.h"
+#include "exec/helper-proto.h"
+#include "tcg/tcg-gvec-desc.h"
+#include "qemu/xxhash.h"
+
+
+static uint64_t pac_cell_shuffle(uint64_t i)
+{
+ uint64_t o = 0;
+
+ o |= extract64(i, 52, 4);
+ o |= extract64(i, 24, 4) << 4;
+ o |= extract64(i, 44, 4) << 8;
+ o |= extract64(i, 0, 4) << 12;
+
+ o |= extract64(i, 28, 4) << 16;
+ o |= extract64(i, 48, 4) << 20;
+ o |= extract64(i, 4, 4) << 24;
+ o |= extract64(i, 40, 4) << 28;
+
+ o |= extract64(i, 32, 4) << 32;
+ o |= extract64(i, 12, 4) << 36;
+ o |= extract64(i, 56, 4) << 40;
+ o |= extract64(i, 20, 4) << 44;
+
+ o |= extract64(i, 8, 4) << 48;
+ o |= extract64(i, 36, 4) << 52;
+ o |= extract64(i, 16, 4) << 56;
+ o |= extract64(i, 60, 4) << 60;
+
+ return o;
+}
+
+static uint64_t pac_cell_inv_shuffle(uint64_t i)
+{
+ uint64_t o = 0;
+
+ o |= extract64(i, 12, 4);
+ o |= extract64(i, 24, 4) << 4;
+ o |= extract64(i, 48, 4) << 8;
+ o |= extract64(i, 36, 4) << 12;
+
+ o |= extract64(i, 56, 4) << 16;
+ o |= extract64(i, 44, 4) << 20;
+ o |= extract64(i, 4, 4) << 24;
+ o |= extract64(i, 16, 4) << 28;
+
+ o |= i & MAKE_64BIT_MASK(32, 4);
+ o |= extract64(i, 52, 4) << 36;
+ o |= extract64(i, 28, 4) << 40;
+ o |= extract64(i, 8, 4) << 44;
+
+ o |= extract64(i, 20, 4) << 48;
+ o |= extract64(i, 0, 4) << 52;
+ o |= extract64(i, 40, 4) << 56;
+ o |= i & MAKE_64BIT_MASK(60, 4);
+
+ return o;
+}
+
+static uint64_t pac_sub(uint64_t i)
+{
+ static const uint8_t sub[16] = {
+ 0xb, 0x6, 0x8, 0xf, 0xc, 0x0, 0x9, 0xe,
+ 0x3, 0x7, 0x4, 0x5, 0xd, 0x2, 0x1, 0xa,
+ };
+ uint64_t o = 0;
+ int b;
+
+ for (b = 0; b < 64; b += 4) {
+ o |= (uint64_t)sub[(i >> b) & 0xf] << b;
+ }
+ return o;
+}
+
+static uint64_t pac_inv_sub(uint64_t i)
+{
+ static const uint8_t inv_sub[16] = {
+ 0x5, 0xe, 0xd, 0x8, 0xa, 0xb, 0x1, 0x9,
+ 0x2, 0x6, 0xf, 0x0, 0x4, 0xc, 0x7, 0x3,
+ };
+ uint64_t o = 0;
+ int b;
+
+ for (b = 0; b < 64; b += 4) {
+ o |= (uint64_t)inv_sub[(i >> b) & 0xf] << b;
+ }
+ return o;
+}
+
+static int rot_cell(int cell, int n)
+{
+ /* 4-bit rotate left by n. */
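+ /* E.g. rot_cell(0x1, 1) == 0x2 and rot_cell(0x8, 1) == 0x1. */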
+ cell |= cell << 4;
+ return extract32(cell, 4 - n, 4);
+}
+
+static uint64_t pac_mult(uint64_t i)
+{
+ uint64_t o = 0;
+ int b;
+
+ for (b = 0; b < 4 * 4; b += 4) {
+ int i0, i4, i8, ic, t0, t1, t2, t3;
+
+ i0 = extract64(i, b, 4);
+ i4 = extract64(i, b + 4 * 4, 4);
+ i8 = extract64(i, b + 8 * 4, 4);
+ ic = extract64(i, b + 12 * 4, 4);
+
+ t0 = rot_cell(i8, 1) ^ rot_cell(i4, 2) ^ rot_cell(i0, 1);
+ t1 = rot_cell(ic, 1) ^ rot_cell(i4, 1) ^ rot_cell(i0, 2);
+ t2 = rot_cell(ic, 2) ^ rot_cell(i8, 1) ^ rot_cell(i0, 1);
+ t3 = rot_cell(ic, 1) ^ rot_cell(i8, 2) ^ rot_cell(i4, 1);
+
+ o |= (uint64_t)t3 << b;
+ o |= (uint64_t)t2 << (b + 4 * 4);
+ o |= (uint64_t)t1 << (b + 8 * 4);
+ o |= (uint64_t)t0 << (b + 12 * 4);
+ }
+ return o;
+}
+
+static uint64_t tweak_cell_rot(uint64_t cell)
+{
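+ /* out<2:0> = in<3:1>, out<3> = in<0> EOR in<1>. */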
+ return (cell >> 1) | (((cell ^ (cell >> 1)) & 1) << 3);
+}
+
+static uint64_t tweak_shuffle(uint64_t i)
+{
+ uint64_t o = 0;
+
+ o |= extract64(i, 16, 4) << 0;
+ o |= extract64(i, 20, 4) << 4;
+ o |= tweak_cell_rot(extract64(i, 24, 4)) << 8;
+ o |= extract64(i, 28, 4) << 12;
+
+ o |= tweak_cell_rot(extract64(i, 44, 4)) << 16;
+ o |= extract64(i, 8, 4) << 20;
+ o |= extract64(i, 12, 4) << 24;
+ o |= tweak_cell_rot(extract64(i, 32, 4)) << 28;
+
+ o |= extract64(i, 48, 4) << 32;
+ o |= extract64(i, 52, 4) << 36;
+ o |= extract64(i, 56, 4) << 40;
+ o |= tweak_cell_rot(extract64(i, 60, 4)) << 44;
+
+ o |= tweak_cell_rot(extract64(i, 0, 4)) << 48;
+ o |= extract64(i, 4, 4) << 52;
+ o |= tweak_cell_rot(extract64(i, 40, 4)) << 56;
+ o |= tweak_cell_rot(extract64(i, 36, 4)) << 60;
+
+ return o;
+}
+
+static uint64_t tweak_cell_inv_rot(uint64_t cell)
+{
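+ /* out<3:1> = in<2:0>, out<0> = in<0> EOR in<3>: inverse of tweak_cell_rot. */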
+ return ((cell << 1) & 0xf) | ((cell & 1) ^ (cell >> 3));
+}
+
+static uint64_t tweak_inv_shuffle(uint64_t i)
+{
+ uint64_t o = 0;
+
+ o |= tweak_cell_inv_rot(extract64(i, 48, 4));
+ o |= extract64(i, 52, 4) << 4;
+ o |= extract64(i, 20, 4) << 8;
+ o |= extract64(i, 24, 4) << 12;
+
+ o |= extract64(i, 0, 4) << 16;
+ o |= extract64(i, 4, 4) << 20;
+ o |= tweak_cell_inv_rot(extract64(i, 8, 4)) << 24;
+ o |= extract64(i, 12, 4) << 28;
+
+ o |= tweak_cell_inv_rot(extract64(i, 28, 4)) << 32;
+ o |= tweak_cell_inv_rot(extract64(i, 60, 4)) << 36;
+ o |= tweak_cell_inv_rot(extract64(i, 56, 4)) << 40;
+ o |= tweak_cell_inv_rot(extract64(i, 16, 4)) << 44;
+
+ o |= extract64(i, 32, 4) << 48;
+ o |= extract64(i, 36, 4) << 52;
+ o |= extract64(i, 40, 4) << 56;
+ o |= tweak_cell_inv_rot(extract64(i, 44, 4)) << 60;
+
+ return o;
+}
+
+static uint64_t pauth_computepac_architected(uint64_t data, uint64_t modifier,
+ ARMPACKey key)
+{
+ static const uint64_t RC[5] = {
+ 0x0000000000000000ull,
+ 0x13198A2E03707344ull,
+ 0xA4093822299F31D0ull,
+ 0x082EFA98EC4E6C89ull,
+ 0x452821E638D01377ull,
+ };
+ const uint64_t alpha = 0xC0AC29B7C97C50DDull;
+ /*
+ * Note that in the ARM pseudocode, key0 contains bits <127:64>
+ * and key1 contains bits <63:0> of the 128-bit key.
+ */
+ uint64_t key0 = key.hi, key1 = key.lo;
+ uint64_t workingval, runningmod, roundkey, modk0;
+ int i;
+
+ modk0 = (key0 << 63) | ((key0 >> 1) ^ (key0 >> 63));
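+ /* modk0 is key0<0> : key0<63:2> : (key0<1> EOR key0<63>). */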
+ runningmod = modifier;
+ workingval = data ^ key0;
+
+ for (i = 0; i <= 4; ++i) {
+ roundkey = key1 ^ runningmod;
+ workingval ^= roundkey;
+ workingval ^= RC[i];
+ if (i > 0) {
+ workingval = pac_cell_shuffle(workingval);
+ workingval = pac_mult(workingval);
+ }
+ workingval = pac_sub(workingval);
+ runningmod = tweak_shuffle(runningmod);
+ }
+ roundkey = modk0 ^ runningmod;
+ workingval ^= roundkey;
+ workingval = pac_cell_shuffle(workingval);
+ workingval = pac_mult(workingval);
+ workingval = pac_sub(workingval);
+ workingval = pac_cell_shuffle(workingval);
+ workingval = pac_mult(workingval);
+ workingval ^= key1;
+ workingval = pac_cell_inv_shuffle(workingval);
+ workingval = pac_inv_sub(workingval);
+ workingval = pac_mult(workingval);
+ workingval = pac_cell_inv_shuffle(workingval);
+ workingval ^= key0;
+ workingval ^= runningmod;
+ for (i = 0; i <= 4; ++i) {
+ workingval = pac_inv_sub(workingval);
+ if (i < 4) {
+ workingval = pac_mult(workingval);
+ workingval = pac_cell_inv_shuffle(workingval);
+ }
+ runningmod = tweak_inv_shuffle(runningmod);
+ roundkey = key1 ^ runningmod;
+ workingval ^= RC[4 - i];
+ workingval ^= roundkey;
+ workingval ^= alpha;
+ }
+ workingval ^= modk0;
+
+ return workingval;
+}
+
+static uint64_t pauth_computepac_impdef(uint64_t data, uint64_t modifier,
+ ARMPACKey key)
+{
+ return qemu_xxhash64_4(data, modifier, key.lo, key.hi);
+}
+
+static uint64_t pauth_computepac(CPUARMState *env, uint64_t data,
+ uint64_t modifier, ARMPACKey key)
+{
+ if (cpu_isar_feature(aa64_pauth_arch, env_archcpu(env))) {
+ return pauth_computepac_architected(data, modifier, key);
+ } else {
+ return pauth_computepac_impdef(data, modifier, key);
+ }
+}
+
+static uint64_t pauth_addpac(CPUARMState *env, uint64_t ptr, uint64_t modifier,
+ ARMPACKey *key, bool data)
+{
+ ARMMMUIdx mmu_idx = arm_stage1_mmu_idx(env);
+ ARMVAParameters param = aa64_va_parameters(env, ptr, mmu_idx, data);
+ uint64_t pac, ext_ptr, ext, test;
+ int bot_bit, top_bit;
+
+ /* If tagged pointers are in use, use ptr<55>, otherwise ptr<63>. */
+ if (param.tbi) {
+ ext = sextract64(ptr, 55, 1);
+ } else {
+ ext = sextract64(ptr, 63, 1);
+ }
+
+ /* Build a pointer with known good extension bits. */
+ top_bit = 64 - 8 * param.tbi;
+ bot_bit = 64 - param.tsz;
+ ext_ptr = deposit64(ptr, bot_bit, top_bit - bot_bit, ext);
+
+ pac = pauth_computepac(env, ext_ptr, modifier, *key);
+
+ /*
+ * Check if the ptr has good extension bits and corrupt the
+ * pointer authentication code if not.
+ */
+ test = sextract64(ptr, bot_bit, top_bit - bot_bit);
+ if (test != 0 && test != -1) {
+ /*
+ * Note that our top_bit is one greater than the pseudocode's
+ * version, hence "- 2" here.
+ */
+ pac ^= MAKE_64BIT_MASK(top_bit - 2, 1);
+ }
+
+ /*
+ * Preserve the determination between upper and lower at bit 55,
+ * and insert pointer authentication code.
+ */
+ if (param.tbi) {
+ ptr &= ~MAKE_64BIT_MASK(bot_bit, 55 - bot_bit + 1);
+ pac &= MAKE_64BIT_MASK(bot_bit, 54 - bot_bit + 1);
+ } else {
+ ptr &= MAKE_64BIT_MASK(0, bot_bit);
+ pac &= ~(MAKE_64BIT_MASK(55, 1) | MAKE_64BIT_MASK(0, bot_bit));
+ }
+ ext &= MAKE_64BIT_MASK(55, 1);
+ return pac | ext | ptr;
+}
+
+static uint64_t pauth_original_ptr(uint64_t ptr, ARMVAParameters param)
+{
+ /* Note that bit 55 is used whether or not the regime has 2 ranges. */
+ uint64_t extfield = sextract64(ptr, 55, 1);
+ int bot_pac_bit = 64 - param.tsz;
+ int top_pac_bit = 64 - 8 * param.tbi;
+
+ return deposit64(ptr, bot_pac_bit, top_pac_bit - bot_pac_bit, extfield);
+}
+
+static uint64_t pauth_auth(CPUARMState *env, uint64_t ptr, uint64_t modifier,
+ ARMPACKey *key, bool data, int keynumber)
+{
+ ARMMMUIdx mmu_idx = arm_stage1_mmu_idx(env);
+ ARMVAParameters param = aa64_va_parameters(env, ptr, mmu_idx, data);
+ int bot_bit, top_bit;
+ uint64_t pac, orig_ptr, test;
+
+ orig_ptr = pauth_original_ptr(ptr, param);
+ pac = pauth_computepac(env, orig_ptr, modifier, *key);
+ bot_bit = 64 - param.tsz;
+ top_bit = 64 - 8 * param.tbi;
+
+ test = (pac ^ ptr) & ~MAKE_64BIT_MASK(55, 1);
+ if (unlikely(extract64(test, bot_bit, top_bit - bot_bit))) {
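+ /*
+ * Authentication failed: the error code written into the result is
+ * 1 for the A keys (keynumber 0) and 2 for the B keys (keynumber 1).
+ */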
+ int error_code = (keynumber << 1) | (keynumber ^ 1);
+ if (param.tbi) {
+ return deposit64(orig_ptr, 53, 2, error_code);
+ } else {
+ return deposit64(orig_ptr, 61, 2, error_code);
+ }
+ }
+ return orig_ptr;
+}
+
+static uint64_t pauth_strip(CPUARMState *env, uint64_t ptr, bool data)
+{
+ ARMMMUIdx mmu_idx = arm_stage1_mmu_idx(env);
+ ARMVAParameters param = aa64_va_parameters(env, ptr, mmu_idx, data);
+
+ return pauth_original_ptr(ptr, param);
+}
+
+static G_NORETURN
+void pauth_trap(CPUARMState *env, int target_el, uintptr_t ra)
+{
+ raise_exception_ra(env, EXCP_UDEF, syn_pactrap(), target_el, ra);
+}
+
+static void pauth_check_trap(CPUARMState *env, int el, uintptr_t ra)
+{
+ if (el < 2 && arm_is_el2_enabled(env)) {
+ uint64_t hcr = arm_hcr_el2_eff(env);
+ bool trap = !(hcr & HCR_API);
+ if (el == 0) {
+ /* Trap only applies to EL1&0 regime. */
+ trap &= (hcr & (HCR_E2H | HCR_TGE)) != (HCR_E2H | HCR_TGE);
+ }
+ /* FIXME: ARMv8.3-NV: HCR_NV trap takes precedence for ERETA[AB]. */
+ if (trap) {
+ pauth_trap(env, 2, ra);
+ }
+ }
+ if (el < 3 && arm_feature(env, ARM_FEATURE_EL3)) {
+ if (!(env->cp15.scr_el3 & SCR_API)) {
+ pauth_trap(env, 3, ra);
+ }
+ }
+}
+
+static bool pauth_key_enabled(CPUARMState *env, int el, uint32_t bit)
+{
+ return (arm_sctlr(env, el) & bit) != 0;
+}
+
+uint64_t HELPER(pacia)(CPUARMState *env, uint64_t x, uint64_t y)
+{
+ int el = arm_current_el(env);
+ if (!pauth_key_enabled(env, el, SCTLR_EnIA)) {
+ return x;
+ }
+ pauth_check_trap(env, el, GETPC());
+ return pauth_addpac(env, x, y, &env->keys.apia, false);
+}
+
+uint64_t HELPER(pacib)(CPUARMState *env, uint64_t x, uint64_t y)
+{
+ int el = arm_current_el(env);
+ if (!pauth_key_enabled(env, el, SCTLR_EnIB)) {
+ return x;
+ }
+ pauth_check_trap(env, el, GETPC());
+ return pauth_addpac(env, x, y, &env->keys.apib, false);
+}
+
+uint64_t HELPER(pacda)(CPUARMState *env, uint64_t x, uint64_t y)
+{
+ int el = arm_current_el(env);
+ if (!pauth_key_enabled(env, el, SCTLR_EnDA)) {
+ return x;
+ }
+ pauth_check_trap(env, el, GETPC());
+ return pauth_addpac(env, x, y, &env->keys.apda, true);
+}
+
+uint64_t HELPER(pacdb)(CPUARMState *env, uint64_t x, uint64_t y)
+{
+ int el = arm_current_el(env);
+ if (!pauth_key_enabled(env, el, SCTLR_EnDB)) {
+ return x;
+ }
+ pauth_check_trap(env, el, GETPC());
+ return pauth_addpac(env, x, y, &env->keys.apdb, true);
+}
+
+uint64_t HELPER(pacga)(CPUARMState *env, uint64_t x, uint64_t y)
+{
+ uint64_t pac;
+
+ pauth_check_trap(env, arm_current_el(env), GETPC());
+ pac = pauth_computepac(env, x, y, env->keys.apga);
+
+ return pac & 0xffffffff00000000ull;
+}
+
+uint64_t HELPER(autia)(CPUARMState *env, uint64_t x, uint64_t y)
+{
+ int el = arm_current_el(env);
+ if (!pauth_key_enabled(env, el, SCTLR_EnIA)) {
+ return x;
+ }
+ pauth_check_trap(env, el, GETPC());
+ return pauth_auth(env, x, y, &env->keys.apia, false, 0);
+}
+
+uint64_t HELPER(autib)(CPUARMState *env, uint64_t x, uint64_t y)
+{
+ int el = arm_current_el(env);
+ if (!pauth_key_enabled(env, el, SCTLR_EnIB)) {
+ return x;
+ }
+ pauth_check_trap(env, el, GETPC());
+ return pauth_auth(env, x, y, &env->keys.apib, false, 1);
+}
+
+uint64_t HELPER(autda)(CPUARMState *env, uint64_t x, uint64_t y)
+{
+ int el = arm_current_el(env);
+ if (!pauth_key_enabled(env, el, SCTLR_EnDA)) {
+ return x;
+ }
+ pauth_check_trap(env, el, GETPC());
+ return pauth_auth(env, x, y, &env->keys.apda, true, 0);
+}
+
+uint64_t HELPER(autdb)(CPUARMState *env, uint64_t x, uint64_t y)
+{
+ int el = arm_current_el(env);
+ if (!pauth_key_enabled(env, el, SCTLR_EnDB)) {
+ return x;
+ }
+ pauth_check_trap(env, el, GETPC());
+ return pauth_auth(env, x, y, &env->keys.apdb, true, 1);
+}
+
+uint64_t HELPER(xpaci)(CPUARMState *env, uint64_t a)
+{
+ return pauth_strip(env, a, false);
+}
+
+uint64_t HELPER(xpacd)(CPUARMState *env, uint64_t a)
+{
+ return pauth_strip(env, a, true);
+}
--- /dev/null
+/*
+ * ARM SME Operations
+ *
+ * Copyright (c) 2022 Linaro, Ltd.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "cpu.h"
+#include "internals.h"
+#include "tcg/tcg-gvec-desc.h"
+#include "exec/helper-proto.h"
+#include "exec/cpu_ldst.h"
+#include "exec/exec-all.h"
+#include "qemu/int128.h"
+#include "fpu/softfloat.h"
+#include "vec_internal.h"
+#include "sve_ldst_internal.h"
+
+void helper_set_svcr(CPUARMState *env, uint32_t val, uint32_t mask)
+{
+ aarch64_set_svcr(env, val, mask);
+}
+
+void helper_sme_zero(CPUARMState *env, uint32_t imm, uint32_t svl)
+{
+ uint32_t i;
+
+ /*
+ * Special case clearing the entire ZA space.
+ * This falls into the CONSTRAINED UNPREDICTABLE zeroing of any
+ * parts of the ZA storage outside of SVL.
+ */
+ if (imm == 0xff) {
+ memset(env->zarray, 0, sizeof(env->zarray));
+ return;
+ }
+
+ /*
+ * Recall that ZAnH.D[m] is spread across ZA[n+8*m],
+ * so each row is discontiguous within ZA[].
+ */
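+ /* E.g. imm bit 0 selects tile ZA0.D, i.e. rows 0, 8, 16, ... of ZA. */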
+ for (i = 0; i < svl; i++) {
+ if (imm & (1 << (i % 8))) {
+ memset(&env->zarray[i], 0, svl);
+ }
+ }
+}
+
+
+/*
+ * When considering the ZA storage as an array of elements of
+ * type T, the index within that array of the Nth element of
+ * a vertical slice of a tile can be calculated like this,
+ * regardless of the size of type T. This is because the tiles
+ * are interleaved, so if type T is size N bytes then row 1 of
+ * the tile is N rows away from row 0. The division by N to
+ * convert a byte offset into an array index and the multiplication
+ * by N to convert from vslice-index-within-the-tile to
+ * the index within the ZA storage cancel out.
+ */
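+/*
+ * For example, with 8-byte elements each tile uses every 8th row of
+ * ZA, so element 1 of a vertical slice sits 8 rows, i.e.
+ * 8 * sizeof(ARMVectorReg) bytes, below element 0; as an index into
+ * an array of uint64_t that is 1 * sizeof(ARMVectorReg), which is
+ * exactly tile_vslice_index(1).
+ */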
+#define tile_vslice_index(i) ((i) * sizeof(ARMVectorReg))
+
+/*
+ * When doing byte arithmetic on the ZA storage, the element
+ * byteoff bytes away in a tile vertical slice is always this
+ * many bytes away in the ZA storage, regardless of the
+ * size of the tile element, assuming that byteoff is a multiple
+ * of the element size. Again this is because of the interleaving
+ * of the tiles. For instance if we have 1 byte per element then
+ * each row of the ZA storage has one byte of the vslice data,
+ * and (counting from 0) byte 8 goes in row 8 of the storage
+ * at offset (8 * row-size-in-bytes).
+ * If we have 8 bytes per element then each row of the ZA storage
+ * has 8 bytes of the data, but there are 8 interleaved tiles and
+ * so byte 8 of the data goes into row 1 of the tile,
+ * which is again row 8 of the storage, so the offset is still
+ * (8 * row-size-in-bytes). Similarly for other element sizes.
+ */
+#define tile_vslice_offset(byteoff) ((byteoff) * sizeof(ARMVectorReg))
+
+
+/*
+ * Move Zreg vector to ZArray column.
+ */
+#define DO_MOVA_C(NAME, TYPE, H) \
+void HELPER(NAME)(void *za, void *vn, void *vg, uint32_t desc) \
+{ \
+ int i, oprsz = simd_oprsz(desc); \
+ for (i = 0; i < oprsz; ) { \
+ uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
+ do { \
+ if (pg & 1) { \
+ *(TYPE *)(za + tile_vslice_offset(i)) = *(TYPE *)(vn + H(i)); \
+ } \
+ i += sizeof(TYPE); \
+ pg >>= sizeof(TYPE); \
+ } while (i & 15); \
+ } \
+}
+
+DO_MOVA_C(sme_mova_cz_b, uint8_t, H1)
+DO_MOVA_C(sme_mova_cz_h, uint16_t, H1_2)
+DO_MOVA_C(sme_mova_cz_s, uint32_t, H1_4)
+
+void HELPER(sme_mova_cz_d)(void *za, void *vn, void *vg, uint32_t desc)
+{
+ int i, oprsz = simd_oprsz(desc) / 8;
+ uint8_t *pg = vg;
+ uint64_t *n = vn;
+ uint64_t *a = za;
+
+ for (i = 0; i < oprsz; i++) {
+ if (pg[H1(i)] & 1) {
+ a[tile_vslice_index(i)] = n[i];
+ }
+ }
+}
+
+void HELPER(sme_mova_cz_q)(void *za, void *vn, void *vg, uint32_t desc)
+{
+ int i, oprsz = simd_oprsz(desc) / 16;
+ uint16_t *pg = vg;
+ Int128 *n = vn;
+ Int128 *a = za;
+
+ /*
+ * Int128 is used here simply to copy 16 bytes, and to simplify
+ * the address arithmetic.
+ */
+ for (i = 0; i < oprsz; i++) {
+ if (pg[H2(i)] & 1) {
+ a[tile_vslice_index(i)] = n[i];
+ }
+ }
+}
+
+#undef DO_MOVA_C
+
+/*
+ * Move ZArray column to Zreg vector.
+ */
+#define DO_MOVA_Z(NAME, TYPE, H) \
+void HELPER(NAME)(void *vd, void *za, void *vg, uint32_t desc) \
+{ \
+ int i, oprsz = simd_oprsz(desc); \
+ for (i = 0; i < oprsz; ) { \
+ uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
+ do { \
+ if (pg & 1) { \
+ *(TYPE *)(vd + H(i)) = *(TYPE *)(za + tile_vslice_offset(i)); \
+ } \
+ i += sizeof(TYPE); \
+ pg >>= sizeof(TYPE); \
+ } while (i & 15); \
+ } \
+}
+
+DO_MOVA_Z(sme_mova_zc_b, uint8_t, H1)
+DO_MOVA_Z(sme_mova_zc_h, uint16_t, H1_2)
+DO_MOVA_Z(sme_mova_zc_s, uint32_t, H1_4)
+
+void HELPER(sme_mova_zc_d)(void *vd, void *za, void *vg, uint32_t desc)
+{
+ int i, oprsz = simd_oprsz(desc) / 8;
+ uint8_t *pg = vg;
+ uint64_t *d = vd;
+ uint64_t *a = za;
+
+ for (i = 0; i < oprsz; i++) {
+ if (pg[H1(i)] & 1) {
+ d[i] = a[tile_vslice_index(i)];
+ }
+ }
+}
+
+void HELPER(sme_mova_zc_q)(void *vd, void *za, void *vg, uint32_t desc)
+{
+ int i, oprsz = simd_oprsz(desc) / 16;
+ uint16_t *pg = vg;
+ Int128 *d = vd;
+ Int128 *a = za;
+
+ /*
+ * Int128 is used here simply to copy 16 bytes, and to simplify
+ * the address arithmetic.
+ */
+ for (i = 0; i < oprsz; i++) {
+ if (pg[H2(i)] & 1) {
+ d[i] = a[tile_vslice_index(i)];
+ }
+ }
+}
+
+#undef DO_MOVA_Z
+
+/*
+ * Clear elements in a tile slice comprising len bytes.
+ */
+
+typedef void ClearFn(void *ptr, size_t off, size_t len);
+
+static void clear_horizontal(void *ptr, size_t off, size_t len)
+{
+ memset(ptr + off, 0, len);
+}
+
+static void clear_vertical_b(void *vptr, size_t off, size_t len)
+{
+ for (size_t i = 0; i < len; ++i) {
+ *(uint8_t *)(vptr + tile_vslice_offset(i + off)) = 0;
+ }
+}
+
+static void clear_vertical_h(void *vptr, size_t off, size_t len)
+{
+ for (size_t i = 0; i < len; i += 2) {
+ *(uint16_t *)(vptr + tile_vslice_offset(i + off)) = 0;
+ }
+}
+
+static void clear_vertical_s(void *vptr, size_t off, size_t len)
+{
+ for (size_t i = 0; i < len; i += 4) {
+ *(uint32_t *)(vptr + tile_vslice_offset(i + off)) = 0;
+ }
+}
+
+static void clear_vertical_d(void *vptr, size_t off, size_t len)
+{
+ for (size_t i = 0; i < len; i += 8) {
+ *(uint64_t *)(vptr + tile_vslice_offset(i + off)) = 0;
+ }
+}
+
+static void clear_vertical_q(void *vptr, size_t off, size_t len)
+{
+ for (size_t i = 0; i < len; i += 16) {
+ memset(vptr + tile_vslice_offset(i + off), 0, 16);
+ }
+}
+
+/*
+ * Copy elements from an array into a tile slice comprising len bytes.
+ */
+
+typedef void CopyFn(void *dst, const void *src, size_t len);
+
+static void copy_horizontal(void *dst, const void *src, size_t len)
+{
+ memcpy(dst, src, len);
+}
+
+static void copy_vertical_b(void *vdst, const void *vsrc, size_t len)
+{
+ const uint8_t *src = vsrc;
+ uint8_t *dst = vdst;
+ size_t i;
+
+ for (i = 0; i < len; ++i) {
+ dst[tile_vslice_index(i)] = src[i];
+ }
+}
+
+static void copy_vertical_h(void *vdst, const void *vsrc, size_t len)
+{
+ const uint16_t *src = vsrc;
+ uint16_t *dst = vdst;
+ size_t i;
+
+ for (i = 0; i < len / 2; ++i) {
+ dst[tile_vslice_index(i)] = src[i];
+ }
+}
+
+static void copy_vertical_s(void *vdst, const void *vsrc, size_t len)
+{
+ const uint32_t *src = vsrc;
+ uint32_t *dst = vdst;
+ size_t i;
+
+ for (i = 0; i < len / 4; ++i) {
+ dst[tile_vslice_index(i)] = src[i];
+ }
+}
+
+static void copy_vertical_d(void *vdst, const void *vsrc, size_t len)
+{
+ const uint64_t *src = vsrc;
+ uint64_t *dst = vdst;
+ size_t i;
+
+ for (i = 0; i < len / 8; ++i) {
+ dst[tile_vslice_index(i)] = src[i];
+ }
+}
+
+static void copy_vertical_q(void *vdst, const void *vsrc, size_t len)
+{
+ for (size_t i = 0; i < len; i += 16) {
+ memcpy(vdst + tile_vslice_offset(i), vsrc + i, 16);
+ }
+}
+
+/*
+ * Host and TLB primitives for vertical tile slice addressing.
+ */
+
+#define DO_LD(NAME, TYPE, HOST, TLB) \
+static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host) \
+{ \
+ TYPE val = HOST(host); \
+ *(TYPE *)(za + tile_vslice_offset(off)) = val; \
+} \
+static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za, \
+ intptr_t off, target_ulong addr, uintptr_t ra) \
+{ \
+ TYPE val = TLB(env, useronly_clean_ptr(addr), ra); \
+ *(TYPE *)(za + tile_vslice_offset(off)) = val; \
+}
+
+#define DO_ST(NAME, TYPE, HOST, TLB) \
+static inline void sme_##NAME##_v_host(void *za, intptr_t off, void *host) \
+{ \
+ TYPE val = *(TYPE *)(za + tile_vslice_offset(off)); \
+ HOST(host, val); \
+} \
+static inline void sme_##NAME##_v_tlb(CPUARMState *env, void *za, \
+ intptr_t off, target_ulong addr, uintptr_t ra) \
+{ \
+ TYPE val = *(TYPE *)(za + tile_vslice_offset(off)); \
+ TLB(env, useronly_clean_ptr(addr), val, ra); \
+}
+
+/*
+ * The ARMVectorReg elements are stored in host-endian 64-bit units.
+ * For 128-bit quantities, the sequence defined by the Elem[] pseudocode
+ * corresponds to storing the two 64-bit pieces in little-endian order.
+ */
+#define DO_LDQ(HNAME, VNAME, BE, HOST, TLB) \
+static inline void HNAME##_host(void *za, intptr_t off, void *host) \
+{ \
+ uint64_t val0 = HOST(host), val1 = HOST(host + 8); \
+ uint64_t *ptr = za + off; \
+ ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1; \
+} \
+static inline void VNAME##_v_host(void *za, intptr_t off, void *host) \
+{ \
+ HNAME##_host(za, tile_vslice_offset(off), host); \
+} \
+static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off, \
+ target_ulong addr, uintptr_t ra) \
+{ \
+ uint64_t val0 = TLB(env, useronly_clean_ptr(addr), ra); \
+ uint64_t val1 = TLB(env, useronly_clean_ptr(addr + 8), ra); \
+ uint64_t *ptr = za + off; \
+ ptr[0] = BE ? val1 : val0, ptr[1] = BE ? val0 : val1; \
+} \
+static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off, \
+ target_ulong addr, uintptr_t ra) \
+{ \
+ HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra); \
+}
+
+#define DO_STQ(HNAME, VNAME, BE, HOST, TLB) \
+static inline void HNAME##_host(void *za, intptr_t off, void *host) \
+{ \
+ uint64_t *ptr = za + off; \
+ HOST(host, ptr[BE]); \
+ HOST(host + 8, ptr[!BE]); \
+} \
+static inline void VNAME##_v_host(void *za, intptr_t off, void *host) \
+{ \
+ HNAME##_host(za, tile_vslice_offset(off), host); \
+} \
+static inline void HNAME##_tlb(CPUARMState *env, void *za, intptr_t off, \
+ target_ulong addr, uintptr_t ra) \
+{ \
+ uint64_t *ptr = za + off; \
+ TLB(env, useronly_clean_ptr(addr), ptr[BE], ra); \
+ TLB(env, useronly_clean_ptr(addr + 8), ptr[!BE], ra); \
+} \
+static inline void VNAME##_v_tlb(CPUARMState *env, void *za, intptr_t off, \
+ target_ulong addr, uintptr_t ra) \
+{ \
+ HNAME##_tlb(env, za, tile_vslice_offset(off), addr, ra); \
+}
+
+DO_LD(ld1b, uint8_t, ldub_p, cpu_ldub_data_ra)
+DO_LD(ld1h_be, uint16_t, lduw_be_p, cpu_lduw_be_data_ra)
+DO_LD(ld1h_le, uint16_t, lduw_le_p, cpu_lduw_le_data_ra)
+DO_LD(ld1s_be, uint32_t, ldl_be_p, cpu_ldl_be_data_ra)
+DO_LD(ld1s_le, uint32_t, ldl_le_p, cpu_ldl_le_data_ra)
+DO_LD(ld1d_be, uint64_t, ldq_be_p, cpu_ldq_be_data_ra)
+DO_LD(ld1d_le, uint64_t, ldq_le_p, cpu_ldq_le_data_ra)
+
+DO_LDQ(sve_ld1qq_be, sme_ld1q_be, 1, ldq_be_p, cpu_ldq_be_data_ra)
+DO_LDQ(sve_ld1qq_le, sme_ld1q_le, 0, ldq_le_p, cpu_ldq_le_data_ra)
+
+DO_ST(st1b, uint8_t, stb_p, cpu_stb_data_ra)
+DO_ST(st1h_be, uint16_t, stw_be_p, cpu_stw_be_data_ra)
+DO_ST(st1h_le, uint16_t, stw_le_p, cpu_stw_le_data_ra)
+DO_ST(st1s_be, uint32_t, stl_be_p, cpu_stl_be_data_ra)
+DO_ST(st1s_le, uint32_t, stl_le_p, cpu_stl_le_data_ra)
+DO_ST(st1d_be, uint64_t, stq_be_p, cpu_stq_be_data_ra)
+DO_ST(st1d_le, uint64_t, stq_le_p, cpu_stq_le_data_ra)
+
+DO_STQ(sve_st1qq_be, sme_st1q_be, 1, stq_be_p, cpu_stq_be_data_ra)
+DO_STQ(sve_st1qq_le, sme_st1q_le, 0, stq_le_p, cpu_stq_le_data_ra)
+
+#undef DO_LD
+#undef DO_ST
+#undef DO_LDQ
+#undef DO_STQ
+
+/*
+ * Common helper for all contiguous predicated loads.
+ */
+
+static inline QEMU_ALWAYS_INLINE
+void sme_ld1(CPUARMState *env, void *za, uint64_t *vg,
+ const target_ulong addr, uint32_t desc, const uintptr_t ra,
+ const int esz, uint32_t mtedesc, bool vertical,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn,
+ ClearFn *clr_fn,
+ CopyFn *cpy_fn)
+{
+ const intptr_t reg_max = simd_oprsz(desc);
+ const intptr_t esize = 1 << esz;
+ intptr_t reg_off, reg_last;
+ SVEContLdSt info;
+ void *host;
+ int flags;
+
+ /* Find the active elements. */
+ if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) {
+ /* The entire predicate was false; no load occurs. */
+ clr_fn(za, 0, reg_max);
+ return;
+ }
+
+ /* Probe the page(s). Exit with exception for any invalid page. */
+ sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, ra);
+
+ /* Handle watchpoints for all active elements. */
+ sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize,
+ BP_MEM_READ, ra);
+
+ /*
+ * Handle mte checks for all active elements.
+ * Since TBI must be set for MTE, !mtedesc => !mte_active.
+ */
+ if (mtedesc) {
+ sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize,
+ mtedesc, ra);
+ }
+
+ flags = info.page[0].flags | info.page[1].flags;
+ if (unlikely(flags != 0)) {
+#ifdef CONFIG_USER_ONLY
+ g_assert_not_reached();
+#else
+ /*
+ * At least one page includes MMIO.
+ * Any bus operation can fail with cpu_transaction_failed,
+ * which for ARM will raise SyncExternal. Perform the load
+ * into scratch memory to preserve register state until the end.
+ */
+ ARMVectorReg scratch = { };
+
+ reg_off = info.reg_off_first[0];
+ reg_last = info.reg_off_last[1];
+ if (reg_last < 0) {
+ reg_last = info.reg_off_split;
+ if (reg_last < 0) {
+ reg_last = info.reg_off_last[0];
+ }
+ }
+
+ do {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+ tlb_fn(env, &scratch, reg_off, addr + reg_off, ra);
+ }
+ reg_off += esize;
+ } while (reg_off & 63);
+ } while (reg_off <= reg_last);
+
+ cpy_fn(za, &scratch, reg_max);
+ return;
+#endif
+ }
+
+ /* The entire operation is in RAM, on valid pages. */
+
+ reg_off = info.reg_off_first[0];
+ reg_last = info.reg_off_last[0];
+ host = info.page[0].host;
+
+ if (!vertical) {
+ memset(za, 0, reg_max);
+ } else if (reg_off) {
+ clr_fn(za, 0, reg_off);
+ }
+
+ while (reg_off <= reg_last) {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+ host_fn(za, reg_off, host + reg_off);
+ } else if (vertical) {
+ clr_fn(za, reg_off, esize);
+ }
+ reg_off += esize;
+ } while (reg_off <= reg_last && (reg_off & 63));
+ }
+
+ /*
+ * Use the slow path to manage the cross-page misalignment.
+ * But we know this is RAM and cannot trap.
+ */
+ reg_off = info.reg_off_split;
+ if (unlikely(reg_off >= 0)) {
+ tlb_fn(env, za, reg_off, addr + reg_off, ra);
+ }
+
+ reg_off = info.reg_off_first[1];
+ if (unlikely(reg_off >= 0)) {
+ reg_last = info.reg_off_last[1];
+ host = info.page[1].host;
+
+ do {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+ host_fn(za, reg_off, host + reg_off);
+ } else if (vertical) {
+ clr_fn(za, reg_off, esize);
+ }
+ reg_off += esize;
+ } while (reg_off & 63);
+ } while (reg_off <= reg_last);
+ }
+}
+
+static inline QEMU_ALWAYS_INLINE
+void sme_ld1_mte(CPUARMState *env, void *za, uint64_t *vg,
+ target_ulong addr, uint32_t desc, uintptr_t ra,
+ const int esz, bool vertical,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn,
+ ClearFn *clr_fn,
+ CopyFn *cpy_fn)
+{
+ uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+ int bit55 = extract64(addr, 55, 1);
+
+ /* Remove mtedesc from the normal sve descriptor. */
+ desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+
+ /* Perform gross MTE suppression early. */
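+ /*
+ * If TBI is disabled for this address (per bit 55), or the logical
+ * tag is one that TCMA tells us to ignore, no tag check fault can be
+ * raised; clearing mtedesc lets the main helper skip MTE entirely.
+ */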
+ if (!tbi_check(desc, bit55) ||
+ tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
+ mtedesc = 0;
+ }
+
+ sme_ld1(env, za, vg, addr, desc, ra, esz, mtedesc, vertical,
+ host_fn, tlb_fn, clr_fn, cpy_fn);
+}
+
+#define DO_LD(L, END, ESZ) \
+void HELPER(sme_ld1##L##END##_h)(CPUARMState *env, void *za, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false, \
+ sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb, \
+ clear_horizontal, copy_horizontal); \
+} \
+void HELPER(sme_ld1##L##END##_v)(CPUARMState *env, void *za, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sme_ld1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true, \
+ sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb, \
+ clear_vertical_##L, copy_vertical_##L); \
+} \
+void HELPER(sme_ld1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false, \
+ sve_ld1##L##L##END##_host, sve_ld1##L##L##END##_tlb, \
+ clear_horizontal, copy_horizontal); \
+} \
+void HELPER(sme_ld1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sme_ld1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true, \
+ sme_ld1##L##END##_v_host, sme_ld1##L##END##_v_tlb, \
+ clear_vertical_##L, copy_vertical_##L); \
+}
+
+DO_LD(b, , MO_8)
+DO_LD(h, _be, MO_16)
+DO_LD(h, _le, MO_16)
+DO_LD(s, _be, MO_32)
+DO_LD(s, _le, MO_32)
+DO_LD(d, _be, MO_64)
+DO_LD(d, _le, MO_64)
+DO_LD(q, _be, MO_128)
+DO_LD(q, _le, MO_128)
+
+#undef DO_LD
+
+/*
+ * Common helper for all contiguous predicated stores.
+ */
+
+static inline QEMU_ALWAYS_INLINE
+void sme_st1(CPUARMState *env, void *za, uint64_t *vg,
+ const target_ulong addr, uint32_t desc, const uintptr_t ra,
+ const int esz, uint32_t mtedesc, bool vertical,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
+{
+ const intptr_t reg_max = simd_oprsz(desc);
+ const intptr_t esize = 1 << esz;
+ intptr_t reg_off, reg_last;
+ SVEContLdSt info;
+ void *host;
+ int flags;
+
+ /* Find the active elements. */
+ if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, esize)) {
+ /* The entire predicate was false; no store occurs. */
+ return;
+ }
+
+ /* Probe the page(s). Exit with exception for any invalid page. */
+ sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, ra);
+
+ /* Handle watchpoints for all active elements. */
+ sve_cont_ldst_watchpoints(&info, env, vg, addr, esize, esize,
+ BP_MEM_WRITE, ra);
+
+ /*
+ * Handle mte checks for all active elements.
+ * Since TBI must be set for MTE, !mtedesc => !mte_active.
+ */
+ if (mtedesc) {
+ sve_cont_ldst_mte_check(&info, env, vg, addr, esize, esize,
+ mtedesc, ra);
+ }
+
+ flags = info.page[0].flags | info.page[1].flags;
+ if (unlikely(flags != 0)) {
+#ifdef CONFIG_USER_ONLY
+ g_assert_not_reached();
+#else
+ /*
+ * At least one page includes MMIO.
+ * Any bus operation can fail with cpu_transaction_failed,
+ * which for ARM will raise SyncExternal. We cannot avoid
+ * this fault and will leave with the store incomplete.
+ */
+ reg_off = info.reg_off_first[0];
+ reg_last = info.reg_off_last[1];
+ if (reg_last < 0) {
+ reg_last = info.reg_off_split;
+ if (reg_last < 0) {
+ reg_last = info.reg_off_last[0];
+ }
+ }
+
+ do {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+ tlb_fn(env, za, reg_off, addr + reg_off, ra);
+ }
+ reg_off += esize;
+ } while (reg_off & 63);
+ } while (reg_off <= reg_last);
+ return;
+#endif
+ }
+
+ reg_off = info.reg_off_first[0];
+ reg_last = info.reg_off_last[0];
+ host = info.page[0].host;
+
+ while (reg_off <= reg_last) {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+ host_fn(za, reg_off, host + reg_off);
+ }
+ reg_off += 1 << esz;
+ } while (reg_off <= reg_last && (reg_off & 63));
+ }
+
+ /*
+ * Use the slow path to manage the cross-page misalignment.
+ * But we know this is RAM and cannot trap.
+ */
+ reg_off = info.reg_off_split;
+ if (unlikely(reg_off >= 0)) {
+ tlb_fn(env, za, reg_off, addr + reg_off, ra);
+ }
+
+ reg_off = info.reg_off_first[1];
+ if (unlikely(reg_off >= 0)) {
+ reg_last = info.reg_off_last[1];
+ host = info.page[1].host;
+
+ do {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+ host_fn(za, reg_off, host + reg_off);
+ }
+ reg_off += 1 << esz;
+ } while (reg_off & 63);
+ } while (reg_off <= reg_last);
+ }
+}
+
+static inline QEMU_ALWAYS_INLINE
+void sme_st1_mte(CPUARMState *env, void *za, uint64_t *vg, target_ulong addr,
+ uint32_t desc, uintptr_t ra, int esz, bool vertical,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
+{
+ uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+ int bit55 = extract64(addr, 55, 1);
+
+ /* Remove mtedesc from the normal sve descriptor. */
+ desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+
+ /* Perform gross MTE suppression early. */
+ if (!tbi_check(desc, bit55) ||
+ tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
+ mtedesc = 0;
+ }
+
+ sme_st1(env, za, vg, addr, desc, ra, esz, mtedesc,
+ vertical, host_fn, tlb_fn);
+}
+
+#define DO_ST(L, END, ESZ) \
+void HELPER(sme_st1##L##END##_h)(CPUARMState *env, void *za, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, false, \
+ sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb); \
+} \
+void HELPER(sme_st1##L##END##_v)(CPUARMState *env, void *za, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sme_st1(env, za, vg, addr, desc, GETPC(), ESZ, 0, true, \
+ sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb); \
+} \
+void HELPER(sme_st1##L##END##_h_mte)(CPUARMState *env, void *za, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, false, \
+ sve_st1##L##L##END##_host, sve_st1##L##L##END##_tlb); \
+} \
+void HELPER(sme_st1##L##END##_v_mte)(CPUARMState *env, void *za, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sme_st1_mte(env, za, vg, addr, desc, GETPC(), ESZ, true, \
+ sme_st1##L##END##_v_host, sme_st1##L##END##_v_tlb); \
+}
+
+DO_ST(b, , MO_8)
+DO_ST(h, _be, MO_16)
+DO_ST(h, _le, MO_16)
+DO_ST(s, _be, MO_32)
+DO_ST(s, _le, MO_32)
+DO_ST(d, _be, MO_64)
+DO_ST(d, _le, MO_64)
+DO_ST(q, _be, MO_128)
+DO_ST(q, _le, MO_128)
+
+#undef DO_ST
+
+void HELPER(sme_addha_s)(void *vzda, void *vzn, void *vpn,
+ void *vpm, uint32_t desc)
+{
+ intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
+ uint64_t *pn = vpn, *pm = vpm;
+ uint32_t *zda = vzda, *zn = vzn;
+
+ for (row = 0; row < oprsz; ) {
+ uint64_t pa = pn[row >> 4];
+ do {
+ if (pa & 1) {
+ for (col = 0; col < oprsz; ) {
+ uint64_t pb = pm[col >> 4];
+ do {
+ if (pb & 1) {
+ zda[tile_vslice_index(row) + H4(col)] += zn[H4(col)];
+ }
+ pb >>= 4;
+ } while (++col & 15);
+ }
+ }
+ pa >>= 4;
+ } while (++row & 15);
+ }
+}
+
+void HELPER(sme_addha_d)(void *vzda, void *vzn, void *vpn,
+ void *vpm, uint32_t desc)
+{
+ intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
+ uint8_t *pn = vpn, *pm = vpm;
+ uint64_t *zda = vzda, *zn = vzn;
+
+ for (row = 0; row < oprsz; ++row) {
+ if (pn[H1(row)] & 1) {
+ for (col = 0; col < oprsz; ++col) {
+ if (pm[H1(col)] & 1) {
+ zda[tile_vslice_index(row) + col] += zn[col];
+ }
+ }
+ }
+ }
+}
+
+void HELPER(sme_addva_s)(void *vzda, void *vzn, void *vpn,
+ void *vpm, uint32_t desc)
+{
+ intptr_t row, col, oprsz = simd_oprsz(desc) / 4;
+ uint64_t *pn = vpn, *pm = vpm;
+ uint32_t *zda = vzda, *zn = vzn;
+
+ for (row = 0; row < oprsz; ) {
+ uint64_t pa = pn[row >> 4];
+ do {
+ if (pa & 1) {
+ uint32_t zn_row = zn[H4(row)];
+ for (col = 0; col < oprsz; ) {
+ uint64_t pb = pm[col >> 4];
+ do {
+ if (pb & 1) {
+ zda[tile_vslice_index(row) + H4(col)] += zn_row;
+ }
+ pb >>= 4;
+ } while (++col & 15);
+ }
+ }
+ pa >>= 4;
+ } while (++row & 15);
+ }
+}
+
+void HELPER(sme_addva_d)(void *vzda, void *vzn, void *vpn,
+ void *vpm, uint32_t desc)
+{
+ intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
+ uint8_t *pn = vpn, *pm = vpm;
+ uint64_t *zda = vzda, *zn = vzn;
+
+ for (row = 0; row < oprsz; ++row) {
+ if (pn[H1(row)] & 1) {
+ uint64_t zn_row = zn[row];
+ for (col = 0; col < oprsz; ++col) {
+ if (pm[H1(col)] & 1) {
+ zda[tile_vslice_index(row) + col] += zn_row;
+ }
+ }
+ }
+ }
+}
+
+void HELPER(sme_fmopa_s)(void *vza, void *vzn, void *vzm, void *vpn,
+ void *vpm, void *vst, uint32_t desc)
+{
+ intptr_t row, col, oprsz = simd_maxsz(desc);
+ uint32_t neg = simd_data(desc) << 31;
+ uint16_t *pn = vpn, *pm = vpm;
+ float_status fpst;
+
+ /*
+ * Make a copy of float_status because this operation does not
+ * update the cumulative fp exception status. It also produces
+ * default nans.
+ */
+ fpst = *(float_status *)vst;
+ set_default_nan_mode(true, &fpst);
+
+ for (row = 0; row < oprsz; ) {
+ uint16_t pa = pn[H2(row >> 4)];
+ do {
+ if (pa & 1) {
+ void *vza_row = vza + tile_vslice_offset(row);
+ uint32_t n = *(uint32_t *)(vzn + H1_4(row)) ^ neg;
+
+ for (col = 0; col < oprsz; ) {
+ uint16_t pb = pm[H2(col >> 4)];
+ do {
+ if (pb & 1) {
+ uint32_t *a = vza_row + H1_4(col);
+ uint32_t *m = vzm + H1_4(col);
+ *a = float32_muladd(n, *m, *a, 0, &fpst);
+ }
+ col += 4;
+ pb >>= 4;
+ } while (col & 15);
+ }
+ }
+ row += 4;
+ pa >>= 4;
+ } while (row & 15);
+ }
+}
+
+void HELPER(sme_fmopa_d)(void *vza, void *vzn, void *vzm, void *vpn,
+ void *vpm, void *vst, uint32_t desc)
+{
+ intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
+ uint64_t neg = (uint64_t)simd_data(desc) << 63;
+ uint64_t *za = vza, *zn = vzn, *zm = vzm;
+ uint8_t *pn = vpn, *pm = vpm;
+ float_status fpst = *(float_status *)vst;
+
+ set_default_nan_mode(true, &fpst);
+
+ for (row = 0; row < oprsz; ++row) {
+ if (pn[H1(row)] & 1) {
+ uint64_t *za_row = &za[tile_vslice_index(row)];
+ uint64_t n = zn[row] ^ neg;
+
+ for (col = 0; col < oprsz; ++col) {
+ if (pm[H1(col)] & 1) {
+ uint64_t *a = &za_row[col];
+ *a = float64_muladd(n, zm[col], *a, 0, &fpst);
+ }
+ }
+ }
+ }
+}
+
+/*
+ * Alter PAIR as needed for controlling predicates being false,
+ * and for NEG on an enabled row element.
+ */
+static inline uint32_t f16mop_adj_pair(uint32_t pair, uint32_t pg, uint32_t neg)
+{
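+ /*
+ * PG bit 0 governs the low (even) fp16 element of the pair and
+ * PG bit 2 the high (odd) one: predicate bits are per byte, so
+ * 16-bit elements use every other bit.
+ */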
+ /*
+ * The pseudocode uses a conditional negate after the conditional zero.
+ * It is simpler here to unconditionally negate before conditional zero.
+ */
+ pair ^= neg;
+ if (!(pg & 1)) {
+ pair &= 0xffff0000u;
+ }
+ if (!(pg & 4)) {
+ pair &= 0x0000ffffu;
+ }
+ return pair;
+}
+
+static float32 f16_dotadd(float32 sum, uint32_t e1, uint32_t e2,
+ float_status *s_std, float_status *s_odd)
+{
+ float64 e1r = float16_to_float64(e1 & 0xffff, true, s_std);
+ float64 e1c = float16_to_float64(e1 >> 16, true, s_std);
+ float64 e2r = float16_to_float64(e2 & 0xffff, true, s_std);
+ float64 e2c = float16_to_float64(e2 >> 16, true, s_std);
+ float64 t64;
+ float32 t32;
+
+ /*
+ * The ARM pseudocode function FPDot performs both multiplies
+ * and the add with a single rounding operation. Emulate this
+ * by performing the first multiply in round-to-odd, then doing
+ * the second multiply as fused multiply-add, and rounding to
+ * float32 all in one step.
+ */
+ t64 = float64_mul(e1r, e2r, s_odd);
+ t64 = float64r32_muladd(e1c, e2c, t64, 0, s_std);
+
+ /* This conversion is exact, because we've already rounded. */
+ t32 = float64_to_float32(t64, s_std);
+
+ /* The final accumulation step is not fused. */
+ return float32_add(sum, t32, s_std);
+}
+
+void HELPER(sme_fmopa_h)(void *vza, void *vzn, void *vzm, void *vpn,
+ void *vpm, void *vst, uint32_t desc)
+{
+ intptr_t row, col, oprsz = simd_maxsz(desc);
+ uint32_t neg = simd_data(desc) * 0x80008000u;
+ uint16_t *pn = vpn, *pm = vpm;
+ float_status fpst_odd, fpst_std;
+
+ /*
+ * Make a copy of float_status because this operation does not
+ * update the cumulative fp exception status. It also produces
+ * default nans. Make a second copy with round-to-odd -- see above.
+ */
+ fpst_std = *(float_status *)vst;
+ set_default_nan_mode(true, &fpst_std);
+ fpst_odd = fpst_std;
+ set_float_rounding_mode(float_round_to_odd, &fpst_odd);
+
+ for (row = 0; row < oprsz; ) {
+ uint16_t prow = pn[H2(row >> 4)];
+ do {
+ void *vza_row = vza + tile_vslice_offset(row);
+ uint32_t n = *(uint32_t *)(vzn + H1_4(row));
+
+ n = f16mop_adj_pair(n, prow, neg);
+
+ for (col = 0; col < oprsz; ) {
+ uint16_t pcol = pm[H2(col >> 4)];
+ do {
+ if (prow & pcol & 0b0101) {
+ uint32_t *a = vza_row + H1_4(col);
+ uint32_t m = *(uint32_t *)(vzm + H1_4(col));
+
+ m = f16mop_adj_pair(m, pcol, 0);
+ *a = f16_dotadd(*a, n, m, &fpst_std, &fpst_odd);
+ }
+ /* Advance the column even when this pair is inactive. */
+ col += 4;
+ pcol >>= 4;
+ } while (col & 15);
+ }
+ row += 4;
+ prow >>= 4;
+ } while (row & 15);
+ }
+}
+
+void HELPER(sme_bfmopa)(void *vza, void *vzn, void *vzm, void *vpn,
+ void *vpm, uint32_t desc)
+{
+ intptr_t row, col, oprsz = simd_maxsz(desc);
+ uint32_t neg = simd_data(desc) * 0x80008000u;
+ uint16_t *pn = vpn, *pm = vpm;
+
+ for (row = 0; row < oprsz; ) {
+ uint16_t prow = pn[H2(row >> 4)];
+ do {
+ void *vza_row = vza + tile_vslice_offset(row);
+ uint32_t n = *(uint32_t *)(vzn + H1_4(row));
+
+ n = f16mop_adj_pair(n, prow, neg);
+
+ for (col = 0; col < oprsz; ) {
+ uint16_t pcol = pm[H2(col >> 4)];
+ do {
+ if (prow & pcol & 0b0101) {
+ uint32_t *a = vza_row + H1_4(col);
+ uint32_t m = *(uint32_t *)(vzm + H1_4(col));
+
+ m = f16mop_adj_pair(m, pcol, 0);
+ *a = bfdotadd(*a, n, m);
+ }
+ /* Advance the column even when this pair is inactive. */
+ col += 4;
+ pcol >>= 4;
+ } while (col & 15);
+ }
+ row += 4;
+ prow >>= 4;
+ } while (row & 15);
+ }
+}
+
+typedef uint64_t IMOPFn(uint64_t, uint64_t, uint64_t, uint8_t, bool);
+
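+/*
+ * Apply FN to one row of ZN against every column of ZM. Each call
+ * consumes a full 64-bit element; the combined row/column predicate
+ * byte (pa & pb) is passed down so FN can mask the packed narrow
+ * sub-elements itself (via expand_pred_b/expand_pred_h below).
+ */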
+static inline void do_imopa(uint64_t *za, uint64_t *zn, uint64_t *zm,
+ uint8_t *pn, uint8_t *pm,
+ uint32_t desc, IMOPFn *fn)
+{
+ intptr_t row, col, oprsz = simd_oprsz(desc) / 8;
+ bool neg = simd_data(desc);
+
+ for (row = 0; row < oprsz; ++row) {
+ uint8_t pa = pn[H1(row)];
+ uint64_t *za_row = &za[tile_vslice_index(row)];
+ uint64_t n = zn[row];
+
+ for (col = 0; col < oprsz; ++col) {
+ uint8_t pb = pm[H1(col)];
+ uint64_t *a = &za_row[col];
+
+ *a = fn(n, zm[col], *a, pa & pb, neg);
+ }
+ }
+}
+
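+/*
+ * Each 64-bit lane of ZDA holds two 32-bit accumulators: sum0 gathers
+ * the four byte products feeding the low accumulator and sum1 those
+ * feeding the high one, repacked as (sum1 << 32) | sum0.
+ */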
+#define DEF_IMOP_32(NAME, NTYPE, MTYPE) \
+static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
+{ \
+ uint32_t sum0 = 0, sum1 = 0; \
+ /* Apply P to N as a mask, making the inactive elements 0. */ \
+ n &= expand_pred_b(p); \
+ sum0 += (NTYPE)(n >> 0) * (MTYPE)(m >> 0); \
+ sum0 += (NTYPE)(n >> 8) * (MTYPE)(m >> 8); \
+ sum0 += (NTYPE)(n >> 16) * (MTYPE)(m >> 16); \
+ sum0 += (NTYPE)(n >> 24) * (MTYPE)(m >> 24); \
+ sum1 += (NTYPE)(n >> 32) * (MTYPE)(m >> 32); \
+ sum1 += (NTYPE)(n >> 40) * (MTYPE)(m >> 40); \
+ sum1 += (NTYPE)(n >> 48) * (MTYPE)(m >> 48); \
+ sum1 += (NTYPE)(n >> 56) * (MTYPE)(m >> 56); \
+ if (neg) { \
+ sum0 = (uint32_t)a - sum0, sum1 = (uint32_t)(a >> 32) - sum1; \
+ } else { \
+ sum0 = (uint32_t)a + sum0, sum1 = (uint32_t)(a >> 32) + sum1; \
+ } \
+ return ((uint64_t)sum1 << 32) | sum0; \
+}
+
+#define DEF_IMOP_64(NAME, NTYPE, MTYPE) \
+static uint64_t NAME(uint64_t n, uint64_t m, uint64_t a, uint8_t p, bool neg) \
+{ \
+ uint64_t sum = 0; \
+ /* Apply P to N as a mask, making the inactive elements 0. */ \
+ n &= expand_pred_h(p); \
+ sum += (NTYPE)(n >> 0) * (MTYPE)(m >> 0); \
+ sum += (NTYPE)(n >> 16) * (MTYPE)(m >> 16); \
+ sum += (NTYPE)(n >> 32) * (MTYPE)(m >> 32); \
+ sum += (NTYPE)(n >> 48) * (MTYPE)(m >> 48); \
+ return neg ? a - sum : a + sum; \
+}
+
+DEF_IMOP_32(smopa_s, int8_t, int8_t)
+DEF_IMOP_32(umopa_s, uint8_t, uint8_t)
+DEF_IMOP_32(sumopa_s, int8_t, uint8_t)
+DEF_IMOP_32(usmopa_s, uint8_t, int8_t)
+
+DEF_IMOP_64(smopa_d, int16_t, int16_t)
+DEF_IMOP_64(umopa_d, uint16_t, uint16_t)
+DEF_IMOP_64(sumopa_d, int16_t, uint16_t)
+DEF_IMOP_64(usmopa_d, uint16_t, int16_t)
+
+#define DEF_IMOPH(NAME) \
+ void HELPER(sme_##NAME)(void *vza, void *vzn, void *vzm, void *vpn, \
+ void *vpm, uint32_t desc) \
+ { do_imopa(vza, vzn, vzm, vpn, vpm, desc, NAME); }
+
+DEF_IMOPH(smopa_s)
+DEF_IMOPH(umopa_s)
+DEF_IMOPH(sumopa_s)
+DEF_IMOPH(usmopa_s)
+DEF_IMOPH(smopa_d)
+DEF_IMOPH(umopa_d)
+DEF_IMOPH(sumopa_d)
+DEF_IMOPH(usmopa_d)
--- /dev/null
+/*
+ * ARM SVE Operations
+ *
+ * Copyright (c) 2018 Linaro, Ltd.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "cpu.h"
+#include "internals.h"
+#include "exec/exec-all.h"
+#include "exec/helper-proto.h"
+#include "tcg/tcg-gvec-desc.h"
+#include "fpu/softfloat.h"
+#include "tcg/tcg.h"
+#include "vec_internal.h"
+#include "sve_ldst_internal.h"
+
+
+/* Return a value for NZCV as per the ARM PredTest pseudofunction.
+ *
+ * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
+ * and bit 0 set if C is set. Compare the definitions of these variables
+ * within CPUARMState.
+ */
+
+/* For no G bits set, NZCV = C. */
+#define PREDTEST_INIT 1
+
+/* This is an iterative function, called for each Pd and Pg word
+ * moving forward.
+ */
+static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
+{
+ if (likely(g)) {
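+ /*
+ * g & -g isolates the lowest set bit of G, i.e. the first active
+ * element; pow2floor(g) isolates the highest set bit, i.e. the
+ * last active element.
+ */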
+ /* Compute N from first D & G.
+ Use bit 2 to signal first G bit seen. */
+ if (!(flags & 4)) {
+ flags |= ((d & (g & -g)) != 0) << 31;
+ flags |= 4;
+ }
+
+ /* Accumulate Z from each D & G. */
+ flags |= ((d & g) != 0) << 1;
+
+ /* Compute C from last !(D & G). Replace previous. */
+ flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
+ }
+ return flags;
+}
+
+/* This is an iterative function, called for each Pd and Pg word
+ * moving backward.
+ */
+static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
+{
+ if (likely(g)) {
+ /* Compute C from first (i.e. last) !(D & G).
+ Use bit 2 to signal first G bit seen. */
+ if (!(flags & 4)) {
+ flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
+ flags |= (d & pow2floor(g)) == 0;
+ }
+
+ /* Accumulate Z from each D & G. */
+ flags |= ((d & g) != 0) << 1;
+
+ /* Compute N from last (i.e. first) D & G. Replace previous. */
+ flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
+ }
+ return flags;
+}
+
+/* The same for a single word predicate. */
+uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
+{
+ return iter_predtest_fwd(d, g, PREDTEST_INIT);
+}
+
+/* The same for a multi-word predicate. */
+uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
+{
+ uint32_t flags = PREDTEST_INIT;
+ uint64_t *d = vd, *g = vg;
+ uintptr_t i = 0;
+
+ do {
+ flags = iter_predtest_fwd(d[i], g[i], flags);
+ } while (++i < words);
+
+ return flags;
+}
+
+/* Similarly for single word elements. */
+static inline uint64_t expand_pred_s(uint8_t byte)
+{
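+ /*
+ * For 4-byte elements only predicate bits 0 and 4 are significant
+ * (one predicate bit per byte), hence the & 0x11 mask and the
+ * sparsely populated table.
+ */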
+ static const uint64_t word[] = {
+ [0x01] = 0x00000000ffffffffull,
+ [0x10] = 0xffffffff00000000ull,
+ [0x11] = 0xffffffffffffffffull,
+ };
+ return word[byte & 0x11];
+}
+
+#define LOGICAL_PPPP(NAME, FUNC) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
+{ \
+ uintptr_t opr_sz = simd_oprsz(desc); \
+ uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
+ uintptr_t i; \
+ for (i = 0; i < opr_sz / 8; ++i) { \
+ d[i] = FUNC(n[i], m[i], g[i]); \
+ } \
+}
+
+#define DO_AND(N, M, G) (((N) & (M)) & (G))
+#define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
+#define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
+#define DO_ORR(N, M, G) (((N) | (M)) & (G))
+#define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
+#define DO_NOR(N, M, G) (~((N) | (M)) & (G))
+#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
+#define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
+
+LOGICAL_PPPP(sve_and_pppp, DO_AND)
+LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
+LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
+LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
+LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
+LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
+LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
+LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
+
+#undef DO_AND
+#undef DO_BIC
+#undef DO_EOR
+#undef DO_ORR
+#undef DO_ORN
+#undef DO_NOR
+#undef DO_NAND
+#undef DO_SEL
+#undef LOGICAL_PPPP
+
+/* Fully general three-operand expander, controlled by a predicate.
+ * This is complicated by the host-endian storage of the register file.
+ */
+/* ??? I don't expect the compiler could ever vectorize this itself.
+ * With some tables we can convert bit masks to byte masks, and with
+ * extra care wrt byte/word ordering we could use gcc generic vectors
+ * and do 16 bytes at a time.
+ */
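+/*
+ * Note that predicate bits are allocated one per byte of the vector,
+ * so advancing I by sizeof(TYPE) while shifting PG right by the same
+ * amount keeps bit 0 of PG aligned with the element at offset I; each
+ * outer iteration consumes one 16-bit predicate word covering 16 bytes.
+ */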
+#define DO_ZPZZ(NAME, TYPE, H, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ for (i = 0; i < opr_sz; ) { \
+ uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
+ do { \
+ if (pg & 1) { \
+ TYPE nn = *(TYPE *)(vn + H(i)); \
+ TYPE mm = *(TYPE *)(vm + H(i)); \
+ *(TYPE *)(vd + H(i)) = OP(nn, mm); \
+ } \
+ i += sizeof(TYPE), pg >>= sizeof(TYPE); \
+ } while (i & 15); \
+ } \
+}
+
+/* Similarly, specialized for 64-bit operands. */
+#define DO_ZPZZ_D(NAME, TYPE, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
+ TYPE *d = vd, *n = vn, *m = vm; \
+ uint8_t *pg = vg; \
+ for (i = 0; i < opr_sz; i += 1) { \
+ if (pg[H1(i)] & 1) { \
+ TYPE nn = n[i], mm = m[i]; \
+ d[i] = OP(nn, mm); \
+ } \
+ } \
+}
+
+#define DO_AND(N, M) (N & M)
+#define DO_EOR(N, M) (N ^ M)
+#define DO_ORR(N, M) (N | M)
+#define DO_BIC(N, M) (N & ~M)
+#define DO_ADD(N, M) (N + M)
+#define DO_SUB(N, M) (N - M)
+#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
+#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
+#define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
+#define DO_MUL(N, M) (N * M)
+
+
+/*
+ * We must avoid the C undefined behaviour cases: division by
+ * zero and signed division of INT_MIN by -1. Both of these
+ * have architecturally defined required results for Arm.
+ * We special case all signed divisions by -1 to avoid having
+ * to deduce the minimum integer for the type involved.
+ */
+#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
+#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
+
+DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
+DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
+DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
+DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
+
+DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
+DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
+DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
+DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
+
+DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
+DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
+DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
+DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
+
+DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
+DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
+DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
+DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
+
+DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
+DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
+DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
+DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
+
+DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
+DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
+DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
+DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
+
+DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
+DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
+DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
+DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
+
+DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
+DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
+DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
+DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
+
+DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
+DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
+DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
+DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
+
+DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
+DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
+DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
+DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
+
+DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
+DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
+DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
+DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
+
+DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
+DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
+DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
+DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
+
+/* Because the computation type is at least twice as large as required,
+ these work for both signed and unsigned source types. */
+static inline uint8_t do_mulh_b(int32_t n, int32_t m)
+{
+ return (n * m) >> 8;
+}
+
+static inline uint16_t do_mulh_h(int32_t n, int32_t m)
+{
+ return (n * m) >> 16;
+}
+
+static inline uint32_t do_mulh_s(int64_t n, int64_t m)
+{
+ return (n * m) >> 32;
+}
+
+static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
+{
+ uint64_t lo, hi;
+ muls64(&lo, &hi, n, m);
+ return hi;
+}
+
+static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
+{
+ uint64_t lo, hi;
+ mulu64(&lo, &hi, n, m);
+ return hi;
+}
+
+DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
+DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
+DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
+DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
+
+DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
+DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
+DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
+DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
+
+DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
+DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
+DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
+DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
+
+DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
+DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
+
+DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
+DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
+
+/* Note that all bits of the shift are significant
+ and not modulo the element size. */
+#define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
+#define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
+#define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
+
+DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
+DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
+DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
+
+DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
+DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
+DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
+
+DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
+DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
+DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
+
+DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
+DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
+DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
+
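+/*
+ * Add-and-accumulate-long-pairwise: split each wide element of N into
+ * its two narrow halves, sign- or zero-extend them, and add both to
+ * the corresponding wide element of M.
+ */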
+static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
+{
+ int8_t n1 = n, n2 = n >> 8;
+ return m + n1 + n2;
+}
+
+static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
+{
+ int16_t n1 = n, n2 = n >> 16;
+ return m + n1 + n2;
+}
+
+static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
+{
+ int32_t n1 = n, n2 = n >> 32;
+ return m + n1 + n2;
+}
+
+DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
+DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
+DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
+
+static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
+{
+ uint8_t n1 = n, n2 = n >> 8;
+ return m + n1 + n2;
+}
+
+static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
+{
+ uint16_t n1 = n, n2 = n >> 16;
+ return m + n1 + n2;
+}
+
+static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
+{
+ uint32_t n1 = n, n2 = n >> 32;
+ return m + n1 + n2;
+}
+
+DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
+DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
+DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
+
+#define do_srshl_b(n, m) do_sqrshl_bhs(n, m, 8, true, NULL)
+#define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL)
+#define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL)
+#define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL)
+
+DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
+DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
+DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
+DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)
+
+#define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
+#define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
+#define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL)
+#define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL)
+
+DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
+DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
+DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
+DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
+
+/*
+ * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
+ * We pass in a pointer to a dummy saturation field to trigger
+ * the saturating arithmetic but discard the information about
+ * whether it has occurred.
+ */
+#define do_sqshl_b(n, m) \
+ ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
+#define do_sqshl_h(n, m) \
+ ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
+#define do_sqshl_s(n, m) \
+ ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
+#define do_sqshl_d(n, m) \
+ ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
+
+DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b)
+DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
+DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
+DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
+
+#define do_uqshl_b(n, m) \
+ ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
+#define do_uqshl_h(n, m) \
+ ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
+#define do_uqshl_s(n, m) \
+ ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
+#define do_uqshl_d(n, m) \
+ ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
+
+DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b)
+DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
+DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
+DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
+
+#define do_sqrshl_b(n, m) \
+ ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
+#define do_sqrshl_h(n, m) \
+ ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
+#define do_sqrshl_s(n, m) \
+ ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
+#define do_sqrshl_d(n, m) \
+ ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
+
+DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b)
+DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
+DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
+DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
+
+#undef do_sqrshl_d
+
+#define do_uqrshl_b(n, m) \
+ ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
+#define do_uqrshl_h(n, m) \
+ ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
+#define do_uqrshl_s(n, m) \
+ ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
+#define do_uqrshl_d(n, m) \
+ ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
+
+DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b)
+DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
+DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
+DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
+
+#undef do_uqrshl_d
+
+#define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1)
+#define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1))
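+/*
+ * The 64-bit forms cannot widen, so halve each operand and repair the
+ * dropped low bits: with n = 2a + x and m = 2b + y, floor((n + m) / 2)
+ * is a + b + (x & y). The rounding-halving and halving-subtract forms
+ * below use ((n | m) & 1) and (~n & m & 1) as their correction terms.
+ */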
+
+DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
+DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
+DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
+DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)
+
+DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
+DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
+DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
+DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)
+
+#define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1)
+#define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1))
+
+DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
+DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
+DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
+DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)
+
+DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
+DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
+DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
+DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)
+
+#define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1)
+#define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1))
+
+DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
+DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
+DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
+DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)
+
+DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
+DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
+DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
+DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
+
+static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
+{
+ return val >= max ? max : val <= min ? min : val;
+}
+
+#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
+#define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
+#define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)
+
+static inline int64_t do_sqadd_d(int64_t n, int64_t m)
+{
+ int64_t r = n + m;
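+ /*
+ * Signed overflow occurred iff the operands have the same sign but
+ * the result does not: (r ^ n) has the sign bit set when r and n
+ * differ in sign, ~(n ^ m) when n and m agree.
+ */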
+ if (((r ^ n) & ~(n ^ m)) < 0) {
+ /* Signed overflow. */
+ return r < 0 ? INT64_MAX : INT64_MIN;
+ }
+ return r;
+}
+
+DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
+DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
+DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
+DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)
+
+#define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
+#define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
+#define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)
+
+static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
+{
+ uint64_t r = n + m;
+ return r < n ? UINT64_MAX : r;
+}
+
+DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
+DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
+DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
+DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)
+
+#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
+#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
+#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)
+
+static inline int64_t do_sqsub_d(int64_t n, int64_t m)
+{
+ int64_t r = n - m;
+ if (((r ^ n) & (n ^ m)) < 0) {
+ /* Signed overflow. */
+ return r < 0 ? INT64_MAX : INT64_MIN;
+ }
+ return r;
+}
+
+DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
+DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
+DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
+DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)
+
+#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
+#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
+#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)
+
+static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
+{
+ return n > m ? n - m : 0;
+}
+
+DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
+DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
+DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
+DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)
+
+#define DO_SUQADD_B(n, m) \
+ do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
+#define DO_SUQADD_H(n, m) \
+ do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
+#define DO_SUQADD_S(n, m) \
+ do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)
+
+static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
+{
+ uint64_t r = n + m;
+
+ if (n < 0) {
+ /* Note that m - abs(n) cannot underflow. */
+ if (r > INT64_MAX) {
+ /* Result is either very large positive or negative. */
+ if (m > -n) {
+ /* m > abs(n), so r is a very large positive. */
+ return INT64_MAX;
+ }
+ /* Result is negative. */
+ }
+ } else {
+ /* Both inputs are positive: check for overflow. */
+ if (r < m || r > INT64_MAX) {
+ return INT64_MAX;
+ }
+ }
+ return r;
+}
+
+DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
+DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
+DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
+DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)
+
+#define DO_USQADD_B(n, m) \
+ do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
+#define DO_USQADD_H(n, m) \
+ do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
+#define DO_USQADD_S(n, m) \
+ do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)
+
+static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
+{
+ uint64_t r = n + m;
+
+ if (m < 0) {
+ return n < -m ? 0 : r;
+ }
+ return r < n ? UINT64_MAX : r;
+}
+
+DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
+DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
+DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
+DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)
+
+#undef DO_ZPZZ
+#undef DO_ZPZZ_D
+
+/*
+ * Three operand expander, operating on element pairs.
+ * If the slot I is even, the elements come from VN {I, I+1}.
+ * If the slot I is odd, the elements come from VM {I-1, I}.
+ * Load all of the input elements in each pair before overwriting output.
+ */
+#define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ for (i = 0; i < opr_sz; ) { \
+ uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
+ do { \
+ TYPE n0 = *(TYPE *)(vn + H(i)); \
+ TYPE m0 = *(TYPE *)(vm + H(i)); \
+ TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
+ TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
+ if (pg & 1) { \
+ *(TYPE *)(vd + H(i)) = OP(n0, n1); \
+ } \
+ i += sizeof(TYPE), pg >>= sizeof(TYPE); \
+ if (pg & 1) { \
+ *(TYPE *)(vd + H(i)) = OP(m0, m1); \
+ } \
+ i += sizeof(TYPE), pg >>= sizeof(TYPE); \
+ } while (i & 15); \
+ } \
+}
+
+/* Similarly, specialized for 64-bit operands. */
+#define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
+ TYPE *d = vd, *n = vn, *m = vm; \
+ uint8_t *pg = vg; \
+ for (i = 0; i < opr_sz; i += 2) { \
+ TYPE n0 = n[i], n1 = n[i + 1]; \
+ TYPE m0 = m[i], m1 = m[i + 1]; \
+ if (pg[H1(i)] & 1) { \
+ d[i] = OP(n0, n1); \
+ } \
+ if (pg[H1(i + 1)] & 1) { \
+ d[i + 1] = OP(m0, m1); \
+ } \
+ } \
+}
+
+DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
+DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
+DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
+DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)
+
+DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
+DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
+DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
+DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)
+
+DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
+DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
+DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
+DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)
+
+DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
+DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
+DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
+DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)
+
+DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
+DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
+DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
+DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)
+
+#undef DO_ZPZZ_PAIR
+#undef DO_ZPZZ_PAIR_D
+
+#define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
+ void *status, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ for (i = 0; i < opr_sz; ) { \
+ uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
+ do { \
+ TYPE n0 = *(TYPE *)(vn + H(i)); \
+ TYPE m0 = *(TYPE *)(vm + H(i)); \
+ TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
+ TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
+ if (pg & 1) { \
+ *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \
+ } \
+ i += sizeof(TYPE), pg >>= sizeof(TYPE); \
+ if (pg & 1) { \
+ *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \
+ } \
+ i += sizeof(TYPE), pg >>= sizeof(TYPE); \
+ } while (i & 15); \
+ } \
+}
+
+DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
+DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
+DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, H1_8, float64_add)
+
+DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
+DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
+DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, H1_8, float64_maxnum)
+
+DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
+DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
+DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, H1_8, float64_minnum)
+
+DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
+DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
+DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, H1_8, float64_max)
+
+DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
+DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
+DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, H1_8, float64_min)
+
+#undef DO_ZPZZ_PAIR_FP
+
+/* Three-operand expander, controlled by a predicate, in which the
+ * third operand is "wide". That is, for D = N op M, the same 64-bit
+ * value of M is used with all of the narrower values of N.
+ */
+#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ for (i = 0; i < opr_sz; ) { \
+ uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
+ TYPEW mm = *(TYPEW *)(vm + i); \
+ do { \
+ if (pg & 1) { \
+ TYPE nn = *(TYPE *)(vn + H(i)); \
+ *(TYPE *)(vd + H(i)) = OP(nn, mm); \
+ } \
+ i += sizeof(TYPE), pg >>= sizeof(TYPE); \
+ } while (i & 7); \
+ } \
+}
+
+DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
+DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
+DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
+
+DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
+DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
+DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
+
+DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
+DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
+DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
+
+#undef DO_ZPZW
+
+/* Fully general two-operand expander, controlled by a predicate.
+ */
+#define DO_ZPZ(NAME, TYPE, H, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ for (i = 0; i < opr_sz; ) { \
+ uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
+ do { \
+ if (pg & 1) { \
+ TYPE nn = *(TYPE *)(vn + H(i)); \
+ *(TYPE *)(vd + H(i)) = OP(nn); \
+ } \
+ i += sizeof(TYPE), pg >>= sizeof(TYPE); \
+ } while (i & 15); \
+ } \
+}
+
+/* Similarly, specialized for 64-bit operands. */
+#define DO_ZPZ_D(NAME, TYPE, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
+ TYPE *d = vd, *n = vn; \
+ uint8_t *pg = vg; \
+ for (i = 0; i < opr_sz; i += 1) { \
+ if (pg[H1(i)] & 1) { \
+ TYPE nn = n[i]; \
+ d[i] = OP(nn); \
+ } \
+ } \
+}
+
+#define DO_CLS_B(N) (clrsb32(N) - 24)
+#define DO_CLS_H(N) (clrsb32(N) - 16)
+
+DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
+DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
+DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
+DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
+
+#define DO_CLZ_B(N) (clz32(N) - 24)
+#define DO_CLZ_H(N) (clz32(N) - 16)
+
+DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
+DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
+DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
+DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
+
+DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
+DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
+DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
+DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
+
+#define DO_CNOT(N) (N == 0)
+
+DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
+DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
+DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
+DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
+
+#define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
+
+DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
+DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
+DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
+
+#define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
+
+DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
+DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
+DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
+
+#define DO_NOT(N) (~N)
+
+DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
+DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
+DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
+DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
+
+#define DO_SXTB(N) ((int8_t)N)
+#define DO_SXTH(N) ((int16_t)N)
+#define DO_SXTS(N) ((int32_t)N)
+#define DO_UXTB(N) ((uint8_t)N)
+#define DO_UXTH(N) ((uint16_t)N)
+#define DO_UXTS(N) ((uint32_t)N)
+
+DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
+DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
+DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
+DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
+DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
+DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
+
+DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
+DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
+DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
+DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
+DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
+DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
+
+#define DO_ABS(N) (N < 0 ? -N : N)
+
+DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
+DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
+DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
+DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
+
+#define DO_NEG(N) (-N)
+
+DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
+DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
+DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
+DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
+
+DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
+DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
+DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
+
+DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
+DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
+
+DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
+
+void HELPER(sme_revd_q)(void *vd, void *vn, void *vg, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t *d = vd, *n = vn;
+ uint8_t *pg = vg;
+
+ for (i = 0; i < opr_sz; i += 2) {
+ if (pg[H1(i)] & 1) {
+ uint64_t n0 = n[i + 0];
+ uint64_t n1 = n[i + 1];
+ d[i + 0] = n1;
+ d[i + 1] = n0;
+ }
+ }
+}
+
+DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
+DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
+DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
+DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
+
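+/*
+ * min_ ends up as the most negative value of the element type; both
+ * SQABS and SQNEG of that value saturate to the maximum, -min_ - 1.
+ */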
+#define DO_SQABS(X) \
+ ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
+ x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })
+
+DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
+DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
+DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
+DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)
+
+#define DO_SQNEG(X) \
+ ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
+ x_ == min_ ? -min_ - 1 : -x_; })
+
+DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
+DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
+DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
+DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)
+
+DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
+DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
+
+/* Three-operand expander, unpredicated, in which the third operand is "wide".
+ */
+#define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ for (i = 0; i < opr_sz; ) { \
+ TYPEW mm = *(TYPEW *)(vm + i); \
+ do { \
+ TYPE nn = *(TYPE *)(vn + H(i)); \
+ *(TYPE *)(vd + H(i)) = OP(nn, mm); \
+ i += sizeof(TYPE); \
+ } while (i & 7); \
+ } \
+}
+
+DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
+DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
+DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
+
+DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
+DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
+DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
+
+DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
+DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
+DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
+
+#undef DO_ZZW
+
+#undef DO_CLS_B
+#undef DO_CLS_H
+#undef DO_CLZ_B
+#undef DO_CLZ_H
+#undef DO_CNOT
+#undef DO_FABS
+#undef DO_FNEG
+#undef DO_ABS
+#undef DO_NEG
+#undef DO_ZPZ
+#undef DO_ZPZ_D
+
+/*
+ * Three-operand expander, unpredicated, in which the two inputs are
+ * selected from the top or bottom half of the wide column.
+ */
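+/*
+ * sel1/sel2 are 0 to take the even (bottom) narrow element of each
+ * wide column and sizeof(TYPEN) to take the odd (top) one, selected
+ * by the two low bits of the descriptor data.
+ */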
+#define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
+ int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
+ for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
+ TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
+ TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
+ *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
+ } \
+}
+
+DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
+DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
+DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
+
+DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
+DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
+DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
+
+DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
+DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
+DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
+
+DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
+DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
+DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
+
+DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
+DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
+DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
+
+DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
+DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
+DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
+
+DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
+DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
+DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
+
+DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
+DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
+DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
+
+/* Note that the multiply cannot overflow, but the doubling can. */
+static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
+{
+ int16_t val = n * m;
+ return DO_SQADD_H(val, val);
+}
+
+static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
+{
+ int32_t val = n * m;
+ return DO_SQADD_S(val, val);
+}
+
+static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
+{
+ int64_t val = n * m;
+ return do_sqadd_d(val, val);
+}
+
+DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
+DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
+DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
+
+#undef DO_ZZZ_TB
+
+#define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
+ for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
+ TYPEW nn = *(TYPEW *)(vn + HW(i)); \
+ TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
+ *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
+ } \
+}
+
+DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
+DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
+DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, H1_8, H1_4, DO_ADD)
+
+DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
+DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
+DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, H1_8, H1_4, DO_SUB)
+
+DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
+DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
+DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, H1_8, H1_4, DO_ADD)
+
+DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
+DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
+DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, H1_8, H1_4, DO_SUB)
+
+#undef DO_ZZZ_WTB
+
+#define DO_ZZZ_NTB(NAME, TYPE, H, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
+ intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
+ for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
+ TYPE nn = *(TYPE *)(vn + H(i + sel1)); \
+ TYPE mm = *(TYPE *)(vm + H(i + sel2)); \
+ *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \
+ } \
+}
+
+DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
+DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
+DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
+DO_ZZZ_NTB(sve2_eoril_d, uint64_t, H1_8, DO_EOR)
+
+#undef DO_ZZZ_NTB
+
+#define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \
+ for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
+ TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
+ TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \
+ TYPEW aa = *(TYPEW *)(va + HW(i)); \
+ *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \
+ } \
+}
+
+DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
+DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
+DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, H1_8, H1_4, DO_ABD)
+
+DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
+DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
+DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, H1_8, H1_4, DO_ABD)
+
+DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
+DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
+DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
+
+DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
+DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
+DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
+
+#define DO_NMUL(N, M) -(N * M)
+
+DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
+DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
+DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4, DO_NMUL)
+
+DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
+DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
+DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, H1_8, H1_4, DO_NMUL)
+
+#undef DO_ZZZW_ACC
+
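+/*
+ * Narrowing to the bottom half (XTNB) writes the whole wide element
+ * with its top half zeroed; narrowing to the top half (XTNT) writes
+ * only the odd narrow element and leaves the even one untouched.
+ */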
+#define DO_XTNB(NAME, TYPE, OP) \
+void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
+ TYPE nn = *(TYPE *)(vn + i); \
+ nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \
+ *(TYPE *)(vd + i) = nn; \
+ } \
+}
+
+#define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \
+void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \
+ for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
+ TYPE nn = *(TYPE *)(vn + i); \
+ *(TYPEN *)(vd + i + odd) = OP(nn); \
+ } \
+}
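+
+/*
+ * Illustration (derived from the expanders above): DO_XTNB stores the
+ * narrowed, saturated value zero-extended into the full wide element, so
+ * the odd narrow lanes of vd become zero, while DO_XTNT stores only into
+ * the odd narrow lane at byte offset H(sizeof(TYPEN)), leaving the even
+ * lanes of vd unchanged.
+ */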
+
+#define DO_SQXTN_H(n) do_sat_bhs(n, INT8_MIN, INT8_MAX)
+#define DO_SQXTN_S(n) do_sat_bhs(n, INT16_MIN, INT16_MAX)
+#define DO_SQXTN_D(n) do_sat_bhs(n, INT32_MIN, INT32_MAX)
+
+DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
+DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
+DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)
+
+DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
+DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
+DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)
+
+#define DO_UQXTN_H(n) do_sat_bhs(n, 0, UINT8_MAX)
+#define DO_UQXTN_S(n) do_sat_bhs(n, 0, UINT16_MAX)
+#define DO_UQXTN_D(n) do_sat_bhs(n, 0, UINT32_MAX)
+
+DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
+DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
+DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)
+
+DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
+DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
+DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)
+
+DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
+DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
+DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)
+
+DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
+DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
+DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)
+
+#undef DO_XTNB
+#undef DO_XTNT
+
+void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
+ uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
+ uint32_t *a = va, *n = vn;
+ uint64_t *d = vd, *m = vm;
+
+ for (i = 0; i < opr_sz / 8; ++i) {
+ uint32_t e1 = a[2 * i + H4(0)];
+ uint32_t e2 = n[2 * i + sel] ^ inv;
+ uint64_t c = extract64(m[i], 32, 1);
+ /* Compute and store the entire 33-bit result at once. */
+ d[i] = c + e1 + e2;
+ }
+}
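+
+/*
+ * Worked example for the layout above: with e1 = e2 = 0xffffffff and an
+ * incoming carry of 1, d[i] becomes 0x1ffffffff, i.e. the 32-bit sum in
+ * the low half and the carry-out in bit 32, where the next ADCLB/ADCLT
+ * iteration reads it back.
+ */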
+
+void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
+ uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
+ uint64_t *d = vd, *a = va, *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz / 8; i += 2) {
+ Int128 e1 = int128_make64(a[i]);
+ Int128 e2 = int128_make64(n[i + sel] ^ inv);
+ Int128 c = int128_make64(m[i + 1] & 1);
+ Int128 r = int128_add(int128_add(e1, e2), c);
+ d[i + 0] = int128_getlo(r);
+ d[i + 1] = int128_gethi(r);
+ }
+}
+
+#define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
+ int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
+ for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
+ TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
+ TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
+ TYPEW aa = *(TYPEW *)(va + HW(i)); \
+ *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \
+ } \
+}
+
+DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
+ do_sqdmull_h, DO_SQADD_H)
+DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
+ do_sqdmull_s, DO_SQADD_S)
+DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, H1_8, H1_4,
+ do_sqdmull_d, do_sqadd_d)
+
+DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
+ do_sqdmull_h, DO_SQSUB_H)
+DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
+ do_sqdmull_s, DO_SQSUB_S)
+DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, H1_8, H1_4,
+ do_sqdmull_d, do_sqsub_d)
+
+#undef DO_SQDMLAL
+
+#define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
+ int rot = simd_data(desc); \
+ int sel_a = rot & 1, sel_b = sel_a ^ 1; \
+ bool sub_r = rot == 1 || rot == 2; \
+ bool sub_i = rot >= 2; \
+ TYPE *d = vd, *n = vn, *m = vm, *a = va; \
+ for (i = 0; i < opr_sz; i += 2) { \
+ TYPE elt1_a = n[H(i + sel_a)]; \
+ TYPE elt2_a = m[H(i + sel_a)]; \
+ TYPE elt2_b = m[H(i + sel_b)]; \
+ d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r); \
+ d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i); \
+ } \
+}
+
+#define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))
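+
+/*
+ * The rotation decode used above, written out (derived from the code
+ * rather than quoted from the architecture text):
+ *   rot 0: d_r = a_r + n_r * m_r;  d_i = a_i + n_r * m_i
+ *   rot 1: d_r = a_r - n_i * m_i;  d_i = a_i + n_i * m_r
+ *   rot 2: d_r = a_r - n_r * m_r;  d_i = a_i - n_r * m_i
+ *   rot 3: d_r = a_r + n_i * m_i;  d_i = a_i - n_i * m_r
+ */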
+
+DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
+DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
+DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
+DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, H8, DO_CMLA)
+
+#define DO_SQRDMLAH_B(N, M, A, S) \
+ do_sqrdmlah_b(N, M, A, S, true)
+#define DO_SQRDMLAH_H(N, M, A, S) \
+ ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
+#define DO_SQRDMLAH_S(N, M, A, S) \
+ ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
+#define DO_SQRDMLAH_D(N, M, A, S) \
+ do_sqrdmlah_d(N, M, A, S, true)
+
+DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
+DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
+DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
+DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, H8, DO_SQRDMLAH_D)
+
+#define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
+{ \
+ intptr_t i, j, oprsz = simd_oprsz(desc); \
+ int rot = extract32(desc, SIMD_DATA_SHIFT, 2); \
+ int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2; \
+ int sel_a = rot & 1, sel_b = sel_a ^ 1; \
+ bool sub_r = rot == 1 || rot == 2; \
+ bool sub_i = rot >= 2; \
+ TYPE *d = vd, *n = vn, *m = vm, *a = va; \
+ for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) { \
+ TYPE elt2_a = m[H(i + idx + sel_a)]; \
+ TYPE elt2_b = m[H(i + idx + sel_b)]; \
+ for (j = 0; j < 16 / sizeof(TYPE); j += 2) { \
+ TYPE elt1_a = n[H(i + j + sel_a)]; \
+ d[H(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r); \
+ d[H(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i); \
+ } \
+ } \
+}
+
+DO_CMLA_IDX_FUNC(sve2_cmla_idx_h, int16_t, H2, DO_CMLA)
+DO_CMLA_IDX_FUNC(sve2_cmla_idx_s, int32_t, H4, DO_CMLA)
+
+DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
+DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
+
+#undef DO_CMLA
+#undef DO_CMLA_FUNC
+#undef DO_CMLA_IDX_FUNC
+#undef DO_SQRDMLAH_B
+#undef DO_SQRDMLAH_H
+#undef DO_SQRDMLAH_S
+#undef DO_SQRDMLAH_D
+
+/* Note N and M are 4 elements bundled into one unit. */
+static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
+ int sel_a, int sel_b, int sub_i)
+{
+ for (int i = 0; i <= 1; i++) {
+ int32_t elt1_r = (int8_t)(n >> (16 * i));
+ int32_t elt1_i = (int8_t)(n >> (16 * i + 8));
+ int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a));
+ int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b));
+
+ a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
+ }
+ return a;
+}
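+
+/*
+ * Concretely, for the _s flavour each 32-bit unit of N holds two int8
+ * complex values laid out as { real, imag, real, imag } in ascending byte
+ * order; sel_a/sel_b pick the real or imaginary byte of M and sub_i
+ * supplies the sign, per the rotation decoded by the callers below.
+ */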
+
+static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
+ int sel_a, int sel_b, int sub_i)
+{
+ for (int i = 0; i <= 1; i++) {
+ int64_t elt1_r = (int16_t)(n >> (32 * i + 0));
+ int64_t elt1_i = (int16_t)(n >> (32 * i + 16));
+ int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a));
+ int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b));
+
+ a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
+ }
+ return a;
+}
+
+void HELPER(sve2_cdot_zzzz_s)(void *vd, void *vn, void *vm,
+ void *va, uint32_t desc)
+{
+ int opr_sz = simd_oprsz(desc);
+ int rot = simd_data(desc);
+ int sel_a = rot & 1;
+ int sel_b = sel_a ^ 1;
+ int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
+ uint32_t *d = vd, *n = vn, *m = vm, *a = va;
+
+ for (int e = 0; e < opr_sz / 4; e++) {
+ d[e] = do_cdot_s(n[e], m[e], a[e], sel_a, sel_b, sub_i);
+ }
+}
+
+void HELPER(sve2_cdot_zzzz_d)(void *vd, void *vn, void *vm,
+ void *va, uint32_t desc)
+{
+ int opr_sz = simd_oprsz(desc);
+ int rot = simd_data(desc);
+ int sel_a = rot & 1;
+ int sel_b = sel_a ^ 1;
+ int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
+ uint64_t *d = vd, *n = vn, *m = vm, *a = va;
+
+ for (int e = 0; e < opr_sz / 8; e++) {
+ d[e] = do_cdot_d(n[e], m[e], a[e], sel_a, sel_b, sub_i);
+ }
+}
+
+void HELPER(sve2_cdot_idx_s)(void *vd, void *vn, void *vm,
+ void *va, uint32_t desc)
+{
+ int opr_sz = simd_oprsz(desc);
+ int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
+ int idx = H4(extract32(desc, SIMD_DATA_SHIFT + 2, 2));
+ int sel_a = rot & 1;
+ int sel_b = sel_a ^ 1;
+ int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
+ uint32_t *d = vd, *n = vn, *m = vm, *a = va;
+
+ for (int seg = 0; seg < opr_sz / 4; seg += 4) {
+ uint32_t seg_m = m[seg + idx];
+ for (int e = 0; e < 4; e++) {
+ d[seg + e] = do_cdot_s(n[seg + e], seg_m, a[seg + e],
+ sel_a, sel_b, sub_i);
+ }
+ }
+}
+
+void HELPER(sve2_cdot_idx_d)(void *vd, void *vn, void *vm,
+ void *va, uint32_t desc)
+{
+ int seg, opr_sz = simd_oprsz(desc);
+ int rot = extract32(desc, SIMD_DATA_SHIFT, 2);
+ int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
+ int sel_a = rot & 1;
+ int sel_b = sel_a ^ 1;
+ int sub_i = (rot == 0 || rot == 3 ? -1 : 1);
+ uint64_t *d = vd, *n = vn, *m = vm, *a = va;
+
+ for (seg = 0; seg < opr_sz / 8; seg += 2) {
+ uint64_t seg_m = m[seg + idx];
+ for (int e = 0; e < 2; e++) {
+ d[seg + e] = do_cdot_d(n[seg + e], seg_m, a[seg + e],
+ sel_a, sel_b, sub_i);
+ }
+ }
+}
+
+#define DO_ZZXZ(NAME, TYPE, H, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
+{ \
+ intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \
+ intptr_t i, j, idx = simd_data(desc); \
+ TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx); \
+ for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
+ TYPE mm = m[i]; \
+ for (j = 0; j < segment; j++) { \
+ d[i + j] = OP(n[i + j], mm, a[i + j]); \
+ } \
+ } \
+}
+
+#define DO_SQRDMLAH_H(N, M, A) \
+ ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
+#define DO_SQRDMLAH_S(N, M, A) \
+ ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
+#define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)
+
+DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
+DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
+DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, H8, DO_SQRDMLAH_D)
+
+#define DO_SQRDMLSH_H(N, M, A) \
+ ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
+#define DO_SQRDMLSH_S(N, M, A) \
+ ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
+#define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)
+
+DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
+DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
+DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, H8, DO_SQRDMLSH_D)
+
+#undef DO_ZZXZ
+
+#define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
+{ \
+ intptr_t i, j, oprsz = simd_oprsz(desc); \
+ intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
+ intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
+ for (i = 0; i < oprsz; i += 16) { \
+ TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
+ for (j = 0; j < 16; j += sizeof(TYPEW)) { \
+ TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
+ TYPEW aa = *(TYPEW *)(va + HW(i + j)); \
+ *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa); \
+ } \
+ } \
+}
+
+#define DO_MLA(N, M, A) (A + N * M)
+
+DO_ZZXW(sve2_smlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLA)
+DO_ZZXW(sve2_smlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLA)
+DO_ZZXW(sve2_umlal_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLA)
+DO_ZZXW(sve2_umlal_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLA)
+
+#define DO_MLS(N, M, A) (A - N * M)
+
+DO_ZZXW(sve2_smlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MLS)
+DO_ZZXW(sve2_smlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MLS)
+DO_ZZXW(sve2_umlsl_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MLS)
+DO_ZZXW(sve2_umlsl_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MLS)
+
+#define DO_SQDMLAL_S(N, M, A) DO_SQADD_S(A, do_sqdmull_s(N, M))
+#define DO_SQDMLAL_D(N, M, A) do_sqadd_d(A, do_sqdmull_d(N, M))
+
+DO_ZZXW(sve2_sqdmlal_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLAL_S)
+DO_ZZXW(sve2_sqdmlal_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLAL_D)
+
+#define DO_SQDMLSL_S(N, M, A) DO_SQSUB_S(A, do_sqdmull_s(N, M))
+#define DO_SQDMLSL_D(N, M, A) do_sqsub_d(A, do_sqdmull_d(N, M))
+
+DO_ZZXW(sve2_sqdmlsl_idx_s, int32_t, int16_t, H1_4, H1_2, DO_SQDMLSL_S)
+DO_ZZXW(sve2_sqdmlsl_idx_d, int64_t, int32_t, H1_8, H1_4, DO_SQDMLSL_D)
+
+#undef DO_MLA
+#undef DO_MLS
+#undef DO_ZZXW
+
+#define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+{ \
+ intptr_t i, j, oprsz = simd_oprsz(desc); \
+ intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
+ intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
+ for (i = 0; i < oprsz; i += 16) { \
+ TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
+ for (j = 0; j < 16; j += sizeof(TYPEW)) { \
+ TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
+ *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm); \
+ } \
+ } \
+}
+
+DO_ZZX(sve2_sqdmull_idx_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
+DO_ZZX(sve2_sqdmull_idx_d, int64_t, int32_t, H1_8, H1_4, do_sqdmull_d)
+
+DO_ZZX(sve2_smull_idx_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
+DO_ZZX(sve2_smull_idx_d, int64_t, int32_t, H1_8, H1_4, DO_MUL)
+
+DO_ZZX(sve2_umull_idx_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
+DO_ZZX(sve2_umull_idx_d, uint64_t, uint32_t, H1_8, H1_4, DO_MUL)
+
+#undef DO_ZZX
+
+#define DO_BITPERM(NAME, TYPE, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
+ TYPE nn = *(TYPE *)(vn + i); \
+ TYPE mm = *(TYPE *)(vm + i); \
+ *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \
+ } \
+}
+
+static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
+{
+ uint64_t res = 0;
+ int db, rb = 0;
+
+ for (db = 0; db < n; ++db) {
+ if ((mask >> db) & 1) {
+ res |= ((data >> db) & 1) << rb;
+ ++rb;
+ }
+ }
+ return res;
+}
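+
+/*
+ * BEXT gathers the data bits selected by mask into the low bits of the
+ * result (PEXT-style). Example: bitextract(0xb4, 0xf0, 8) == 0xb, since
+ * the four masked bits data[7:4] = 1011 are packed into bits [3:0].
+ */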
+
+DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
+DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
+DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
+DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
+
+static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
+{
+ uint64_t res = 0;
+ int rb, db = 0;
+
+ for (rb = 0; rb < n; ++rb) {
+ if ((mask >> rb) & 1) {
+ res |= ((data >> db) & 1) << rb;
+ ++db;
+ }
+ }
+ return res;
+}
+
+DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
+DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
+DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
+DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
+
+static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
+{
+ uint64_t resm = 0, resu = 0;
+ int db, rbm = 0, rbu = 0;
+
+ for (db = 0; db < n; ++db) {
+ uint64_t val = (data >> db) & 1;
+ if ((mask >> db) & 1) {
+ resm |= val << rbm++;
+ } else {
+ resu |= val << rbu++;
+ }
+ }
+
+ return resm | (resu << rbm);
+}
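+
+/*
+ * BDEP is the inverse scatter: bitdeposit(0xb, 0xf0, 8) == 0xb0, i.e. the
+ * low four data bits are deposited into the masked positions [7:4].
+ * BGRP packs the masked bits below the unmasked ones:
+ * bitgroup(0xb4, 0xf0, 8) == 0x4b (masked bits 1011 in [3:0], the
+ * remaining bits 0100 above them).
+ */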
+
+DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
+DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
+DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
+DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
+
+#undef DO_BITPERM
+
+#define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ int sub_r = simd_data(desc); \
+ if (sub_r) { \
+ for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
+ TYPE acc_r = *(TYPE *)(vn + H(i)); \
+ TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
+ TYPE el2_r = *(TYPE *)(vm + H(i)); \
+ TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
+ acc_r = ADD_OP(acc_r, el2_i); \
+ acc_i = SUB_OP(acc_i, el2_r); \
+ *(TYPE *)(vd + H(i)) = acc_r; \
+ *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
+ } \
+ } else { \
+ for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
+ TYPE acc_r = *(TYPE *)(vn + H(i)); \
+ TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
+ TYPE el2_r = *(TYPE *)(vm + H(i)); \
+ TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
+ acc_r = SUB_OP(acc_r, el2_i); \
+ acc_i = ADD_OP(acc_i, el2_r); \
+ *(TYPE *)(vd + H(i)) = acc_r; \
+ *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
+ } \
+ } \
+}
+
+DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
+DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
+DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
+DO_CADD(sve2_cadd_d, int64_t, H1_8, DO_ADD, DO_SUB)
+
+DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
+DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
+DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
+DO_CADD(sve2_sqcadd_d, int64_t, H1_8, do_sqadd_d, do_sqsub_d)
+
+#undef DO_CADD
+
+#define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
+void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \
+ int shift = simd_data(desc) >> 1; \
+ for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
+ TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \
+ *(TYPEW *)(vd + HW(i)) = nn << shift; \
+ } \
+}
+
+DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
+DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
+DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, H1_8, H1_4)
+
+DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
+DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
+DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, H1_8, H1_4)
+
+#undef DO_ZZI_SHLL
+
+/* Two-operand reduction expander, controlled by a predicate.
+ * The difference between TYPERED and TYPERET has to do with
+ * sign-extension. E.g. for SMAX, TYPERED must be signed,
+ * but TYPERET must be unsigned so that e.g. a 32-bit value
+ * is not sign-extended to the ABI uint64_t return type.
+ */
+/* ??? If we were to vectorize this by hand the reduction ordering
+ * would change. For integer operands, this is perfectly fine.
+ */
+#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
+uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ TYPERED ret = INIT; \
+ for (i = 0; i < opr_sz; ) { \
+ uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
+ do { \
+ if (pg & 1) { \
+ TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
+ ret = OP(ret, nn); \
+ } \
+ i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
+ } while (i & 15); \
+ } \
+ return (TYPERET)ret; \
+}
+
+#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
+uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
+ TYPEE *n = vn; \
+ uint8_t *pg = vg; \
+ TYPER ret = INIT; \
+ for (i = 0; i < opr_sz; i += 1) { \
+ if (pg[H1(i)] & 1) { \
+ TYPEE nn = n[i]; \
+ ret = OP(ret, nn); \
+ } \
+ } \
+ return ret; \
+}
+
+DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
+DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
+DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
+DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
+
+DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
+DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
+DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
+DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
+
+DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
+DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
+DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
+DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
+
+DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
+DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
+DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
+
+DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
+DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
+DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
+DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
+
+DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
+DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
+DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
+DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
+
+DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
+DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
+DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
+DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
+
+DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
+DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
+DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
+DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
+
+DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
+DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
+DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
+DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
+
+#undef DO_VPZ
+#undef DO_VPZ_D
+
+/* Two vector operand, one scalar operand, unpredicated. */
+#define DO_ZZI(NAME, TYPE, OP) \
+void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
+ TYPE s = s64, *d = vd, *n = vn; \
+ for (i = 0; i < opr_sz; ++i) { \
+ d[i] = OP(n[i], s); \
+ } \
+}
+
+#define DO_SUBR(X, Y) (Y - X)
+
+DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
+DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
+DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
+DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
+
+DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
+DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
+DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
+DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
+
+DO_ZZI(sve_smini_b, int8_t, DO_MIN)
+DO_ZZI(sve_smini_h, int16_t, DO_MIN)
+DO_ZZI(sve_smini_s, int32_t, DO_MIN)
+DO_ZZI(sve_smini_d, int64_t, DO_MIN)
+
+DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
+DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
+DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
+DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
+
+DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
+DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
+DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
+DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
+
+#undef DO_ZZI
+
+#undef DO_AND
+#undef DO_ORR
+#undef DO_EOR
+#undef DO_BIC
+#undef DO_ADD
+#undef DO_SUB
+#undef DO_MAX
+#undef DO_MIN
+#undef DO_ABD
+#undef DO_MUL
+#undef DO_DIV
+#undef DO_ASR
+#undef DO_LSR
+#undef DO_LSL
+#undef DO_SUBR
+
+/* Similar to the ARM LastActiveElement pseudocode function, except the
+ result is multiplied by the element size. This includes the not found
+ indication; e.g. not found for esz=3 is -8. */
+static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
+{
+ uint64_t mask = pred_esz_masks[esz];
+ intptr_t i = words;
+
+ do {
+ uint64_t this_g = g[--i] & mask;
+ if (this_g) {
+ return i * 64 + (63 - clz64(this_g));
+ }
+ } while (i > 0);
+ return (intptr_t)-1 << esz;
+}
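+
+/*
+ * E.g. with esz == 2 (word elements) and only predicate bit 8 set, the
+ * return value is 8, the byte offset of the last active element; with no
+ * bits set the result is -(1 << esz), e.g. -4 for esz == 2.
+ */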
+
+uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
+{
+ intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
+ uint32_t flags = PREDTEST_INIT;
+ uint64_t *d = vd, *g = vg;
+ intptr_t i = 0;
+
+ do {
+ uint64_t this_d = d[i];
+ uint64_t this_g = g[i];
+
+ if (this_g) {
+ if (!(flags & 4)) {
+ /* Set in D the first bit of G. */
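+ /* (this_g & -this_g isolates the lowest set bit, e.g. 0b10100 -> 0b00100.) */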
+ this_d |= this_g & -this_g;
+ d[i] = this_d;
+ }
+ flags = iter_predtest_fwd(this_d, this_g, flags);
+ }
+ } while (++i < words);
+
+ return flags;
+}
+
+uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
+{
+ intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
+ intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
+ uint32_t flags = PREDTEST_INIT;
+ uint64_t *d = vd, *g = vg, esz_mask;
+ intptr_t i, next;
+
+ next = last_active_element(vd, words, esz) + (1 << esz);
+ esz_mask = pred_esz_masks[esz];
+
+ /* Similar to the pseudocode for pnext, but scaled by ESZ
+ so that we find the correct bit. */
+ if (next < words * 64) {
+ uint64_t mask = -1;
+
+ if (next & 63) {
+ mask = ~((1ull << (next & 63)) - 1);
+ next &= -64;
+ }
+ do {
+ uint64_t this_g = g[next / 64] & esz_mask & mask;
+ if (this_g != 0) {
+ next = (next & -64) + ctz64(this_g);
+ break;
+ }
+ next += 64;
+ mask = -1;
+ } while (next < words * 64);
+ }
+
+ i = 0;
+ do {
+ uint64_t this_d = 0;
+ if (i == next / 64) {
+ this_d = 1ull << (next & 63);
+ }
+ d[i] = this_d;
+ flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
+ } while (++i < words);
+
+ return flags;
+}
+
+/*
+ * Copy Zn into Zd, and store zero into inactive elements.
+ * If inv, store zeros into the active elements.
+ */
+void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
+ uint64_t *d = vd, *n = vn;
+ uint8_t *pg = vg;
+
+ for (i = 0; i < opr_sz; i += 1) {
+ d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
+ }
+}
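+
+/*
+ * expand_pred_b() (defined alongside these helpers) turns each predicate
+ * bit into a byte-wide mask, e.g. expand_pred_b(0x05) == 0x0000000000ff00ff,
+ * so the AND above keeps exactly the active byte elements.
+ */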
+
+void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
+ uint64_t *d = vd, *n = vn;
+ uint8_t *pg = vg;
+
+ for (i = 0; i < opr_sz; i += 1) {
+ d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
+ }
+}
+
+void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
+ uint64_t *d = vd, *n = vn;
+ uint8_t *pg = vg;
+
+ for (i = 0; i < opr_sz; i += 1) {
+ d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
+ }
+}
+
+void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t *d = vd, *n = vn;
+ uint8_t *pg = vg;
+ uint8_t inv = simd_data(desc);
+
+ for (i = 0; i < opr_sz; i += 1) {
+ d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
+ }
+}
+
+/* Three-operand expander, immediate operand, controlled by a predicate. */
+#define DO_ZPZI(NAME, TYPE, H, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ TYPE imm = simd_data(desc); \
+ for (i = 0; i < opr_sz; ) { \
+ uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
+ do { \
+ if (pg & 1) { \
+ TYPE nn = *(TYPE *)(vn + H(i)); \
+ *(TYPE *)(vd + H(i)) = OP(nn, imm); \
+ } \
+ i += sizeof(TYPE), pg >>= sizeof(TYPE); \
+ } while (i & 15); \
+ } \
+}
+
+/* Similarly, specialized for 64-bit operands. */
+#define DO_ZPZI_D(NAME, TYPE, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
+ TYPE *d = vd, *n = vn; \
+ TYPE imm = simd_data(desc); \
+ uint8_t *pg = vg; \
+ for (i = 0; i < opr_sz; i += 1) { \
+ if (pg[H1(i)] & 1) { \
+ TYPE nn = n[i]; \
+ d[i] = OP(nn, imm); \
+ } \
+ } \
+}
+
+#define DO_SHR(N, M) (N >> M)
+#define DO_SHL(N, M) (N << M)
+
+/* Arithmetic shift right for division. This rounds negative numbers
+ toward zero as per signed division. Therefore before shifting,
+ when N is negative, add 2**M-1. */
+#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
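+
+/*
+ * Worked example: DO_ASRD(-5, 3) computes (-5 + 7) >> 3 == 0, matching
+ * -5 / 8 truncated toward zero, where a plain arithmetic shift right
+ * would give -1.
+ */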
+
+static inline uint64_t do_urshr(uint64_t x, unsigned sh)
+{
+ if (likely(sh < 64)) {
+ return (x >> sh) + ((x >> (sh - 1)) & 1);
+ } else if (sh == 64) {
+ return x >> 63;
+ } else {
+ return 0;
+ }
+}
+
+static inline int64_t do_srshr(int64_t x, unsigned sh)
+{
+ if (likely(sh < 64)) {
+ return (x >> sh) + ((x >> (sh - 1)) & 1);
+ } else {
+ /* Rounding the sign bit always produces 0. */
+ return 0;
+ }
+}
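+
+/*
+ * Rounding example: do_urshr(11, 2) == (11 >> 2) + ((11 >> 1) & 1) == 3,
+ * i.e. 11/4 = 2.75 rounded to nearest; do_srshr applies the same rounding
+ * to signed values.
+ */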
+
+DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
+DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
+DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
+DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
+
+DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
+DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
+DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
+DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
+
+DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
+DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
+DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
+DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
+
+DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
+DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
+DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
+DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
+
+/* SVE2 bitwise shift by immediate */
+DO_ZPZI(sve2_sqshl_zpzi_b, int8_t, H1, do_sqshl_b)
+DO_ZPZI(sve2_sqshl_zpzi_h, int16_t, H1_2, do_sqshl_h)
+DO_ZPZI(sve2_sqshl_zpzi_s, int32_t, H1_4, do_sqshl_s)
+DO_ZPZI_D(sve2_sqshl_zpzi_d, int64_t, do_sqshl_d)
+
+DO_ZPZI(sve2_uqshl_zpzi_b, uint8_t, H1, do_uqshl_b)
+DO_ZPZI(sve2_uqshl_zpzi_h, uint16_t, H1_2, do_uqshl_h)
+DO_ZPZI(sve2_uqshl_zpzi_s, uint32_t, H1_4, do_uqshl_s)
+DO_ZPZI_D(sve2_uqshl_zpzi_d, uint64_t, do_uqshl_d)
+
+DO_ZPZI(sve2_srshr_b, int8_t, H1, do_srshr)
+DO_ZPZI(sve2_srshr_h, int16_t, H1_2, do_srshr)
+DO_ZPZI(sve2_srshr_s, int32_t, H1_4, do_srshr)
+DO_ZPZI_D(sve2_srshr_d, int64_t, do_srshr)
+
+DO_ZPZI(sve2_urshr_b, uint8_t, H1, do_urshr)
+DO_ZPZI(sve2_urshr_h, uint16_t, H1_2, do_urshr)
+DO_ZPZI(sve2_urshr_s, uint32_t, H1_4, do_urshr)
+DO_ZPZI_D(sve2_urshr_d, uint64_t, do_urshr)
+
+#define do_suqrshl_b(n, m) \
+ ({ uint32_t discard; do_suqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
+#define do_suqrshl_h(n, m) \
+ ({ uint32_t discard; do_suqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
+#define do_suqrshl_s(n, m) \
+ ({ uint32_t discard; do_suqrshl_bhs(n, m, 32, false, &discard); })
+#define do_suqrshl_d(n, m) \
+ ({ uint32_t discard; do_suqrshl_d(n, m, false, &discard); })
+
+DO_ZPZI(sve2_sqshlu_b, int8_t, H1, do_suqrshl_b)
+DO_ZPZI(sve2_sqshlu_h, int16_t, H1_2, do_suqrshl_h)
+DO_ZPZI(sve2_sqshlu_s, int32_t, H1_4, do_suqrshl_s)
+DO_ZPZI_D(sve2_sqshlu_d, int64_t, do_suqrshl_d)
+
+#undef DO_ASRD
+#undef DO_ZPZI
+#undef DO_ZPZI_D
+
+#define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \
+void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ int shift = simd_data(desc); \
+ for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
+ TYPEW nn = *(TYPEW *)(vn + i); \
+ *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \
+ } \
+}
+
+#define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \
+void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ int shift = simd_data(desc); \
+ for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
+ TYPEW nn = *(TYPEW *)(vn + HW(i)); \
+ *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \
+ } \
+}
+
+DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
+DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
+DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)
+
+DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
+DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
+DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_SHR)
+
+DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
+DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
+DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)
+
+DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
+DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
+DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, H1_8, H1_4, do_urshr)
+
+#define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
+#define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
+#define DO_SQSHRUN_D(x, sh) \
+ do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)
+
+DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
+DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
+DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)
+
+DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
+DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
+DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRUN_D)
+
+#define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
+#define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
+#define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)
+
+DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
+DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
+DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)
+
+DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
+DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
+DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRUN_D)
+
+#define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
+#define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
+#define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)
+
+DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
+DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
+DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)
+
+DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
+DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
+DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQSHRN_D)
+
+#define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
+#define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
+#define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)
+
+DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
+DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
+DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)
+
+DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
+DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
+DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, H1_8, H1_4, DO_SQRSHRN_D)
+
+#define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
+#define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
+#define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
+
+DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
+DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
+DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)
+
+DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
+DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
+DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQSHRN_D)
+
+#define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
+#define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
+#define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)
+
+DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
+DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
+DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)
+
+DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
+DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
+DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, H1_8, H1_4, DO_UQRSHRN_D)
+
+#undef DO_SHRNB
+#undef DO_SHRNT
+
+#define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
+ TYPEW nn = *(TYPEW *)(vn + i); \
+ TYPEW mm = *(TYPEW *)(vm + i); \
+ *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT); \
+ } \
+}
+
+#define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
+ TYPEW nn = *(TYPEW *)(vn + HW(i)); \
+ TYPEW mm = *(TYPEW *)(vm + HW(i)); \
+ *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \
+ } \
+}
+
+#define DO_ADDHN(N, M, SH) ((N + M) >> SH)
+#define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
+#define DO_SUBHN(N, M, SH) ((N - M) >> SH)
+#define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)
+
+DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
+DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
+DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)
+
+DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
+DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
+DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_ADDHN)
+
+DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
+DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
+DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)
+
+DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
+DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
+DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RADDHN)
+
+DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
+DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
+DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)
+
+DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
+DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
+DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_SUBHN)
+
+DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
+DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
+DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)
+
+DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
+DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
+DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32, H1_8, H1_4, DO_RSUBHN)
+
+#undef DO_RSUBHN
+#undef DO_SUBHN
+#undef DO_RADDHN
+#undef DO_ADDHN
+
+#undef DO_BINOPNB
+
+/* Fully general four-operand expander, controlled by a predicate. */
+#define DO_ZPZZZ(NAME, TYPE, H, OP) \
+void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
+ void *vg, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ for (i = 0; i < opr_sz; ) { \
+ uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
+ do { \
+ if (pg & 1) { \
+ TYPE nn = *(TYPE *)(vn + H(i)); \
+ TYPE mm = *(TYPE *)(vm + H(i)); \
+ TYPE aa = *(TYPE *)(va + H(i)); \
+ *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
+ } \
+ i += sizeof(TYPE), pg >>= sizeof(TYPE); \
+ } while (i & 15); \
+ } \
+}
+
+/* Similarly, specialized for 64-bit operands. */
+#define DO_ZPZZZ_D(NAME, TYPE, OP) \
+void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
+ void *vg, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
+ TYPE *d = vd, *a = va, *n = vn, *m = vm; \
+ uint8_t *pg = vg; \
+ for (i = 0; i < opr_sz; i += 1) { \
+ if (pg[H1(i)] & 1) { \
+ TYPE aa = a[i], nn = n[i], mm = m[i]; \
+ d[i] = OP(aa, nn, mm); \
+ } \
+ } \
+}
+
+#define DO_MLA(A, N, M) (A + N * M)
+#define DO_MLS(A, N, M) (A - N * M)
+
+DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
+DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
+
+DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
+DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
+
+DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
+DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
+
+DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
+DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
+
+#undef DO_MLA
+#undef DO_MLS
+#undef DO_ZPZZZ
+#undef DO_ZPZZZ_D
+
+void HELPER(sve_index_b)(void *vd, uint32_t start,
+ uint32_t incr, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ uint8_t *d = vd;
+ for (i = 0; i < opr_sz; i += 1) {
+ d[H1(i)] = start + i * incr;
+ }
+}
+
+void HELPER(sve_index_h)(void *vd, uint32_t start,
+ uint32_t incr, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 2;
+ uint16_t *d = vd;
+ for (i = 0; i < opr_sz; i += 1) {
+ d[H2(i)] = start + i * incr;
+ }
+}
+
+void HELPER(sve_index_s)(void *vd, uint32_t start,
+ uint32_t incr, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 4;
+ uint32_t *d = vd;
+ for (i = 0; i < opr_sz; i += 1) {
+ d[H4(i)] = start + i * incr;
+ }
+}
+
+void HELPER(sve_index_d)(void *vd, uint64_t start,
+ uint64_t incr, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t *d = vd;
+ for (i = 0; i < opr_sz; i += 1) {
+ d[i] = start + i * incr;
+ }
+}
+
+void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 4;
+ uint32_t sh = simd_data(desc);
+ uint32_t *d = vd, *n = vn, *m = vm;
+ for (i = 0; i < opr_sz; i += 1) {
+ d[i] = n[i] + (m[i] << sh);
+ }
+}
+
+void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t sh = simd_data(desc);
+ uint64_t *d = vd, *n = vn, *m = vm;
+ for (i = 0; i < opr_sz; i += 1) {
+ d[i] = n[i] + (m[i] << sh);
+ }
+}
+
+void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t sh = simd_data(desc);
+ uint64_t *d = vd, *n = vn, *m = vm;
+ for (i = 0; i < opr_sz; i += 1) {
+ d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
+ }
+}
+
+void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t sh = simd_data(desc);
+ uint64_t *d = vd, *n = vn, *m = vm;
+ for (i = 0; i < opr_sz; i += 1) {
+ d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
+ }
+}
+
+void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
+{
+ /* These constants are cut-and-paste directly from the ARM pseudocode. */
+ static const uint16_t coeff[] = {
+ 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
+ 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
+ 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
+ 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
+ };
+ intptr_t i, opr_sz = simd_oprsz(desc) / 2;
+ uint16_t *d = vd, *n = vn;
+
+ for (i = 0; i < opr_sz; i++) {
+ uint16_t nn = n[i];
+ intptr_t idx = extract32(nn, 0, 5);
+ uint16_t exp = extract32(nn, 5, 5);
+ d[i] = coeff[idx] | (exp << 10);
+ }
+}
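+
+/*
+ * Each table entry above holds the fraction bits of 2^(idx/32); e.g.
+ * coeff[16] == 0x1a8 == 424 ~= (sqrt(2) - 1) * 2^10. Combined with the
+ * exponent field taken from the input, the result bit pattern encodes
+ * 2^(exp - bias) * 2^(idx/32) in one step.
+ */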
+
+void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
+{
+ /* These constants are cut-and-paste directly from the ARM pseudocode. */
+ static const uint32_t coeff[] = {
+ 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
+ 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
+ 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
+ 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
+ 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
+ 0x1ef532, 0x20b051, 0x227043, 0x243516,
+ 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
+ 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
+ 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
+ 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
+ 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
+ 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
+ 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
+ 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
+ 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
+ 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
+ };
+ intptr_t i, opr_sz = simd_oprsz(desc) / 4;
+ uint32_t *d = vd, *n = vn;
+
+ for (i = 0; i < opr_sz; i++) {
+ uint32_t nn = n[i];
+ intptr_t idx = extract32(nn, 0, 6);
+ uint32_t exp = extract32(nn, 6, 8);
+ d[i] = coeff[idx] | (exp << 23);
+ }
+}
+
+void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
+{
+ /* These constants are cut-and-paste directly from the ARM pseudocode. */
+ static const uint64_t coeff[] = {
+ 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
+ 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
+ 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
+ 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
+ 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
+ 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
+ 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
+ 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
+ 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
+ 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
+ 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
+ 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
+ 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
+ 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
+ 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
+ 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
+ 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
+ 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
+ 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
+ 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
+ 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
+ 0xFA7C1819E90D8ull,
+ };
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t *d = vd, *n = vn;
+
+ for (i = 0; i < opr_sz; i++) {
+ uint64_t nn = n[i];
+ intptr_t idx = extract32(nn, 0, 6);
+ uint64_t exp = extract32(nn, 6, 11);
+ d[i] = coeff[idx] | (exp << 52);
+ }
+}
+
+void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 2;
+ uint16_t *d = vd, *n = vn, *m = vm;
+ for (i = 0; i < opr_sz; i += 1) {
+ uint16_t nn = n[i];
+ uint16_t mm = m[i];
+ if (mm & 1) {
+ nn = float16_one;
+ }
+ d[i] = nn ^ (mm & 2) << 14;
+ }
+}
+
+void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 4;
+ uint32_t *d = vd, *n = vn, *m = vm;
+ for (i = 0; i < opr_sz; i += 1) {
+ uint32_t nn = n[i];
+ uint32_t mm = m[i];
+ if (mm & 1) {
+ nn = float32_one;
+ }
+ d[i] = nn ^ (mm & 2) << 30;
+ }
+}
+
+void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t *d = vd, *n = vn, *m = vm;
+ for (i = 0; i < opr_sz; i += 1) {
+ uint64_t nn = n[i];
+ uint64_t mm = m[i];
+ if (mm & 1) {
+ nn = float64_one;
+ }
+ d[i] = nn ^ (mm & 2) << 62;
+ }
+}
+
+/*
+ * Signed saturating addition with scalar operand.
+ */
+
+void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
+{
+ intptr_t i, oprsz = simd_oprsz(desc);
+
+ for (i = 0; i < oprsz; i += sizeof(int8_t)) {
+ *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
+ }
+}
+
+void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
+{
+ intptr_t i, oprsz = simd_oprsz(desc);
+
+ for (i = 0; i < oprsz; i += sizeof(int16_t)) {
+ *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
+ }
+}
+
+void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
+{
+ intptr_t i, oprsz = simd_oprsz(desc);
+
+ for (i = 0; i < oprsz; i += sizeof(int32_t)) {
+ *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
+ }
+}
+
+void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
+{
+ intptr_t i, oprsz = simd_oprsz(desc);
+
+ for (i = 0; i < oprsz; i += sizeof(int64_t)) {
+ *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
+ }
+}
+
+/*
+ * Unsigned saturating addition with scalar operand.
+ */
+
+void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
+{
+ intptr_t i, oprsz = simd_oprsz(desc);
+
+ for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
+ *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
+ }
+}
+
+void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
+{
+ intptr_t i, oprsz = simd_oprsz(desc);
+
+ for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
+ *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
+ }
+}
+
+void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
+{
+ intptr_t i, oprsz = simd_oprsz(desc);
+
+ for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
+ *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
+ }
+}
+
+void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+ intptr_t i, oprsz = simd_oprsz(desc);
+
+ for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
+ *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
+ }
+}
+
+void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+ intptr_t i, oprsz = simd_oprsz(desc);
+
+ for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
+ *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
+ }
+}
+
+/* Two operand predicated copy immediate with merge. All valid immediates
+ * can fit within 17 signed bits in the simd_data field.
+ */
+void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
+ uint64_t mm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t *d = vd, *n = vn;
+ uint8_t *pg = vg;
+
+ mm = dup_const(MO_8, mm);
+ for (i = 0; i < opr_sz; i += 1) {
+ uint64_t nn = n[i];
+ uint64_t pp = expand_pred_b(pg[H1(i)]);
+ d[i] = (mm & pp) | (nn & ~pp);
+ }
+}
+
+void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
+ uint64_t mm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t *d = vd, *n = vn;
+ uint8_t *pg = vg;
+
+ mm = dup_const(MO_16, mm);
+ for (i = 0; i < opr_sz; i += 1) {
+ uint64_t nn = n[i];
+ uint64_t pp = expand_pred_h(pg[H1(i)]);
+ d[i] = (mm & pp) | (nn & ~pp);
+ }
+}
+
+void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
+ uint64_t mm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t *d = vd, *n = vn;
+ uint8_t *pg = vg;
+
+ mm = dup_const(MO_32, mm);
+ for (i = 0; i < opr_sz; i += 1) {
+ uint64_t nn = n[i];
+ uint64_t pp = expand_pred_s(pg[H1(i)]);
+ d[i] = (mm & pp) | (nn & ~pp);
+ }
+}
+
+void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
+ uint64_t mm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t *d = vd, *n = vn;
+ uint8_t *pg = vg;
+
+ for (i = 0; i < opr_sz; i += 1) {
+ uint64_t nn = n[i];
+ d[i] = (pg[H1(i)] & 1 ? mm : nn);
+ }
+}
+
+void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t *d = vd;
+ uint8_t *pg = vg;
+
+ val = dup_const(MO_8, val);
+ for (i = 0; i < opr_sz; i += 1) {
+ d[i] = val & expand_pred_b(pg[H1(i)]);
+ }
+}
+
+void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t *d = vd;
+ uint8_t *pg = vg;
+
+ val = dup_const(MO_16, val);
+ for (i = 0; i < opr_sz; i += 1) {
+ d[i] = val & expand_pred_h(pg[H1(i)]);
+ }
+}
+
+void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t *d = vd;
+ uint8_t *pg = vg;
+
+ val = dup_const(MO_32, val);
+ for (i = 0; i < opr_sz; i += 1) {
+ d[i] = val & expand_pred_s(pg[H1(i)]);
+ }
+}
+
+void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t *d = vd;
+ uint8_t *pg = vg;
+
+ for (i = 0; i < opr_sz; i += 1) {
+ d[i] = (pg[H1(i)] & 1 ? val : 0);
+ }
+}
+
+/* Big-endian hosts need to frob the byte indices. If the copy
+ * happens to be 8-byte aligned, then no frobbing necessary.
+ */
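+/*
+ * (On a big-endian host H1_4(x) is x ^ 4, so even a 4-byte-aligned copy
+ * must adjust each 32-bit load/store address to land in the same logical
+ * lane as on a little-endian host.)
+ */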
+static void swap_memmove(void *vd, void *vs, size_t n)
+{
+ uintptr_t d = (uintptr_t)vd;
+ uintptr_t s = (uintptr_t)vs;
+ uintptr_t o = (d | s | n) & 7;
+ size_t i;
+
+#if !HOST_BIG_ENDIAN
+ o = 0;
+#endif
+ switch (o) {
+ case 0:
+ memmove(vd, vs, n);
+ break;
+
+ case 4:
+ if (d < s || d >= s + n) {
+ for (i = 0; i < n; i += 4) {
+ *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
+ }
+ } else {
+ for (i = n; i > 0; ) {
+ i -= 4;
+ *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
+ }
+ }
+ break;
+
+ case 2:
+ case 6:
+ if (d < s || d >= s + n) {
+ for (i = 0; i < n; i += 2) {
+ *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
+ }
+ } else {
+ for (i = n; i > 0; ) {
+ i -= 2;
+ *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
+ }
+ }
+ break;
+
+ default:
+ if (d < s || d >= s + n) {
+ for (i = 0; i < n; i++) {
+ *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
+ }
+ } else {
+ for (i = n; i > 0; ) {
+ i -= 1;
+ *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
+ }
+ }
+ break;
+ }
+}
+
+/* Similarly for memset of 0. */
+static void swap_memzero(void *vd, size_t n)
+{
+ uintptr_t d = (uintptr_t)vd;
+ uintptr_t o = (d | n) & 7;
+ size_t i;
+
+ /* Usually, the first bit of a predicate is set, so N is 0. */
+ if (likely(n == 0)) {
+ return;
+ }
+
+#if !HOST_BIG_ENDIAN
+ o = 0;
+#endif
+ switch (o) {
+ case 0:
+ memset(vd, 0, n);
+ break;
+
+ case 4:
+ for (i = 0; i < n; i += 4) {
+ *(uint32_t *)H1_4(d + i) = 0;
+ }
+ break;
+
+ case 2:
+ case 6:
+ for (i = 0; i < n; i += 2) {
+ *(uint16_t *)H1_2(d + i) = 0;
+ }
+ break;
+
+ default:
+ for (i = 0; i < n; i++) {
+ *(uint8_t *)H1(d + i) = 0;
+ }
+ break;
+ }
+}
+
+void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t opr_sz = simd_oprsz(desc);
+ size_t n_ofs = simd_data(desc);
+ size_t n_siz = opr_sz - n_ofs;
+
+ if (vd != vm) {
+ swap_memmove(vd, vn + n_ofs, n_siz);
+ swap_memmove(vd + n_siz, vm, n_ofs);
+ } else if (vd != vn) {
+ swap_memmove(vd + n_siz, vd, n_ofs);
+ swap_memmove(vd, vn + n_ofs, n_siz);
+ } else {
+ /* vd == vn == vm. Need temp space. */
+ ARMVectorReg tmp;
+ swap_memmove(&tmp, vm, n_ofs);
+ swap_memmove(vd, vd + n_ofs, n_siz);
+ memcpy(vd + n_siz, &tmp, n_ofs);
+ }
+}
+
+#define DO_INSR(NAME, TYPE, H) \
+void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
+{ \
+ intptr_t opr_sz = simd_oprsz(desc); \
+ swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
+ *(TYPE *)(vd + H(0)) = val; \
+}
+
+DO_INSR(sve_insr_b, uint8_t, H1)
+DO_INSR(sve_insr_h, uint16_t, H1_2)
+DO_INSR(sve_insr_s, uint32_t, H1_4)
+DO_INSR(sve_insr_d, uint64_t, H1_8)
+
+#undef DO_INSR
+
+void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
+{
+ intptr_t i, j, opr_sz = simd_oprsz(desc);
+ for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
+ uint64_t f = *(uint64_t *)(vn + i);
+ uint64_t b = *(uint64_t *)(vn + j);
+ *(uint64_t *)(vd + i) = bswap64(b);
+ *(uint64_t *)(vd + j) = bswap64(f);
+ }
+}
+
+void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
+{
+ intptr_t i, j, opr_sz = simd_oprsz(desc);
+ for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
+ uint64_t f = *(uint64_t *)(vn + i);
+ uint64_t b = *(uint64_t *)(vn + j);
+ *(uint64_t *)(vd + i) = hswap64(b);
+ *(uint64_t *)(vd + j) = hswap64(f);
+ }
+}
+
+void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
+{
+ intptr_t i, j, opr_sz = simd_oprsz(desc);
+ for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
+ uint64_t f = *(uint64_t *)(vn + i);
+ uint64_t b = *(uint64_t *)(vn + j);
+ *(uint64_t *)(vd + i) = rol64(b, 32);
+ *(uint64_t *)(vd + j) = rol64(f, 32);
+ }
+}
+
+void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
+{
+ intptr_t i, j, opr_sz = simd_oprsz(desc);
+ for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
+ uint64_t f = *(uint64_t *)(vn + i);
+ uint64_t b = *(uint64_t *)(vn + j);
+ *(uint64_t *)(vd + i) = b;
+ *(uint64_t *)(vd + j) = f;
+ }
+}
+
+typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);
+
+static inline void do_tbl1(void *vd, void *vn, void *vm, uint32_t desc,
+ bool is_tbx, tb_impl_fn *fn)
+{
+ ARMVectorReg scratch;
+ uintptr_t oprsz = simd_oprsz(desc);
+
+ if (unlikely(vd == vn)) {
+ vn = memcpy(&scratch, vn, oprsz);
+ }
+
+ fn(vd, vn, NULL, vm, oprsz, is_tbx);
+}
+
+static inline void do_tbl2(void *vd, void *vn0, void *vn1, void *vm,
+ uint32_t desc, bool is_tbx, tb_impl_fn *fn)
+{
+ ARMVectorReg scratch;
+ uintptr_t oprsz = simd_oprsz(desc);
+
+ if (unlikely(vd == vn0)) {
+ vn0 = memcpy(&scratch, vn0, oprsz);
+ if (vd == vn1) {
+ vn1 = vn0;
+ }
+ } else if (unlikely(vd == vn1)) {
+ vn1 = memcpy(&scratch, vn1, oprsz);
+ }
+
+ fn(vd, vn0, vn1, vm, oprsz, is_tbx);
+}
+
+#define DO_TB(SUFF, TYPE, H) \
+static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1, \
+ void *vm, uintptr_t oprsz, bool is_tbx) \
+{ \
+ TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm; \
+ uintptr_t i, nelem = oprsz / sizeof(TYPE); \
+ for (i = 0; i < nelem; ++i) { \
+ TYPE index = indexes[H(i)], val = 0; \
+ if (index < nelem) { \
+ val = tbl0[H(index)]; \
+ } else { \
+ index -= nelem; \
+ if (tbl1 && index < nelem) { \
+ val = tbl1[H(index)]; \
+ } else if (is_tbx) { \
+ continue; \
+ } \
+ } \
+ d[H(i)] = val; \
+ } \
+} \
+void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
+{ \
+ do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF); \
+} \
+void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1, \
+ void *vm, uint32_t desc) \
+{ \
+ do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF); \
+} \
+void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
+{ \
+ do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF); \
+}
+
+DO_TB(b, uint8_t, H1)
+DO_TB(h, uint16_t, H2)
+DO_TB(s, uint32_t, H4)
+DO_TB(d, uint64_t, H8)
+
+#undef DO_TB
+
+#define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
+void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ TYPED *d = vd; \
+ TYPES *n = vn; \
+ ARMVectorReg tmp; \
+ if (unlikely(vn - vd < opr_sz)) { \
+ n = memcpy(&tmp, n, opr_sz / 2); \
+ } \
+ for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
+ d[HD(i)] = n[HS(i)]; \
+ } \
+}
+
+DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
+DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
+DO_UNPK(sve_sunpk_d, int64_t, int32_t, H8, H4)
+
+DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
+DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
+DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, H8, H4)
+
+#undef DO_UNPK
+
+/* Mask of bits included in the even numbered predicates of width esz.
+ * We also use this for expand_bits/compress_bits, and so extend the
+ * same pattern out to 16-bit units.
+ */
+static const uint64_t even_bit_esz_masks[5] = {
+ 0x5555555555555555ull,
+ 0x3333333333333333ull,
+ 0x0f0f0f0f0f0f0f0full,
+ 0x00ff00ff00ff00ffull,
+ 0x0000ffff0000ffffull,
+};
+
+/* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
+ * For N==0, this corresponds to the operation that in qemu/bitops.h
+ * we call half_shuffle64; this algorithm is from Hacker's Delight,
+ * section 7-2 Shuffling Bits.
+ */
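+/* For example, expand_bits(0x00ff, 0) == 0x5555. */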
+static uint64_t expand_bits(uint64_t x, int n)
+{
+ int i;
+
+ x &= 0xffffffffu;
+ for (i = 4; i >= n; i--) {
+ int sh = 1 << i;
+ x = ((x << sh) | x) & even_bit_esz_masks[i];
+ }
+ return x;
+}
+
+/* Compress units of 2**(N+1) bits to units of 2**N bits.
+ * For N==0, this corresponds to the operation that in qemu/bitops.h
+ * we call half_unshuffle64; this algorithm is from Hacker's Delight,
+ * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
+ */
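+/* For example, compress_bits(0x5555, 0) == 0x00ff, inverting the
+ * expansion above.
+ */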
+static uint64_t compress_bits(uint64_t x, int n)
+{
+ int i;
+
+ for (i = n; i <= 4; i++) {
+ int sh = 1 << i;
+ x &= even_bit_esz_masks[i];
+ x = (x >> sh) | x;
+ }
+ return x & 0xffffffffu;
+}
+
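+/*
+ * Predicate ZIP: interleave the predicate bits of the selected halves
+ * of VN and VM.  Each element occupies 1 << esz predicate bits, so
+ * expand_bits spreads the two inputs apart before they are merged.
+ */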
+void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
+{
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
+ intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
+ int esize = 1 << esz;
+ uint64_t *d = vd;
+ intptr_t i;
+
+ if (oprsz <= 8) {
+ uint64_t nn = *(uint64_t *)vn;
+ uint64_t mm = *(uint64_t *)vm;
+ int half = 4 * oprsz;
+
+ nn = extract64(nn, high * half, half);
+ mm = extract64(mm, high * half, half);
+ nn = expand_bits(nn, esz);
+ mm = expand_bits(mm, esz);
+ d[0] = nn | (mm << esize);
+ } else {
+ ARMPredicateReg tmp;
+
+ /* We produce output faster than we consume input.
+ Therefore we must be mindful of possible overlap. */
+ if (vd == vn) {
+ vn = memcpy(&tmp, vn, oprsz);
+ if (vd == vm) {
+ vm = vn;
+ }
+ } else if (vd == vm) {
+ vm = memcpy(&tmp, vm, oprsz);
+ }
+ if (high) {
+ high = oprsz >> 1;
+ }
+
+ if ((oprsz & 7) == 0) {
+ uint32_t *n = vn, *m = vm;
+ high >>= 2;
+
+ for (i = 0; i < oprsz / 8; i++) {
+ uint64_t nn = n[H4(high + i)];
+ uint64_t mm = m[H4(high + i)];
+
+ nn = expand_bits(nn, esz);
+ mm = expand_bits(mm, esz);
+ d[i] = nn | (mm << esize);
+ }
+ } else {
+ uint8_t *n = vn, *m = vm;
+ uint16_t *d16 = vd;
+
+ for (i = 0; i < oprsz / 2; i++) {
+ uint16_t nn = n[H1(high + i)];
+ uint16_t mm = m[H1(high + i)];
+
+ nn = expand_bits(nn, esz);
+ mm = expand_bits(mm, esz);
+ d16[H2(i)] = nn | (mm << esize);
+ }
+ }
+ }
+}
+
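+/*
+ * Predicate UZP: concatenate the even (or odd) numbered elements of
+ * VN then VM; the inverse of ZIP above, built from compress_bits.
+ */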
+void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
+{
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
+ int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
+ uint64_t *d = vd, *n = vn, *m = vm;
+ uint64_t l, h;
+ intptr_t i;
+
+ if (oprsz <= 8) {
+ l = compress_bits(n[0] >> odd, esz);
+ h = compress_bits(m[0] >> odd, esz);
+ d[0] = l | (h << (4 * oprsz));
+ } else {
+ ARMPredicateReg tmp_m;
+ intptr_t oprsz_16 = oprsz / 16;
+
+ if ((vm - vd) < (uintptr_t)oprsz) {
+ m = memcpy(&tmp_m, vm, oprsz);
+ }
+
+ for (i = 0; i < oprsz_16; i++) {
+ l = n[2 * i + 0];
+ h = n[2 * i + 1];
+ l = compress_bits(l >> odd, esz);
+ h = compress_bits(h >> odd, esz);
+ d[i] = l | (h << 32);
+ }
+
+ /*
+ * For VL which is not a multiple of 512, the results from M do not
+ * align nicely with the uint64_t for D. Put the aligned results
+ * from M into TMP_M and then copy it into place afterward.
+ */
+ if (oprsz & 15) {
+ int final_shift = (oprsz & 15) * 2;
+
+ l = n[2 * i + 0];
+ h = n[2 * i + 1];
+ l = compress_bits(l >> odd, esz);
+ h = compress_bits(h >> odd, esz);
+ d[i] = l | (h << final_shift);
+
+ for (i = 0; i < oprsz_16; i++) {
+ l = m[2 * i + 0];
+ h = m[2 * i + 1];
+ l = compress_bits(l >> odd, esz);
+ h = compress_bits(h >> odd, esz);
+ tmp_m.p[i] = l | (h << 32);
+ }
+ l = m[2 * i + 0];
+ h = m[2 * i + 1];
+ l = compress_bits(l >> odd, esz);
+ h = compress_bits(h >> odd, esz);
+ tmp_m.p[i] = l | (h << final_shift);
+
+ swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
+ } else {
+ for (i = 0; i < oprsz_16; i++) {
+ l = m[2 * i + 0];
+ h = m[2 * i + 1];
+ l = compress_bits(l >> odd, esz);
+ h = compress_bits(h >> odd, esz);
+ d[oprsz_16 + i] = l | (h << 32);
+ }
+ }
+ }
+}
+
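+/*
+ * Predicate TRN: the even-numbered result elements come from the even
+ * (or odd) elements of VN, the odd-numbered results from the matching
+ * elements of VM.
+ */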
+void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
+{
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
+ int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
+ uint64_t *d = vd, *n = vn, *m = vm;
+ uint64_t mask;
+ int shr, shl;
+ intptr_t i;
+
+ shl = 1 << esz;
+ shr = 0;
+ mask = even_bit_esz_masks[esz];
+ if (odd) {
+ mask <<= shl;
+ shr = shl;
+ shl = 0;
+ }
+
+ for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
+ uint64_t nn = (n[i] & mask) >> shr;
+ uint64_t mm = (m[i] & mask) << shl;
+ d[i] = nn + mm;
+ }
+}
+
+/* Reverse units of 2**N bits. */
+static uint64_t reverse_bits_64(uint64_t x, int n)
+{
+ int i, sh;
+
+ x = bswap64(x);
+ for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
+ uint64_t mask = even_bit_esz_masks[i];
+ x = ((x & mask) << sh) | ((x >> sh) & mask);
+ }
+ return x;
+}
+
+static uint8_t reverse_bits_8(uint8_t x, int n)
+{
+ static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
+ int i, sh;
+
+ for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
+ x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
+ }
+ return x;
+}
+
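+/* Predicate REV: reverse the order of the predicate elements. */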
+void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
+{
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
+ intptr_t i, oprsz_2 = oprsz / 2;
+
+ if (oprsz <= 8) {
+ uint64_t l = *(uint64_t *)vn;
+ l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
+ *(uint64_t *)vd = l;
+ } else if ((oprsz & 15) == 0) {
+ for (i = 0; i < oprsz_2; i += 8) {
+ intptr_t ih = oprsz - 8 - i;
+ uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
+ uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
+ *(uint64_t *)(vd + i) = h;
+ *(uint64_t *)(vd + ih) = l;
+ }
+ } else {
+ for (i = 0; i < oprsz_2; i += 1) {
+ intptr_t il = H1(i);
+ intptr_t ih = H1(oprsz - 1 - i);
+ uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
+ uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
+ *(uint8_t *)(vd + il) = h;
+ *(uint8_t *)(vd + ih) = l;
+ }
+ }
+}
+
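+/*
+ * PUNPKLO/PUNPKHI: zero-extend each predicate bit of the selected half
+ * of VN into two bits of the destination.
+ */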
+void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
+{
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
+ uint64_t *d = vd;
+ intptr_t i;
+
+ if (oprsz <= 8) {
+ uint64_t nn = *(uint64_t *)vn;
+ int half = 4 * oprsz;
+
+ nn = extract64(nn, high * half, half);
+ nn = expand_bits(nn, 0);
+ d[0] = nn;
+ } else {
+ ARMPredicateReg tmp_n;
+
+ /* We produce output faster than we consume input.
+ Therefore we must be mindful of possible overlap. */
+ if ((vn - vd) < (uintptr_t)oprsz) {
+ vn = memcpy(&tmp_n, vn, oprsz);
+ }
+ if (high) {
+ high = oprsz >> 1;
+ }
+
+ if ((oprsz & 7) == 0) {
+ uint32_t *n = vn;
+ high >>= 2;
+
+ for (i = 0; i < oprsz / 8; i++) {
+ uint64_t nn = n[H4(high + i)];
+ d[i] = expand_bits(nn, 0);
+ }
+ } else {
+ uint16_t *d16 = vd;
+ uint8_t *n = vn;
+
+ for (i = 0; i < oprsz / 2; i++) {
+ uint16_t nn = n[H1(high + i)];
+ d16[H2(i)] = expand_bits(nn, 0);
+ }
+ }
+ }
+}
+
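+/*
+ * Vector ZIP: interleave elements from the low (or high, when odd_ofs
+ * is set) halves of VN and VM.
+ */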
+#define DO_ZIP(NAME, TYPE, H) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+{ \
+ intptr_t oprsz = simd_oprsz(desc); \
+ intptr_t odd_ofs = simd_data(desc); \
+ intptr_t i, oprsz_2 = oprsz / 2; \
+ ARMVectorReg tmp_n, tmp_m; \
+ /* We produce output faster than we consume input. \
+ Therefore we must be mindful of possible overlap. */ \
+ if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
+ vn = memcpy(&tmp_n, vn, oprsz); \
+ } \
+ if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
+ vm = memcpy(&tmp_m, vm, oprsz); \
+ } \
+ for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
+ *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + odd_ofs + H(i)); \
+ *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = \
+ *(TYPE *)(vm + odd_ofs + H(i)); \
+ } \
+ if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \
+ memset(vd + oprsz - 16, 0, 16); \
+ } \
+}
+
+DO_ZIP(sve_zip_b, uint8_t, H1)
+DO_ZIP(sve_zip_h, uint16_t, H1_2)
+DO_ZIP(sve_zip_s, uint32_t, H1_4)
+DO_ZIP(sve_zip_d, uint64_t, H1_8)
+DO_ZIP(sve2_zip_q, Int128, )
+
+#define DO_UZP(NAME, TYPE, H) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+{ \
+ intptr_t oprsz = simd_oprsz(desc); \
+ intptr_t odd_ofs = simd_data(desc); \
+ intptr_t i, p; \
+ ARMVectorReg tmp_m; \
+ if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
+ vm = memcpy(&tmp_m, vm, oprsz); \
+ } \
+ i = 0, p = odd_ofs; \
+ do { \
+ *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(p)); \
+ i += sizeof(TYPE), p += 2 * sizeof(TYPE); \
+ } while (p < oprsz); \
+ p -= oprsz; \
+ do { \
+ *(TYPE *)(vd + H(i)) = *(TYPE *)(vm + H(p)); \
+ i += sizeof(TYPE), p += 2 * sizeof(TYPE); \
+ } while (p < oprsz); \
+ tcg_debug_assert(i == oprsz); \
+}
+
+DO_UZP(sve_uzp_b, uint8_t, H1)
+DO_UZP(sve_uzp_h, uint16_t, H1_2)
+DO_UZP(sve_uzp_s, uint32_t, H1_4)
+DO_UZP(sve_uzp_d, uint64_t, H1_8)
+DO_UZP(sve2_uzp_q, Int128, )
+
+#define DO_TRN(NAME, TYPE, H) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+{ \
+ intptr_t oprsz = simd_oprsz(desc); \
+ intptr_t odd_ofs = simd_data(desc); \
+ intptr_t i; \
+ for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
+ TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
+ TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
+ *(TYPE *)(vd + H(i + 0)) = ae; \
+ *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
+ } \
+ if (sizeof(TYPE) == 16 && unlikely(oprsz & 16)) { \
+ memset(vd + oprsz - 16, 0, 16); \
+ } \
+}
+
+DO_TRN(sve_trn_b, uint8_t, H1)
+DO_TRN(sve_trn_h, uint16_t, H1_2)
+DO_TRN(sve_trn_s, uint32_t, H1_4)
+DO_TRN(sve_trn_d, uint64_t, H1_8)
+DO_TRN(sve2_trn_q, Int128, )
+
+#undef DO_ZIP
+#undef DO_UZP
+#undef DO_TRN
+
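+/*
+ * COMPACT: copy the active elements of VN to the low-numbered elements
+ * of the destination and zero the remainder.
+ */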
+void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
+{
+ intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
+ uint32_t *d = vd, *n = vn;
+ uint8_t *pg = vg;
+
+ for (i = j = 0; i < opr_sz; i++) {
+ if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
+ d[H4(j)] = n[H4(i)];
+ j++;
+ }
+ }
+ for (; j < opr_sz; j++) {
+ d[H4(j)] = 0;
+ }
+}
+
+void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
+{
+ intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t *d = vd, *n = vn;
+ uint8_t *pg = vg;
+
+ for (i = j = 0; i < opr_sz; i++) {
+ if (pg[H1(i)] & 1) {
+ d[j] = n[i];
+ j++;
+ }
+ }
+ for (; j < opr_sz; j++) {
+ d[j] = 0;
+ }
+}
+
+/* Similar to the ARM LastActiveElement pseudocode function, except the
+ * result is multiplied by the element size. This includes the not found
+ * indication; e.g. not found for esz=3 is -8.
+ */
+int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
+{
+ intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
+ intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
+
+ return last_active_element(vg, words, esz);
+}
+
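+/*
+ * SPLICE: copy VN from its first through last active element, then fill
+ * the rest of the destination from the beginning of VM.
+ */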
+void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
+{
+ intptr_t opr_sz = simd_oprsz(desc) / 8;
+ int esz = simd_data(desc);
+ uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
+ intptr_t i, first_i, last_i;
+ ARMVectorReg tmp;
+
+ first_i = last_i = 0;
+ first_g = last_g = 0;
+
+ /* Find the extent of the active elements within VG. */
+ for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
+ pg = *(uint64_t *)(vg + i) & mask;
+ if (pg) {
+ if (last_g == 0) {
+ last_g = pg;
+ last_i = i;
+ }
+ first_g = pg;
+ first_i = i;
+ }
+ }
+
+ len = 0;
+ if (first_g != 0) {
+ first_i = first_i * 8 + ctz64(first_g);
+ last_i = last_i * 8 + 63 - clz64(last_g);
+ len = last_i - first_i + (1 << esz);
+ if (vd == vm) {
+ vm = memcpy(&tmp, vm, opr_sz * 8);
+ }
+ swap_memmove(vd, vn + first_i, len);
+ }
+ swap_memmove(vd + len, vm, opr_sz * 8 - len);
+}
+
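+/*
+ * SEL: for each element, choose VN where the governing predicate bit is
+ * set and VM otherwise.
+ */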
+void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
+ void *vg, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t *d = vd, *n = vn, *m = vm;
+ uint8_t *pg = vg;
+
+ for (i = 0; i < opr_sz; i += 1) {
+ uint64_t nn = n[i], mm = m[i];
+ uint64_t pp = expand_pred_b(pg[H1(i)]);
+ d[i] = (nn & pp) | (mm & ~pp);
+ }
+}
+
+void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
+ void *vg, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t *d = vd, *n = vn, *m = vm;
+ uint8_t *pg = vg;
+
+ for (i = 0; i < opr_sz; i += 1) {
+ uint64_t nn = n[i], mm = m[i];
+ uint64_t pp = expand_pred_h(pg[H1(i)]);
+ d[i] = (nn & pp) | (mm & ~pp);
+ }
+}
+
+void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
+ void *vg, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t *d = vd, *n = vn, *m = vm;
+ uint8_t *pg = vg;
+
+ for (i = 0; i < opr_sz; i += 1) {
+ uint64_t nn = n[i], mm = m[i];
+ uint64_t pp = expand_pred_s(pg[H1(i)]);
+ d[i] = (nn & pp) | (mm & ~pp);
+ }
+}
+
+void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
+ void *vg, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t *d = vd, *n = vn, *m = vm;
+ uint8_t *pg = vg;
+
+ for (i = 0; i < opr_sz; i += 1) {
+ uint64_t nn = n[i], mm = m[i];
+ d[i] = (pg[H1(i)] & 1 ? nn : mm);
+ }
+}
+
+void HELPER(sve_sel_zpzz_q)(void *vd, void *vn, void *vm,
+ void *vg, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 16;
+ Int128 *d = vd, *n = vn, *m = vm;
+ uint16_t *pg = vg;
+
+ for (i = 0; i < opr_sz; i += 1) {
+ d[i] = (pg[H2(i)] & 1 ? n : m)[i];
+ }
+}
+
+/* Two operand comparison controlled by a predicate.
+ * ??? It is very tempting to expand this inline
+ * with x86 instructions, e.g.
+ *
+ * vcmpeqw zm, zn, %ymm0
+ * vpmovmskb %ymm0, %eax
+ * and $0x5555, %eax
+ * and pg, %eax
+ *
+ * or even aarch64, e.g.
+ *
+ * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
+ * cmeq v0.8h, zn, zm
+ * and v0.8h, v0.8h, mask
+ * addv h0, v0.8h
+ * and v0.8b, pg
+ *
+ * However, coming up with an abstraction that allows vector inputs and
+ * a scalar output, and also handles the byte-ordering of sub-uint64_t
+ * scalar outputs, is tricky.
+ */
+#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
+uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
+{ \
+ intptr_t opr_sz = simd_oprsz(desc); \
+ uint32_t flags = PREDTEST_INIT; \
+ intptr_t i = opr_sz; \
+ do { \
+ uint64_t out = 0, pg; \
+ do { \
+ i -= sizeof(TYPE), out <<= sizeof(TYPE); \
+ TYPE nn = *(TYPE *)(vn + H(i)); \
+ TYPE mm = *(TYPE *)(vm + H(i)); \
+ out |= nn OP mm; \
+ } while (i & 63); \
+ pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
+ out &= pg; \
+ *(uint64_t *)(vd + (i >> 3)) = out; \
+ flags = iter_predtest_bwd(out, pg, flags); \
+ } while (i > 0); \
+ return flags; \
+}
+
+#define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
+ DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
+#define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
+ DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
+#define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
+ DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
+#define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
+ DO_CMP_PPZZ(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
+
+DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
+DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
+DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
+DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
+
+DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
+DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
+DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
+DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
+
+DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
+DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
+DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
+DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
+
+DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
+DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
+DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
+DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
+
+DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
+DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
+DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
+DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
+
+DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
+DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
+DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
+DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
+
+#undef DO_CMP_PPZZ_B
+#undef DO_CMP_PPZZ_H
+#undef DO_CMP_PPZZ_S
+#undef DO_CMP_PPZZ_D
+#undef DO_CMP_PPZZ
+
+/* Similar, but the second source is "wide". */
+#define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
+uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
+{ \
+ intptr_t opr_sz = simd_oprsz(desc); \
+ uint32_t flags = PREDTEST_INIT; \
+ intptr_t i = opr_sz; \
+ do { \
+ uint64_t out = 0, pg; \
+ do { \
+ TYPEW mm = *(TYPEW *)(vm + i - 8); \
+ do { \
+ i -= sizeof(TYPE), out <<= sizeof(TYPE); \
+ TYPE nn = *(TYPE *)(vn + H(i)); \
+ out |= nn OP mm; \
+ } while (i & 7); \
+ } while (i & 63); \
+ pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
+ out &= pg; \
+ *(uint64_t *)(vd + (i >> 3)) = out; \
+ flags = iter_predtest_bwd(out, pg, flags); \
+ } while (i > 0); \
+ return flags; \
+}
+
+#define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
+ DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
+#define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
+ DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
+#define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
+ DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
+
+DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
+DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
+DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
+
+DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
+DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
+DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
+
+DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
+DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
+DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
+
+DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
+DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
+DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
+
+DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
+DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
+DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
+
+DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
+DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
+DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
+
+DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
+DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
+DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
+
+DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
+DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
+DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
+
+DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
+DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
+DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
+
+DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
+DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
+DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
+
+#undef DO_CMP_PPZW_B
+#undef DO_CMP_PPZW_H
+#undef DO_CMP_PPZW_S
+#undef DO_CMP_PPZW
+
+/* Similar, but the second source is immediate. */
+#define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
+uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
+{ \
+ intptr_t opr_sz = simd_oprsz(desc); \
+ uint32_t flags = PREDTEST_INIT; \
+ TYPE mm = simd_data(desc); \
+ intptr_t i = opr_sz; \
+ do { \
+ uint64_t out = 0, pg; \
+ do { \
+ i -= sizeof(TYPE), out <<= sizeof(TYPE); \
+ TYPE nn = *(TYPE *)(vn + H(i)); \
+ out |= nn OP mm; \
+ } while (i & 63); \
+ pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
+ out &= pg; \
+ *(uint64_t *)(vd + (i >> 3)) = out; \
+ flags = iter_predtest_bwd(out, pg, flags); \
+ } while (i > 0); \
+ return flags; \
+}
+
+#define DO_CMP_PPZI_B(NAME, TYPE, OP) \
+ DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
+#define DO_CMP_PPZI_H(NAME, TYPE, OP) \
+ DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
+#define DO_CMP_PPZI_S(NAME, TYPE, OP) \
+ DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
+#define DO_CMP_PPZI_D(NAME, TYPE, OP) \
+ DO_CMP_PPZI(NAME, TYPE, OP, H1_8, 0x0101010101010101ull)
+
+DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
+DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
+DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
+DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
+
+DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
+DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
+DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
+DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
+
+DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
+DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
+DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
+DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
+
+DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
+DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
+DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
+DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
+
+DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
+DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
+DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
+DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
+
+DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
+DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
+DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
+DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
+
+DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
+DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
+DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
+DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
+
+DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
+DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
+DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
+DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
+
+DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
+DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
+DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
+DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
+
+DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
+DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
+DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
+DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
+
+#undef DO_CMP_PPZI_B
+#undef DO_CMP_PPZI_H
+#undef DO_CMP_PPZI_S
+#undef DO_CMP_PPZI_D
+#undef DO_CMP_PPZI
+
+/* Similar to the ARM LastActive pseudocode function. */
+static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
+{
+ intptr_t i;
+
+ for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
+ uint64_t pg = *(uint64_t *)(vg + i);
+ if (pg) {
+ return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
+ }
+ }
+ return 0;
+}
+
+/* Compute a mask into RETB that is true for all G, up to and including
+ * (if after) or excluding (if !after) the first G & N.
+ * Return true if BRK found.
+ */
+static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
+ bool brk, bool after)
+{
+ uint64_t b;
+
+ if (brk) {
+ b = 0;
+ } else if ((g & n) == 0) {
+ /* For all G, no N are set; break not found. */
+ b = g;
+ } else {
+ /* Break somewhere in N. Locate it. */
+ b = g & n; /* guard true, pred true */
+ b = b & -b; /* first such */
+ if (after) {
+ b = b | (b - 1); /* break after same */
+ } else {
+ b = b - 1; /* break before same */
+ }
+ brk = true;
+ }
+
+ *retb = b;
+ return brk;
+}
+
+/* Compute a zeroing BRK. */
+static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
+ intptr_t oprsz, bool after)
+{
+ bool brk = false;
+ intptr_t i;
+
+ for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
+ uint64_t this_b, this_g = g[i];
+
+ brk = compute_brk(&this_b, n[i], this_g, brk, after);
+ d[i] = this_b & this_g;
+ }
+}
+
+/* Likewise, but also compute flags. */
+static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
+ intptr_t oprsz, bool after)
+{
+ uint32_t flags = PREDTEST_INIT;
+ bool brk = false;
+ intptr_t i;
+
+ for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
+ uint64_t this_b, this_d, this_g = g[i];
+
+ brk = compute_brk(&this_b, n[i], this_g, brk, after);
+ d[i] = this_d = this_b & this_g;
+ flags = iter_predtest_fwd(this_d, this_g, flags);
+ }
+ return flags;
+}
+
+/* Compute a merging BRK. */
+static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
+ intptr_t oprsz, bool after)
+{
+ bool brk = false;
+ intptr_t i;
+
+ for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
+ uint64_t this_b, this_g = g[i];
+
+ brk = compute_brk(&this_b, n[i], this_g, brk, after);
+ d[i] = (this_b & this_g) | (d[i] & ~this_g);
+ }
+}
+
+/* Likewise, but also compute flags. */
+static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
+ intptr_t oprsz, bool after)
+{
+ uint32_t flags = PREDTEST_INIT;
+ bool brk = false;
+ intptr_t i;
+
+ for (i = 0; i < oprsz / 8; ++i) {
+ uint64_t this_b, this_d = d[i], this_g = g[i];
+
+ brk = compute_brk(&this_b, n[i], this_g, brk, after);
+ d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
+ flags = iter_predtest_fwd(this_d, this_g, flags);
+ }
+ return flags;
+}
+
+static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
+{
+ /* It is quicker to zero the whole predicate than loop on OPRSZ.
+ * The compiler should turn this into 4 64-bit integer stores.
+ */
+ memset(d, 0, sizeof(ARMPredicateReg));
+ return PREDTEST_INIT;
+}
+
+void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
+ uint32_t pred_desc)
+{
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ if (last_active_pred(vn, vg, oprsz)) {
+ compute_brk_z(vd, vm, vg, oprsz, true);
+ } else {
+ do_zero(vd, oprsz);
+ }
+}
+
+uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
+ uint32_t pred_desc)
+{
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ if (last_active_pred(vn, vg, oprsz)) {
+ return compute_brks_z(vd, vm, vg, oprsz, true);
+ } else {
+ return do_zero(vd, oprsz);
+ }
+}
+
+void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
+ uint32_t pred_desc)
+{
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ if (last_active_pred(vn, vg, oprsz)) {
+ compute_brk_z(vd, vm, vg, oprsz, false);
+ } else {
+ do_zero(vd, oprsz);
+ }
+}
+
+uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
+ uint32_t pred_desc)
+{
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ if (last_active_pred(vn, vg, oprsz)) {
+ return compute_brks_z(vd, vm, vg, oprsz, false);
+ } else {
+ return do_zero(vd, oprsz);
+ }
+}
+
+void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
+{
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ compute_brk_z(vd, vn, vg, oprsz, true);
+}
+
+uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
+{
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ return compute_brks_z(vd, vn, vg, oprsz, true);
+}
+
+void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
+{
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ compute_brk_z(vd, vn, vg, oprsz, false);
+}
+
+uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
+{
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ return compute_brks_z(vd, vn, vg, oprsz, false);
+}
+
+void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
+{
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ compute_brk_m(vd, vn, vg, oprsz, true);
+}
+
+uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
+{
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ return compute_brks_m(vd, vn, vg, oprsz, true);
+}
+
+void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
+{
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ compute_brk_m(vd, vn, vg, oprsz, false);
+}
+
+uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
+{
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ return compute_brks_m(vd, vn, vg, oprsz, false);
+}
+
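+/*
+ * BRKN: if the last active element of VN is false, zero the destination
+ * (which the instruction also uses as its second source); otherwise
+ * leave it unchanged.
+ */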
+void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
+{
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ if (!last_active_pred(vn, vg, oprsz)) {
+ do_zero(vd, oprsz);
+ }
+}
+
+/* As if PredTest(Ones(PL), D, esz). */
+static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
+ uint64_t esz_mask)
+{
+ uint32_t flags = PREDTEST_INIT;
+ intptr_t i;
+
+ for (i = 0; i < oprsz / 8; i++) {
+ flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
+ }
+ if (oprsz & 7) {
+ uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
+ flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
+ }
+ return flags;
+}
+
+uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
+{
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ if (last_active_pred(vn, vg, oprsz)) {
+ return predtest_ones(vd, oprsz, -1);
+ } else {
+ return do_zero(vd, oprsz);
+ }
+}
+
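+/* CNTP: count the elements active in both VN and VG. */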
+uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
+{
+ intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
+ intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
+ uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
+ intptr_t i;
+
+ for (i = 0; i < words; ++i) {
+ uint64_t t = n[i] & g[i] & mask;
+ sum += ctpop64(t);
+ }
+ return sum;
+}
+
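+/*
+ * WHILE (lower): make the first COUNT predicate bits active, where
+ * COUNT is given in predicate bits (elements scaled by the element
+ * size), and return the predicate-test flags for the result.
+ */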
+uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
+{
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
+ uint64_t esz_mask = pred_esz_masks[esz];
+ ARMPredicateReg *d = vd;
+ uint32_t flags;
+ intptr_t i;
+
+ /* Begin with a zero predicate register. */
+ flags = do_zero(d, oprsz);
+ if (count == 0) {
+ return flags;
+ }
+
+ /* Set all of the requested bits. */
+ for (i = 0; i < count / 64; ++i) {
+ d->p[i] = esz_mask;
+ }
+ if (count & 63) {
+ d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
+ }
+
+ return predtest_ones(d, oprsz, esz_mask);
+}
+
+uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
+{
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
+ uint64_t esz_mask = pred_esz_masks[esz];
+ ARMPredicateReg *d = vd;
+ intptr_t i, invcount, oprbits;
+ uint64_t bits;
+
+ if (count == 0) {
+ return do_zero(d, oprsz);
+ }
+
+ oprbits = oprsz * 8;
+ tcg_debug_assert(count <= oprbits);
+
+ bits = esz_mask;
+ if (oprbits & 63) {
+ bits &= MAKE_64BIT_MASK(0, oprbits & 63);
+ }
+
+ invcount = oprbits - count;
+ for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
+ d->p[i] = bits;
+ bits = esz_mask;
+ }
+
+ d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);
+
+ while (--i >= 0) {
+ d->p[i] = 0;
+ }
+
+ return predtest_ones(d, oprsz, esz_mask);
+}
+
+/* Recursive reduction on a function;
+ * Cf. the ARM ARM function ReducePredicated.
+ *
+ * While it would be possible to write this without the DATA temporary,
+ * it is much simpler to process the predicate register this way.
+ * The recursion is bounded to depth 7 (128 fp16 elements), so there's
+ * little to gain with a more complex non-recursive form.
+ */
+#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
+static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
+{ \
+ if (n == 1) { \
+ return *data; \
+ } else { \
+ uintptr_t half = n / 2; \
+ TYPE lo = NAME##_reduce(data, status, half); \
+ TYPE hi = NAME##_reduce(data + half, status, half); \
+ return TYPE##_##FUNC(lo, hi, status); \
+ } \
+} \
+uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
+{ \
+ uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \
+ TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
+ for (i = 0; i < oprsz; ) { \
+ uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
+ do { \
+ TYPE nn = *(TYPE *)(vn + H(i)); \
+ *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
+ i += sizeof(TYPE), pg >>= sizeof(TYPE); \
+ } while (i & 15); \
+ } \
+ for (; i < maxsz; i += sizeof(TYPE)) { \
+ *(TYPE *)((void *)data + i) = IDENT; \
+ } \
+ return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
+}
+
+DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
+DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
+DO_REDUCE(sve_faddv_d, float64, H1_8, add, float64_zero)
+
+/* Identity is floatN_default_nan, without the function call. */
+DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
+DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
+DO_REDUCE(sve_fminnmv_d, float64, H1_8, minnum, 0x7FF8000000000000ULL)
+
+DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
+DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
+DO_REDUCE(sve_fmaxnmv_d, float64, H1_8, maxnum, 0x7FF8000000000000ULL)
+
+DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
+DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
+DO_REDUCE(sve_fminv_d, float64, H1_8, min, float64_infinity)
+
+DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
+DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
+DO_REDUCE(sve_fmaxv_d, float64, H1_8, max, float64_chs(float64_infinity))
+
+#undef DO_REDUCE
+
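+/*
+ * FADDA: strictly ordered floating-point accumulation of the active
+ * elements of VM into the scalar NN, one element at a time.
+ */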
+uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
+ void *status, uint32_t desc)
+{
+ intptr_t i = 0, opr_sz = simd_oprsz(desc);
+ float16 result = nn;
+
+ do {
+ uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
+ do {
+ if (pg & 1) {
+ float16 mm = *(float16 *)(vm + H1_2(i));
+ result = float16_add(result, mm, status);
+ }
+ i += sizeof(float16), pg >>= sizeof(float16);
+ } while (i & 15);
+ } while (i < opr_sz);
+
+ return result;
+}
+
+uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
+ void *status, uint32_t desc)
+{
+ intptr_t i = 0, opr_sz = simd_oprsz(desc);
+ float32 result = nn;
+
+ do {
+ uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
+ do {
+ if (pg & 1) {
+ float32 mm = *(float32 *)(vm + H1_2(i));
+ result = float32_add(result, mm, status);
+ }
+ i += sizeof(float32), pg >>= sizeof(float32);
+ } while (i & 15);
+ } while (i < opr_sz);
+
+ return result;
+}
+
+uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
+ void *status, uint32_t desc)
+{
+ intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t *m = vm;
+ uint8_t *pg = vg;
+
+ for (i = 0; i < opr_sz; i++) {
+ if (pg[H1(i)] & 1) {
+ nn = float64_add(nn, m[i], status);
+ }
+ }
+
+ return nn;
+}
+
+/* Fully general three-operand expander, controlled by a predicate,
+ * with the extra float_status parameter.
+ */
+#define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
+ void *status, uint32_t desc) \
+{ \
+ intptr_t i = simd_oprsz(desc); \
+ uint64_t *g = vg; \
+ do { \
+ uint64_t pg = g[(i - 1) >> 6]; \
+ do { \
+ i -= sizeof(TYPE); \
+ if (likely((pg >> (i & 63)) & 1)) { \
+ TYPE nn = *(TYPE *)(vn + H(i)); \
+ TYPE mm = *(TYPE *)(vm + H(i)); \
+ *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
+ } \
+ } while (i & 63); \
+ } while (i != 0); \
+}
+
+DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
+DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
+DO_ZPZZ_FP(sve_fadd_d, uint64_t, H1_8, float64_add)
+
+DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
+DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
+DO_ZPZZ_FP(sve_fsub_d, uint64_t, H1_8, float64_sub)
+
+DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
+DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
+DO_ZPZZ_FP(sve_fmul_d, uint64_t, H1_8, float64_mul)
+
+DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
+DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
+DO_ZPZZ_FP(sve_fdiv_d, uint64_t, H1_8, float64_div)
+
+DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
+DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
+DO_ZPZZ_FP(sve_fmin_d, uint64_t, H1_8, float64_min)
+
+DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
+DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
+DO_ZPZZ_FP(sve_fmax_d, uint64_t, H1_8, float64_max)
+
+DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
+DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
+DO_ZPZZ_FP(sve_fminnum_d, uint64_t, H1_8, float64_minnum)
+
+DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
+DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
+DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, H1_8, float64_maxnum)
+
+static inline float16 abd_h(float16 a, float16 b, float_status *s)
+{
+ return float16_abs(float16_sub(a, b, s));
+}
+
+static inline float32 abd_s(float32 a, float32 b, float_status *s)
+{
+ return float32_abs(float32_sub(a, b, s));
+}
+
+static inline float64 abd_d(float64 a, float64 b, float_status *s)
+{
+ return float64_abs(float64_sub(a, b, s));
+}
+
+DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
+DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
+DO_ZPZZ_FP(sve_fabd_d, uint64_t, H1_8, abd_d)
+
+static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
+{
+ int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
+ return float64_scalbn(a, b_int, s);
+}
+
+DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
+DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
+DO_ZPZZ_FP(sve_fscalbn_d, int64_t, H1_8, scalbn_d)
+
+DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
+DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
+DO_ZPZZ_FP(sve_fmulx_d, uint64_t, H1_8, helper_vfp_mulxd)
+
+#undef DO_ZPZZ_FP
+
+/* Three-operand expander, with one scalar operand, controlled by
+ * a predicate, with the extra float_status parameter.
+ */
+#define DO_ZPZS_FP(NAME, TYPE, H, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
+ void *status, uint32_t desc) \
+{ \
+ intptr_t i = simd_oprsz(desc); \
+ uint64_t *g = vg; \
+ TYPE mm = scalar; \
+ do { \
+ uint64_t pg = g[(i - 1) >> 6]; \
+ do { \
+ i -= sizeof(TYPE); \
+ if (likely((pg >> (i & 63)) & 1)) { \
+ TYPE nn = *(TYPE *)(vn + H(i)); \
+ *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
+ } \
+ } while (i & 63); \
+ } while (i != 0); \
+}
+
+DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
+DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
+DO_ZPZS_FP(sve_fadds_d, float64, H1_8, float64_add)
+
+DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
+DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
+DO_ZPZS_FP(sve_fsubs_d, float64, H1_8, float64_sub)
+
+DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
+DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
+DO_ZPZS_FP(sve_fmuls_d, float64, H1_8, float64_mul)
+
+static inline float16 subr_h(float16 a, float16 b, float_status *s)
+{
+ return float16_sub(b, a, s);
+}
+
+static inline float32 subr_s(float32 a, float32 b, float_status *s)
+{
+ return float32_sub(b, a, s);
+}
+
+static inline float64 subr_d(float64 a, float64 b, float_status *s)
+{
+ return float64_sub(b, a, s);
+}
+
+DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
+DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
+DO_ZPZS_FP(sve_fsubrs_d, float64, H1_8, subr_d)
+
+DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
+DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
+DO_ZPZS_FP(sve_fmaxnms_d, float64, H1_8, float64_maxnum)
+
+DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
+DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
+DO_ZPZS_FP(sve_fminnms_d, float64, H1_8, float64_minnum)
+
+DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
+DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
+DO_ZPZS_FP(sve_fmaxs_d, float64, H1_8, float64_max)
+
+DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
+DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
+DO_ZPZS_FP(sve_fmins_d, float64, H1_8, float64_min)
+
+/* Fully general two-operand expander, controlled by a predicate,
+ * with the extra float_status parameter.
+ */
+#define DO_ZPZ_FP(NAME, TYPE, H, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
+{ \
+ intptr_t i = simd_oprsz(desc); \
+ uint64_t *g = vg; \
+ do { \
+ uint64_t pg = g[(i - 1) >> 6]; \
+ do { \
+ i -= sizeof(TYPE); \
+ if (likely((pg >> (i & 63)) & 1)) { \
+ TYPE nn = *(TYPE *)(vn + H(i)); \
+ *(TYPE *)(vd + H(i)) = OP(nn, status); \
+ } \
+ } while (i & 63); \
+ } while (i != 0); \
+}
+
+/* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
+ * FZ16. When converting from fp16, this affects flushing input denormals;
+ * when converting to fp16, this affects flushing output denormals.
+ */
+static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
+{
+ bool save = get_flush_inputs_to_zero(fpst);
+ float32 ret;
+
+ set_flush_inputs_to_zero(false, fpst);
+ ret = float16_to_float32(f, true, fpst);
+ set_flush_inputs_to_zero(save, fpst);
+ return ret;
+}
+
+static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
+{
+ bool save = get_flush_inputs_to_zero(fpst);
+ float64 ret;
+
+ set_flush_inputs_to_zero(false, fpst);
+ ret = float16_to_float64(f, true, fpst);
+ set_flush_inputs_to_zero(save, fpst);
+ return ret;
+}
+
+static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
+{
+ bool save = get_flush_to_zero(fpst);
+ float16 ret;
+
+ set_flush_to_zero(false, fpst);
+ ret = float32_to_float16(f, true, fpst);
+ set_flush_to_zero(save, fpst);
+ return ret;
+}
+
+static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
+{
+ bool save = get_flush_to_zero(fpst);
+ float16 ret;
+
+ set_flush_to_zero(false, fpst);
+ ret = float64_to_float16(f, true, fpst);
+ set_flush_to_zero(save, fpst);
+ return ret;
+}
+
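+/*
+ * Round-to-zero conversions that return 0 for a NaN input, as the
+ * architecture requires, instead of the default softfloat result;
+ * the Invalid exception is still raised.
+ */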
+static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
+{
+ if (float16_is_any_nan(f)) {
+ float_raise(float_flag_invalid, s);
+ return 0;
+ }
+ return float16_to_int16_round_to_zero(f, s);
+}
+
+static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
+{
+ if (float16_is_any_nan(f)) {
+ float_raise(float_flag_invalid, s);
+ return 0;
+ }
+ return float16_to_int64_round_to_zero(f, s);
+}
+
+static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
+{
+ if (float32_is_any_nan(f)) {
+ float_raise(float_flag_invalid, s);
+ return 0;
+ }
+ return float32_to_int64_round_to_zero(f, s);
+}
+
+static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
+{
+ if (float64_is_any_nan(f)) {
+ float_raise(float_flag_invalid, s);
+ return 0;
+ }
+ return float64_to_int64_round_to_zero(f, s);
+}
+
+static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
+{
+ if (float16_is_any_nan(f)) {
+ float_raise(float_flag_invalid, s);
+ return 0;
+ }
+ return float16_to_uint16_round_to_zero(f, s);
+}
+
+static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
+{
+ if (float16_is_any_nan(f)) {
+ float_raise(float_flag_invalid, s);
+ return 0;
+ }
+ return float16_to_uint64_round_to_zero(f, s);
+}
+
+static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
+{
+ if (float32_is_any_nan(f)) {
+ float_raise(float_flag_invalid, s);
+ return 0;
+ }
+ return float32_to_uint64_round_to_zero(f, s);
+}
+
+static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
+{
+ if (float64_is_any_nan(f)) {
+ float_raise(float_flag_invalid, s);
+ return 0;
+ }
+ return float64_to_uint64_round_to_zero(f, s);
+}
+
+DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
+DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
+DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16)
+DO_ZPZ_FP(sve_fcvt_dh, uint64_t, H1_8, sve_f64_to_f16)
+DO_ZPZ_FP(sve_fcvt_hd, uint64_t, H1_8, sve_f16_to_f64)
+DO_ZPZ_FP(sve_fcvt_ds, uint64_t, H1_8, float64_to_float32)
+DO_ZPZ_FP(sve_fcvt_sd, uint64_t, H1_8, float32_to_float64)
+
+DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
+DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
+DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
+DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, H1_8, vfp_float16_to_int64_rtz)
+DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, H1_8, vfp_float32_to_int64_rtz)
+DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, H1_8, helper_vfp_tosizd)
+DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, H1_8, vfp_float64_to_int64_rtz)
+
+DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
+DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
+DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
+DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, H1_8, vfp_float16_to_uint64_rtz)
+DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, H1_8, vfp_float32_to_uint64_rtz)
+DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, H1_8, helper_vfp_touizd)
+DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, H1_8, vfp_float64_to_uint64_rtz)
+
+DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
+DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
+DO_ZPZ_FP(sve_frint_d, uint64_t, H1_8, helper_rintd)
+
+DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
+DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
+DO_ZPZ_FP(sve_frintx_d, uint64_t, H1_8, float64_round_to_int)
+
+DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
+DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
+DO_ZPZ_FP(sve_frecpx_d, uint64_t, H1_8, helper_frecpx_f64)
+
+DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
+DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
+DO_ZPZ_FP(sve_fsqrt_d, uint64_t, H1_8, float64_sqrt)
+
+DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
+DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
+DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
+DO_ZPZ_FP(sve_scvt_sd, uint64_t, H1_8, int32_to_float64)
+DO_ZPZ_FP(sve_scvt_dh, uint64_t, H1_8, int64_to_float16)
+DO_ZPZ_FP(sve_scvt_ds, uint64_t, H1_8, int64_to_float32)
+DO_ZPZ_FP(sve_scvt_dd, uint64_t, H1_8, int64_to_float64)
+
+DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
+DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
+DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
+DO_ZPZ_FP(sve_ucvt_sd, uint64_t, H1_8, uint32_to_float64)
+DO_ZPZ_FP(sve_ucvt_dh, uint64_t, H1_8, uint64_to_float16)
+DO_ZPZ_FP(sve_ucvt_ds, uint64_t, H1_8, uint64_to_float32)
+DO_ZPZ_FP(sve_ucvt_dd, uint64_t, H1_8, uint64_to_float64)
+
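+/*
+ * FLOGB: return the signed base-2 exponent of the input as an integer
+ * of the same width; INT_MAX for infinity, INT_MIN (with Invalid
+ * raised) for zero or NaN.
+ */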
+static int16_t do_float16_logb_as_int(float16 a, float_status *s)
+{
+ /* Extract frac to the top of the uint32_t. */
+ uint32_t frac = (uint32_t)a << (16 + 6);
+ int16_t exp = extract32(a, 10, 5);
+
+ if (unlikely(exp == 0)) {
+ if (frac != 0) {
+ if (!get_flush_inputs_to_zero(s)) {
+ /* denormal: bias - fractional_zeros */
+ return -15 - clz32(frac);
+ }
+ /* flush to zero */
+ float_raise(float_flag_input_denormal, s);
+ }
+ } else if (unlikely(exp == 0x1f)) {
+ if (frac == 0) {
+ return INT16_MAX; /* infinity */
+ }
+ } else {
+ /* normal: exp - bias */
+ return exp - 15;
+ }
+ /* nan or zero */
+ float_raise(float_flag_invalid, s);
+ return INT16_MIN;
+}
+
+static int32_t do_float32_logb_as_int(float32 a, float_status *s)
+{
+ /* Extract frac to the top of the uint32_t. */
+ uint32_t frac = a << 9;
+ int32_t exp = extract32(a, 23, 8);
+
+ if (unlikely(exp == 0)) {
+ if (frac != 0) {
+ if (!get_flush_inputs_to_zero(s)) {
+ /* denormal: bias - fractional_zeros */
+ return -127 - clz32(frac);
+ }
+ /* flush to zero */
+ float_raise(float_flag_input_denormal, s);
+ }
+ } else if (unlikely(exp == 0xff)) {
+ if (frac == 0) {
+ return INT32_MAX; /* infinity */
+ }
+ } else {
+ /* normal: exp - bias */
+ return exp - 127;
+ }
+ /* nan or zero */
+ float_raise(float_flag_invalid, s);
+ return INT32_MIN;
+}
+
+static int64_t do_float64_logb_as_int(float64 a, float_status *s)
+{
+ /* Extract frac to the top of the uint64_t. */
+ uint64_t frac = a << 12;
+ int64_t exp = extract64(a, 52, 11);
+
+ if (unlikely(exp == 0)) {
+ if (frac != 0) {
+ if (!get_flush_inputs_to_zero(s)) {
+ /* denormal: bias - fractional_zeros */
+ return -1023 - clz64(frac);
+ }
+ /* flush to zero */
+ float_raise(float_flag_input_denormal, s);
+ }
+ } else if (unlikely(exp == 0x7ff)) {
+ if (frac == 0) {
+ return INT64_MAX; /* infinity */
+ }
+ } else {
+ /* normal: exp - bias */
+ return exp - 1023;
+ }
+ /* nan or zero */
+ float_raise(float_flag_invalid, s);
+ return INT64_MIN;
+}
+
+DO_ZPZ_FP(flogb_h, float16, H1_2, do_float16_logb_as_int)
+DO_ZPZ_FP(flogb_s, float32, H1_4, do_float32_logb_as_int)
+DO_ZPZ_FP(flogb_d, float64, H1_8, do_float64_logb_as_int)
+
+#undef DO_ZPZ_FP
+
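+/*
+ * Predicated fused multiply-add: NEG1 and NEG3 are XOR masks applied to
+ * the sign bits of the first and third operands, selecting between the
+ * FMLA, FMLS, FNMLA and FNMLS forms with one implementation.
+ */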
+static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
+ float_status *status, uint32_t desc,
+ uint16_t neg1, uint16_t neg3)
+{
+ intptr_t i = simd_oprsz(desc);
+ uint64_t *g = vg;
+
+ do {
+ uint64_t pg = g[(i - 1) >> 6];
+ do {
+ i -= 2;
+ if (likely((pg >> (i & 63)) & 1)) {
+ float16 e1, e2, e3, r;
+
+ e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
+ e2 = *(uint16_t *)(vm + H1_2(i));
+ e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
+ r = float16_muladd(e1, e2, e3, 0, status);
+ *(uint16_t *)(vd + H1_2(i)) = r;
+ }
+ } while (i & 63);
+ } while (i != 0);
+}
+
+void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
+{
+ do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
+}
+
+void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
+{
+ do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
+}
+
+void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
+{
+ do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
+}
+
+void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
+{
+ do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
+}
+
+static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
+ float_status *status, uint32_t desc,
+ uint32_t neg1, uint32_t neg3)
+{
+ intptr_t i = simd_oprsz(desc);
+ uint64_t *g = vg;
+
+ do {
+ uint64_t pg = g[(i - 1) >> 6];
+ do {
+ i -= 4;
+ if (likely((pg >> (i & 63)) & 1)) {
+ float32 e1, e2, e3, r;
+
+ e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
+ e2 = *(uint32_t *)(vm + H1_4(i));
+ e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
+ r = float32_muladd(e1, e2, e3, 0, status);
+ *(uint32_t *)(vd + H1_4(i)) = r;
+ }
+ } while (i & 63);
+ } while (i != 0);
+}
+
+void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
+{
+ do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
+}
+
+void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
+{
+ do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
+}
+
+void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
+{
+ do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
+}
+
+void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
+{
+ do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
+}
+
+static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
+ float_status *status, uint32_t desc,
+ uint64_t neg1, uint64_t neg3)
+{
+ intptr_t i = simd_oprsz(desc);
+ uint64_t *g = vg;
+
+ do {
+ uint64_t pg = g[(i - 1) >> 6];
+ do {
+ i -= 8;
+ if (likely((pg >> (i & 63)) & 1)) {
+ float64 e1, e2, e3, r;
+
+ e1 = *(uint64_t *)(vn + i) ^ neg1;
+ e2 = *(uint64_t *)(vm + i);
+ e3 = *(uint64_t *)(va + i) ^ neg3;
+ r = float64_muladd(e1, e2, e3, 0, status);
+ *(uint64_t *)(vd + i) = r;
+ }
+ } while (i & 63);
+ } while (i != 0);
+}
+
+void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
+{
+ do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
+}
+
+void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
+{
+ do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
+}
+
+void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
+{
+ do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
+}
+
+void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
+{
+ do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
+}
+
+/* Two operand floating-point comparison controlled by a predicate.
+ * Unlike the integer version, we are not allowed to optimistically
+ * compare operands, since the comparison may have side effects wrt
+ * the FPSR.
+ */
+#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
+ void *status, uint32_t desc) \
+{ \
+ intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
+ uint64_t *d = vd, *g = vg; \
+ do { \
+ uint64_t out = 0, pg = g[j]; \
+ do { \
+ i -= sizeof(TYPE), out <<= sizeof(TYPE); \
+ if (likely((pg >> (i & 63)) & 1)) { \
+ TYPE nn = *(TYPE *)(vn + H(i)); \
+ TYPE mm = *(TYPE *)(vm + H(i)); \
+ out |= OP(TYPE, nn, mm, status); \
+ } \
+ } while (i & 63); \
+ d[j--] = out; \
+ } while (i > 0); \
+}
+
+#define DO_FPCMP_PPZZ_H(NAME, OP) \
+ DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
+#define DO_FPCMP_PPZZ_S(NAME, OP) \
+ DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
+#define DO_FPCMP_PPZZ_D(NAME, OP) \
+ DO_FPCMP_PPZZ(NAME##_d, float64, H1_8, OP)
+
+#define DO_FPCMP_PPZZ_ALL(NAME, OP) \
+ DO_FPCMP_PPZZ_H(NAME, OP) \
+ DO_FPCMP_PPZZ_S(NAME, OP) \
+ DO_FPCMP_PPZZ_D(NAME, OP)
+
+#define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
+#define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
+#define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
+#define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
+#define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
+#define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
+#define DO_FCMUO(TYPE, X, Y, ST) \
+ TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
+#define DO_FACGE(TYPE, X, Y, ST) \
+ TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
+#define DO_FACGT(TYPE, X, Y, ST) \
+ TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
+
+DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
+DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
+DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
+DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
+DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
+DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
+DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
+
+#undef DO_FPCMP_PPZZ_ALL
+#undef DO_FPCMP_PPZZ_D
+#undef DO_FPCMP_PPZZ_S
+#undef DO_FPCMP_PPZZ_H
+#undef DO_FPCMP_PPZZ
+
+/* One operand floating-point comparison against zero, controlled
+ * by a predicate.
+ */
+#define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vg, \
+ void *status, uint32_t desc) \
+{ \
+ intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
+ uint64_t *d = vd, *g = vg; \
+ do { \
+ uint64_t out = 0, pg = g[j]; \
+ do { \
+ i -= sizeof(TYPE), out <<= sizeof(TYPE); \
+ if ((pg >> (i & 63)) & 1) { \
+ TYPE nn = *(TYPE *)(vn + H(i)); \
+ out |= OP(TYPE, nn, 0, status); \
+ } \
+ } while (i & 63); \
+ d[j--] = out; \
+ } while (i > 0); \
+}
+
+#define DO_FPCMP_PPZ0_H(NAME, OP) \
+ DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
+#define DO_FPCMP_PPZ0_S(NAME, OP) \
+ DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
+#define DO_FPCMP_PPZ0_D(NAME, OP) \
+ DO_FPCMP_PPZ0(NAME##_d, float64, H1_8, OP)
+
+#define DO_FPCMP_PPZ0_ALL(NAME, OP) \
+ DO_FPCMP_PPZ0_H(NAME, OP) \
+ DO_FPCMP_PPZ0_S(NAME, OP) \
+ DO_FPCMP_PPZ0_D(NAME, OP)
+
+DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
+DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
+DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
+DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
+DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
+DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
+
+/* FP Trig Multiply-Add. */
+
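+/*
+ * The fixed coefficient tables below hold the architected sine-series
+ * constants in the first eight entries and the cosine-series constants
+ * in the second eight; a negative multiplicand selects the second half
+ * via xx += 8.
+ */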
+void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
+{
+ static const float16 coeff[16] = {
+ 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ };
+ intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
+ intptr_t x = simd_data(desc);
+ float16 *d = vd, *n = vn, *m = vm;
+ for (i = 0; i < opr_sz; i++) {
+ float16 mm = m[i];
+ intptr_t xx = x;
+ if (float16_is_neg(mm)) {
+ mm = float16_abs(mm);
+ xx += 8;
+ }
+ d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
+ }
+}
+
+void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
+{
+ static const float32 coeff[16] = {
+ 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
+ 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
+ 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
+ 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
+ };
+ intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
+ intptr_t x = simd_data(desc);
+ float32 *d = vd, *n = vn, *m = vm;
+ for (i = 0; i < opr_sz; i++) {
+ float32 mm = m[i];
+ intptr_t xx = x;
+ if (float32_is_neg(mm)) {
+ mm = float32_abs(mm);
+ xx += 8;
+ }
+ d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
+ }
+}
+
+void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
+{
+ static const float64 coeff[16] = {
+ 0x3ff0000000000000ull, 0xbfc5555555555543ull,
+ 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
+ 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
+ 0x3de5d8408868552full, 0x0000000000000000ull,
+ 0x3ff0000000000000ull, 0xbfe0000000000000ull,
+ 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
+ 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
+ 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
+ };
+ intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
+ intptr_t x = simd_data(desc);
+ float64 *d = vd, *n = vn, *m = vm;
+ for (i = 0; i < opr_sz; i++) {
+ float64 mm = m[i];
+ intptr_t xx = x;
+ if (float64_is_neg(mm)) {
+ mm = float64_abs(mm);
+ xx += 8;
+ }
+ d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
+ }
+}
+
+/*
+ * FP Complex Add
+ */
+
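+/*
+ * The single bit in simd_data(desc) selects the rotation.  Exactly one of
+ * neg_real/neg_imag has its sign bit set; XORing it into the second
+ * operand negates either the imaginary input added to the real result
+ * (rotate by 90) or the real input added to the imaginary result
+ * (rotate by 270).
+ */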
+void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
+ void *vs, uint32_t desc)
+{
+ intptr_t j, i = simd_oprsz(desc);
+ uint64_t *g = vg;
+ float16 neg_imag = float16_set_sign(0, simd_data(desc));
+ float16 neg_real = float16_chs(neg_imag);
+
+ do {
+ uint64_t pg = g[(i - 1) >> 6];
+ do {
+ float16 e0, e1, e2, e3;
+
+ /* I holds the real index; J holds the imag index. */
+ j = i - sizeof(float16);
+ i -= 2 * sizeof(float16);
+
+ e0 = *(float16 *)(vn + H1_2(i));
+ e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
+ e2 = *(float16 *)(vn + H1_2(j));
+ e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
+
+ if (likely((pg >> (i & 63)) & 1)) {
+ *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
+ }
+ if (likely((pg >> (j & 63)) & 1)) {
+ *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
+ }
+ } while (i & 63);
+ } while (i != 0);
+}
+
+void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
+ void *vs, uint32_t desc)
+{
+ intptr_t j, i = simd_oprsz(desc);
+ uint64_t *g = vg;
+ float32 neg_imag = float32_set_sign(0, simd_data(desc));
+ float32 neg_real = float32_chs(neg_imag);
+
+ do {
+ uint64_t pg = g[(i - 1) >> 6];
+ do {
+ float32 e0, e1, e2, e3;
+
+ /* I holds the real index; J holds the imag index. */
+ j = i - sizeof(float32);
+ i -= 2 * sizeof(float32);
+
+ e0 = *(float32 *)(vn + H1_4(i));
+ e1 = *(float32 *)(vm + H1_4(j)) ^ neg_real;
+ e2 = *(float32 *)(vn + H1_4(j));
+ e3 = *(float32 *)(vm + H1_4(i)) ^ neg_imag;
+
+ if (likely((pg >> (i & 63)) & 1)) {
+ *(float32 *)(vd + H1_4(i)) = float32_add(e0, e1, vs);
+ }
+ if (likely((pg >> (j & 63)) & 1)) {
+ *(float32 *)(vd + H1_4(j)) = float32_add(e2, e3, vs);
+ }
+ } while (i & 63);
+ } while (i != 0);
+}
+
+void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
+ void *vs, uint32_t desc)
+{
+ intptr_t j, i = simd_oprsz(desc);
+ uint64_t *g = vg;
+ float64 neg_imag = float64_set_sign(0, simd_data(desc));
+ float64 neg_real = float64_chs(neg_imag);
+
+ do {
+ uint64_t pg = g[(i - 1) >> 6];
+ do {
+ float64 e0, e1, e2, e3;
+
+ /* I holds the real index; J holds the imag index. */
+ j = i - sizeof(float64);
+ i -= 2 * sizeof(float64);
+
+ e0 = *(float64 *)(vn + i);
+ e1 = *(float64 *)(vm + j) ^ neg_real;
+ e2 = *(float64 *)(vn + j);
+ e3 = *(float64 *)(vm + i) ^ neg_imag;
+
+ if (likely((pg >> (i & 63)) & 1)) {
+ *(float64 *)(vd + i) = float64_add(e0, e1, vs);
+ }
+ if (likely((pg >> (j & 63)) & 1)) {
+ *(float64 *)(vd + j) = float64_add(e2, e3, vs);
+ }
+ } while (i & 63);
+ } while (i != 0);
+}
+
+/*
+ * FP Complex Multiply
+ */
+
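+/*
+ * simd_data(desc) holds the two-bit FCMLA rotation (0, 90, 180, 270
+ * degrees).  Bit 0 ("flip") selects whether the products use the real or
+ * the imaginary element of Zn; neg_real negates the Zm input to the real
+ * accumulation for rotations 90 and 180, and neg_imag negates the Zm
+ * input to the imaginary accumulation for rotations 180 and 270.
+ */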
+void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
+{
+ intptr_t j, i = simd_oprsz(desc);
+ unsigned rot = simd_data(desc);
+ bool flip = rot & 1;
+ float16 neg_imag, neg_real;
+ uint64_t *g = vg;
+
+ neg_imag = float16_set_sign(0, (rot & 2) != 0);
+ neg_real = float16_set_sign(0, rot == 1 || rot == 2);
+
+ do {
+ uint64_t pg = g[(i - 1) >> 6];
+ do {
+ float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
+
+ /* I holds the real index; J holds the imag index. */
+ j = i - sizeof(float16);
+ i -= 2 * sizeof(float16);
+
+ nr = *(float16 *)(vn + H1_2(i));
+ ni = *(float16 *)(vn + H1_2(j));
+ mr = *(float16 *)(vm + H1_2(i));
+ mi = *(float16 *)(vm + H1_2(j));
+
+ e2 = (flip ? ni : nr);
+ e1 = (flip ? mi : mr) ^ neg_real;
+ e4 = e2;
+ e3 = (flip ? mr : mi) ^ neg_imag;
+
+ if (likely((pg >> (i & 63)) & 1)) {
+ d = *(float16 *)(va + H1_2(i));
+ d = float16_muladd(e2, e1, d, 0, status);
+ *(float16 *)(vd + H1_2(i)) = d;
+ }
+ if (likely((pg >> (j & 63)) & 1)) {
+ d = *(float16 *)(va + H1_2(j));
+ d = float16_muladd(e4, e3, d, 0, status);
+ *(float16 *)(vd + H1_2(j)) = d;
+ }
+ } while (i & 63);
+ } while (i != 0);
+}
+
+void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
+{
+ intptr_t j, i = simd_oprsz(desc);
+ unsigned rot = simd_data(desc);
+ bool flip = rot & 1;
+ float32 neg_imag, neg_real;
+ uint64_t *g = vg;
+
+ neg_imag = float32_set_sign(0, (rot & 2) != 0);
+ neg_real = float32_set_sign(0, rot == 1 || rot == 2);
+
+ do {
+ uint64_t pg = g[(i - 1) >> 6];
+ do {
+ float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
+
+ /* I holds the real index; J holds the imag index. */
+ j = i - sizeof(float32);
+ i -= 2 * sizeof(float32);
+
+ nr = *(float32 *)(vn + H1_4(i));
+ ni = *(float32 *)(vn + H1_4(j));
+ mr = *(float32 *)(vm + H1_4(i));
+ mi = *(float32 *)(vm + H1_4(j));
+
+ e2 = (flip ? ni : nr);
+ e1 = (flip ? mi : mr) ^ neg_real;
+ e4 = e2;
+ e3 = (flip ? mr : mi) ^ neg_imag;
+
+ if (likely((pg >> (i & 63)) & 1)) {
+ d = *(float32 *)(va + H1_4(i));
+ d = float32_muladd(e2, e1, d, 0, status);
+ *(float32 *)(vd + H1_4(i)) = d;
+ }
+ if (likely((pg >> (j & 63)) & 1)) {
+ d = *(float32 *)(va + H1_4(j));
+ d = float32_muladd(e4, e3, d, 0, status);
+ *(float32 *)(vd + H1_4(j)) = d;
+ }
+ } while (i & 63);
+ } while (i != 0);
+}
+
+void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
+{
+ intptr_t j, i = simd_oprsz(desc);
+ unsigned rot = simd_data(desc);
+ bool flip = rot & 1;
+ float64 neg_imag, neg_real;
+ uint64_t *g = vg;
+
+ neg_imag = float64_set_sign(0, (rot & 2) != 0);
+ neg_real = float64_set_sign(0, rot == 1 || rot == 2);
+
+ do {
+ uint64_t pg = g[(i - 1) >> 6];
+ do {
+ float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
+
+ /* I holds the real index; J holds the imag index. */
+ j = i - sizeof(float64);
+ i -= 2 * sizeof(float64);
+
+ nr = *(float64 *)(vn + i);
+ ni = *(float64 *)(vn + j);
+ mr = *(float64 *)(vm + i);
+ mi = *(float64 *)(vm + j);
+
+ e2 = (flip ? ni : nr);
+ e1 = (flip ? mi : mr) ^ neg_real;
+ e4 = e2;
+ e3 = (flip ? mr : mi) ^ neg_imag;
+
+ if (likely((pg >> (i & 63)) & 1)) {
+ d = *(float64 *)(va + i);
+ d = float64_muladd(e2, e1, d, 0, status);
+ *(float64 *)(vd + i) = d;
+ }
+ if (likely((pg >> (j & 63)) & 1)) {
+ d = *(float64 *)(va + j);
+ d = float64_muladd(e4, e3, d, 0, status);
+ *(float64 *)(vd + j) = d;
+ }
+ } while (i & 63);
+ } while (i != 0);
+}
+
+/*
+ * Load contiguous data, protected by a governing predicate.
+ */
+
+/*
+ * Skip through a sequence of inactive elements in the guarding predicate @vg,
+ * beginning at @reg_off, bounded by @reg_max.  Return the offset of the
+ * first active element >= @reg_off, or @reg_max if there are no active
+ * elements at all.
+ */
+static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
+ intptr_t reg_max, int esz)
+{
+ uint64_t pg_mask = pred_esz_masks[esz];
+ uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
+
+ /* In normal usage, the first element is active. */
+ if (likely(pg & 1)) {
+ return reg_off;
+ }
+
+ if (pg == 0) {
+ reg_off &= -64;
+ do {
+ reg_off += 64;
+ if (unlikely(reg_off >= reg_max)) {
+ /* The entire predicate was false. */
+ return reg_max;
+ }
+ pg = vg[reg_off >> 6] & pg_mask;
+ } while (pg == 0);
+ }
+ reg_off += ctz64(pg);
+
+ /* We should never see an out of range predicate bit set. */
+ tcg_debug_assert(reg_off < reg_max);
+ return reg_off;
+}
+
+/*
+ * Resolve the guest virtual address to info->host and info->flags.
+ * If @nofault, return false if the page is invalid, otherwise
+ * exit via page fault exception.
+ */
+
+bool sve_probe_page(SVEHostPage *info, bool nofault, CPUARMState *env,
+ target_ulong addr, int mem_off, MMUAccessType access_type,
+ int mmu_idx, uintptr_t retaddr)
+{
+ int flags;
+
+ addr += mem_off;
+
+ /*
+ * User-only currently always issues with TBI. See the comment
+ * above useronly_clean_ptr. Usually we clean this top byte away
+ * during translation, but we can't do that for e.g. vector + imm
+ * addressing modes.
+ *
+ * We currently always enable TBI for user-only, and do not provide
+ * a way to turn it off. So clean the pointer unconditionally here,
+ * rather than look it up here, or pass it down from above.
+ */
+ addr = useronly_clean_ptr(addr);
+
+#ifdef CONFIG_USER_ONLY
+ flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault,
+ &info->host, retaddr);
+#else
+ CPUTLBEntryFull *full;
+ flags = probe_access_full(env, addr, access_type, mmu_idx, nofault,
+ &info->host, &full, retaddr);
+#endif
+ info->flags = flags;
+
+ if (flags & TLB_INVALID_MASK) {
+ g_assert(nofault);
+ return false;
+ }
+
+#ifdef CONFIG_USER_ONLY
+ memset(&info->attrs, 0, sizeof(info->attrs));
+ /* Require both ANON and MTE; see allocation_tag_mem(). */
+ info->tagged = (flags & PAGE_ANON) && (flags & PAGE_MTE);
+#else
+ info->attrs = full->attrs;
+ info->tagged = full->pte_attrs == 0xf0;
+#endif
+
+ /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
+ info->host -= mem_off;
+ return true;
+}
+
+/*
+ * Find first active element on each page, and a loose bound for the
+ * final element on each page. Identify any single element that spans
+ * the page boundary. Return true if there are any active elements.
+ */
+bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr, uint64_t *vg,
+ intptr_t reg_max, int esz, int msize)
+{
+ const int esize = 1 << esz;
+ const uint64_t pg_mask = pred_esz_masks[esz];
+ intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
+ intptr_t mem_off_last, mem_off_split;
+ intptr_t page_split, elt_split;
+ intptr_t i;
+
+ /* Set all of the element indices to -1, and the TLB data to 0. */
+ memset(info, -1, offsetof(SVEContLdSt, page));
+ memset(info->page, 0, sizeof(info->page));
+
+ /* Gross scan over the entire predicate to find bounds. */
+ i = 0;
+ do {
+ uint64_t pg = vg[i] & pg_mask;
+ if (pg) {
+ reg_off_last = i * 64 + 63 - clz64(pg);
+ if (reg_off_first < 0) {
+ reg_off_first = i * 64 + ctz64(pg);
+ }
+ }
+ } while (++i * 64 < reg_max);
+
+ if (unlikely(reg_off_first < 0)) {
+ /* No active elements, no pages touched. */
+ return false;
+ }
+ tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
+
+ info->reg_off_first[0] = reg_off_first;
+ info->mem_off_first[0] = (reg_off_first >> esz) * msize;
+ mem_off_last = (reg_off_last >> esz) * msize;
+
+ page_split = -(addr | TARGET_PAGE_MASK);
+ if (likely(mem_off_last + msize <= page_split)) {
+ /* The entire operation fits within a single page. */
+ info->reg_off_last[0] = reg_off_last;
+ return true;
+ }
+
+ info->page_split = page_split;
+ elt_split = page_split / msize;
+ reg_off_split = elt_split << esz;
+ mem_off_split = elt_split * msize;
+
+ /*
+ * This is the last full element on the first page, but it is not
+ * necessarily active. If there is no full element, i.e. the first
+ * active element is the one that's split, this value remains -1.
+ * It is useful as an iteration bound.
+ */
+ if (elt_split != 0) {
+ info->reg_off_last[0] = reg_off_split - esize;
+ }
+
+ /* Determine if an unaligned element spans the pages. */
+ if (page_split % msize != 0) {
+ /* It is helpful to know if the split element is active. */
+ if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
+ info->reg_off_split = reg_off_split;
+ info->mem_off_split = mem_off_split;
+
+ if (reg_off_split == reg_off_last) {
+ /* The page crossing element is last. */
+ return true;
+ }
+ }
+ reg_off_split += esize;
+ mem_off_split += msize;
+ }
+
+ /*
+ * We do want the first active element on the second page, because
+ * this may affect the address reported in an exception.
+ */
+ reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
+ tcg_debug_assert(reg_off_split <= reg_off_last);
+ info->reg_off_first[1] = reg_off_split;
+ info->mem_off_first[1] = (reg_off_split >> esz) * msize;
+ info->reg_off_last[1] = reg_off_last;
+ return true;
+}
+
+/*
+ * Resolve the guest virtual addresses to info->page[].
+ * Control the generation of page faults with @fault. Return false if
+ * there is no work to do, which can only happen with @fault == FAULT_NO.
+ */
+bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
+ CPUARMState *env, target_ulong addr,
+ MMUAccessType access_type, uintptr_t retaddr)
+{
+ int mmu_idx = cpu_mmu_index(env, false);
+ int mem_off = info->mem_off_first[0];
+ bool nofault = fault == FAULT_NO;
+ bool have_work = true;
+
+ if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
+ access_type, mmu_idx, retaddr)) {
+ /* No work to be done. */
+ return false;
+ }
+
+ if (likely(info->page_split < 0)) {
+ /* The entire operation was on the one page. */
+ return true;
+ }
+
+ /*
+ * If the second page is invalid, then we want the fault address to be
+ * the first byte on that page which is accessed.
+ */
+ if (info->mem_off_split >= 0) {
+ /*
+ * There is an element split across the pages. The fault address
+ * should be the first byte of the second page.
+ */
+ mem_off = info->page_split;
+ /*
+ * If the split element is also the first active element
+ * of the vector, then: For first-fault we should continue
+ * to generate faults for the second page. For no-fault,
+ * we have work only if the second page is valid.
+ */
+ if (info->mem_off_first[0] < info->mem_off_split) {
+ nofault = FAULT_FIRST;
+ have_work = false;
+ }
+ } else {
+ /*
+ * There is no element split across the pages. The fault address
+ * should be the first active element on the second page.
+ */
+ mem_off = info->mem_off_first[1];
+ /*
+ * There must have been one active element on the first page,
+ * so we're out of first-fault territory.
+ */
+ nofault = fault != FAULT_ALL;
+ }
+
+ have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
+ access_type, mmu_idx, retaddr);
+ return have_work;
+}
+
+#ifndef CONFIG_USER_ONLY
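+/*
+ * Check watchpoints for all active elements of a contiguous load/store.
+ * The page probe left TLB_WATCHPOINT set in page[].flags if any part of
+ * a page is watched; clear it here to record that watchpoints have been
+ * handled, then walk the active elements on each page, plus the element
+ * split across the page boundary, if any.
+ */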
+void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
+ uint64_t *vg, target_ulong addr,
+ int esize, int msize, int wp_access,
+ uintptr_t retaddr)
+{
+ intptr_t mem_off, reg_off, reg_last;
+ int flags0 = info->page[0].flags;
+ int flags1 = info->page[1].flags;
+
+ if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
+ return;
+ }
+
+ /* Indicate that watchpoints are handled. */
+ info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
+ info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
+
+ if (flags0 & TLB_WATCHPOINT) {
+ mem_off = info->mem_off_first[0];
+ reg_off = info->reg_off_first[0];
+ reg_last = info->reg_off_last[0];
+
+ while (reg_off <= reg_last) {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+ cpu_check_watchpoint(env_cpu(env), addr + mem_off,
+ msize, info->page[0].attrs,
+ wp_access, retaddr);
+ }
+ reg_off += esize;
+ mem_off += msize;
+ } while (reg_off <= reg_last && (reg_off & 63));
+ }
+ }
+
+ mem_off = info->mem_off_split;
+ if (mem_off >= 0) {
+ cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
+ info->page[0].attrs, wp_access, retaddr);
+ }
+
+ mem_off = info->mem_off_first[1];
+ if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
+ reg_off = info->reg_off_first[1];
+ reg_last = info->reg_off_last[1];
+
+ do {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+ cpu_check_watchpoint(env_cpu(env), addr + mem_off,
+ msize, info->page[1].attrs,
+ wp_access, retaddr);
+ }
+ reg_off += esize;
+ mem_off += msize;
+ } while (reg_off & 63);
+ } while (reg_off <= reg_last);
+ }
+}
+#endif
+
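+/*
+ * Run the MTE check for every active element of a contiguous load/store,
+ * for each page whose memory attribute is Tagged.  A tag mismatch is
+ * reported by mte_check() itself.
+ */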
+void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
+ uint64_t *vg, target_ulong addr, int esize,
+ int msize, uint32_t mtedesc, uintptr_t ra)
+{
+ intptr_t mem_off, reg_off, reg_last;
+
+ /* Process the page only if MemAttr == Tagged. */
+ if (info->page[0].tagged) {
+ mem_off = info->mem_off_first[0];
+ reg_off = info->reg_off_first[0];
+ reg_last = info->reg_off_split;
+ if (reg_last < 0) {
+ reg_last = info->reg_off_last[0];
+ }
+
+ do {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+ mte_check(env, mtedesc, addr + mem_off, ra);
+ }
+ reg_off += esize;
+ mem_off += msize;
+ } while (reg_off <= reg_last && (reg_off & 63));
+ } while (reg_off <= reg_last);
+ }
+
+ mem_off = info->mem_off_first[1];
+ if (mem_off >= 0 && info->page[1].tagged) {
+ reg_off = info->reg_off_first[1];
+ reg_last = info->reg_off_last[1];
+
+ do {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+ mte_check(env, mtedesc, addr + mem_off, ra);
+ }
+ reg_off += esize;
+ mem_off += msize;
+ } while (reg_off & 63);
+ } while (reg_off <= reg_last);
+ }
+}
+
+/*
+ * Common helper for all contiguous 1,2,3,4-register predicated loads.
+ */
+static inline QEMU_ALWAYS_INLINE
+void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
+ uint32_t desc, const uintptr_t retaddr,
+ const int esz, const int msz, const int N, uint32_t mtedesc,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
+{
+ const unsigned rd = simd_data(desc);
+ const intptr_t reg_max = simd_oprsz(desc);
+ intptr_t reg_off, reg_last, mem_off;
+ SVEContLdSt info;
+ void *host;
+ int flags, i;
+
+ /* Find the active elements. */
+ if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
+ /* The entire predicate was false; no load occurs. */
+ for (i = 0; i < N; ++i) {
+ memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
+ }
+ return;
+ }
+
+ /* Probe the page(s). Exit with exception for any invalid page. */
+ sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
+
+ /* Handle watchpoints for all active elements. */
+ sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
+ BP_MEM_READ, retaddr);
+
+ /*
+ * Handle mte checks for all active elements.
+ * Since TBI must be set for MTE, !mtedesc => !mte_active.
+ */
+ if (mtedesc) {
+ sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
+ mtedesc, retaddr);
+ }
+
+ flags = info.page[0].flags | info.page[1].flags;
+ if (unlikely(flags != 0)) {
+#ifdef CONFIG_USER_ONLY
+ g_assert_not_reached();
+#else
+ /*
+ * At least one page includes MMIO.
+ * Any bus operation can fail with cpu_transaction_failed,
+ * which for ARM will raise SyncExternal. Perform the load
+ * into scratch memory to preserve register state until the end.
+ */
+ ARMVectorReg scratch[4] = { };
+
+ mem_off = info.mem_off_first[0];
+ reg_off = info.reg_off_first[0];
+ reg_last = info.reg_off_last[1];
+ if (reg_last < 0) {
+ reg_last = info.reg_off_split;
+ if (reg_last < 0) {
+ reg_last = info.reg_off_last[0];
+ }
+ }
+
+ do {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+ for (i = 0; i < N; ++i) {
+ tlb_fn(env, &scratch[i], reg_off,
+ addr + mem_off + (i << msz), retaddr);
+ }
+ }
+ reg_off += 1 << esz;
+ mem_off += N << msz;
+ } while (reg_off & 63);
+ } while (reg_off <= reg_last);
+
+ for (i = 0; i < N; ++i) {
+ memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
+ }
+ return;
+#endif
+ }
+
+ /* The entire operation is in RAM, on valid pages. */
+
+ for (i = 0; i < N; ++i) {
+ memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
+ }
+
+ mem_off = info.mem_off_first[0];
+ reg_off = info.reg_off_first[0];
+ reg_last = info.reg_off_last[0];
+ host = info.page[0].host;
+
+ while (reg_off <= reg_last) {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+ for (i = 0; i < N; ++i) {
+ host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
+ host + mem_off + (i << msz));
+ }
+ }
+ reg_off += 1 << esz;
+ mem_off += N << msz;
+ } while (reg_off <= reg_last && (reg_off & 63));
+ }
+
+ /*
+ * Use the slow path to manage the cross-page misalignment.
+ * But we know this is RAM and cannot trap.
+ */
+ mem_off = info.mem_off_split;
+ if (unlikely(mem_off >= 0)) {
+ reg_off = info.reg_off_split;
+ for (i = 0; i < N; ++i) {
+ tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
+ addr + mem_off + (i << msz), retaddr);
+ }
+ }
+
+ mem_off = info.mem_off_first[1];
+ if (unlikely(mem_off >= 0)) {
+ reg_off = info.reg_off_first[1];
+ reg_last = info.reg_off_last[1];
+ host = info.page[1].host;
+
+ do {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+ for (i = 0; i < N; ++i) {
+ host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
+ host + mem_off + (i << msz));
+ }
+ }
+ reg_off += 1 << esz;
+ mem_off += N << msz;
+ } while (reg_off & 63);
+ } while (reg_off <= reg_last);
+ }
+}
+
+static inline QEMU_ALWAYS_INLINE
+void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
+ uint32_t desc, const uintptr_t ra,
+ const int esz, const int msz, const int N,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
+{
+ uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+ int bit55 = extract64(addr, 55, 1);
+
+ /* Remove mtedesc from the normal sve descriptor. */
+ desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+
+ /* Perform gross MTE suppression early. */
+ if (!tbi_check(desc, bit55) ||
+ tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
+ mtedesc = 0;
+ }
+
+ sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
+}
+
+#define DO_LD1_1(NAME, ESZ) \
+void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \
+ sve_##NAME##_host, sve_##NAME##_tlb); \
+} \
+void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \
+ sve_##NAME##_host, sve_##NAME##_tlb); \
+}
+
+#define DO_LD1_2(NAME, ESZ, MSZ) \
+void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
+ sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
+} \
+void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
+ sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
+} \
+void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
+ sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
+} \
+void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
+ sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
+}
+
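+/*
+ * Naming convention for the load helpers: ld1<msz><esz>[u|s], where the
+ * first letter is the memory access size (b/h/s/d), the second the
+ * register element size, and the trailing u/s selects zero- or
+ * sign-extension (omitted when the two sizes match); e.g. ld1bhu loads
+ * bytes zero-extended into 16-bit elements.
+ */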
+DO_LD1_1(ld1bb, MO_8)
+DO_LD1_1(ld1bhu, MO_16)
+DO_LD1_1(ld1bhs, MO_16)
+DO_LD1_1(ld1bsu, MO_32)
+DO_LD1_1(ld1bss, MO_32)
+DO_LD1_1(ld1bdu, MO_64)
+DO_LD1_1(ld1bds, MO_64)
+
+DO_LD1_2(ld1hh, MO_16, MO_16)
+DO_LD1_2(ld1hsu, MO_32, MO_16)
+DO_LD1_2(ld1hss, MO_32, MO_16)
+DO_LD1_2(ld1hdu, MO_64, MO_16)
+DO_LD1_2(ld1hds, MO_64, MO_16)
+
+DO_LD1_2(ld1ss, MO_32, MO_32)
+DO_LD1_2(ld1sdu, MO_64, MO_32)
+DO_LD1_2(ld1sds, MO_64, MO_32)
+
+DO_LD1_2(ld1dd, MO_64, MO_64)
+
+#undef DO_LD1_1
+#undef DO_LD1_2
+
+#define DO_LDN_1(N) \
+void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \
+ sve_ld1bb_host, sve_ld1bb_tlb); \
+} \
+void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \
+ sve_ld1bb_host, sve_ld1bb_tlb); \
+}
+
+#define DO_LDN_2(N, SUFF, ESZ) \
+void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
+ sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
+} \
+void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
+ sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
+} \
+void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
+ sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
+} \
+void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
+ sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
+}
+
+DO_LDN_1(2)
+DO_LDN_1(3)
+DO_LDN_1(4)
+
+DO_LDN_2(2, hh, MO_16)
+DO_LDN_2(3, hh, MO_16)
+DO_LDN_2(4, hh, MO_16)
+
+DO_LDN_2(2, ss, MO_32)
+DO_LDN_2(3, ss, MO_32)
+DO_LDN_2(4, ss, MO_32)
+
+DO_LDN_2(2, dd, MO_64)
+DO_LDN_2(3, dd, MO_64)
+DO_LDN_2(4, dd, MO_64)
+
+#undef DO_LDN_1
+#undef DO_LDN_2
+
+/*
+ * Load contiguous data, first-fault and no-fault.
+ *
+ * For user-only, one could argue that we should hold the mmap_lock during
+ * the operation so that there is no race between page_check_range and the
+ * load operation. However, unmapping pages out from under a running thread
+ * is extraordinarily unlikely. This theoretical race condition also affects
+ * linux-user/ in its get_user/put_user macros.
+ *
+ * TODO: Construct some helpers, written in assembly, that interact with
+ * host_signal_handler to produce memory ops which can properly report errors
+ * without racing.
+ */
+
+/* Fault on byte I. All bits in FFR from I are cleared. The vector
+ * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
+ * option, which leaves subsequent data unchanged.
+ */
+static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
+{
+ uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
+
+ if (i & 63) {
+ ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
+ i = ROUND_UP(i, 64);
+ }
+ for (; i < oprsz; i += 64) {
+ ffr[i / 64] = 0;
+ }
+}
+
+/*
+ * Common helper for all contiguous no-fault and first-fault loads.
+ */
+static inline QEMU_ALWAYS_INLINE
+void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
+ uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
+ const int esz, const int msz, const SVEContFault fault,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
+{
+ const unsigned rd = simd_data(desc);
+ void *vd = &env->vfp.zregs[rd];
+ const intptr_t reg_max = simd_oprsz(desc);
+ intptr_t reg_off, mem_off, reg_last;
+ SVEContLdSt info;
+ int flags;
+ void *host;
+
+ /* Find the active elements. */
+ if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
+ /* The entire predicate was false; no load occurs. */
+ memset(vd, 0, reg_max);
+ return;
+ }
+ reg_off = info.reg_off_first[0];
+
+ /* Probe the page(s). */
+ if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
+ /* Fault on first element. */
+ tcg_debug_assert(fault == FAULT_NO);
+ memset(vd, 0, reg_max);
+ goto do_fault;
+ }
+
+ mem_off = info.mem_off_first[0];
+ flags = info.page[0].flags;
+
+ /*
+ * Disable MTE checking if the Tagged bit is not set. Since TBI must
+ * be set within MTEDESC for MTE, !mtedesc => !mte_active.
+ */
+ if (!info.page[0].tagged) {
+ mtedesc = 0;
+ }
+
+ if (fault == FAULT_FIRST) {
+ /* Trapping mte check for the first-fault element. */
+ if (mtedesc) {
+ mte_check(env, mtedesc, addr + mem_off, retaddr);
+ }
+
+ /*
+ * Special handling of the first active element,
+ * if it crosses a page boundary or is MMIO.
+ */
+ bool is_split = mem_off == info.mem_off_split;
+ if (unlikely(flags != 0) || unlikely(is_split)) {
+ /*
+ * Use the slow path for cross-page handling.
+ * Might trap for MMIO or watchpoints.
+ */
+ tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
+
+ /* After any fault, zero the other elements. */
+ swap_memzero(vd, reg_off);
+ reg_off += 1 << esz;
+ mem_off += 1 << msz;
+ swap_memzero(vd + reg_off, reg_max - reg_off);
+
+ if (is_split) {
+ goto second_page;
+ }
+ } else {
+ memset(vd, 0, reg_max);
+ }
+ } else {
+ memset(vd, 0, reg_max);
+ if (unlikely(mem_off == info.mem_off_split)) {
+ /* The first active element crosses a page boundary. */
+ flags |= info.page[1].flags;
+ if (unlikely(flags & TLB_MMIO)) {
+ /* Some page is MMIO, see below. */
+ goto do_fault;
+ }
+ if (unlikely(flags & TLB_WATCHPOINT) &&
+ (cpu_watchpoint_address_matches
+ (env_cpu(env), addr + mem_off, 1 << msz)
+ & BP_MEM_READ)) {
+ /* Watchpoint hit, see below. */
+ goto do_fault;
+ }
+ if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
+ goto do_fault;
+ }
+ /*
+ * Use the slow path for cross-page handling.
+ * This is RAM, without a watchpoint, and will not trap.
+ */
+ tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
+ goto second_page;
+ }
+ }
+
+ /*
+ * From this point on, all memory operations are MemSingleNF.
+ *
+ * Per the MemSingleNF pseudocode, a no-fault load from Device memory
+ * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
+ *
+ * Unfortunately we do not have access to the memory attributes from the
+ * PTE to tell Device memory from Normal memory. So we make a mostly
+ * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
+ * This gives the right answer for the common cases of "Normal memory,
+ * backed by host RAM" and "Device memory, backed by MMIO".
+ * The architecture allows us to suppress an NF load and return
+ * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
+ * case of "Normal memory, backed by MMIO" is permitted. The case we
+ * get wrong is "Device memory, backed by host RAM", for which we
+ * should return (UNKNOWN, FAULT) but do not.
+ *
+ * Similarly, CPU_BP breakpoints would raise exceptions, and so
+ * return (UNKNOWN, FAULT). For simplicity, we consider gdb and
+ * architectural breakpoints the same.
+ */
+ if (unlikely(flags & TLB_MMIO)) {
+ goto do_fault;
+ }
+
+ reg_last = info.reg_off_last[0];
+ host = info.page[0].host;
+
+ do {
+ uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+ if (unlikely(flags & TLB_WATCHPOINT) &&
+ (cpu_watchpoint_address_matches
+ (env_cpu(env), addr + mem_off, 1 << msz)
+ & BP_MEM_READ)) {
+ goto do_fault;
+ }
+ if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
+ goto do_fault;
+ }
+ host_fn(vd, reg_off, host + mem_off);
+ }
+ reg_off += 1 << esz;
+ mem_off += 1 << msz;
+ } while (reg_off <= reg_last && (reg_off & 63));
+ } while (reg_off <= reg_last);
+
+ /*
+ * MemSingleNF is allowed to fail for any reason. We have special
+ * code above to handle the first element crossing a page boundary.
+ * As an implementation choice, decline to handle a cross-page element
+ * in any other position.
+ */
+ reg_off = info.reg_off_split;
+ if (reg_off >= 0) {
+ goto do_fault;
+ }
+
+ second_page:
+ reg_off = info.reg_off_first[1];
+ if (likely(reg_off < 0)) {
+ /* No active elements on the second page. All done. */
+ return;
+ }
+
+ /*
+ * MemSingleNF is allowed to fail for any reason. As an implementation
+ * choice, decline to handle elements on the second page. This should
+ * be low frequency as the guest walks through memory -- the next
+ * iteration of the guest's loop should be aligned on the page boundary,
+ * and then all following iterations will stay aligned.
+ */
+
+ do_fault:
+ record_fault(env, reg_off, reg_max);
+}
+
+static inline QEMU_ALWAYS_INLINE
+void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
+ uint32_t desc, const uintptr_t retaddr,
+ const int esz, const int msz, const SVEContFault fault,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
+{
+ uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+ int bit55 = extract64(addr, 55, 1);
+
+ /* Remove mtedesc from the normal sve descriptor. */
+ desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+
+ /* Perform gross MTE suppression early. */
+ if (!tbi_check(desc, bit55) ||
+ tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
+ mtedesc = 0;
+ }
+
+ sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
+ esz, msz, fault, host_fn, tlb_fn);
+}
+
+#define DO_LDFF1_LDNF1_1(PART, ESZ) \
+void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
+ sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
+} \
+void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
+ sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
+} \
+void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
+ sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
+} \
+void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
+ sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
+}
+
+#define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
+void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
+ sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
+} \
+void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
+ sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
+} \
+void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
+ sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
+} \
+void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
+ sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
+} \
+void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
+ sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
+} \
+void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
+ sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
+} \
+void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
+ sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
+} \
+void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
+ sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
+}
+
+DO_LDFF1_LDNF1_1(bb, MO_8)
+DO_LDFF1_LDNF1_1(bhu, MO_16)
+DO_LDFF1_LDNF1_1(bhs, MO_16)
+DO_LDFF1_LDNF1_1(bsu, MO_32)
+DO_LDFF1_LDNF1_1(bss, MO_32)
+DO_LDFF1_LDNF1_1(bdu, MO_64)
+DO_LDFF1_LDNF1_1(bds, MO_64)
+
+DO_LDFF1_LDNF1_2(hh, MO_16, MO_16)
+DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
+DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
+DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
+DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
+
+DO_LDFF1_LDNF1_2(ss, MO_32, MO_32)
+DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
+DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
+
+DO_LDFF1_LDNF1_2(dd, MO_64, MO_64)
+
+#undef DO_LDFF1_LDNF1_1
+#undef DO_LDFF1_LDNF1_2
+
+/*
+ * Common helper for all contiguous 1,2,3,4-register predicated stores.
+ */
+
+static inline QEMU_ALWAYS_INLINE
+void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
+ uint32_t desc, const uintptr_t retaddr,
+ const int esz, const int msz, const int N, uint32_t mtedesc,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
+{
+ const unsigned rd = simd_data(desc);
+ const intptr_t reg_max = simd_oprsz(desc);
+ intptr_t reg_off, reg_last, mem_off;
+ SVEContLdSt info;
+ void *host;
+ int i, flags;
+
+ /* Find the active elements. */
+ if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
+ /* The entire predicate was false; no store occurs. */
+ return;
+ }
+
+ /* Probe the page(s). Exit with exception for any invalid page. */
+ sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
+
+ /* Handle watchpoints for all active elements. */
+ sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
+ BP_MEM_WRITE, retaddr);
+
+ /*
+ * Handle mte checks for all active elements.
+ * Since TBI must be set for MTE, !mtedesc => !mte_active.
+ */
+ if (mtedesc) {
+ sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
+ mtedesc, retaddr);
+ }
+
+ flags = info.page[0].flags | info.page[1].flags;
+ if (unlikely(flags != 0)) {
+#ifdef CONFIG_USER_ONLY
+ g_assert_not_reached();
+#else
+ /*
+ * At least one page includes MMIO.
+ * Any bus operation can fail with cpu_transaction_failed,
+ * which for ARM will raise SyncExternal. We cannot avoid
+ * this fault and will leave with the store incomplete.
+ */
+ mem_off = info.mem_off_first[0];
+ reg_off = info.reg_off_first[0];
+ reg_last = info.reg_off_last[1];
+ if (reg_last < 0) {
+ reg_last = info.reg_off_split;
+ if (reg_last < 0) {
+ reg_last = info.reg_off_last[0];
+ }
+ }
+
+ do {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+ for (i = 0; i < N; ++i) {
+ tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
+ addr + mem_off + (i << msz), retaddr);
+ }
+ }
+ reg_off += 1 << esz;
+ mem_off += N << msz;
+ } while (reg_off & 63);
+ } while (reg_off <= reg_last);
+ return;
+#endif
+ }
+
+ mem_off = info.mem_off_first[0];
+ reg_off = info.reg_off_first[0];
+ reg_last = info.reg_off_last[0];
+ host = info.page[0].host;
+
+ while (reg_off <= reg_last) {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+ for (i = 0; i < N; ++i) {
+ host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
+ host + mem_off + (i << msz));
+ }
+ }
+ reg_off += 1 << esz;
+ mem_off += N << msz;
+ } while (reg_off <= reg_last && (reg_off & 63));
+ }
+
+ /*
+ * Use the slow path to manage the cross-page misalignment.
+ * But we know this is RAM and cannot trap.
+ */
+ mem_off = info.mem_off_split;
+ if (unlikely(mem_off >= 0)) {
+ reg_off = info.reg_off_split;
+ for (i = 0; i < N; ++i) {
+ tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
+ addr + mem_off + (i << msz), retaddr);
+ }
+ }
+
+ mem_off = info.mem_off_first[1];
+ if (unlikely(mem_off >= 0)) {
+ reg_off = info.reg_off_first[1];
+ reg_last = info.reg_off_last[1];
+ host = info.page[1].host;
+
+ do {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+ for (i = 0; i < N; ++i) {
+ host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
+ host + mem_off + (i << msz));
+ }
+ }
+ reg_off += 1 << esz;
+ mem_off += N << msz;
+ } while (reg_off & 63);
+ } while (reg_off <= reg_last);
+ }
+}
+
+static inline QEMU_ALWAYS_INLINE
+void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
+ uint32_t desc, const uintptr_t ra,
+ const int esz, const int msz, const int N,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
+{
+ uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+ int bit55 = extract64(addr, 55, 1);
+
+ /* Remove mtedesc from the normal sve descriptor. */
+ desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+
+ /* Perform gross MTE suppression early. */
+ if (!tbi_check(desc, bit55) ||
+ tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
+ mtedesc = 0;
+ }
+
+ sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
+}
+
+#define DO_STN_1(N, NAME, ESZ) \
+void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \
+ sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
+} \
+void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \
+ sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
+}
+
+#define DO_STN_2(N, NAME, ESZ, MSZ) \
+void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
+ sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
+} \
+void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
+ sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
+} \
+void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
+ sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
+} \
+void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
+ sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
+}
+
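+/*
+ * In the store helper names the first letter is the memory access size
+ * and the second the register element size, so st1bh, st1hs, st1sd and
+ * friends are truncating stores.  DO_STN_1 hardwires a one-byte memory
+ * access; DO_STN_2 takes both sizes and exists in little- and big-endian
+ * forms.
+ */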
+DO_STN_1(1, bb, MO_8)
+DO_STN_1(1, bh, MO_16)
+DO_STN_1(1, bs, MO_32)
+DO_STN_1(1, bd, MO_64)
+DO_STN_1(2, bb, MO_8)
+DO_STN_1(3, bb, MO_8)
+DO_STN_1(4, bb, MO_8)
+
+DO_STN_2(1, hh, MO_16, MO_16)
+DO_STN_2(1, hs, MO_32, MO_16)
+DO_STN_2(1, hd, MO_64, MO_16)
+DO_STN_2(2, hh, MO_16, MO_16)
+DO_STN_2(3, hh, MO_16, MO_16)
+DO_STN_2(4, hh, MO_16, MO_16)
+
+DO_STN_2(1, ss, MO_32, MO_32)
+DO_STN_2(1, sd, MO_64, MO_32)
+DO_STN_2(2, ss, MO_32, MO_32)
+DO_STN_2(3, ss, MO_32, MO_32)
+DO_STN_2(4, ss, MO_32, MO_32)
+
+DO_STN_2(1, dd, MO_64, MO_64)
+DO_STN_2(2, dd, MO_64, MO_64)
+DO_STN_2(3, dd, MO_64, MO_64)
+DO_STN_2(4, dd, MO_64, MO_64)
+
+#undef DO_STN_1
+#undef DO_STN_2
+
+/*
+ * Loads with a vector index.
+ */
+
+/*
+ * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
+ */
+typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
+
+static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
+{
+ return *(uint32_t *)(reg + H1_4(reg_ofs));
+}
+
+static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
+{
+ return *(int32_t *)(reg + H1_4(reg_ofs));
+}
+
+static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
+{
+ return (uint32_t)*(uint64_t *)(reg + reg_ofs);
+}
+
+static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
+{
+ return (int32_t)*(uint64_t *)(reg + reg_ofs);
+}
+
+static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
+{
+ return *(uint64_t *)(reg + reg_ofs);
+}
+
+static inline QEMU_ALWAYS_INLINE
+void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
+ target_ulong base, uint32_t desc, uintptr_t retaddr,
+ uint32_t mtedesc, int esize, int msize,
+ zreg_off_fn *off_fn,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
+{
+ const int mmu_idx = cpu_mmu_index(env, false);
+ const intptr_t reg_max = simd_oprsz(desc);
+ const int scale = simd_data(desc);
+ ARMVectorReg scratch;
+ intptr_t reg_off;
+ SVEHostPage info, info2;
+
+ memset(&scratch, 0, reg_max);
+ reg_off = 0;
+ do {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if (likely(pg & 1)) {
+ target_ulong addr = base + (off_fn(vm, reg_off) << scale);
+ target_ulong in_page = -(addr | TARGET_PAGE_MASK);
+
+ sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
+ mmu_idx, retaddr);
+
+ if (likely(in_page >= msize)) {
+ if (unlikely(info.flags & TLB_WATCHPOINT)) {
+ cpu_check_watchpoint(env_cpu(env), addr, msize,
+ info.attrs, BP_MEM_READ, retaddr);
+ }
+ if (mtedesc && info.tagged) {
+ mte_check(env, mtedesc, addr, retaddr);
+ }
+ if (unlikely(info.flags & TLB_MMIO)) {
+ tlb_fn(env, &scratch, reg_off, addr, retaddr);
+ } else {
+ host_fn(&scratch, reg_off, info.host);
+ }
+ } else {
+ /* Element crosses the page boundary. */
+ sve_probe_page(&info2, false, env, addr + in_page, 0,
+ MMU_DATA_LOAD, mmu_idx, retaddr);
+ if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
+ cpu_check_watchpoint(env_cpu(env), addr,
+ msize, info.attrs,
+ BP_MEM_READ, retaddr);
+ }
+ if (mtedesc && info.tagged) {
+ mte_check(env, mtedesc, addr, retaddr);
+ }
+ tlb_fn(env, &scratch, reg_off, addr, retaddr);
+ }
+ }
+ reg_off += esize;
+ pg >>= esize;
+ } while (reg_off & 63);
+ } while (reg_off < reg_max);
+
+ /* Wait until all exceptions have been raised to write back. */
+ memcpy(vd, &scratch, reg_max);
+}
+
+static inline QEMU_ALWAYS_INLINE
+void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
+ target_ulong base, uint32_t desc, uintptr_t retaddr,
+ int esize, int msize, zreg_off_fn *off_fn,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
+{
+ uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+ /* Remove mtedesc from the normal sve descriptor. */
+ desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+
+ /*
+ * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
+ * offset base entirely over the address space hole to change the
+ * pointer tag, or change the bit55 selector. So we could here
+ * examine TBI + TCMA like we do for sve_ldN_r_mte().
+ */
+ sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
+ esize, msize, off_fn, host_fn, tlb_fn);
+}
+
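+/*
+ * In the gather helper names, MEM encodes the memory element as for the
+ * contiguous loads (size plus zero/sign extension) and OFS names the
+ * index form: zsu = 32-bit unsigned offsets, zss = 32-bit signed offsets,
+ * zd = 64-bit offsets, matching the off_* functions above.
+ */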
+#define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
+void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
+ void *vm, target_ulong base, uint32_t desc) \
+{ \
+ sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
+ off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
+} \
+void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
+ void *vm, target_ulong base, uint32_t desc) \
+{ \
+ sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
+ off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
+}
+
+#define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
+void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
+ void *vm, target_ulong base, uint32_t desc) \
+{ \
+ sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
+ off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
+} \
+void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
+ void *vm, target_ulong base, uint32_t desc) \
+{ \
+ sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
+ off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
+}
+
+DO_LD1_ZPZ_S(bsu, zsu, MO_8)
+DO_LD1_ZPZ_S(bsu, zss, MO_8)
+DO_LD1_ZPZ_D(bdu, zsu, MO_8)
+DO_LD1_ZPZ_D(bdu, zss, MO_8)
+DO_LD1_ZPZ_D(bdu, zd, MO_8)
+
+DO_LD1_ZPZ_S(bss, zsu, MO_8)
+DO_LD1_ZPZ_S(bss, zss, MO_8)
+DO_LD1_ZPZ_D(bds, zsu, MO_8)
+DO_LD1_ZPZ_D(bds, zss, MO_8)
+DO_LD1_ZPZ_D(bds, zd, MO_8)
+
+DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
+DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
+DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
+DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
+DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
+
+DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
+DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
+DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
+DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
+DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
+
+DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
+DO_LD1_ZPZ_S(hss_le, zss, MO_16)
+DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
+DO_LD1_ZPZ_D(hds_le, zss, MO_16)
+DO_LD1_ZPZ_D(hds_le, zd, MO_16)
+
+DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
+DO_LD1_ZPZ_S(hss_be, zss, MO_16)
+DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
+DO_LD1_ZPZ_D(hds_be, zss, MO_16)
+DO_LD1_ZPZ_D(hds_be, zd, MO_16)
+
+DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
+DO_LD1_ZPZ_S(ss_le, zss, MO_32)
+DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
+DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
+DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
+
+DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
+DO_LD1_ZPZ_S(ss_be, zss, MO_32)
+DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
+DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
+DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
+
+DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
+DO_LD1_ZPZ_D(sds_le, zss, MO_32)
+DO_LD1_ZPZ_D(sds_le, zd, MO_32)
+
+DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
+DO_LD1_ZPZ_D(sds_be, zss, MO_32)
+DO_LD1_ZPZ_D(sds_be, zd, MO_32)
+
+DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
+DO_LD1_ZPZ_D(dd_le, zss, MO_64)
+DO_LD1_ZPZ_D(dd_le, zd, MO_64)
+
+DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
+DO_LD1_ZPZ_D(dd_be, zss, MO_64)
+DO_LD1_ZPZ_D(dd_be, zd, MO_64)
+
+#undef DO_LD1_ZPZ_S
+#undef DO_LD1_ZPZ_D
+
+/* First fault loads with a vector index. */
+
+/*
+ * Common helpers for all gather first-faulting loads.
+ */
+
+static inline QEMU_ALWAYS_INLINE
+void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
+ target_ulong base, uint32_t desc, uintptr_t retaddr,
+ uint32_t mtedesc, const int esz, const int msz,
+ zreg_off_fn *off_fn,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
+{
+ const int mmu_idx = cpu_mmu_index(env, false);
+ const intptr_t reg_max = simd_oprsz(desc);
+ const int scale = simd_data(desc);
+ const int esize = 1 << esz;
+ const int msize = 1 << msz;
+ intptr_t reg_off;
+ SVEHostPage info;
+ target_ulong addr, in_page;
+
+ /* Skip to the first true predicate. */
+ reg_off = find_next_active(vg, 0, reg_max, esz);
+ if (unlikely(reg_off >= reg_max)) {
+ /* The entire predicate was false; no load occurs. */
+ memset(vd, 0, reg_max);
+ return;
+ }
+
+ /*
+ * Probe the first element, allowing faults.
+ */
+ addr = base + (off_fn(vm, reg_off) << scale);
+ if (mtedesc) {
+ mte_check(env, mtedesc, addr, retaddr);
+ }
+ tlb_fn(env, vd, reg_off, addr, retaddr);
+
+ /* After any fault, zero the other elements. */
+ swap_memzero(vd, reg_off);
+ reg_off += esize;
+ swap_memzero(vd + reg_off, reg_max - reg_off);
+
+ /*
+ * Probe the remaining elements, not allowing faults.
+ */
+ while (reg_off < reg_max) {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if (likely((pg >> (reg_off & 63)) & 1)) {
+ addr = base + (off_fn(vm, reg_off) << scale);
+ in_page = -(addr | TARGET_PAGE_MASK);
+
+ if (unlikely(in_page < msize)) {
+ /* Stop if the element crosses a page boundary. */
+ goto fault;
+ }
+
+ sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
+ mmu_idx, retaddr);
+ if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
+ goto fault;
+ }
+ if (unlikely(info.flags & TLB_WATCHPOINT) &&
+ (cpu_watchpoint_address_matches
+ (env_cpu(env), addr, msize) & BP_MEM_READ)) {
+ goto fault;
+ }
+ if (mtedesc && info.tagged && !mte_probe(env, mtedesc, addr)) {
+ goto fault;
+ }
+
+ host_fn(vd, reg_off, info.host);
+ }
+ reg_off += esize;
+ } while (reg_off & 63);
+ }
+ return;
+
+ fault:
+ record_fault(env, reg_off, reg_max);
+}
+
+static inline QEMU_ALWAYS_INLINE
+void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
+ target_ulong base, uint32_t desc, uintptr_t retaddr,
+ const int esz, const int msz,
+ zreg_off_fn *off_fn,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
+{
+ uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+ /* Remove mtedesc from the normal sve descriptor. */
+ desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+
+ /*
+ * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
+ * offset base entirely over the address space hole to change the
+ * pointer tag, or change the bit55 selector. So we could here
+ * examine TBI + TCMA like we do for sve_ldN_r_mte().
+ */
+ sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
+ esz, msz, off_fn, host_fn, tlb_fn);
+}
+
+#define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \
+void HELPER(sve_ldff##MEM##_##OFS) \
+ (CPUARMState *env, void *vd, void *vg, \
+ void *vm, target_ulong base, uint32_t desc) \
+{ \
+ sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \
+ off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
+} \
+void HELPER(sve_ldff##MEM##_##OFS##_mte) \
+ (CPUARMState *env, void *vd, void *vg, \
+ void *vm, target_ulong base, uint32_t desc) \
+{ \
+ sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \
+ off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
+}
+
+#define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \
+void HELPER(sve_ldff##MEM##_##OFS) \
+ (CPUARMState *env, void *vd, void *vg, \
+ void *vm, target_ulong base, uint32_t desc) \
+{ \
+ sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \
+ off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
+} \
+void HELPER(sve_ldff##MEM##_##OFS##_mte) \
+ (CPUARMState *env, void *vd, void *vg, \
+ void *vm, target_ulong base, uint32_t desc) \
+{ \
+ sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \
+ off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
+}
+
+DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
+DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
+DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
+DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
+DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
+
+DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
+DO_LDFF1_ZPZ_S(bss, zss, MO_8)
+DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
+DO_LDFF1_ZPZ_D(bds, zss, MO_8)
+DO_LDFF1_ZPZ_D(bds, zd, MO_8)
+
+DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
+DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
+DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
+DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
+DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
+
+DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
+DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
+DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
+DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
+DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
+
+DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
+DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
+DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
+DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
+DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
+
+DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
+DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
+DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
+DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
+DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
+
+DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
+DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
+DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
+DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
+DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
+
+DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
+DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
+DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
+DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
+DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
+
+DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
+DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
+DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
+
+DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
+DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
+DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
+
+DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
+DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
+DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
+
+DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
+DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
+DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
+
+/* Stores with a vector index. */
+
+static inline QEMU_ALWAYS_INLINE
+void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
+ target_ulong base, uint32_t desc, uintptr_t retaddr,
+ uint32_t mtedesc, int esize, int msize,
+ zreg_off_fn *off_fn,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
+{
+ const int mmu_idx = cpu_mmu_index(env, false);
+ const intptr_t reg_max = simd_oprsz(desc);
+ const int scale = simd_data(desc);
+ void *host[ARM_MAX_VQ * 4];
+ intptr_t reg_off, i;
+ SVEHostPage info, info2;
+
+ /*
+ * Probe all of the elements for host addresses and flags.
+ */
+ i = reg_off = 0;
+ do {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ target_ulong addr = base + (off_fn(vm, reg_off) << scale);
+ target_ulong in_page = -(addr | TARGET_PAGE_MASK);
+
+ host[i] = NULL;
+ if (likely((pg >> (reg_off & 63)) & 1)) {
+ if (likely(in_page >= msize)) {
+ sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
+ mmu_idx, retaddr);
+ if (!(info.flags & TLB_MMIO)) {
+ host[i] = info.host;
+ }
+ } else {
+ /*
+ * Element crosses the page boundary.
+ * Probe both pages, but do not record the host address,
+ * so that we use the slow path.
+ */
+ sve_probe_page(&info, false, env, addr, 0,
+ MMU_DATA_STORE, mmu_idx, retaddr);
+ sve_probe_page(&info2, false, env, addr + in_page, 0,
+ MMU_DATA_STORE, mmu_idx, retaddr);
+ info.flags |= info2.flags;
+ }
+
+ if (unlikely(info.flags & TLB_WATCHPOINT)) {
+ cpu_check_watchpoint(env_cpu(env), addr, msize,
+ info.attrs, BP_MEM_WRITE, retaddr);
+ }
+
+ if (mtedesc && info.tagged) {
+ mte_check(env, mtedesc, addr, retaddr);
+ }
+ }
+ i += 1;
+ reg_off += esize;
+ } while (reg_off & 63);
+ } while (reg_off < reg_max);
+
+ /*
+ * Now that we have recognized all exceptions except SyncExternal
+ * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
+ *
+ * Note that for the common case of an element in RAM, not crossing a page
+ * boundary, we have stored the host address in host[]. This doubles
+ * as a first-level check against the predicate, since only enabled
+ * elements have non-null host addresses.
+ */
+ i = reg_off = 0;
+ do {
+ void *h = host[i];
+ if (likely(h != NULL)) {
+ host_fn(vd, reg_off, h);
+ } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
+ target_ulong addr = base + (off_fn(vm, reg_off) << scale);
+ tlb_fn(env, vd, reg_off, addr, retaddr);
+ }
+ i += 1;
+ reg_off += esize;
+ } while (reg_off < reg_max);
+}
+
+static inline QEMU_ALWAYS_INLINE
+void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
+ target_ulong base, uint32_t desc, uintptr_t retaddr,
+ int esize, int msize, zreg_off_fn *off_fn,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
+{
+ uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+ /* Remove mtedesc from the normal sve descriptor. */
+ desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+
+ /*
+ * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
+ * offset base entirely over the address space hole to change the
+ * pointer tag, or change the bit55 selector. So we could here
+ * examine TBI + TCMA like we do for sve_ldN_r_mte().
+ */
+ sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
+ esize, msize, off_fn, host_fn, tlb_fn);
+}
+
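+/*
+ * In the macros below, MEM selects the memory element type and
+ * endianness, OFS selects the offset extraction function (zsu and zss
+ * are zero- and sign-extended 32-bit offsets, zd is 64-bit offsets),
+ * and MSZ is the log2 size of the memory access.
+ */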
+#define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \
+void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
+ void *vm, target_ulong base, uint32_t desc) \
+{ \
+ sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
+ off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
+} \
+void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
+ void *vm, target_ulong base, uint32_t desc) \
+{ \
+ sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
+ off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
+}
+
+#define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \
+void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
+ void *vm, target_ulong base, uint32_t desc) \
+{ \
+ sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
+ off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
+} \
+void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
+ void *vm, target_ulong base, uint32_t desc) \
+{ \
+ sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
+ off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
+}
+
+DO_ST1_ZPZ_S(bs, zsu, MO_8)
+DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
+DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
+DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
+DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
+
+DO_ST1_ZPZ_S(bs, zss, MO_8)
+DO_ST1_ZPZ_S(hs_le, zss, MO_16)
+DO_ST1_ZPZ_S(hs_be, zss, MO_16)
+DO_ST1_ZPZ_S(ss_le, zss, MO_32)
+DO_ST1_ZPZ_S(ss_be, zss, MO_32)
+
+DO_ST1_ZPZ_D(bd, zsu, MO_8)
+DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
+DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
+DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
+DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
+DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
+DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
+
+DO_ST1_ZPZ_D(bd, zss, MO_8)
+DO_ST1_ZPZ_D(hd_le, zss, MO_16)
+DO_ST1_ZPZ_D(hd_be, zss, MO_16)
+DO_ST1_ZPZ_D(sd_le, zss, MO_32)
+DO_ST1_ZPZ_D(sd_be, zss, MO_32)
+DO_ST1_ZPZ_D(dd_le, zss, MO_64)
+DO_ST1_ZPZ_D(dd_be, zss, MO_64)
+
+DO_ST1_ZPZ_D(bd, zd, MO_8)
+DO_ST1_ZPZ_D(hd_le, zd, MO_16)
+DO_ST1_ZPZ_D(hd_be, zd, MO_16)
+DO_ST1_ZPZ_D(sd_le, zd, MO_32)
+DO_ST1_ZPZ_D(sd_be, zd, MO_32)
+DO_ST1_ZPZ_D(dd_le, zd, MO_64)
+DO_ST1_ZPZ_D(dd_be, zd, MO_64)
+
+#undef DO_ST1_ZPZ_S
+#undef DO_ST1_ZPZ_D
+
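+/*
+ * SVE2 bitwise ternary operations, with vk as the third operand:
+ * EOR3 is a three-way XOR, BCAX xors in (m AND NOT k), and the BSL
+ * variants select between (possibly inverted) n and m under control
+ * of k, with NBSL inverting the final result.
+ */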
+void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
+
+ for (i = 0; i < opr_sz; ++i) {
+ d[i] = n[i] ^ m[i] ^ k[i];
+ }
+}
+
+void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
+
+ for (i = 0; i < opr_sz; ++i) {
+ d[i] = n[i] ^ (m[i] & ~k[i]);
+ }
+}
+
+void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
+
+ for (i = 0; i < opr_sz; ++i) {
+ d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
+ }
+}
+
+void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
+
+ for (i = 0; i < opr_sz; ++i) {
+ d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
+ }
+}
+
+void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
+
+ for (i = 0; i < opr_sz; ++i) {
+ d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
+ }
+}
+
+/*
+ * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
+ * See hasless(v,1) from
+ * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
+ */
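+/*
+ * For example, with esz == MO_8 the XOR below leaves a zero byte in
+ * cmp0 wherever a byte of m0 equals the low byte of n; the expression
+ * (cmp0 - ones) & ~cmp0 & signs is then non-zero if and only if such
+ * a zero byte exists, which is all the boolean result needs.
+ */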
+static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
+{
+ int bits = 8 << esz;
+ uint64_t ones = dup_const(esz, 1);
+ uint64_t signs = ones << (bits - 1);
+ uint64_t cmp0, cmp1;
+
+ cmp1 = dup_const(esz, n);
+ cmp0 = cmp1 ^ m0;
+ cmp1 = cmp1 ^ m1;
+ cmp0 = (cmp0 - ones) & ~cmp0;
+ cmp1 = (cmp1 - ones) & ~cmp1;
+ return (cmp0 | cmp1) & signs;
+}
+
+static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
+ uint32_t desc, int esz, bool nmatch)
+{
+ uint16_t esz_mask = pred_esz_masks[esz];
+ intptr_t opr_sz = simd_oprsz(desc);
+ uint32_t flags = PREDTEST_INIT;
+ intptr_t i, j, k;
+
+ for (i = 0; i < opr_sz; i += 16) {
+ uint64_t m0 = *(uint64_t *)(vm + i);
+ uint64_t m1 = *(uint64_t *)(vm + i + 8);
+ uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
+ uint16_t out = 0;
+
+ for (j = 0; j < 16; j += 8) {
+ uint64_t n = *(uint64_t *)(vn + i + j);
+
+ for (k = 0; k < 8; k += 1 << esz) {
+ if (pg & (1 << (j + k))) {
+ bool o = do_match2(n >> (k * 8), m0, m1, esz);
+ out |= (o ^ nmatch) << (j + k);
+ }
+ }
+ }
+ *(uint16_t *)(vd + H1_2(i >> 3)) = out;
+ flags = iter_predtest_fwd(out, pg, flags);
+ }
+ return flags;
+}
+
+#define DO_PPZZ_MATCH(NAME, ESZ, INV) \
+uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
+{ \
+ return do_match(vd, vn, vm, vg, desc, ESZ, INV); \
+}
+
+DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
+DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)
+
+DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
+DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)
+
+#undef DO_PPZZ_MATCH
+
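+/*
+ * HISTCNT: for each active element i, count how many active elements
+ * at or below index i of the second operand equal the value of the
+ * first operand at i; inactive destination elements are written as
+ * zero.
+ */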
+void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
+ uint32_t desc)
+{
+ ARMVectorReg scratch;
+ intptr_t i, j;
+ intptr_t opr_sz = simd_oprsz(desc);
+ uint32_t *d = vd, *n = vn, *m = vm;
+ uint8_t *pg = vg;
+
+ if (d == n) {
+ n = memcpy(&scratch, n, opr_sz);
+ if (d == m) {
+ m = n;
+ }
+ } else if (d == m) {
+ m = memcpy(&scratch, m, opr_sz);
+ }
+
+ for (i = 0; i < opr_sz; i += 4) {
+ uint64_t count = 0;
+ uint8_t pred;
+
+ pred = pg[H1(i >> 3)] >> (i & 7);
+ if (pred & 1) {
+ uint32_t nn = n[H4(i >> 2)];
+
+ for (j = 0; j <= i; j += 4) {
+ pred = pg[H1(j >> 3)] >> (j & 7);
+ if ((pred & 1) && nn == m[H4(j >> 2)]) {
+ ++count;
+ }
+ }
+ }
+ d[H4(i >> 2)] = count;
+ }
+}
+
+void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
+ uint32_t desc)
+{
+ ARMVectorReg scratch;
+ intptr_t i, j;
+ intptr_t opr_sz = simd_oprsz(desc);
+ uint64_t *d = vd, *n = vn, *m = vm;
+ uint8_t *pg = vg;
+
+ if (d == n) {
+ n = memcpy(&scratch, n, opr_sz);
+ if (d == m) {
+ m = n;
+ }
+ } else if (d == m) {
+ m = memcpy(&scratch, m, opr_sz);
+ }
+
+ for (i = 0; i < opr_sz / 8; ++i) {
+ uint64_t count = 0;
+ if (pg[H1(i)] & 1) {
+ uint64_t nn = n[i];
+ for (j = 0; j <= i; ++j) {
+ if ((pg[H1(j)] & 1) && nn == m[j]) {
+ ++count;
+ }
+ }
+ }
+ d[i] = count;
+ }
+}
+
+/*
+ * Returns the number of bytes in m0 and m1 that match n.
+ * Unlike do_match2, we need an exact count rather than just true/false.
+ * This requires two extra logical operations.
+ */
+static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
+{
+ const uint64_t mask = dup_const(MO_8, 0x7f);
+ uint64_t cmp0, cmp1;
+
+ cmp1 = dup_const(MO_8, n);
+ cmp0 = cmp1 ^ m0;
+ cmp1 = cmp1 ^ m1;
+
+ /*
+ * 1: clear msb of each byte to avoid carry to next byte (& mask)
+ * 2: carry in to msb if byte != 0 (+ mask)
+ * 3: set msb if cmp has msb set (| cmp)
+ * 4: set ~msb to ignore them (| mask)
+ * We now have 0xff for byte != 0 or 0x7f for byte == 0.
+ * 5: invert, resulting in 0x80 if and only if byte == 0.
+ */
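+ /*
+ * Worked example: a zero byte of cmp gives (0 & 0x7f) + 0x7f = 0x7f
+ * with no carry into the msb, so it stays 0x7f through the ORs and
+ * inverts to 0x80; any non-zero byte either carries into the msb or
+ * already has it set, ends as 0xff, and inverts to 0x00.
+ */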
+ cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
+ cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);
+
+ /*
+ * Combine the two compares in a way that the bits do
+ * not overlap, and so preserves the count of set bits.
+ * If the host has an efficient instruction for ctpop,
+ * then ctpop(x) + ctpop(y) has the same number of
+ * operations as ctpop(x | (y >> 1)). If the host does
+ * not have an efficient ctpop, then we only want to
+ * use it once.
+ */
+ return ctpop64(cmp0 | (cmp1 >> 1));
+}
+
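+/*
+ * HISTSEG works on independent 16-byte segments: each byte of the
+ * first operand is counted against all 16 bytes of the corresponding
+ * segment of the second operand.
+ */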
+void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, j;
+ intptr_t opr_sz = simd_oprsz(desc);
+
+ for (i = 0; i < opr_sz; i += 16) {
+ uint64_t n0 = *(uint64_t *)(vn + i);
+ uint64_t m0 = *(uint64_t *)(vm + i);
+ uint64_t n1 = *(uint64_t *)(vn + i + 8);
+ uint64_t m1 = *(uint64_t *)(vm + i + 8);
+ uint64_t out0 = 0;
+ uint64_t out1 = 0;
+
+ for (j = 0; j < 64; j += 8) {
+ uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
+ uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
+ out0 |= cnt0 << j;
+ out1 |= cnt1 << j;
+ }
+
+ *(uint64_t *)(vd + i) = out0;
+ *(uint64_t *)(vd + i + 8) = out1;
+ }
+}
+
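+/*
+ * XAR: XOR the two inputs, then rotate each element right by the
+ * immediate. For 8- and 16-bit elements the rotation is synthesized
+ * with shifts and a mask so that bits never cross element boundaries;
+ * the 32-bit form can use ror32 directly.
+ */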
+void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ int shr = simd_data(desc);
+ int shl = 8 - shr;
+ uint64_t mask = dup_const(MO_8, 0xff >> shr);
+ uint64_t *d = vd, *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz; ++i) {
+ uint64_t t = n[i] ^ m[i];
+ d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
+ }
+}
+
+void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ int shr = simd_data(desc);
+ int shl = 16 - shr;
+ uint64_t mask = dup_const(MO_16, 0xffff >> shr);
+ uint64_t *d = vd, *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz; ++i) {
+ uint64_t t = n[i] ^ m[i];
+ d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
+ }
+}
+
+void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 4;
+ int shr = simd_data(desc);
+ uint32_t *d = vd, *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz; ++i) {
+ d[i] = ror32(n[i] ^ m[i], shr);
+ }
+}
+
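+/*
+ * FMMLA: each group of four elements holds a 2x2 matrix in row-major
+ * order. The result is D = A + N * M^T, i.e. element (i,j) of each
+ * group accumulates the dot product of row i of N with row j of M.
+ */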
+void HELPER(fmmla_s)(void *vd, void *vn, void *vm, void *va,
+ void *status, uint32_t desc)
+{
+ intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float32) * 4);
+
+ for (s = 0; s < opr_sz; ++s) {
+ float32 *n = vn + s * sizeof(float32) * 4;
+ float32 *m = vm + s * sizeof(float32) * 4;
+ float32 *a = va + s * sizeof(float32) * 4;
+ float32 *d = vd + s * sizeof(float32) * 4;
+ float32 n00 = n[H4(0)], n01 = n[H4(1)];
+ float32 n10 = n[H4(2)], n11 = n[H4(3)];
+ float32 m00 = m[H4(0)], m01 = m[H4(1)];
+ float32 m10 = m[H4(2)], m11 = m[H4(3)];
+ float32 p0, p1;
+
+ /* i = 0, j = 0 */
+ p0 = float32_mul(n00, m00, status);
+ p1 = float32_mul(n01, m01, status);
+ d[H4(0)] = float32_add(a[H4(0)], float32_add(p0, p1, status), status);
+
+ /* i = 0, j = 1 */
+ p0 = float32_mul(n00, m10, status);
+ p1 = float32_mul(n01, m11, status);
+ d[H4(1)] = float32_add(a[H4(1)], float32_add(p0, p1, status), status);
+
+ /* i = 1, j = 0 */
+ p0 = float32_mul(n10, m00, status);
+ p1 = float32_mul(n11, m01, status);
+ d[H4(2)] = float32_add(a[H4(2)], float32_add(p0, p1, status), status);
+
+ /* i = 1, j = 1 */
+ p0 = float32_mul(n10, m10, status);
+ p1 = float32_mul(n11, m11, status);
+ d[H4(3)] = float32_add(a[H4(3)], float32_add(p0, p1, status), status);
+ }
+}
+
+void HELPER(fmmla_d)(void *vd, void *vn, void *vm, void *va,
+ void *status, uint32_t desc)
+{
+ intptr_t s, opr_sz = simd_oprsz(desc) / (sizeof(float64) * 4);
+
+ for (s = 0; s < opr_sz; ++s) {
+ float64 *n = vn + s * sizeof(float64) * 4;
+ float64 *m = vm + s * sizeof(float64) * 4;
+ float64 *a = va + s * sizeof(float64) * 4;
+ float64 *d = vd + s * sizeof(float64) * 4;
+ float64 n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
+ float64 m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
+ float64 p0, p1;
+
+ /* i = 0, j = 0 */
+ p0 = float64_mul(n00, m00, status);
+ p1 = float64_mul(n01, m01, status);
+ d[0] = float64_add(a[0], float64_add(p0, p1, status), status);
+
+ /* i = 0, j = 1 */
+ p0 = float64_mul(n00, m10, status);
+ p1 = float64_mul(n01, m11, status);
+ d[1] = float64_add(a[1], float64_add(p0, p1, status), status);
+
+ /* i = 1, j = 0 */
+ p0 = float64_mul(n10, m00, status);
+ p1 = float64_mul(n11, m01, status);
+ d[2] = float64_add(a[2], float64_add(p0, p1, status), status);
+
+ /* i = 1, j = 1 */
+ p0 = float64_mul(n10, m10, status);
+ p1 = float64_mul(n11, m11, status);
+ d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
+ }
+}
+
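+/*
+ * FCVTNT: narrow each active wide element and write the result into
+ * the high (odd-numbered) half of that element, leaving the low half
+ * untouched. The loop walks the vector from the top down, one
+ * predicate word at a time.
+ */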
+#define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
+{ \
+ intptr_t i = simd_oprsz(desc); \
+ uint64_t *g = vg; \
+ do { \
+ uint64_t pg = g[(i - 1) >> 6]; \
+ do { \
+ i -= sizeof(TYPEW); \
+ if (likely((pg >> (i & 63)) & 1)) { \
+ TYPEW nn = *(TYPEW *)(vn + HW(i)); \
+ *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status); \
+ } \
+ } while (i & 63); \
+ } while (i != 0); \
+}
+
+DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
+DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
+DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_8, H1_4, float64_to_float32)
+
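+/*
+ * FCVTLT is the converse: take the narrow value from the high half of
+ * each active element and widen it into the full wide element.
+ */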
+#define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP) \
+void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
+{ \
+ intptr_t i = simd_oprsz(desc); \
+ uint64_t *g = vg; \
+ do { \
+ uint64_t pg = g[(i - 1) >> 6]; \
+ do { \
+ i -= sizeof(TYPEW); \
+ if (likely((pg >> (i & 63)) & 1)) { \
+ TYPEN nn = *(TYPEN *)(vn + HN(i + sizeof(TYPEN))); \
+ *(TYPEW *)(vd + HW(i)) = OP(nn, status); \
+ } \
+ } while (i & 63); \
+ } while (i != 0); \
+}
+
+DO_FCVTLT(sve2_fcvtlt_hs, uint32_t, uint16_t, H1_4, H1_2, sve_f16_to_f32)
+DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)
+
+#undef DO_FCVTLT
+#undef DO_FCVTNT
--- /dev/null
+/*
+ * ARM TLB (Translation lookaside buffer) helpers.
+ *
+ * This code is licensed under the GNU GPL v2 or later.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+#include "qemu/osdep.h"
+#include "cpu.h"
+#include "internals.h"
+#include "exec/exec-all.h"
+#include "exec/helper-proto.h"
+
+
+/* Return true if the translation regime is using LPAE format page tables */
+bool regime_using_lpae_format(CPUARMState *env, ARMMMUIdx mmu_idx)
+{
+ int el = regime_el(env, mmu_idx);
+ if (el == 2 || arm_el_is_aa64(env, el)) {
+ return true;
+ }
+ if (arm_feature(env, ARM_FEATURE_PMSA) &&
+ arm_feature(env, ARM_FEATURE_V8)) {
+ return true;
+ }
+ if (arm_feature(env, ARM_FEATURE_LPAE)
+ && (regime_tcr(env, mmu_idx) & TTBCR_EAE)) {
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Returns true if the stage 1 translation regime is using LPAE format page
+ * tables. Used when raising alignment exceptions, whose FSR changes depending
+ * on whether the long or short descriptor format is in use.
+ */
+bool arm_s1_regime_using_lpae_format(CPUARMState *env, ARMMMUIdx mmu_idx)
+{
+ mmu_idx = stage_1_mmu_idx(mmu_idx);
+ return regime_using_lpae_format(env, mmu_idx);
+}
+
+static inline uint32_t merge_syn_data_abort(uint32_t template_syn,
+ unsigned int target_el,
+ bool same_el, bool ea,
+ bool s1ptw, bool is_write,
+ int fsc)
+{
+ uint32_t syn;
+
+ /*
+ * ISV is only set for data aborts routed to EL2 and
+ * never for stage-1 page table walks faulting on stage 2.
+ *
+ * Furthermore, ISV is only set for certain kinds of load/stores.
+ * If the template syndrome does not have ISV set, we should leave
+ * it cleared.
+ *
+ * See ARMv8 specs, D7-1974:
+ * ISS encoding for an exception from a Data Abort, the
+ * ISV field.
+ */
+ if (!(template_syn & ARM_EL_ISV) || target_el != 2 || s1ptw) {
+ syn = syn_data_abort_no_iss(same_el, 0,
+ ea, 0, s1ptw, is_write, fsc);
+ } else {
+ /*
+ * Fields: IL, ISV, SAS, SSE, SRT, SF and AR come from the template
+ * syndrome created at translation time.
+ * Now we create the runtime syndrome with the remaining fields.
+ */
+ syn = syn_data_abort_with_iss(same_el,
+ 0, 0, 0, 0, 0,
+ ea, 0, s1ptw, is_write, fsc,
+ true);
+ /* Merge the runtime syndrome with the template syndrome. */
+ syn |= template_syn;
+ }
+ return syn;
+}
+
+static uint32_t compute_fsr_fsc(CPUARMState *env, ARMMMUFaultInfo *fi,
+ int target_el, int mmu_idx, uint32_t *ret_fsc)
+{
+ ARMMMUIdx arm_mmu_idx = core_to_arm_mmu_idx(env, mmu_idx);
+ uint32_t fsr, fsc;
+
+ if (target_el == 2 || arm_el_is_aa64(env, target_el) ||
+ arm_s1_regime_using_lpae_format(env, arm_mmu_idx)) {
+ /*
+ * LPAE format fault status register: the bottom 6 bits are the
+ * status code, in the same form as needed for the syndrome.
+ */
+ fsr = arm_fi_to_lfsc(fi);
+ fsc = extract32(fsr, 0, 6);
+ } else {
+ fsr = arm_fi_to_sfsc(fi);
+ /*
+ * Short format FSR: this fault will never actually be reported
+ * to an EL that uses a syndrome register. Use a (currently)
+ * reserved FSR code in case the constructed syndrome does leak
+ * into the guest somehow.
+ */
+ fsc = 0x3f;
+ }
+
+ *ret_fsc = fsc;
+ return fsr;
+}
+
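+/*
+ * Deliver a translation or other MMU fault to the guest: pick the
+ * target EL, build the FSR and syndrome, and raise the appropriate
+ * prefetch or data abort.
+ */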
+static G_NORETURN
+void arm_deliver_fault(ARMCPU *cpu, vaddr addr,
+ MMUAccessType access_type,
+ int mmu_idx, ARMMMUFaultInfo *fi)
+{
+ CPUARMState *env = &cpu->env;
+ int target_el;
+ bool same_el;
+ uint32_t syn, exc, fsr, fsc;
+
+ target_el = exception_target_el(env);
+ if (fi->stage2) {
+ target_el = 2;
+ env->cp15.hpfar_el2 = extract64(fi->s2addr, 12, 47) << 4;
+ if (arm_is_secure_below_el3(env) && fi->s1ns) {
+ env->cp15.hpfar_el2 |= HPFAR_NS;
+ }
+ }
+ same_el = (arm_current_el(env) == target_el);
+
+ fsr = compute_fsr_fsc(env, fi, target_el, mmu_idx, &fsc);
+
+ if (access_type == MMU_INST_FETCH) {
+ syn = syn_insn_abort(same_el, fi->ea, fi->s1ptw, fsc);
+ exc = EXCP_PREFETCH_ABORT;
+ } else {
+ syn = merge_syn_data_abort(env->exception.syndrome, target_el,
+ same_el, fi->ea, fi->s1ptw,
+ access_type == MMU_DATA_STORE,
+ fsc);
+ if (access_type == MMU_DATA_STORE
+ && arm_feature(env, ARM_FEATURE_V6)) {
+ fsr |= (1 << 11);
+ }
+ exc = EXCP_DATA_ABORT;
+ }
+
+ env->exception.vaddress = addr;
+ env->exception.fsr = fsr;
+ raise_exception(env, exc, syn, target_el);
+}
+
+/* Raise a data fault alignment exception for the specified virtual address */
+void arm_cpu_do_unaligned_access(CPUState *cs, vaddr vaddr,
+ MMUAccessType access_type,
+ int mmu_idx, uintptr_t retaddr)
+{
+ ARMCPU *cpu = ARM_CPU(cs);
+ ARMMMUFaultInfo fi = {};
+
+ /* now we have a real cpu fault */
+ cpu_restore_state(cs, retaddr);
+
+ fi.type = ARMFault_Alignment;
+ arm_deliver_fault(cpu, vaddr, access_type, mmu_idx, &fi);
+}
+
+void helper_exception_pc_alignment(CPUARMState *env, target_ulong pc)
+{
+ ARMMMUFaultInfo fi = { .type = ARMFault_Alignment };
+ int target_el = exception_target_el(env);
+ int mmu_idx = cpu_mmu_index(env, true);
+ uint32_t fsc;
+
+ env->exception.vaddress = pc;
+
+ /*
+ * Note that the fsc is not applicable to this exception,
+ * since the syndrome here is pcalignment, not insn_abort.
+ */
+ env->exception.fsr = compute_fsr_fsc(env, &fi, target_el, mmu_idx, &fsc);
+ raise_exception(env, EXCP_PREFETCH_ABORT, syn_pcalignment(), target_el);
+}
+
+#if !defined(CONFIG_USER_ONLY)
+
+/*
+ * arm_cpu_do_transaction_failed: handle a memory system error response
+ * (eg "no device/memory present at address") by raising an external abort
+ * exception
+ */
+void arm_cpu_do_transaction_failed(CPUState *cs, hwaddr physaddr,
+ vaddr addr, unsigned size,
+ MMUAccessType access_type,
+ int mmu_idx, MemTxAttrs attrs,
+ MemTxResult response, uintptr_t retaddr)
+{
+ ARMCPU *cpu = ARM_CPU(cs);
+ ARMMMUFaultInfo fi = {};
+
+ /* now we have a real cpu fault */
+ cpu_restore_state(cs, retaddr);
+
+ fi.ea = arm_extabort_type(response);
+ fi.type = ARMFault_SyncExternal;
+ arm_deliver_fault(cpu, addr, access_type, mmu_idx, &fi);
+}
+
+bool arm_cpu_tlb_fill(CPUState *cs, vaddr address, int size,
+ MMUAccessType access_type, int mmu_idx,
+ bool probe, uintptr_t retaddr)
+{
+ ARMCPU *cpu = ARM_CPU(cs);
+ GetPhysAddrResult res = {};
+ ARMMMUFaultInfo local_fi, *fi;
+ int ret;
+
+ /*
+ * Allow S1_ptw_translate to see any fault generated here.
+ * Since this may recurse, read and clear.
+ */
+ fi = cpu->env.tlb_fi;
+ if (fi) {
+ cpu->env.tlb_fi = NULL;
+ } else {
+ fi = memset(&local_fi, 0, sizeof(local_fi));
+ }
+
+ /*
+ * Walk the page table and (if the mapping exists) add the page
+ * to the TLB. On success, return true. Otherwise, if probing,
+ * return false. Otherwise populate fsr with ARM DFSR/IFSR fault
+ * register format, and signal the fault.
+ */
+ ret = get_phys_addr(&cpu->env, address, access_type,
+ core_to_arm_mmu_idx(&cpu->env, mmu_idx),
+ &res, fi);
+ if (likely(!ret)) {
+ /*
+ * Map a single [sub]page. Regions smaller than our declared
+ * target page size are handled specially, so for those we
+ * pass in the exact addresses.
+ */
+ if (res.f.lg_page_size >= TARGET_PAGE_BITS) {
+ res.f.phys_addr &= TARGET_PAGE_MASK;
+ address &= TARGET_PAGE_MASK;
+ }
+
+ res.f.pte_attrs = res.cacheattrs.attrs;
+ res.f.shareability = res.cacheattrs.shareability;
+
+ tlb_set_page_full(cs, mmu_idx, address, &res.f);
+ return true;
+ } else if (probe) {
+ return false;
+ } else {
+ /* now we have a real cpu fault */
+ cpu_restore_state(cs, retaddr);
+ arm_deliver_fault(cpu, address, access_type, mmu_idx, fi);
+ }
+}
+#else
+void arm_cpu_record_sigsegv(CPUState *cs, vaddr addr,
+ MMUAccessType access_type,
+ bool maperr, uintptr_t ra)
+{
+ ARMMMUFaultInfo fi = {
+ .type = maperr ? ARMFault_Translation : ARMFault_Permission,
+ .level = 3,
+ };
+ ARMCPU *cpu = ARM_CPU(cs);
+
+ /*
+ * We report both ESR and FAR to signal handlers.
+ * For now, it's easiest to deliver the fault normally.
+ */
+ cpu_restore_state(cs, ra);
+ arm_deliver_fault(cpu, addr, access_type, MMU_USER_IDX, &fi);
+}
+
+void arm_cpu_record_sigbus(CPUState *cs, vaddr addr,
+ MMUAccessType access_type, uintptr_t ra)
+{
+ arm_cpu_do_unaligned_access(cs, addr, access_type, MMU_USER_IDX, ra);
+}
+#endif /* !defined(CONFIG_USER_ONLY) */
--- /dev/null
+/*
+ * ARM AdvSIMD / SVE Vector Operations
+ *
+ * Copyright (c) 2018 Linaro
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "cpu.h"
+#include "exec/helper-proto.h"
+#include "tcg/tcg-gvec-desc.h"
+#include "fpu/softfloat.h"
+#include "qemu/int128.h"
+#include "vec_internal.h"
+
+/*
+ * Data for expanding active predicate bits to bytes, for byte elements.
+ *
+ * for (i = 0; i < 256; ++i) {
+ * unsigned long m = 0;
+ * for (j = 0; j < 8; j++) {
+ * if ((i >> j) & 1) {
+ * m |= 0xfful << (j << 3);
+ * }
+ * }
+ * printf("0x%016lx,\n", m);
+ * }
+ */
+const uint64_t expand_pred_b_data[256] = {
+ 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
+ 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
+ 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
+ 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
+ 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
+ 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
+ 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
+ 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
+ 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
+ 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
+ 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
+ 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
+ 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
+ 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
+ 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
+ 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
+ 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
+ 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
+ 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
+ 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
+ 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
+ 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
+ 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
+ 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
+ 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
+ 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
+ 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
+ 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
+ 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
+ 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
+ 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
+ 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
+ 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
+ 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
+ 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
+ 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
+ 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
+ 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
+ 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
+ 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
+ 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
+ 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
+ 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
+ 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
+ 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
+ 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
+ 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
+ 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
+ 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
+ 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
+ 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
+ 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
+ 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
+ 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
+ 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
+ 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
+ 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
+ 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
+ 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
+ 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
+ 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
+ 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
+ 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
+ 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
+ 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
+ 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
+ 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
+ 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
+ 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
+ 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
+ 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
+ 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
+ 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
+ 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
+ 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
+ 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
+ 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
+ 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
+ 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
+ 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
+ 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
+ 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
+ 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
+ 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
+ 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
+ 0xffffffffffffffff,
+};
+
+/*
+ * Similarly for half-word elements.
+ * for (i = 0; i < 256; ++i) {
+ * unsigned long m = 0;
+ * if (i & 0xaa) {
+ * continue;
+ * }
+ * for (j = 0; j < 8; j += 2) {
+ * if ((i >> j) & 1) {
+ * m |= 0xfffful << (j << 3);
+ * }
+ * }
+ * printf("[0x%x] = 0x%016lx,\n", i, m);
+ * }
+ */
+const uint64_t expand_pred_h_data[0x55 + 1] = {
+ [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
+ [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
+ [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
+ [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
+ [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
+ [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
+ [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
+ [0x55] = 0xffffffffffffffff,
+};
+
+/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
+int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
+ bool neg, bool round)
+{
+ /*
+ * Simplify:
+ * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
+ * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
+ */
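+ /*
+ * For example, src1 = src2 = 64, src3 = 0, round = true:
+ * 64 * 64 = 4096, plus the rounding constant (1 << 6), is 4160,
+ * and 4160 >> 7 = 32, the same as ((4096 << 1) + (1 << 7)) >> 8.
+ */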
+ int32_t ret = (int32_t)src1 * src2;
+ if (neg) {
+ ret = -ret;
+ }
+ ret += ((int32_t)src3 << 7) + (round << 6);
+ ret >>= 7;
+
+ if (ret != (int8_t)ret) {
+ ret = (ret < 0 ? INT8_MIN : INT8_MAX);
+ }
+ return ret;
+}
+
+void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
+ void *va, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int8_t *d = vd, *n = vn, *m = vm, *a = va;
+
+ for (i = 0; i < opr_sz; ++i) {
+ d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
+ }
+}
+
+void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
+ void *va, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int8_t *d = vd, *n = vn, *m = vm, *a = va;
+
+ for (i = 0; i < opr_sz; ++i) {
+ d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
+ }
+}
+
+void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int8_t *d = vd, *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz; ++i) {
+ d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
+ }
+}
+
+void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int8_t *d = vd, *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz; ++i) {
+ d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
+ }
+}
+
+/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
+int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
+ bool neg, bool round, uint32_t *sat)
+{
+ /* Simplify similarly to do_sqrdmlah_b above. */
+ int32_t ret = (int32_t)src1 * src2;
+ if (neg) {
+ ret = -ret;
+ }
+ ret += ((int32_t)src3 << 15) + (round << 14);
+ ret >>= 15;
+
+ if (ret != (int16_t)ret) {
+ *sat = 1;
+ ret = (ret < 0 ? INT16_MIN : INT16_MAX);
+ }
+ return ret;
+}
+
+uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
+ uint32_t src2, uint32_t src3)
+{
+ uint32_t *sat = &env->vfp.qc[0];
+ uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
+ uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
+ false, true, sat);
+ return deposit32(e1, 16, 16, e2);
+}
+
+void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
+ void *vq, uint32_t desc)
+{
+ uintptr_t opr_sz = simd_oprsz(desc);
+ int16_t *d = vd;
+ int16_t *n = vn;
+ int16_t *m = vm;
+ uintptr_t i;
+
+ for (i = 0; i < opr_sz / 2; ++i) {
+ d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
+ uint32_t src2, uint32_t src3)
+{
+ uint32_t *sat = &env->vfp.qc[0];
+ uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
+ uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
+ true, true, sat);
+ return deposit32(e1, 16, 16, e2);
+}
+
+void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
+ void *vq, uint32_t desc)
+{
+ uintptr_t opr_sz = simd_oprsz(desc);
+ int16_t *d = vd;
+ int16_t *n = vn;
+ int16_t *m = vm;
+ uintptr_t i;
+
+ for (i = 0; i < opr_sz / 2; ++i) {
+ d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
+ void *vq, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int16_t *d = vd, *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz / 2; ++i) {
+ d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
+ void *vq, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int16_t *d = vd, *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz / 2; ++i) {
+ d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
+ void *va, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int16_t *d = vd, *n = vn, *m = vm, *a = va;
+ uint32_t discard;
+
+ for (i = 0; i < opr_sz / 2; ++i) {
+ d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
+ }
+}
+
+void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
+ void *va, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int16_t *d = vd, *n = vn, *m = vm, *a = va;
+ uint32_t discard;
+
+ for (i = 0; i < opr_sz / 2; ++i) {
+ d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
+ }
+}
+
+void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int16_t *d = vd, *n = vn, *m = vm;
+ uint32_t discard;
+
+ for (i = 0; i < opr_sz / 2; ++i) {
+ d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
+ }
+}
+
+void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int16_t *d = vd, *n = vn, *m = vm;
+ uint32_t discard;
+
+ for (i = 0; i < opr_sz / 2; ++i) {
+ d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
+ }
+}
+
+void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, j, opr_sz = simd_oprsz(desc);
+ int idx = simd_data(desc);
+ int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
+ uint32_t discard;
+
+ for (i = 0; i < opr_sz / 2; i += 16 / 2) {
+ int16_t mm = m[i];
+ for (j = 0; j < 16 / 2; ++j) {
+ d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
+ }
+ }
+}
+
+void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, j, opr_sz = simd_oprsz(desc);
+ int idx = simd_data(desc);
+ int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
+ uint32_t discard;
+
+ for (i = 0; i < opr_sz / 2; i += 16 / 2) {
+ int16_t mm = m[i];
+ for (j = 0; j < 16 / 2; ++j) {
+ d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
+ }
+ }
+}
+
+/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
+int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
+ bool neg, bool round, uint32_t *sat)
+{
+ /* Simplify similarly to do_sqrdmlah_b above. */
+ int64_t ret = (int64_t)src1 * src2;
+ if (neg) {
+ ret = -ret;
+ }
+ ret += ((int64_t)src3 << 31) + (round << 30);
+ ret >>= 31;
+
+ if (ret != (int32_t)ret) {
+ *sat = 1;
+ ret = (ret < 0 ? INT32_MIN : INT32_MAX);
+ }
+ return ret;
+}
+
+uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
+ int32_t src2, int32_t src3)
+{
+ uint32_t *sat = &env->vfp.qc[0];
+ return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
+}
+
+void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
+ void *vq, uint32_t desc)
+{
+ uintptr_t opr_sz = simd_oprsz(desc);
+ int32_t *d = vd;
+ int32_t *n = vn;
+ int32_t *m = vm;
+ uintptr_t i;
+
+ for (i = 0; i < opr_sz / 4; ++i) {
+ d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
+ int32_t src2, int32_t src3)
+{
+ uint32_t *sat = &env->vfp.qc[0];
+ return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
+}
+
+void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
+ void *vq, uint32_t desc)
+{
+ uintptr_t opr_sz = simd_oprsz(desc);
+ int32_t *d = vd;
+ int32_t *n = vn;
+ int32_t *m = vm;
+ uintptr_t i;
+
+ for (i = 0; i < opr_sz / 4; ++i) {
+ d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
+ void *vq, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int32_t *d = vd, *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz / 4; ++i) {
+ d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
+ void *vq, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int32_t *d = vd, *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz / 4; ++i) {
+ d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
+ void *va, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int32_t *d = vd, *n = vn, *m = vm, *a = va;
+ uint32_t discard;
+
+ for (i = 0; i < opr_sz / 4; ++i) {
+ d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
+ }
+}
+
+void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
+ void *va, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int32_t *d = vd, *n = vn, *m = vm, *a = va;
+ uint32_t discard;
+
+ for (i = 0; i < opr_sz / 4; ++i) {
+ d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
+ }
+}
+
+void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int32_t *d = vd, *n = vn, *m = vm;
+ uint32_t discard;
+
+ for (i = 0; i < opr_sz / 4; ++i) {
+ d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
+ }
+}
+
+void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int32_t *d = vd, *n = vn, *m = vm;
+ uint32_t discard;
+
+ for (i = 0; i < opr_sz / 4; ++i) {
+ d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
+ }
+}
+
+void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, j, opr_sz = simd_oprsz(desc);
+ int idx = simd_data(desc);
+ int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
+ uint32_t discard;
+
+ for (i = 0; i < opr_sz / 4; i += 16 / 4) {
+ int32_t mm = m[i];
+ for (j = 0; j < 16 / 4; ++j) {
+ d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
+ }
+ }
+}
+
+void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, j, opr_sz = simd_oprsz(desc);
+ int idx = simd_data(desc);
+ int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
+ uint32_t discard;
+
+ for (i = 0; i < opr_sz / 4; i += 16 / 4) {
+ int32_t mm = m[i];
+ for (j = 0; j < 16 / 4; ++j) {
+ d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
+ }
+ }
+}
+
+/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
+static int64_t do_sat128_d(Int128 r)
+{
+ int64_t ls = int128_getlo(r);
+ int64_t hs = int128_gethi(r);
+
+ if (unlikely(hs != (ls >> 63))) {
+ return hs < 0 ? INT64_MIN : INT64_MAX;
+ }
+ return ls;
+}
+
+int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
+{
+ uint64_t l, h;
+ Int128 r, t;
+
+ /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
+ muls64(&l, &h, m, n);
+ r = int128_make128(l, h);
+ if (neg) {
+ r = int128_neg(r);
+ }
+ if (a) {
+ t = int128_exts64(a);
+ t = int128_lshift(t, 63);
+ r = int128_add(r, t);
+ }
+ if (round) {
+ t = int128_exts64(1ll << 62);
+ r = int128_add(r, t);
+ }
+ r = int128_rshift(r, 63);
+
+ return do_sat128_d(r);
+}
+
+void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
+ void *va, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int64_t *d = vd, *n = vn, *m = vm, *a = va;
+
+ for (i = 0; i < opr_sz / 8; ++i) {
+ d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
+ }
+}
+
+void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
+ void *va, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int64_t *d = vd, *n = vn, *m = vm, *a = va;
+
+ for (i = 0; i < opr_sz / 8; ++i) {
+ d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
+ }
+}
+
+void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int64_t *d = vd, *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz / 8; ++i) {
+ d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
+ }
+}
+
+void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int64_t *d = vd, *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz / 8; ++i) {
+ d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
+ }
+}
+
+void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, j, opr_sz = simd_oprsz(desc);
+ int idx = simd_data(desc);
+ int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
+
+ for (i = 0; i < opr_sz / 8; i += 16 / 8) {
+ int64_t mm = m[i];
+ for (j = 0; j < 16 / 8; ++j) {
+ d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
+ }
+ }
+}
+
+void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, j, opr_sz = simd_oprsz(desc);
+ int idx = simd_data(desc);
+ int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
+
+ for (i = 0; i < opr_sz / 8; i += 16 / 8) {
+ int64_t mm = m[i];
+ for (j = 0; j < 16 / 8; ++j) {
+ d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
+ }
+ }
+}
+
+/*
+ * Integer 8 and 16-bit dot-product.
+ *
+ * Note that for the loops herein, host endianness does not matter
+ * with respect to the ordering of data within the quad-width lanes.
+ * All elements are treated equally, no matter where they are.
+ */
+
+#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ TYPED *d = vd, *a = va; \
+ TYPEN *n = vn; \
+ TYPEM *m = vm; \
+ for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \
+ d[i] = (a[i] + \
+ (TYPED)n[i * 4 + 0] * m[i * 4 + 0] + \
+ (TYPED)n[i * 4 + 1] * m[i * 4 + 1] + \
+ (TYPED)n[i * 4 + 2] * m[i * 4 + 2] + \
+ (TYPED)n[i * 4 + 3] * m[i * 4 + 3]); \
+ } \
+ clear_tail(d, opr_sz, simd_maxsz(desc)); \
+}
+
+DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
+DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
+DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
+DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
+DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
+
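+/*
+ * Indexed dot product: the group of four TYPEM values selected by the
+ * index is read once per 16-byte segment of the second operand and
+ * reused for every element of that segment.
+ */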
+#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
+{ \
+ intptr_t i = 0, opr_sz = simd_oprsz(desc); \
+ intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \
+ intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \
+ intptr_t index = simd_data(desc); \
+ TYPED *d = vd, *a = va; \
+ TYPEN *n = vn; \
+ TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \
+ do { \
+ TYPED m0 = m_indexed[i * 4 + 0]; \
+ TYPED m1 = m_indexed[i * 4 + 1]; \
+ TYPED m2 = m_indexed[i * 4 + 2]; \
+ TYPED m3 = m_indexed[i * 4 + 3]; \
+ do { \
+ d[i] = (a[i] + \
+ n[i * 4 + 0] * m0 + \
+ n[i * 4 + 1] * m1 + \
+ n[i * 4 + 2] * m2 + \
+ n[i * 4 + 3] * m3); \
+ } while (++i < segend); \
+ segend = i + 4; \
+ } while (i < opr_sz_n); \
+ clear_tail(d, opr_sz, simd_maxsz(desc)); \
+}
+
+DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
+DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
+DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
+DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
+DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
+DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
+
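+/*
+ * FCADD: treat adjacent element pairs as (real, imaginary) values and
+ * add the second operand rotated by 90 or 270 degrees; the rotation
+ * reduces to flipping the sign of one of the two cross-added terms.
+ */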
+void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
+ void *vfpst, uint32_t desc)
+{
+ uintptr_t opr_sz = simd_oprsz(desc);
+ float16 *d = vd;
+ float16 *n = vn;
+ float16 *m = vm;
+ float_status *fpst = vfpst;
+ uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
+ uint32_t neg_imag = neg_real ^ 1;
+ uintptr_t i;
+
+ /* Shift boolean to the sign bit so we can xor to negate. */
+ neg_real <<= 15;
+ neg_imag <<= 15;
+
+ for (i = 0; i < opr_sz / 2; i += 2) {
+ float16 e0 = n[H2(i)];
+ float16 e1 = m[H2(i + 1)] ^ neg_imag;
+ float16 e2 = n[H2(i + 1)];
+ float16 e3 = m[H2(i)] ^ neg_real;
+
+ d[H2(i)] = float16_add(e0, e1, fpst);
+ d[H2(i + 1)] = float16_add(e2, e3, fpst);
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
+ void *vfpst, uint32_t desc)
+{
+ uintptr_t opr_sz = simd_oprsz(desc);
+ float32 *d = vd;
+ float32 *n = vn;
+ float32 *m = vm;
+ float_status *fpst = vfpst;
+ uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
+ uint32_t neg_imag = neg_real ^ 1;
+ uintptr_t i;
+
+ /* Shift boolean to the sign bit so we can xor to negate. */
+ neg_real <<= 31;
+ neg_imag <<= 31;
+
+ for (i = 0; i < opr_sz / 4; i += 2) {
+ float32 e0 = n[H4(i)];
+ float32 e1 = m[H4(i + 1)] ^ neg_imag;
+ float32 e2 = n[H4(i + 1)];
+ float32 e3 = m[H4(i)] ^ neg_real;
+
+ d[H4(i)] = float32_add(e0, e1, fpst);
+ d[H4(i + 1)] = float32_add(e2, e3, fpst);
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
+ void *vfpst, uint32_t desc)
+{
+ uintptr_t opr_sz = simd_oprsz(desc);
+ float64 *d = vd;
+ float64 *n = vn;
+ float64 *m = vm;
+ float_status *fpst = vfpst;
+ uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
+ uint64_t neg_imag = neg_real ^ 1;
+ uintptr_t i;
+
+ /* Shift boolean to the sign bit so we can xor to negate. */
+ neg_real <<= 63;
+ neg_imag <<= 63;
+
+ for (i = 0; i < opr_sz / 8; i += 2) {
+ float64 e0 = n[i];
+ float64 e1 = m[i + 1] ^ neg_imag;
+ float64 e2 = n[i + 1];
+ float64 e3 = m[i] ^ neg_real;
+
+ d[i] = float64_add(e0, e1, fpst);
+ d[i + 1] = float64_add(e2, e3, fpst);
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
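+/*
+ * FCMLA: complex multiply-accumulate on adjacent element pairs.  The
+ * two immediate bits (flip and neg_imag) choose which halves of the
+ * operands are multiplied and which products are negated, covering
+ * the four rotations of the instruction.
+ */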
+void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
+ void *vfpst, uint32_t desc)
+{
+ uintptr_t opr_sz = simd_oprsz(desc);
+ float16 *d = vd, *n = vn, *m = vm, *a = va;
+ float_status *fpst = vfpst;
+ intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
+ uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
+ uint32_t neg_real = flip ^ neg_imag;
+ uintptr_t i;
+
+ /* Shift boolean to the sign bit so we can xor to negate. */
+ neg_real <<= 15;
+ neg_imag <<= 15;
+
+ for (i = 0; i < opr_sz / 2; i += 2) {
+ float16 e2 = n[H2(i + flip)];
+ float16 e1 = m[H2(i + flip)] ^ neg_real;
+ float16 e4 = e2;
+ float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;
+
+ d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
+ d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
+ void *vfpst, uint32_t desc)
+{
+ uintptr_t opr_sz = simd_oprsz(desc);
+ float16 *d = vd, *n = vn, *m = vm, *a = va;
+ float_status *fpst = vfpst;
+ intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
+ uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
+ intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
+ uint32_t neg_real = flip ^ neg_imag;
+ intptr_t elements = opr_sz / sizeof(float16);
+ intptr_t eltspersegment = 16 / sizeof(float16);
+ intptr_t i, j;
+
+ /* Shift boolean to the sign bit so we can xor to negate. */
+ neg_real <<= 15;
+ neg_imag <<= 15;
+
+ for (i = 0; i < elements; i += eltspersegment) {
+ float16 mr = m[H2(i + 2 * index + 0)];
+ float16 mi = m[H2(i + 2 * index + 1)];
+ float16 e1 = neg_real ^ (flip ? mi : mr);
+ float16 e3 = neg_imag ^ (flip ? mr : mi);
+
+ for (j = i; j < i + eltspersegment; j += 2) {
+ float16 e2 = n[H2(j + flip)];
+ float16 e4 = e2;
+
+ d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
+ d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
+ }
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
+ void *vfpst, uint32_t desc)
+{
+ uintptr_t opr_sz = simd_oprsz(desc);
+ float32 *d = vd, *n = vn, *m = vm, *a = va;
+ float_status *fpst = vfpst;
+ intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
+ uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
+ uint32_t neg_real = flip ^ neg_imag;
+ uintptr_t i;
+
+ /* Shift boolean to the sign bit so we can xor to negate. */
+ neg_real <<= 31;
+ neg_imag <<= 31;
+
+ for (i = 0; i < opr_sz / 4; i += 2) {
+ float32 e2 = n[H4(i + flip)];
+ float32 e1 = m[H4(i + flip)] ^ neg_real;
+ float32 e4 = e2;
+ float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;
+
+ d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
+ d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
+ void *vfpst, uint32_t desc)
+{
+ uintptr_t opr_sz = simd_oprsz(desc);
+ float32 *d = vd, *n = vn, *m = vm, *a = va;
+ float_status *fpst = vfpst;
+ intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
+ uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
+ intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
+ uint32_t neg_real = flip ^ neg_imag;
+ intptr_t elements = opr_sz / sizeof(float32);
+ intptr_t eltspersegment = 16 / sizeof(float32);
+ intptr_t i, j;
+
+ /* Shift boolean to the sign bit so we can xor to negate. */
+ neg_real <<= 31;
+ neg_imag <<= 31;
+
+ for (i = 0; i < elements; i += eltspersegment) {
+ float32 mr = m[H4(i + 2 * index + 0)];
+ float32 mi = m[H4(i + 2 * index + 1)];
+ float32 e1 = neg_real ^ (flip ? mi : mr);
+ float32 e3 = neg_imag ^ (flip ? mr : mi);
+
+ for (j = i; j < i + eltspersegment; j += 2) {
+ float32 e2 = n[H4(j + flip)];
+ float32 e4 = e2;
+
+ d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
+ d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
+ }
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
+ void *vfpst, uint32_t desc)
+{
+ uintptr_t opr_sz = simd_oprsz(desc);
+ float64 *d = vd, *n = vn, *m = vm, *a = va;
+ float_status *fpst = vfpst;
+ intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
+ uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
+ uint64_t neg_real = flip ^ neg_imag;
+ uintptr_t i;
+
+ /* Shift boolean to the sign bit so we can xor to negate. */
+ neg_real <<= 63;
+ neg_imag <<= 63;
+
+ for (i = 0; i < opr_sz / 8; i += 2) {
+ float64 e2 = n[i + flip];
+ float64 e1 = m[i + flip] ^ neg_real;
+ float64 e4 = e2;
+ float64 e3 = m[i + 1 - flip] ^ neg_imag;
+
+ d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
+ d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+/*
+ * Floating point comparisons producing an integer result (all 1s or all 0s).
+ * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
+ * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
+ */
+static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
+{
+ return -float16_eq_quiet(op1, op2, stat);
+}
+
+static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
+{
+ return -float32_eq_quiet(op1, op2, stat);
+}
+
+static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
+{
+ return -float16_le(op2, op1, stat);
+}
+
+static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
+{
+ return -float32_le(op2, op1, stat);
+}
+
+static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
+{
+ return -float16_lt(op2, op1, stat);
+}
+
+static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
+{
+ return -float32_lt(op2, op1, stat);
+}
+
+static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
+{
+ return -float16_le(float16_abs(op2), float16_abs(op1), stat);
+}
+
+static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
+{
+ return -float32_le(float32_abs(op2), float32_abs(op1), stat);
+}
+
+static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
+{
+ return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
+}
+
+static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
+{
+ return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
+}
+
+static int16_t vfp_tosszh(float16 x, void *fpstp)
+{
+ float_status *fpst = fpstp;
+ if (float16_is_any_nan(x)) {
+ float_raise(float_flag_invalid, fpst);
+ return 0;
+ }
+ return float16_to_int16_round_to_zero(x, fpst);
+}
+
+static uint16_t vfp_touszh(float16 x, void *fpstp)
+{
+ float_status *fpst = fpstp;
+ if (float16_is_any_nan(x)) {
+ float_raise(float_flag_invalid, fpst);
+ return 0;
+ }
+ return float16_to_uint16_round_to_zero(x, fpst);
+}
+
+#define DO_2OP(NAME, FUNC, TYPE) \
+void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
+{ \
+ intptr_t i, oprsz = simd_oprsz(desc); \
+ TYPE *d = vd, *n = vn; \
+ for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
+ d[i] = FUNC(n[i], stat); \
+ } \
+ clear_tail(d, oprsz, simd_maxsz(desc)); \
+}
+
+DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
+DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
+DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)
+
+DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
+DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
+DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
+
+DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
+DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)
+
+DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
+DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
+DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
+DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
+DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
+DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
+DO_2OP(gvec_tosszh, vfp_tosszh, float16)
+DO_2OP(gvec_touszh, vfp_touszh, float16)
+
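+/*
+ * Compares against zero: FWD wrappers evaluate (op CMP 0) and REV
+ * wrappers (0 CMP op), so that clt and cle can reuse the cgt and cge
+ * primitives with the arguments swapped.
+ */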
+#define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \
+ static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
+ { \
+ return TYPE##_##CMPOP(op, TYPE##_zero, stat); \
+ }
+
+#define WRAP_CMP0_REV(FN, CMPOP, TYPE) \
+ static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
+ { \
+ return TYPE##_##CMPOP(TYPE##_zero, op, stat); \
+ }
+
+#define DO_2OP_CMP0(FN, CMPOP, DIRN) \
+ WRAP_CMP0_##DIRN(FN, CMPOP, float16) \
+ WRAP_CMP0_##DIRN(FN, CMPOP, float32) \
+ DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \
+ DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)
+
+DO_2OP_CMP0(cgt, cgt, FWD)
+DO_2OP_CMP0(cge, cge, FWD)
+DO_2OP_CMP0(ceq, ceq, FWD)
+DO_2OP_CMP0(clt, cgt, REV)
+DO_2OP_CMP0(cle, cge, REV)
+
+#undef DO_2OP
+#undef DO_2OP_CMP0
+
+/*
+ * Floating-point trigonometric starting value.
+ * See the ARM ARM pseudocode function FPTrigSMul.
+ */
+static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
+{
+ float16 result = float16_mul(op1, op1, stat);
+ if (!float16_is_any_nan(result)) {
+ result = float16_set_sign(result, op2 & 1);
+ }
+ return result;
+}
+
+static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
+{
+ float32 result = float32_mul(op1, op1, stat);
+ if (!float32_is_any_nan(result)) {
+ result = float32_set_sign(result, op2 & 1);
+ }
+ return result;
+}
+
+static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
+{
+ float64 result = float64_mul(op1, op1, stat);
+ if (!float64_is_any_nan(result)) {
+ result = float64_set_sign(result, op2 & 1);
+ }
+ return result;
+}
+
+static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
+{
+ return float16_abs(float16_sub(op1, op2, stat));
+}
+
+static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
+{
+ return float32_abs(float32_sub(op1, op2, stat));
+}
+
+/*
+ * Reciprocal step. These are the AArch32 versions, which use a
+ * non-fused multiply-and-subtract.
+ */
+static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
+{
+ op1 = float16_squash_input_denormal(op1, stat);
+ op2 = float16_squash_input_denormal(op2, stat);
+
+ if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
+ (float16_is_infinity(op2) && float16_is_zero(op1))) {
+ return float16_two;
+ }
+ return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
+}
+
+static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
+{
+ op1 = float32_squash_input_denormal(op1, stat);
+ op2 = float32_squash_input_denormal(op2, stat);
+
+ if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
+ (float32_is_infinity(op2) && float32_is_zero(op1))) {
+ return float32_two;
+ }
+ return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
+}
+
+/* Reciprocal square-root step. AArch32 non-fused semantics. */
+static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
+{
+ op1 = float16_squash_input_denormal(op1, stat);
+ op2 = float16_squash_input_denormal(op2, stat);
+
+ if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
+ (float16_is_infinity(op2) && float16_is_zero(op1))) {
+ return float16_one_point_five;
+ }
+ op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
+ return float16_div(op1, float16_two, stat);
+}
+
+static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
+{
+ op1 = float32_squash_input_denormal(op1, stat);
+ op2 = float32_squash_input_denormal(op2, stat);
+
+ if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
+ (float32_is_infinity(op2) && float32_is_zero(op1))) {
+ return float32_one_point_five;
+ }
+ op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
+ return float32_div(op1, float32_two, stat);
+}
+
+#define DO_3OP(NAME, FUNC, TYPE) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
+{ \
+ intptr_t i, oprsz = simd_oprsz(desc); \
+ TYPE *d = vd, *n = vn, *m = vm; \
+ for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
+ d[i] = FUNC(n[i], m[i], stat); \
+ } \
+ clear_tail(d, oprsz, simd_maxsz(desc)); \
+}
+
+DO_3OP(gvec_fadd_h, float16_add, float16)
+DO_3OP(gvec_fadd_s, float32_add, float32)
+DO_3OP(gvec_fadd_d, float64_add, float64)
+
+DO_3OP(gvec_fsub_h, float16_sub, float16)
+DO_3OP(gvec_fsub_s, float32_sub, float32)
+DO_3OP(gvec_fsub_d, float64_sub, float64)
+
+DO_3OP(gvec_fmul_h, float16_mul, float16)
+DO_3OP(gvec_fmul_s, float32_mul, float32)
+DO_3OP(gvec_fmul_d, float64_mul, float64)
+
+DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
+DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
+DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)
+
+DO_3OP(gvec_fabd_h, float16_abd, float16)
+DO_3OP(gvec_fabd_s, float32_abd, float32)
+
+DO_3OP(gvec_fceq_h, float16_ceq, float16)
+DO_3OP(gvec_fceq_s, float32_ceq, float32)
+
+DO_3OP(gvec_fcge_h, float16_cge, float16)
+DO_3OP(gvec_fcge_s, float32_cge, float32)
+
+DO_3OP(gvec_fcgt_h, float16_cgt, float16)
+DO_3OP(gvec_fcgt_s, float32_cgt, float32)
+
+DO_3OP(gvec_facge_h, float16_acge, float16)
+DO_3OP(gvec_facge_s, float32_acge, float32)
+
+DO_3OP(gvec_facgt_h, float16_acgt, float16)
+DO_3OP(gvec_facgt_s, float32_acgt, float32)
+
+DO_3OP(gvec_fmax_h, float16_max, float16)
+DO_3OP(gvec_fmax_s, float32_max, float32)
+
+DO_3OP(gvec_fmin_h, float16_min, float16)
+DO_3OP(gvec_fmin_s, float32_min, float32)
+
+DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
+DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
+
+DO_3OP(gvec_fminnum_h, float16_minnum, float16)
+DO_3OP(gvec_fminnum_s, float32_minnum, float32)
+
+DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
+DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)
+
+DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
+DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)
+
+#ifdef TARGET_AARCH64
+
+DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
+DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
+DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)
+
+DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
+DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
+DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
+
+#endif
+#undef DO_3OP
+
+/* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
+static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
+ float_status *stat)
+{
+ return float16_add(dest, float16_mul(op1, op2, stat), stat);
+}
+
+static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
+ float_status *stat)
+{
+ return float32_add(dest, float32_mul(op1, op2, stat), stat);
+}
+
+static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
+ float_status *stat)
+{
+ return float16_sub(dest, float16_mul(op1, op2, stat), stat);
+}
+
+static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
+ float_status *stat)
+{
+ return float32_sub(dest, float32_mul(op1, op2, stat), stat);
+}
+
+/* Fused versions; these have the semantics Neon VFMA/VFMS want */
+static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
+ float_status *stat)
+{
+ return float16_muladd(op1, op2, dest, 0, stat);
+}
+
+static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
+ float_status *stat)
+{
+ return float32_muladd(op1, op2, dest, 0, stat);
+}
+
+static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
+ float_status *stat)
+{
+ return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
+}
+
+static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
+ float_status *stat)
+{
+ return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
+}
+
+#define DO_MULADD(NAME, FUNC, TYPE) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
+{ \
+ intptr_t i, oprsz = simd_oprsz(desc); \
+ TYPE *d = vd, *n = vn, *m = vm; \
+ for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
+ d[i] = FUNC(d[i], n[i], m[i], stat); \
+ } \
+ clear_tail(d, oprsz, simd_maxsz(desc)); \
+}
+
+DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
+DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
+
+DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
+DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
+
+DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
+DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
+
+DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
+DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
+
+/* For the indexed ops, SVE applies the index per 128-bit vector segment.
+ * For AdvSIMD, there is of course only one such vector segment.
+ */
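+/*
+ * e.g. with 32-bit elements and a 128-bit or larger vector, segment == 4:
+ * each outer-loop iteration below covers one 128-bit segment and re-fetches
+ * m[idx] relative to that segment's base.
+ */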
+
+#define DO_MUL_IDX(NAME, TYPE, H) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+{ \
+ intptr_t i, j, oprsz = simd_oprsz(desc); \
+ intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
+ intptr_t idx = simd_data(desc); \
+ TYPE *d = vd, *n = vn, *m = vm; \
+ for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
+ TYPE mm = m[H(i + idx)]; \
+ for (j = 0; j < segment; j++) { \
+ d[i + j] = n[i + j] * mm; \
+ } \
+ } \
+ clear_tail(d, oprsz, simd_maxsz(desc)); \
+}
+
+DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
+DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
+DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
+
+#undef DO_MUL_IDX
+
+#define DO_MLA_IDX(NAME, TYPE, OP, H) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
+{ \
+ intptr_t i, j, oprsz = simd_oprsz(desc); \
+ intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
+ intptr_t idx = simd_data(desc); \
+ TYPE *d = vd, *n = vn, *m = vm, *a = va; \
+ for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
+ TYPE mm = m[H(i + idx)]; \
+ for (j = 0; j < segment; j++) { \
+ d[i + j] = a[i + j] OP n[i + j] * mm; \
+ } \
+ } \
+ clear_tail(d, oprsz, simd_maxsz(desc)); \
+}
+
+DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
+DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
+DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
+
+DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
+DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
+DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
+
+#undef DO_MLA_IDX
+
+#define DO_FMUL_IDX(NAME, ADD, TYPE, H) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
+{ \
+ intptr_t i, j, oprsz = simd_oprsz(desc); \
+ intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
+ intptr_t idx = simd_data(desc); \
+ TYPE *d = vd, *n = vn, *m = vm; \
+ for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
+ TYPE mm = m[H(i + idx)]; \
+ for (j = 0; j < segment; j++) { \
+ d[i + j] = TYPE##_##ADD(d[i + j], \
+ TYPE##_mul(n[i + j], mm, stat), stat); \
+ } \
+ } \
+ clear_tail(d, oprsz, simd_maxsz(desc)); \
+}
+
+#define float16_nop(N, M, S) (M)
+#define float32_nop(N, M, S) (M)
+#define float64_nop(N, M, S) (M)
+
+DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16, H2)
+DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32, H4)
+DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64, H8)
+
+/*
+ * Non-fused multiply-accumulate operations, for Neon. NB that unlike
+ * the fused ops below, these accumulate both from and into Vd.
+ */
+DO_FMUL_IDX(gvec_fmla_nf_idx_h, add, float16, H2)
+DO_FMUL_IDX(gvec_fmla_nf_idx_s, add, float32, H4)
+DO_FMUL_IDX(gvec_fmls_nf_idx_h, sub, float16, H2)
+DO_FMUL_IDX(gvec_fmls_nf_idx_s, sub, float32, H4)
+
+#undef float16_nop
+#undef float32_nop
+#undef float64_nop
+#undef DO_FMUL_IDX
+
+#define DO_FMLA_IDX(NAME, TYPE, H) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \
+ void *stat, uint32_t desc) \
+{ \
+ intptr_t i, j, oprsz = simd_oprsz(desc); \
+ intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
+ TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \
+ intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \
+ TYPE *d = vd, *n = vn, *m = vm, *a = va; \
+ op1_neg <<= (8 * sizeof(TYPE) - 1); \
+ for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
+ TYPE mm = m[H(i + idx)]; \
+ for (j = 0; j < segment; j++) { \
+ d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg, \
+ mm, a[i + j], 0, stat); \
+ } \
+ } \
+ clear_tail(d, oprsz, simd_maxsz(desc)); \
+}
+
+DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
+DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
+DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)
+
+#undef DO_FMLA_IDX
+
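+/*
+ * Saturating add/sub: vq points at the QC (cumulative saturation) flag,
+ * which is set to 1 if any lane saturates.
+ */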
+#define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
+void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \
+{ \
+ intptr_t i, oprsz = simd_oprsz(desc); \
+ TYPEN *d = vd, *n = vn; TYPEM *m = vm; \
+ bool q = false; \
+ for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \
+ WTYPE dd = (WTYPE)n[i] OP m[i]; \
+ if (dd < MIN) { \
+ dd = MIN; \
+ q = true; \
+ } else if (dd > MAX) { \
+ dd = MAX; \
+ q = true; \
+ } \
+ d[i] = dd; \
+ } \
+ if (q) { \
+ uint32_t *qc = vq; \
+ qc[0] = 1; \
+ } \
+ clear_tail(d, oprsz, simd_maxsz(desc)); \
+}
+
+DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
+DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
+DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
+
+DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
+DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
+DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
+
+DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
+DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
+DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
+
+DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
+DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
+DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
+
+#undef DO_SAT
+
+void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
+ void *vm, uint32_t desc)
+{
+ intptr_t i, oprsz = simd_oprsz(desc);
+ uint64_t *d = vd, *n = vn, *m = vm;
+ bool q = false;
+
+ for (i = 0; i < oprsz / 8; i++) {
+ uint64_t nn = n[i], mm = m[i], dd = nn + mm;
+ if (dd < nn) {
+ dd = UINT64_MAX;
+ q = true;
+ }
+ d[i] = dd;
+ }
+ if (q) {
+ uint32_t *qc = vq;
+ qc[0] = 1;
+ }
+ clear_tail(d, oprsz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
+ void *vm, uint32_t desc)
+{
+ intptr_t i, oprsz = simd_oprsz(desc);
+ uint64_t *d = vd, *n = vn, *m = vm;
+ bool q = false;
+
+ for (i = 0; i < oprsz / 8; i++) {
+ uint64_t nn = n[i], mm = m[i], dd = nn - mm;
+ if (nn < mm) {
+ dd = 0;
+ q = true;
+ }
+ d[i] = dd;
+ }
+ if (q) {
+ uint32_t *qc = vq;
+ qc[0] = 1;
+ }
+ clear_tail(d, oprsz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
+ void *vm, uint32_t desc)
+{
+ intptr_t i, oprsz = simd_oprsz(desc);
+ int64_t *d = vd, *n = vn, *m = vm;
+ bool q = false;
+
+ for (i = 0; i < oprsz / 8; i++) {
+ int64_t nn = n[i], mm = m[i], dd = nn + mm;
+ if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
+ dd = (nn >> 63) ^ ~INT64_MIN;
+ q = true;
+ }
+ d[i] = dd;
+ }
+ if (q) {
+ uint32_t *qc = vq;
+ qc[0] = 1;
+ }
+ clear_tail(d, oprsz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
+ void *vm, uint32_t desc)
+{
+ intptr_t i, oprsz = simd_oprsz(desc);
+ int64_t *d = vd, *n = vn, *m = vm;
+ bool q = false;
+
+ for (i = 0; i < oprsz / 8; i++) {
+ int64_t nn = n[i], mm = m[i], dd = nn - mm;
+ if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
+ dd = (nn >> 63) ^ ~INT64_MIN;
+ q = true;
+ }
+ d[i] = dd;
+ }
+ if (q) {
+ uint32_t *qc = vq;
+ qc[0] = 1;
+ }
+ clear_tail(d, oprsz, simd_maxsz(desc));
+}
+
+
+#define DO_SRA(NAME, TYPE) \
+void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
+{ \
+ intptr_t i, oprsz = simd_oprsz(desc); \
+ int shift = simd_data(desc); \
+ TYPE *d = vd, *n = vn; \
+ for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
+ d[i] += n[i] >> shift; \
+ } \
+ clear_tail(d, oprsz, simd_maxsz(desc)); \
+}
+
+DO_SRA(gvec_ssra_b, int8_t)
+DO_SRA(gvec_ssra_h, int16_t)
+DO_SRA(gvec_ssra_s, int32_t)
+DO_SRA(gvec_ssra_d, int64_t)
+
+DO_SRA(gvec_usra_b, uint8_t)
+DO_SRA(gvec_usra_h, uint16_t)
+DO_SRA(gvec_usra_s, uint32_t)
+DO_SRA(gvec_usra_d, uint64_t)
+
+#undef DO_SRA
+
+#define DO_RSHR(NAME, TYPE) \
+void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
+{ \
+ intptr_t i, oprsz = simd_oprsz(desc); \
+ int shift = simd_data(desc); \
+ TYPE *d = vd, *n = vn; \
+ for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
+ TYPE tmp = n[i] >> (shift - 1); \
+ d[i] = (tmp >> 1) + (tmp & 1); \
+ } \
+ clear_tail(d, oprsz, simd_maxsz(desc)); \
+}
+
+DO_RSHR(gvec_srshr_b, int8_t)
+DO_RSHR(gvec_srshr_h, int16_t)
+DO_RSHR(gvec_srshr_s, int32_t)
+DO_RSHR(gvec_srshr_d, int64_t)
+
+DO_RSHR(gvec_urshr_b, uint8_t)
+DO_RSHR(gvec_urshr_h, uint16_t)
+DO_RSHR(gvec_urshr_s, uint32_t)
+DO_RSHR(gvec_urshr_d, uint64_t)
+
+#undef DO_RSHR
+
+#define DO_RSRA(NAME, TYPE) \
+void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
+{ \
+ intptr_t i, oprsz = simd_oprsz(desc); \
+ int shift = simd_data(desc); \
+ TYPE *d = vd, *n = vn; \
+ for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
+ TYPE tmp = n[i] >> (shift - 1); \
+ d[i] += (tmp >> 1) + (tmp & 1); \
+ } \
+ clear_tail(d, oprsz, simd_maxsz(desc)); \
+}
+
+DO_RSRA(gvec_srsra_b, int8_t)
+DO_RSRA(gvec_srsra_h, int16_t)
+DO_RSRA(gvec_srsra_s, int32_t)
+DO_RSRA(gvec_srsra_d, int64_t)
+
+DO_RSRA(gvec_ursra_b, uint8_t)
+DO_RSRA(gvec_ursra_h, uint16_t)
+DO_RSRA(gvec_ursra_s, uint32_t)
+DO_RSRA(gvec_ursra_d, uint64_t)
+
+#undef DO_RSRA
+
+#define DO_SRI(NAME, TYPE) \
+void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
+{ \
+ intptr_t i, oprsz = simd_oprsz(desc); \
+ int shift = simd_data(desc); \
+ TYPE *d = vd, *n = vn; \
+ for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
+ d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
+ } \
+ clear_tail(d, oprsz, simd_maxsz(desc)); \
+}
+
+DO_SRI(gvec_sri_b, uint8_t)
+DO_SRI(gvec_sri_h, uint16_t)
+DO_SRI(gvec_sri_s, uint32_t)
+DO_SRI(gvec_sri_d, uint64_t)
+
+#undef DO_SRI
+
+#define DO_SLI(NAME, TYPE) \
+void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
+{ \
+ intptr_t i, oprsz = simd_oprsz(desc); \
+ int shift = simd_data(desc); \
+ TYPE *d = vd, *n = vn; \
+ for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
+ d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
+ } \
+ clear_tail(d, oprsz, simd_maxsz(desc)); \
+}
+
+DO_SLI(gvec_sli_b, uint8_t)
+DO_SLI(gvec_sli_h, uint16_t)
+DO_SLI(gvec_sli_s, uint32_t)
+DO_SLI(gvec_sli_d, uint64_t)
+
+#undef DO_SLI
+
+/*
+ * Convert float16 to float32, raising no exceptions and
+ * preserving exceptional values, including SNaN.
+ * This is effectively an unpack+repack operation.
+ */
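+/*
+ * e.g. the float16 default NaN 0x7e00 maps to 0x7fc00000, the float32
+ * default NaN, with the NaN payload shifted into the top of the float32
+ * fraction and no exception raised.
+ */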
+static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
+{
+ const int f16_bias = 15;
+ const int f32_bias = 127;
+ uint32_t sign = extract32(f16, 15, 1);
+ uint32_t exp = extract32(f16, 10, 5);
+ uint32_t frac = extract32(f16, 0, 10);
+
+ if (exp == 0x1f) {
+ /* Inf or NaN */
+ exp = 0xff;
+ } else if (exp == 0) {
+ /* Zero or denormal. */
+ if (frac != 0) {
+ if (fz16) {
+ frac = 0;
+ } else {
+ /*
+ * Denormal; these are all normal float32.
+ * Shift the fraction so that the msb is at bit 11,
+ * then remove bit 11 as the implicit bit of the
+ * normalized float32. Note that we still go through
+ * the shift for normal numbers below, to put the
+ * float32 fraction at the right place.
+ */
+ int shift = clz32(frac) - 21;
+ frac = (frac << shift) & 0x3ff;
+ exp = f32_bias - f16_bias - shift + 1;
+ }
+ }
+ } else {
+ /* Normal number; adjust the bias. */
+ exp += f32_bias - f16_bias;
+ }
+ sign <<= 31;
+ exp <<= 23;
+ frac <<= 23 - 10;
+
+ return sign | exp | frac;
+}
+
+static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
+{
+ /*
+ * Branchless load of u32[0], u64[0], u32[1], or u64[1].
+ * Load the 2nd qword iff is_q & is_2.
+ * Shift to the 2nd dword iff !is_q & is_2.
+ * For !is_q & !is_2, the upper bits of the result are garbage.
+ */
+ return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
+}
+
+/*
+ * Note that FMLAL requires oprsz == 8 or oprsz == 16,
+ * as there are not yet SVE versions that might use blocking.
+ */
+
+static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
+ uint32_t desc, bool fz16)
+{
+ intptr_t i, oprsz = simd_oprsz(desc);
+ int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
+ int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
+ int is_q = oprsz == 16;
+ uint64_t n_4, m_4;
+
+ /* Pre-load all of the f16 data, avoiding overlap issues. */
+ n_4 = load4_f16(vn, is_q, is_2);
+ m_4 = load4_f16(vm, is_q, is_2);
+
+ /* Negate all inputs for FMLSL at once. */
+ if (is_s) {
+ n_4 ^= 0x8000800080008000ull;
+ }
+
+ for (i = 0; i < oprsz / 4; i++) {
+ float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
+ float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
+ d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
+ }
+ clear_tail(d, oprsz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
+ void *venv, uint32_t desc)
+{
+ CPUARMState *env = venv;
+ do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
+ get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
+}
+
+void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
+ void *venv, uint32_t desc)
+{
+ CPUARMState *env = venv;
+ do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
+ get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
+}
+
+void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
+ void *venv, uint32_t desc)
+{
+ intptr_t i, oprsz = simd_oprsz(desc);
+ uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
+ intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
+ CPUARMState *env = venv;
+ float_status *status = &env->vfp.fp_status;
+ bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
+
+ for (i = 0; i < oprsz; i += sizeof(float32)) {
+ float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
+ float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
+ float32 nn = float16_to_float32_by_bits(nn_16, fz16);
+ float32 mm = float16_to_float32_by_bits(mm_16, fz16);
+ float32 aa = *(float32 *)(va + H1_4(i));
+
+ *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
+ }
+}
+
+static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
+ uint32_t desc, bool fz16)
+{
+ intptr_t i, oprsz = simd_oprsz(desc);
+ int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
+ int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
+ int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
+ int is_q = oprsz == 16;
+ uint64_t n_4;
+ float32 m_1;
+
+ /* Pre-load all of the f16 data, avoiding overlap issues. */
+ n_4 = load4_f16(vn, is_q, is_2);
+
+ /* Negate all inputs for FMLSL at once. */
+ if (is_s) {
+ n_4 ^= 0x8000800080008000ull;
+ }
+
+ m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
+
+ for (i = 0; i < oprsz / 4; i++) {
+ float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
+ d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
+ }
+ clear_tail(d, oprsz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
+ void *venv, uint32_t desc)
+{
+ CPUARMState *env = venv;
+ do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
+ get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
+}
+
+void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
+ void *venv, uint32_t desc)
+{
+ CPUARMState *env = venv;
+ do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
+ get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
+}
+
+void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
+ void *venv, uint32_t desc)
+{
+ intptr_t i, j, oprsz = simd_oprsz(desc);
+ uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
+ intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
+ intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
+ CPUARMState *env = venv;
+ float_status *status = &env->vfp.fp_status;
+ bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
+
+ for (i = 0; i < oprsz; i += 16) {
+ float16 mm_16 = *(float16 *)(vm + i + idx);
+ float32 mm = float16_to_float32_by_bits(mm_16, fz16);
+
+ for (j = 0; j < 16; j += sizeof(float32)) {
+ float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
+ float32 nn = float16_to_float32_by_bits(nn_16, fz16);
+ float32 aa = *(float32 *)(va + H1_4(i + j));
+
+ *(float32 *)(vd + H1_4(i + j)) =
+ float32_muladd(nn, mm, aa, 0, status);
+ }
+ }
+}
+
+void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int8_t *d = vd, *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz; ++i) {
+ int8_t mm = m[i];
+ int8_t nn = n[i];
+ int8_t res = 0;
+ if (mm >= 0) {
+ if (mm < 8) {
+ res = nn << mm;
+ }
+ } else {
+ res = nn >> (mm > -8 ? -mm : 7);
+ }
+ d[i] = res;
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int16_t *d = vd, *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz / 2; ++i) {
+ int8_t mm = m[i]; /* only 8 bits of shift are significant */
+ int16_t nn = n[i];
+ int16_t res = 0;
+ if (mm >= 0) {
+ if (mm < 16) {
+ res = nn << mm;
+ }
+ } else {
+ res = nn >> (mm > -16 ? -mm : 15);
+ }
+ d[i] = res;
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ uint8_t *d = vd, *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz; ++i) {
+ int8_t mm = m[i];
+ uint8_t nn = n[i];
+ uint8_t res = 0;
+ if (mm >= 0) {
+ if (mm < 8) {
+ res = nn << mm;
+ }
+ } else {
+ if (mm > -8) {
+ res = nn >> -mm;
+ }
+ }
+ d[i] = res;
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ uint16_t *d = vd, *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz / 2; ++i) {
+ int8_t mm = m[i]; /* only 8 bits of shift are significant */
+ uint16_t nn = n[i];
+ uint16_t res = 0;
+ if (mm >= 0) {
+ if (mm < 16) {
+ res = nn << mm;
+ }
+ } else {
+ if (mm > -16) {
+ res = nn >> -mm;
+ }
+ }
+ d[i] = res;
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+/*
+ * 8x8->8 polynomial multiply.
+ *
+ * Polynomial multiplication is like integer multiplication except the
+ * partial products are XORed, not added.
+ *
+ * TODO: expose this as a generic vector operation, as it is a common
+ * crypto building block.
+ */
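+/*
+ * Worked example for one byte lane: 0x03 * 0x03 is (x + 1)(x + 1) over
+ * GF(2), i.e. x^2 + x + x + 1; the two middle terms cancel under XOR,
+ * giving x^2 + 1 = 0x05 (whereas integer multiplication gives 0x09).
+ */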
+void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, j, opr_sz = simd_oprsz(desc);
+ uint64_t *d = vd, *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz / 8; ++i) {
+ uint64_t nn = n[i];
+ uint64_t mm = m[i];
+ uint64_t rr = 0;
+
+ for (j = 0; j < 8; ++j) {
+ uint64_t mask = (nn & 0x0101010101010101ull) * 0xff;
+ rr ^= mm & mask;
+ mm = (mm << 1) & 0xfefefefefefefefeull;
+ nn >>= 1;
+ }
+ d[i] = rr;
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+/*
+ * 64x64->128 polynomial multiply.
+ * Because the lanes are not accessed in strict columns,
+ * this probably cannot be turned into a generic helper.
+ */
+void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, j, opr_sz = simd_oprsz(desc);
+ intptr_t hi = simd_data(desc);
+ uint64_t *d = vd, *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz / 8; i += 2) {
+ uint64_t nn = n[i + hi];
+ uint64_t mm = m[i + hi];
+ uint64_t rhi = 0;
+ uint64_t rlo = 0;
+
+ /* Bit 0 can only influence the low 64-bit result. */
+ if (nn & 1) {
+ rlo = mm;
+ }
+
+ for (j = 1; j < 64; ++j) {
+ uint64_t mask = -((nn >> j) & 1);
+ rlo ^= (mm << j) & mask;
+ rhi ^= (mm >> (64 - j)) & mask;
+ }
+ d[i] = rlo;
+ d[i + 1] = rhi;
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+/*
+ * 8x8->16 polynomial multiply.
+ *
+ * The byte inputs are expanded to (or extracted from) half-words.
+ * Note that neon and sve2 get the inputs from different positions.
+ * This allows 4 bytes to be processed in parallel with uint64_t.
+ */
+
+static uint64_t expand_byte_to_half(uint64_t x)
+{
+ return (x & 0x000000ff)
+ | ((x & 0x0000ff00) << 8)
+ | ((x & 0x00ff0000) << 16)
+ | ((x & 0xff000000) << 24);
+}
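+/*
+ * e.g. expand_byte_to_half(0x44332211) == 0x0044003300220011: each input
+ * byte lands in the low byte of a 16-bit lane, ready for pmull_h.
+ */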
+
+uint64_t pmull_w(uint64_t op1, uint64_t op2)
+{
+ uint64_t result = 0;
+ int i;
+ for (i = 0; i < 16; ++i) {
+ uint64_t mask = (op1 & 0x0000000100000001ull) * 0xffffffff;
+ result ^= op2 & mask;
+ op1 >>= 1;
+ op2 <<= 1;
+ }
+ return result;
+}
+
+uint64_t pmull_h(uint64_t op1, uint64_t op2)
+{
+ uint64_t result = 0;
+ int i;
+ for (i = 0; i < 8; ++i) {
+ uint64_t mask = (op1 & 0x0001000100010001ull) * 0xffff;
+ result ^= op2 & mask;
+ op1 >>= 1;
+ op2 <<= 1;
+ }
+ return result;
+}
+
+void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ int hi = simd_data(desc);
+ uint64_t *d = vd, *n = vn, *m = vm;
+ uint64_t nn = n[hi], mm = m[hi];
+
+ d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
+ nn >>= 32;
+ mm >>= 32;
+ d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
+
+ clear_tail(d, 16, simd_maxsz(desc));
+}
+
+#ifdef TARGET_AARCH64
+void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ int shift = simd_data(desc) * 8;
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ uint64_t *d = vd, *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz / 8; ++i) {
+ uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull;
+ uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull;
+
+ d[i] = pmull_h(nn, mm);
+ }
+}
+
+static uint64_t pmull_d(uint64_t op1, uint64_t op2)
+{
+ uint64_t result = 0;
+ int i;
+
+ for (i = 0; i < 32; ++i) {
+ uint64_t mask = -((op1 >> i) & 1);
+ result ^= (op2 << i) & mask;
+ }
+ return result;
+}
+
+void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t sel = H4(simd_data(desc));
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ uint32_t *n = vn, *m = vm;
+ uint64_t *d = vd;
+
+ for (i = 0; i < opr_sz / 8; ++i) {
+ d[i] = pmull_d(n[2 * i + sel], m[2 * i + sel]);
+ }
+}
+#endif
+
+#define DO_CMP0(NAME, TYPE, OP) \
+void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
+ TYPE nn = *(TYPE *)(vn + i); \
+ *(TYPE *)(vd + i) = -(nn OP 0); \
+ } \
+ clear_tail(vd, opr_sz, simd_maxsz(desc)); \
+}
+
+DO_CMP0(gvec_ceq0_b, int8_t, ==)
+DO_CMP0(gvec_clt0_b, int8_t, <)
+DO_CMP0(gvec_cle0_b, int8_t, <=)
+DO_CMP0(gvec_cgt0_b, int8_t, >)
+DO_CMP0(gvec_cge0_b, int8_t, >=)
+
+DO_CMP0(gvec_ceq0_h, int16_t, ==)
+DO_CMP0(gvec_clt0_h, int16_t, <)
+DO_CMP0(gvec_cle0_h, int16_t, <=)
+DO_CMP0(gvec_cgt0_h, int16_t, >)
+DO_CMP0(gvec_cge0_h, int16_t, >=)
+
+#undef DO_CMP0
+
+#define DO_ABD(NAME, TYPE) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ TYPE *d = vd, *n = vn, *m = vm; \
+ \
+ for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \
+ d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \
+ } \
+ clear_tail(d, opr_sz, simd_maxsz(desc)); \
+}
+
+DO_ABD(gvec_sabd_b, int8_t)
+DO_ABD(gvec_sabd_h, int16_t)
+DO_ABD(gvec_sabd_s, int32_t)
+DO_ABD(gvec_sabd_d, int64_t)
+
+DO_ABD(gvec_uabd_b, uint8_t)
+DO_ABD(gvec_uabd_h, uint16_t)
+DO_ABD(gvec_uabd_s, uint32_t)
+DO_ABD(gvec_uabd_d, uint64_t)
+
+#undef DO_ABD
+
+#define DO_ABA(NAME, TYPE) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ TYPE *d = vd, *n = vn, *m = vm; \
+ \
+ for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \
+ d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \
+ } \
+ clear_tail(d, opr_sz, simd_maxsz(desc)); \
+}
+
+DO_ABA(gvec_saba_b, int8_t)
+DO_ABA(gvec_saba_h, int16_t)
+DO_ABA(gvec_saba_s, int32_t)
+DO_ABA(gvec_saba_d, int64_t)
+
+DO_ABA(gvec_uaba_b, uint8_t)
+DO_ABA(gvec_uaba_h, uint16_t)
+DO_ABA(gvec_uaba_s, uint32_t)
+DO_ABA(gvec_uaba_d, uint64_t)
+
+#undef DO_ABA
+
+#define DO_NEON_PAIRWISE(NAME, OP) \
+ void HELPER(NAME##s)(void *vd, void *vn, void *vm, \
+ void *stat, uint32_t oprsz) \
+ { \
+ float_status *fpst = stat; \
+ float32 *d = vd; \
+ float32 *n = vn; \
+ float32 *m = vm; \
+ float32 r0, r1; \
+ \
+ /* Read all inputs before writing outputs in case vm == vd */ \
+ r0 = float32_##OP(n[H4(0)], n[H4(1)], fpst); \
+ r1 = float32_##OP(m[H4(0)], m[H4(1)], fpst); \
+ \
+ d[H4(0)] = r0; \
+ d[H4(1)] = r1; \
+ } \
+ \
+ void HELPER(NAME##h)(void *vd, void *vn, void *vm, \
+ void *stat, uint32_t oprsz) \
+ { \
+ float_status *fpst = stat; \
+ float16 *d = vd; \
+ float16 *n = vn; \
+ float16 *m = vm; \
+ float16 r0, r1, r2, r3; \
+ \
+ /* Read all inputs before writing outputs in case vm == vd */ \
+ r0 = float16_##OP(n[H2(0)], n[H2(1)], fpst); \
+ r1 = float16_##OP(n[H2(2)], n[H2(3)], fpst); \
+ r2 = float16_##OP(m[H2(0)], m[H2(1)], fpst); \
+ r3 = float16_##OP(m[H2(2)], m[H2(3)], fpst); \
+ \
+ d[H2(0)] = r0; \
+ d[H2(1)] = r1; \
+ d[H2(2)] = r2; \
+ d[H2(3)] = r3; \
+ }
+
+DO_NEON_PAIRWISE(neon_padd, add)
+DO_NEON_PAIRWISE(neon_pmax, max)
+DO_NEON_PAIRWISE(neon_pmin, min)
+
+#undef DO_NEON_PAIRWISE
+
+#define DO_VCVT_FIXED(NAME, FUNC, TYPE) \
+ void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
+ { \
+ intptr_t i, oprsz = simd_oprsz(desc); \
+ int shift = simd_data(desc); \
+ TYPE *d = vd, *n = vn; \
+ float_status *fpst = stat; \
+ for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
+ d[i] = FUNC(n[i], shift, fpst); \
+ } \
+ clear_tail(d, oprsz, simd_maxsz(desc)); \
+ }
+
+DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
+DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
+DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t)
+DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t)
+DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
+DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
+DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t)
+DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t)
+
+#undef DO_VCVT_FIXED
+
+#define DO_VCVT_RMODE(NAME, FUNC, TYPE) \
+ void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
+ { \
+ float_status *fpst = stat; \
+ intptr_t i, oprsz = simd_oprsz(desc); \
+ uint32_t rmode = simd_data(desc); \
+ uint32_t prev_rmode = get_float_rounding_mode(fpst); \
+ TYPE *d = vd, *n = vn; \
+ set_float_rounding_mode(rmode, fpst); \
+ for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
+ d[i] = FUNC(n[i], 0, fpst); \
+ } \
+ set_float_rounding_mode(prev_rmode, fpst); \
+ clear_tail(d, oprsz, simd_maxsz(desc)); \
+ }
+
+DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
+DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
+DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
+DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
+
+#undef DO_VCVT_RMODE
+
+#define DO_VRINT_RMODE(NAME, FUNC, TYPE) \
+ void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
+ { \
+ float_status *fpst = stat; \
+ intptr_t i, oprsz = simd_oprsz(desc); \
+ uint32_t rmode = simd_data(desc); \
+ uint32_t prev_rmode = get_float_rounding_mode(fpst); \
+ TYPE *d = vd, *n = vn; \
+ set_float_rounding_mode(rmode, fpst); \
+ for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
+ d[i] = FUNC(n[i], fpst); \
+ } \
+ set_float_rounding_mode(prev_rmode, fpst); \
+ clear_tail(d, oprsz, simd_maxsz(desc)); \
+ }
+
+DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
+DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
+
+#undef DO_VRINT_RMODE
+
+#ifdef TARGET_AARCH64
+void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
+{
+ const uint8_t *indices = vm;
+ CPUARMState *env = venv;
+ size_t oprsz = simd_oprsz(desc);
+ uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
+ bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
+ uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
+ union {
+ uint8_t b[16];
+ uint64_t d[2];
+ } result;
+
+ /*
+ * We must construct the final result in a temp, lest the output
+ * overlaps the input table. For TBL, begin with zero; for TBX,
+ * begin with the original register contents. Note that we always
+ * copy 16 bytes here to avoid an extra branch; clearing the high
+ * bits of the register for oprsz == 8 is handled below.
+ */
+ if (is_tbx) {
+ memcpy(&result, vd, 16);
+ } else {
+ memset(&result, 0, 16);
+ }
+
+ for (size_t i = 0; i < oprsz; ++i) {
+ uint32_t index = indices[H1(i)];
+
+ if (index < table_len) {
+ /*
+ * Convert index (a byte offset into the virtual table
+ * which is a series of 128-bit vectors concatenated)
+ * into the correct register element, bearing in mind
+ * that the table can wrap around from V31 to V0.
+ */
+ const uint8_t *table = (const uint8_t *)
+ aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
+ result.b[H1(i)] = table[H1(index % 16)];
+ }
+ }
+
+ memcpy(vd, &result, 16);
+ clear_tail(vd, oprsz, simd_maxsz(desc));
+}
+#endif
+
+/*
+ * NxN -> N highpart multiply
+ *
+ * TODO: expose this as a generic vector operation.
+ */
+
+void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int8_t *d = vd, *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz; ++i) {
+ d[i] = ((int32_t)n[i] * m[i]) >> 8;
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int16_t *d = vd, *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz / 2; ++i) {
+ d[i] = ((int32_t)n[i] * m[i]) >> 16;
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ int32_t *d = vd, *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz / 4; ++i) {
+ d[i] = ((int64_t)n[i] * m[i]) >> 32;
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ uint64_t *d = vd, *n = vn, *m = vm;
+ uint64_t discard;
+
+ for (i = 0; i < opr_sz / 8; ++i) {
+ muls64(&discard, &d[i], n[i], m[i]);
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ uint8_t *d = vd, *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz; ++i) {
+ d[i] = ((uint32_t)n[i] * m[i]) >> 8;
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ uint16_t *d = vd, *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz / 2; ++i) {
+ d[i] = ((uint32_t)n[i] * m[i]) >> 16;
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ uint32_t *d = vd, *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz / 4; ++i) {
+ d[i] = ((uint64_t)n[i] * m[i]) >> 32;
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ uint64_t *d = vd, *n = vn, *m = vm;
+ uint64_t discard;
+
+ for (i = 0; i < opr_sz / 8; ++i) {
+ mulu64(&discard, &d[i], n[i], m[i]);
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
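+/* XAR: XOR each pair of 64-bit lanes, then rotate the result right by shr. */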
+void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ int shr = simd_data(desc);
+ uint64_t *d = vd, *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz; ++i) {
+ d[i] = ror64(n[i] ^ m[i], shr);
+ }
+ clear_tail(d, opr_sz * 8, simd_maxsz(desc));
+}
+
+/*
+ * Integer matrix-multiply accumulate
+ */
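+/*
+ * Each 128-bit segment is treated as a 2x2 matrix of 32-bit accumulators:
+ * element [i][j] gains the 8-way dot product of row i of n with row j of m
+ * (i.e. an N x M-transposed product), on top of the value from a.
+ */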
+
+static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
+{
+ int8_t *n = vn, *m = vm;
+
+ for (intptr_t k = 0; k < 8; ++k) {
+ sum += n[H1(k)] * m[H1(k)];
+ }
+ return sum;
+}
+
+static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
+{
+ uint8_t *n = vn, *m = vm;
+
+ for (intptr_t k = 0; k < 8; ++k) {
+ sum += n[H1(k)] * m[H1(k)];
+ }
+ return sum;
+}
+
+static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
+{
+ uint8_t *n = vn;
+ int8_t *m = vm;
+
+ for (intptr_t k = 0; k < 8; ++k) {
+ sum += n[H1(k)] * m[H1(k)];
+ }
+ return sum;
+}
+
+static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
+ uint32_t (*inner_loop)(uint32_t, void *, void *))
+{
+ intptr_t seg, opr_sz = simd_oprsz(desc);
+
+ for (seg = 0; seg < opr_sz; seg += 16) {
+ uint32_t *d = vd + seg;
+ uint32_t *a = va + seg;
+ uint32_t sum0, sum1, sum2, sum3;
+
+ /*
+ * Process the entire segment at once, writing back the
+ * results only after we've consumed all of the inputs.
+ *
+ * Key to indices by column:
+ * i j i j
+ */
+ sum0 = a[H4(0 + 0)];
+ sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
+ sum1 = a[H4(0 + 1)];
+ sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
+ sum2 = a[H4(2 + 0)];
+ sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
+ sum3 = a[H4(2 + 1)];
+ sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);
+
+ d[H4(0)] = sum0;
+ d[H4(1)] = sum1;
+ d[H4(2)] = sum2;
+ d[H4(3)] = sum3;
+ }
+ clear_tail(vd, opr_sz, simd_maxsz(desc));
+}
+
+#define DO_MMLA_B(NAME, INNER) \
+ void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
+ { do_mmla_b(vd, vn, vm, va, desc, INNER); }
+
+DO_MMLA_B(gvec_smmla_b, do_smmla_b)
+DO_MMLA_B(gvec_ummla_b, do_ummla_b)
+DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
+
+/*
+ * BFloat16 Dot Product
+ */
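+/*
+ * A bfloat16 value is the high half of the corresponding float32, so
+ * shifting an element left by 16 (or masking off the low half in place)
+ * reinterprets it as float32 exactly, with no rounding needed.
+ */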
+
+float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2)
+{
+ /* FPCR is ignored for BFDOT and BFMMLA. */
+ float_status bf_status = {
+ .tininess_before_rounding = float_tininess_before_rounding,
+ .float_rounding_mode = float_round_to_odd_inf,
+ .flush_to_zero = true,
+ .flush_inputs_to_zero = true,
+ .default_nan_mode = true,
+ };
+ float32 t1, t2;
+
+ /*
+ * Extract each BFloat16 from the element pair, and shift
+ * them such that they become float32.
+ */
+ t1 = float32_mul(e1 << 16, e2 << 16, &bf_status);
+ t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status);
+ t1 = float32_add(t1, t2, &bf_status);
+ t1 = float32_add(sum, t1, &bf_status);
+
+ return t1;
+}
+
+void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ float32 *d = vd, *a = va;
+ uint32_t *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz / 4; ++i) {
+ d[i] = bfdotadd(a[i], n[i], m[i]);
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
+ void *va, uint32_t desc)
+{
+ intptr_t i, j, opr_sz = simd_oprsz(desc);
+ intptr_t index = simd_data(desc);
+ intptr_t elements = opr_sz / 4;
+ intptr_t eltspersegment = MIN(16 / 4, elements);
+ float32 *d = vd, *a = va;
+ uint32_t *n = vn, *m = vm;
+
+ for (i = 0; i < elements; i += eltspersegment) {
+ uint32_t m_idx = m[i + H4(index)];
+
+ for (j = i; j < i + eltspersegment; j++) {
+ d[j] = bfdotadd(a[j], n[j], m_idx);
+ }
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
+{
+ intptr_t s, opr_sz = simd_oprsz(desc);
+ float32 *d = vd, *a = va;
+ uint32_t *n = vn, *m = vm;
+
+ for (s = 0; s < opr_sz / 4; s += 4) {
+ float32 sum00, sum01, sum10, sum11;
+
+ /*
+ * Process the entire segment at once, writing back the
+ * results only after we've consumed all of the inputs.
+ *
+ * Key to indices by column:
+ * i j i k j k
+ */
+ sum00 = a[s + H4(0 + 0)];
+ sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]);
+ sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]);
+
+ sum01 = a[s + H4(0 + 1)];
+ sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]);
+ sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]);
+
+ sum10 = a[s + H4(2 + 0)];
+ sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]);
+ sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]);
+
+ sum11 = a[s + H4(2 + 1)];
+ sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]);
+ sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]);
+
+ d[s + H4(0 + 0)] = sum00;
+ d[s + H4(0 + 1)] = sum01;
+ d[s + H4(2 + 0)] = sum10;
+ d[s + H4(2 + 1)] = sum11;
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
+ void *stat, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ intptr_t sel = simd_data(desc);
+ float32 *d = vd, *a = va;
+ bfloat16 *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz / 4; ++i) {
+ float32 nn = n[H2(i * 2 + sel)] << 16;
+ float32 mm = m[H2(i * 2 + sel)] << 16;
+ d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
+ void *va, void *stat, uint32_t desc)
+{
+ intptr_t i, j, opr_sz = simd_oprsz(desc);
+ intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
+ intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
+ intptr_t elements = opr_sz / 4;
+ intptr_t eltspersegment = MIN(16 / 4, elements);
+ float32 *d = vd, *a = va;
+ bfloat16 *n = vn, *m = vm;
+
+ for (i = 0; i < elements; i += eltspersegment) {
+ float32 m_idx = m[H2(2 * i + index)] << 16;
+
+ for (j = i; j < i + eltspersegment; j++) {
+ float32 n_j = n[H2(2 * j + sel)] << 16;
+ d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
+ }
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+#define DO_CLAMP(NAME, TYPE) \
+void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc) \
+{ \
+ intptr_t i, opr_sz = simd_oprsz(desc); \
+ for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
+ TYPE aa = *(TYPE *)(a + i); \
+ TYPE nn = *(TYPE *)(n + i); \
+ TYPE mm = *(TYPE *)(m + i); \
+ TYPE dd = MIN(MAX(aa, nn), mm); \
+ *(TYPE *)(d + i) = dd; \
+ } \
+ clear_tail(d, opr_sz, simd_maxsz(desc)); \
+}
+
+DO_CLAMP(gvec_sclamp_b, int8_t)
+DO_CLAMP(gvec_sclamp_h, int16_t)
+DO_CLAMP(gvec_sclamp_s, int32_t)
+DO_CLAMP(gvec_sclamp_d, int64_t)
+
+DO_CLAMP(gvec_uclamp_b, uint8_t)
+DO_CLAMP(gvec_uclamp_h, uint16_t)
+DO_CLAMP(gvec_uclamp_s, uint32_t)
+DO_CLAMP(gvec_uclamp_d, uint64_t)
--- /dev/null
+/*
+ * ARM AdvSIMD / SVE Vector Helpers
+ *
+ * Copyright (c) 2020 Linaro
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef TARGET_ARM_VEC_INTERNAL_H
+#define TARGET_ARM_VEC_INTERNAL_H
+
+/*
+ * Note that vector data is stored in host-endian 64-bit chunks,
+ * so addressing units smaller than that needs a host-endian fixup.
+ *
+ * The H<N> macros are used when indexing an array of elements of size N.
+ *
+ * The H1_<N> macros are used when performing byte arithmetic and then
+ * casting the final pointer to a type of size N.
+ */
+#if HOST_BIG_ENDIAN
+#define H1(x) ((x) ^ 7)
+#define H1_2(x) ((x) ^ 6)
+#define H1_4(x) ((x) ^ 4)
+#define H2(x) ((x) ^ 3)
+#define H4(x) ((x) ^ 1)
+#else
+#define H1(x) (x)
+#define H1_2(x) (x)
+#define H1_4(x) (x)
+#define H2(x) (x)
+#define H4(x) (x)
+#endif
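+/*
+ * e.g. on a big-endian host the Arm element 1 of a 32-bit pair is the high
+ * half of its 64-bit chunk and therefore sits at array index 0, so
+ * H4(1) == 0; on little-endian hosts every H macro is the identity.
+ */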
+/*
+ * Access to 64-bit elements isn't host-endian dependent; we provide H8
+ * and H1_8 so that when a function is being generated from a macro we
+ * can pass these rather than an empty macro argument, for clarity.
+ */
+#define H8(x) (x)
+#define H1_8(x) (x)
+
+/*
+ * Expand active predicate bits to bytes, for byte elements.
+ */
+extern const uint64_t expand_pred_b_data[256];
+static inline uint64_t expand_pred_b(uint8_t byte)
+{
+ return expand_pred_b_data[byte];
+}
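+/* e.g. expand_pred_b(0x05) == 0x0000000000ff00ff (bits 0 and 2 set). */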
+
+/* Similarly for half-word elements. */
+extern const uint64_t expand_pred_h_data[0x55 + 1];
+static inline uint64_t expand_pred_h(uint8_t byte)
+{
+ return expand_pred_h_data[byte & 0x55];
+}
+
+static inline void clear_tail(void *vd, uintptr_t opr_sz, uintptr_t max_sz)
+{
+ uint64_t *d = vd + opr_sz;
+ uintptr_t i;
+
+ for (i = opr_sz; i < max_sz; i += 8) {
+ *d++ = 0;
+ }
+}
+
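+/*
+ * Signed saturating (and optionally rounding) shift left for 8/16/32-bit
+ * elements; negative shift counts shift right. On overflow the result
+ * saturates and *sat is set to 1; passing sat == NULL gives a plain,
+ * non-saturating shift.
+ */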
+static inline int32_t do_sqrshl_bhs(int32_t src, int32_t shift, int bits,
+ bool round, uint32_t *sat)
+{
+ if (shift <= -bits) {
+ /* Rounding the sign bit always produces 0. */
+ if (round) {
+ return 0;
+ }
+ return src >> 31;
+ } else if (shift < 0) {
+ if (round) {
+ src >>= -shift - 1;
+ return (src >> 1) + (src & 1);
+ }
+ return src >> -shift;
+ } else if (shift < bits) {
+ int32_t val = src << shift;
+ if (bits == 32) {
+ if (!sat || val >> shift == src) {
+ return val;
+ }
+ } else {
+ int32_t extval = sextract32(val, 0, bits);
+ if (!sat || val == extval) {
+ return extval;
+ }
+ }
+ } else if (!sat || src == 0) {
+ return 0;
+ }
+
+ *sat = 1;
+ return (1u << (bits - 1)) - (src >= 0);
+}
+
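+/*
+ * Unsigned counterpart of do_sqrshl_bhs: saturates to the all-ones value
+ * of the element width. The (bits + round) test keeps a right shift by
+ * exactly 'bits' alive when rounding, since it can still round up to 1.
+ */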
+static inline uint32_t do_uqrshl_bhs(uint32_t src, int32_t shift, int bits,
+ bool round, uint32_t *sat)
+{
+ if (shift <= -(bits + round)) {
+ return 0;
+ } else if (shift < 0) {
+ if (round) {
+ src >>= -shift - 1;
+ return (src >> 1) + (src & 1);
+ }
+ return src >> -shift;
+ } else if (shift < bits) {
+ uint32_t val = src << shift;
+ if (bits == 32) {
+ if (!sat || val >> shift == src) {
+ return val;
+ }
+ } else {
+ uint32_t extval = extract32(val, 0, bits);
+ if (!sat || val == extval) {
+ return extval;
+ }
+ }
+ } else if (!sat || src == 0) {
+ return 0;
+ }
+
+ *sat = 1;
+ return MAKE_64BIT_MASK(0, bits);
+}
+
+static inline int32_t do_suqrshl_bhs(int32_t src, int32_t shift, int bits,
+ bool round, uint32_t *sat)
+{
+ if (sat && src < 0) {
+ *sat = 1;
+ return 0;
+ }
+ return do_uqrshl_bhs(src, shift, bits, round, sat);
+}
+
+static inline int64_t do_sqrshl_d(int64_t src, int64_t shift,
+ bool round, uint32_t *sat)
+{
+ if (shift <= -64) {
+ /* Rounding the sign bit always produces 0. */
+ if (round) {
+ return 0;
+ }
+ return src >> 63;
+ } else if (shift < 0) {
+ if (round) {
+ src >>= -shift - 1;
+ return (src >> 1) + (src & 1);
+ }
+ return src >> -shift;
+ } else if (shift < 64) {
+ int64_t val = src << shift;
+ if (!sat || val >> shift == src) {
+ return val;
+ }
+ } else if (!sat || src == 0) {
+ return 0;
+ }
+
+ *sat = 1;
+ return src < 0 ? INT64_MIN : INT64_MAX;
+}
+
+static inline uint64_t do_uqrshl_d(uint64_t src, int64_t shift,
+ bool round, uint32_t *sat)
+{
+ if (shift <= -(64 + round)) {
+ return 0;
+ } else if (shift < 0) {
+ if (round) {
+ src >>= -shift - 1;
+ return (src >> 1) + (src & 1);
+ }
+ return src >> -shift;
+ } else if (shift < 64) {
+ uint64_t val = src << shift;
+ if (!sat || val >> shift == src) {
+ return val;
+ }
+ } else if (!sat || src == 0) {
+ return 0;
+ }
+
+ *sat = 1;
+ return UINT64_MAX;
+}
+
+static inline int64_t do_suqrshl_d(int64_t src, int64_t shift,
+ bool round, uint32_t *sat)
+{
+ if (sat && src < 0) {
+ *sat = 1;
+ return 0;
+ }
+ return do_uqrshl_d(src, shift, round, sat);
+}
+
+int8_t do_sqrdmlah_b(int8_t, int8_t, int8_t, bool, bool);
+int16_t do_sqrdmlah_h(int16_t, int16_t, int16_t, bool, bool, uint32_t *);
+int32_t do_sqrdmlah_s(int32_t, int32_t, int32_t, bool, bool, uint32_t *);
+int64_t do_sqrdmlah_d(int64_t, int64_t, int64_t, bool, bool);
+
+/*
+ * 8 x 8 -> 16 vector polynomial multiply where the inputs are
+ * in the low 8 bits of each 16-bit element
+ */
+uint64_t pmull_h(uint64_t op1, uint64_t op2);
+/*
+ * 16 x 16 -> 32 vector polynomial multiply where the inputs are
+ * in the low 16 bits of each 32-bit element
+ */
+uint64_t pmull_w(uint64_t op1, uint64_t op2);
+
+/**
+ * bfdotadd:
+ * @sum: addend
+ * @e1, @e2: multiplicand vectors
+ *
+ * BFloat16 2-way dot product of @e1 & @e2, accumulating with @sum.
+ * The @e1 and @e2 operands correspond to the 32-bit source vector
+ * slots and contain two Bfloat16 values each.
+ *
+ * Corresponds to the ARM pseudocode function BFDotAdd.
+ */
+float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2);
+
+#endif /* TARGET_ARM_VEC_INTERNAL_H */
+++ /dev/null
-/*
- * ARM TLB (Translation lookaside buffer) helpers.
- *
- * This code is licensed under the GNU GPL v2 or later.
- *
- * SPDX-License-Identifier: GPL-2.0-or-later
- */
-#include "qemu/osdep.h"
-#include "cpu.h"
-#include "internals.h"
-#include "exec/exec-all.h"
-#include "exec/helper-proto.h"
-
-
-/* Return true if the translation regime is using LPAE format page tables */
-bool regime_using_lpae_format(CPUARMState *env, ARMMMUIdx mmu_idx)
-{
- int el = regime_el(env, mmu_idx);
- if (el == 2 || arm_el_is_aa64(env, el)) {
- return true;
- }
- if (arm_feature(env, ARM_FEATURE_PMSA) &&
- arm_feature(env, ARM_FEATURE_V8)) {
- return true;
- }
- if (arm_feature(env, ARM_FEATURE_LPAE)
- && (regime_tcr(env, mmu_idx) & TTBCR_EAE)) {
- return true;
- }
- return false;
-}
-
-/*
- * Returns true if the stage 1 translation regime is using LPAE format page
- * tables. Used when raising alignment exceptions, whose FSR changes depending
- * on whether the long or short descriptor format is in use.
- */
-bool arm_s1_regime_using_lpae_format(CPUARMState *env, ARMMMUIdx mmu_idx)
-{
- mmu_idx = stage_1_mmu_idx(mmu_idx);
- return regime_using_lpae_format(env, mmu_idx);
-}
-
-static inline uint32_t merge_syn_data_abort(uint32_t template_syn,
- unsigned int target_el,
- bool same_el, bool ea,
- bool s1ptw, bool is_write,
- int fsc)
-{
- uint32_t syn;
-
- /*
- * ISV is only set for data aborts routed to EL2 and
- * never for stage-1 page table walks faulting on stage 2.
- *
- * Furthermore, ISV is only set for certain kinds of load/stores.
- * If the template syndrome does not have ISV set, we should leave
- * it cleared.
- *
- * See ARMv8 specs, D7-1974:
- * ISS encoding for an exception from a Data Abort, the
- * ISV field.
- */
- if (!(template_syn & ARM_EL_ISV) || target_el != 2 || s1ptw) {
- syn = syn_data_abort_no_iss(same_el, 0,
- ea, 0, s1ptw, is_write, fsc);
- } else {
- /*
- * Fields: IL, ISV, SAS, SSE, SRT, SF and AR come from the template
- * syndrome created at translation time.
- * Now we create the runtime syndrome with the remaining fields.
- */
- syn = syn_data_abort_with_iss(same_el,
- 0, 0, 0, 0, 0,
- ea, 0, s1ptw, is_write, fsc,
- true);
- /* Merge the runtime syndrome with the template syndrome. */
- syn |= template_syn;
- }
- return syn;
-}
-
-static uint32_t compute_fsr_fsc(CPUARMState *env, ARMMMUFaultInfo *fi,
- int target_el, int mmu_idx, uint32_t *ret_fsc)
-{
- ARMMMUIdx arm_mmu_idx = core_to_arm_mmu_idx(env, mmu_idx);
- uint32_t fsr, fsc;
-
- if (target_el == 2 || arm_el_is_aa64(env, target_el) ||
- arm_s1_regime_using_lpae_format(env, arm_mmu_idx)) {
- /*
- * LPAE format fault status register : bottom 6 bits are
- * status code in the same form as needed for syndrome
- */
- fsr = arm_fi_to_lfsc(fi);
- fsc = extract32(fsr, 0, 6);
- } else {
- fsr = arm_fi_to_sfsc(fi);
- /*
- * Short format FSR : this fault will never actually be reported
- * to an EL that uses a syndrome register. Use a (currently)
- * reserved FSR code in case the constructed syndrome does leak
- * into the guest somehow.
- */
- fsc = 0x3f;
- }
-
- *ret_fsc = fsc;
- return fsr;
-}
-
-static G_NORETURN
-void arm_deliver_fault(ARMCPU *cpu, vaddr addr,
- MMUAccessType access_type,
- int mmu_idx, ARMMMUFaultInfo *fi)
-{
- CPUARMState *env = &cpu->env;
- int target_el;
- bool same_el;
- uint32_t syn, exc, fsr, fsc;
-
- target_el = exception_target_el(env);
- if (fi->stage2) {
- target_el = 2;
- env->cp15.hpfar_el2 = extract64(fi->s2addr, 12, 47) << 4;
- if (arm_is_secure_below_el3(env) && fi->s1ns) {
- env->cp15.hpfar_el2 |= HPFAR_NS;
- }
- }
- same_el = (arm_current_el(env) == target_el);
-
- fsr = compute_fsr_fsc(env, fi, target_el, mmu_idx, &fsc);
-
- if (access_type == MMU_INST_FETCH) {
- syn = syn_insn_abort(same_el, fi->ea, fi->s1ptw, fsc);
- exc = EXCP_PREFETCH_ABORT;
- } else {
- syn = merge_syn_data_abort(env->exception.syndrome, target_el,
- same_el, fi->ea, fi->s1ptw,
- access_type == MMU_DATA_STORE,
- fsc);
- if (access_type == MMU_DATA_STORE
- && arm_feature(env, ARM_FEATURE_V6)) {
- fsr |= (1 << 11);
- }
- exc = EXCP_DATA_ABORT;
- }
-
- env->exception.vaddress = addr;
- env->exception.fsr = fsr;
- raise_exception(env, exc, syn, target_el);
-}
-
-/* Raise a data fault alignment exception for the specified virtual address */
-void arm_cpu_do_unaligned_access(CPUState *cs, vaddr vaddr,
- MMUAccessType access_type,
- int mmu_idx, uintptr_t retaddr)
-{
- ARMCPU *cpu = ARM_CPU(cs);
- ARMMMUFaultInfo fi = {};
-
- /* now we have a real cpu fault */
- cpu_restore_state(cs, retaddr);
-
- fi.type = ARMFault_Alignment;
- arm_deliver_fault(cpu, vaddr, access_type, mmu_idx, &fi);
-}
-
-void helper_exception_pc_alignment(CPUARMState *env, target_ulong pc)
-{
- ARMMMUFaultInfo fi = { .type = ARMFault_Alignment };
- int target_el = exception_target_el(env);
- int mmu_idx = cpu_mmu_index(env, true);
- uint32_t fsc;
-
- env->exception.vaddress = pc;
-
- /*
- * Note that the fsc is not applicable to this exception,
- * since any syndrome is pcalignment not insn_abort.
- */
- env->exception.fsr = compute_fsr_fsc(env, &fi, target_el, mmu_idx, &fsc);
- raise_exception(env, EXCP_PREFETCH_ABORT, syn_pcalignment(), target_el);
-}
-
-#if !defined(CONFIG_USER_ONLY)
-
-/*
- * arm_cpu_do_transaction_failed: handle a memory system error response
- * (eg "no device/memory present at address") by raising an external abort
- * exception
- */
-void arm_cpu_do_transaction_failed(CPUState *cs, hwaddr physaddr,
- vaddr addr, unsigned size,
- MMUAccessType access_type,
- int mmu_idx, MemTxAttrs attrs,
- MemTxResult response, uintptr_t retaddr)
-{
- ARMCPU *cpu = ARM_CPU(cs);
- ARMMMUFaultInfo fi = {};
-
- /* now we have a real cpu fault */
- cpu_restore_state(cs, retaddr);
-
- fi.ea = arm_extabort_type(response);
- fi.type = ARMFault_SyncExternal;
- arm_deliver_fault(cpu, addr, access_type, mmu_idx, &fi);
-}
-
-bool arm_cpu_tlb_fill(CPUState *cs, vaddr address, int size,
- MMUAccessType access_type, int mmu_idx,
- bool probe, uintptr_t retaddr)
-{
- ARMCPU *cpu = ARM_CPU(cs);
- GetPhysAddrResult res = {};
- ARMMMUFaultInfo local_fi, *fi;
- int ret;
-
- /*
- * Allow S1_ptw_translate to see any fault generated here.
- * Since this may recurse, read and clear.
- */
- fi = cpu->env.tlb_fi;
- if (fi) {
- cpu->env.tlb_fi = NULL;
- } else {
- fi = memset(&local_fi, 0, sizeof(local_fi));
- }
-
- /*
- * Walk the page table and (if the mapping exists) add the page
- * to the TLB. On success, return true. Otherwise, if probing,
- * return false. Otherwise populate fsr with ARM DFSR/IFSR fault
- * register format, and signal the fault.
- */
- ret = get_phys_addr(&cpu->env, address, access_type,
- core_to_arm_mmu_idx(&cpu->env, mmu_idx),
- &res, fi);
- if (likely(!ret)) {
- /*
- * Map a single [sub]page. Regions smaller than our declared
- * target page size are handled specially, so for those we
- * pass in the exact addresses.
- */
- if (res.f.lg_page_size >= TARGET_PAGE_BITS) {
- res.f.phys_addr &= TARGET_PAGE_MASK;
- address &= TARGET_PAGE_MASK;
- }
-
- res.f.pte_attrs = res.cacheattrs.attrs;
- res.f.shareability = res.cacheattrs.shareability;
-
- tlb_set_page_full(cs, mmu_idx, address, &res.f);
- return true;
- } else if (probe) {
- return false;
- } else {
- /* now we have a real cpu fault */
- cpu_restore_state(cs, retaddr);
- arm_deliver_fault(cpu, address, access_type, mmu_idx, fi);
- }
-}
-#else
-void arm_cpu_record_sigsegv(CPUState *cs, vaddr addr,
- MMUAccessType access_type,
- bool maperr, uintptr_t ra)
-{
- ARMMMUFaultInfo fi = {
- .type = maperr ? ARMFault_Translation : ARMFault_Permission,
- .level = 3,
- };
- ARMCPU *cpu = ARM_CPU(cs);
-
- /*
- * We report both ESR and FAR to signal handlers.
- * For now, it's easiest to deliver the fault normally.
- */
- cpu_restore_state(cs, ra);
- arm_deliver_fault(cpu, addr, access_type, MMU_USER_IDX, &fi);
-}
-
-void arm_cpu_record_sigbus(CPUState *cs, vaddr addr,
- MMUAccessType access_type, uintptr_t ra)
-{
- arm_cpu_do_unaligned_access(cs, addr, access_type, MMU_USER_IDX, ra);
-}
-#endif /* !defined(CONFIG_USER_ONLY) */
+++ /dev/null
-/*
- * ARM AdvSIMD / SVE Vector Operations
- *
- * Copyright (c) 2018 Linaro
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "qemu/osdep.h"
-#include "cpu.h"
-#include "exec/helper-proto.h"
-#include "tcg/tcg-gvec-desc.h"
-#include "fpu/softfloat.h"
-#include "qemu/int128.h"
-#include "vec_internal.h"
-
-/*
- * Data for expanding active predicate bits to bytes, for byte elements.
- *
- * for (i = 0; i < 256; ++i) {
- * unsigned long m = 0;
- * for (j = 0; j < 8; j++) {
- * if ((i >> j) & 1) {
- * m |= 0xfful << (j << 3);
- * }
- * }
- * printf("0x%016lx,\n", m);
- * }
- */
-const uint64_t expand_pred_b_data[256] = {
- 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
- 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
- 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
- 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
- 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
- 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
- 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
- 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
- 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
- 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
- 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
- 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
- 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
- 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
- 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
- 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
- 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
- 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
- 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
- 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
- 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
- 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
- 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
- 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
- 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
- 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
- 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
- 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
- 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
- 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
- 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
- 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
- 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
- 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
- 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
- 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
- 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
- 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
- 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
- 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
- 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
- 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
- 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
- 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
- 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
- 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
- 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
- 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
- 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
- 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
- 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
- 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
- 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
- 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
- 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
- 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
- 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
- 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
- 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
- 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
- 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
- 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
- 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
- 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
- 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
- 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
- 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
- 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
- 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
- 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
- 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
- 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
- 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
- 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
- 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
- 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
- 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
- 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
- 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
- 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
- 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
- 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
- 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
- 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
- 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
- 0xffffffffffffffff,
-};
-
-/*
- * Similarly for half-word elements.
- * for (i = 0; i < 256; ++i) {
- * unsigned long m = 0;
- * if (i & 0xaa) {
- * continue;
- * }
- * for (j = 0; j < 8; j += 2) {
- * if ((i >> j) & 1) {
- * m |= 0xfffful << (j << 3);
- * }
- * }
- * printf("[0x%x] = 0x%016lx,\n", i, m);
- * }
- */
-const uint64_t expand_pred_h_data[0x55 + 1] = {
- [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
- [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
- [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
- [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
- [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
- [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
- [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
- [0x55] = 0xffffffffffffffff,
-};
-
-/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
-int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
- bool neg, bool round)
-{
- /*
- * Simplify:
- * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
- * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
- */
- int32_t ret = (int32_t)src1 * src2;
- if (neg) {
- ret = -ret;
- }
- ret += ((int32_t)src3 << 7) + (round << 6);
- ret >>= 7;
-
- if (ret != (int8_t)ret) {
- ret = (ret < 0 ? INT8_MIN : INT8_MAX);
- }
- return ret;
-}
-
-void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
- void *va, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- int8_t *d = vd, *n = vn, *m = vm, *a = va;
-
- for (i = 0; i < opr_sz; ++i) {
- d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
- }
-}
-
-void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
- void *va, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- int8_t *d = vd, *n = vn, *m = vm, *a = va;
-
- for (i = 0; i < opr_sz; ++i) {
- d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
- }
-}
-
-void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- int8_t *d = vd, *n = vn, *m = vm;
-
- for (i = 0; i < opr_sz; ++i) {
- d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
- }
-}
-
-void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- int8_t *d = vd, *n = vn, *m = vm;
-
- for (i = 0; i < opr_sz; ++i) {
- d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
- }
-}
-
-/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
-int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
- bool neg, bool round, uint32_t *sat)
-{
- /* Simplify similarly to do_sqrdmlah_b above. */
- int32_t ret = (int32_t)src1 * src2;
- if (neg) {
- ret = -ret;
- }
- ret += ((int32_t)src3 << 15) + (round << 14);
- ret >>= 15;
-
- if (ret != (int16_t)ret) {
- *sat = 1;
- ret = (ret < 0 ? INT16_MIN : INT16_MAX);
- }
- return ret;
-}
-
-uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
- uint32_t src2, uint32_t src3)
-{
- uint32_t *sat = &env->vfp.qc[0];
- uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
- uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
- false, true, sat);
- return deposit32(e1, 16, 16, e2);
-}
-
-void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
- void *vq, uint32_t desc)
-{
- uintptr_t opr_sz = simd_oprsz(desc);
- int16_t *d = vd;
- int16_t *n = vn;
- int16_t *m = vm;
- uintptr_t i;
-
- for (i = 0; i < opr_sz / 2; ++i) {
- d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
- }
- clear_tail(d, opr_sz, simd_maxsz(desc));
-}
-
-uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
- uint32_t src2, uint32_t src3)
-{
- uint32_t *sat = &env->vfp.qc[0];
- uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
- uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
- true, true, sat);
- return deposit32(e1, 16, 16, e2);
-}
-
-void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
- void *vq, uint32_t desc)
-{
- uintptr_t opr_sz = simd_oprsz(desc);
- int16_t *d = vd;
- int16_t *n = vn;
- int16_t *m = vm;
- uintptr_t i;
-
- for (i = 0; i < opr_sz / 2; ++i) {
- d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
- }
- clear_tail(d, opr_sz, simd_maxsz(desc));
-}
-
-void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
- void *vq, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- int16_t *d = vd, *n = vn, *m = vm;
-
- for (i = 0; i < opr_sz / 2; ++i) {
- d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
- }
- clear_tail(d, opr_sz, simd_maxsz(desc));
-}
-
-void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
- void *vq, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- int16_t *d = vd, *n = vn, *m = vm;
-
- for (i = 0; i < opr_sz / 2; ++i) {
- d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
- }
- clear_tail(d, opr_sz, simd_maxsz(desc));
-}
-
-void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
- void *va, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- int16_t *d = vd, *n = vn, *m = vm, *a = va;
- uint32_t discard;
-
- for (i = 0; i < opr_sz / 2; ++i) {
- d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
- }
-}
-
-void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
- void *va, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- int16_t *d = vd, *n = vn, *m = vm, *a = va;
- uint32_t discard;
-
- for (i = 0; i < opr_sz / 2; ++i) {
- d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
- }
-}
-
-void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- int16_t *d = vd, *n = vn, *m = vm;
- uint32_t discard;
-
- for (i = 0; i < opr_sz / 2; ++i) {
- d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
- }
-}
-
-void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- int16_t *d = vd, *n = vn, *m = vm;
- uint32_t discard;
-
- for (i = 0; i < opr_sz / 2; ++i) {
- d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
- }
-}
-
-void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, j, opr_sz = simd_oprsz(desc);
- int idx = simd_data(desc);
- int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
- uint32_t discard;
-
- for (i = 0; i < opr_sz / 2; i += 16 / 2) {
- int16_t mm = m[i];
- for (j = 0; j < 16 / 2; ++j) {
- d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
- }
- }
-}
-
-void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, j, opr_sz = simd_oprsz(desc);
- int idx = simd_data(desc);
- int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
- uint32_t discard;
-
- for (i = 0; i < opr_sz / 2; i += 16 / 2) {
- int16_t mm = m[i];
- for (j = 0; j < 16 / 2; ++j) {
- d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
- }
- }
-}
-
-/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
-int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
- bool neg, bool round, uint32_t *sat)
-{
- /* Simplify similarly to do_sqrdmlah_b above. */
- int64_t ret = (int64_t)src1 * src2;
- if (neg) {
- ret = -ret;
- }
- ret += ((int64_t)src3 << 31) + (round << 30);
- ret >>= 31;
-
- if (ret != (int32_t)ret) {
- *sat = 1;
- ret = (ret < 0 ? INT32_MIN : INT32_MAX);
- }
- return ret;
-}
-
-uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
- int32_t src2, int32_t src3)
-{
- uint32_t *sat = &env->vfp.qc[0];
- return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
-}
-
-void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
- void *vq, uint32_t desc)
-{
- uintptr_t opr_sz = simd_oprsz(desc);
- int32_t *d = vd;
- int32_t *n = vn;
- int32_t *m = vm;
- uintptr_t i;
-
- for (i = 0; i < opr_sz / 4; ++i) {
- d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
- }
- clear_tail(d, opr_sz, simd_maxsz(desc));
-}
-
-uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
- int32_t src2, int32_t src3)
-{
- uint32_t *sat = &env->vfp.qc[0];
- return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
-}
-
-void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
- void *vq, uint32_t desc)
-{
- uintptr_t opr_sz = simd_oprsz(desc);
- int32_t *d = vd;
- int32_t *n = vn;
- int32_t *m = vm;
- uintptr_t i;
-
- for (i = 0; i < opr_sz / 4; ++i) {
- d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
- }
- clear_tail(d, opr_sz, simd_maxsz(desc));
-}
-
-void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
- void *vq, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- int32_t *d = vd, *n = vn, *m = vm;
-
- for (i = 0; i < opr_sz / 4; ++i) {
- d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
- }
- clear_tail(d, opr_sz, simd_maxsz(desc));
-}
-
-void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
- void *vq, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- int32_t *d = vd, *n = vn, *m = vm;
-
- for (i = 0; i < opr_sz / 4; ++i) {
- d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
- }
- clear_tail(d, opr_sz, simd_maxsz(desc));
-}
-
-void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
- void *va, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- int32_t *d = vd, *n = vn, *m = vm, *a = va;
- uint32_t discard;
-
- for (i = 0; i < opr_sz / 4; ++i) {
- d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
- }
-}
-
-void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
- void *va, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- int32_t *d = vd, *n = vn, *m = vm, *a = va;
- uint32_t discard;
-
- for (i = 0; i < opr_sz / 4; ++i) {
- d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
- }
-}
-
-void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- int32_t *d = vd, *n = vn, *m = vm;
- uint32_t discard;
-
- for (i = 0; i < opr_sz / 4; ++i) {
- d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
- }
-}
-
-void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- int32_t *d = vd, *n = vn, *m = vm;
- uint32_t discard;
-
- for (i = 0; i < opr_sz / 4; ++i) {
- d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
- }
-}
-
-void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, j, opr_sz = simd_oprsz(desc);
- int idx = simd_data(desc);
- int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
- uint32_t discard;
-
- for (i = 0; i < opr_sz / 4; i += 16 / 4) {
- int32_t mm = m[i];
- for (j = 0; j < 16 / 4; ++j) {
- d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
- }
- }
-}
-
-void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, j, opr_sz = simd_oprsz(desc);
- int idx = simd_data(desc);
- int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
- uint32_t discard;
-
- for (i = 0; i < opr_sz / 4; i += 16 / 4) {
- int32_t mm = m[i];
- for (j = 0; j < 16 / 4; ++j) {
- d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
- }
- }
-}
-
-/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
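-/*
- * Saturate a signed 128-bit value to 64 bits: it is in range iff the
- * high half is the sign extension of the low half.
- */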
-static int64_t do_sat128_d(Int128 r)
-{
- int64_t ls = int128_getlo(r);
- int64_t hs = int128_gethi(r);
-
- if (unlikely(hs != (ls >> 63))) {
- return hs < 0 ? INT64_MIN : INT64_MAX;
- }
- return ls;
-}
-
-int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
-{
- uint64_t l, h;
- Int128 r, t;
-
- /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
- muls64(&l, &h, m, n);
- r = int128_make128(l, h);
- if (neg) {
- r = int128_neg(r);
- }
- if (a) {
- t = int128_exts64(a);
- t = int128_lshift(t, 63);
- r = int128_add(r, t);
- }
- if (round) {
- t = int128_exts64(1ll << 62);
- r = int128_add(r, t);
- }
- r = int128_rshift(r, 63);
-
- return do_sat128_d(r);
-}
-
-void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
- void *va, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- int64_t *d = vd, *n = vn, *m = vm, *a = va;
-
- for (i = 0; i < opr_sz / 8; ++i) {
- d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
- }
-}
-
-void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
- void *va, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- int64_t *d = vd, *n = vn, *m = vm, *a = va;
-
- for (i = 0; i < opr_sz / 8; ++i) {
- d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
- }
-}
-
-void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- int64_t *d = vd, *n = vn, *m = vm;
-
- for (i = 0; i < opr_sz / 8; ++i) {
- d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
- }
-}
-
-void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- int64_t *d = vd, *n = vn, *m = vm;
-
- for (i = 0; i < opr_sz / 8; ++i) {
- d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
- }
-}
-
-void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, j, opr_sz = simd_oprsz(desc);
- int idx = simd_data(desc);
- int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
-
- for (i = 0; i < opr_sz / 8; i += 16 / 8) {
- int64_t mm = m[i];
- for (j = 0; j < 16 / 8; ++j) {
- d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
- }
- }
-}
-
-void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, j, opr_sz = simd_oprsz(desc);
- int idx = simd_data(desc);
- int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
-
- for (i = 0; i < opr_sz / 8; i += 16 / 8) {
- int64_t mm = m[i];
- for (j = 0; j < 16 / 8; ++j) {
- d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
- }
- }
-}
-
-/* Integer 8 and 16-bit dot-product.
- *
- * Note that for the loops herein, host endianness does not matter
- * with respect to the ordering of data within the quad-width lanes.
- * All elements are treated equally, no matter where they are.
- */
-
-#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc); \
- TYPED *d = vd, *a = va; \
- TYPEN *n = vn; \
- TYPEM *m = vm; \
- for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \
- d[i] = (a[i] + \
- (TYPED)n[i * 4 + 0] * m[i * 4 + 0] + \
- (TYPED)n[i * 4 + 1] * m[i * 4 + 1] + \
- (TYPED)n[i * 4 + 2] * m[i * 4 + 2] + \
- (TYPED)n[i * 4 + 3] * m[i * 4 + 3]); \
- } \
- clear_tail(d, opr_sz, simd_maxsz(desc)); \
-}
-
-DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
-DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
-DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
-DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
-DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
-
-#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
-{ \
- intptr_t i = 0, opr_sz = simd_oprsz(desc); \
- intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \
- intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \
- intptr_t index = simd_data(desc); \
- TYPED *d = vd, *a = va; \
- TYPEN *n = vn; \
- TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \
- do { \
- TYPED m0 = m_indexed[i * 4 + 0]; \
- TYPED m1 = m_indexed[i * 4 + 1]; \
- TYPED m2 = m_indexed[i * 4 + 2]; \
- TYPED m3 = m_indexed[i * 4 + 3]; \
- do { \
- d[i] = (a[i] + \
- n[i * 4 + 0] * m0 + \
- n[i * 4 + 1] * m1 + \
- n[i * 4 + 2] * m2 + \
- n[i * 4 + 3] * m3); \
- } while (++i < segend); \
- segend = i + 4; \
- } while (i < opr_sz_n); \
- clear_tail(d, opr_sz, simd_maxsz(desc)); \
-}
-
-DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
-DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
-DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
-DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
-DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
-DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
-
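-/*
- * Floating-point complex add. The single data bit selects which element
- * of each complex pair from Vm is negated before the add, distinguishing
- * the two FCADD rotations (90 and 270 degrees).
- */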
-void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
- void *vfpst, uint32_t desc)
-{
- uintptr_t opr_sz = simd_oprsz(desc);
- float16 *d = vd;
- float16 *n = vn;
- float16 *m = vm;
- float_status *fpst = vfpst;
- uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
- uint32_t neg_imag = neg_real ^ 1;
- uintptr_t i;
-
- /* Shift boolean to the sign bit so we can xor to negate. */
- neg_real <<= 15;
- neg_imag <<= 15;
-
- for (i = 0; i < opr_sz / 2; i += 2) {
- float16 e0 = n[H2(i)];
- float16 e1 = m[H2(i + 1)] ^ neg_imag;
- float16 e2 = n[H2(i + 1)];
- float16 e3 = m[H2(i)] ^ neg_real;
-
- d[H2(i)] = float16_add(e0, e1, fpst);
- d[H2(i + 1)] = float16_add(e2, e3, fpst);
- }
- clear_tail(d, opr_sz, simd_maxsz(desc));
-}
-
-void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
- void *vfpst, uint32_t desc)
-{
- uintptr_t opr_sz = simd_oprsz(desc);
- float32 *d = vd;
- float32 *n = vn;
- float32 *m = vm;
- float_status *fpst = vfpst;
- uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
- uint32_t neg_imag = neg_real ^ 1;
- uintptr_t i;
-
- /* Shift boolean to the sign bit so we can xor to negate. */
- neg_real <<= 31;
- neg_imag <<= 31;
-
- for (i = 0; i < opr_sz / 4; i += 2) {
- float32 e0 = n[H4(i)];
- float32 e1 = m[H4(i + 1)] ^ neg_imag;
- float32 e2 = n[H4(i + 1)];
- float32 e3 = m[H4(i)] ^ neg_real;
-
- d[H4(i)] = float32_add(e0, e1, fpst);
- d[H4(i + 1)] = float32_add(e2, e3, fpst);
- }
- clear_tail(d, opr_sz, simd_maxsz(desc));
-}
-
-void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
- void *vfpst, uint32_t desc)
-{
- uintptr_t opr_sz = simd_oprsz(desc);
- float64 *d = vd;
- float64 *n = vn;
- float64 *m = vm;
- float_status *fpst = vfpst;
- uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
- uint64_t neg_imag = neg_real ^ 1;
- uintptr_t i;
-
- /* Shift boolean to the sign bit so we can xor to negate. */
- neg_real <<= 63;
- neg_imag <<= 63;
-
- for (i = 0; i < opr_sz / 8; i += 2) {
- float64 e0 = n[i];
- float64 e1 = m[i + 1] ^ neg_imag;
- float64 e2 = n[i + 1];
- float64 e3 = m[i] ^ neg_real;
-
- d[i] = float64_add(e0, e1, fpst);
- d[i + 1] = float64_add(e2, e3, fpst);
- }
- clear_tail(d, opr_sz, simd_maxsz(desc));
-}
-
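-/*
- * Floating-point complex multiply-add. The two data bits encode the
- * FCMLA rotation: 'flip' selects whether the real or imaginary source
- * elements are used, and the neg bits select which product is negated.
- */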
-void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
- void *vfpst, uint32_t desc)
-{
- uintptr_t opr_sz = simd_oprsz(desc);
- float16 *d = vd, *n = vn, *m = vm, *a = va;
- float_status *fpst = vfpst;
- intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
- uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
- uint32_t neg_real = flip ^ neg_imag;
- uintptr_t i;
-
- /* Shift boolean to the sign bit so we can xor to negate. */
- neg_real <<= 15;
- neg_imag <<= 15;
-
- for (i = 0; i < opr_sz / 2; i += 2) {
- float16 e2 = n[H2(i + flip)];
- float16 e1 = m[H2(i + flip)] ^ neg_real;
- float16 e4 = e2;
- float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;
-
- d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
- d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
- }
- clear_tail(d, opr_sz, simd_maxsz(desc));
-}
-
-void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
- void *vfpst, uint32_t desc)
-{
- uintptr_t opr_sz = simd_oprsz(desc);
- float16 *d = vd, *n = vn, *m = vm, *a = va;
- float_status *fpst = vfpst;
- intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
- uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
- intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
- uint32_t neg_real = flip ^ neg_imag;
- intptr_t elements = opr_sz / sizeof(float16);
- intptr_t eltspersegment = 16 / sizeof(float16);
- intptr_t i, j;
-
- /* Shift boolean to the sign bit so we can xor to negate. */
- neg_real <<= 15;
- neg_imag <<= 15;
-
- for (i = 0; i < elements; i += eltspersegment) {
- float16 mr = m[H2(i + 2 * index + 0)];
- float16 mi = m[H2(i + 2 * index + 1)];
- float16 e1 = neg_real ^ (flip ? mi : mr);
- float16 e3 = neg_imag ^ (flip ? mr : mi);
-
- for (j = i; j < i + eltspersegment; j += 2) {
- float16 e2 = n[H2(j + flip)];
- float16 e4 = e2;
-
- d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
- d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
- }
- }
- clear_tail(d, opr_sz, simd_maxsz(desc));
-}
-
-void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
- void *vfpst, uint32_t desc)
-{
- uintptr_t opr_sz = simd_oprsz(desc);
- float32 *d = vd, *n = vn, *m = vm, *a = va;
- float_status *fpst = vfpst;
- intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
- uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
- uint32_t neg_real = flip ^ neg_imag;
- uintptr_t i;
-
- /* Shift boolean to the sign bit so we can xor to negate. */
- neg_real <<= 31;
- neg_imag <<= 31;
-
- for (i = 0; i < opr_sz / 4; i += 2) {
- float32 e2 = n[H4(i + flip)];
- float32 e1 = m[H4(i + flip)] ^ neg_real;
- float32 e4 = e2;
- float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;
-
- d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
- d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
- }
- clear_tail(d, opr_sz, simd_maxsz(desc));
-}
-
-void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
- void *vfpst, uint32_t desc)
-{
- uintptr_t opr_sz = simd_oprsz(desc);
- float32 *d = vd, *n = vn, *m = vm, *a = va;
- float_status *fpst = vfpst;
- intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
- uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
- intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
- uint32_t neg_real = flip ^ neg_imag;
- intptr_t elements = opr_sz / sizeof(float32);
- intptr_t eltspersegment = 16 / sizeof(float32);
- intptr_t i, j;
-
- /* Shift boolean to the sign bit so we can xor to negate. */
- neg_real <<= 31;
- neg_imag <<= 31;
-
- for (i = 0; i < elements; i += eltspersegment) {
- float32 mr = m[H4(i + 2 * index + 0)];
- float32 mi = m[H4(i + 2 * index + 1)];
- float32 e1 = neg_real ^ (flip ? mi : mr);
- float32 e3 = neg_imag ^ (flip ? mr : mi);
-
- for (j = i; j < i + eltspersegment; j += 2) {
- float32 e2 = n[H4(j + flip)];
- float32 e4 = e2;
-
- d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
- d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
- }
- }
- clear_tail(d, opr_sz, simd_maxsz(desc));
-}
-
-void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
- void *vfpst, uint32_t desc)
-{
- uintptr_t opr_sz = simd_oprsz(desc);
- float64 *d = vd, *n = vn, *m = vm, *a = va;
- float_status *fpst = vfpst;
- intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
- uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
- uint64_t neg_real = flip ^ neg_imag;
- uintptr_t i;
-
- /* Shift boolean to the sign bit so we can xor to negate. */
- neg_real <<= 63;
- neg_imag <<= 63;
-
- for (i = 0; i < opr_sz / 8; i += 2) {
- float64 e2 = n[i + flip];
- float64 e1 = m[i + flip] ^ neg_real;
- float64 e4 = e2;
- float64 e3 = m[i + 1 - flip] ^ neg_imag;
-
- d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
- d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
- }
- clear_tail(d, opr_sz, simd_maxsz(desc));
-}
-
-/*
- * Floating point comparisons producing an integer result (all 1s or all 0s).
- * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
- * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
- */
-static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
-{
- return -float16_eq_quiet(op1, op2, stat);
-}
-
-static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
-{
- return -float32_eq_quiet(op1, op2, stat);
-}
-
-static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
-{
- return -float16_le(op2, op1, stat);
-}
-
-static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
-{
- return -float32_le(op2, op1, stat);
-}
-
-static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
-{
- return -float16_lt(op2, op1, stat);
-}
-
-static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
-{
- return -float32_lt(op2, op1, stat);
-}
-
-static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
-{
- return -float16_le(float16_abs(op2), float16_abs(op1), stat);
-}
-
-static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
-{
- return -float32_le(float32_abs(op2), float32_abs(op1), stat);
-}
-
-static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
-{
- return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
-}
-
-static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
-{
- return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
-}
-
-static int16_t vfp_tosszh(float16 x, void *fpstp)
-{
- float_status *fpst = fpstp;
- if (float16_is_any_nan(x)) {
- float_raise(float_flag_invalid, fpst);
- return 0;
- }
- return float16_to_int16_round_to_zero(x, fpst);
-}
-
-static uint16_t vfp_touszh(float16 x, void *fpstp)
-{
- float_status *fpst = fpstp;
- if (float16_is_any_nan(x)) {
- float_raise(float_flag_invalid, fpst);
- return 0;
- }
- return float16_to_uint16_round_to_zero(x, fpst);
-}
-
-#define DO_2OP(NAME, FUNC, TYPE) \
-void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
-{ \
- intptr_t i, oprsz = simd_oprsz(desc); \
- TYPE *d = vd, *n = vn; \
- for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
- d[i] = FUNC(n[i], stat); \
- } \
- clear_tail(d, oprsz, simd_maxsz(desc)); \
-}
-
-DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
-DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
-DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)
-
-DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
-DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
-DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
-
-DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
-DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)
-
-DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
-DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
-DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
-DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
-DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
-DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
-DO_2OP(gvec_tosszh, vfp_tosszh, float16)
-DO_2OP(gvec_touszh, vfp_touszh, float16)
-
-#define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \
- static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
- { \
- return TYPE##_##CMPOP(op, TYPE##_zero, stat); \
- }
-
-#define WRAP_CMP0_REV(FN, CMPOP, TYPE) \
- static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
- { \
- return TYPE##_##CMPOP(TYPE##_zero, op, stat); \
- }
-
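-/*
- * Compare-against-zero expansion: FWD compares (op, 0) and REV compares
- * (0, op), so clt/cle below reuse the cgt/cge primitives with the
- * operands swapped.
- */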
-#define DO_2OP_CMP0(FN, CMPOP, DIRN) \
- WRAP_CMP0_##DIRN(FN, CMPOP, float16) \
- WRAP_CMP0_##DIRN(FN, CMPOP, float32) \
- DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \
- DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)
-
-DO_2OP_CMP0(cgt, cgt, FWD)
-DO_2OP_CMP0(cge, cge, FWD)
-DO_2OP_CMP0(ceq, ceq, FWD)
-DO_2OP_CMP0(clt, cgt, REV)
-DO_2OP_CMP0(cle, cge, REV)
-
-#undef DO_2OP
-#undef DO_2OP_CMP0
-
-/* Floating-point trigonometric starting value.
- * See the ARM ARM pseudocode function FPTrigSMul.
- */
-static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
-{
- float16 result = float16_mul(op1, op1, stat);
- if (!float16_is_any_nan(result)) {
- result = float16_set_sign(result, op2 & 1);
- }
- return result;
-}
-
-static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
-{
- float32 result = float32_mul(op1, op1, stat);
- if (!float32_is_any_nan(result)) {
- result = float32_set_sign(result, op2 & 1);
- }
- return result;
-}
-
-static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
-{
- float64 result = float64_mul(op1, op1, stat);
- if (!float64_is_any_nan(result)) {
- result = float64_set_sign(result, op2 & 1);
- }
- return result;
-}
-
-static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
-{
- return float16_abs(float16_sub(op1, op2, stat));
-}
-
-static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
-{
- return float32_abs(float32_sub(op1, op2, stat));
-}
-
-/*
- * Reciprocal step. These are the AArch32 versions, which use a
- * non-fused multiply-and-subtract.
- */
-static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
-{
- op1 = float16_squash_input_denormal(op1, stat);
- op2 = float16_squash_input_denormal(op2, stat);
-
- if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
- (float16_is_infinity(op2) && float16_is_zero(op1))) {
- return float16_two;
- }
- return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
-}
-
-static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
-{
- op1 = float32_squash_input_denormal(op1, stat);
- op2 = float32_squash_input_denormal(op2, stat);
-
- if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
- (float32_is_infinity(op2) && float32_is_zero(op1))) {
- return float32_two;
- }
- return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
-}
-
-/* Reciprocal square-root step. AArch32 non-fused semantics. */
-static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
-{
- op1 = float16_squash_input_denormal(op1, stat);
- op2 = float16_squash_input_denormal(op2, stat);
-
- if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
- (float16_is_infinity(op2) && float16_is_zero(op1))) {
- return float16_one_point_five;
- }
- op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
- return float16_div(op1, float16_two, stat);
-}
-
-static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
-{
- op1 = float32_squash_input_denormal(op1, stat);
- op2 = float32_squash_input_denormal(op2, stat);
-
- if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
- (float32_is_infinity(op2) && float32_is_zero(op1))) {
- return float32_one_point_five;
- }
- op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
- return float32_div(op1, float32_two, stat);
-}
-
-#define DO_3OP(NAME, FUNC, TYPE) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
-{ \
- intptr_t i, oprsz = simd_oprsz(desc); \
- TYPE *d = vd, *n = vn, *m = vm; \
- for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
- d[i] = FUNC(n[i], m[i], stat); \
- } \
- clear_tail(d, oprsz, simd_maxsz(desc)); \
-}
-
-DO_3OP(gvec_fadd_h, float16_add, float16)
-DO_3OP(gvec_fadd_s, float32_add, float32)
-DO_3OP(gvec_fadd_d, float64_add, float64)
-
-DO_3OP(gvec_fsub_h, float16_sub, float16)
-DO_3OP(gvec_fsub_s, float32_sub, float32)
-DO_3OP(gvec_fsub_d, float64_sub, float64)
-
-DO_3OP(gvec_fmul_h, float16_mul, float16)
-DO_3OP(gvec_fmul_s, float32_mul, float32)
-DO_3OP(gvec_fmul_d, float64_mul, float64)
-
-DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
-DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
-DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)
-
-DO_3OP(gvec_fabd_h, float16_abd, float16)
-DO_3OP(gvec_fabd_s, float32_abd, float32)
-
-DO_3OP(gvec_fceq_h, float16_ceq, float16)
-DO_3OP(gvec_fceq_s, float32_ceq, float32)
-
-DO_3OP(gvec_fcge_h, float16_cge, float16)
-DO_3OP(gvec_fcge_s, float32_cge, float32)
-
-DO_3OP(gvec_fcgt_h, float16_cgt, float16)
-DO_3OP(gvec_fcgt_s, float32_cgt, float32)
-
-DO_3OP(gvec_facge_h, float16_acge, float16)
-DO_3OP(gvec_facge_s, float32_acge, float32)
-
-DO_3OP(gvec_facgt_h, float16_acgt, float16)
-DO_3OP(gvec_facgt_s, float32_acgt, float32)
-
-DO_3OP(gvec_fmax_h, float16_max, float16)
-DO_3OP(gvec_fmax_s, float32_max, float32)
-
-DO_3OP(gvec_fmin_h, float16_min, float16)
-DO_3OP(gvec_fmin_s, float32_min, float32)
-
-DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
-DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
-
-DO_3OP(gvec_fminnum_h, float16_minnum, float16)
-DO_3OP(gvec_fminnum_s, float32_minnum, float32)
-
-DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
-DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)
-
-DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
-DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)
-
-#ifdef TARGET_AARCH64
-
-DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
-DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
-DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)
-
-DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
-DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
-DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
-
-#endif
-#undef DO_3OP
-
-/* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
-static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
- float_status *stat)
-{
- return float16_add(dest, float16_mul(op1, op2, stat), stat);
-}
-
-static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
- float_status *stat)
-{
- return float32_add(dest, float32_mul(op1, op2, stat), stat);
-}
-
-static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
- float_status *stat)
-{
- return float16_sub(dest, float16_mul(op1, op2, stat), stat);
-}
-
-static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
- float_status *stat)
-{
- return float32_sub(dest, float32_mul(op1, op2, stat), stat);
-}
-
-/* Fused versions; these have the semantics Neon VFMA/VFMS want */
-static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
- float_status *stat)
-{
- return float16_muladd(op1, op2, dest, 0, stat);
-}
-
-static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
- float_status *stat)
-{
- return float32_muladd(op1, op2, dest, 0, stat);
-}
-
-static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
- float_status *stat)
-{
- return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
-}
-
-static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
- float_status *stat)
-{
- return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
-}
-
-#define DO_MULADD(NAME, FUNC, TYPE) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
-{ \
- intptr_t i, oprsz = simd_oprsz(desc); \
- TYPE *d = vd, *n = vn, *m = vm; \
- for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
- d[i] = FUNC(d[i], n[i], m[i], stat); \
- } \
- clear_tail(d, oprsz, simd_maxsz(desc)); \
-}
-
-DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
-DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
-
-DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
-DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
-
-DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
-DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
-
-DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
-DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
-
-/* For the indexed ops, SVE applies the index per 128-bit vector segment.
- * For AdvSIMD, there is of course only one such vector segment.
- */
-
-#define DO_MUL_IDX(NAME, TYPE, H) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
-{ \
- intptr_t i, j, oprsz = simd_oprsz(desc); \
- intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
- intptr_t idx = simd_data(desc); \
- TYPE *d = vd, *n = vn, *m = vm; \
- for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
- TYPE mm = m[H(i + idx)]; \
- for (j = 0; j < segment; j++) { \
- d[i + j] = n[i + j] * mm; \
- } \
- } \
- clear_tail(d, oprsz, simd_maxsz(desc)); \
-}
-
-DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
-DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
-DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
-
-#undef DO_MUL_IDX
-
-#define DO_MLA_IDX(NAME, TYPE, OP, H) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
-{ \
- intptr_t i, j, oprsz = simd_oprsz(desc); \
- intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
- intptr_t idx = simd_data(desc); \
- TYPE *d = vd, *n = vn, *m = vm, *a = va; \
- for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
- TYPE mm = m[H(i + idx)]; \
- for (j = 0; j < segment; j++) { \
- d[i + j] = a[i + j] OP n[i + j] * mm; \
- } \
- } \
- clear_tail(d, oprsz, simd_maxsz(desc)); \
-}
-
-DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
-DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
-DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
-
-DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
-DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
-DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
-
-#undef DO_MLA_IDX
-
-#define DO_FMUL_IDX(NAME, ADD, TYPE, H) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
-{ \
- intptr_t i, j, oprsz = simd_oprsz(desc); \
- intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
- intptr_t idx = simd_data(desc); \
- TYPE *d = vd, *n = vn, *m = vm; \
- for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
- TYPE mm = m[H(i + idx)]; \
- for (j = 0; j < segment; j++) { \
- d[i + j] = TYPE##_##ADD(d[i + j], \
- TYPE##_mul(n[i + j], mm, stat), stat); \
- } \
- } \
- clear_tail(d, oprsz, simd_maxsz(desc)); \
-}
-
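-/*
- * With the *_nop "ADD" callbacks, DO_FMUL_IDX reduces to a plain
- * indexed multiply, as used for the gvec_fmul_idx_* helpers below.
- */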
-#define float16_nop(N, M, S) (M)
-#define float32_nop(N, M, S) (M)
-#define float64_nop(N, M, S) (M)
-
-DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16, H2)
-DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32, H4)
-DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64, H8)
-
-/*
- * Non-fused multiply-accumulate operations, for Neon. NB that unlike
- * the fused ops below, these accumulate both from and into Vd.
- */
-DO_FMUL_IDX(gvec_fmla_nf_idx_h, add, float16, H2)
-DO_FMUL_IDX(gvec_fmla_nf_idx_s, add, float32, H4)
-DO_FMUL_IDX(gvec_fmls_nf_idx_h, sub, float16, H2)
-DO_FMUL_IDX(gvec_fmls_nf_idx_s, sub, float32, H4)
-
-#undef float16_nop
-#undef float32_nop
-#undef float64_nop
-#undef DO_FMUL_IDX
-
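-/*
- * Fused indexed multiply-add. Bit 0 of the descriptor data selects the
- * FMLS form by flipping the sign bit of op1 before the fused multiply.
- */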
-#define DO_FMLA_IDX(NAME, TYPE, H) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \
- void *stat, uint32_t desc) \
-{ \
- intptr_t i, j, oprsz = simd_oprsz(desc); \
- intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
- TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \
- intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \
- TYPE *d = vd, *n = vn, *m = vm, *a = va; \
- op1_neg <<= (8 * sizeof(TYPE) - 1); \
- for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
- TYPE mm = m[H(i + idx)]; \
- for (j = 0; j < segment; j++) { \
- d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg, \
- mm, a[i + j], 0, stat); \
- } \
- } \
- clear_tail(d, oprsz, simd_maxsz(desc)); \
-}
-
-DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
-DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
-DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)
-
-#undef DO_FMLA_IDX
-
-#define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
-void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \
-{ \
- intptr_t i, oprsz = simd_oprsz(desc); \
- TYPEN *d = vd, *n = vn; TYPEM *m = vm; \
- bool q = false; \
- for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \
- WTYPE dd = (WTYPE)n[i] OP m[i]; \
- if (dd < MIN) { \
- dd = MIN; \
- q = true; \
- } else if (dd > MAX) { \
- dd = MAX; \
- q = true; \
- } \
- d[i] = dd; \
- } \
- if (q) { \
- uint32_t *qc = vq; \
- qc[0] = 1; \
- } \
- clear_tail(d, oprsz, simd_maxsz(desc)); \
-}
-
-DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
-DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
-DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
-
-DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
-DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
-DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
-
-DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
-DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
-DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
-
-DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
-DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
-DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
-
-#undef DO_SAT
-
-void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
- void *vm, uint32_t desc)
-{
- intptr_t i, oprsz = simd_oprsz(desc);
- uint64_t *d = vd, *n = vn, *m = vm;
- bool q = false;
-
- for (i = 0; i < oprsz / 8; i++) {
- uint64_t nn = n[i], mm = m[i], dd = nn + mm;
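- /* Unsigned overflow iff the truncated sum wrapped below nn. */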
- if (dd < nn) {
- dd = UINT64_MAX;
- q = true;
- }
- d[i] = dd;
- }
- if (q) {
- uint32_t *qc = vq;
- qc[0] = 1;
- }
- clear_tail(d, oprsz, simd_maxsz(desc));
-}
-
-void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
- void *vm, uint32_t desc)
-{
- intptr_t i, oprsz = simd_oprsz(desc);
- uint64_t *d = vd, *n = vn, *m = vm;
- bool q = false;
-
- for (i = 0; i < oprsz / 8; i++) {
- uint64_t nn = n[i], mm = m[i], dd = nn - mm;
- if (nn < mm) {
- dd = 0;
- q = true;
- }
- d[i] = dd;
- }
- if (q) {
- uint32_t *qc = vq;
- qc[0] = 1;
- }
- clear_tail(d, oprsz, simd_maxsz(desc));
-}
-
-void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
- void *vm, uint32_t desc)
-{
- intptr_t i, oprsz = simd_oprsz(desc);
- int64_t *d = vd, *n = vn, *m = vm;
- bool q = false;
-
- for (i = 0; i < oprsz / 8; i++) {
- int64_t nn = n[i], mm = m[i], dd = nn + mm;
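- /*
- * Signed overflow iff the operands have the same sign but the
- * sum's sign differs; saturate towards the sign of nn.
- */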
- if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
- dd = (nn >> 63) ^ ~INT64_MIN;
- q = true;
- }
- d[i] = dd;
- }
- if (q) {
- uint32_t *qc = vq;
- qc[0] = 1;
- }
- clear_tail(d, oprsz, simd_maxsz(desc));
-}
-
-void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
- void *vm, uint32_t desc)
-{
- intptr_t i, oprsz = simd_oprsz(desc);
- int64_t *d = vd, *n = vn, *m = vm;
- bool q = false;
-
- for (i = 0; i < oprsz / 8; i++) {
- int64_t nn = n[i], mm = m[i], dd = nn - mm;
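- /*
- * Signed overflow iff the operands have different signs and the
- * difference's sign differs from nn; saturate towards the sign of nn.
- */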
- if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
- dd = (nn >> 63) ^ ~INT64_MIN;
- q = true;
- }
- d[i] = dd;
- }
- if (q) {
- uint32_t *qc = vq;
- qc[0] = 1;
- }
- clear_tail(d, oprsz, simd_maxsz(desc));
-}
-
-#define DO_SRA(NAME, TYPE) \
-void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
-{ \
- intptr_t i, oprsz = simd_oprsz(desc); \
- int shift = simd_data(desc); \
- TYPE *d = vd, *n = vn; \
- for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
- d[i] += n[i] >> shift; \
- } \
- clear_tail(d, oprsz, simd_maxsz(desc)); \
-}
-
-DO_SRA(gvec_ssra_b, int8_t)
-DO_SRA(gvec_ssra_h, int16_t)
-DO_SRA(gvec_ssra_s, int32_t)
-DO_SRA(gvec_ssra_d, int64_t)
-
-DO_SRA(gvec_usra_b, uint8_t)
-DO_SRA(gvec_usra_h, uint16_t)
-DO_SRA(gvec_usra_s, uint32_t)
-DO_SRA(gvec_usra_d, uint64_t)
-
-#undef DO_SRA
-
-#define DO_RSHR(NAME, TYPE) \
-void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
-{ \
- intptr_t i, oprsz = simd_oprsz(desc); \
- int shift = simd_data(desc); \
- TYPE *d = vd, *n = vn; \
- for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
- TYPE tmp = n[i] >> (shift - 1); \
- d[i] = (tmp >> 1) + (tmp & 1); \
- } \
- clear_tail(d, oprsz, simd_maxsz(desc)); \
-}
-
-DO_RSHR(gvec_srshr_b, int8_t)
-DO_RSHR(gvec_srshr_h, int16_t)
-DO_RSHR(gvec_srshr_s, int32_t)
-DO_RSHR(gvec_srshr_d, int64_t)
-
-DO_RSHR(gvec_urshr_b, uint8_t)
-DO_RSHR(gvec_urshr_h, uint16_t)
-DO_RSHR(gvec_urshr_s, uint32_t)
-DO_RSHR(gvec_urshr_d, uint64_t)
-
-#undef DO_RSHR
-
-#define DO_RSRA(NAME, TYPE) \
-void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
-{ \
- intptr_t i, oprsz = simd_oprsz(desc); \
- int shift = simd_data(desc); \
- TYPE *d = vd, *n = vn; \
- for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
- TYPE tmp = n[i] >> (shift - 1); \
- d[i] += (tmp >> 1) + (tmp & 1); \
- } \
- clear_tail(d, oprsz, simd_maxsz(desc)); \
-}
-
-DO_RSRA(gvec_srsra_b, int8_t)
-DO_RSRA(gvec_srsra_h, int16_t)
-DO_RSRA(gvec_srsra_s, int32_t)
-DO_RSRA(gvec_srsra_d, int64_t)
-
-DO_RSRA(gvec_ursra_b, uint8_t)
-DO_RSRA(gvec_ursra_h, uint16_t)
-DO_RSRA(gvec_ursra_s, uint32_t)
-DO_RSRA(gvec_ursra_d, uint64_t)
-
-#undef DO_RSRA
-
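-/*
- * Shift right and insert: the low (esize - shift) bits of the result
- * come from n >> shift, while the top shift bits of d are preserved.
- */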
-#define DO_SRI(NAME, TYPE) \
-void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
-{ \
- intptr_t i, oprsz = simd_oprsz(desc); \
- int shift = simd_data(desc); \
- TYPE *d = vd, *n = vn; \
- for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
- d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
- } \
- clear_tail(d, oprsz, simd_maxsz(desc)); \
-}
-
-DO_SRI(gvec_sri_b, uint8_t)
-DO_SRI(gvec_sri_h, uint16_t)
-DO_SRI(gvec_sri_s, uint32_t)
-DO_SRI(gvec_sri_d, uint64_t)
-
-#undef DO_SRI
-
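-/*
- * Shift left and insert: the top (esize - shift) bits of the result
- * come from n << shift, while the low shift bits of d are preserved.
- */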
-#define DO_SLI(NAME, TYPE) \
-void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
-{ \
- intptr_t i, oprsz = simd_oprsz(desc); \
- int shift = simd_data(desc); \
- TYPE *d = vd, *n = vn; \
- for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
- d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
- } \
- clear_tail(d, oprsz, simd_maxsz(desc)); \
-}
-
-DO_SLI(gvec_sli_b, uint8_t)
-DO_SLI(gvec_sli_h, uint16_t)
-DO_SLI(gvec_sli_s, uint32_t)
-DO_SLI(gvec_sli_d, uint64_t)
-
-#undef DO_SLI
-
-/*
- * Convert float16 to float32, raising no exceptions and
- * preserving exceptional values, including SNaN.
- * This is effectively an unpack+repack operation.
- */
-static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
-{
- const int f16_bias = 15;
- const int f32_bias = 127;
- uint32_t sign = extract32(f16, 15, 1);
- uint32_t exp = extract32(f16, 10, 5);
- uint32_t frac = extract32(f16, 0, 10);
-
- if (exp == 0x1f) {
- /* Inf or NaN */
- exp = 0xff;
- } else if (exp == 0) {
- /* Zero or denormal. */
- if (frac != 0) {
- if (fz16) {
- frac = 0;
- } else {
- /*
- * Denormal; these are all normal float32.
- * Shift the fraction so that the msb is at bit 11,
- * then remove bit 11 as the implicit bit of the
- * normalized float32. Note that we still go through
- * the shift for normal numbers below, to put the
- * float32 fraction at the right place.
- */
- int shift = clz32(frac) - 21;
- frac = (frac << shift) & 0x3ff;
- exp = f32_bias - f16_bias - shift + 1;
- }
- }
- } else {
- /* Normal number; adjust the bias. */
- exp += f32_bias - f16_bias;
- }
- sign <<= 31;
- exp <<= 23;
- frac <<= 23 - 10;
-
- return sign | exp | frac;
-}
-
-static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
-{
- /*
- * Branchless load of u32[0], u64[0], u32[1], or u64[1].
- * Load the 2nd qword iff is_q & is_2.
- * Shift to the 2nd dword iff !is_q & is_2.
- * For !is_q & !is_2, the upper bits of the result are garbage.
- */
- return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
-}
-
-/*
- * Note that FMLAL requires oprsz == 8 or oprsz == 16,
- * as there are not yet SVE versions that might use blocking.
- */
-
-static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
- uint32_t desc, bool fz16)
-{
- intptr_t i, oprsz = simd_oprsz(desc);
- int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
- int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
- int is_q = oprsz == 16;
- uint64_t n_4, m_4;
-
- /* Pre-load all of the f16 data, avoiding overlap issues. */
- n_4 = load4_f16(vn, is_q, is_2);
- m_4 = load4_f16(vm, is_q, is_2);
-
- /* Negate all inputs for FMLSL at once. */
- if (is_s) {
- n_4 ^= 0x8000800080008000ull;
- }
-
- for (i = 0; i < oprsz / 4; i++) {
- float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
- float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
- d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
- }
- clear_tail(d, oprsz, simd_maxsz(desc));
-}
-
-void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
- void *venv, uint32_t desc)
-{
- CPUARMState *env = venv;
- do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
- get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
-}
-
-void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
- void *venv, uint32_t desc)
-{
- CPUARMState *env = venv;
- do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
- get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
-}
-
-void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
- void *venv, uint32_t desc)
-{
- intptr_t i, oprsz = simd_oprsz(desc);
- uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
- intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
- CPUARMState *env = venv;
- float_status *status = &env->vfp.fp_status;
- bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
-
- for (i = 0; i < oprsz; i += sizeof(float32)) {
- float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
- float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
- float32 nn = float16_to_float32_by_bits(nn_16, fz16);
- float32 mm = float16_to_float32_by_bits(mm_16, fz16);
- float32 aa = *(float32 *)(va + H1_4(i));
-
- *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
- }
-}
-
-static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
- uint32_t desc, bool fz16)
-{
- intptr_t i, oprsz = simd_oprsz(desc);
- int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
- int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
- int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
- int is_q = oprsz == 16;
- uint64_t n_4;
- float32 m_1;
-
- /* Pre-load all of the f16 data, avoiding overlap issues. */
- n_4 = load4_f16(vn, is_q, is_2);
-
- /* Negate all inputs for FMLSL at once. */
- if (is_s) {
- n_4 ^= 0x8000800080008000ull;
- }
-
- m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
-
- for (i = 0; i < oprsz / 4; i++) {
- float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
- d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
- }
- clear_tail(d, oprsz, simd_maxsz(desc));
-}
-
-void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
- void *venv, uint32_t desc)
-{
- CPUARMState *env = venv;
- do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
- get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
-}
-
-void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
- void *venv, uint32_t desc)
-{
- CPUARMState *env = venv;
- do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
- get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
-}
-
-void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
- void *venv, uint32_t desc)
-{
- intptr_t i, j, oprsz = simd_oprsz(desc);
- uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
- intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
- intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
- CPUARMState *env = venv;
- float_status *status = &env->vfp.fp_status;
- bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
-
- for (i = 0; i < oprsz; i += 16) {
- float16 mm_16 = *(float16 *)(vm + i + idx);
- float32 mm = float16_to_float32_by_bits(mm_16, fz16);
-
- for (j = 0; j < 16; j += sizeof(float32)) {
- float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
- float32 nn = float16_to_float32_by_bits(nn_16, fz16);
- float32 aa = *(float32 *)(va + H1_4(i + j));
-
- *(float32 *)(vd + H1_4(i + j)) =
- float32_muladd(nn, mm, aa, 0, status);
- }
- }
-}
-
-void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- int8_t *d = vd, *n = vn, *m = vm;
-
- for (i = 0; i < opr_sz; ++i) {
- int8_t mm = m[i];
- int8_t nn = n[i];
- int8_t res = 0;
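- /*
- * Shift counts of 8 or more yield 0; negative counts are arithmetic
- * right shifts, with counts of -8 or less clamped to a shift by 7.
- */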
- if (mm >= 0) {
- if (mm < 8) {
- res = nn << mm;
- }
- } else {
- res = nn >> (mm > -8 ? -mm : 7);
- }
- d[i] = res;
- }
- clear_tail(d, opr_sz, simd_maxsz(desc));
-}
-
-void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- int16_t *d = vd, *n = vn, *m = vm;
-
- for (i = 0; i < opr_sz / 2; ++i) {
- int8_t mm = m[i]; /* only 8 bits of shift are significant */
- int16_t nn = n[i];
- int16_t res = 0;
- if (mm >= 0) {
- if (mm < 16) {
- res = nn << mm;
- }
- } else {
- res = nn >> (mm > -16 ? -mm : 15);
- }
- d[i] = res;
- }
- clear_tail(d, opr_sz, simd_maxsz(desc));
-}
-
-void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- uint8_t *d = vd, *n = vn, *m = vm;
-
- for (i = 0; i < opr_sz; ++i) {
- int8_t mm = m[i];
- uint8_t nn = n[i];
- uint8_t res = 0;
- if (mm >= 0) {
- if (mm < 8) {
- res = nn << mm;
- }
- } else {
- if (mm > -8) {
- res = nn >> -mm;
- }
- }
- d[i] = res;
- }
- clear_tail(d, opr_sz, simd_maxsz(desc));
-}
-
-void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- uint16_t *d = vd, *n = vn, *m = vm;
-
- for (i = 0; i < opr_sz / 2; ++i) {
- int8_t mm = m[i]; /* only 8 bits of shift are significant */
- uint16_t nn = n[i];
- uint16_t res = 0;
- if (mm >= 0) {
- if (mm < 16) {
- res = nn << mm;
- }
- } else {
- if (mm > -16) {
- res = nn >> -mm;
- }
- }
- d[i] = res;
- }
- clear_tail(d, opr_sz, simd_maxsz(desc));
-}
-
-/*
- * 8x8->8 polynomial multiply.
- *
- * Polynomial multiplication is like integer multiplication except the
- * partial products are XORed, not added.
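- * For example, the carry-less product 0b0011 * 0b0011 is 0b0101
- * (i.e. (x + 1)^2 = x^2 + 1 over GF(2)), not the integer 0b1001.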
- *
- * TODO: expose this as a generic vector operation, as it is a common
- * crypto building block.
- */
-void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, j, opr_sz = simd_oprsz(desc);
- uint64_t *d = vd, *n = vn, *m = vm;
-
- for (i = 0; i < opr_sz / 8; ++i) {
- uint64_t nn = n[i];
- uint64_t mm = m[i];
- uint64_t rr = 0;
-
- for (j = 0; j < 8; ++j) {
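- /* Broadcast bit 0 of each byte of nn to a full byte mask. */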
- uint64_t mask = (nn & 0x0101010101010101ull) * 0xff;
- rr ^= mm & mask;
- mm = (mm << 1) & 0xfefefefefefefefeull;
- nn >>= 1;
- }
- d[i] = rr;
- }
- clear_tail(d, opr_sz, simd_maxsz(desc));
-}
-
-/*
- * 64x64->128 polynomial multiply.
- * Because the lanes are not accessed in strict columns,
- * this probably cannot be turned into a generic helper.
- */
-void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, j, opr_sz = simd_oprsz(desc);
- intptr_t hi = simd_data(desc);
- uint64_t *d = vd, *n = vn, *m = vm;
-
- for (i = 0; i < opr_sz / 8; i += 2) {
- uint64_t nn = n[i + hi];
- uint64_t mm = m[i + hi];
- uint64_t rhi = 0;
- uint64_t rlo = 0;
-
- /* Bit 0 can only influence the low 64-bit result. */
- if (nn & 1) {
- rlo = mm;
- }
-
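- /*
- * For each remaining set bit j of nn, XOR in mm << j; the bits of
- * mm shifted out of the low half land in the high half.
- */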
- for (j = 1; j < 64; ++j) {
- uint64_t mask = -((nn >> j) & 1);
- rlo ^= (mm << j) & mask;
- rhi ^= (mm >> (64 - j)) & mask;
- }
- d[i] = rlo;
- d[i + 1] = rhi;
- }
- clear_tail(d, opr_sz, simd_maxsz(desc));
-}
-
-/*
- * 8x8->16 polynomial multiply.
- *
- * The byte inputs are expanded to (or extracted from) half-words.
- * Note that neon and sve2 get the inputs from different positions.
- * This allows 4 bytes to be processed in parallel with uint64_t.
- */
-
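-/* Spread the low 4 bytes of x into the low byte of each 16-bit lane. */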
-static uint64_t expand_byte_to_half(uint64_t x)
-{
- return (x & 0x000000ff)
- | ((x & 0x0000ff00) << 8)
- | ((x & 0x00ff0000) << 16)
- | ((x & 0xff000000) << 24);
-}
-
-uint64_t pmull_w(uint64_t op1, uint64_t op2)
-{
- uint64_t result = 0;
- int i;
- for (i = 0; i < 16; ++i) {
- uint64_t mask = (op1 & 0x0000000100000001ull) * 0xffffffff;
- result ^= op2 & mask;
- op1 >>= 1;
- op2 <<= 1;
- }
- return result;
-}
-
-uint64_t pmull_h(uint64_t op1, uint64_t op2)
-{
- uint64_t result = 0;
- int i;
- for (i = 0; i < 8; ++i) {
- uint64_t mask = (op1 & 0x0001000100010001ull) * 0xffff;
- result ^= op2 & mask;
- op1 >>= 1;
- op2 <<= 1;
- }
- return result;
-}
-
-void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- int hi = simd_data(desc);
- uint64_t *d = vd, *n = vn, *m = vm;
- uint64_t nn = n[hi], mm = m[hi];
-
- d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
- nn >>= 32;
- mm >>= 32;
- d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
-
- clear_tail(d, 16, simd_maxsz(desc));
-}
-
-#ifdef TARGET_AARCH64
-void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- int shift = simd_data(desc) * 8;
- intptr_t i, opr_sz = simd_oprsz(desc);
- uint64_t *d = vd, *n = vn, *m = vm;
-
- for (i = 0; i < opr_sz / 8; ++i) {
- uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull;
- uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull;
-
- d[i] = pmull_h(nn, mm);
- }
-}
-
-static uint64_t pmull_d(uint64_t op1, uint64_t op2)
-{
- uint64_t result = 0;
- int i;
-
- for (i = 0; i < 32; ++i) {
- uint64_t mask = -((op1 >> i) & 1);
- result ^= (op2 << i) & mask;
- }
- return result;
-}
-
-void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t sel = H4(simd_data(desc));
- intptr_t i, opr_sz = simd_oprsz(desc);
- uint32_t *n = vn, *m = vm;
- uint64_t *d = vd;
-
- for (i = 0; i < opr_sz / 8; ++i) {
- d[i] = pmull_d(n[2 * i + sel], m[2 * i + sel]);
- }
-}
-#endif
-
-#define DO_CMP0(NAME, TYPE, OP) \
-void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc); \
- for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
- TYPE nn = *(TYPE *)(vn + i); \
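- /* The comparison yields 0 or 1; negation yields 0 or all-ones. */ \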
- *(TYPE *)(vd + i) = -(nn OP 0); \
- } \
- clear_tail(vd, opr_sz, simd_maxsz(desc)); \
-}
-
-DO_CMP0(gvec_ceq0_b, int8_t, ==)
-DO_CMP0(gvec_clt0_b, int8_t, <)
-DO_CMP0(gvec_cle0_b, int8_t, <=)
-DO_CMP0(gvec_cgt0_b, int8_t, >)
-DO_CMP0(gvec_cge0_b, int8_t, >=)
-
-DO_CMP0(gvec_ceq0_h, int16_t, ==)
-DO_CMP0(gvec_clt0_h, int16_t, <)
-DO_CMP0(gvec_cle0_h, int16_t, <=)
-DO_CMP0(gvec_cgt0_h, int16_t, >)
-DO_CMP0(gvec_cge0_h, int16_t, >=)
-
-#undef DO_CMP0
-
-#define DO_ABD(NAME, TYPE) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc); \
- TYPE *d = vd, *n = vn, *m = vm; \
- \
- for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \
- d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \
- } \
- clear_tail(d, opr_sz, simd_maxsz(desc)); \
-}
-
-DO_ABD(gvec_sabd_b, int8_t)
-DO_ABD(gvec_sabd_h, int16_t)
-DO_ABD(gvec_sabd_s, int32_t)
-DO_ABD(gvec_sabd_d, int64_t)
-
-DO_ABD(gvec_uabd_b, uint8_t)
-DO_ABD(gvec_uabd_h, uint16_t)
-DO_ABD(gvec_uabd_s, uint32_t)
-DO_ABD(gvec_uabd_d, uint64_t)
-
-#undef DO_ABD
-
-#define DO_ABA(NAME, TYPE) \
-void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc); \
- TYPE *d = vd, *n = vn, *m = vm; \
- \
- for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \
- d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \
- } \
- clear_tail(d, opr_sz, simd_maxsz(desc)); \
-}
-
-DO_ABA(gvec_saba_b, int8_t)
-DO_ABA(gvec_saba_h, int16_t)
-DO_ABA(gvec_saba_s, int32_t)
-DO_ABA(gvec_saba_d, int64_t)
-
-DO_ABA(gvec_uaba_b, uint8_t)
-DO_ABA(gvec_uaba_h, uint16_t)
-DO_ABA(gvec_uaba_s, uint32_t)
-DO_ABA(gvec_uaba_d, uint64_t)
-
-#undef DO_ABA
-
-#define DO_NEON_PAIRWISE(NAME, OP) \
- void HELPER(NAME##s)(void *vd, void *vn, void *vm, \
- void *stat, uint32_t oprsz) \
- { \
- float_status *fpst = stat; \
- float32 *d = vd; \
- float32 *n = vn; \
- float32 *m = vm; \
- float32 r0, r1; \
- \
- /* Read all inputs before writing outputs in case vm == vd */ \
- r0 = float32_##OP(n[H4(0)], n[H4(1)], fpst); \
- r1 = float32_##OP(m[H4(0)], m[H4(1)], fpst); \
- \
- d[H4(0)] = r0; \
- d[H4(1)] = r1; \
- } \
- \
- void HELPER(NAME##h)(void *vd, void *vn, void *vm, \
- void *stat, uint32_t oprsz) \
- { \
- float_status *fpst = stat; \
- float16 *d = vd; \
- float16 *n = vn; \
- float16 *m = vm; \
- float16 r0, r1, r2, r3; \
- \
- /* Read all inputs before writing outputs in case vm == vd */ \
- r0 = float16_##OP(n[H2(0)], n[H2(1)], fpst); \
- r1 = float16_##OP(n[H2(2)], n[H2(3)], fpst); \
- r2 = float16_##OP(m[H2(0)], m[H2(1)], fpst); \
- r3 = float16_##OP(m[H2(2)], m[H2(3)], fpst); \
- \
- d[H2(0)] = r0; \
- d[H2(1)] = r1; \
- d[H2(2)] = r2; \
- d[H2(3)] = r3; \
- }
-
-DO_NEON_PAIRWISE(neon_padd, add)
-DO_NEON_PAIRWISE(neon_pmax, max)
-DO_NEON_PAIRWISE(neon_pmin, min)
-
-#undef DO_NEON_PAIRWISE
-
-#define DO_VCVT_FIXED(NAME, FUNC, TYPE) \
- void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
- { \
- intptr_t i, oprsz = simd_oprsz(desc); \
- int shift = simd_data(desc); \
- TYPE *d = vd, *n = vn; \
- float_status *fpst = stat; \
- for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
- d[i] = FUNC(n[i], shift, fpst); \
- } \
- clear_tail(d, oprsz, simd_maxsz(desc)); \
- }
-
-DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
-DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
-DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t)
-DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t)
-DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
-DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
-DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t)
-DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t)
-
-#undef DO_VCVT_FIXED
-
-#define DO_VCVT_RMODE(NAME, FUNC, TYPE) \
- void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
- { \
- float_status *fpst = stat; \
- intptr_t i, oprsz = simd_oprsz(desc); \
- uint32_t rmode = simd_data(desc); \
- uint32_t prev_rmode = get_float_rounding_mode(fpst); \
- TYPE *d = vd, *n = vn; \
- set_float_rounding_mode(rmode, fpst); \
- for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
- d[i] = FUNC(n[i], 0, fpst); \
- } \
- set_float_rounding_mode(prev_rmode, fpst); \
- clear_tail(d, oprsz, simd_maxsz(desc)); \
- }
-
-DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
-DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
-DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
-DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
-
-#undef DO_VCVT_RMODE
-
-#define DO_VRINT_RMODE(NAME, FUNC, TYPE) \
- void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
- { \
- float_status *fpst = stat; \
- intptr_t i, oprsz = simd_oprsz(desc); \
- uint32_t rmode = simd_data(desc); \
- uint32_t prev_rmode = get_float_rounding_mode(fpst); \
- TYPE *d = vd, *n = vn; \
- set_float_rounding_mode(rmode, fpst); \
- for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
- d[i] = FUNC(n[i], fpst); \
- } \
- set_float_rounding_mode(prev_rmode, fpst); \
- clear_tail(d, oprsz, simd_maxsz(desc)); \
- }
-
-DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
-DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
-
-#undef DO_VRINT_RMODE
-
-#ifdef TARGET_AARCH64
-void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
-{
- const uint8_t *indices = vm;
- CPUARMState *env = venv;
- size_t oprsz = simd_oprsz(desc);
- uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
- bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
- uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
- union {
- uint8_t b[16];
- uint64_t d[2];
- } result;
-
- /*
- * We must construct the final result in a temp, lest the output
- * overlaps the input table. For TBL, begin with zero; for TBX,
- * begin with the original register contents. Note that we always
- * copy 16 bytes here to avoid an extra branch; clearing the high
- * bits of the register for oprsz == 8 is handled below.
- */
- if (is_tbx) {
- memcpy(&result, vd, 16);
- } else {
- memset(&result, 0, 16);
- }
-
- for (size_t i = 0; i < oprsz; ++i) {
- uint32_t index = indices[H1(i)];
-
- if (index < table_len) {
- /*
- * Convert index (a byte offset into the virtual table
- * which is a series of 128-bit vectors concatenated)
- * into the correct register element, bearing in mind
- * that the table can wrap around from V31 to V0.
- */
- const uint8_t *table = (const uint8_t *)
- aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
- result.b[H1(i)] = table[H1(index % 16)];
- }
- }
-
- memcpy(vd, &result, 16);
- clear_tail(vd, oprsz, simd_maxsz(desc));
-}
-#endif
-
-/*
- * NxN -> N highpart multiply
- *
- * TODO: expose this as a generic vector operation.
- */
-
-void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- int8_t *d = vd, *n = vn, *m = vm;
-
- for (i = 0; i < opr_sz; ++i) {
- d[i] = ((int32_t)n[i] * m[i]) >> 8;
- }
- clear_tail(d, opr_sz, simd_maxsz(desc));
-}
-
-void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- int16_t *d = vd, *n = vn, *m = vm;
-
- for (i = 0; i < opr_sz / 2; ++i) {
- d[i] = ((int32_t)n[i] * m[i]) >> 16;
- }
- clear_tail(d, opr_sz, simd_maxsz(desc));
-}
-
-void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- int32_t *d = vd, *n = vn, *m = vm;
-
- for (i = 0; i < opr_sz / 4; ++i) {
- d[i] = ((int64_t)n[i] * m[i]) >> 32;
- }
- clear_tail(d, opr_sz, simd_maxsz(desc));
-}
-
-void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- uint64_t *d = vd, *n = vn, *m = vm;
- uint64_t discard;
-
- for (i = 0; i < opr_sz / 8; ++i) {
- muls64(&discard, &d[i], n[i], m[i]);
- }
- clear_tail(d, opr_sz, simd_maxsz(desc));
-}
-
-void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- uint8_t *d = vd, *n = vn, *m = vm;
-
- for (i = 0; i < opr_sz; ++i) {
- d[i] = ((uint32_t)n[i] * m[i]) >> 8;
- }
- clear_tail(d, opr_sz, simd_maxsz(desc));
-}
-
-void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- uint16_t *d = vd, *n = vn, *m = vm;
-
- for (i = 0; i < opr_sz / 2; ++i) {
- d[i] = ((uint32_t)n[i] * m[i]) >> 16;
- }
- clear_tail(d, opr_sz, simd_maxsz(desc));
-}
-
-void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- uint32_t *d = vd, *n = vn, *m = vm;
-
- for (i = 0; i < opr_sz / 4; ++i) {
- d[i] = ((uint64_t)n[i] * m[i]) >> 32;
- }
- clear_tail(d, opr_sz, simd_maxsz(desc));
-}
-
-void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- uint64_t *d = vd, *n = vn, *m = vm;
- uint64_t discard;
-
- for (i = 0; i < opr_sz / 8; ++i) {
- mulu64(&discard, &d[i], n[i], m[i]);
- }
- clear_tail(d, opr_sz, simd_maxsz(desc));
-}
-
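-/* XAR: exclusive-OR the two operands, then rotate right by the immediate. */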
-void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- int shr = simd_data(desc);
- uint64_t *d = vd, *n = vn, *m = vm;
-
- for (i = 0; i < opr_sz; ++i) {
- d[i] = ror64(n[i] ^ m[i], shr);
- }
- clear_tail(d, opr_sz * 8, simd_maxsz(desc));
-}
-
-/*
- * Integer matrix-multiply accumulate
- */
-
-static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
-{
- int8_t *n = vn, *m = vm;
-
- for (intptr_t k = 0; k < 8; ++k) {
- sum += n[H1(k)] * m[H1(k)];
- }
- return sum;
-}
-
-static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
-{
- uint8_t *n = vn, *m = vm;
-
- for (intptr_t k = 0; k < 8; ++k) {
- sum += n[H1(k)] * m[H1(k)];
- }
- return sum;
-}
-
-static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
-{
- uint8_t *n = vn;
- int8_t *m = vm;
-
- for (intptr_t k = 0; k < 8; ++k) {
- sum += n[H1(k)] * m[H1(k)];
- }
- return sum;
-}
-
-static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
- uint32_t (*inner_loop)(uint32_t, void *, void *))
-{
- intptr_t seg, opr_sz = simd_oprsz(desc);
-
- for (seg = 0; seg < opr_sz; seg += 16) {
- uint32_t *d = vd + seg;
- uint32_t *a = va + seg;
- uint32_t sum0, sum1, sum2, sum3;
-
- /*
- * Process the entire segment at once, writing back the
- * results only after we've consumed all of the inputs.
- *
- * Key to indices: sum<2i+j> accumulates into a[2*i + j] the dot
- * product of row i of vn (byte offset 8*i) with row j of vm
- * (byte offset 8*j).
- */
- sum0 = a[H4(0 + 0)];
- sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
- sum1 = a[H4(0 + 1)];
- sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
- sum2 = a[H4(2 + 0)];
- sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
- sum3 = a[H4(2 + 1)];
- sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);
-
- d[H4(0)] = sum0;
- d[H4(1)] = sum1;
- d[H4(2)] = sum2;
- d[H4(3)] = sum3;
- }
- clear_tail(vd, opr_sz, simd_maxsz(desc));
-}
-
-#define DO_MMLA_B(NAME, INNER) \
- void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
- { do_mmla_b(vd, vn, vm, va, desc, INNER); }
-
-DO_MMLA_B(gvec_smmla_b, do_smmla_b)
-DO_MMLA_B(gvec_ummla_b, do_ummla_b)
-DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
-
-/*
- * BFloat16 Dot Product
- */
-
-float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2)
-{
- /* FPCR is ignored for BFDOT and BFMMLA. */
- float_status bf_status = {
- .tininess_before_rounding = float_tininess_before_rounding,
- .float_rounding_mode = float_round_to_odd_inf,
- .flush_to_zero = true,
- .flush_inputs_to_zero = true,
- .default_nan_mode = true,
- };
- float32 t1, t2;
-
- /*
- * Extract each BFloat16 from the element pair, and shift
- * them such that they become float32.
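- * A BFloat16 value occupies the high 16 bits of the equivalent float32,
- * so the low element is shifted up and the high element is masked in place.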
- */
- t1 = float32_mul(e1 << 16, e2 << 16, &bf_status);
- t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status);
- t1 = float32_add(t1, t2, &bf_status);
- t1 = float32_add(sum, t1, &bf_status);
-
- return t1;
-}
-
-void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- float32 *d = vd, *a = va;
- uint32_t *n = vn, *m = vm;
-
- for (i = 0; i < opr_sz / 4; ++i) {
- d[i] = bfdotadd(a[i], n[i], m[i]);
- }
- clear_tail(d, opr_sz, simd_maxsz(desc));
-}
-
-void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
- void *va, uint32_t desc)
-{
- intptr_t i, j, opr_sz = simd_oprsz(desc);
- intptr_t index = simd_data(desc);
- intptr_t elements = opr_sz / 4;
- intptr_t eltspersegment = MIN(16 / 4, elements);
- float32 *d = vd, *a = va;
- uint32_t *n = vn, *m = vm;
-
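- /* The indexed multiplicand is taken from within each 128-bit segment. */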
- for (i = 0; i < elements; i += eltspersegment) {
- uint32_t m_idx = m[i + H4(index)];
-
- for (j = i; j < i + eltspersegment; j++) {
- d[j] = bfdotadd(a[j], n[j], m_idx);
- }
- }
- clear_tail(d, opr_sz, simd_maxsz(desc));
-}
-
-void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
-{
- intptr_t s, opr_sz = simd_oprsz(desc);
- float32 *d = vd, *a = va;
- uint32_t *n = vn, *m = vm;
-
- for (s = 0; s < opr_sz / 4; s += 4) {
- float32 sum00, sum01, sum10, sum11;
-
- /*
- * Process the entire segment at once, writing back the
- * results only after we've consumed all of the inputs.
- *
- * Key to indices: sum<i><j> accumulates into a[s + 2*i + j] the
- * bfloat16 dot products of n[s + 2*i + k] and m[s + 2*j + k]
- * for k = 0, 1.
- */
- sum00 = a[s + H4(0 + 0)];
- sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]);
- sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]);
-
- sum01 = a[s + H4(0 + 1)];
- sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]);
- sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]);
-
- sum10 = a[s + H4(2 + 0)];
- sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]);
- sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]);
-
- sum11 = a[s + H4(2 + 1)];
- sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]);
- sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]);
-
- d[s + H4(0 + 0)] = sum00;
- d[s + H4(0 + 1)] = sum01;
- d[s + H4(2 + 0)] = sum10;
- d[s + H4(2 + 1)] = sum11;
- }
- clear_tail(d, opr_sz, simd_maxsz(desc));
-}
-
-void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
- void *stat, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc);
- intptr_t sel = simd_data(desc);
- float32 *d = vd, *a = va;
- bfloat16 *n = vn, *m = vm;
-
- for (i = 0; i < opr_sz / 4; ++i) {
- float32 nn = n[H2(i * 2 + sel)] << 16;
- float32 mm = m[H2(i * 2 + sel)] << 16;
- d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
- }
- clear_tail(d, opr_sz, simd_maxsz(desc));
-}
-
-void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
- void *va, void *stat, uint32_t desc)
-{
- intptr_t i, j, opr_sz = simd_oprsz(desc);
- intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
- intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
- intptr_t elements = opr_sz / 4;
- intptr_t eltspersegment = MIN(16 / 4, elements);
- float32 *d = vd, *a = va;
- bfloat16 *n = vn, *m = vm;
-
- for (i = 0; i < elements; i += eltspersegment) {
- float32 m_idx = m[H2(2 * i + index)] << 16;
-
- for (j = i; j < i + eltspersegment; j++) {
- float32 n_j = n[H2(2 * j + sel)] << 16;
- d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
- }
- }
- clear_tail(d, opr_sz, simd_maxsz(desc));
-}
-
-#define DO_CLAMP(NAME, TYPE) \
-void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc) \
-{ \
- intptr_t i, opr_sz = simd_oprsz(desc); \
- for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
- TYPE aa = *(TYPE *)(a + i); \
- TYPE nn = *(TYPE *)(n + i); \
- TYPE mm = *(TYPE *)(m + i); \
- TYPE dd = MIN(MAX(aa, nn), mm); \
- *(TYPE *)(d + i) = dd; \
- } \
- clear_tail(d, opr_sz, simd_maxsz(desc)); \
-}
-
-DO_CLAMP(gvec_sclamp_b, int8_t)
-DO_CLAMP(gvec_sclamp_h, int16_t)
-DO_CLAMP(gvec_sclamp_s, int32_t)
-DO_CLAMP(gvec_sclamp_d, int64_t)
-
-DO_CLAMP(gvec_uclamp_b, uint8_t)
-DO_CLAMP(gvec_uclamp_h, uint16_t)
-DO_CLAMP(gvec_uclamp_s, uint32_t)
-DO_CLAMP(gvec_uclamp_d, uint64_t)
+++ /dev/null
-/*
- * ARM AdvSIMD / SVE Vector Helpers
- *
- * Copyright (c) 2020 Linaro
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef TARGET_ARM_VEC_INTERNAL_H
-#define TARGET_ARM_VEC_INTERNAL_H
-
-/*
- * Note that vector data is stored in host-endian 64-bit chunks,
- * so addressing units smaller than that needs a host-endian fixup.
- *
- * The H<N> macros are used when indexing an array of elements of size N.
- *
- * The H1_<N> macros are used when performing byte arithmetic and then
- * casting the final pointer to a type of size N.
- */
-#if HOST_BIG_ENDIAN
-#define H1(x) ((x) ^ 7)
-#define H1_2(x) ((x) ^ 6)
-#define H1_4(x) ((x) ^ 4)
-#define H2(x) ((x) ^ 3)
-#define H4(x) ((x) ^ 1)
-#else
-#define H1(x) (x)
-#define H1_2(x) (x)
-#define H1_4(x) (x)
-#define H2(x) (x)
-#define H4(x) (x)
-#endif
-/*
- * Access to 64-bit elements isn't host-endian dependent; we provide H8
- * and H1_8 so that when a function is being generated from a macro we
- * can pass these rather than an empty macro argument, for clarity.
- */
-#define H8(x) (x)
-#define H1_8(x) (x)
-
-/*
- * Expand active predicate bits to bytes, for byte elements.
- */
-extern const uint64_t expand_pred_b_data[256];
-static inline uint64_t expand_pred_b(uint8_t byte)
-{
- return expand_pred_b_data[byte];
-}
-
-/* Similarly for half-word elements. */
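-/* Only every other predicate bit is significant for halfwords, hence 0x55. */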
-extern const uint64_t expand_pred_h_data[0x55 + 1];
-static inline uint64_t expand_pred_h(uint8_t byte)
-{
- return expand_pred_h_data[byte & 0x55];
-}
-
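-/* Zero the bytes of vd from opr_sz up to max_sz (the unused gvec tail). */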
-static inline void clear_tail(void *vd, uintptr_t opr_sz, uintptr_t max_sz)
-{
- uint64_t *d = vd + opr_sz;
- uintptr_t i;
-
- for (i = opr_sz; i < max_sz; i += 8) {
- *d++ = 0;
- }
-}
-
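-/*
- * Saturating, optionally rounding, shifts by a signed shift count
- * (negative counts shift right). "bits" is the element width; when
- * "sat" is NULL the saturation check is skipped, otherwise *sat is
- * set to 1 if the result saturated.
- */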
-static inline int32_t do_sqrshl_bhs(int32_t src, int32_t shift, int bits,
- bool round, uint32_t *sat)
-{
- if (shift <= -bits) {
- /* Rounding the sign bit always produces 0. */
- if (round) {
- return 0;
- }
- return src >> 31;
- } else if (shift < 0) {
- if (round) {
- src >>= -shift - 1;
- return (src >> 1) + (src & 1);
- }
- return src >> -shift;
- } else if (shift < bits) {
- int32_t val = src << shift;
- if (bits == 32) {
- if (!sat || val >> shift == src) {
- return val;
- }
- } else {
- int32_t extval = sextract32(val, 0, bits);
- if (!sat || val == extval) {
- return extval;
- }
- }
- } else if (!sat || src == 0) {
- return 0;
- }
-
- *sat = 1;
- return (1u << (bits - 1)) - (src >= 0);
-}
-
-static inline uint32_t do_uqrshl_bhs(uint32_t src, int32_t shift, int bits,
- bool round, uint32_t *sat)
-{
- if (shift <= -(bits + round)) {
- return 0;
- } else if (shift < 0) {
- if (round) {
- src >>= -shift - 1;
- return (src >> 1) + (src & 1);
- }
- return src >> -shift;
- } else if (shift < bits) {
- uint32_t val = src << shift;
- if (bits == 32) {
- if (!sat || val >> shift == src) {
- return val;
- }
- } else {
- uint32_t extval = extract32(val, 0, bits);
- if (!sat || val == extval) {
- return extval;
- }
- }
- } else if (!sat || src == 0) {
- return 0;
- }
-
- *sat = 1;
- return MAKE_64BIT_MASK(0, bits);
-}
-
-static inline int32_t do_suqrshl_bhs(int32_t src, int32_t shift, int bits,
- bool round, uint32_t *sat)
-{
- if (sat && src < 0) {
- *sat = 1;
- return 0;
- }
- return do_uqrshl_bhs(src, shift, bits, round, sat);
-}
-
-static inline int64_t do_sqrshl_d(int64_t src, int64_t shift,
- bool round, uint32_t *sat)
-{
- if (shift <= -64) {
- /* Rounding the sign bit always produces 0. */
- if (round) {
- return 0;
- }
- return src >> 63;
- } else if (shift < 0) {
- if (round) {
- src >>= -shift - 1;
- return (src >> 1) + (src & 1);
- }
- return src >> -shift;
- } else if (shift < 64) {
- int64_t val = src << shift;
- if (!sat || val >> shift == src) {
- return val;
- }
- } else if (!sat || src == 0) {
- return 0;
- }
-
- *sat = 1;
- return src < 0 ? INT64_MIN : INT64_MAX;
-}
-
-static inline uint64_t do_uqrshl_d(uint64_t src, int64_t shift,
- bool round, uint32_t *sat)
-{
- if (shift <= -(64 + round)) {
- return 0;
- } else if (shift < 0) {
- if (round) {
- src >>= -shift - 1;
- return (src >> 1) + (src & 1);
- }
- return src >> -shift;
- } else if (shift < 64) {
- uint64_t val = src << shift;
- if (!sat || val >> shift == src) {
- return val;
- }
- } else if (!sat || src == 0) {
- return 0;
- }
-
- *sat = 1;
- return UINT64_MAX;
-}
-
-static inline int64_t do_suqrshl_d(int64_t src, int64_t shift,
- bool round, uint32_t *sat)
-{
- if (sat && src < 0) {
- *sat = 1;
- return 0;
- }
- return do_uqrshl_d(src, shift, round, sat);
-}
-
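-/*
- * Signed saturating rounding doubling multiply-accumulate,
- * one helper per element size.
- */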
-int8_t do_sqrdmlah_b(int8_t, int8_t, int8_t, bool, bool);
-int16_t do_sqrdmlah_h(int16_t, int16_t, int16_t, bool, bool, uint32_t *);
-int32_t do_sqrdmlah_s(int32_t, int32_t, int32_t, bool, bool, uint32_t *);
-int64_t do_sqrdmlah_d(int64_t, int64_t, int64_t, bool, bool);
-
-/*
- * 8 x 8 -> 16 vector polynomial multiply where the inputs are
- * in the low 8 bits of each 16-bit element
- */
-uint64_t pmull_h(uint64_t op1, uint64_t op2);
-/*
- * 16 x 16 -> 32 vector polynomial multiply where the inputs are
- * in the low 16 bits of each 32-bit element
- */
-uint64_t pmull_w(uint64_t op1, uint64_t op2);
-
-/**
- * bfdotadd:
- * @sum: addend
- * @e1, @e2: multiplicand vectors
- *
- * BFloat16 2-way dot product of @e1 & @e2, accumulating with @sum.
- * The @e1 and @e2 operands correspond to the 32-bit source vector
- * slots and contain two BFloat16 values each.
- *
- * Corresponds to the ARM pseudocode function BFDotAdd.
- */
-float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2);
-
-#endif /* TARGET_ARM_VEC_INTERNAL_H */