* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
#include "qemu/osdep.h"
#include "cpu.h"
+#include "internals.h"
#include "exec/exec-all.h"
#include "exec/cpu_ldst.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
+#include "tcg/tcg.h"
/* Note that vector data is stored in host-endian 64-bit chunks,
return (intptr_t)-1 << esz;
}
-uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words)
+uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
{
+ intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
uint32_t flags = PREDTEST_INIT;
uint64_t *d = vd, *g = vg;
intptr_t i = 0;
uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
{
- intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS);
- intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
+ intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
+ intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
uint32_t flags = PREDTEST_INIT;
uint64_t *d = vd, *g = vg, esz_mask;
intptr_t i, next;
return flags;
}
-/* Store zero into every active element of Zd. We will use this for two
- * and three-operand predicated instructions for which logic dictates a
- * zero result. In particular, logical shift by element size, which is
- * otherwise undefined on the host.
- *
- * For element sizes smaller than uint64_t, we use tables to expand
- * the N bits of the controlling predicate to a byte mask, and clear
- * those bytes.
+/*
+ * Copy Zn into Zd, and store zero into inactive elements.
+ * If inv, store zeros into the active elements.
*/
-void HELPER(sve_clr_b)(void *vd, void *vg, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- uint64_t *d = vd;
- uint8_t *pg = vg;
- for (i = 0; i < opr_sz; i += 1) {
- d[i] &= ~expand_pred_b(pg[H1(i)]);
- }
-}
-
-void HELPER(sve_clr_h)(void *vd, void *vg, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- uint64_t *d = vd;
- uint8_t *pg = vg;
- for (i = 0; i < opr_sz; i += 1) {
- d[i] &= ~expand_pred_h(pg[H1(i)]);
- }
-}
-
-void HELPER(sve_clr_s)(void *vd, void *vg, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- uint64_t *d = vd;
- uint8_t *pg = vg;
- for (i = 0; i < opr_sz; i += 1) {
- d[i] &= ~expand_pred_s(pg[H1(i)]);
- }
-}
-
-void HELPER(sve_clr_d)(void *vd, void *vg, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- uint64_t *d = vd;
- uint8_t *pg = vg;
- for (i = 0; i < opr_sz; i += 1) {
- if (pg[H1(i)] & 1) {
- d[i] = 0;
- }
- }
-}
-
-/* Copy Zn into Zd, and store zero into inactive elements. */
/* MOVZ, byte elements: copy Zn into Zd, storing zero into the elements
 * not selected by the predicate.  INV is all-ones when bit 0 of
 * simd_data(desc) is set; XOR-ing it with the expanded predicate mask
 * flips the selection so the *active* elements are zeroed instead
 * (see the block comment above this group of helpers).
 */
void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
{
intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
uint64_t *d = vd, *n = vn;
uint8_t *pg = vg;
+
/* One iteration per 64-bit chunk; expand_pred_b turns the 8 predicate
 * bits for this chunk into a byte-wise mask. */
for (i = 0; i < opr_sz; i += 1) {
- d[i] = n[i] & expand_pred_b(pg[H1(i)]);
+ d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
}
}
/* MOVZ, halfword elements: as sve_movz_b, but expanding each predicate
 * bit pair to a 16-bit element mask via expand_pred_h.  INV (from bit 0
 * of simd_data) selects whether active or inactive elements are zeroed.
 */
void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
{
intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
uint64_t *d = vd, *n = vn;
uint8_t *pg = vg;
+
for (i = 0; i < opr_sz; i += 1) {
- d[i] = n[i] & expand_pred_h(pg[H1(i)]);
+ d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
}
}
/* MOVZ, word elements: as sve_movz_b, but expanding the predicate to a
 * 32-bit element mask via expand_pred_s.  INV (from bit 0 of simd_data)
 * selects whether active or inactive elements are zeroed.
 */
void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
{
intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
uint64_t *d = vd, *n = vn;
uint8_t *pg = vg;
+
for (i = 0; i < opr_sz; i += 1) {
- d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
}
}
intptr_t i, opr_sz = simd_oprsz(desc) / 8;
uint64_t *d = vd, *n = vn;
uint8_t *pg = vg;
+ uint8_t inv = simd_data(desc);
+
for (i = 0; i < opr_sz; i += 1) {
- d[i] = n[1] & -(uint64_t)(pg[H1(i)] & 1);
+ d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
}
}
}
}
-/* Big-endian hosts need to frob the byte indicies. If the copy
+/* Big-endian hosts need to frob the byte indices. If the copy
* happens to be 8-byte aligned, then no frobbing necessary.
*/
static void swap_memmove(void *vd, void *vs, size_t n)
}
}
+/* Similarly for memset of 0. */
+static void swap_memzero(void *vd, size_t n)
+{
+ uintptr_t d = (uintptr_t)vd;
+ uintptr_t o = (d | n) & 7;
+ size_t i;
+
+ /* Usually, the first bit of a predicate is set, so N is 0. */
+ if (likely(n == 0)) {
+ return;
+ }
+
+ /* Little-endian hosts need no byte-index frobbing; force the plain
+ * memset path regardless of the alignment of D and N. */
+#ifndef HOST_WORDS_BIGENDIAN
+ o = 0;
+#endif
+ /* Dispatch on the common alignment of the base and the length, so
+ * that every store below is naturally aligned for the H1_* index
+ * swizzling macros (cf. swap_memmove above). */
+ switch (o) {
+ case 0:
+ memset(vd, 0, n);
+ break;
+
+ case 4:
+ for (i = 0; i < n; i += 4) {
+ *(uint32_t *)H1_4(d + i) = 0;
+ }
+ break;
+
+ case 2:
+ case 6:
+ for (i = 0; i < n; i += 2) {
+ *(uint16_t *)H1_2(d + i) = 0;
+ }
+ break;
+
+ default:
+ /* Odd alignment: fall back to byte stores. */
+ for (i = 0; i < n; i++) {
+ *(uint8_t *)H1(d + i) = 0;
+ }
+ break;
+ }
+}
+
void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
{
intptr_t opr_sz = simd_oprsz(desc);
void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
- intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
- int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
- intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
+ intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
uint64_t *d = vd;
intptr_t i;
void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
- intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
- int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
- int odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1) << esz;
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
+ int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
uint64_t *d = vd, *n = vn, *m = vm;
uint64_t l, h;
intptr_t i;
if (oprsz <= 8) {
l = compress_bits(n[0] >> odd, esz);
h = compress_bits(m[0] >> odd, esz);
- d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz);
+ d[0] = l | (h << (4 * oprsz));
} else {
ARMPredicateReg tmp_m;
intptr_t oprsz_16 = oprsz / 16;
h = n[2 * i + 1];
l = compress_bits(l >> odd, esz);
h = compress_bits(h >> odd, esz);
- d[i] = l + (h << 32);
+ d[i] = l | (h << 32);
}
- /* For VL which is not a power of 2, the results from M do not
- align nicely with the uint64_t for D. Put the aligned results
- from M into TMP_M and then copy it into place afterward. */
+ /*
+ * For VL which is not a multiple of 512, the results from M do not
+ * align nicely with the uint64_t for D. Put the aligned results
+ * from M into TMP_M and then copy it into place afterward.
+ */
if (oprsz & 15) {
- d[i] = compress_bits(n[2 * i] >> odd, esz);
+ int final_shift = (oprsz & 15) * 2;
+
+ l = n[2 * i + 0];
+ h = n[2 * i + 1];
+ l = compress_bits(l >> odd, esz);
+ h = compress_bits(h >> odd, esz);
+ d[i] = l | (h << final_shift);
for (i = 0; i < oprsz_16; i++) {
l = m[2 * i + 0];
h = m[2 * i + 1];
l = compress_bits(l >> odd, esz);
h = compress_bits(h >> odd, esz);
- tmp_m.p[i] = l + (h << 32);
+ tmp_m.p[i] = l | (h << 32);
}
- tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz);
+ l = m[2 * i + 0];
+ h = m[2 * i + 1];
+ l = compress_bits(l >> odd, esz);
+ h = compress_bits(h >> odd, esz);
+ tmp_m.p[i] = l | (h << final_shift);
swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
} else {
h = m[2 * i + 1];
l = compress_bits(l >> odd, esz);
h = compress_bits(h >> odd, esz);
- d[oprsz_16 + i] = l + (h << 32);
+ d[oprsz_16 + i] = l | (h << 32);
}
}
}
void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
- intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
- uintptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
- bool odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
+ int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
uint64_t *d = vd, *n = vn, *m = vm;
uint64_t mask;
int shr, shl;
void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
{
- intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
- int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
intptr_t i, oprsz_2 = oprsz / 2;
if (oprsz <= 8) {
void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
{
- intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
- intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
uint64_t *d = vd;
intptr_t i;
#define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
-DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, uint8_t, uint64_t, ==)
-DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, uint16_t, uint64_t, ==)
-DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, uint32_t, uint64_t, ==)
+DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
+DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
+DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
-DO_CMP_PPZW_B(sve_cmpne_ppzw_b, uint8_t, uint64_t, !=)
-DO_CMP_PPZW_H(sve_cmpne_ppzw_h, uint16_t, uint64_t, !=)
-DO_CMP_PPZW_S(sve_cmpne_ppzw_s, uint32_t, uint64_t, !=)
+DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
+DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
+DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
return flags;
}
- /* Scale from predicate element count to bits. */
- count <<= esz;
- /* Bound to the bits in the predicate. */
- count = MIN(count, oprsz * 8);
-
/* Set all of the requested bits. */
for (i = 0; i < count / 64; ++i) {
d->p[i] = esz_mask;
*/
static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
{
- flag save = get_flush_inputs_to_zero(fpst);
+ bool save = get_flush_inputs_to_zero(fpst);
float32 ret;
set_flush_inputs_to_zero(false, fpst);
static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
{
- flag save = get_flush_inputs_to_zero(fpst);
+ bool save = get_flush_inputs_to_zero(fpst);
float64 ret;
set_flush_inputs_to_zero(false, fpst);
static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
{
- flag save = get_flush_to_zero(fpst);
+ bool save = get_flush_to_zero(fpst);
float16 ret;
set_flush_to_zero(false, fpst);
static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
{
- flag save = get_flush_to_zero(fpst);
+ bool save = get_flush_to_zero(fpst);
float16 ret;
set_flush_to_zero(false, fpst);
#undef DO_ZPZ_FP
-/* 4-operand predicated multiply-add. This requires 7 operands to pass
- * "properly", so we need to encode some of the registers into DESC.
- */
-QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 20 > 32);
-
-static void do_fmla_zpzzz_h(CPUARMState *env, void *vg, uint32_t desc,
+static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
+ float_status *status, uint32_t desc,
uint16_t neg1, uint16_t neg3)
{
intptr_t i = simd_oprsz(desc);
- unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
- unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
- unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
- unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
- void *vd = &env->vfp.zregs[rd];
- void *vn = &env->vfp.zregs[rn];
- void *vm = &env->vfp.zregs[rm];
- void *va = &env->vfp.zregs[ra];
uint64_t *g = vg;
do {
e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
e2 = *(uint16_t *)(vm + H1_2(i));
e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
- r = float16_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
+ r = float16_muladd(e1, e2, e3, 0, status);
*(uint16_t *)(vd + H1_2(i)) = r;
}
} while (i & 63);
} while (i != 0);
}
-void HELPER(sve_fmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
+void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
{
- do_fmla_zpzzz_h(env, vg, desc, 0, 0);
+ do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
}
-void HELPER(sve_fmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
+void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
{
- do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0);
+ do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
}
-void HELPER(sve_fnmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
+void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
{
- do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0x8000);
+ do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
}
-void HELPER(sve_fnmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
+void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
{
- do_fmla_zpzzz_h(env, vg, desc, 0, 0x8000);
+ do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
}
-static void do_fmla_zpzzz_s(CPUARMState *env, void *vg, uint32_t desc,
+static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
+ float_status *status, uint32_t desc,
uint32_t neg1, uint32_t neg3)
{
intptr_t i = simd_oprsz(desc);
- unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
- unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
- unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
- unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
- void *vd = &env->vfp.zregs[rd];
- void *vn = &env->vfp.zregs[rn];
- void *vm = &env->vfp.zregs[rm];
- void *va = &env->vfp.zregs[ra];
uint64_t *g = vg;
do {
e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
e2 = *(uint32_t *)(vm + H1_4(i));
e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
- r = float32_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
+ r = float32_muladd(e1, e2, e3, 0, status);
*(uint32_t *)(vd + H1_4(i)) = r;
}
} while (i & 63);
} while (i != 0);
}
-void HELPER(sve_fmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
+void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
{
- do_fmla_zpzzz_s(env, vg, desc, 0, 0);
+ do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
}
-void HELPER(sve_fmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
+void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
{
- do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0);
+ do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
}
-void HELPER(sve_fnmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
+void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
{
- do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0x80000000);
+ do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
}
-void HELPER(sve_fnmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
+void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
{
- do_fmla_zpzzz_s(env, vg, desc, 0, 0x80000000);
+ do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
}
-static void do_fmla_zpzzz_d(CPUARMState *env, void *vg, uint32_t desc,
+static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
+ float_status *status, uint32_t desc,
uint64_t neg1, uint64_t neg3)
{
intptr_t i = simd_oprsz(desc);
- unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
- unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
- unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
- unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
- void *vd = &env->vfp.zregs[rd];
- void *vn = &env->vfp.zregs[rn];
- void *vm = &env->vfp.zregs[rm];
- void *va = &env->vfp.zregs[ra];
uint64_t *g = vg;
do {
e1 = *(uint64_t *)(vn + i) ^ neg1;
e2 = *(uint64_t *)(vm + i);
e3 = *(uint64_t *)(va + i) ^ neg3;
- r = float64_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
+ r = float64_muladd(e1, e2, e3, 0, status);
*(uint64_t *)(vd + i) = r;
}
} while (i & 63);
} while (i != 0);
}
-void HELPER(sve_fmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
+void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
{
- do_fmla_zpzzz_d(env, vg, desc, 0, 0);
+ do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
}
-void HELPER(sve_fmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
+void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
{
- do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, 0);
+ do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
}
-void HELPER(sve_fnmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
+void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
{
- do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, INT64_MIN);
+ do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
}
-void HELPER(sve_fnmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
+void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
{
- do_fmla_zpzzz_d(env, vg, desc, 0, INT64_MIN);
+ do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
}
/* Two operand floating-point comparison controlled by a predicate.
* FP Complex Multiply
*/
-QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 22 > 32);
-
-void HELPER(sve_fcmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
+void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
{
intptr_t j, i = simd_oprsz(desc);
- unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
- unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
- unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
- unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
- unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
+ unsigned rot = simd_data(desc);
bool flip = rot & 1;
float16 neg_imag, neg_real;
- void *vd = &env->vfp.zregs[rd];
- void *vn = &env->vfp.zregs[rn];
- void *vm = &env->vfp.zregs[rm];
- void *va = &env->vfp.zregs[ra];
uint64_t *g = vg;
neg_imag = float16_set_sign(0, (rot & 2) != 0);
if (likely((pg >> (i & 63)) & 1)) {
d = *(float16 *)(va + H1_2(i));
- d = float16_muladd(e2, e1, d, 0, &env->vfp.fp_status_f16);
+ d = float16_muladd(e2, e1, d, 0, status);
*(float16 *)(vd + H1_2(i)) = d;
}
if (likely((pg >> (j & 63)) & 1)) {
d = *(float16 *)(va + H1_2(j));
- d = float16_muladd(e4, e3, d, 0, &env->vfp.fp_status_f16);
+ d = float16_muladd(e4, e3, d, 0, status);
*(float16 *)(vd + H1_2(j)) = d;
}
} while (i & 63);
} while (i != 0);
}
-void HELPER(sve_fcmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
+void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
{
intptr_t j, i = simd_oprsz(desc);
- unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
- unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
- unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
- unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
- unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
+ unsigned rot = simd_data(desc);
bool flip = rot & 1;
float32 neg_imag, neg_real;
- void *vd = &env->vfp.zregs[rd];
- void *vn = &env->vfp.zregs[rn];
- void *vm = &env->vfp.zregs[rm];
- void *va = &env->vfp.zregs[ra];
uint64_t *g = vg;
neg_imag = float32_set_sign(0, (rot & 2) != 0);
if (likely((pg >> (i & 63)) & 1)) {
d = *(float32 *)(va + H1_2(i));
- d = float32_muladd(e2, e1, d, 0, &env->vfp.fp_status);
+ d = float32_muladd(e2, e1, d, 0, status);
*(float32 *)(vd + H1_2(i)) = d;
}
if (likely((pg >> (j & 63)) & 1)) {
d = *(float32 *)(va + H1_2(j));
- d = float32_muladd(e4, e3, d, 0, &env->vfp.fp_status);
+ d = float32_muladd(e4, e3, d, 0, status);
*(float32 *)(vd + H1_2(j)) = d;
}
} while (i & 63);
} while (i != 0);
}
-void HELPER(sve_fcmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
+void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
{
intptr_t j, i = simd_oprsz(desc);
- unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
- unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
- unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
- unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
- unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
+ unsigned rot = simd_data(desc);
bool flip = rot & 1;
float64 neg_imag, neg_real;
- void *vd = &env->vfp.zregs[rd];
- void *vn = &env->vfp.zregs[rn];
- void *vm = &env->vfp.zregs[rm];
- void *va = &env->vfp.zregs[ra];
uint64_t *g = vg;
neg_imag = float64_set_sign(0, (rot & 2) != 0);
if (likely((pg >> (i & 63)) & 1)) {
d = *(float64 *)(va + H1_2(i));
- d = float64_muladd(e2, e1, d, 0, &env->vfp.fp_status);
+ d = float64_muladd(e2, e1, d, 0, status);
*(float64 *)(vd + H1_2(i)) = d;
}
if (likely((pg >> (j & 63)) & 1)) {
d = *(float64 *)(va + H1_2(j));
- d = float64_muladd(e4, e3, d, 0, &env->vfp.fp_status);
+ d = float64_muladd(e4, e3, d, 0, status);
*(float64 *)(vd + H1_2(j)) = d;
}
} while (i & 63);
/*
* Load contiguous data, protected by a governing predicate.
*/
-#define DO_LD1(NAME, FN, TYPEE, TYPEM, H) \
-static void do_##NAME(CPUARMState *env, void *vd, void *vg, \
- target_ulong addr, intptr_t oprsz, \
- uintptr_t ra) \
-{ \
- intptr_t i = 0; \
- do { \
- uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
- do { \
- TYPEM m = 0; \
- if (pg & 1) { \
- m = FN(env, addr, ra); \
- } \
- *(TYPEE *)(vd + H(i)) = m; \
- i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
- addr += sizeof(TYPEM); \
- } while (i & 15); \
- } while (i < oprsz); \
-} \
-void HELPER(NAME)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- do_##NAME(env, &env->vfp.zregs[simd_data(desc)], vg, \
- addr, simd_oprsz(desc), GETPC()); \
-}
-
-#define DO_LD2(NAME, FN, TYPEE, TYPEM, H) \
-void HELPER(NAME)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- intptr_t i, oprsz = simd_oprsz(desc); \
- intptr_t ra = GETPC(); \
- unsigned rd = simd_data(desc); \
- void *d1 = &env->vfp.zregs[rd]; \
- void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
- for (i = 0; i < oprsz; ) { \
- uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
- do { \
- TYPEM m1 = 0, m2 = 0; \
- if (pg & 1) { \
- m1 = FN(env, addr, ra); \
- m2 = FN(env, addr + sizeof(TYPEM), ra); \
- } \
- *(TYPEE *)(d1 + H(i)) = m1; \
- *(TYPEE *)(d2 + H(i)) = m2; \
- i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
- addr += 2 * sizeof(TYPEM); \
- } while (i & 15); \
- } \
-}
-
-#define DO_LD3(NAME, FN, TYPEE, TYPEM, H) \
-void HELPER(NAME)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- intptr_t i, oprsz = simd_oprsz(desc); \
- intptr_t ra = GETPC(); \
- unsigned rd = simd_data(desc); \
- void *d1 = &env->vfp.zregs[rd]; \
- void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
- void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \
- for (i = 0; i < oprsz; ) { \
- uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
- do { \
- TYPEM m1 = 0, m2 = 0, m3 = 0; \
- if (pg & 1) { \
- m1 = FN(env, addr, ra); \
- m2 = FN(env, addr + sizeof(TYPEM), ra); \
- m3 = FN(env, addr + 2 * sizeof(TYPEM), ra); \
- } \
- *(TYPEE *)(d1 + H(i)) = m1; \
- *(TYPEE *)(d2 + H(i)) = m2; \
- *(TYPEE *)(d3 + H(i)) = m3; \
- i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
- addr += 3 * sizeof(TYPEM); \
- } while (i & 15); \
- } \
-}
-
-#define DO_LD4(NAME, FN, TYPEE, TYPEM, H) \
-void HELPER(NAME)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- intptr_t i, oprsz = simd_oprsz(desc); \
- intptr_t ra = GETPC(); \
- unsigned rd = simd_data(desc); \
- void *d1 = &env->vfp.zregs[rd]; \
- void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
- void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \
- void *d4 = &env->vfp.zregs[(rd + 3) & 31]; \
- for (i = 0; i < oprsz; ) { \
- uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
- do { \
- TYPEM m1 = 0, m2 = 0, m3 = 0, m4 = 0; \
- if (pg & 1) { \
- m1 = FN(env, addr, ra); \
- m2 = FN(env, addr + sizeof(TYPEM), ra); \
- m3 = FN(env, addr + 2 * sizeof(TYPEM), ra); \
- m4 = FN(env, addr + 3 * sizeof(TYPEM), ra); \
- } \
- *(TYPEE *)(d1 + H(i)) = m1; \
- *(TYPEE *)(d2 + H(i)) = m2; \
- *(TYPEE *)(d3 + H(i)) = m3; \
- *(TYPEE *)(d4 + H(i)) = m4; \
- i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
- addr += 4 * sizeof(TYPEM); \
- } while (i & 15); \
- } \
-}
-
-DO_LD1(sve_ld1bhu_r, cpu_ldub_data_ra, uint16_t, uint8_t, H1_2)
-DO_LD1(sve_ld1bhs_r, cpu_ldsb_data_ra, uint16_t, int8_t, H1_2)
-DO_LD1(sve_ld1bsu_r, cpu_ldub_data_ra, uint32_t, uint8_t, H1_4)
-DO_LD1(sve_ld1bss_r, cpu_ldsb_data_ra, uint32_t, int8_t, H1_4)
-DO_LD1(sve_ld1bdu_r, cpu_ldub_data_ra, uint64_t, uint8_t, )
-DO_LD1(sve_ld1bds_r, cpu_ldsb_data_ra, uint64_t, int8_t, )
-DO_LD1(sve_ld1hsu_r, cpu_lduw_data_ra, uint32_t, uint16_t, H1_4)
-DO_LD1(sve_ld1hss_r, cpu_ldsw_data_ra, uint32_t, int8_t, H1_4)
-DO_LD1(sve_ld1hdu_r, cpu_lduw_data_ra, uint64_t, uint16_t, )
-DO_LD1(sve_ld1hds_r, cpu_ldsw_data_ra, uint64_t, int16_t, )
-
-DO_LD1(sve_ld1sdu_r, cpu_ldl_data_ra, uint64_t, uint32_t, )
-DO_LD1(sve_ld1sds_r, cpu_ldl_data_ra, uint64_t, int32_t, )
-
-DO_LD1(sve_ld1bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
-DO_LD2(sve_ld2bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
-DO_LD3(sve_ld3bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
-DO_LD4(sve_ld4bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
-
-DO_LD1(sve_ld1hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
-DO_LD2(sve_ld2hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
-DO_LD3(sve_ld3hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
-DO_LD4(sve_ld4hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
-
-DO_LD1(sve_ld1ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
-DO_LD2(sve_ld2ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
-DO_LD3(sve_ld3ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
-DO_LD4(sve_ld4ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
-
-DO_LD1(sve_ld1dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
-DO_LD2(sve_ld2dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
-DO_LD3(sve_ld3dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
-DO_LD4(sve_ld4dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
+/*
+ * Load one element into @vd + @reg_off from @host.
+ * The controlling predicate is known to be true.
+ */
+typedef void sve_ldst1_host_fn(void *vd, intptr_t reg_off, void *host);
-#undef DO_LD1
-#undef DO_LD2
-#undef DO_LD3
-#undef DO_LD4
+/*
+ * Load one element into @vd + @reg_off from (@env, @vaddr, @ra).
+ * The controlling predicate is known to be true.
+ */
+typedef void sve_ldst1_tlb_fn(CPUARMState *env, void *vd, intptr_t reg_off,
+ target_ulong vaddr, uintptr_t retaddr);
/*
- * Load contiguous data, first-fault and no-fault.
+ * Generate the above primitives.
*/
-#ifdef CONFIG_USER_ONLY
+/* Host-side load primitive: read one TYPEM element from HOST memory
+ * and widen it into the TYPEE vector element at REG_OFF. */
+#define DO_LD_HOST(NAME, H, TYPEE, TYPEM, HOST) \
+static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
+{ \
+ TYPEM val = HOST(host); \
+ *(TYPEE *)(vd + H(reg_off)) = val; \
+}
+
+/* Host-side store primitive: narrow the TYPEE vector element at
+ * REG_OFF to TYPEM and write it to HOST memory. */
+#define DO_ST_HOST(NAME, H, TYPEE, TYPEM, HOST) \
+static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
+{ HOST(host, (TYPEM)*(TYPEE *)(vd + H(reg_off))); }
+
+/* TLB-mediated load primitive: as DO_LD_HOST, but through the
+ * cpu_ld*_data_ra accessors with a guest address and a host return
+ * address for fault unwinding. */
+#define DO_LD_TLB(NAME, H, TYPEE, TYPEM, TLB) \
+static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
+ target_ulong addr, uintptr_t ra) \
+{ \
+ *(TYPEE *)(vd + H(reg_off)) = \
+ (TYPEM)TLB(env, useronly_clean_ptr(addr), ra); \
+}
+
+/* TLB-mediated store primitive, mirroring DO_LD_TLB. */
+#define DO_ST_TLB(NAME, H, TYPEE, TYPEM, TLB) \
+static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
+ target_ulong addr, uintptr_t ra) \
+{ \
+ TLB(env, useronly_clean_ptr(addr), \
+ (TYPEM)*(TYPEE *)(vd + H(reg_off)), ra); \
+}
+
+/* Single-byte memory elements need no endianness variants: generate
+ * just one host and one TLB accessor per load type. */
+#define DO_LD_PRIM_1(NAME, H, TE, TM) \
+ DO_LD_HOST(NAME, H, TE, TM, ldub_p) \
+ DO_LD_TLB(NAME, H, TE, TM, cpu_ldub_data_ra)
+
+DO_LD_PRIM_1(ld1bb, H1, uint8_t, uint8_t)
+DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t)
+DO_LD_PRIM_1(ld1bhs, H1_2, uint16_t, int8_t)
+DO_LD_PRIM_1(ld1bsu, H1_4, uint32_t, uint8_t)
+DO_LD_PRIM_1(ld1bss, H1_4, uint32_t, int8_t)
+DO_LD_PRIM_1(ld1bdu, , uint64_t, uint8_t)
+DO_LD_PRIM_1(ld1bds, , uint64_t, int8_t)
+
+#define DO_ST_PRIM_1(NAME, H, TE, TM) \
+ DO_ST_HOST(st1##NAME, H, TE, TM, stb_p) \
+ DO_ST_TLB(st1##NAME, H, TE, TM, cpu_stb_data_ra)
+
+DO_ST_PRIM_1(bb, H1, uint8_t, uint8_t)
+DO_ST_PRIM_1(bh, H1_2, uint16_t, uint8_t)
+DO_ST_PRIM_1(bs, H1_4, uint32_t, uint8_t)
+DO_ST_PRIM_1(bd, , uint64_t, uint8_t)
+
+/* Multi-byte memory elements: generate big-endian (_be) and
+ * little-endian (_le) variants of both the host and TLB accessors. */
+#define DO_LD_PRIM_2(NAME, H, TE, TM, LD) \
+ DO_LD_HOST(ld1##NAME##_be, H, TE, TM, LD##_be_p) \
+ DO_LD_HOST(ld1##NAME##_le, H, TE, TM, LD##_le_p) \
+ DO_LD_TLB(ld1##NAME##_be, H, TE, TM, cpu_##LD##_be_data_ra) \
+ DO_LD_TLB(ld1##NAME##_le, H, TE, TM, cpu_##LD##_le_data_ra)
+
+#define DO_ST_PRIM_2(NAME, H, TE, TM, ST) \
+ DO_ST_HOST(st1##NAME##_be, H, TE, TM, ST##_be_p) \
+ DO_ST_HOST(st1##NAME##_le, H, TE, TM, ST##_le_p) \
+ DO_ST_TLB(st1##NAME##_be, H, TE, TM, cpu_##ST##_be_data_ra) \
+ DO_ST_TLB(st1##NAME##_le, H, TE, TM, cpu_##ST##_le_data_ra)
+
+DO_LD_PRIM_2(hh, H1_2, uint16_t, uint16_t, lduw)
+DO_LD_PRIM_2(hsu, H1_4, uint32_t, uint16_t, lduw)
+DO_LD_PRIM_2(hss, H1_4, uint32_t, int16_t, lduw)
+DO_LD_PRIM_2(hdu, , uint64_t, uint16_t, lduw)
+DO_LD_PRIM_2(hds, , uint64_t, int16_t, lduw)
+
+DO_ST_PRIM_2(hh, H1_2, uint16_t, uint16_t, stw)
+DO_ST_PRIM_2(hs, H1_4, uint32_t, uint16_t, stw)
+DO_ST_PRIM_2(hd, , uint64_t, uint16_t, stw)
+
+DO_LD_PRIM_2(ss, H1_4, uint32_t, uint32_t, ldl)
+DO_LD_PRIM_2(sdu, , uint64_t, uint32_t, ldl)
+DO_LD_PRIM_2(sds, , uint64_t, int32_t, ldl)
+
+DO_ST_PRIM_2(ss, H1_4, uint32_t, uint32_t, stl)
+DO_ST_PRIM_2(sd, , uint64_t, uint32_t, stl)
+
+DO_LD_PRIM_2(dd, , uint64_t, uint64_t, ldq)
+DO_ST_PRIM_2(dd, , uint64_t, uint64_t, stq)
+
+/* The generator macros are needed only for the expansions above. */
+#undef DO_LD_TLB
+#undef DO_ST_TLB
+#undef DO_LD_HOST
+#undef DO_LD_PRIM_1
+#undef DO_ST_PRIM_1
+#undef DO_LD_PRIM_2
+#undef DO_ST_PRIM_2
-/* Fault on byte I. All bits in FFR from I are cleared. The vector
- * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
- * option, which leaves subsequent data unchanged.
+/*
+ * Skip through a sequence of inactive elements in the guarding predicate @vg,
+ * beginning at @reg_off bounded by @reg_max. Return the offset of the active
+ * element >= @reg_off, or @reg_max if there were no active elements at all.
*/
-static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
+static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
+ intptr_t reg_max, int esz)
{
- uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
+ uint64_t pg_mask = pred_esz_masks[esz];
+ uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
- if (i & 63) {
- ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
- i = ROUND_UP(i, 64);
- }
- for (; i < oprsz; i += 64) {
- ffr[i / 64] = 0;
+ /* In normal usage, the first element is active. */
+ if (likely(pg & 1)) {
+ return reg_off;
}
-}
-/* Hold the mmap lock during the operation so that there is no race
- * between page_check_range and the load operation. We expect the
- * usual case to have no faults at all, so we check the whole range
- * first and if successful defer to the normal load operation.
- *
- * TODO: Change mmap_lock to a rwlock so that multiple readers
- * can run simultaneously. This will probably help other uses
- * within QEMU as well.
- */
-#define DO_LDFF1(PART, FN, TYPEE, TYPEM, H) \
-static void do_sve_ldff1##PART(CPUARMState *env, void *vd, void *vg, \
- target_ulong addr, intptr_t oprsz, \
- bool first, uintptr_t ra) \
-{ \
- intptr_t i = 0; \
- do { \
- uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
- do { \
- TYPEM m = 0; \
- if (pg & 1) { \
- if (!first && \
- unlikely(page_check_range(addr, sizeof(TYPEM), \
- PAGE_READ))) { \
- record_fault(env, i, oprsz); \
- return; \
- } \
- m = FN(env, addr, ra); \
- first = false; \
- } \
- *(TYPEE *)(vd + H(i)) = m; \
- i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
- addr += sizeof(TYPEM); \
- } while (i & 15); \
- } while (i < oprsz); \
-} \
-void HELPER(sve_ldff1##PART)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- intptr_t oprsz = simd_oprsz(desc); \
- unsigned rd = simd_data(desc); \
- void *vd = &env->vfp.zregs[rd]; \
- mmap_lock(); \
- if (likely(page_check_range(addr, oprsz, PAGE_READ) == 0)) { \
- do_sve_ld1##PART(env, vd, vg, addr, oprsz, GETPC()); \
- } else { \
- do_sve_ldff1##PART(env, vd, vg, addr, oprsz, true, GETPC()); \
- } \
- mmap_unlock(); \
-}
+ if (pg == 0) {
+ reg_off &= -64;
+ do {
+ reg_off += 64;
+ if (unlikely(reg_off >= reg_max)) {
+ /* The entire predicate was false. */
+ return reg_max;
+ }
+ pg = vg[reg_off >> 6] & pg_mask;
+ } while (pg == 0);
+ }
+ reg_off += ctz64(pg);
-/* No-fault loads are like first-fault loads without the
- * first faulting special case.
- */
-#define DO_LDNF1(PART) \
-void HELPER(sve_ldnf1##PART)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- intptr_t oprsz = simd_oprsz(desc); \
- unsigned rd = simd_data(desc); \
- void *vd = &env->vfp.zregs[rd]; \
- mmap_lock(); \
- if (likely(page_check_range(addr, oprsz, PAGE_READ) == 0)) { \
- do_sve_ld1##PART(env, vd, vg, addr, oprsz, GETPC()); \
- } else { \
- do_sve_ldff1##PART(env, vd, vg, addr, oprsz, false, GETPC()); \
- } \
- mmap_unlock(); \
+ /* We should never see an out of range predicate bit set. */
+ tcg_debug_assert(reg_off < reg_max);
+ return reg_off;
}
-#else
-
-/* TODO: System mode is not yet supported.
- * This would probably use tlb_vaddr_to_host.
+/*
+ * Resolve the guest virtual address to info->host and info->flags.
+ * If @nofault, return false if the page is invalid, otherwise
+ * exit via page fault exception.
*/
-#define DO_LDFF1(PART, FN, TYPEE, TYPEM, H) \
-void HELPER(sve_ldff1##PART)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- g_assert_not_reached(); \
-}
-#define DO_LDNF1(PART) \
-void HELPER(sve_ldnf1##PART)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- g_assert_not_reached(); \
-}
+typedef struct {
+ void *host;
+ int flags;
+ MemTxAttrs attrs;
+} SVEHostPage;
+
+static bool sve_probe_page(SVEHostPage *info, bool nofault,
+ CPUARMState *env, target_ulong addr,
+ int mem_off, MMUAccessType access_type,
+ int mmu_idx, uintptr_t retaddr)
+{
+ int flags;
+
+ addr += mem_off;
+
+ /*
+ * User-only currently always issues with TBI. See the comment
+ * above useronly_clean_ptr. Usually we clean this top byte away
+ * during translation, but we can't do that for e.g. vector + imm
+ * addressing modes.
+ *
+ * We currently always enable TBI for user-only, and do not provide
+ * a way to turn it off. So clean the pointer unconditionally here,
+ * rather than look it up here, or pass it down from above.
+ */
+ addr = useronly_clean_ptr(addr);
-#endif
+ flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault,
+ &info->host, retaddr);
+ info->flags = flags;
-DO_LDFF1(bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
-DO_LDFF1(bhu_r, cpu_ldub_data_ra, uint16_t, uint8_t, H1_2)
-DO_LDFF1(bhs_r, cpu_ldsb_data_ra, uint16_t, int8_t, H1_2)
-DO_LDFF1(bsu_r, cpu_ldub_data_ra, uint32_t, uint8_t, H1_4)
-DO_LDFF1(bss_r, cpu_ldsb_data_ra, uint32_t, int8_t, H1_4)
-DO_LDFF1(bdu_r, cpu_ldub_data_ra, uint64_t, uint8_t, )
-DO_LDFF1(bds_r, cpu_ldsb_data_ra, uint64_t, int8_t, )
+ if (flags & TLB_INVALID_MASK) {
+ g_assert(nofault);
+ return false;
+ }
-DO_LDFF1(hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
-DO_LDFF1(hsu_r, cpu_lduw_data_ra, uint32_t, uint16_t, H1_4)
-DO_LDFF1(hss_r, cpu_ldsw_data_ra, uint32_t, int8_t, H1_4)
-DO_LDFF1(hdu_r, cpu_lduw_data_ra, uint64_t, uint16_t, )
-DO_LDFF1(hds_r, cpu_ldsw_data_ra, uint64_t, int16_t, )
+ /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
+ info->host -= mem_off;
-DO_LDFF1(ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
-DO_LDFF1(sdu_r, cpu_ldl_data_ra, uint64_t, uint32_t, )
-DO_LDFF1(sds_r, cpu_ldl_data_ra, uint64_t, int32_t, )
+#ifdef CONFIG_USER_ONLY
+ memset(&info->attrs, 0, sizeof(info->attrs));
+#else
+ /*
+ * Find the iotlbentry for addr and return the transaction attributes.
+ * This *must* be present in the TLB because we just found the mapping.
+ */
+ {
+ uintptr_t index = tlb_index(env, mmu_idx, addr);
+
+# ifdef CONFIG_DEBUG_TCG
+ CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
+ target_ulong comparator = (access_type == MMU_DATA_LOAD
+ ? entry->addr_read
+ : tlb_addr_write(entry));
+ g_assert(tlb_hit(comparator, addr));
+# endif
+
+ CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
+ info->attrs = iotlbentry->attrs;
+ }
+#endif
-DO_LDFF1(dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
+ return true;
+}
-#undef DO_LDFF1
-DO_LDNF1(bb_r)
-DO_LDNF1(bhu_r)
-DO_LDNF1(bhs_r)
-DO_LDNF1(bsu_r)
-DO_LDNF1(bss_r)
-DO_LDNF1(bdu_r)
-DO_LDNF1(bds_r)
+/*
+ * Analyse contiguous data, protected by a governing predicate.
+ */
-DO_LDNF1(hh_r)
-DO_LDNF1(hsu_r)
-DO_LDNF1(hss_r)
-DO_LDNF1(hdu_r)
-DO_LDNF1(hds_r)
+typedef enum {
+ FAULT_NO,
+ FAULT_FIRST,
+ FAULT_ALL,
+} SVEContFault;
+
+typedef struct {
+ /*
+ * First and last element wholly contained within the two pages.
+ * mem_off_first[0] and reg_off_first[0] are always set >= 0.
+ * reg_off_last[0] may be < 0 if the first element crosses pages.
+ * All of mem_off_first[1], reg_off_first[1] and reg_off_last[1]
+ * are set >= 0 only if there are complete elements on a second page.
+ *
+ * The reg_off_* offsets are relative to the internal vector register.
+ * The mem_off_first offset is relative to the memory address; the
+ * two offsets are different when a load operation extends, a store
+ * operation truncates, or for multi-register operations.
+ */
+ int16_t mem_off_first[2];
+ int16_t reg_off_first[2];
+ int16_t reg_off_last[2];
-DO_LDNF1(ss_r)
-DO_LDNF1(sdu_r)
-DO_LDNF1(sds_r)
+ /*
+ * One element that is misaligned and spans both pages,
+ * or -1 if there is no such active element.
+ */
+ int16_t mem_off_split;
+ int16_t reg_off_split;
-DO_LDNF1(dd_r)
+ /*
+ * The byte offset at which the entire operation crosses a page boundary.
+ * Set >= 0 if and only if the entire operation spans two pages.
+ */
+ int16_t page_split;
-#undef DO_LDNF1
+ /* TLB data for the two pages. */
+ SVEHostPage page[2];
+} SVEContLdSt;
/*
- * Store contiguous data, protected by a governing predicate.
+ * Find first active element on each page, and a loose bound for the
+ * final element on each page. Identify any single element that spans
+ * the page boundary. Return true if there are any active elements.
*/
-#define DO_ST1(NAME, FN, TYPEE, TYPEM, H) \
-void HELPER(NAME)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- intptr_t i, oprsz = simd_oprsz(desc); \
- intptr_t ra = GETPC(); \
- unsigned rd = simd_data(desc); \
- void *vd = &env->vfp.zregs[rd]; \
- for (i = 0; i < oprsz; ) { \
- uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
- do { \
- if (pg & 1) { \
- TYPEM m = *(TYPEE *)(vd + H(i)); \
- FN(env, addr, m, ra); \
- } \
- i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
- addr += sizeof(TYPEM); \
- } while (i & 15); \
- } \
-}
+static bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr,
+ uint64_t *vg, intptr_t reg_max,
+ int esz, int msize)
+{
+ const int esize = 1 << esz;
+ const uint64_t pg_mask = pred_esz_masks[esz];
+ intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
+ intptr_t mem_off_last, mem_off_split;
+ intptr_t page_split, elt_split;
+ intptr_t i;
-#define DO_ST1_D(NAME, FN, TYPEM) \
-void HELPER(NAME)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- intptr_t i, oprsz = simd_oprsz(desc) / 8; \
- intptr_t ra = GETPC(); \
- unsigned rd = simd_data(desc); \
- uint64_t *d = &env->vfp.zregs[rd].d[0]; \
- uint8_t *pg = vg; \
- for (i = 0; i < oprsz; i += 1) { \
- if (pg[H1(i)] & 1) { \
- FN(env, addr, d[i], ra); \
- } \
- addr += sizeof(TYPEM); \
- } \
-}
+ /* Set all of the element indices to -1, and the TLB data to 0. */
+ memset(info, -1, offsetof(SVEContLdSt, page));
+ memset(info->page, 0, sizeof(info->page));
-#define DO_ST2(NAME, FN, TYPEE, TYPEM, H) \
-void HELPER(NAME)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- intptr_t i, oprsz = simd_oprsz(desc); \
- intptr_t ra = GETPC(); \
- unsigned rd = simd_data(desc); \
- void *d1 = &env->vfp.zregs[rd]; \
- void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
- for (i = 0; i < oprsz; ) { \
- uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
- do { \
- if (pg & 1) { \
- TYPEM m1 = *(TYPEE *)(d1 + H(i)); \
- TYPEM m2 = *(TYPEE *)(d2 + H(i)); \
- FN(env, addr, m1, ra); \
- FN(env, addr + sizeof(TYPEM), m2, ra); \
- } \
- i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
- addr += 2 * sizeof(TYPEM); \
- } while (i & 15); \
- } \
-}
+ /* Gross scan over the entire predicate to find bounds. */
+ i = 0;
+ do {
+ uint64_t pg = vg[i] & pg_mask;
+ if (pg) {
+ reg_off_last = i * 64 + 63 - clz64(pg);
+ if (reg_off_first < 0) {
+ reg_off_first = i * 64 + ctz64(pg);
+ }
+ }
+ } while (++i * 64 < reg_max);
-#define DO_ST3(NAME, FN, TYPEE, TYPEM, H) \
-void HELPER(NAME)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- intptr_t i, oprsz = simd_oprsz(desc); \
- intptr_t ra = GETPC(); \
- unsigned rd = simd_data(desc); \
- void *d1 = &env->vfp.zregs[rd]; \
- void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
- void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \
- for (i = 0; i < oprsz; ) { \
- uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
- do { \
- if (pg & 1) { \
- TYPEM m1 = *(TYPEE *)(d1 + H(i)); \
- TYPEM m2 = *(TYPEE *)(d2 + H(i)); \
- TYPEM m3 = *(TYPEE *)(d3 + H(i)); \
- FN(env, addr, m1, ra); \
- FN(env, addr + sizeof(TYPEM), m2, ra); \
- FN(env, addr + 2 * sizeof(TYPEM), m3, ra); \
- } \
- i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
- addr += 3 * sizeof(TYPEM); \
- } while (i & 15); \
- } \
-}
+ if (unlikely(reg_off_first < 0)) {
+ /* No active elements, no pages touched. */
+ return false;
+ }
+ tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
-#define DO_ST4(NAME, FN, TYPEE, TYPEM, H) \
-void HELPER(NAME)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- intptr_t i, oprsz = simd_oprsz(desc); \
- intptr_t ra = GETPC(); \
- unsigned rd = simd_data(desc); \
- void *d1 = &env->vfp.zregs[rd]; \
- void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
- void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \
- void *d4 = &env->vfp.zregs[(rd + 3) & 31]; \
- for (i = 0; i < oprsz; ) { \
- uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
- do { \
- if (pg & 1) { \
- TYPEM m1 = *(TYPEE *)(d1 + H(i)); \
- TYPEM m2 = *(TYPEE *)(d2 + H(i)); \
- TYPEM m3 = *(TYPEE *)(d3 + H(i)); \
- TYPEM m4 = *(TYPEE *)(d4 + H(i)); \
- FN(env, addr, m1, ra); \
- FN(env, addr + sizeof(TYPEM), m2, ra); \
- FN(env, addr + 2 * sizeof(TYPEM), m3, ra); \
- FN(env, addr + 3 * sizeof(TYPEM), m4, ra); \
- } \
- i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
- addr += 4 * sizeof(TYPEM); \
- } while (i & 15); \
- } \
-}
+ info->reg_off_first[0] = reg_off_first;
+ info->mem_off_first[0] = (reg_off_first >> esz) * msize;
+ mem_off_last = (reg_off_last >> esz) * msize;
-DO_ST1(sve_st1bh_r, cpu_stb_data_ra, uint16_t, uint8_t, H1_2)
-DO_ST1(sve_st1bs_r, cpu_stb_data_ra, uint32_t, uint8_t, H1_4)
-DO_ST1_D(sve_st1bd_r, cpu_stb_data_ra, uint8_t)
+ page_split = -(addr | TARGET_PAGE_MASK);
+ if (likely(mem_off_last + msize <= page_split)) {
+ /* The entire operation fits within a single page. */
+ info->reg_off_last[0] = reg_off_last;
+ return true;
+ }
-DO_ST1(sve_st1hs_r, cpu_stw_data_ra, uint32_t, uint16_t, H1_4)
-DO_ST1_D(sve_st1hd_r, cpu_stw_data_ra, uint16_t)
+ info->page_split = page_split;
+ elt_split = page_split / msize;
+ reg_off_split = elt_split << esz;
+ mem_off_split = elt_split * msize;
-DO_ST1_D(sve_st1sd_r, cpu_stl_data_ra, uint32_t)
+ /*
+ * This is the last full element on the first page, but it is not
+ * necessarily active. If there is no full element, i.e. the first
+ * active element is the one that's split, this value remains -1.
+ * It is useful as iteration bounds.
+ */
+ if (elt_split != 0) {
+ info->reg_off_last[0] = reg_off_split - esize;
+ }
-DO_ST1(sve_st1bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
-DO_ST2(sve_st2bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
-DO_ST3(sve_st3bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
-DO_ST4(sve_st4bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
+ /* Determine if an unaligned element spans the pages. */
+ if (page_split % msize != 0) {
+ /* It is helpful to know if the split element is active. */
+ if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
+ info->reg_off_split = reg_off_split;
+ info->mem_off_split = mem_off_split;
-DO_ST1(sve_st1hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
-DO_ST2(sve_st2hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
-DO_ST3(sve_st3hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
-DO_ST4(sve_st4hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
+ if (reg_off_split == reg_off_last) {
+ /* The page crossing element is last. */
+ return true;
+ }
+ }
+ reg_off_split += esize;
+ mem_off_split += msize;
+ }
-DO_ST1(sve_st1ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
-DO_ST2(sve_st2ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
-DO_ST3(sve_st3ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
-DO_ST4(sve_st4ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
+ /*
+ * We do want the first active element on the second page, because
+ * this may affect the address reported in an exception.
+ */
+ reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
+ tcg_debug_assert(reg_off_split <= reg_off_last);
+ info->reg_off_first[1] = reg_off_split;
+ info->mem_off_first[1] = (reg_off_split >> esz) * msize;
+ info->reg_off_last[1] = reg_off_last;
+ return true;
+}
-DO_ST1_D(sve_st1dd_r, cpu_stq_data_ra, uint64_t)
+/*
+ * Resolve the guest virtual addresses to info->page[].
+ * Control the generation of page faults with @fault. Return false if
+ * there is no work to do, which can only happen with @fault == FAULT_NO.
+ */
+static bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
+ CPUARMState *env, target_ulong addr,
+ MMUAccessType access_type, uintptr_t retaddr)
+{
+ int mmu_idx = cpu_mmu_index(env, false);
+ int mem_off = info->mem_off_first[0];
+ bool nofault = fault == FAULT_NO;
+ bool have_work = true;
+
+ if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
+ access_type, mmu_idx, retaddr)) {
+ /* No work to be done. */
+ return false;
+ }
-void HELPER(sve_st2dd_r)(CPUARMState *env, void *vg,
- target_ulong addr, uint32_t desc)
-{
- intptr_t i, oprsz = simd_oprsz(desc) / 8;
- intptr_t ra = GETPC();
- unsigned rd = simd_data(desc);
- uint64_t *d1 = &env->vfp.zregs[rd].d[0];
- uint64_t *d2 = &env->vfp.zregs[(rd + 1) & 31].d[0];
- uint8_t *pg = vg;
+ if (likely(info->page_split < 0)) {
+ /* The entire operation was on the one page. */
+ return true;
+ }
- for (i = 0; i < oprsz; i += 1) {
- if (pg[H1(i)] & 1) {
- cpu_stq_data_ra(env, addr, d1[i], ra);
- cpu_stq_data_ra(env, addr + 8, d2[i], ra);
+ /*
+ * If the second page is invalid, then we want the fault address to be
+ * the first byte on that page which is accessed.
+ */
+ if (info->mem_off_split >= 0) {
+ /*
+ * There is an element split across the pages. The fault address
+ * should be the first byte of the second page.
+ */
+ mem_off = info->page_split;
+ /*
+ * If the split element is also the first active element
+ * of the vector, then: For first-fault we should continue
+ * to generate faults for the second page. For no-fault,
+ * we have work only if the second page is valid.
+ */
+ if (info->mem_off_first[0] < info->mem_off_split) {
+ nofault = FAULT_FIRST;
+ have_work = false;
}
- addr += 2 * 8;
+ } else {
+ /*
+ * There is no element split across the pages. The fault address
+ * should be the first active element on the second page.
+ */
+ mem_off = info->mem_off_first[1];
+ /*
+ * There must have been one active element on the first page,
+ * so we're out of first-fault territory.
+ */
+ nofault = fault != FAULT_ALL;
}
+
+ have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
+ access_type, mmu_idx, retaddr);
+ return have_work;
}
-void HELPER(sve_st3dd_r)(CPUARMState *env, void *vg,
- target_ulong addr, uint32_t desc)
+static void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
+ uint64_t *vg, target_ulong addr,
+ int esize, int msize, int wp_access,
+ uintptr_t retaddr)
{
- intptr_t i, oprsz = simd_oprsz(desc) / 8;
- intptr_t ra = GETPC();
- unsigned rd = simd_data(desc);
- uint64_t *d1 = &env->vfp.zregs[rd].d[0];
- uint64_t *d2 = &env->vfp.zregs[(rd + 1) & 31].d[0];
- uint64_t *d3 = &env->vfp.zregs[(rd + 2) & 31].d[0];
- uint8_t *pg = vg;
+#ifndef CONFIG_USER_ONLY
+ intptr_t mem_off, reg_off, reg_last;
+ int flags0 = info->page[0].flags;
+ int flags1 = info->page[1].flags;
- for (i = 0; i < oprsz; i += 1) {
- if (pg[H1(i)] & 1) {
- cpu_stq_data_ra(env, addr, d1[i], ra);
- cpu_stq_data_ra(env, addr + 8, d2[i], ra);
- cpu_stq_data_ra(env, addr + 16, d3[i], ra);
+ if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
+ return;
+ }
+
+ /* Indicate that watchpoints are handled. */
+ info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
+ info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
+
+ if (flags0 & TLB_WATCHPOINT) {
+ mem_off = info->mem_off_first[0];
+ reg_off = info->reg_off_first[0];
+ reg_last = info->reg_off_last[0];
+
+ while (reg_off <= reg_last) {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+ cpu_check_watchpoint(env_cpu(env), addr + mem_off,
+ msize, info->page[0].attrs,
+ wp_access, retaddr);
+ }
+ reg_off += esize;
+ mem_off += msize;
+ } while (reg_off <= reg_last && (reg_off & 63));
}
- addr += 3 * 8;
}
+
+ mem_off = info->mem_off_split;
+ if (mem_off >= 0) {
+ cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
+ info->page[0].attrs, wp_access, retaddr);
+ }
+
+ mem_off = info->mem_off_first[1];
+ if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
+ reg_off = info->reg_off_first[1];
+ reg_last = info->reg_off_last[1];
+
+ do {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+ cpu_check_watchpoint(env_cpu(env), addr + mem_off,
+ msize, info->page[1].attrs,
+ wp_access, retaddr);
+ }
+ reg_off += esize;
+ mem_off += msize;
+ } while (reg_off & 63);
+ } while (reg_off <= reg_last);
+ }
+#endif
}
-void HELPER(sve_st4dd_r)(CPUARMState *env, void *vg,
- target_ulong addr, uint32_t desc)
+typedef uint64_t mte_check_fn(CPUARMState *, uint32_t, uint64_t, uintptr_t);
+
+static inline QEMU_ALWAYS_INLINE
+void sve_cont_ldst_mte_check_int(SVEContLdSt *info, CPUARMState *env,
+ uint64_t *vg, target_ulong addr, int esize,
+ int msize, uint32_t mtedesc, uintptr_t ra,
+ mte_check_fn *check)
{
- intptr_t i, oprsz = simd_oprsz(desc) / 8;
- intptr_t ra = GETPC();
- unsigned rd = simd_data(desc);
- uint64_t *d1 = &env->vfp.zregs[rd].d[0];
- uint64_t *d2 = &env->vfp.zregs[(rd + 1) & 31].d[0];
- uint64_t *d3 = &env->vfp.zregs[(rd + 2) & 31].d[0];
- uint64_t *d4 = &env->vfp.zregs[(rd + 3) & 31].d[0];
- uint8_t *pg = vg;
+ intptr_t mem_off, reg_off, reg_last;
- for (i = 0; i < oprsz; i += 1) {
- if (pg[H1(i)] & 1) {
- cpu_stq_data_ra(env, addr, d1[i], ra);
- cpu_stq_data_ra(env, addr + 8, d2[i], ra);
- cpu_stq_data_ra(env, addr + 16, d3[i], ra);
- cpu_stq_data_ra(env, addr + 24, d4[i], ra);
+ /* Process the page only if MemAttr == Tagged. */
+ if (arm_tlb_mte_tagged(&info->page[0].attrs)) {
+ mem_off = info->mem_off_first[0];
+ reg_off = info->reg_off_first[0];
+ reg_last = info->reg_off_split;
+ if (reg_last < 0) {
+ reg_last = info->reg_off_last[0];
}
- addr += 4 * 8;
+
+ do {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+                    check(env, mtedesc, addr + mem_off, ra);
+ }
+ reg_off += esize;
+ mem_off += msize;
+ } while (reg_off <= reg_last && (reg_off & 63));
+ } while (reg_off <= reg_last);
+ }
+
+ mem_off = info->mem_off_first[1];
+ if (mem_off >= 0 && arm_tlb_mte_tagged(&info->page[1].attrs)) {
+ reg_off = info->reg_off_first[1];
+ reg_last = info->reg_off_last[1];
+
+ do {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+                    check(env, mtedesc, addr + mem_off, ra);
+ }
+ reg_off += esize;
+ mem_off += msize;
+ } while (reg_off & 63);
+ } while (reg_off <= reg_last);
}
}
-/* Loads with a vector index. */
+typedef void sve_cont_ldst_mte_check_fn(SVEContLdSt *info, CPUARMState *env,
+ uint64_t *vg, target_ulong addr,
+ int esize, int msize, uint32_t mtedesc,
+ uintptr_t ra);
+
+static void sve_cont_ldst_mte_check1(SVEContLdSt *info, CPUARMState *env,
+ uint64_t *vg, target_ulong addr,
+ int esize, int msize, uint32_t mtedesc,
+ uintptr_t ra)
+{
+ sve_cont_ldst_mte_check_int(info, env, vg, addr, esize, msize,
+ mtedesc, ra, mte_check1);
+}
+
+static void sve_cont_ldst_mte_checkN(SVEContLdSt *info, CPUARMState *env,
+ uint64_t *vg, target_ulong addr,
+ int esize, int msize, uint32_t mtedesc,
+ uintptr_t ra)
+{
+ sve_cont_ldst_mte_check_int(info, env, vg, addr, esize, msize,
+ mtedesc, ra, mte_checkN);
+}
+
+
+/*
+ * Common helper for all contiguous 1,2,3,4-register predicated loads.
+ */
+static inline QEMU_ALWAYS_INLINE
+void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
+ uint32_t desc, const uintptr_t retaddr,
+ const int esz, const int msz, const int N, uint32_t mtedesc,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn,
+ sve_cont_ldst_mte_check_fn *mte_check_fn)
+{
+ const unsigned rd = simd_data(desc);
+ const intptr_t reg_max = simd_oprsz(desc);
+ intptr_t reg_off, reg_last, mem_off;
+ SVEContLdSt info;
+ void *host;
+ int flags, i;
+
+ /* Find the active elements. */
+ if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
+ /* The entire predicate was false; no load occurs. */
+ for (i = 0; i < N; ++i) {
+ memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
+ }
+ return;
+ }
+
+ /* Probe the page(s). Exit with exception for any invalid page. */
+ sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
+
+ /* Handle watchpoints for all active elements. */
+ sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
+ BP_MEM_READ, retaddr);
+
+ /*
+ * Handle mte checks for all active elements.
+ * Since TBI must be set for MTE, !mtedesc => !mte_active.
+ */
+ if (mte_check_fn && mtedesc) {
+ mte_check_fn(&info, env, vg, addr, 1 << esz, N << msz,
+ mtedesc, retaddr);
+ }
+
+ flags = info.page[0].flags | info.page[1].flags;
+ if (unlikely(flags != 0)) {
+#ifdef CONFIG_USER_ONLY
+ g_assert_not_reached();
+#else
+ /*
+ * At least one page includes MMIO.
+ * Any bus operation can fail with cpu_transaction_failed,
+ * which for ARM will raise SyncExternal. Perform the load
+ * into scratch memory to preserve register state until the end.
+ */
+ ARMVectorReg scratch[4] = { };
+
+ mem_off = info.mem_off_first[0];
+ reg_off = info.reg_off_first[0];
+ reg_last = info.reg_off_last[1];
+ if (reg_last < 0) {
+ reg_last = info.reg_off_split;
+ if (reg_last < 0) {
+ reg_last = info.reg_off_last[0];
+ }
+ }
-#define DO_LD1_ZPZ_S(NAME, TYPEI, TYPEM, FN) \
-void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
- target_ulong base, uint32_t desc) \
+ do {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+ for (i = 0; i < N; ++i) {
+ tlb_fn(env, &scratch[i], reg_off,
+ addr + mem_off + (i << msz), retaddr);
+ }
+ }
+ reg_off += 1 << esz;
+ mem_off += N << msz;
+ } while (reg_off & 63);
+ } while (reg_off <= reg_last);
+
+ for (i = 0; i < N; ++i) {
+ memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
+ }
+ return;
+#endif
+ }
+
+ /* The entire operation is in RAM, on valid pages. */
+
+ for (i = 0; i < N; ++i) {
+ memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
+ }
+
+ mem_off = info.mem_off_first[0];
+ reg_off = info.reg_off_first[0];
+ reg_last = info.reg_off_last[0];
+ host = info.page[0].host;
+
+ while (reg_off <= reg_last) {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+ for (i = 0; i < N; ++i) {
+ host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
+ host + mem_off + (i << msz));
+ }
+ }
+ reg_off += 1 << esz;
+ mem_off += N << msz;
+ } while (reg_off <= reg_last && (reg_off & 63));
+ }
+
+ /*
+ * Use the slow path to manage the cross-page misalignment.
+ * But we know this is RAM and cannot trap.
+ */
+ mem_off = info.mem_off_split;
+ if (unlikely(mem_off >= 0)) {
+ reg_off = info.reg_off_split;
+ for (i = 0; i < N; ++i) {
+ tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
+ addr + mem_off + (i << msz), retaddr);
+ }
+ }
+
+ mem_off = info.mem_off_first[1];
+ if (unlikely(mem_off >= 0)) {
+ reg_off = info.reg_off_first[1];
+ reg_last = info.reg_off_last[1];
+ host = info.page[1].host;
+
+ do {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+ for (i = 0; i < N; ++i) {
+ host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
+ host + mem_off + (i << msz));
+ }
+ }
+ reg_off += 1 << esz;
+ mem_off += N << msz;
+ } while (reg_off & 63);
+ } while (reg_off <= reg_last);
+ }
+}
+
+static inline QEMU_ALWAYS_INLINE
+void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
+ uint32_t desc, const uintptr_t ra,
+ const int esz, const int msz, const int N,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
+{
+ uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+ int bit55 = extract64(addr, 55, 1);
+
+ /* Remove mtedesc from the normal sve descriptor. */
+ desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+
+ /* Perform gross MTE suppression early. */
+ if (!tbi_check(desc, bit55) ||
+ tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
+ mtedesc = 0;
+ }
+
+ sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn,
+ N == 1 ? sve_cont_ldst_mte_check1 : sve_cont_ldst_mte_checkN);
+}
+
+#define DO_LD1_1(NAME, ESZ) \
+void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
{ \
- intptr_t i, oprsz = simd_oprsz(desc); \
- unsigned scale = simd_data(desc); \
- uintptr_t ra = GETPC(); \
- for (i = 0; i < oprsz; i++) { \
- uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
- do { \
- TYPEM m = 0; \
- if (pg & 1) { \
- target_ulong off = *(TYPEI *)(vm + H1_4(i)); \
- m = FN(env, base + (off << scale), ra); \
- } \
- *(uint32_t *)(vd + H1_4(i)) = m; \
- i += 4, pg >>= 4; \
- } while (i & 15); \
- } \
+ sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \
+ sve_##NAME##_host, sve_##NAME##_tlb, NULL); \
+} \
+void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \
+ sve_##NAME##_host, sve_##NAME##_tlb); \
}
-#define DO_LD1_ZPZ_D(NAME, TYPEI, TYPEM, FN) \
-void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
- target_ulong base, uint32_t desc) \
+#define DO_LD1_2(NAME, ESZ, MSZ) \
+void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
{ \
- intptr_t i, oprsz = simd_oprsz(desc) / 8; \
- unsigned scale = simd_data(desc); \
- uintptr_t ra = GETPC(); \
- uint64_t *d = vd, *m = vm; uint8_t *pg = vg; \
- for (i = 0; i < oprsz; i++) { \
- TYPEM mm = 0; \
- if (pg[H1(i)] & 1) { \
- target_ulong off = (TYPEI)m[i]; \
- mm = FN(env, base + (off << scale), ra); \
- } \
- d[i] = mm; \
- } \
+ sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
+ sve_##NAME##_le_host, sve_##NAME##_le_tlb, NULL); \
+} \
+void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
+ sve_##NAME##_be_host, sve_##NAME##_be_tlb, NULL); \
+} \
+void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
+ sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
+} \
+void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
+ sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
}
-DO_LD1_ZPZ_S(sve_ldbsu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
-DO_LD1_ZPZ_S(sve_ldhsu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
-DO_LD1_ZPZ_S(sve_ldssu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
-DO_LD1_ZPZ_S(sve_ldbss_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
-DO_LD1_ZPZ_S(sve_ldhss_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
-
-DO_LD1_ZPZ_S(sve_ldbsu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
-DO_LD1_ZPZ_S(sve_ldhsu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
-DO_LD1_ZPZ_S(sve_ldssu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
-DO_LD1_ZPZ_S(sve_ldbss_zss, int32_t, int8_t, cpu_ldub_data_ra)
-DO_LD1_ZPZ_S(sve_ldhss_zss, int32_t, int16_t, cpu_lduw_data_ra)
-
-DO_LD1_ZPZ_D(sve_ldbdu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
-DO_LD1_ZPZ_D(sve_ldhdu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
-DO_LD1_ZPZ_D(sve_ldsdu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
-DO_LD1_ZPZ_D(sve_ldddu_zsu, uint32_t, uint64_t, cpu_ldq_data_ra)
-DO_LD1_ZPZ_D(sve_ldbds_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
-DO_LD1_ZPZ_D(sve_ldhds_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
-DO_LD1_ZPZ_D(sve_ldsds_zsu, uint32_t, int32_t, cpu_ldl_data_ra)
-
-DO_LD1_ZPZ_D(sve_ldbdu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
-DO_LD1_ZPZ_D(sve_ldhdu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
-DO_LD1_ZPZ_D(sve_ldsdu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
-DO_LD1_ZPZ_D(sve_ldddu_zss, int32_t, uint64_t, cpu_ldq_data_ra)
-DO_LD1_ZPZ_D(sve_ldbds_zss, int32_t, int8_t, cpu_ldub_data_ra)
-DO_LD1_ZPZ_D(sve_ldhds_zss, int32_t, int16_t, cpu_lduw_data_ra)
-DO_LD1_ZPZ_D(sve_ldsds_zss, int32_t, int32_t, cpu_ldl_data_ra)
-
-DO_LD1_ZPZ_D(sve_ldbdu_zd, uint64_t, uint8_t, cpu_ldub_data_ra)
-DO_LD1_ZPZ_D(sve_ldhdu_zd, uint64_t, uint16_t, cpu_lduw_data_ra)
-DO_LD1_ZPZ_D(sve_ldsdu_zd, uint64_t, uint32_t, cpu_ldl_data_ra)
-DO_LD1_ZPZ_D(sve_ldddu_zd, uint64_t, uint64_t, cpu_ldq_data_ra)
-DO_LD1_ZPZ_D(sve_ldbds_zd, uint64_t, int8_t, cpu_ldub_data_ra)
-DO_LD1_ZPZ_D(sve_ldhds_zd, uint64_t, int16_t, cpu_lduw_data_ra)
-DO_LD1_ZPZ_D(sve_ldsds_zd, uint64_t, int32_t, cpu_ldl_data_ra)
+DO_LD1_1(ld1bb, MO_8)
+DO_LD1_1(ld1bhu, MO_16)
+DO_LD1_1(ld1bhs, MO_16)
+DO_LD1_1(ld1bsu, MO_32)
+DO_LD1_1(ld1bss, MO_32)
+DO_LD1_1(ld1bdu, MO_64)
+DO_LD1_1(ld1bds, MO_64)
-/* First fault loads with a vector index. */
+DO_LD1_2(ld1hh, MO_16, MO_16)
+DO_LD1_2(ld1hsu, MO_32, MO_16)
+DO_LD1_2(ld1hss, MO_32, MO_16)
+DO_LD1_2(ld1hdu, MO_64, MO_16)
+DO_LD1_2(ld1hds, MO_64, MO_16)
-#ifdef CONFIG_USER_ONLY
+DO_LD1_2(ld1ss, MO_32, MO_32)
+DO_LD1_2(ld1sdu, MO_64, MO_32)
+DO_LD1_2(ld1sds, MO_64, MO_32)
-#define DO_LDFF1_ZPZ(NAME, TYPEE, TYPEI, TYPEM, FN, H) \
-void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
- target_ulong base, uint32_t desc) \
+DO_LD1_2(ld1dd, MO_64, MO_64)
+
+#undef DO_LD1_1
+#undef DO_LD1_2
+
+#define DO_LDN_1(N) \
+void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
{ \
- intptr_t i, oprsz = simd_oprsz(desc); \
- unsigned scale = simd_data(desc); \
- uintptr_t ra = GETPC(); \
- bool first = true; \
- mmap_lock(); \
- for (i = 0; i < oprsz; i++) { \
- uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
- do { \
- TYPEM m = 0; \
- if (pg & 1) { \
- target_ulong off = *(TYPEI *)(vm + H(i)); \
- target_ulong addr = base + (off << scale); \
- if (!first && \
- page_check_range(addr, sizeof(TYPEM), PAGE_READ)) { \
- record_fault(env, i, oprsz); \
- goto exit; \
- } \
- m = FN(env, addr, ra); \
- first = false; \
- } \
- *(TYPEE *)(vd + H(i)) = m; \
- i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
- } while (i & 15); \
- } \
- exit: \
- mmap_unlock(); \
+ sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \
+ sve_ld1bb_host, sve_ld1bb_tlb, NULL); \
+} \
+void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \
+ sve_ld1bb_host, sve_ld1bb_tlb); \
}
-#else
+#define DO_LDN_2(N, SUFF, ESZ) \
+void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
+ sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb, NULL); \
+} \
+void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
+ sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb, NULL); \
+} \
+void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
+ sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
+} \
+void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
+ sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
+}
+
+DO_LDN_1(2)
+DO_LDN_1(3)
+DO_LDN_1(4)
+
+DO_LDN_2(2, hh, MO_16)
+DO_LDN_2(3, hh, MO_16)
+DO_LDN_2(4, hh, MO_16)
+
+DO_LDN_2(2, ss, MO_32)
+DO_LDN_2(3, ss, MO_32)
+DO_LDN_2(4, ss, MO_32)
+
+DO_LDN_2(2, dd, MO_64)
+DO_LDN_2(3, dd, MO_64)
+DO_LDN_2(4, dd, MO_64)
+
+#undef DO_LDN_1
+#undef DO_LDN_2
+
+/*
+ * Load contiguous data, first-fault and no-fault.
+ *
+ * For user-only, one could argue that we should hold the mmap_lock during
+ * the operation so that there is no race between page_check_range and the
+ * load operation. However, unmapping pages out from under a running thread
+ * is extraordinarily unlikely. This theoretical race condition also affects
+ * linux-user/ in its get_user/put_user macros.
+ *
+ * TODO: Construct some helpers, written in assembly, that interact with
+ * handle_cpu_signal to produce memory ops which can properly report errors
+ * without racing.
+ */
+
+/* Fault on byte I. All bits in FFR from I are cleared. The vector
+ * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
+ * option, which leaves subsequent data unchanged.
+ */
+static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
+{
+ uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
+
+ if (i & 63) {
+ ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
+ i = ROUND_UP(i, 64);
+ }
+ for (; i < oprsz; i += 64) {
+ ffr[i / 64] = 0;
+ }
+}
+
+/*
+ * Common helper for all contiguous no-fault and first-fault loads.
+ */
+static inline QEMU_ALWAYS_INLINE
+void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
+ uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
+ const int esz, const int msz, const SVEContFault fault,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
+{
+ const unsigned rd = simd_data(desc);
+ void *vd = &env->vfp.zregs[rd];
+ const intptr_t reg_max = simd_oprsz(desc);
+ intptr_t reg_off, mem_off, reg_last;
+ SVEContLdSt info;
+ int flags;
+ void *host;
+
+ /* Find the active elements. */
+ if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
+ /* The entire predicate was false; no load occurs. */
+ memset(vd, 0, reg_max);
+ return;
+ }
+ reg_off = info.reg_off_first[0];
+
+ /* Probe the page(s). */
+ if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
+ /* Fault on first element. */
+ tcg_debug_assert(fault == FAULT_NO);
+ memset(vd, 0, reg_max);
+ goto do_fault;
+ }
-#define DO_LDFF1_ZPZ(NAME, TYPEE, TYPEI, TYPEM, FN, H) \
-void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
- target_ulong base, uint32_t desc) \
+ mem_off = info.mem_off_first[0];
+ flags = info.page[0].flags;
+
+    /*
+     * Disable MTE checking if the Tagged bit is not set. Since TBI must
+     * be set within MTEDESC for MTE, !mtedesc => !mte_active.
+     */
+    if (!arm_tlb_mte_tagged(&info.page[0].attrs)) {
+        mtedesc = 0;
+    }
+
+ if (fault == FAULT_FIRST) {
+ /* Trapping mte check for the first-fault element. */
+ if (mtedesc) {
+ mte_check1(env, mtedesc, addr + mem_off, retaddr);
+ }
+
+ /*
+ * Special handling of the first active element,
+ * if it crosses a page boundary or is MMIO.
+ */
+ bool is_split = mem_off == info.mem_off_split;
+ if (unlikely(flags != 0) || unlikely(is_split)) {
+ /*
+ * Use the slow path for cross-page handling.
+ * Might trap for MMIO or watchpoints.
+ */
+ tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
+
+ /* After any fault, zero the other elements. */
+ swap_memzero(vd, reg_off);
+ reg_off += 1 << esz;
+ mem_off += 1 << msz;
+ swap_memzero(vd + reg_off, reg_max - reg_off);
+
+ if (is_split) {
+ goto second_page;
+ }
+ } else {
+ memset(vd, 0, reg_max);
+ }
+ } else {
+ memset(vd, 0, reg_max);
+ if (unlikely(mem_off == info.mem_off_split)) {
+ /* The first active element crosses a page boundary. */
+ flags |= info.page[1].flags;
+ if (unlikely(flags & TLB_MMIO)) {
+ /* Some page is MMIO, see below. */
+ goto do_fault;
+ }
+ if (unlikely(flags & TLB_WATCHPOINT) &&
+ (cpu_watchpoint_address_matches
+ (env_cpu(env), addr + mem_off, 1 << msz)
+ & BP_MEM_READ)) {
+ /* Watchpoint hit, see below. */
+ goto do_fault;
+ }
+ if (mtedesc && !mte_probe1(env, mtedesc, addr + mem_off)) {
+ goto do_fault;
+ }
+ /*
+ * Use the slow path for cross-page handling.
+ * This is RAM, without a watchpoint, and will not trap.
+ */
+ tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
+ goto second_page;
+ }
+ }
+
+ /*
+ * From this point on, all memory operations are MemSingleNF.
+ *
+ * Per the MemSingleNF pseudocode, a no-fault load from Device memory
+ * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
+ *
+     * Unfortunately we do not have access to the memory attributes from the
+ * PTE to tell Device memory from Normal memory. So we make a mostly
+ * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
+ * This gives the right answer for the common cases of "Normal memory,
+ * backed by host RAM" and "Device memory, backed by MMIO".
+ * The architecture allows us to suppress an NF load and return
+ * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
+ * case of "Normal memory, backed by MMIO" is permitted. The case we
+ * get wrong is "Device memory, backed by host RAM", for which we
+     * should return (UNKNOWN, FAULT) but do not.
+ *
+ * Similarly, CPU_BP breakpoints would raise exceptions, and so
+ * return (UNKNOWN, FAULT). For simplicity, we consider gdb and
+ * architectural breakpoints the same.
+ */
+ if (unlikely(flags & TLB_MMIO)) {
+ goto do_fault;
+ }
+
+ reg_last = info.reg_off_last[0];
+ host = info.page[0].host;
+
+ do {
+ uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+ if (unlikely(flags & TLB_WATCHPOINT) &&
+ (cpu_watchpoint_address_matches
+ (env_cpu(env), addr + mem_off, 1 << msz)
+ & BP_MEM_READ)) {
+ goto do_fault;
+ }
+ if (mtedesc && !mte_probe1(env, mtedesc, addr + mem_off)) {
+ goto do_fault;
+ }
+ host_fn(vd, reg_off, host + mem_off);
+ }
+ reg_off += 1 << esz;
+ mem_off += 1 << msz;
+ } while (reg_off <= reg_last && (reg_off & 63));
+ } while (reg_off <= reg_last);
+
+ /*
+ * MemSingleNF is allowed to fail for any reason. We have special
+ * code above to handle the first element crossing a page boundary.
+ * As an implementation choice, decline to handle a cross-page element
+ * in any other position.
+ */
+ reg_off = info.reg_off_split;
+ if (reg_off >= 0) {
+ goto do_fault;
+ }
+
+ second_page:
+ reg_off = info.reg_off_first[1];
+ if (likely(reg_off < 0)) {
+ /* No active elements on the second page. All done. */
+ return;
+ }
+
+ /*
+ * MemSingleNF is allowed to fail for any reason. As an implementation
+ * choice, decline to handle elements on the second page. This should
+ * be low frequency as the guest walks through memory -- the next
+ * iteration of the guest's loop should be aligned on the page boundary,
+ * and then all following iterations will stay aligned.
+ */
+
+ do_fault:
+ record_fault(env, reg_off, reg_max);
+}
+
+static inline QEMU_ALWAYS_INLINE
+void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
+ uint32_t desc, const uintptr_t retaddr,
+ const int esz, const int msz, const SVEContFault fault,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
+{
+ uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+ int bit55 = extract64(addr, 55, 1);
+
+ /* Remove mtedesc from the normal sve descriptor. */
+ desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+
+ /* Perform gross MTE suppression early. */
+ if (!tbi_check(desc, bit55) ||
+ tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
+ mtedesc = 0;
+ }
+
+ sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
+ esz, msz, fault, host_fn, tlb_fn);
+}
+
+#define DO_LDFF1_LDNF1_1(PART, ESZ) \
+void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
+ sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
+} \
+void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
+ sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
+} \
+void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
{ \
- g_assert_not_reached(); \
+ sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
+ sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
+} \
+void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
+ sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
}
+#define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
+void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
+ sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
+} \
+void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
+ sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
+} \
+void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
+ sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
+} \
+void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
+ sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
+} \
+void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
+ sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
+} \
+void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
+ sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
+} \
+void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
+ sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
+} \
+void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
+ sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
+}
+
+DO_LDFF1_LDNF1_1(bb, MO_8)
+DO_LDFF1_LDNF1_1(bhu, MO_16)
+DO_LDFF1_LDNF1_1(bhs, MO_16)
+DO_LDFF1_LDNF1_1(bsu, MO_32)
+DO_LDFF1_LDNF1_1(bss, MO_32)
+DO_LDFF1_LDNF1_1(bdu, MO_64)
+DO_LDFF1_LDNF1_1(bds, MO_64)
+
+DO_LDFF1_LDNF1_2(hh, MO_16, MO_16)
+DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
+DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
+DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
+DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
+
+DO_LDFF1_LDNF1_2(ss, MO_32, MO_32)
+DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
+DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
+
+DO_LDFF1_LDNF1_2(dd, MO_64, MO_64)
+
+#undef DO_LDFF1_LDNF1_1
+#undef DO_LDFF1_LDNF1_2
+
+/*
+ * Common helper for all contiguous 1,2,3,4-register predicated stores.
+ */
+
+static inline QEMU_ALWAYS_INLINE
+void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
+ uint32_t desc, const uintptr_t retaddr,
+ const int esz, const int msz, const int N, uint32_t mtedesc,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn,
+ sve_cont_ldst_mte_check_fn *mte_check_fn)
+{
+ const unsigned rd = simd_data(desc);
+ const intptr_t reg_max = simd_oprsz(desc);
+ intptr_t reg_off, reg_last, mem_off;
+ SVEContLdSt info;
+ void *host;
+ int i, flags;
+
+ /* Find the active elements. */
+ if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
+ /* The entire predicate was false; no store occurs. */
+ return;
+ }
+
+ /* Probe the page(s). Exit with exception for any invalid page. */
+ sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
+
+ /* Handle watchpoints for all active elements. */
+ sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
+ BP_MEM_WRITE, retaddr);
+
+ /*
+ * Handle mte checks for all active elements.
+ * Since TBI must be set for MTE, !mtedesc => !mte_active.
+ */
+ if (mte_check_fn && mtedesc) {
+ mte_check_fn(&info, env, vg, addr, 1 << esz, N << msz,
+ mtedesc, retaddr);
+ }
+
+ flags = info.page[0].flags | info.page[1].flags;
+ if (unlikely(flags != 0)) {
+#ifdef CONFIG_USER_ONLY
+ g_assert_not_reached();
+#else
+ /*
+ * At least one page includes MMIO.
+ * Any bus operation can fail with cpu_transaction_failed,
+ * which for ARM will raise SyncExternal. We cannot avoid
+ * this fault and will leave with the store incomplete.
+ */
+ mem_off = info.mem_off_first[0];
+ reg_off = info.reg_off_first[0];
+ reg_last = info.reg_off_last[1];
+ if (reg_last < 0) {
+ reg_last = info.reg_off_split;
+ if (reg_last < 0) {
+ reg_last = info.reg_off_last[0];
+ }
+ }
+
+ do {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+ for (i = 0; i < N; ++i) {
+ tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
+ addr + mem_off + (i << msz), retaddr);
+ }
+ }
+ reg_off += 1 << esz;
+ mem_off += N << msz;
+ } while (reg_off & 63);
+ } while (reg_off <= reg_last);
+ return;
#endif
+ }
-#define DO_LDFF1_ZPZ_S(NAME, TYPEI, TYPEM, FN) \
- DO_LDFF1_ZPZ(NAME, uint32_t, TYPEI, TYPEM, FN, H1_4)
-#define DO_LDFF1_ZPZ_D(NAME, TYPEI, TYPEM, FN) \
- DO_LDFF1_ZPZ(NAME, uint64_t, TYPEI, TYPEM, FN, )
-
-DO_LDFF1_ZPZ_S(sve_ldffbsu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
-DO_LDFF1_ZPZ_S(sve_ldffhsu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
-DO_LDFF1_ZPZ_S(sve_ldffssu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
-DO_LDFF1_ZPZ_S(sve_ldffbss_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
-DO_LDFF1_ZPZ_S(sve_ldffhss_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
-
-DO_LDFF1_ZPZ_S(sve_ldffbsu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
-DO_LDFF1_ZPZ_S(sve_ldffhsu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
-DO_LDFF1_ZPZ_S(sve_ldffssu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
-DO_LDFF1_ZPZ_S(sve_ldffbss_zss, int32_t, int8_t, cpu_ldub_data_ra)
-DO_LDFF1_ZPZ_S(sve_ldffhss_zss, int32_t, int16_t, cpu_lduw_data_ra)
-
-DO_LDFF1_ZPZ_D(sve_ldffbdu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
-DO_LDFF1_ZPZ_D(sve_ldffhdu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
-DO_LDFF1_ZPZ_D(sve_ldffsdu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
-DO_LDFF1_ZPZ_D(sve_ldffddu_zsu, uint32_t, uint64_t, cpu_ldq_data_ra)
-DO_LDFF1_ZPZ_D(sve_ldffbds_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
-DO_LDFF1_ZPZ_D(sve_ldffhds_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
-DO_LDFF1_ZPZ_D(sve_ldffsds_zsu, uint32_t, int32_t, cpu_ldl_data_ra)
-
-DO_LDFF1_ZPZ_D(sve_ldffbdu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
-DO_LDFF1_ZPZ_D(sve_ldffhdu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
-DO_LDFF1_ZPZ_D(sve_ldffsdu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
-DO_LDFF1_ZPZ_D(sve_ldffddu_zss, int32_t, uint64_t, cpu_ldq_data_ra)
-DO_LDFF1_ZPZ_D(sve_ldffbds_zss, int32_t, int8_t, cpu_ldub_data_ra)
-DO_LDFF1_ZPZ_D(sve_ldffhds_zss, int32_t, int16_t, cpu_lduw_data_ra)
-DO_LDFF1_ZPZ_D(sve_ldffsds_zss, int32_t, int32_t, cpu_ldl_data_ra)
-
-DO_LDFF1_ZPZ_D(sve_ldffbdu_zd, uint64_t, uint8_t, cpu_ldub_data_ra)
-DO_LDFF1_ZPZ_D(sve_ldffhdu_zd, uint64_t, uint16_t, cpu_lduw_data_ra)
-DO_LDFF1_ZPZ_D(sve_ldffsdu_zd, uint64_t, uint32_t, cpu_ldl_data_ra)
-DO_LDFF1_ZPZ_D(sve_ldffddu_zd, uint64_t, uint64_t, cpu_ldq_data_ra)
-DO_LDFF1_ZPZ_D(sve_ldffbds_zd, uint64_t, int8_t, cpu_ldub_data_ra)
-DO_LDFF1_ZPZ_D(sve_ldffhds_zd, uint64_t, int16_t, cpu_lduw_data_ra)
-DO_LDFF1_ZPZ_D(sve_ldffsds_zd, uint64_t, int32_t, cpu_ldl_data_ra)
+ mem_off = info.mem_off_first[0];
+ reg_off = info.reg_off_first[0];
+ reg_last = info.reg_off_last[0];
+ host = info.page[0].host;
-/* Stores with a vector index. */
+ while (reg_off <= reg_last) {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+ for (i = 0; i < N; ++i) {
+ host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
+ host + mem_off + (i << msz));
+ }
+ }
+ reg_off += 1 << esz;
+ mem_off += N << msz;
+ } while (reg_off <= reg_last && (reg_off & 63));
+ }
-#define DO_ST1_ZPZ_S(NAME, TYPEI, FN) \
-void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
- target_ulong base, uint32_t desc) \
+ /*
+ * Use the slow path to manage the cross-page misalignment.
+ * But we know this is RAM and cannot trap.
+ */
+ mem_off = info.mem_off_split;
+ if (unlikely(mem_off >= 0)) {
+ reg_off = info.reg_off_split;
+ for (i = 0; i < N; ++i) {
+ tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
+ addr + mem_off + (i << msz), retaddr);
+ }
+ }
+
+ mem_off = info.mem_off_first[1];
+ if (unlikely(mem_off >= 0)) {
+ reg_off = info.reg_off_first[1];
+ reg_last = info.reg_off_last[1];
+ host = info.page[1].host;
+
+ do {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+ for (i = 0; i < N; ++i) {
+ host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
+ host + mem_off + (i << msz));
+ }
+ }
+ reg_off += 1 << esz;
+ mem_off += N << msz;
+ } while (reg_off & 63);
+ } while (reg_off <= reg_last);
+ }
+}
+
+static inline QEMU_ALWAYS_INLINE
+void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
+ uint32_t desc, const uintptr_t ra,
+ const int esz, const int msz, const int N,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
+{
+ uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+ int bit55 = extract64(addr, 55, 1);
+
+ /* Remove mtedesc from the normal sve descriptor. */
+ desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+
+ /* Perform gross MTE suppression early. */
+ if (!tbi_check(desc, bit55) ||
+ tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
+ mtedesc = 0;
+ }
+
+ sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn,
+ N == 1 ? sve_cont_ldst_mte_check1 : sve_cont_ldst_mte_checkN);
+}
+
+#define DO_STN_1(N, NAME, ESZ) \
+void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
{ \
- intptr_t i, oprsz = simd_oprsz(desc); \
- unsigned scale = simd_data(desc); \
- uintptr_t ra = GETPC(); \
- for (i = 0; i < oprsz; ) { \
- uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
- do { \
- if (likely(pg & 1)) { \
- target_ulong off = *(TYPEI *)(vm + H1_4(i)); \
- uint32_t d = *(uint32_t *)(vd + H1_4(i)); \
- FN(env, base + (off << scale), d, ra); \
- } \
- i += sizeof(uint32_t), pg >>= sizeof(uint32_t); \
- } while (i & 15); \
- } \
+ sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \
+ sve_st1##NAME##_host, sve_st1##NAME##_tlb, NULL); \
+} \
+void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \
+ sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
}
-#define DO_ST1_ZPZ_D(NAME, TYPEI, FN) \
-void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
- target_ulong base, uint32_t desc) \
+#define DO_STN_2(N, NAME, ESZ, MSZ) \
+void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
{ \
- intptr_t i, oprsz = simd_oprsz(desc) / 8; \
- unsigned scale = simd_data(desc); \
- uintptr_t ra = GETPC(); \
- uint64_t *d = vd, *m = vm; uint8_t *pg = vg; \
- for (i = 0; i < oprsz; i++) { \
- if (likely(pg[H1(i)] & 1)) { \
- target_ulong off = (target_ulong)(TYPEI)m[i] << scale; \
- FN(env, base + off, d[i], ra); \
- } \
- } \
+ sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
+ sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb, NULL); \
+} \
+void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
+ sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb, NULL); \
+} \
+void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
+ sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
+} \
+void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
+ sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
+}
+
+DO_STN_1(1, bb, MO_8)
+DO_STN_1(1, bh, MO_16)
+DO_STN_1(1, bs, MO_32)
+DO_STN_1(1, bd, MO_64)
+DO_STN_1(2, bb, MO_8)
+DO_STN_1(3, bb, MO_8)
+DO_STN_1(4, bb, MO_8)
+
+DO_STN_2(1, hh, MO_16, MO_16)
+DO_STN_2(1, hs, MO_32, MO_16)
+DO_STN_2(1, hd, MO_64, MO_16)
+DO_STN_2(2, hh, MO_16, MO_16)
+DO_STN_2(3, hh, MO_16, MO_16)
+DO_STN_2(4, hh, MO_16, MO_16)
+
+DO_STN_2(1, ss, MO_32, MO_32)
+DO_STN_2(1, sd, MO_64, MO_32)
+DO_STN_2(2, ss, MO_32, MO_32)
+DO_STN_2(3, ss, MO_32, MO_32)
+DO_STN_2(4, ss, MO_32, MO_32)
+
+DO_STN_2(1, dd, MO_64, MO_64)
+DO_STN_2(2, dd, MO_64, MO_64)
+DO_STN_2(3, dd, MO_64, MO_64)
+DO_STN_2(4, dd, MO_64, MO_64)
+
+#undef DO_STN_1
+#undef DO_STN_2
+
+/*
+ * Loads with a vector index.
+ */
+
+/*
+ * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
+ */
+typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
+
+static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
+{
+ return *(uint32_t *)(reg + H1_4(reg_ofs));
+}
+
+static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
+{
+ return *(int32_t *)(reg + H1_4(reg_ofs));
+}
+
+static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
+{
+ return (uint32_t)*(uint64_t *)(reg + reg_ofs);
+}
+
+static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
+{
+ return (int32_t)*(uint64_t *)(reg + reg_ofs);
+}
+
+static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
+{
+ return *(uint64_t *)(reg + reg_ofs);
+}
+
+static inline QEMU_ALWAYS_INLINE
+void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
+ target_ulong base, uint32_t desc, uintptr_t retaddr,
+ uint32_t mtedesc, int esize, int msize,
+ zreg_off_fn *off_fn,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
+{
+ const int mmu_idx = cpu_mmu_index(env, false);
+ const intptr_t reg_max = simd_oprsz(desc);
+ const int scale = simd_data(desc);
+ ARMVectorReg scratch;
+ intptr_t reg_off;
+ SVEHostPage info, info2;
+
+ memset(&scratch, 0, reg_max);
+ reg_off = 0;
+ do {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if (likely(pg & 1)) {
+ target_ulong addr = base + (off_fn(vm, reg_off) << scale);
+ target_ulong in_page = -(addr | TARGET_PAGE_MASK);
+
+ sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
+ mmu_idx, retaddr);
+
+ if (likely(in_page >= msize)) {
+ if (unlikely(info.flags & TLB_WATCHPOINT)) {
+ cpu_check_watchpoint(env_cpu(env), addr, msize,
+ info.attrs, BP_MEM_READ, retaddr);
+ }
+ if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
+ mte_check1(env, mtedesc, addr, retaddr);
+ }
+ host_fn(&scratch, reg_off, info.host);
+ } else {
+ /* Element crosses the page boundary. */
+ sve_probe_page(&info2, false, env, addr + in_page, 0,
+ MMU_DATA_LOAD, mmu_idx, retaddr);
+ if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
+ cpu_check_watchpoint(env_cpu(env), addr,
+ msize, info.attrs,
+ BP_MEM_READ, retaddr);
+ }
+ if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
+ mte_check1(env, mtedesc, addr, retaddr);
+ }
+ tlb_fn(env, &scratch, reg_off, addr, retaddr);
+ }
+ }
+ reg_off += esize;
+ pg >>= esize;
+ } while (reg_off & 63);
+ } while (reg_off < reg_max);
+
+ /* Wait until all exceptions have been raised to write back. */
+ memcpy(vd, &scratch, reg_max);
+}
+
+static inline QEMU_ALWAYS_INLINE
+void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
+ target_ulong base, uint32_t desc, uintptr_t retaddr,
+ int esize, int msize, zreg_off_fn *off_fn,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
+{
+ uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+ /* Remove mtedesc from the normal sve descriptor. */
+ desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+
+ /*
+ * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
+ * offset base entirely over the address space hole to change the
+ * pointer tag, or change the bit55 selector. So we could here
+ * examine TBI + TCMA like we do for sve_ldN_r_mte().
+ */
+ sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
+ esize, msize, off_fn, host_fn, tlb_fn);
+}
+
+#define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
+void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
+ void *vm, target_ulong base, uint32_t desc) \
+{ \
+ sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
+ off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
+} \
+void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
+ void *vm, target_ulong base, uint32_t desc) \
+{ \
+ sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
+ off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
+}
+
+#define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
+void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
+ void *vm, target_ulong base, uint32_t desc) \
+{ \
+ sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
+ off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
+} \
+void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
+ void *vm, target_ulong base, uint32_t desc) \
+{ \
+ sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
+ off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
+}
+
+DO_LD1_ZPZ_S(bsu, zsu, MO_8)
+DO_LD1_ZPZ_S(bsu, zss, MO_8)
+DO_LD1_ZPZ_D(bdu, zsu, MO_8)
+DO_LD1_ZPZ_D(bdu, zss, MO_8)
+DO_LD1_ZPZ_D(bdu, zd, MO_8)
+
+DO_LD1_ZPZ_S(bss, zsu, MO_8)
+DO_LD1_ZPZ_S(bss, zss, MO_8)
+DO_LD1_ZPZ_D(bds, zsu, MO_8)
+DO_LD1_ZPZ_D(bds, zss, MO_8)
+DO_LD1_ZPZ_D(bds, zd, MO_8)
+
+DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
+DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
+DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
+DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
+DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
+
+DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
+DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
+DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
+DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
+DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
+
+DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
+DO_LD1_ZPZ_S(hss_le, zss, MO_16)
+DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
+DO_LD1_ZPZ_D(hds_le, zss, MO_16)
+DO_LD1_ZPZ_D(hds_le, zd, MO_16)
+
+DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
+DO_LD1_ZPZ_S(hss_be, zss, MO_16)
+DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
+DO_LD1_ZPZ_D(hds_be, zss, MO_16)
+DO_LD1_ZPZ_D(hds_be, zd, MO_16)
+
+DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
+DO_LD1_ZPZ_S(ss_le, zss, MO_32)
+DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
+DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
+DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
+
+DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
+DO_LD1_ZPZ_S(ss_be, zss, MO_32)
+DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
+DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
+DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
+
+DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
+DO_LD1_ZPZ_D(sds_le, zss, MO_32)
+DO_LD1_ZPZ_D(sds_le, zd, MO_32)
+
+DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
+DO_LD1_ZPZ_D(sds_be, zss, MO_32)
+DO_LD1_ZPZ_D(sds_be, zd, MO_32)
+
+DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
+DO_LD1_ZPZ_D(dd_le, zss, MO_64)
+DO_LD1_ZPZ_D(dd_le, zd, MO_64)
+
+DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
+DO_LD1_ZPZ_D(dd_be, zss, MO_64)
+DO_LD1_ZPZ_D(dd_be, zd, MO_64)
+
+#undef DO_LD1_ZPZ_S
+#undef DO_LD1_ZPZ_D
+
+/* First fault loads with a vector index. */
+
+/*
+ * Common helpers for all gather first-faulting loads.
+ */
+
+static inline QEMU_ALWAYS_INLINE
+void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
+ target_ulong base, uint32_t desc, uintptr_t retaddr,
+ uint32_t mtedesc, const int esz, const int msz,
+ zreg_off_fn *off_fn,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
+{
+ const int mmu_idx = cpu_mmu_index(env, false);
+ const intptr_t reg_max = simd_oprsz(desc);
+ const int scale = simd_data(desc);
+ const int esize = 1 << esz;
+ const int msize = 1 << msz;
+ intptr_t reg_off;
+ SVEHostPage info;
+ target_ulong addr, in_page;
+
+ /* Skip to the first true predicate. */
+ reg_off = find_next_active(vg, 0, reg_max, esz);
+ if (unlikely(reg_off >= reg_max)) {
+ /* The entire predicate was false; no load occurs. */
+ memset(vd, 0, reg_max);
+ return;
+ }
+
+ /*
+ * Probe the first element, allowing faults.
+ */
+ addr = base + (off_fn(vm, reg_off) << scale);
+ if (mtedesc) {
+ mte_check1(env, mtedesc, addr, retaddr);
+ }
+ tlb_fn(env, vd, reg_off, addr, retaddr);
+
+ /* After any fault, zero the other elements. */
+ swap_memzero(vd, reg_off);
+ reg_off += esize;
+ swap_memzero(vd + reg_off, reg_max - reg_off);
+
+ /*
+ * Probe the remaining elements, not allowing faults.
+ */
+ while (reg_off < reg_max) {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if (likely((pg >> (reg_off & 63)) & 1)) {
+ addr = base + (off_fn(vm, reg_off) << scale);
+ in_page = -(addr | TARGET_PAGE_MASK);
+
+ if (unlikely(in_page < msize)) {
+ /* Stop if the element crosses a page boundary. */
+ goto fault;
+ }
+
+ sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
+ mmu_idx, retaddr);
+ if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
+ goto fault;
+ }
+ if (unlikely(info.flags & TLB_WATCHPOINT) &&
+ (cpu_watchpoint_address_matches
+ (env_cpu(env), addr, msize) & BP_MEM_READ)) {
+ goto fault;
+ }
+ if (mtedesc &&
+ arm_tlb_mte_tagged(&info.attrs) &&
+ !mte_probe1(env, mtedesc, addr)) {
+ goto fault;
+ }
+
+ host_fn(vd, reg_off, info.host);
+ }
+ reg_off += esize;
+ } while (reg_off & 63);
+ }
+ return;
+
+ fault:
+ record_fault(env, reg_off, reg_max);
}
-DO_ST1_ZPZ_S(sve_stbs_zsu, uint32_t, cpu_stb_data_ra)
-DO_ST1_ZPZ_S(sve_sths_zsu, uint32_t, cpu_stw_data_ra)
-DO_ST1_ZPZ_S(sve_stss_zsu, uint32_t, cpu_stl_data_ra)
+/*
+ * MTE wrapper for first-fault gather loads: split the MTE descriptor
+ * out of the combined DESC word, then dispatch to sve_ldff1_z with
+ * the remaining normal sve descriptor.
+ */
+static inline QEMU_ALWAYS_INLINE
+void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
+ target_ulong base, uint32_t desc, uintptr_t retaddr,
+ const int esz, const int msz,
+ zreg_off_fn *off_fn,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
+{
+ /* The MTE descriptor occupies the bits above SVE_MTEDESC_SHIFT. */
+ uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+ /* Remove mtedesc from the normal sve descriptor. */
+ desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
-DO_ST1_ZPZ_S(sve_stbs_zss, int32_t, cpu_stb_data_ra)
-DO_ST1_ZPZ_S(sve_sths_zss, int32_t, cpu_stw_data_ra)
-DO_ST1_ZPZ_S(sve_stss_zss, int32_t, cpu_stl_data_ra)
+ /*
+ * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
+ * offset base entirely over the address space hole to change the
+ * pointer tag, or change the bit55 selector. So we could here
+ * examine TBI + TCMA like we do for sve_ldN_r_mte().
+ */
+ sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
+ esz, msz, off_fn, host_fn, tlb_fn);
+}
-DO_ST1_ZPZ_D(sve_stbd_zsu, uint32_t, cpu_stb_data_ra)
-DO_ST1_ZPZ_D(sve_sthd_zsu, uint32_t, cpu_stw_data_ra)
-DO_ST1_ZPZ_D(sve_stsd_zsu, uint32_t, cpu_stl_data_ra)
-DO_ST1_ZPZ_D(sve_stdd_zsu, uint32_t, cpu_stq_data_ra)
+/*
+ * Expand a pair of first-fault gather helpers (non-MTE and MTE) for
+ * 32-bit vector elements.  MEM selects the ld1 host/tlb primitives,
+ * OFS selects the offset extraction function, MSZ is the memory size.
+ */
+#define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \
+void HELPER(sve_ldff##MEM##_##OFS) \
+ (CPUARMState *env, void *vd, void *vg, \
+ void *vm, target_ulong base, uint32_t desc) \
+{ \
+ sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \
+ off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
+} \
+void HELPER(sve_ldff##MEM##_##OFS##_mte) \
+ (CPUARMState *env, void *vd, void *vg, \
+ void *vm, target_ulong base, uint32_t desc) \
+{ \
+ sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \
+ off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
+}
-DO_ST1_ZPZ_D(sve_stbd_zss, int32_t, cpu_stb_data_ra)
-DO_ST1_ZPZ_D(sve_sthd_zss, int32_t, cpu_stw_data_ra)
-DO_ST1_ZPZ_D(sve_stsd_zss, int32_t, cpu_stl_data_ra)
-DO_ST1_ZPZ_D(sve_stdd_zss, int32_t, cpu_stq_data_ra)
+/*
+ * As DO_LDFF1_ZPZ_S, but for 64-bit vector elements: expand a pair
+ * of first-fault gather helpers (non-MTE and MTE).
+ */
+#define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \
+void HELPER(sve_ldff##MEM##_##OFS) \
+ (CPUARMState *env, void *vd, void *vg, \
+ void *vm, target_ulong base, uint32_t desc) \
+{ \
+ sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \
+ off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
+} \
+void HELPER(sve_ldff##MEM##_##OFS##_mte) \
+ (CPUARMState *env, void *vd, void *vg, \
+ void *vm, target_ulong base, uint32_t desc) \
+{ \
+ sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \
+ off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
+}
+
+/*
+ * Instantiate every supported combination of memory element
+ * size/signedness (MEM), offset extraction (zsu/zss/zd) and
+ * endianness for the first-fault gather loads.
+ */
+DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
+DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
+DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
+DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
+DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
+
+DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
+DO_LDFF1_ZPZ_S(bss, zss, MO_8)
+DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
+DO_LDFF1_ZPZ_D(bds, zss, MO_8)
+DO_LDFF1_ZPZ_D(bds, zd, MO_8)
+
+DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
+DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
+DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
+DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
+DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
+
+DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
+DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
+DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
+DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
+DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
+
+DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
+DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
+DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
+DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
+DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
+
+DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
+DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
+DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
+DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
+DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
+
+DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
+DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
+DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
+DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
+DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
+
+DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
+DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
+DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
+DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
+DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
+
+DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
+DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
+DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
+
+DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
+DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
+DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
+
+DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
+DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
+DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
+
+DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
+DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
+DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
+
+/* Stores with a vector index. */
-DO_ST1_ZPZ_D(sve_stbd_zd, uint64_t, cpu_stb_data_ra)
-DO_ST1_ZPZ_D(sve_sthd_zd, uint64_t, cpu_stw_data_ra)
-DO_ST1_ZPZ_D(sve_stsd_zd, uint64_t, cpu_stl_data_ra)
-DO_ST1_ZPZ_D(sve_stdd_zd, uint64_t, cpu_stq_data_ra)
+/*
+ * Common helper for all scatter stores.  Two passes: first probe all
+ * active elements (raising watchpoint and MTE faults before anything
+ * is written), then perform the stores via the recorded host pointers,
+ * falling back to the tlb function for page-crossing or non-RAM
+ * elements.
+ */
+static inline QEMU_ALWAYS_INLINE
+void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
+ target_ulong base, uint32_t desc, uintptr_t retaddr,
+ uint32_t mtedesc, int esize, int msize,
+ zreg_off_fn *off_fn,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
+{
+ const int mmu_idx = cpu_mmu_index(env, false);
+ const intptr_t reg_max = simd_oprsz(desc);
+ const int scale = simd_data(desc);
+ void *host[ARM_MAX_VQ * 4];
+ intptr_t reg_off, i;
+ SVEHostPage info, info2;
+
+ /*
+ * Probe all of the elements for host addresses and flags.
+ */
+ i = reg_off = 0;
+ do {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ target_ulong addr = base + (off_fn(vm, reg_off) << scale);
+ target_ulong in_page = -(addr | TARGET_PAGE_MASK);
+
+ host[i] = NULL;
+ if (likely((pg >> (reg_off & 63)) & 1)) {
+ if (likely(in_page >= msize)) {
+ sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
+ mmu_idx, retaddr);
+ host[i] = info.host;
+ } else {
+ /*
+ * Element crosses the page boundary.
+ * Probe both pages, but do not record the host address,
+ * so that we use the slow path.
+ */
+ sve_probe_page(&info, false, env, addr, 0,
+ MMU_DATA_STORE, mmu_idx, retaddr);
+ sve_probe_page(&info2, false, env, addr + in_page, 0,
+ MMU_DATA_STORE, mmu_idx, retaddr);
+ info.flags |= info2.flags;
+ }
+
+ if (unlikely(info.flags & TLB_WATCHPOINT)) {
+ cpu_check_watchpoint(env_cpu(env), addr, msize,
+ info.attrs, BP_MEM_WRITE, retaddr);
+ }
+
+ if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
+ mte_check1(env, mtedesc, addr, retaddr);
+ }
+ }
+ i += 1;
+ reg_off += esize;
+ } while (reg_off & 63);
+ } while (reg_off < reg_max);
+
+ /*
+ * Now that we have recognized all exceptions except SyncExternal
+ * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
+ *
+ * Note for the common case of an element in RAM, not crossing a page
+ * boundary, we have stored the host address in host[]. This doubles
+ * as a first-level check against the predicate, since only enabled
+ * elements have non-null host addresses.
+ */
+ i = reg_off = 0;
+ do {
+ void *h = host[i];
+ if (likely(h != NULL)) {
+ host_fn(vd, reg_off, h);
+ } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
+ /* Active element without a fast-path host pointer. */
+ target_ulong addr = base + (off_fn(vm, reg_off) << scale);
+ tlb_fn(env, vd, reg_off, addr, retaddr);
+ }
+ i += 1;
+ reg_off += esize;
+ } while (reg_off < reg_max);
+}
+
+/*
+ * MTE wrapper for scatter stores: split the MTE descriptor out of the
+ * combined DESC word, then dispatch to sve_st1_z.
+ */
+static inline QEMU_ALWAYS_INLINE
+void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
+ target_ulong base, uint32_t desc, uintptr_t retaddr,
+ int esize, int msize, zreg_off_fn *off_fn,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
+{
+ /* The MTE descriptor occupies the bits above SVE_MTEDESC_SHIFT. */
+ uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+ /* Remove mtedesc from the normal sve descriptor. */
+ desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+
+ /*
+ * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
+ * offset base entirely over the address space hole to change the
+ * pointer tag, or change the bit55 selector. So we could here
+ * examine TBI + TCMA like we do for sve_ldN_r_mte().
+ */
+ sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
+ esize, msize, off_fn, host_fn, tlb_fn);
+}
+
+/*
+ * Expand a pair of scatter-store helpers (non-MTE and MTE) for
+ * 32-bit vector elements (esize 4).  MEM selects the st1 host/tlb
+ * primitives, OFS the offset extraction, MSZ the memory size.
+ */
+#define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \
+void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
+ void *vm, target_ulong base, uint32_t desc) \
+{ \
+ sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
+ off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
+} \
+void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
+ void *vm, target_ulong base, uint32_t desc) \
+{ \
+ sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
+ off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
+}
+
+/* As DO_ST1_ZPZ_S, but for 64-bit vector elements (esize 8). */
+#define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \
+void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
+ void *vm, target_ulong base, uint32_t desc) \
+{ \
+ sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
+ off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
+} \
+void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
+ void *vm, target_ulong base, uint32_t desc) \
+{ \
+ sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
+ off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
+}
+
+/*
+ * Instantiate every supported combination of memory element size
+ * (MEM), offset extraction (zsu/zss/zd) and endianness for the
+ * scatter stores, then retire the expansion macros.
+ */
+DO_ST1_ZPZ_S(bs, zsu, MO_8)
+DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
+DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
+DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
+DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
+
+DO_ST1_ZPZ_S(bs, zss, MO_8)
+DO_ST1_ZPZ_S(hs_le, zss, MO_16)
+DO_ST1_ZPZ_S(hs_be, zss, MO_16)
+DO_ST1_ZPZ_S(ss_le, zss, MO_32)
+DO_ST1_ZPZ_S(ss_be, zss, MO_32)
+
+DO_ST1_ZPZ_D(bd, zsu, MO_8)
+DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
+DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
+DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
+DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
+DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
+DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
+
+DO_ST1_ZPZ_D(bd, zss, MO_8)
+DO_ST1_ZPZ_D(hd_le, zss, MO_16)
+DO_ST1_ZPZ_D(hd_be, zss, MO_16)
+DO_ST1_ZPZ_D(sd_le, zss, MO_32)
+DO_ST1_ZPZ_D(sd_be, zss, MO_32)
+DO_ST1_ZPZ_D(dd_le, zss, MO_64)
+DO_ST1_ZPZ_D(dd_be, zss, MO_64)
+
+DO_ST1_ZPZ_D(bd, zd, MO_8)
+DO_ST1_ZPZ_D(hd_le, zd, MO_16)
+DO_ST1_ZPZ_D(hd_be, zd, MO_16)
+DO_ST1_ZPZ_D(sd_le, zd, MO_32)
+DO_ST1_ZPZ_D(sd_be, zd, MO_32)
+DO_ST1_ZPZ_D(dd_le, zd, MO_64)
+DO_ST1_ZPZ_D(dd_be, zd, MO_64)
+
+#undef DO_ST1_ZPZ_S
+#undef DO_ST1_ZPZ_D