* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
+#include "tcg/tcg.h"
/* Note that vector data is stored in host-endian 64-bit chunks,
return (intptr_t)-1 << esz;
}
-uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words)
+uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
{
+ intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
uint32_t flags = PREDTEST_INIT;
uint64_t *d = vd, *g = vg;
intptr_t i = 0;
uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
{
- intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS);
- intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
+ intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
+ intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
uint32_t flags = PREDTEST_INIT;
uint64_t *d = vd, *g = vg, esz_mask;
intptr_t i, next;
return flags;
}
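+
+/*
+ * A worked decode of PREDDESC above (illustrative): predicates hold
+ * one bit per vector byte, so for VL = 256 bits OPRSZ = 4 bytes and
+ * words = DIV_ROUND_UP(4, 8) = 1, i.e. the whole predicate fits in a
+ * single uint64_t; for VL = 2048 bits, OPRSZ = 32 and words = 4.
+ * ESZ holds log2 of the element size, e.g. 2 for .S elements.
+ */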
-/* Store zero into every active element of Zd. We will use this for two
- * and three-operand predicated instructions for which logic dictates a
- * zero result. In particular, logical shift by element size, which is
- * otherwise undefined on the host.
- *
- * For element sizes smaller than uint64_t, we use tables to expand
- * the N bits of the controlling predicate to a byte mask, and clear
- * those bytes.
+/*
+ * Copy Zn into Zd, and store zero into inactive elements.
+ * If inv, store zeros into the active elements.
*/
-void HELPER(sve_clr_b)(void *vd, void *vg, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- uint64_t *d = vd;
- uint8_t *pg = vg;
- for (i = 0; i < opr_sz; i += 1) {
- d[i] &= ~expand_pred_b(pg[H1(i)]);
- }
-}
-
-void HELPER(sve_clr_h)(void *vd, void *vg, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- uint64_t *d = vd;
- uint8_t *pg = vg;
- for (i = 0; i < opr_sz; i += 1) {
- d[i] &= ~expand_pred_h(pg[H1(i)]);
- }
-}
-
-void HELPER(sve_clr_s)(void *vd, void *vg, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- uint64_t *d = vd;
- uint8_t *pg = vg;
- for (i = 0; i < opr_sz; i += 1) {
- d[i] &= ~expand_pred_s(pg[H1(i)]);
- }
-}
-
-void HELPER(sve_clr_d)(void *vd, void *vg, uint32_t desc)
-{
- intptr_t i, opr_sz = simd_oprsz(desc) / 8;
- uint64_t *d = vd;
- uint8_t *pg = vg;
- for (i = 0; i < opr_sz; i += 1) {
- if (pg[H1(i)] & 1) {
- d[i] = 0;
- }
- }
-}
-
-/* Copy Zn into Zd, and store zero into inactive elements. */
void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
{
intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
uint64_t *d = vd, *n = vn;
uint8_t *pg = vg;
+
for (i = 0; i < opr_sz; i += 1) {
- d[i] = n[i] & expand_pred_b(pg[H1(i)]);
+ d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
}
}
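+
+/*
+ * Illustration of the INV trick above: expand_pred_b() widens each
+ * predicate bit to a byte mask, e.g. pg = 0x0f gives
+ * 0x00000000ffffffff.  With inv = -(uint64_t)1 the XOR flips this to
+ * 0xffffffff00000000, so the helper keeps the inactive bytes instead;
+ * with inv = 0 the XOR is a no-op and the active bytes are kept.
+ */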
void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
{
intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
uint64_t *d = vd, *n = vn;
uint8_t *pg = vg;
+
for (i = 0; i < opr_sz; i += 1) {
- d[i] = n[i] & expand_pred_h(pg[H1(i)]);
+ d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
}
}
void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
{
intptr_t i, opr_sz = simd_oprsz(desc) / 8;
+ uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
uint64_t *d = vd, *n = vn;
uint8_t *pg = vg;
+
for (i = 0; i < opr_sz; i += 1) {
- d[i] = n[i] & expand_pred_s(pg[H1(i)]);
+ d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
}
}
intptr_t i, opr_sz = simd_oprsz(desc) / 8;
uint64_t *d = vd, *n = vn;
uint8_t *pg = vg;
+ uint8_t inv = simd_data(desc);
+
for (i = 0; i < opr_sz; i += 1) {
- d[i] = n[i] & -(uint64_t)(pg[H1(i)] & 1);
+ d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
}
}
}
}
-/* Big-endian hosts need to frob the byte indicies. If the copy
+/* Big-endian hosts need to frob the byte indices. If the copy
* happens to be 8-byte aligned, then no frobbing necessary.
*/
static void swap_memmove(void *vd, void *vs, size_t n)
void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
- intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
- int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
- intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
+ intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
uint64_t *d = vd;
intptr_t i;
void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
- intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
- int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
- int odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1) << esz;
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
+ int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
uint64_t *d = vd, *n = vn, *m = vm;
uint64_t l, h;
intptr_t i;
if (oprsz <= 8) {
l = compress_bits(n[0] >> odd, esz);
h = compress_bits(m[0] >> odd, esz);
- d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz);
+ d[0] = l | (h << (4 * oprsz));
} else {
ARMPredicateReg tmp_m;
intptr_t oprsz_16 = oprsz / 16;
h = n[2 * i + 1];
l = compress_bits(l >> odd, esz);
h = compress_bits(h >> odd, esz);
- d[i] = l + (h << 32);
+ d[i] = l | (h << 32);
}
- /* For VL which is not a power of 2, the results from M do not
- align nicely with the uint64_t for D. Put the aligned results
- from M into TMP_M and then copy it into place afterward. */
+ /*
+ * For VL which is not a multiple of 512, the results from M do not
+ * align nicely with the uint64_t for D. Put the aligned results
+ * from M into TMP_M and then copy it into place afterward.
+ */
if (oprsz & 15) {
- d[i] = compress_bits(n[2 * i] >> odd, esz);
+ int final_shift = (oprsz & 15) * 2;
+
+ l = n[2 * i + 0];
+ h = n[2 * i + 1];
+ l = compress_bits(l >> odd, esz);
+ h = compress_bits(h >> odd, esz);
+ d[i] = l | (h << final_shift);
for (i = 0; i < oprsz_16; i++) {
l = m[2 * i + 0];
h = m[2 * i + 1];
l = compress_bits(l >> odd, esz);
h = compress_bits(h >> odd, esz);
- tmp_m.p[i] = l + (h << 32);
+ tmp_m.p[i] = l | (h << 32);
}
- tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz);
+ l = m[2 * i + 0];
+ h = m[2 * i + 1];
+ l = compress_bits(l >> odd, esz);
+ h = compress_bits(h >> odd, esz);
+ tmp_m.p[i] = l | (h << final_shift);
swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
} else {
h = m[2 * i + 1];
l = compress_bits(l >> odd, esz);
h = compress_bits(h >> odd, esz);
- d[oprsz_16 + i] = l + (h << 32);
+ d[oprsz_16 + i] = l | (h << 32);
}
}
}
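+
+/*
+ * Note on the arithmetic above (illustrative): with esz = 0 every
+ * predicate bit is one element, so UZP keeps alternate bits and
+ * compress_bits() packs a full 64-bit predicate word into 32 result
+ * bits at the bottom; hence adjacent word pairs combine as
+ * l | (h << 32).  OR rather than ADD makes it explicit that the two
+ * halves never overlap.
+ */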
void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
- intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
- uintptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
- bool odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
+ int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
uint64_t *d = vd, *n = vn, *m = vm;
uint64_t mask;
int shr, shl;
void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
{
- intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
- int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
intptr_t i, oprsz_2 = oprsz / 2;
if (oprsz <= 8) {
void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
{
- intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
- intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
+ intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
+ intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
uint64_t *d = vd;
intptr_t i;
*/
static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
{
- flag save = get_flush_inputs_to_zero(fpst);
+ bool save = get_flush_inputs_to_zero(fpst);
float32 ret;
set_flush_inputs_to_zero(false, fpst);
static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
{
- flag save = get_flush_inputs_to_zero(fpst);
+ bool save = get_flush_inputs_to_zero(fpst);
float64 ret;
set_flush_inputs_to_zero(false, fpst);
static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
{
- flag save = get_flush_to_zero(fpst);
+ bool save = get_flush_to_zero(fpst);
float16 ret;
set_flush_to_zero(false, fpst);
static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
{
- flag save = get_flush_to_zero(fpst);
+ bool save = get_flush_to_zero(fpst);
float16 ret;
set_flush_to_zero(false, fpst);
#undef DO_ZPZ_FP
-/* 4-operand predicated multiply-add. This requires 7 operands to pass
- * "properly", so we need to encode some of the registers into DESC.
- */
-QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 20 > 32);
-
-static void do_fmla_zpzzz_h(CPUARMState *env, void *vg, uint32_t desc,
+static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
+ float_status *status, uint32_t desc,
uint16_t neg1, uint16_t neg3)
{
intptr_t i = simd_oprsz(desc);
- unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
- unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
- unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
- unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
- void *vd = &env->vfp.zregs[rd];
- void *vn = &env->vfp.zregs[rn];
- void *vm = &env->vfp.zregs[rm];
- void *va = &env->vfp.zregs[ra];
uint64_t *g = vg;
do {
e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
e2 = *(uint16_t *)(vm + H1_2(i));
e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
- r = float16_muladd(e1, e2, e3, 0, &env->vfp.fp_status_f16);
+ r = float16_muladd(e1, e2, e3, 0, status);
*(uint16_t *)(vd + H1_2(i)) = r;
}
} while (i & 63);
} while (i != 0);
}
-void HELPER(sve_fmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
+void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
{
- do_fmla_zpzzz_h(env, vg, desc, 0, 0);
+ do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
}
-void HELPER(sve_fmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
+void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
{
- do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0);
+ do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
}
-void HELPER(sve_fnmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
+void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
{
- do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0x8000);
+ do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
}
-void HELPER(sve_fnmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
+void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
{
- do_fmla_zpzzz_h(env, vg, desc, 0, 0x8000);
+ do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
}
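+
+/*
+ * The four helpers above differ only in the sign bits XORed into the
+ * inputs of the fused multiply-add: FMLA computes a + n * m with no
+ * negation, FMLS negates n (a + (-n) * m), FNMLA negates both n and
+ * a, and FNMLS negates only a.  Flipping bit 15 of a float16 negates
+ * it, so no extra softfloat calls are required.
+ */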
-static void do_fmla_zpzzz_s(CPUARMState *env, void *vg, uint32_t desc,
+static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
+ float_status *status, uint32_t desc,
uint32_t neg1, uint32_t neg3)
{
intptr_t i = simd_oprsz(desc);
- unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
- unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
- unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
- unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
- void *vd = &env->vfp.zregs[rd];
- void *vn = &env->vfp.zregs[rn];
- void *vm = &env->vfp.zregs[rm];
- void *va = &env->vfp.zregs[ra];
uint64_t *g = vg;
do {
e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
e2 = *(uint32_t *)(vm + H1_4(i));
e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
- r = float32_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
+ r = float32_muladd(e1, e2, e3, 0, status);
*(uint32_t *)(vd + H1_4(i)) = r;
}
} while (i & 63);
} while (i != 0);
}
-void HELPER(sve_fmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
+void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
{
- do_fmla_zpzzz_s(env, vg, desc, 0, 0);
+ do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
}
-void HELPER(sve_fmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
+void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
{
- do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0);
+ do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
}
-void HELPER(sve_fnmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
+void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
{
- do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0x80000000);
+ do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
}
-void HELPER(sve_fnmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
+void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
{
- do_fmla_zpzzz_s(env, vg, desc, 0, 0x80000000);
+ do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
}
-static void do_fmla_zpzzz_d(CPUARMState *env, void *vg, uint32_t desc,
+static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
+ float_status *status, uint32_t desc,
uint64_t neg1, uint64_t neg3)
{
intptr_t i = simd_oprsz(desc);
- unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
- unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
- unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
- unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
- void *vd = &env->vfp.zregs[rd];
- void *vn = &env->vfp.zregs[rn];
- void *vm = &env->vfp.zregs[rm];
- void *va = &env->vfp.zregs[ra];
uint64_t *g = vg;
do {
e1 = *(uint64_t *)(vn + i) ^ neg1;
e2 = *(uint64_t *)(vm + i);
e3 = *(uint64_t *)(va + i) ^ neg3;
- r = float64_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
+ r = float64_muladd(e1, e2, e3, 0, status);
*(uint64_t *)(vd + i) = r;
}
} while (i & 63);
} while (i != 0);
}
-void HELPER(sve_fmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
+void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
{
- do_fmla_zpzzz_d(env, vg, desc, 0, 0);
+ do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
}
-void HELPER(sve_fmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
+void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
{
- do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, 0);
+ do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
}
-void HELPER(sve_fnmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
+void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
{
- do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, INT64_MIN);
+ do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
}
-void HELPER(sve_fnmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
+void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
{
- do_fmla_zpzzz_d(env, vg, desc, 0, INT64_MIN);
+ do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
}
/* Two operand floating-point comparison controlled by a predicate.
* FP Complex Multiply
*/
-QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 22 > 32);
-
-void HELPER(sve_fcmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
+void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
{
intptr_t j, i = simd_oprsz(desc);
- unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
- unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
- unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
- unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
- unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
+ unsigned rot = simd_data(desc);
bool flip = rot & 1;
float16 neg_imag, neg_real;
- void *vd = &env->vfp.zregs[rd];
- void *vn = &env->vfp.zregs[rn];
- void *vm = &env->vfp.zregs[rm];
- void *va = &env->vfp.zregs[ra];
uint64_t *g = vg;
neg_imag = float16_set_sign(0, (rot & 2) != 0);
if (likely((pg >> (i & 63)) & 1)) {
d = *(float16 *)(va + H1_2(i));
- d = float16_muladd(e2, e1, d, 0, &env->vfp.fp_status_f16);
+ d = float16_muladd(e2, e1, d, 0, status);
*(float16 *)(vd + H1_2(i)) = d;
}
if (likely((pg >> (j & 63)) & 1)) {
d = *(float16 *)(va + H1_2(j));
- d = float16_muladd(e4, e3, d, 0, &env->vfp.fp_status_f16);
+ d = float16_muladd(e4, e3, d, 0, status);
*(float16 *)(vd + H1_2(j)) = d;
}
} while (i & 63);
} while (i != 0);
}
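+
+/*
+ * Illustrative decode of ROT above: the two descriptor bits encode
+ * the FCMLA rotation in 90-degree steps.  rot & 1 (flip) selects
+ * which of the real/imaginary halves of each element pair feeds the
+ * products, and rot & 2 selects the negated operand sign, together
+ * yielding the #0, #90, #180 and #270 forms.
+ */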
-void HELPER(sve_fcmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
+void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
{
intptr_t j, i = simd_oprsz(desc);
- unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
- unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
- unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
- unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
- unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
+ unsigned rot = simd_data(desc);
bool flip = rot & 1;
float32 neg_imag, neg_real;
- void *vd = &env->vfp.zregs[rd];
- void *vn = &env->vfp.zregs[rn];
- void *vm = &env->vfp.zregs[rm];
- void *va = &env->vfp.zregs[ra];
uint64_t *g = vg;
neg_imag = float32_set_sign(0, (rot & 2) != 0);
if (likely((pg >> (i & 63)) & 1)) {
d = *(float32 *)(va + H1_2(i));
- d = float32_muladd(e2, e1, d, 0, &env->vfp.fp_status);
+ d = float32_muladd(e2, e1, d, 0, status);
*(float32 *)(vd + H1_2(i)) = d;
}
if (likely((pg >> (j & 63)) & 1)) {
d = *(float32 *)(va + H1_2(j));
- d = float32_muladd(e4, e3, d, 0, &env->vfp.fp_status);
+ d = float32_muladd(e4, e3, d, 0, status);
*(float32 *)(vd + H1_2(j)) = d;
}
} while (i & 63);
} while (i != 0);
}
-void HELPER(sve_fcmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
+void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
+ void *vg, void *status, uint32_t desc)
{
intptr_t j, i = simd_oprsz(desc);
- unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
- unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
- unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
- unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
- unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
+ unsigned rot = simd_data(desc);
bool flip = rot & 1;
float64 neg_imag, neg_real;
- void *vd = &env->vfp.zregs[rd];
- void *vn = &env->vfp.zregs[rn];
- void *vm = &env->vfp.zregs[rm];
- void *va = &env->vfp.zregs[ra];
uint64_t *g = vg;
neg_imag = float64_set_sign(0, (rot & 2) != 0);
if (likely((pg >> (i & 63)) & 1)) {
d = *(float64 *)(va + H1_2(i));
- d = float64_muladd(e2, e1, d, 0, &env->vfp.fp_status);
+ d = float64_muladd(e2, e1, d, 0, status);
*(float64 *)(vd + H1_2(i)) = d;
}
if (likely((pg >> (j & 63)) & 1)) {
d = *(float64 *)(va + H1_2(j));
- d = float64_muladd(e4, e3, d, 0, &env->vfp.fp_status);
+ d = float64_muladd(e4, e3, d, 0, status);
*(float64 *)(vd + H1_2(j)) = d;
}
} while (i & 63);
*/
/*
- * Load elements into @vd, controlled by @vg, from @host + @mem_ofs.
- * Memory is valid through @host + @mem_max. The register element
- * indicies are inferred from @mem_ofs, as modified by the types for
- * which the helper is built. Return the @mem_ofs of the first element
- * not loaded (which is @mem_max if they are all loaded).
- *
- * For softmmu, we have fully validated the guest page. For user-only,
- * we cannot fully validate without taking the mmap lock, but since we
- * know the access is within one host page, if any access is valid they
- * all must be valid. However, when @vg is all false, it may be that
- * no access is valid.
+ * Load one element into @vd + @reg_off from @host.
+ * The controlling predicate is known to be true.
*/
-typedef intptr_t sve_ld1_host_fn(void *vd, void *vg, void *host,
- intptr_t mem_ofs, intptr_t mem_max);
+typedef void sve_ldst1_host_fn(void *vd, intptr_t reg_off, void *host);
/*
* Load one element into @vd + @reg_off from (@env, @vaddr, @ra).
* The controlling predicate is known to be true.
*/
-typedef void sve_ld1_tlb_fn(CPUARMState *env, void *vd, intptr_t reg_off,
- target_ulong vaddr, TCGMemOpIdx oi, uintptr_t ra);
-typedef sve_ld1_tlb_fn sve_st1_tlb_fn;
+typedef void sve_ldst1_tlb_fn(CPUARMState *env, void *vd, intptr_t reg_off,
+ target_ulong vaddr, uintptr_t retaddr);
/*
* Generate the above primitives.
*/
#define DO_LD_HOST(NAME, H, TYPEE, TYPEM, HOST) \
-static intptr_t sve_##NAME##_host(void *vd, void *vg, void *host, \
- intptr_t mem_off, const intptr_t mem_max) \
-{ \
- intptr_t reg_off = mem_off * (sizeof(TYPEE) / sizeof(TYPEM)); \
- uint64_t *pg = vg; \
- while (mem_off + sizeof(TYPEM) <= mem_max) { \
- TYPEM val = 0; \
- if (likely((pg[reg_off >> 6] >> (reg_off & 63)) & 1)) { \
- val = HOST(host + mem_off); \
- } \
- *(TYPEE *)(vd + H(reg_off)) = val; \
- mem_off += sizeof(TYPEM), reg_off += sizeof(TYPEE); \
- } \
- return mem_off; \
-}
-
-#ifdef CONFIG_SOFTMMU
-#define DO_LD_TLB(NAME, H, TYPEE, TYPEM, HOST, MOEND, TLB) \
+static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
+{ \
+ TYPEM val = HOST(host); \
+ *(TYPEE *)(vd + H(reg_off)) = val; \
+}
+
+#define DO_ST_HOST(NAME, H, TYPEE, TYPEM, HOST) \
+static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
+{ HOST(host, (TYPEM)*(TYPEE *)(vd + H(reg_off))); }
+
+#define DO_LD_TLB(NAME, H, TYPEE, TYPEM, TLB) \
static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
- target_ulong addr, TCGMemOpIdx oi, uintptr_t ra) \
+ target_ulong addr, uintptr_t ra) \
{ \
- TYPEM val = TLB(env, addr, oi, ra); \
- *(TYPEE *)(vd + H(reg_off)) = val; \
+ *(TYPEE *)(vd + H(reg_off)) = \
+ (TYPEM)TLB(env, useronly_clean_ptr(addr), ra); \
}
-#else
-#define DO_LD_TLB(NAME, H, TYPEE, TYPEM, HOST, MOEND, TLB) \
+
+#define DO_ST_TLB(NAME, H, TYPEE, TYPEM, TLB) \
static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
- target_ulong addr, TCGMemOpIdx oi, uintptr_t ra) \
+ target_ulong addr, uintptr_t ra) \
{ \
- TYPEM val = HOST(g2h(addr)); \
- *(TYPEE *)(vd + H(reg_off)) = val; \
+ TLB(env, useronly_clean_ptr(addr), \
+ (TYPEM)*(TYPEE *)(vd + H(reg_off)), ra); \
}
-#endif
#define DO_LD_PRIM_1(NAME, H, TE, TM) \
DO_LD_HOST(NAME, H, TE, TM, ldub_p) \
- DO_LD_TLB(NAME, H, TE, TM, ldub_p, 0, helper_ret_ldub_mmu)
+ DO_LD_TLB(NAME, H, TE, TM, cpu_ldub_data_ra)
DO_LD_PRIM_1(ld1bb, H1, uint8_t, uint8_t)
DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t)
DO_LD_PRIM_1(ld1bdu, , uint64_t, uint8_t)
DO_LD_PRIM_1(ld1bds, , uint64_t, int8_t)
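+
+/*
+ * For reference (illustrative expansion), DO_LD_TLB(ld1bb, H1,
+ * uint8_t, uint8_t, cpu_ldub_data_ra) above generates:
+ *
+ *   static void sve_ld1bb_tlb(CPUARMState *env, void *vd,
+ *                             intptr_t reg_off, target_ulong addr,
+ *                             uintptr_t ra)
+ *   {
+ *       *(uint8_t *)(vd + H1(reg_off)) =
+ *           (uint8_t)cpu_ldub_data_ra(env, useronly_clean_ptr(addr), ra);
+ *   }
+ */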
-#define DO_LD_PRIM_2(NAME, end, MOEND, H, TE, TM, PH, PT) \
- DO_LD_HOST(NAME##_##end, H, TE, TM, PH##_##end##_p) \
- DO_LD_TLB(NAME##_##end, H, TE, TM, PH##_##end##_p, \
- MOEND, helper_##end##_##PT##_mmu)
+#define DO_ST_PRIM_1(NAME, H, TE, TM) \
+ DO_ST_HOST(st1##NAME, H, TE, TM, stb_p) \
+ DO_ST_TLB(st1##NAME, H, TE, TM, cpu_stb_data_ra)
+
+DO_ST_PRIM_1(bb, H1, uint8_t, uint8_t)
+DO_ST_PRIM_1(bh, H1_2, uint16_t, uint8_t)
+DO_ST_PRIM_1(bs, H1_4, uint32_t, uint8_t)
+DO_ST_PRIM_1(bd, , uint64_t, uint8_t)
+
+#define DO_LD_PRIM_2(NAME, H, TE, TM, LD) \
+ DO_LD_HOST(ld1##NAME##_be, H, TE, TM, LD##_be_p) \
+ DO_LD_HOST(ld1##NAME##_le, H, TE, TM, LD##_le_p) \
+ DO_LD_TLB(ld1##NAME##_be, H, TE, TM, cpu_##LD##_be_data_ra) \
+ DO_LD_TLB(ld1##NAME##_le, H, TE, TM, cpu_##LD##_le_data_ra)
-DO_LD_PRIM_2(ld1hh, le, MO_LE, H1_2, uint16_t, uint16_t, lduw, lduw)
-DO_LD_PRIM_2(ld1hsu, le, MO_LE, H1_4, uint32_t, uint16_t, lduw, lduw)
-DO_LD_PRIM_2(ld1hss, le, MO_LE, H1_4, uint32_t, int16_t, lduw, lduw)
-DO_LD_PRIM_2(ld1hdu, le, MO_LE, , uint64_t, uint16_t, lduw, lduw)
-DO_LD_PRIM_2(ld1hds, le, MO_LE, , uint64_t, int16_t, lduw, lduw)
+#define DO_ST_PRIM_2(NAME, H, TE, TM, ST) \
+ DO_ST_HOST(st1##NAME##_be, H, TE, TM, ST##_be_p) \
+ DO_ST_HOST(st1##NAME##_le, H, TE, TM, ST##_le_p) \
+ DO_ST_TLB(st1##NAME##_be, H, TE, TM, cpu_##ST##_be_data_ra) \
+ DO_ST_TLB(st1##NAME##_le, H, TE, TM, cpu_##ST##_le_data_ra)
-DO_LD_PRIM_2(ld1ss, le, MO_LE, H1_4, uint32_t, uint32_t, ldl, ldul)
-DO_LD_PRIM_2(ld1sdu, le, MO_LE, , uint64_t, uint32_t, ldl, ldul)
-DO_LD_PRIM_2(ld1sds, le, MO_LE, , uint64_t, int32_t, ldl, ldul)
+DO_LD_PRIM_2(hh, H1_2, uint16_t, uint16_t, lduw)
+DO_LD_PRIM_2(hsu, H1_4, uint32_t, uint16_t, lduw)
+DO_LD_PRIM_2(hss, H1_4, uint32_t, int16_t, lduw)
+DO_LD_PRIM_2(hdu, , uint64_t, uint16_t, lduw)
+DO_LD_PRIM_2(hds, , uint64_t, int16_t, lduw)
-DO_LD_PRIM_2(ld1dd, le, MO_LE, , uint64_t, uint64_t, ldq, ldq)
+DO_ST_PRIM_2(hh, H1_2, uint16_t, uint16_t, stw)
+DO_ST_PRIM_2(hs, H1_4, uint32_t, uint16_t, stw)
+DO_ST_PRIM_2(hd, , uint64_t, uint16_t, stw)
-DO_LD_PRIM_2(ld1hh, be, MO_BE, H1_2, uint16_t, uint16_t, lduw, lduw)
-DO_LD_PRIM_2(ld1hsu, be, MO_BE, H1_4, uint32_t, uint16_t, lduw, lduw)
-DO_LD_PRIM_2(ld1hss, be, MO_BE, H1_4, uint32_t, int16_t, lduw, lduw)
-DO_LD_PRIM_2(ld1hdu, be, MO_BE, , uint64_t, uint16_t, lduw, lduw)
-DO_LD_PRIM_2(ld1hds, be, MO_BE, , uint64_t, int16_t, lduw, lduw)
+DO_LD_PRIM_2(ss, H1_4, uint32_t, uint32_t, ldl)
+DO_LD_PRIM_2(sdu, , uint64_t, uint32_t, ldl)
+DO_LD_PRIM_2(sds, , uint64_t, int32_t, ldl)
-DO_LD_PRIM_2(ld1ss, be, MO_BE, H1_4, uint32_t, uint32_t, ldl, ldul)
-DO_LD_PRIM_2(ld1sdu, be, MO_BE, , uint64_t, uint32_t, ldl, ldul)
-DO_LD_PRIM_2(ld1sds, be, MO_BE, , uint64_t, int32_t, ldl, ldul)
+DO_ST_PRIM_2(ss, H1_4, uint32_t, uint32_t, stl)
+DO_ST_PRIM_2(sd, , uint64_t, uint32_t, stl)
-DO_LD_PRIM_2(ld1dd, be, MO_BE, , uint64_t, uint64_t, ldq, ldq)
+DO_LD_PRIM_2(dd, , uint64_t, uint64_t, ldq)
+DO_ST_PRIM_2(dd, , uint64_t, uint64_t, stq)
#undef DO_LD_TLB
+#undef DO_ST_TLB
#undef DO_LD_HOST
#undef DO_LD_PRIM_1
+#undef DO_ST_PRIM_1
#undef DO_LD_PRIM_2
+#undef DO_ST_PRIM_2
/*
* Skip through a sequence of inactive elements in the guarding predicate @vg,
}
/*
- * Return the maximum offset <= @mem_max which is still within the page
- * referenced by @base + @mem_off.
+ * Resolve the guest virtual address to info->host and info->flags.
+ * If @nofault, return false if the page is invalid, otherwise
+ * exit via page fault exception.
*/
-static intptr_t max_for_page(target_ulong base, intptr_t mem_off,
- intptr_t mem_max)
-{
- target_ulong addr = base + mem_off;
- intptr_t split = -(intptr_t)(addr | TARGET_PAGE_MASK);
- return MIN(split, mem_max - mem_off) + mem_off;
-}
-#ifndef CONFIG_USER_ONLY
-/* These are normally defined only for CONFIG_USER_ONLY in <exec/cpu_ldst.h> */
-static inline void set_helper_retaddr(uintptr_t ra) { }
-static inline void clear_helper_retaddr(void) { }
-#endif
+typedef struct {
+ void *host;
+ int flags;
+ MemTxAttrs attrs;
+} SVEHostPage;
-/*
- * The result of tlb_vaddr_to_host for user-only is just g2h(x),
- * which is always non-null. Elide the useless test.
- */
-static inline bool test_host_page(void *host)
+static bool sve_probe_page(SVEHostPage *info, bool nofault,
+ CPUARMState *env, target_ulong addr,
+ int mem_off, MMUAccessType access_type,
+ int mmu_idx, uintptr_t retaddr)
{
+ int flags;
+
+ addr += mem_off;
+
+ /*
+ * User-only currently always issues with TBI. See the comment
+ * above useronly_clean_ptr. Usually we clean this top byte away
+ * during translation, but we can't do that for e.g. vector + imm
+ * addressing modes.
+ *
+ * We currently always enable TBI for user-only, and do not provide
+ * a way to turn it off. So clean the pointer unconditionally here,
+ * rather than look it up here, or pass it down from above.
+ */
+ addr = useronly_clean_ptr(addr);
+
+ flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault,
+ &info->host, retaddr);
+ info->flags = flags;
+
+ if (flags & TLB_INVALID_MASK) {
+ g_assert(nofault);
+ return false;
+ }
+
+ /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
+ info->host -= mem_off;
+
#ifdef CONFIG_USER_ONLY
- return true;
+ memset(&info->attrs, 0, sizeof(info->attrs));
#else
- return likely(host != NULL);
+ /*
+ * Find the iotlbentry for addr and return the transaction attributes.
+ * This *must* be present in the TLB because we just found the mapping.
+ */
+ {
+ uintptr_t index = tlb_index(env, mmu_idx, addr);
+
+# ifdef CONFIG_DEBUG_TCG
+ CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
+ target_ulong comparator = (access_type == MMU_DATA_LOAD
+ ? entry->addr_read
+ : tlb_addr_write(entry));
+ g_assert(tlb_hit(comparator, addr));
+# endif
+
+ CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
+ info->attrs = iotlbentry->attrs;
+ }
#endif
+
+ return true;
}
+
/*
- * Common helper for all contiguous one-register predicated loads.
+ * Analyse contiguous data, protected by a governing predicate.
*/
-static void sve_ld1_r(CPUARMState *env, void *vg, const target_ulong addr,
- uint32_t desc, const uintptr_t retaddr,
- const int esz, const int msz,
- sve_ld1_host_fn *host_fn,
- sve_ld1_tlb_fn *tlb_fn)
-{
- const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
- const int mmu_idx = get_mmuidx(oi);
- const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
- void *vd = &env->vfp.zregs[rd];
- const int diffsz = esz - msz;
- const intptr_t reg_max = simd_oprsz(desc);
- const intptr_t mem_max = reg_max >> diffsz;
- ARMVectorReg scratch;
- void *host;
- intptr_t split, reg_off, mem_off;
- /* Find the first active element. */
- reg_off = find_next_active(vg, 0, reg_max, esz);
- if (unlikely(reg_off == reg_max)) {
- /* The entire predicate was false; no load occurs. */
- memset(vd, 0, reg_max);
- return;
- }
- mem_off = reg_off >> diffsz;
- set_helper_retaddr(retaddr);
+typedef enum {
+ FAULT_NO,
+ FAULT_FIRST,
+ FAULT_ALL,
+} SVEContFault;
+typedef struct {
/*
- * If the (remaining) load is entirely within a single page, then:
- * For softmmu, and the tlb hits, then no faults will occur;
- * For user-only, either the first load will fault or none will.
- * We can thus perform the load directly to the destination and
- * Vd will be unmodified on any exception path.
+ * First and last element wholly contained within the two pages.
+ * mem_off_first[0] and reg_off_first[0] are always set >= 0.
+ * reg_off_last[0] may be < 0 if the first element crosses pages.
+ * All of mem_off_first[1], reg_off_first[1] and reg_off_last[1]
+ * are set >= 0 only if there are complete elements on a second page.
+ *
+ * The reg_off_* offsets are relative to the internal vector register.
+ * The mem_off_first offset is relative to the memory address; the
+ * two offsets are different when a load operation extends, a store
+ * operation truncates, or for multi-register operations.
*/
- split = max_for_page(addr, mem_off, mem_max);
- if (likely(split == mem_max)) {
- host = tlb_vaddr_to_host(env, addr + mem_off, MMU_DATA_LOAD, mmu_idx);
- if (test_host_page(host)) {
- mem_off = host_fn(vd, vg, host - mem_off, mem_off, mem_max);
- tcg_debug_assert(mem_off == mem_max);
- clear_helper_retaddr();
- /* After having taken any fault, zero leading inactive elements. */
- swap_memzero(vd, reg_off);
- return;
- }
- }
+ int16_t mem_off_first[2];
+ int16_t reg_off_first[2];
+ int16_t reg_off_last[2];
/*
- * Perform the predicated read into a temporary, thus ensuring
- * if the load of the last element faults, Vd is not modified.
+ * One element that is misaligned and spans both pages,
+ * or -1 if there is no such active element.
*/
-#ifdef CONFIG_USER_ONLY
- swap_memzero(&scratch, reg_off);
- host_fn(&scratch, vg, g2h(addr), mem_off, mem_max);
-#else
- memset(&scratch, 0, reg_max);
- goto start;
- while (1) {
- reg_off = find_next_active(vg, reg_off, reg_max, esz);
- if (reg_off >= reg_max) {
- break;
- }
- mem_off = reg_off >> diffsz;
- split = max_for_page(addr, mem_off, mem_max);
-
- start:
- if (split - mem_off >= (1 << msz)) {
- /* At least one whole element on this page. */
- host = tlb_vaddr_to_host(env, addr + mem_off,
- MMU_DATA_LOAD, mmu_idx);
- if (host) {
- mem_off = host_fn(&scratch, vg, host - mem_off,
- mem_off, split);
- reg_off = mem_off << diffsz;
- continue;
+ int16_t mem_off_split;
+ int16_t reg_off_split;
+
+ /*
+ * The byte offset at which the entire operation crosses a page boundary.
+ * Set >= 0 if and only if the entire operation spans two pages.
+ */
+ int16_t page_split;
+
+ /* TLB data for the two pages. */
+ SVEHostPage page[2];
+} SVEContLdSt;
+
+/*
+ * Find first active element on each page, and a loose bound for the
+ * final element on each page. Identify any single element that spans
+ * the page boundary. Return true if there are any active elements.
+ */
+static bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr,
+ uint64_t *vg, intptr_t reg_max,
+ int esz, int msize)
+{
+ const int esize = 1 << esz;
+ const uint64_t pg_mask = pred_esz_masks[esz];
+ intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
+ intptr_t mem_off_last, mem_off_split;
+ intptr_t page_split, elt_split;
+ intptr_t i;
+
+ /* Set all of the element indices to -1, and the TLB data to 0. */
+ memset(info, -1, offsetof(SVEContLdSt, page));
+ memset(info->page, 0, sizeof(info->page));
+
+ /* Gross scan over the entire predicate to find bounds. */
+ i = 0;
+ do {
+ uint64_t pg = vg[i] & pg_mask;
+ if (pg) {
+ reg_off_last = i * 64 + 63 - clz64(pg);
+ if (reg_off_first < 0) {
+ reg_off_first = i * 64 + ctz64(pg);
}
}
+ } while (++i * 64 < reg_max);
- /*
- * Perform one normal read. This may fault, longjmping out to the
- * main loop in order to raise an exception. It may succeed, and
- * as a side-effect load the TLB entry for the next round. Finally,
- * in the extremely unlikely case we're performing this operation
- * on I/O memory, it may succeed but not bring in the TLB entry.
- * But even then we have still made forward progress.
- */
- tlb_fn(env, &scratch, reg_off, addr + mem_off, oi, retaddr);
- reg_off += 1 << esz;
+ if (unlikely(reg_off_first < 0)) {
+ /* No active elements, no pages touched. */
+ return false;
}
-#endif
+ tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
- clear_helper_retaddr();
- memcpy(vd, &scratch, reg_max);
-}
-
-#define DO_LD1_1(NAME, ESZ) \
-void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sve_ld1_r(env, vg, addr, desc, GETPC(), ESZ, 0, \
- sve_##NAME##_host, sve_##NAME##_tlb); \
-}
+ info->reg_off_first[0] = reg_off_first;
+ info->mem_off_first[0] = (reg_off_first >> esz) * msize;
+ mem_off_last = (reg_off_last >> esz) * msize;
-#define DO_LD1_2(NAME, ESZ, MSZ) \
-void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sve_ld1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, \
- sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
-} \
-void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
- target_ulong addr, uint32_t desc) \
-{ \
- sve_ld1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, \
- sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
-}
+ page_split = -(addr | TARGET_PAGE_MASK);
+ if (likely(mem_off_last + msize <= page_split)) {
+ /* The entire operation fits within a single page. */
+ info->reg_off_last[0] = reg_off_last;
+ return true;
+ }
-DO_LD1_1(ld1bb, 0)
-DO_LD1_1(ld1bhu, 1)
-DO_LD1_1(ld1bhs, 1)
-DO_LD1_1(ld1bsu, 2)
-DO_LD1_1(ld1bss, 2)
-DO_LD1_1(ld1bdu, 3)
-DO_LD1_1(ld1bds, 3)
+ info->page_split = page_split;
+ elt_split = page_split / msize;
+ reg_off_split = elt_split << esz;
+ mem_off_split = elt_split * msize;
-DO_LD1_2(ld1hh, 1, 1)
-DO_LD1_2(ld1hsu, 2, 1)
-DO_LD1_2(ld1hss, 2, 1)
-DO_LD1_2(ld1hdu, 3, 1)
-DO_LD1_2(ld1hds, 3, 1)
+ /*
+ * This is the last full element on the first page, but it is not
+ * necessarily active. If there is no full element, i.e. the first
+ * active element is the one that's split, this value remains -1.
+     * It is useful as an iteration bound.
+ */
+ if (elt_split != 0) {
+ info->reg_off_last[0] = reg_off_split - esize;
+ }
-DO_LD1_2(ld1ss, 2, 2)
-DO_LD1_2(ld1sdu, 3, 2)
-DO_LD1_2(ld1sds, 3, 2)
+ /* Determine if an unaligned element spans the pages. */
+ if (page_split % msize != 0) {
+ /* It is helpful to know if the split element is active. */
+ if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
+ info->reg_off_split = reg_off_split;
+ info->mem_off_split = mem_off_split;
-DO_LD1_2(ld1dd, 3, 3)
+ if (reg_off_split == reg_off_last) {
+ /* The page crossing element is last. */
+ return true;
+ }
+ }
+ reg_off_split += esize;
+ mem_off_split += msize;
+ }
-#undef DO_LD1_1
-#undef DO_LD1_2
+ /*
+ * We do want the first active element on the second page, because
+ * this may affect the address reported in an exception.
+ */
+ reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
+ tcg_debug_assert(reg_off_split <= reg_off_last);
+ info->reg_off_first[1] = reg_off_split;
+ info->mem_off_first[1] = (reg_off_split >> esz) * msize;
+ info->reg_off_last[1] = reg_off_last;
+ return true;
+}
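+
+/*
+ * Worked example (illustrative): a fully-active 16-byte operation
+ * with 4-byte elements that starts 5 bytes before a page boundary
+ * yields page_split = 5; element 0 sits wholly on page 0
+ * (reg_off_first[0] = 0, reg_off_last[0] = 0), element 1 is the
+ * split element (reg_off_split = 4), and elements 2 and 3 sit on
+ * page 1 (reg_off_first[1] = 8, reg_off_last[1] = 12).
+ */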
/*
- * Common helpers for all contiguous 2,3,4-register predicated loads.
+ * Resolve the guest virtual addresses to info->page[].
+ * Control the generation of page faults with @fault. Return false if
+ * there is no work to do, which can only happen with @fault == FAULT_NO.
*/
-static void sve_ld2_r(CPUARMState *env, void *vg, target_ulong addr,
- uint32_t desc, int size, uintptr_t ra,
- sve_ld1_tlb_fn *tlb_fn)
+static bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
+ CPUARMState *env, target_ulong addr,
+ MMUAccessType access_type, uintptr_t retaddr)
{
- const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
- const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
- intptr_t i, oprsz = simd_oprsz(desc);
- ARMVectorReg scratch[2] = { };
+ int mmu_idx = cpu_mmu_index(env, false);
+ int mem_off = info->mem_off_first[0];
+ bool nofault = fault == FAULT_NO;
+ bool have_work = true;
- set_helper_retaddr(ra);
- for (i = 0; i < oprsz; ) {
- uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
- do {
- if (pg & 1) {
- tlb_fn(env, &scratch[0], i, addr, oi, ra);
- tlb_fn(env, &scratch[1], i, addr + size, oi, ra);
- }
- i += size, pg >>= size;
- addr += 2 * size;
- } while (i & 15);
+ if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
+ access_type, mmu_idx, retaddr)) {
+ /* No work to be done. */
+ return false;
}
- clear_helper_retaddr();
-
- /* Wait until all exceptions have been raised to write back. */
- memcpy(&env->vfp.zregs[rd], &scratch[0], oprsz);
- memcpy(&env->vfp.zregs[(rd + 1) & 31], &scratch[1], oprsz);
-}
-static void sve_ld3_r(CPUARMState *env, void *vg, target_ulong addr,
- uint32_t desc, int size, uintptr_t ra,
- sve_ld1_tlb_fn *tlb_fn)
-{
- const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
- const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
- intptr_t i, oprsz = simd_oprsz(desc);
- ARMVectorReg scratch[3] = { };
+ if (likely(info->page_split < 0)) {
+ /* The entire operation was on the one page. */
+ return true;
+ }
- set_helper_retaddr(ra);
- for (i = 0; i < oprsz; ) {
- uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
- do {
- if (pg & 1) {
- tlb_fn(env, &scratch[0], i, addr, oi, ra);
- tlb_fn(env, &scratch[1], i, addr + size, oi, ra);
- tlb_fn(env, &scratch[2], i, addr + 2 * size, oi, ra);
- }
- i += size, pg >>= size;
- addr += 3 * size;
- } while (i & 15);
+ /*
+ * If the second page is invalid, then we want the fault address to be
+ * the first byte on that page which is accessed.
+ */
+ if (info->mem_off_split >= 0) {
+ /*
+ * There is an element split across the pages. The fault address
+ * should be the first byte of the second page.
+ */
+ mem_off = info->page_split;
+ /*
+ * If the split element is also the first active element
+ * of the vector, then: For first-fault we should continue
+ * to generate faults for the second page. For no-fault,
+ * we have work only if the second page is valid.
+ */
+ if (info->mem_off_first[0] < info->mem_off_split) {
+ nofault = FAULT_FIRST;
+ have_work = false;
+ }
+ } else {
+ /*
+ * There is no element split across the pages. The fault address
+ * should be the first active element on the second page.
+ */
+ mem_off = info->mem_off_first[1];
+ /*
+ * There must have been one active element on the first page,
+ * so we're out of first-fault territory.
+ */
+ nofault = fault != FAULT_ALL;
}
- clear_helper_retaddr();
- /* Wait until all exceptions have been raised to write back. */
- memcpy(&env->vfp.zregs[rd], &scratch[0], oprsz);
- memcpy(&env->vfp.zregs[(rd + 1) & 31], &scratch[1], oprsz);
- memcpy(&env->vfp.zregs[(rd + 2) & 31], &scratch[2], oprsz);
+ have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
+ access_type, mmu_idx, retaddr);
+ return have_work;
}
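+
+/*
+ * The fault modes map onto the SVE load flavours (illustrative):
+ * normal contiguous loads use FAULT_ALL and take any fault
+ * immediately; LDFF1 uses FAULT_FIRST, where only the first active
+ * element may fault; LDNF1 uses FAULT_NO, where the probe itself
+ * never raises an exception.
+ */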
-static void sve_ld4_r(CPUARMState *env, void *vg, target_ulong addr,
- uint32_t desc, int size, uintptr_t ra,
- sve_ld1_tlb_fn *tlb_fn)
+static void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
+ uint64_t *vg, target_ulong addr,
+ int esize, int msize, int wp_access,
+ uintptr_t retaddr)
{
- const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
- const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
- intptr_t i, oprsz = simd_oprsz(desc);
- ARMVectorReg scratch[4] = { };
+#ifndef CONFIG_USER_ONLY
+ intptr_t mem_off, reg_off, reg_last;
+ int flags0 = info->page[0].flags;
+ int flags1 = info->page[1].flags;
- set_helper_retaddr(ra);
- for (i = 0; i < oprsz; ) {
- uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
- do {
- if (pg & 1) {
- tlb_fn(env, &scratch[0], i, addr, oi, ra);
- tlb_fn(env, &scratch[1], i, addr + size, oi, ra);
- tlb_fn(env, &scratch[2], i, addr + 2 * size, oi, ra);
- tlb_fn(env, &scratch[3], i, addr + 3 * size, oi, ra);
- }
- i += size, pg >>= size;
- addr += 4 * size;
- } while (i & 15);
+ if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
+ return;
}
- clear_helper_retaddr();
- /* Wait until all exceptions have been raised to write back. */
- memcpy(&env->vfp.zregs[rd], &scratch[0], oprsz);
- memcpy(&env->vfp.zregs[(rd + 1) & 31], &scratch[1], oprsz);
- memcpy(&env->vfp.zregs[(rd + 2) & 31], &scratch[2], oprsz);
- memcpy(&env->vfp.zregs[(rd + 3) & 31], &scratch[3], oprsz);
-}
+ /* Indicate that watchpoints are handled. */
+ info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
+ info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
+
+ if (flags0 & TLB_WATCHPOINT) {
+ mem_off = info->mem_off_first[0];
+ reg_off = info->reg_off_first[0];
+ reg_last = info->reg_off_last[0];
+
+ while (reg_off <= reg_last) {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+ cpu_check_watchpoint(env_cpu(env), addr + mem_off,
+ msize, info->page[0].attrs,
+ wp_access, retaddr);
+ }
+ reg_off += esize;
+ mem_off += msize;
+ } while (reg_off <= reg_last && (reg_off & 63));
+ }
+ }
-#define DO_LDN_1(N) \
-void QEMU_FLATTEN HELPER(sve_ld##N##bb_r) \
- (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
-{ \
- sve_ld##N##_r(env, vg, addr, desc, 1, GETPC(), sve_ld1bb_tlb); \
-}
+ mem_off = info->mem_off_split;
+ if (mem_off >= 0) {
+ cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
+ info->page[0].attrs, wp_access, retaddr);
+ }
-#define DO_LDN_2(N, SUFF, SIZE) \
-void QEMU_FLATTEN HELPER(sve_ld##N##SUFF##_le_r) \
- (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
-{ \
- sve_ld##N##_r(env, vg, addr, desc, SIZE, GETPC(), \
- sve_ld1##SUFF##_le_tlb); \
-} \
-void QEMU_FLATTEN HELPER(sve_ld##N##SUFF##_be_r) \
- (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
-{ \
- sve_ld##N##_r(env, vg, addr, desc, SIZE, GETPC(), \
- sve_ld1##SUFF##_be_tlb); \
+ mem_off = info->mem_off_first[1];
+ if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
+ reg_off = info->reg_off_first[1];
+ reg_last = info->reg_off_last[1];
+
+ do {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+ cpu_check_watchpoint(env_cpu(env), addr + mem_off,
+ msize, info->page[1].attrs,
+ wp_access, retaddr);
+ }
+ reg_off += esize;
+ mem_off += msize;
+ } while (reg_off & 63);
+ } while (reg_off <= reg_last);
+ }
+#endif
}
-DO_LDN_1(2)
-DO_LDN_1(3)
-DO_LDN_1(4)
+typedef uint64_t mte_check_fn(CPUARMState *, uint32_t, uint64_t, uintptr_t);
-DO_LDN_2(2, hh, 2)
-DO_LDN_2(3, hh, 2)
-DO_LDN_2(4, hh, 2)
+static inline QEMU_ALWAYS_INLINE
+void sve_cont_ldst_mte_check_int(SVEContLdSt *info, CPUARMState *env,
+ uint64_t *vg, target_ulong addr, int esize,
+ int msize, uint32_t mtedesc, uintptr_t ra,
+ mte_check_fn *check)
+{
+ intptr_t mem_off, reg_off, reg_last;
-DO_LDN_2(2, ss, 4)
-DO_LDN_2(3, ss, 4)
-DO_LDN_2(4, ss, 4)
+ /* Process the page only if MemAttr == Tagged. */
+ if (arm_tlb_mte_tagged(&info->page[0].attrs)) {
+ mem_off = info->mem_off_first[0];
+ reg_off = info->reg_off_first[0];
+ reg_last = info->reg_off_split;
+ if (reg_last < 0) {
+ reg_last = info->reg_off_last[0];
+ }
-DO_LDN_2(2, dd, 8)
-DO_LDN_2(3, dd, 8)
-DO_LDN_2(4, dd, 8)
+ do {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+                    check(env, mtedesc, addr + mem_off, ra);
+ }
+ reg_off += esize;
+ mem_off += msize;
+ } while (reg_off <= reg_last && (reg_off & 63));
+ } while (reg_off <= reg_last);
+ }
+
+ mem_off = info->mem_off_first[1];
+ if (mem_off >= 0 && arm_tlb_mte_tagged(&info->page[1].attrs)) {
+ reg_off = info->reg_off_first[1];
+ reg_last = info->reg_off_last[1];
-#undef DO_LDN_1
-#undef DO_LDN_2
+ do {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+                    check(env, mtedesc, addr + mem_off, ra);
+ }
+ reg_off += esize;
+ mem_off += msize;
+ } while (reg_off & 63);
+ } while (reg_off <= reg_last);
+ }
+}
-/*
- * Load contiguous data, first-fault and no-fault.
- *
- * For user-only, one could argue that we should hold the mmap_lock during
- * the operation so that there is no race between page_check_range and the
- * load operation. However, unmapping pages out from under a running thread
- * is extraordinarily unlikely. This theoretical race condition also affects
- * linux-user/ in its get_user/put_user macros.
- *
- * TODO: Construct some helpers, written in assembly, that interact with
- * handle_cpu_signal to produce memory ops which can properly report errors
- * without racing.
- */
+typedef void sve_cont_ldst_mte_check_fn(SVEContLdSt *info, CPUARMState *env,
+ uint64_t *vg, target_ulong addr,
+ int esize, int msize, uint32_t mtedesc,
+ uintptr_t ra);
-/* Fault on byte I. All bits in FFR from I are cleared. The vector
- * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
- * option, which leaves subsequent data unchanged.
- */
-static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
+static void sve_cont_ldst_mte_check1(SVEContLdSt *info, CPUARMState *env,
+ uint64_t *vg, target_ulong addr,
+ int esize, int msize, uint32_t mtedesc,
+ uintptr_t ra)
{
- uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
+ sve_cont_ldst_mte_check_int(info, env, vg, addr, esize, msize,
+ mtedesc, ra, mte_check1);
+}
- if (i & 63) {
- ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
- i = ROUND_UP(i, 64);
- }
- for (; i < oprsz; i += 64) {
- ffr[i / 64] = 0;
- }
+static void sve_cont_ldst_mte_checkN(SVEContLdSt *info, CPUARMState *env,
+ uint64_t *vg, target_ulong addr,
+ int esize, int msize, uint32_t mtedesc,
+ uintptr_t ra)
+{
+ sve_cont_ldst_mte_check_int(info, env, vg, addr, esize, msize,
+ mtedesc, ra, mte_checkN);
}
+
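+/*
+ * For example (illustrative), an ld1b of a tagged page invokes
+ * mte_check1 once per active element, each call passing that
+ * element's own address (addr + mem_off), so a tag-check failure
+ * reports the precise faulting address.
+ */
+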
/*
- * Common helper for all contiguous first-fault loads.
+ * Common helper for all contiguous 1,2,3,4-register predicated loads.
*/
-static void sve_ldff1_r(CPUARMState *env, void *vg, const target_ulong addr,
- uint32_t desc, const uintptr_t retaddr,
- const int esz, const int msz,
- sve_ld1_host_fn *host_fn,
- sve_ld1_tlb_fn *tlb_fn)
-{
- const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
- const int mmu_idx = get_mmuidx(oi);
- const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
- void *vd = &env->vfp.zregs[rd];
- const int diffsz = esz - msz;
+static inline QEMU_ALWAYS_INLINE
+void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
+ uint32_t desc, const uintptr_t retaddr,
+ const int esz, const int msz, const int N, uint32_t mtedesc,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn,
+ sve_cont_ldst_mte_check_fn *mte_check_fn)
+{
+ const unsigned rd = simd_data(desc);
const intptr_t reg_max = simd_oprsz(desc);
- const intptr_t mem_max = reg_max >> diffsz;
- intptr_t split, reg_off, mem_off;
+ intptr_t reg_off, reg_last, mem_off;
+ SVEContLdSt info;
void *host;
+ int flags, i;
- /* Skip to the first active element. */
- reg_off = find_next_active(vg, 0, reg_max, esz);
- if (unlikely(reg_off == reg_max)) {
+ /* Find the active elements. */
+ if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
/* The entire predicate was false; no load occurs. */
- memset(vd, 0, reg_max);
+ for (i = 0; i < N; ++i) {
+ memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
+ }
return;
}
- mem_off = reg_off >> diffsz;
- set_helper_retaddr(retaddr);
+
+ /* Probe the page(s). Exit with exception for any invalid page. */
+ sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
+
+ /* Handle watchpoints for all active elements. */
+ sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
+ BP_MEM_READ, retaddr);
/*
- * If the (remaining) load is entirely within a single page, then:
- * For softmmu, and the tlb hits, then no faults will occur;
- * For user-only, either the first load will fault or none will.
- * We can thus perform the load directly to the destination and
- * Vd will be unmodified on any exception path.
+ * Handle mte checks for all active elements.
+ * Since TBI must be set for MTE, !mtedesc => !mte_active.
*/
- split = max_for_page(addr, mem_off, mem_max);
- if (likely(split == mem_max)) {
- host = tlb_vaddr_to_host(env, addr + mem_off, MMU_DATA_LOAD, mmu_idx);
- if (test_host_page(host)) {
- mem_off = host_fn(vd, vg, host - mem_off, mem_off, mem_max);
- tcg_debug_assert(mem_off == mem_max);
- clear_helper_retaddr();
- /* After any fault, zero any leading inactive elements. */
- swap_memzero(vd, reg_off);
- return;
- }
+ if (mte_check_fn && mtedesc) {
+ mte_check_fn(&info, env, vg, addr, 1 << esz, N << msz,
+ mtedesc, retaddr);
}
+ flags = info.page[0].flags | info.page[1].flags;
+ if (unlikely(flags != 0)) {
#ifdef CONFIG_USER_ONLY
- /*
- * The page(s) containing this first element at ADDR+MEM_OFF must
- * be valid. Considering that this first element may be misaligned
- * and cross a page boundary itself, take the rest of the page from
- * the last byte of the element.
- */
- split = max_for_page(addr, mem_off + (1 << msz) - 1, mem_max);
- mem_off = host_fn(vd, vg, g2h(addr), mem_off, split);
-
- /* After any fault, zero any leading inactive elements. */
- swap_memzero(vd, reg_off);
- reg_off = mem_off << diffsz;
+ g_assert_not_reached();
#else
+ /*
+ * At least one page includes MMIO.
+ * Any bus operation can fail with cpu_transaction_failed,
+ * which for ARM will raise SyncExternal. Perform the load
+ * into scratch memory to preserve register state until the end.
+ */
+ ARMVectorReg scratch[4] = { };
+
+ mem_off = info.mem_off_first[0];
+ reg_off = info.reg_off_first[0];
+ reg_last = info.reg_off_last[1];
+ if (reg_last < 0) {
+ reg_last = info.reg_off_split;
+ if (reg_last < 0) {
+ reg_last = info.reg_off_last[0];
+ }
+ }
+
+ do {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+ for (i = 0; i < N; ++i) {
+ tlb_fn(env, &scratch[i], reg_off,
+ addr + mem_off + (i << msz), retaddr);
+ }
+ }
+ reg_off += 1 << esz;
+ mem_off += N << msz;
+ } while (reg_off & 63);
+ } while (reg_off <= reg_last);
+
+ for (i = 0; i < N; ++i) {
+ memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
+ }
+ return;
+#endif
+ }
+
+ /* The entire operation is in RAM, on valid pages. */
+
+ for (i = 0; i < N; ++i) {
+ memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
+ }
+
+ mem_off = info.mem_off_first[0];
+ reg_off = info.reg_off_first[0];
+ reg_last = info.reg_off_last[0];
+ host = info.page[0].host;
+
+ while (reg_off <= reg_last) {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+ for (i = 0; i < N; ++i) {
+ host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
+ host + mem_off + (i << msz));
+ }
+ }
+ reg_off += 1 << esz;
+ mem_off += N << msz;
+ } while (reg_off <= reg_last && (reg_off & 63));
+ }
+
/*
- * Perform one normal read, which will fault or not.
- * But it is likely to bring the page into the tlb.
+ * Use the slow path to manage the cross-page misalignment.
+ * But we know this is RAM and cannot trap.
*/
- tlb_fn(env, vd, reg_off, addr + mem_off, oi, retaddr);
-
- /* After any fault, zero any leading predicated false elts. */
- swap_memzero(vd, reg_off);
- mem_off += 1 << msz;
- reg_off += 1 << esz;
-
- /* Try again to read the balance of the page. */
- split = max_for_page(addr, mem_off - 1, mem_max);
- if (split >= (1 << msz)) {
- host = tlb_vaddr_to_host(env, addr + mem_off, MMU_DATA_LOAD, mmu_idx);
- if (host) {
- mem_off = host_fn(vd, vg, host - mem_off, mem_off, split);
- reg_off = mem_off << diffsz;
+ mem_off = info.mem_off_split;
+ if (unlikely(mem_off >= 0)) {
+ reg_off = info.reg_off_split;
+ for (i = 0; i < N; ++i) {
+ tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
+ addr + mem_off + (i << msz), retaddr);
}
}
-#endif
- clear_helper_retaddr();
- record_fault(env, reg_off, reg_max);
+ mem_off = info.mem_off_first[1];
+ if (unlikely(mem_off >= 0)) {
+ reg_off = info.reg_off_first[1];
+ reg_last = info.reg_off_last[1];
+ host = info.page[1].host;
+
+ do {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+ for (i = 0; i < N; ++i) {
+ host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
+ host + mem_off + (i << msz));
+ }
+ }
+ reg_off += 1 << esz;
+ mem_off += N << msz;
+ } while (reg_off & 63);
+ } while (reg_off <= reg_last);
+ }
}
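+
+/*
+ * Shape of sve_ldN_r above, as a summary (not normative): find the
+ * active elements, probe the page(s), run watchpoint and MTE checks,
+ * then either load directly through the host pointers when both
+ * pages are plain RAM, or fall back to tlb_fn into scratch registers
+ * when residual flags (e.g. TLB_MMIO) remain, committing to the
+ * zregs only after all faults could have been taken.
+ */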
+static inline QEMU_ALWAYS_INLINE
+void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
+ uint32_t desc, const uintptr_t ra,
+ const int esz, const int msz, const int N,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
+{
+ uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+ int bit55 = extract64(addr, 55, 1);
+
+ /* Remove mtedesc from the normal sve descriptor. */
+ desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+
+ /* Perform gross MTE suppression early. */
+ if (!tbi_check(desc, bit55) ||
+ tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
+ mtedesc = 0;
+ }
+
+ sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn,
+ N == 1 ? sve_cont_ldst_mte_check1 : sve_cont_ldst_mte_checkN);
+}
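+
+/*
+ * Note the descriptor split above (illustrative): the MTE fields
+ * live in the bits at and above SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT,
+ * so the right shift recovers mtedesc while extract32() keeps only
+ * the plain SVE descriptor that sve_ldN_r expects.
+ */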
+
+#define DO_LD1_1(NAME, ESZ) \
+void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \
+ sve_##NAME##_host, sve_##NAME##_tlb, NULL); \
+} \
+void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \
+ sve_##NAME##_host, sve_##NAME##_tlb); \
+}
+
+#define DO_LD1_2(NAME, ESZ, MSZ) \
+void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
+ sve_##NAME##_le_host, sve_##NAME##_le_tlb, NULL); \
+} \
+void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
+ sve_##NAME##_be_host, sve_##NAME##_be_tlb, NULL); \
+} \
+void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
+ sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
+} \
+void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
+ sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
+}
+
+DO_LD1_1(ld1bb, MO_8)
+DO_LD1_1(ld1bhu, MO_16)
+DO_LD1_1(ld1bhs, MO_16)
+DO_LD1_1(ld1bsu, MO_32)
+DO_LD1_1(ld1bss, MO_32)
+DO_LD1_1(ld1bdu, MO_64)
+DO_LD1_1(ld1bds, MO_64)
+
+DO_LD1_2(ld1hh, MO_16, MO_16)
+DO_LD1_2(ld1hsu, MO_32, MO_16)
+DO_LD1_2(ld1hss, MO_32, MO_16)
+DO_LD1_2(ld1hdu, MO_64, MO_16)
+DO_LD1_2(ld1hds, MO_64, MO_16)
+
+DO_LD1_2(ld1ss, MO_32, MO_32)
+DO_LD1_2(ld1sdu, MO_64, MO_32)
+DO_LD1_2(ld1sds, MO_64, MO_32)
+
+DO_LD1_2(ld1dd, MO_64, MO_64)
+
+#undef DO_LD1_1
+#undef DO_LD1_2
+
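/*
 * For reference, one expansion of the DO_LD1_1/DO_LD1_2 macros above
 * (a sketch of the generated code, not new code): DO_LD1_1(ld1bb, MO_8)
 * defines
 *
 *     void HELPER(sve_ld1bb_r)(CPUARMState *env, void *vg,
 *                              target_ulong addr, uint32_t desc)
 *     {
 *         sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, 1, 0,
 *                   sve_ld1bb_host, sve_ld1bb_tlb, NULL);
 *     }
 *
 * plus the matching sve_ld1bb_r_mte wrapper, which routes through
 * sve_ldN_r_mte() instead.
 */
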
+#define DO_LDN_1(N) \
+void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \
+ sve_ld1bb_host, sve_ld1bb_tlb, NULL); \
+} \
+void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \
+ sve_ld1bb_host, sve_ld1bb_tlb); \
+}
+
+#define DO_LDN_2(N, SUFF, ESZ) \
+void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
+ sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb, NULL); \
+} \
+void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
+ sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb, NULL); \
+} \
+void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
+ sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
+} \
+void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
+ sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
+}
+
+DO_LDN_1(2)
+DO_LDN_1(3)
+DO_LDN_1(4)
+
+DO_LDN_2(2, hh, MO_16)
+DO_LDN_2(3, hh, MO_16)
+DO_LDN_2(4, hh, MO_16)
+
+DO_LDN_2(2, ss, MO_32)
+DO_LDN_2(3, ss, MO_32)
+DO_LDN_2(4, ss, MO_32)
+
+DO_LDN_2(2, dd, MO_64)
+DO_LDN_2(3, dd, MO_64)
+DO_LDN_2(4, dd, MO_64)
+
+#undef DO_LDN_1
+#undef DO_LDN_2
+
/*
- * Common helper for all contiguous no-fault loads.
+ * Load contiguous data, first-fault and no-fault.
+ *
+ * For user-only, one could argue that we should hold the mmap_lock during
+ * the operation so that there is no race between page_check_range and the
+ * load operation. However, unmapping pages out from under a running thread
+ * is extraordinarily unlikely. This theoretical race condition also affects
+ * linux-user/ in its get_user/put_user macros.
+ *
+ * TODO: Construct some helpers, written in assembly, that interact with
+ * handle_cpu_signal to produce memory ops which can properly report errors
+ * without racing.
+ */
+
+/* Fault on byte I. All bits in FFR from I are cleared. The vector
+ * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
+ * option, which leaves subsequent data unchanged.
*/
-static void sve_ldnf1_r(CPUARMState *env, void *vg, const target_ulong addr,
- uint32_t desc, const int esz, const int msz,
- sve_ld1_host_fn *host_fn)
+static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
{
- const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
+ uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
+
+ if (i & 63) {
+ ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
+ i = ROUND_UP(i, 64);
+ }
+ for (; i < oprsz; i += 64) {
+ ffr[i / 64] = 0;
+ }
+}
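
/*
 * Worked example for record_fault (a sketch): with oprsz = 256 and a
 * fault at i = 70, "i & 63" is 6, so ffr[1] &= MAKE_64BIT_MASK(0, 6)
 * preserves FFR bits 64..69 and clears bits 70..127; i then rounds up
 * to 128 and the loop zeroes ffr[2] and ffr[3]. ffr[0], covering the
 * elements before the fault, is left untouched.
 */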
+
+/*
+ * Common helper for all contiguous no-fault and first-fault loads.
+ */
+static inline QEMU_ALWAYS_INLINE
+void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
+ uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
+ const int esz, const int msz, const SVEContFault fault,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
+{
+ const unsigned rd = simd_data(desc);
void *vd = &env->vfp.zregs[rd];
- const int diffsz = esz - msz;
const intptr_t reg_max = simd_oprsz(desc);
- const intptr_t mem_max = reg_max >> diffsz;
- const int mmu_idx = cpu_mmu_index(env, false);
- intptr_t split, reg_off, mem_off;
+ intptr_t reg_off, mem_off, reg_last;
+ SVEContLdSt info;
+ int flags;
void *host;
-#ifdef CONFIG_USER_ONLY
- host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD, mmu_idx);
- if (likely(page_check_range(addr, mem_max, PAGE_READ) == 0)) {
- /* The entire operation is valid and will not fault. */
- host_fn(vd, vg, host, 0, mem_max);
+ /* Find the active elements. */
+ if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
+ /* The entire predicate was false; no load occurs. */
+ memset(vd, 0, reg_max);
return;
}
-#endif
+ reg_off = info.reg_off_first[0];
- /* There will be no fault, so we may modify in advance. */
- memset(vd, 0, reg_max);
+ /* Probe the page(s). */
+ if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
+ /* Fault on first element. */
+ tcg_debug_assert(fault == FAULT_NO);
+ memset(vd, 0, reg_max);
+ goto do_fault;
+ }
- /* Skip to the first active element. */
- reg_off = find_next_active(vg, 0, reg_max, esz);
- if (unlikely(reg_off == reg_max)) {
- /* The entire predicate was false; no load occurs. */
- return;
+ mem_off = info.mem_off_first[0];
+ flags = info.page[0].flags;
+
+ /*
+ * Disable MTE checking if the Tagged bit is not set. Since TBI must
+ * be set for MTE, !mtedesc => !mte_active.
+ */
+ if (!arm_tlb_mte_tagged(&info.page[0].attrs)) {
+ mtedesc = 0;
}
- mem_off = reg_off >> diffsz;
-#ifdef CONFIG_USER_ONLY
- if (page_check_range(addr + mem_off, 1 << msz, PAGE_READ) == 0) {
- /* At least one load is valid; take the rest of the page. */
- split = max_for_page(addr, mem_off + (1 << msz) - 1, mem_max);
- mem_off = host_fn(vd, vg, host, mem_off, split);
- reg_off = mem_off << diffsz;
+ if (fault == FAULT_FIRST) {
+ /* Trapping mte check for the first-fault element. */
+ if (mtedesc) {
+ mte_check1(env, mtedesc, addr + mem_off, retaddr);
+ }
+
+ /*
+ * Special handling of the first active element,
+ * if it crosses a page boundary or is MMIO.
+ */
+ bool is_split = mem_off == info.mem_off_split;
+ if (unlikely(flags != 0) || unlikely(is_split)) {
+ /*
+ * Use the slow path for cross-page handling.
+ * Might trap for MMIO or watchpoints.
+ */
+ tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
+
+ /* After any fault, zero the other elements. */
+ swap_memzero(vd, reg_off);
+ reg_off += 1 << esz;
+ mem_off += 1 << msz;
+ swap_memzero(vd + reg_off, reg_max - reg_off);
+
+ if (is_split) {
+ goto second_page;
+ }
+ } else {
+ memset(vd, 0, reg_max);
+ }
+ } else {
+ memset(vd, 0, reg_max);
+ if (unlikely(mem_off == info.mem_off_split)) {
+ /* The first active element crosses a page boundary. */
+ flags |= info.page[1].flags;
+ if (unlikely(flags & TLB_MMIO)) {
+ /* Some page is MMIO, see below. */
+ goto do_fault;
+ }
+ if (unlikely(flags & TLB_WATCHPOINT) &&
+ (cpu_watchpoint_address_matches
+ (env_cpu(env), addr + mem_off, 1 << msz)
+ & BP_MEM_READ)) {
+ /* Watchpoint hit, see below. */
+ goto do_fault;
+ }
+ if (mtedesc && !mte_probe1(env, mtedesc, addr + mem_off)) {
+ goto do_fault;
+ }
+ /*
+ * Use the slow path for cross-page handling.
+ * This is RAM, without a watchpoint, and will not trap.
+ */
+ tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
+ goto second_page;
+ }
}
-#else
+
/*
- * If the address is not in the TLB, we have no way to bring the
- * entry into the TLB without also risking a fault. Note that
- * the corollary is that we never load from an address not in RAM.
+ * From this point on, all memory operations are MemSingleNF.
*
- * This last is out of spec, in a weird corner case.
- * Per the MemNF/MemSingleNF pseudocode, a NF load from Device memory
- * must not actually hit the bus -- it returns UNKNOWN data instead.
- * But if you map non-RAM with Normal memory attributes and do a NF
- * load then it should access the bus. (Nobody ought actually do this
- * in the real world, obviously.)
+ * Per the MemSingleNF pseudocode, a no-fault load from Device memory
+ * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
*
- * Then there are the annoying special cases with watchpoints...
- * TODO: Add a form of non-faulting loads using cc->tlb_fill(probe=true).
+ * Unfortunately we do not have access to the memory attributes from the
+ * PTE to tell Device memory from Normal memory. So we make a mostly
+ * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
+ * This gives the right answer for the common cases of "Normal memory,
+ * backed by host RAM" and "Device memory, backed by MMIO".
+ * The architecture allows us to suppress an NF load and return
+ * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
+ * case of "Normal memory, backed by MMIO" is permitted. The case we
+ * get wrong is "Device memory, backed by host RAM", for which we
+ * should return (UNKNOWN, FAULT) but do not.
+ *
+ * Similarly, CPU_BP breakpoints would raise exceptions, and so
+ * return (UNKNOWN, FAULT). For simplicity, we consider gdb and
+ * architectural breakpoints the same.
*/
- host = tlb_vaddr_to_host(env, addr + mem_off, MMU_DATA_LOAD, mmu_idx);
- split = max_for_page(addr, mem_off, mem_max);
- if (host && split >= (1 << msz)) {
- mem_off = host_fn(vd, vg, host - mem_off, mem_off, split);
- reg_off = mem_off << diffsz;
+ if (unlikely(flags & TLB_MMIO)) {
+ goto do_fault;
+ }
+
+ reg_last = info.reg_off_last[0];
+ host = info.page[0].host;
+
+ do {
+ uint64_t pg = *(uint64_t *)(vg + (reg_off >> 6) * 8);
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+ if (unlikely(flags & TLB_WATCHPOINT) &&
+ (cpu_watchpoint_address_matches
+ (env_cpu(env), addr + mem_off, 1 << msz)
+ & BP_MEM_READ)) {
+ goto do_fault;
+ }
+ if (mtedesc && !mte_probe1(env, mtedesc, addr + mem_off)) {
+ goto do_fault;
+ }
+ host_fn(vd, reg_off, host + mem_off);
+ }
+ reg_off += 1 << esz;
+ mem_off += 1 << msz;
+ } while (reg_off <= reg_last && (reg_off & 63));
+ } while (reg_off <= reg_last);
+
+ /*
+ * MemSingleNF is allowed to fail for any reason. We have special
+ * code above to handle the first element crossing a page boundary.
+ * As an implementation choice, decline to handle a cross-page element
+ * in any other position.
+ */
+ reg_off = info.reg_off_split;
+ if (reg_off >= 0) {
+ goto do_fault;
}
-#endif
+ second_page:
+ reg_off = info.reg_off_first[1];
+ if (likely(reg_off < 0)) {
+ /* No active elements on the second page. All done. */
+ return;
+ }
+
+ /*
+ * MemSingleNF is allowed to fail for any reason. As an implementation
+ * choice, decline to handle elements on the second page. This should
+ * be low frequency as the guest walks through memory -- the next
+ * iteration of the guest's loop should be aligned on the page boundary,
+ * and then all following iterations will stay aligned.
+ */
+
+ do_fault:
record_fault(env, reg_off, reg_max);
}
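
/*
 * How the fault modes map onto the instructions handled here (see the
 * SVEContFault enum defined earlier in this file):
 *   FAULT_ALL   - LD1/ST1 and friends: every active element may fault
 *                 normally.
 *   FAULT_FIRST - LDFF1: only the first active element may fault; any
 *                 later problem clears FFR from that element onward.
 *   FAULT_NO    - LDNF1: no element may fault; any would-be fault just
 *                 clears FFR from that element onward.
 */
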
-#define DO_LDFF1_LDNF1_1(PART, ESZ) \
+static inline QEMU_ALWAYS_INLINE
+void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
+ uint32_t desc, const uintptr_t retaddr,
+ const int esz, const int msz, const SVEContFault fault,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
+{
+ uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+ int bit55 = extract64(addr, 55, 1);
+
+ /* Remove mtedesc from the normal sve descriptor. */
+ desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+
+ /* Perform gross MTE suppression early. */
+ if (!tbi_check(desc, bit55) ||
+ tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
+ mtedesc = 0;
+ }
+
+ sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
+ esz, msz, fault, host_fn, tlb_fn);
+}
+
+#define DO_LDFF1_LDNF1_1(PART, ESZ) \
void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
target_ulong addr, uint32_t desc) \
{ \
- sve_ldff1_r(env, vg, addr, desc, GETPC(), ESZ, 0, \
- sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
+ sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
+ sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
} \
void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
target_ulong addr, uint32_t desc) \
{ \
- sve_ldnf1_r(env, vg, addr, desc, ESZ, 0, sve_ld1##PART##_host); \
+ sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
+ sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
+} \
+void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
+ sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
+} \
+void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
+ sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
}

-#define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
+#define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
target_ulong addr, uint32_t desc) \
{ \
- sve_ldff1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, \
- sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
+ sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
+ sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
} \
void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
target_ulong addr, uint32_t desc) \
{ \
- sve_ldnf1_r(env, vg, addr, desc, ESZ, MSZ, sve_ld1##PART##_le_host); \
+ sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
+ sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
} \
void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
target_ulong addr, uint32_t desc) \
{ \
- sve_ldff1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, \
- sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
+ sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
+ sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
} \
void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
target_ulong addr, uint32_t desc) \
{ \
- sve_ldnf1_r(env, vg, addr, desc, ESZ, MSZ, sve_ld1##PART##_be_host); \
+ sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
+ sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
+} \
+void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
+ sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
+} \
+void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
+ sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
+} \
+void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
+ sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
+} \
+void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
+ sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
}

-DO_LDFF1_LDNF1_1(bb, 0)
-DO_LDFF1_LDNF1_1(bhu, 1)
-DO_LDFF1_LDNF1_1(bhs, 1)
-DO_LDFF1_LDNF1_1(bsu, 2)
-DO_LDFF1_LDNF1_1(bss, 2)
-DO_LDFF1_LDNF1_1(bdu, 3)
-DO_LDFF1_LDNF1_1(bds, 3)
+DO_LDFF1_LDNF1_1(bb, MO_8)
+DO_LDFF1_LDNF1_1(bhu, MO_16)
+DO_LDFF1_LDNF1_1(bhs, MO_16)
+DO_LDFF1_LDNF1_1(bsu, MO_32)
+DO_LDFF1_LDNF1_1(bss, MO_32)
+DO_LDFF1_LDNF1_1(bdu, MO_64)
+DO_LDFF1_LDNF1_1(bds, MO_64)

-DO_LDFF1_LDNF1_2(hh, 1, 1)
-DO_LDFF1_LDNF1_2(hsu, 2, 1)
-DO_LDFF1_LDNF1_2(hss, 2, 1)
-DO_LDFF1_LDNF1_2(hdu, 3, 1)
-DO_LDFF1_LDNF1_2(hds, 3, 1)
+DO_LDFF1_LDNF1_2(hh, MO_16, MO_16)
+DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
+DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
+DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
+DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)

-DO_LDFF1_LDNF1_2(ss, 2, 2)
-DO_LDFF1_LDNF1_2(sdu, 3, 2)
-DO_LDFF1_LDNF1_2(sds, 3, 2)
+DO_LDFF1_LDNF1_2(ss, MO_32, MO_32)
+DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
+DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)

-DO_LDFF1_LDNF1_2(dd, 3, 3)
+DO_LDFF1_LDNF1_2(dd, MO_64, MO_64)

#undef DO_LDFF1_LDNF1_1
#undef DO_LDFF1_LDNF1_2

/*
- * Store contiguous data, protected by a governing predicate.
+ * Common helper for all contiguous 1,2,3,4-register predicated stores.
*/

-#ifdef CONFIG_SOFTMMU
-#define DO_ST_TLB(NAME, H, TYPEM, HOST, MOEND, TLB) \
-static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
- target_ulong addr, TCGMemOpIdx oi, uintptr_t ra) \
-{ \
- TLB(env, addr, *(TYPEM *)(vd + H(reg_off)), oi, ra); \
-}
-#else
-#define DO_ST_TLB(NAME, H, TYPEM, HOST, MOEND, TLB) \
-static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
- target_ulong addr, TCGMemOpIdx oi, uintptr_t ra) \
-{ \
- HOST(g2h(addr), *(TYPEM *)(vd + H(reg_off))); \
-}
-#endif
-
-DO_ST_TLB(st1bb, H1, uint8_t, stb_p, 0, helper_ret_stb_mmu)
-DO_ST_TLB(st1bh, H1_2, uint16_t, stb_p, 0, helper_ret_stb_mmu)
-DO_ST_TLB(st1bs, H1_4, uint32_t, stb_p, 0, helper_ret_stb_mmu)
-DO_ST_TLB(st1bd, , uint64_t, stb_p, 0, helper_ret_stb_mmu)
-
-DO_ST_TLB(st1hh_le, H1_2, uint16_t, stw_le_p, MO_LE, helper_le_stw_mmu)
-DO_ST_TLB(st1hs_le, H1_4, uint32_t, stw_le_p, MO_LE, helper_le_stw_mmu)
-DO_ST_TLB(st1hd_le, , uint64_t, stw_le_p, MO_LE, helper_le_stw_mmu)
-
-DO_ST_TLB(st1ss_le, H1_4, uint32_t, stl_le_p, MO_LE, helper_le_stl_mmu)
-DO_ST_TLB(st1sd_le, , uint64_t, stl_le_p, MO_LE, helper_le_stl_mmu)
-
-DO_ST_TLB(st1dd_le, , uint64_t, stq_le_p, MO_LE, helper_le_stq_mmu)
+static inline QEMU_ALWAYS_INLINE
+void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
+ uint32_t desc, const uintptr_t retaddr,
+ const int esz, const int msz, const int N, uint32_t mtedesc,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn,
+ sve_cont_ldst_mte_check_fn *mte_check_fn)
+{
+ const unsigned rd = simd_data(desc);
+ const intptr_t reg_max = simd_oprsz(desc);
+ intptr_t reg_off, reg_last, mem_off;
+ SVEContLdSt info;
+ void *host;
+ int i, flags;
-DO_ST_TLB(st1hh_be, H1_2, uint16_t, stw_be_p, MO_BE, helper_be_stw_mmu)
-DO_ST_TLB(st1hs_be, H1_4, uint32_t, stw_be_p, MO_BE, helper_be_stw_mmu)
-DO_ST_TLB(st1hd_be, , uint64_t, stw_be_p, MO_BE, helper_be_stw_mmu)
+ /* Find the active elements. */
+ if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
+ /* The entire predicate was false; no store occurs. */
+ return;
+ }
-DO_ST_TLB(st1ss_be, H1_4, uint32_t, stl_be_p, MO_BE, helper_be_stl_mmu)
-DO_ST_TLB(st1sd_be, , uint64_t, stl_be_p, MO_BE, helper_be_stl_mmu)
+ /* Probe the page(s). Exit with exception for any invalid page. */
+ sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
-DO_ST_TLB(st1dd_be, , uint64_t, stq_be_p, MO_BE, helper_be_stq_mmu)
+ /* Handle watchpoints for all active elements. */
+ sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
+ BP_MEM_WRITE, retaddr);
-#undef DO_ST_TLB
+ /*
+ * Handle mte checks for all active elements.
+ * Since TBI must be set for MTE, !mtedesc => !mte_active.
+ */
+ if (mte_check_fn && mtedesc) {
+ mte_check_fn(&info, env, vg, addr, 1 << esz, N << msz,
+ mtedesc, retaddr);
+ }
-/*
- * Common helpers for all contiguous 1,2,3,4-register predicated stores.
- */
-static void sve_st1_r(CPUARMState *env, void *vg, target_ulong addr,
- uint32_t desc, const uintptr_t ra,
- const int esize, const int msize,
- sve_st1_tlb_fn *tlb_fn)
-{
- const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
- const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
- intptr_t i, oprsz = simd_oprsz(desc);
- void *vd = &env->vfp.zregs[rd];
+ flags = info.page[0].flags | info.page[1].flags;
+ if (unlikely(flags != 0)) {
+#ifdef CONFIG_USER_ONLY
+ g_assert_not_reached();
+#else
+ /*
+ * At least one page includes MMIO.
+ * Any bus operation can fail with cpu_transaction_failed,
+ * which for ARM will raise SyncExternal. We cannot avoid
+ * this fault and will leave with the store incomplete.
+ */
+ mem_off = info.mem_off_first[0];
+ reg_off = info.reg_off_first[0];
+ reg_last = info.reg_off_last[1];
+ if (reg_last < 0) {
+ reg_last = info.reg_off_split;
+ if (reg_last < 0) {
+ reg_last = info.reg_off_last[0];
+ }
+ }
- set_helper_retaddr(ra);
- for (i = 0; i < oprsz; ) {
- uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
do {
- if (pg & 1) {
- tlb_fn(env, vd, i, addr, oi, ra);
- }
- i += esize, pg >>= esize;
- addr += msize;
- } while (i & 15);
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+ for (i = 0; i < N; ++i) {
+ tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
+ addr + mem_off + (i << msz), retaddr);
+ }
+ }
+ reg_off += 1 << esz;
+ mem_off += N << msz;
+ } while (reg_off & 63);
+ } while (reg_off <= reg_last);
+ return;
+#endif
}
- clear_helper_retaddr();
-}
-
-static void sve_st2_r(CPUARMState *env, void *vg, target_ulong addr,
- uint32_t desc, const uintptr_t ra,
- const int esize, const int msize,
- sve_st1_tlb_fn *tlb_fn)
-{
- const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
- const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
- intptr_t i, oprsz = simd_oprsz(desc);
- void *d1 = &env->vfp.zregs[rd];
- void *d2 = &env->vfp.zregs[(rd + 1) & 31];
+ mem_off = info.mem_off_first[0];
+ reg_off = info.reg_off_first[0];
+ reg_last = info.reg_off_last[0];
+ host = info.page[0].host;
- set_helper_retaddr(ra);
- for (i = 0; i < oprsz; ) {
- uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
+ while (reg_off <= reg_last) {
+ uint64_t pg = vg[reg_off >> 6];
do {
- if (pg & 1) {
- tlb_fn(env, d1, i, addr, oi, ra);
- tlb_fn(env, d2, i, addr + msize, oi, ra);
+ if ((pg >> (reg_off & 63)) & 1) {
+ for (i = 0; i < N; ++i) {
+ host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
+ host + mem_off + (i << msz));
+ }
}
- i += esize, pg >>= esize;
- addr += 2 * msize;
- } while (i & 15);
+ reg_off += 1 << esz;
+ mem_off += N << msz;
+ } while (reg_off <= reg_last && (reg_off & 63));
}
- clear_helper_retaddr();
-}
-
-static void sve_st3_r(CPUARMState *env, void *vg, target_ulong addr,
- uint32_t desc, const uintptr_t ra,
- const int esize, const int msize,
- sve_st1_tlb_fn *tlb_fn)
-{
- const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
- const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
- intptr_t i, oprsz = simd_oprsz(desc);
- void *d1 = &env->vfp.zregs[rd];
- void *d2 = &env->vfp.zregs[(rd + 1) & 31];
- void *d3 = &env->vfp.zregs[(rd + 2) & 31];
+ /*
+ * Use the slow path to manage the cross-page misalignment.
+ * But we know this is RAM and cannot trap.
+ */
+ mem_off = info.mem_off_split;
+ if (unlikely(mem_off >= 0)) {
+ reg_off = info.reg_off_split;
+ for (i = 0; i < N; ++i) {
+ tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
+ addr + mem_off + (i << msz), retaddr);
+ }
+ }
+
+ mem_off = info.mem_off_first[1];
+ if (unlikely(mem_off >= 0)) {
+ reg_off = info.reg_off_first[1];
+ reg_last = info.reg_off_last[1];
+ host = info.page[1].host;
- set_helper_retaddr(ra);
- for (i = 0; i < oprsz; ) {
- uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
do {
- if (pg & 1) {
- tlb_fn(env, d1, i, addr, oi, ra);
- tlb_fn(env, d2, i, addr + msize, oi, ra);
- tlb_fn(env, d3, i, addr + 2 * msize, oi, ra);
- }
- i += esize, pg >>= esize;
- addr += 3 * msize;
- } while (i & 15);
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if ((pg >> (reg_off & 63)) & 1) {
+ for (i = 0; i < N; ++i) {
+ host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
+ host + mem_off + (i << msz));
+ }
+ }
+ reg_off += 1 << esz;
+ mem_off += N << msz;
+ } while (reg_off & 63);
+ } while (reg_off <= reg_last);
}
- clear_helper_retaddr();
}
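
/*
 * Sketch of the dispatch above: after probing, any special TLB flag
 * (TLB_MMIO, etc.) on either page sends the whole store through the
 * per-element tlb_fn slow path, since a bus operation may still fail.
 * Otherwise the common case stores directly through the host pointers,
 * with the page-crossing element and the second page finished off
 * separately afterwards.
 */
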
-static void sve_st4_r(CPUARMState *env, void *vg, target_ulong addr,
- uint32_t desc, const uintptr_t ra,
- const int esize, const int msize,
- sve_st1_tlb_fn *tlb_fn)
+static inline QEMU_ALWAYS_INLINE
+void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
+ uint32_t desc, const uintptr_t ra,
+ const int esz, const int msz, const int N,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
{
- const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
- const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
- intptr_t i, oprsz = simd_oprsz(desc);
- void *d1 = &env->vfp.zregs[rd];
- void *d2 = &env->vfp.zregs[(rd + 1) & 31];
- void *d3 = &env->vfp.zregs[(rd + 2) & 31];
- void *d4 = &env->vfp.zregs[(rd + 3) & 31];
+ uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+ int bit55 = extract64(addr, 55, 1);
- set_helper_retaddr(ra);
- for (i = 0; i < oprsz; ) {
- uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
- do {
- if (pg & 1) {
- tlb_fn(env, d1, i, addr, oi, ra);
- tlb_fn(env, d2, i, addr + msize, oi, ra);
- tlb_fn(env, d3, i, addr + 2 * msize, oi, ra);
- tlb_fn(env, d4, i, addr + 3 * msize, oi, ra);
- }
- i += esize, pg >>= esize;
- addr += 4 * msize;
- } while (i & 15);
+ /* Remove mtedesc from the normal sve descriptor. */
+ desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+
+ /* Perform gross MTE suppression early. */
+ if (!tbi_check(desc, bit55) ||
+ tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
+ mtedesc = 0;
}
- clear_helper_retaddr();
+
+ sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn,
+ N == 1 ? sve_cont_ldst_mte_check1 : sve_cont_ldst_mte_checkN);
}

-#define DO_STN_1(N, NAME, ESIZE) \
-void QEMU_FLATTEN HELPER(sve_st##N##NAME##_r) \
- (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
-{ \
- sve_st##N##_r(env, vg, addr, desc, GETPC(), ESIZE, 1, \
- sve_st1##NAME##_tlb); \
+#define DO_STN_1(N, NAME, ESZ) \
+void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \
+ sve_st1##NAME##_host, sve_st1##NAME##_tlb, NULL); \
+} \
+void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \
+ sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
}

-#define DO_STN_2(N, NAME, ESIZE, MSIZE) \
-void QEMU_FLATTEN HELPER(sve_st##N##NAME##_le_r) \
- (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
-{ \
- sve_st##N##_r(env, vg, addr, desc, GETPC(), ESIZE, MSIZE, \
- sve_st1##NAME##_le_tlb); \
-} \
-void QEMU_FLATTEN HELPER(sve_st##N##NAME##_be_r) \
- (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
-{ \
- sve_st##N##_r(env, vg, addr, desc, GETPC(), ESIZE, MSIZE, \
- sve_st1##NAME##_be_tlb); \
-}
-
-DO_STN_1(1, bb, 1)
-DO_STN_1(1, bh, 2)
-DO_STN_1(1, bs, 4)
-DO_STN_1(1, bd, 8)
-DO_STN_1(2, bb, 1)
-DO_STN_1(3, bb, 1)
-DO_STN_1(4, bb, 1)
-
-DO_STN_2(1, hh, 2, 2)
-DO_STN_2(1, hs, 4, 2)
-DO_STN_2(1, hd, 8, 2)
-DO_STN_2(2, hh, 2, 2)
-DO_STN_2(3, hh, 2, 2)
-DO_STN_2(4, hh, 2, 2)
-
-DO_STN_2(1, ss, 4, 4)
-DO_STN_2(1, sd, 8, 4)
-DO_STN_2(2, ss, 4, 4)
-DO_STN_2(3, ss, 4, 4)
-DO_STN_2(4, ss, 4, 4)
-
-DO_STN_2(1, dd, 8, 8)
-DO_STN_2(2, dd, 8, 8)
-DO_STN_2(3, dd, 8, 8)
-DO_STN_2(4, dd, 8, 8)
+#define DO_STN_2(N, NAME, ESZ, MSZ) \
+void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
+ sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb, NULL); \
+} \
+void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
+ sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb, NULL); \
+} \
+void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
+ sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
+} \
+void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
+ target_ulong addr, uint32_t desc) \
+{ \
+ sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
+ sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
+}
+
+DO_STN_1(1, bb, MO_8)
+DO_STN_1(1, bh, MO_16)
+DO_STN_1(1, bs, MO_32)
+DO_STN_1(1, bd, MO_64)
+DO_STN_1(2, bb, MO_8)
+DO_STN_1(3, bb, MO_8)
+DO_STN_1(4, bb, MO_8)
+
+DO_STN_2(1, hh, MO_16, MO_16)
+DO_STN_2(1, hs, MO_32, MO_16)
+DO_STN_2(1, hd, MO_64, MO_16)
+DO_STN_2(2, hh, MO_16, MO_16)
+DO_STN_2(3, hh, MO_16, MO_16)
+DO_STN_2(4, hh, MO_16, MO_16)
+
+DO_STN_2(1, ss, MO_32, MO_32)
+DO_STN_2(1, sd, MO_64, MO_32)
+DO_STN_2(2, ss, MO_32, MO_32)
+DO_STN_2(3, ss, MO_32, MO_32)
+DO_STN_2(4, ss, MO_32, MO_32)
+
+DO_STN_2(1, dd, MO_64, MO_64)
+DO_STN_2(2, dd, MO_64, MO_64)
+DO_STN_2(3, dd, MO_64, MO_64)
+DO_STN_2(4, dd, MO_64, MO_64)

#undef DO_STN_1
#undef DO_STN_2
return *(uint64_t *)(reg + reg_ofs);
}
-static void sve_ld1_zs(CPUARMState *env, void *vd, void *vg, void *vm,
- target_ulong base, uint32_t desc, uintptr_t ra,
- zreg_off_fn *off_fn, sve_ld1_tlb_fn *tlb_fn)
+static inline QEMU_ALWAYS_INLINE
+void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
+ target_ulong base, uint32_t desc, uintptr_t retaddr,
+ uint32_t mtedesc, int esize, int msize,
+ zreg_off_fn *off_fn,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
{
- const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
- const int scale = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 2);
- intptr_t i, oprsz = simd_oprsz(desc);
- ARMVectorReg scratch = { };
+ const int mmu_idx = cpu_mmu_index(env, false);
+ const intptr_t reg_max = simd_oprsz(desc);
+ const int scale = simd_data(desc);
+ ARMVectorReg scratch;
+ intptr_t reg_off;
+ SVEHostPage info, info2;
- set_helper_retaddr(ra);
- for (i = 0; i < oprsz; ) {
- uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
+ memset(&scratch, 0, reg_max);
+ reg_off = 0;
+ do {
+ uint64_t pg = vg[reg_off >> 6];
do {
if (likely(pg & 1)) {
- target_ulong off = off_fn(vm, i);
- tlb_fn(env, &scratch, i, base + (off << scale), oi, ra);
+ target_ulong addr = base + (off_fn(vm, reg_off) << scale);
+ target_ulong in_page = -(addr | TARGET_PAGE_MASK);
+
+ sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
+ mmu_idx, retaddr);
+
+ if (likely(in_page >= msize)) {
+ if (unlikely(info.flags & TLB_WATCHPOINT)) {
+ cpu_check_watchpoint(env_cpu(env), addr, msize,
+ info.attrs, BP_MEM_READ, retaddr);
+ }
+ if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
+ mte_check1(env, mtedesc, addr, retaddr);
+ }
+ host_fn(&scratch, reg_off, info.host);
+ } else {
+ /* Element crosses the page boundary. */
+ sve_probe_page(&info2, false, env, addr + in_page, 0,
+ MMU_DATA_LOAD, mmu_idx, retaddr);
+ if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
+ cpu_check_watchpoint(env_cpu(env), addr,
+ msize, info.attrs,
+ BP_MEM_READ, retaddr);
+ }
+ if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
+ mte_check1(env, mtedesc, addr, retaddr);
+ }
+ tlb_fn(env, &scratch, reg_off, addr, retaddr);
+ }
}
- i += 4, pg >>= 4;
- } while (i & 15);
- }
- clear_helper_retaddr();
+ reg_off += esize;
+ pg >>= esize;
+ } while (reg_off & 63);
+ } while (reg_off < reg_max);
/* Wait until all exceptions have been raised to write back. */
- memcpy(vd, &scratch, oprsz);
+ memcpy(vd, &scratch, reg_max);
}
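
/*
 * The scratch buffer above is what gives the gather load its
 * all-or-nothing exception behaviour: a fault on any element unwinds
 * before the memcpy, leaving the destination register unmodified.
 * A minimal sketch of the pattern, with a hypothetical load_into
 * callback standing in for the predicated element loop:
 */
static void load_all_or_nothing(void *vd, size_t reg_max,
                                void (*load_into)(ARMVectorReg *scratch))
{
    ARMVectorReg scratch;

    memset(&scratch, 0, reg_max);
    load_into(&scratch);            /* may raise a guest exception */
    memcpy(vd, &scratch, reg_max);  /* commit only if nothing faulted */
}
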
-static void sve_ld1_zd(CPUARMState *env, void *vd, void *vg, void *vm,
- target_ulong base, uint32_t desc, uintptr_t ra,
- zreg_off_fn *off_fn, sve_ld1_tlb_fn *tlb_fn)
+static inline QEMU_ALWAYS_INLINE
+void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
+ target_ulong base, uint32_t desc, uintptr_t retaddr,
+ int esize, int msize, zreg_off_fn *off_fn,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
{
- const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
- const int scale = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 2);
- intptr_t i, oprsz = simd_oprsz(desc) / 8;
- ARMVectorReg scratch = { };
+ uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+ /* Remove mtedesc from the normal sve descriptor. */
+ desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
- set_helper_retaddr(ra);
- for (i = 0; i < oprsz; i++) {
- uint8_t pg = *(uint8_t *)(vg + H1(i));
- if (likely(pg & 1)) {
- target_ulong off = off_fn(vm, i * 8);
- tlb_fn(env, &scratch, i * 8, base + (off << scale), oi, ra);
- }
- }
- clear_helper_retaddr();
+ /*
+ * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
+ * offset base entirely over the address space hole to change the
+ * pointer tag, or change the bit55 selector. So we could examine
+ * TBI + TCMA here, as we do for sve_ldN_r_mte().
+ */
+ sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
+ esize, msize, off_fn, host_fn, tlb_fn);
+}
- /* Wait until all exceptions have been raised to write back. */
- memcpy(vd, &scratch, oprsz * 8);
-}
-
-#define DO_LD1_ZPZ_S(MEM, OFS) \
-void QEMU_FLATTEN HELPER(sve_ld##MEM##_##OFS) \
- (CPUARMState *env, void *vd, void *vg, void *vm, \
- target_ulong base, uint32_t desc) \
-{ \
- sve_ld1_zs(env, vd, vg, vm, base, desc, GETPC(), \
- off_##OFS##_s, sve_ld1##MEM##_tlb); \
-}
-
-#define DO_LD1_ZPZ_D(MEM, OFS) \
-void QEMU_FLATTEN HELPER(sve_ld##MEM##_##OFS) \
- (CPUARMState *env, void *vd, void *vg, void *vm, \
- target_ulong base, uint32_t desc) \
-{ \
- sve_ld1_zd(env, vd, vg, vm, base, desc, GETPC(), \
- off_##OFS##_d, sve_ld1##MEM##_tlb); \
-}
-
-DO_LD1_ZPZ_S(bsu, zsu)
-DO_LD1_ZPZ_S(bsu, zss)
-DO_LD1_ZPZ_D(bdu, zsu)
-DO_LD1_ZPZ_D(bdu, zss)
-DO_LD1_ZPZ_D(bdu, zd)
-
-DO_LD1_ZPZ_S(bss, zsu)
-DO_LD1_ZPZ_S(bss, zss)
-DO_LD1_ZPZ_D(bds, zsu)
-DO_LD1_ZPZ_D(bds, zss)
-DO_LD1_ZPZ_D(bds, zd)
-
-DO_LD1_ZPZ_S(hsu_le, zsu)
-DO_LD1_ZPZ_S(hsu_le, zss)
-DO_LD1_ZPZ_D(hdu_le, zsu)
-DO_LD1_ZPZ_D(hdu_le, zss)
-DO_LD1_ZPZ_D(hdu_le, zd)
-
-DO_LD1_ZPZ_S(hsu_be, zsu)
-DO_LD1_ZPZ_S(hsu_be, zss)
-DO_LD1_ZPZ_D(hdu_be, zsu)
-DO_LD1_ZPZ_D(hdu_be, zss)
-DO_LD1_ZPZ_D(hdu_be, zd)
-
-DO_LD1_ZPZ_S(hss_le, zsu)
-DO_LD1_ZPZ_S(hss_le, zss)
-DO_LD1_ZPZ_D(hds_le, zsu)
-DO_LD1_ZPZ_D(hds_le, zss)
-DO_LD1_ZPZ_D(hds_le, zd)
-
-DO_LD1_ZPZ_S(hss_be, zsu)
-DO_LD1_ZPZ_S(hss_be, zss)
-DO_LD1_ZPZ_D(hds_be, zsu)
-DO_LD1_ZPZ_D(hds_be, zss)
-DO_LD1_ZPZ_D(hds_be, zd)
-
-DO_LD1_ZPZ_S(ss_le, zsu)
-DO_LD1_ZPZ_S(ss_le, zss)
-DO_LD1_ZPZ_D(sdu_le, zsu)
-DO_LD1_ZPZ_D(sdu_le, zss)
-DO_LD1_ZPZ_D(sdu_le, zd)
-
-DO_LD1_ZPZ_S(ss_be, zsu)
-DO_LD1_ZPZ_S(ss_be, zss)
-DO_LD1_ZPZ_D(sdu_be, zsu)
-DO_LD1_ZPZ_D(sdu_be, zss)
-DO_LD1_ZPZ_D(sdu_be, zd)
-
-DO_LD1_ZPZ_D(sds_le, zsu)
-DO_LD1_ZPZ_D(sds_le, zss)
-DO_LD1_ZPZ_D(sds_le, zd)
-
-DO_LD1_ZPZ_D(sds_be, zsu)
-DO_LD1_ZPZ_D(sds_be, zss)
-DO_LD1_ZPZ_D(sds_be, zd)
-
-DO_LD1_ZPZ_D(dd_le, zsu)
-DO_LD1_ZPZ_D(dd_le, zss)
-DO_LD1_ZPZ_D(dd_le, zd)
-
-DO_LD1_ZPZ_D(dd_be, zsu)
-DO_LD1_ZPZ_D(dd_be, zss)
-DO_LD1_ZPZ_D(dd_be, zd)
+#define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
+void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
+ void *vm, target_ulong base, uint32_t desc) \
+{ \
+ sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
+ off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
+} \
+void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
+ void *vm, target_ulong base, uint32_t desc) \
+{ \
+ sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
+ off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
+}
+
+#define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
+void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
+ void *vm, target_ulong base, uint32_t desc) \
+{ \
+ sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
+ off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
+} \
+void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
+ void *vm, target_ulong base, uint32_t desc) \
+{ \
+ sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
+ off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
+}
+
+DO_LD1_ZPZ_S(bsu, zsu, MO_8)
+DO_LD1_ZPZ_S(bsu, zss, MO_8)
+DO_LD1_ZPZ_D(bdu, zsu, MO_8)
+DO_LD1_ZPZ_D(bdu, zss, MO_8)
+DO_LD1_ZPZ_D(bdu, zd, MO_8)
+
+DO_LD1_ZPZ_S(bss, zsu, MO_8)
+DO_LD1_ZPZ_S(bss, zss, MO_8)
+DO_LD1_ZPZ_D(bds, zsu, MO_8)
+DO_LD1_ZPZ_D(bds, zss, MO_8)
+DO_LD1_ZPZ_D(bds, zd, MO_8)
+
+DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
+DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
+DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
+DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
+DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
+
+DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
+DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
+DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
+DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
+DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
+
+DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
+DO_LD1_ZPZ_S(hss_le, zss, MO_16)
+DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
+DO_LD1_ZPZ_D(hds_le, zss, MO_16)
+DO_LD1_ZPZ_D(hds_le, zd, MO_16)
+
+DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
+DO_LD1_ZPZ_S(hss_be, zss, MO_16)
+DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
+DO_LD1_ZPZ_D(hds_be, zss, MO_16)
+DO_LD1_ZPZ_D(hds_be, zd, MO_16)
+
+DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
+DO_LD1_ZPZ_S(ss_le, zss, MO_32)
+DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
+DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
+DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
+
+DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
+DO_LD1_ZPZ_S(ss_be, zss, MO_32)
+DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
+DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
+DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
+
+DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
+DO_LD1_ZPZ_D(sds_le, zss, MO_32)
+DO_LD1_ZPZ_D(sds_le, zd, MO_32)
+
+DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
+DO_LD1_ZPZ_D(sds_be, zss, MO_32)
+DO_LD1_ZPZ_D(sds_be, zd, MO_32)
+
+DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
+DO_LD1_ZPZ_D(dd_le, zss, MO_64)
+DO_LD1_ZPZ_D(dd_le, zd, MO_64)
+
+DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
+DO_LD1_ZPZ_D(dd_be, zss, MO_64)
+DO_LD1_ZPZ_D(dd_be, zd, MO_64)
#undef DO_LD1_ZPZ_S
#undef DO_LD1_ZPZ_D
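
/*
 * For reference, one expansion of the gather macros above (a sketch of
 * the generated code, not new code): DO_LD1_ZPZ_D(dd_le, zd, MO_64)
 * defines
 *
 *     void HELPER(sve_lddd_le_zd)(CPUARMState *env, void *vd, void *vg,
 *                                 void *vm, target_ulong base, uint32_t desc)
 *     {
 *         sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 8,
 *                   off_zd_d, sve_ld1dd_le_host, sve_ld1dd_le_tlb);
 *     }
 *
 * i.e. a little-endian 64-bit gather (esize == msize == 8) indexed by
 * untruncated 64-bit offsets, plus the matching _mte wrapper.
 */
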
/* First fault loads with a vector index. */

-/* Load one element into VD+REG_OFF from (ENV,VADDR) without faulting.
- * The controlling predicate is known to be true. Return true if the
- * load was successful.
- */
-typedef bool sve_ld1_nf_fn(CPUARMState *env, void *vd, intptr_t reg_off,
- target_ulong vaddr, int mmu_idx);
-
-#ifdef CONFIG_SOFTMMU
-#define DO_LD_NF(NAME, H, TYPEE, TYPEM, HOST) \
-static bool sve_ld##NAME##_nf(CPUARMState *env, void *vd, intptr_t reg_off, \
- target_ulong addr, int mmu_idx) \
-{ \
- target_ulong next_page = -(addr | TARGET_PAGE_MASK); \
- if (likely(next_page - addr >= sizeof(TYPEM))) { \
- void *host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD, mmu_idx); \
- if (likely(host)) { \
- TYPEM val = HOST(host); \
- *(TYPEE *)(vd + H(reg_off)) = val; \
- return true; \
- } \
- } \
- return false; \
-}
-#else
-#define DO_LD_NF(NAME, H, TYPEE, TYPEM, HOST) \
-static bool sve_ld##NAME##_nf(CPUARMState *env, void *vd, intptr_t reg_off, \
- target_ulong addr, int mmu_idx) \
-{ \
- if (likely(page_check_range(addr, sizeof(TYPEM), PAGE_READ))) { \
- TYPEM val = HOST(g2h(addr)); \
- *(TYPEE *)(vd + H(reg_off)) = val; \
- return true; \
- } \
- return false; \
-}
-#endif
-
-DO_LD_NF(bsu, H1_4, uint32_t, uint8_t, ldub_p)
-DO_LD_NF(bss, H1_4, uint32_t, int8_t, ldsb_p)
-DO_LD_NF(bdu, , uint64_t, uint8_t, ldub_p)
-DO_LD_NF(bds, , uint64_t, int8_t, ldsb_p)
-
-DO_LD_NF(hsu_le, H1_4, uint32_t, uint16_t, lduw_le_p)
-DO_LD_NF(hss_le, H1_4, uint32_t, int16_t, ldsw_le_p)
-DO_LD_NF(hsu_be, H1_4, uint32_t, uint16_t, lduw_be_p)
-DO_LD_NF(hss_be, H1_4, uint32_t, int16_t, ldsw_be_p)
-DO_LD_NF(hdu_le, , uint64_t, uint16_t, lduw_le_p)
-DO_LD_NF(hds_le, , uint64_t, int16_t, ldsw_le_p)
-DO_LD_NF(hdu_be, , uint64_t, uint16_t, lduw_be_p)
-DO_LD_NF(hds_be, , uint64_t, int16_t, ldsw_be_p)
-
-DO_LD_NF(ss_le, H1_4, uint32_t, uint32_t, ldl_le_p)
-DO_LD_NF(ss_be, H1_4, uint32_t, uint32_t, ldl_be_p)
-DO_LD_NF(sdu_le, , uint64_t, uint32_t, ldl_le_p)
-DO_LD_NF(sds_le, , uint64_t, int32_t, ldl_le_p)
-DO_LD_NF(sdu_be, , uint64_t, uint32_t, ldl_be_p)
-DO_LD_NF(sds_be, , uint64_t, int32_t, ldl_be_p)
-
-DO_LD_NF(dd_le, , uint64_t, uint64_t, ldq_le_p)
-DO_LD_NF(dd_be, , uint64_t, uint64_t, ldq_be_p)
-
/*
- * Common helper for all gather first-faulting loads.
+ * Common helpers for all gather first-faulting loads.
*/
-static inline void sve_ldff1_zs(CPUARMState *env, void *vd, void *vg, void *vm,
- target_ulong base, uint32_t desc, uintptr_t ra,
- zreg_off_fn *off_fn, sve_ld1_tlb_fn *tlb_fn,
- sve_ld1_nf_fn *nonfault_fn)
+
+static inline QEMU_ALWAYS_INLINE
+void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
+ target_ulong base, uint32_t desc, uintptr_t retaddr,
+ uint32_t mtedesc, const int esz, const int msz,
+ zreg_off_fn *off_fn,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
{
- const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
- const int mmu_idx = get_mmuidx(oi);
- const int scale = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 2);
- intptr_t reg_off, reg_max = simd_oprsz(desc);
- target_ulong addr;
+ const int mmu_idx = cpu_mmu_index(env, false);
+ const intptr_t reg_max = simd_oprsz(desc);
+ const int scale = simd_data(desc);
+ const int esize = 1 << esz;
+ const int msize = 1 << msz;
+ intptr_t reg_off;
+ SVEHostPage info;
+ target_ulong addr, in_page;
/* Skip to the first true predicate. */
- reg_off = find_next_active(vg, 0, reg_max, MO_32);
- if (likely(reg_off < reg_max)) {
- /* Perform one normal read, which will fault or not. */
- set_helper_retaddr(ra);
- addr = off_fn(vm, reg_off);
- addr = base + (addr << scale);
- tlb_fn(env, vd, reg_off, addr, oi, ra);
+ reg_off = find_next_active(vg, 0, reg_max, esz);
+ if (unlikely(reg_off >= reg_max)) {
+ /* The entire predicate was false; no load occurs. */
+ memset(vd, 0, reg_max);
+ return;
+ }
- /* The rest of the reads will be non-faulting. */
- clear_helper_retaddr();
+ /*
+ * Probe the first element, allowing faults.
+ */
+ addr = base + (off_fn(vm, reg_off) << scale);
+ if (mtedesc) {
+ mte_check1(env, mtedesc, addr, retaddr);
}
+ tlb_fn(env, vd, reg_off, addr, retaddr);
- /* After any fault, zero the leading predicated false elements. */
+ /* After any fault, zero the other elements. */
swap_memzero(vd, reg_off);
+ reg_off += esize;
+ swap_memzero(vd + reg_off, reg_max - reg_off);
- while (likely((reg_off += 4) < reg_max)) {
- uint64_t pg = *(uint64_t *)(vg + (reg_off >> 6) * 8);
- if (likely((pg >> (reg_off & 63)) & 1)) {
- addr = off_fn(vm, reg_off);
- addr = base + (addr << scale);
- if (!nonfault_fn(env, vd, reg_off, addr, mmu_idx)) {
- record_fault(env, reg_off, reg_max);
- break;
+ /*
+ * Probe the remaining elements, not allowing faults.
+ */
+ while (reg_off < reg_max) {
+ uint64_t pg = vg[reg_off >> 6];
+ do {
+ if (likely((pg >> (reg_off & 63)) & 1)) {
+ addr = base + (off_fn(vm, reg_off) << scale);
+ in_page = -(addr | TARGET_PAGE_MASK);
+
+ if (unlikely(in_page < msize)) {
+ /* Stop if the element crosses a page boundary. */
+ goto fault;
+ }
+
+ sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
+ mmu_idx, retaddr);
+ if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
+ goto fault;
+ }
+ if (unlikely(info.flags & TLB_WATCHPOINT) &&
+ (cpu_watchpoint_address_matches
+ (env_cpu(env), addr, msize) & BP_MEM_READ)) {
+ goto fault;
+ }
+ if (mtedesc &&
+ arm_tlb_mte_tagged(&info.attrs) &&
+ !mte_probe1(env, mtedesc, addr)) {
+ goto fault;
+ }
+
+ host_fn(vd, reg_off, info.host);
}
- } else {
- *(uint32_t *)(vd + H1_4(reg_off)) = 0;
- }
+ reg_off += esize;
+ } while (reg_off & 63);
}
+ return;
+
+ fault:
+ record_fault(env, reg_off, reg_max);
}
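
/*
 * Note the asymmetry above, which matches the LDFF1 definition: the
 * first active element goes through tlb_fn and may raise a normal
 * exception, while every later element is only probed; any reason to
 * fault there (invalid page, MMIO, a read watchpoint, a failed MTE
 * probe, or a page-crossing element) clears FFR from that element
 * onward via record_fault instead.
 */
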
-static inline void sve_ldff1_zd(CPUARMState *env, void *vd, void *vg, void *vm,
- target_ulong base, uint32_t desc, uintptr_t ra,
- zreg_off_fn *off_fn, sve_ld1_tlb_fn *tlb_fn,
- sve_ld1_nf_fn *nonfault_fn)
+static inline QEMU_ALWAYS_INLINE
+void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
+ target_ulong base, uint32_t desc, uintptr_t retaddr,
+ const int esz, const int msz,
+ zreg_off_fn *off_fn,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
{
- const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
- const int mmu_idx = get_mmuidx(oi);
- const int scale = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 2);
- intptr_t reg_off, reg_max = simd_oprsz(desc);
- target_ulong addr;
-
- /* Skip to the first true predicate. */
- reg_off = find_next_active(vg, 0, reg_max, MO_64);
- if (likely(reg_off < reg_max)) {
- /* Perform one normal read, which will fault or not. */
- set_helper_retaddr(ra);
- addr = off_fn(vm, reg_off);
- addr = base + (addr << scale);
- tlb_fn(env, vd, reg_off, addr, oi, ra);
+ uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+ /* Remove mtedesc from the normal sve descriptor. */
+ desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
- /* The rest of the reads will be non-faulting. */
- clear_helper_retaddr();
- }
-
- /* After any fault, zero the leading predicated false elements. */
- swap_memzero(vd, reg_off);
-
- while (likely((reg_off += 8) < reg_max)) {
- uint8_t pg = *(uint8_t *)(vg + H1(reg_off >> 3));
- if (likely(pg & 1)) {
- addr = off_fn(vm, reg_off);
- addr = base + (addr << scale);
- if (!nonfault_fn(env, vd, reg_off, addr, mmu_idx)) {
- record_fault(env, reg_off, reg_max);
- break;
- }
- } else {
- *(uint64_t *)(vd + reg_off) = 0;
- }
- }
+ /*
+ * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
+ * offset base entirely over the address space hole to change the
+ * pointer tag, or change the bit55 selector. So we could examine
+ * TBI + TCMA here, as we do for sve_ldN_r_mte().
+ */
+ sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
+ esz, msz, off_fn, host_fn, tlb_fn);
}

-#define DO_LDFF1_ZPZ_S(MEM, OFS) \
+#define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \
void HELPER(sve_ldff##MEM##_##OFS) \
- (CPUARMState *env, void *vd, void *vg, void *vm, \
- target_ulong base, uint32_t desc) \
+ (CPUARMState *env, void *vd, void *vg, \
+ void *vm, target_ulong base, uint32_t desc) \
{ \
- sve_ldff1_zs(env, vd, vg, vm, base, desc, GETPC(), \
- off_##OFS##_s, sve_ld1##MEM##_tlb, sve_ld##MEM##_nf); \
+ sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \
+ off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
+} \
+void HELPER(sve_ldff##MEM##_##OFS##_mte) \
+ (CPUARMState *env, void *vd, void *vg, \
+ void *vm, target_ulong base, uint32_t desc) \
+{ \
+ sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \
+ off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}

-#define DO_LDFF1_ZPZ_D(MEM, OFS) \
+#define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \
void HELPER(sve_ldff##MEM##_##OFS) \
- (CPUARMState *env, void *vd, void *vg, void *vm, \
- target_ulong base, uint32_t desc) \
+ (CPUARMState *env, void *vd, void *vg, \
+ void *vm, target_ulong base, uint32_t desc) \
+{ \
+ sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \
+ off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
+} \
+void HELPER(sve_ldff##MEM##_##OFS##_mte) \
+ (CPUARMState *env, void *vd, void *vg, \
+ void *vm, target_ulong base, uint32_t desc) \
{ \
- sve_ldff1_zd(env, vd, vg, vm, base, desc, GETPC(), \
- off_##OFS##_d, sve_ld1##MEM##_tlb, sve_ld##MEM##_nf); \
-}
-
-DO_LDFF1_ZPZ_S(bsu, zsu)
-DO_LDFF1_ZPZ_S(bsu, zss)
-DO_LDFF1_ZPZ_D(bdu, zsu)
-DO_LDFF1_ZPZ_D(bdu, zss)
-DO_LDFF1_ZPZ_D(bdu, zd)
-
-DO_LDFF1_ZPZ_S(bss, zsu)
-DO_LDFF1_ZPZ_S(bss, zss)
-DO_LDFF1_ZPZ_D(bds, zsu)
-DO_LDFF1_ZPZ_D(bds, zss)
-DO_LDFF1_ZPZ_D(bds, zd)
-
-DO_LDFF1_ZPZ_S(hsu_le, zsu)
-DO_LDFF1_ZPZ_S(hsu_le, zss)
-DO_LDFF1_ZPZ_D(hdu_le, zsu)
-DO_LDFF1_ZPZ_D(hdu_le, zss)
-DO_LDFF1_ZPZ_D(hdu_le, zd)
-
-DO_LDFF1_ZPZ_S(hsu_be, zsu)
-DO_LDFF1_ZPZ_S(hsu_be, zss)
-DO_LDFF1_ZPZ_D(hdu_be, zsu)
-DO_LDFF1_ZPZ_D(hdu_be, zss)
-DO_LDFF1_ZPZ_D(hdu_be, zd)
-
-DO_LDFF1_ZPZ_S(hss_le, zsu)
-DO_LDFF1_ZPZ_S(hss_le, zss)
-DO_LDFF1_ZPZ_D(hds_le, zsu)
-DO_LDFF1_ZPZ_D(hds_le, zss)
-DO_LDFF1_ZPZ_D(hds_le, zd)
-
-DO_LDFF1_ZPZ_S(hss_be, zsu)
-DO_LDFF1_ZPZ_S(hss_be, zss)
-DO_LDFF1_ZPZ_D(hds_be, zsu)
-DO_LDFF1_ZPZ_D(hds_be, zss)
-DO_LDFF1_ZPZ_D(hds_be, zd)
-
-DO_LDFF1_ZPZ_S(ss_le, zsu)
-DO_LDFF1_ZPZ_S(ss_le, zss)
-DO_LDFF1_ZPZ_D(sdu_le, zsu)
-DO_LDFF1_ZPZ_D(sdu_le, zss)
-DO_LDFF1_ZPZ_D(sdu_le, zd)
-
-DO_LDFF1_ZPZ_S(ss_be, zsu)
-DO_LDFF1_ZPZ_S(ss_be, zss)
-DO_LDFF1_ZPZ_D(sdu_be, zsu)
-DO_LDFF1_ZPZ_D(sdu_be, zss)
-DO_LDFF1_ZPZ_D(sdu_be, zd)
-
-DO_LDFF1_ZPZ_D(sds_le, zsu)
-DO_LDFF1_ZPZ_D(sds_le, zss)
-DO_LDFF1_ZPZ_D(sds_le, zd)
-
-DO_LDFF1_ZPZ_D(sds_be, zsu)
-DO_LDFF1_ZPZ_D(sds_be, zss)
-DO_LDFF1_ZPZ_D(sds_be, zd)
-
-DO_LDFF1_ZPZ_D(dd_le, zsu)
-DO_LDFF1_ZPZ_D(dd_le, zss)
-DO_LDFF1_ZPZ_D(dd_le, zd)
-
-DO_LDFF1_ZPZ_D(dd_be, zsu)
-DO_LDFF1_ZPZ_D(dd_be, zss)
-DO_LDFF1_ZPZ_D(dd_be, zd)
+ sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \
+ off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
+}
+
+DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
+DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
+DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
+DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
+DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
+
+DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
+DO_LDFF1_ZPZ_S(bss, zss, MO_8)
+DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
+DO_LDFF1_ZPZ_D(bds, zss, MO_8)
+DO_LDFF1_ZPZ_D(bds, zd, MO_8)
+
+DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
+DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
+DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
+DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
+DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
+
+DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
+DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
+DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
+DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
+DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
+
+DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
+DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
+DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
+DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
+DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
+
+DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
+DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
+DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
+DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
+DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
+
+DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
+DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
+DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
+DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
+DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
+
+DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
+DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
+DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
+DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
+DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
+
+DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
+DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
+DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
+
+DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
+DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
+DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
+
+DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
+DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
+DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
+
+DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
+DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
+DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)

/* Stores with a vector index. */
-static void sve_st1_zs(CPUARMState *env, void *vd, void *vg, void *vm,
- target_ulong base, uint32_t desc, uintptr_t ra,
- zreg_off_fn *off_fn, sve_ld1_tlb_fn *tlb_fn)
+static inline QEMU_ALWAYS_INLINE
+void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
+ target_ulong base, uint32_t desc, uintptr_t retaddr,
+ uint32_t mtedesc, int esize, int msize,
+ zreg_off_fn *off_fn,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
{
- const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
- const int scale = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 2);
- intptr_t i, oprsz = simd_oprsz(desc);
+ const int mmu_idx = cpu_mmu_index(env, false);
+ const intptr_t reg_max = simd_oprsz(desc);
+ const int scale = simd_data(desc);
+ void *host[ARM_MAX_VQ * 4];
+ intptr_t reg_off, i;
+ SVEHostPage info, info2;
- set_helper_retaddr(ra);
- for (i = 0; i < oprsz; ) {
- uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
+ /*
+ * Probe all of the elements for host addresses and flags.
+ */
+ i = reg_off = 0;
+ do {
+ uint64_t pg = vg[reg_off >> 6];
do {
- if (likely(pg & 1)) {
- target_ulong off = off_fn(vm, i);
- tlb_fn(env, vd, i, base + (off << scale), oi, ra);
+ target_ulong addr = base + (off_fn(vm, reg_off) << scale);
+ target_ulong in_page = -(addr | TARGET_PAGE_MASK);
+
+ host[i] = NULL;
+ if (likely((pg >> (reg_off & 63)) & 1)) {
+ if (likely(in_page >= msize)) {
+ sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
+ mmu_idx, retaddr);
+ host[i] = info.host;
+ } else {
+ /*
+ * Element crosses the page boundary.
+ * Probe both pages, but do not record the host address,
+ * so that we use the slow path.
+ */
+ sve_probe_page(&info, false, env, addr, 0,
+ MMU_DATA_STORE, mmu_idx, retaddr);
+ sve_probe_page(&info2, false, env, addr + in_page, 0,
+ MMU_DATA_STORE, mmu_idx, retaddr);
+ info.flags |= info2.flags;
+ }
+
+ if (unlikely(info.flags & TLB_WATCHPOINT)) {
+ cpu_check_watchpoint(env_cpu(env), addr, msize,
+ info.attrs, BP_MEM_WRITE, retaddr);
+ }
+
+ if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
+ mte_check1(env, mtedesc, addr, retaddr);
+ }
}
- i += 4, pg >>= 4;
- } while (i & 15);
- }
- clear_helper_retaddr();
+ i += 1;
+ reg_off += esize;
+ } while (reg_off & 63);
+ } while (reg_off < reg_max);
+
+ /*
+ * Now that we have recognized all exceptions except SyncExternal
+ * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
+ *
+ * Note for the common case of an element in RAM, not crossing a page
+ * boundary, we have stored the host address in host[]. This doubles
+ * as a first-level check against the predicate, since only enabled
+ * elements have non-null host addresses.
+ */
+ i = reg_off = 0;
+ do {
+ void *h = host[i];
+ if (likely(h != NULL)) {
+ host_fn(vd, reg_off, h);
+ } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
+ target_ulong addr = base + (off_fn(vm, reg_off) << scale);
+ tlb_fn(env, vd, reg_off, addr, retaddr);
+ }
+ i += 1;
+ reg_off += esize;
+ } while (reg_off < reg_max);
}
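
/*
 * Illustrative sketch, not part of the patch: sve_st1_z above is a
 * two-phase algorithm. Phase 1 probes every active element, letting
 * all recognizable exceptions fire, and caches the host address (NULL
 * doubling as "inactive or needs the slow path"); phase 2 then
 * performs stores that can no longer fault, except for SyncExternal
 * on MMIO. With hypothetical probe/commit callbacks:
 */
static void two_phase_store(int n, void *host[],
                            void *(*probe)(int i),  /* may fault */
                            void (*commit)(int i, void *h))
{
    for (int i = 0; i < n; i++) {
        host[i] = probe(i);          /* phase 1: recognize exceptions */
    }
    for (int i = 0; i < n; i++) {
        if (host[i]) {
            commit(i, host[i]);      /* phase 2: cannot fault for RAM */
        }
        /* The real code sends active elements with a NULL host
         * pointer back through the per-element tlb_fn instead. */
    }
}
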
-static void sve_st1_zd(CPUARMState *env, void *vd, void *vg, void *vm,
- target_ulong base, uint32_t desc, uintptr_t ra,
- zreg_off_fn *off_fn, sve_ld1_tlb_fn *tlb_fn)
+static inline QEMU_ALWAYS_INLINE
+void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
+ target_ulong base, uint32_t desc, uintptr_t retaddr,
+ int esize, int msize, zreg_off_fn *off_fn,
+ sve_ldst1_host_fn *host_fn,
+ sve_ldst1_tlb_fn *tlb_fn)
{
- const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
- const int scale = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 2);
- intptr_t i, oprsz = simd_oprsz(desc) / 8;
+ uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
+ /* Remove mtedesc from the normal sve descriptor. */
+ desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
- set_helper_retaddr(ra);
- for (i = 0; i < oprsz; i++) {
- uint8_t pg = *(uint8_t *)(vg + H1(i));
- if (likely(pg & 1)) {
- target_ulong off = off_fn(vm, i * 8);
- tlb_fn(env, vd, i * 8, base + (off << scale), oi, ra);
- }
- }
- clear_helper_retaddr();
-}
-
-#define DO_ST1_ZPZ_S(MEM, OFS) \
-void QEMU_FLATTEN HELPER(sve_st##MEM##_##OFS) \
- (CPUARMState *env, void *vd, void *vg, void *vm, \
- target_ulong base, uint32_t desc) \
-{ \
- sve_st1_zs(env, vd, vg, vm, base, desc, GETPC(), \
- off_##OFS##_s, sve_st1##MEM##_tlb); \
-}
-
-#define DO_ST1_ZPZ_D(MEM, OFS) \
-void QEMU_FLATTEN HELPER(sve_st##MEM##_##OFS) \
- (CPUARMState *env, void *vd, void *vg, void *vm, \
- target_ulong base, uint32_t desc) \
-{ \
- sve_st1_zd(env, vd, vg, vm, base, desc, GETPC(), \
- off_##OFS##_d, sve_st1##MEM##_tlb); \
-}
-
-DO_ST1_ZPZ_S(bs, zsu)
-DO_ST1_ZPZ_S(hs_le, zsu)
-DO_ST1_ZPZ_S(hs_be, zsu)
-DO_ST1_ZPZ_S(ss_le, zsu)
-DO_ST1_ZPZ_S(ss_be, zsu)
-
-DO_ST1_ZPZ_S(bs, zss)
-DO_ST1_ZPZ_S(hs_le, zss)
-DO_ST1_ZPZ_S(hs_be, zss)
-DO_ST1_ZPZ_S(ss_le, zss)
-DO_ST1_ZPZ_S(ss_be, zss)
-
-DO_ST1_ZPZ_D(bd, zsu)
-DO_ST1_ZPZ_D(hd_le, zsu)
-DO_ST1_ZPZ_D(hd_be, zsu)
-DO_ST1_ZPZ_D(sd_le, zsu)
-DO_ST1_ZPZ_D(sd_be, zsu)
-DO_ST1_ZPZ_D(dd_le, zsu)
-DO_ST1_ZPZ_D(dd_be, zsu)
-
-DO_ST1_ZPZ_D(bd, zss)
-DO_ST1_ZPZ_D(hd_le, zss)
-DO_ST1_ZPZ_D(hd_be, zss)
-DO_ST1_ZPZ_D(sd_le, zss)
-DO_ST1_ZPZ_D(sd_be, zss)
-DO_ST1_ZPZ_D(dd_le, zss)
-DO_ST1_ZPZ_D(dd_be, zss)
-
-DO_ST1_ZPZ_D(bd, zd)
-DO_ST1_ZPZ_D(hd_le, zd)
-DO_ST1_ZPZ_D(hd_be, zd)
-DO_ST1_ZPZ_D(sd_le, zd)
-DO_ST1_ZPZ_D(sd_be, zd)
-DO_ST1_ZPZ_D(dd_le, zd)
-DO_ST1_ZPZ_D(dd_be, zd)
+ /*
+ * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
+ * offset base entirely over the address space hole to change the
+ * pointer tag, or change the bit55 selector. So we could examine
+ * TBI + TCMA here, as we do for sve_ldN_r_mte().
+ */
+ sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
+ esize, msize, off_fn, host_fn, tlb_fn);
+}
+
+#define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \
+void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
+ void *vm, target_ulong base, uint32_t desc) \
+{ \
+ sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
+ off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
+} \
+void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
+ void *vm, target_ulong base, uint32_t desc) \
+{ \
+ sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
+ off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
+}
+
+#define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \
+void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
+ void *vm, target_ulong base, uint32_t desc) \
+{ \
+ sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
+ off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
+} \
+void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
+ void *vm, target_ulong base, uint32_t desc) \
+{ \
+ sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
+ off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
+}
+
+DO_ST1_ZPZ_S(bs, zsu, MO_8)
+DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
+DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
+DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
+DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
+
+DO_ST1_ZPZ_S(bs, zss, MO_8)
+DO_ST1_ZPZ_S(hs_le, zss, MO_16)
+DO_ST1_ZPZ_S(hs_be, zss, MO_16)
+DO_ST1_ZPZ_S(ss_le, zss, MO_32)
+DO_ST1_ZPZ_S(ss_be, zss, MO_32)
+
+DO_ST1_ZPZ_D(bd, zsu, MO_8)
+DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
+DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
+DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
+DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
+DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
+DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
+
+DO_ST1_ZPZ_D(bd, zss, MO_8)
+DO_ST1_ZPZ_D(hd_le, zss, MO_16)
+DO_ST1_ZPZ_D(hd_be, zss, MO_16)
+DO_ST1_ZPZ_D(sd_le, zss, MO_32)
+DO_ST1_ZPZ_D(sd_be, zss, MO_32)
+DO_ST1_ZPZ_D(dd_le, zss, MO_64)
+DO_ST1_ZPZ_D(dd_be, zss, MO_64)
+
+DO_ST1_ZPZ_D(bd, zd, MO_8)
+DO_ST1_ZPZ_D(hd_le, zd, MO_16)
+DO_ST1_ZPZ_D(hd_be, zd, MO_16)
+DO_ST1_ZPZ_D(sd_le, zd, MO_32)
+DO_ST1_ZPZ_D(sd_be, zd, MO_32)
+DO_ST1_ZPZ_D(dd_le, zd, MO_64)
+DO_ST1_ZPZ_D(dd_be, zd, MO_64)

#undef DO_ST1_ZPZ_S
#undef DO_ST1_ZPZ_D