X-Git-Url: https://git.proxmox.com/?a=blobdiff_plain;f=target%2Farm%2Ftranslate-sve.c;h=374051cd20aabe0ab5cdb24f585cba3f350158fc;hb=8d65a3ccfd5db7f0436e095cd952f5d0c3a873ba;hp=ea8d2c41122fc3f786805e8a1420d99d8ab54ce9;hpb=a1f233f25fd502f9a5b40c14df1b4dbdda463487;p=mirror_qemu.git diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c index ea8d2c4112..374051cd20 100644 --- a/target/arm/translate-sve.c +++ b/target/arm/translate-sve.c @@ -32,6 +32,20 @@ #include "exec/log.h" #include "trace-tcg.h" #include "translate-a64.h" +#include "fpu/softfloat.h" + + +typedef void GVecGen2sFn(unsigned, uint32_t, uint32_t, + TCGv_i64, uint32_t, uint32_t); + +typedef void gen_helper_gvec_flags_3(TCGv_i32, TCGv_ptr, TCGv_ptr, + TCGv_ptr, TCGv_i32); +typedef void gen_helper_gvec_flags_4(TCGv_i32, TCGv_ptr, TCGv_ptr, + TCGv_ptr, TCGv_ptr, TCGv_i32); + +typedef void gen_helper_gvec_mem(TCGv_env, TCGv_ptr, TCGv_i64, TCGv_i32); +typedef void gen_helper_gvec_mem_scatter(TCGv_env, TCGv_ptr, TCGv_ptr, + TCGv_ptr, TCGv_i64, TCGv_i32); /* * Helpers for extracting complex instruction fields. @@ -57,6 +71,31 @@ static int tszimm_shl(int x) return x - (8 << tszimm_esz(x)); } +static inline int plus1(int x) +{ + return x + 1; +} + +/* The SH bit is in bit 8. Extract the low 8 and shift. */ +static inline int expand_imm_sh8s(int x) +{ + return (int8_t)x << (x & 0x100 ? 8 : 0); +} + +static inline int expand_imm_sh8u(int x) +{ + return (uint8_t)x << (x & 0x100 ? 8 : 0); +} + +/* Convert a 2-bit memory size (msz) to a 4-bit data type (dtype) + * with unsigned data. C.f. SVE Memory Contiguous Load Group. + */ +static inline int msz_dtype(int msz) +{ + static const uint8_t dtype[4] = { 0, 5, 10, 15 }; + return dtype[msz]; +} + /* * Include the generated decoder. */ @@ -312,6 +351,23 @@ static bool do_zpzz_ool(DisasContext *s, arg_rprr_esz *a, gen_helper_gvec_4 *fn) return true; } +/* Select active elememnts from Zn and inactive elements from Zm, + * storing the result in Zd. + */ +static void do_sel_z(DisasContext *s, int rd, int rn, int rm, int pg, int esz) +{ + static gen_helper_gvec_4 * const fns[4] = { + gen_helper_sve_sel_zpzz_b, gen_helper_sve_sel_zpzz_h, + gen_helper_sve_sel_zpzz_s, gen_helper_sve_sel_zpzz_d + }; + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_4_ool(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), + pred_full_reg_offset(s, pg), + vsz, vsz, 0, fns[esz]); +} + #define DO_ZPZZ(NAME, name) \ static bool trans_##NAME##_zpzz(DisasContext *s, arg_rprr_esz *a, \ uint32_t insn) \ @@ -362,6 +418,14 @@ static bool trans_UDIV_zpzz(DisasContext *s, arg_rprr_esz *a, uint32_t insn) return do_zpzz_ool(s, a, fns[a->esz]); } +static bool trans_SEL_zpzz(DisasContext *s, arg_rprr_esz *a, uint32_t insn) +{ + if (sve_access_check(s)) { + do_sel_z(s, a->rd, a->rn, a->rm, a->pg, a->esz); + } + return true; +} + #undef DO_ZPZZ /* @@ -568,6 +632,20 @@ static bool do_clr_zp(DisasContext *s, int rd, int pg, int esz) return true; } +/* Copy Zn into Zd, storing zeros into inactive elements. */ +static void do_movz_zpz(DisasContext *s, int rd, int rn, int pg, int esz) +{ + static gen_helper_gvec_3 * const fns[4] = { + gen_helper_sve_movz_b, gen_helper_sve_movz_h, + gen_helper_sve_movz_s, gen_helper_sve_movz_d, + }; + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_3_ool(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + pred_full_reg_offset(s, pg), + vsz, vsz, 0, fns[esz]); +} + static bool do_zpzi_ool(DisasContext *s, arg_rpri_esz *a, gen_helper_gvec_3 *fn) { @@ -1360,7 +1438,7 @@ static bool do_predset(DisasContext *s, int esz, int rd, int pat, bool setflag) setsz = numelem << esz; lastword = word = pred_esz_masks[esz]; if (setsz % 64) { - lastword &= ~(-1ull << (setsz % 64)); + lastword &= MAKE_64BIT_MASK(0, setsz % 64); } } @@ -1379,19 +1457,13 @@ static bool do_predset(DisasContext *s, int esz, int rd, int pat, bool setflag) tcg_gen_gvec_dup64i(ofs, oprsz, maxsz, word); goto done; } - if (oprsz * 8 == setsz + 8) { - tcg_gen_gvec_dup64i(ofs, oprsz, maxsz, word); - tcg_gen_movi_i64(t, 0); - tcg_gen_st_i64(t, cpu_env, ofs + oprsz - 8); - goto done; - } } setsz /= 8; fullsz /= 8; tcg_gen_movi_i64(t, word); - for (i = 0; i < setsz; i += 8) { + for (i = 0; i < QEMU_ALIGN_DOWN(setsz, 8); i += 8) { tcg_gen_st_i64(t, cpu_env, ofs + i); } if (lastword != word) { @@ -1498,114 +1570,3639 @@ static bool trans_PNEXT(DisasContext *s, arg_rr_esz *a, uint32_t insn) } /* - *** SVE Memory - 32-bit Gather and Unsized Contiguous Group + *** SVE Element Count Group */ -/* Subroutine loading a vector register at VOFS of LEN bytes. - * The load should begin at the address Rn + IMM. +/* Perform an inline saturating addition of a 32-bit value within + * a 64-bit register. The second operand is known to be positive, + * which halves the comparisions we must perform to bound the result. */ +static void do_sat_addsub_32(TCGv_i64 reg, TCGv_i64 val, bool u, bool d) +{ + int64_t ibound; + TCGv_i64 bound; + TCGCond cond; -static void do_ldr(DisasContext *s, uint32_t vofs, uint32_t len, - int rn, int imm) + /* Use normal 64-bit arithmetic to detect 32-bit overflow. */ + if (u) { + tcg_gen_ext32u_i64(reg, reg); + } else { + tcg_gen_ext32s_i64(reg, reg); + } + if (d) { + tcg_gen_sub_i64(reg, reg, val); + ibound = (u ? 0 : INT32_MIN); + cond = TCG_COND_LT; + } else { + tcg_gen_add_i64(reg, reg, val); + ibound = (u ? UINT32_MAX : INT32_MAX); + cond = TCG_COND_GT; + } + bound = tcg_const_i64(ibound); + tcg_gen_movcond_i64(cond, reg, reg, bound, bound, reg); + tcg_temp_free_i64(bound); +} + +/* Similarly with 64-bit values. */ +static void do_sat_addsub_64(TCGv_i64 reg, TCGv_i64 val, bool u, bool d) { - uint32_t len_align = QEMU_ALIGN_DOWN(len, 8); - uint32_t len_remain = len % 8; - uint32_t nparts = len / 8 + ctpop8(len_remain); - int midx = get_mem_index(s); - TCGv_i64 addr, t0, t1; + TCGv_i64 t0 = tcg_temp_new_i64(); + TCGv_i64 t1 = tcg_temp_new_i64(); + TCGv_i64 t2; + + if (u) { + if (d) { + tcg_gen_sub_i64(t0, reg, val); + tcg_gen_movi_i64(t1, 0); + tcg_gen_movcond_i64(TCG_COND_LTU, reg, reg, val, t1, t0); + } else { + tcg_gen_add_i64(t0, reg, val); + tcg_gen_movi_i64(t1, -1); + tcg_gen_movcond_i64(TCG_COND_LTU, reg, t0, reg, t1, t0); + } + } else { + if (d) { + /* Detect signed overflow for subtraction. */ + tcg_gen_xor_i64(t0, reg, val); + tcg_gen_sub_i64(t1, reg, val); + tcg_gen_xor_i64(reg, reg, t0); + tcg_gen_and_i64(t0, t0, reg); + + /* Bound the result. */ + tcg_gen_movi_i64(reg, INT64_MIN); + t2 = tcg_const_i64(0); + tcg_gen_movcond_i64(TCG_COND_LT, reg, t0, t2, reg, t1); + } else { + /* Detect signed overflow for addition. */ + tcg_gen_xor_i64(t0, reg, val); + tcg_gen_add_i64(reg, reg, val); + tcg_gen_xor_i64(t1, reg, val); + tcg_gen_andc_i64(t0, t1, t0); + + /* Bound the result. */ + tcg_gen_movi_i64(t1, INT64_MAX); + t2 = tcg_const_i64(0); + tcg_gen_movcond_i64(TCG_COND_LT, reg, t0, t2, t1, reg); + } + tcg_temp_free_i64(t2); + } + tcg_temp_free_i64(t0); + tcg_temp_free_i64(t1); +} - addr = tcg_temp_new_i64(); - t0 = tcg_temp_new_i64(); +/* Similarly with a vector and a scalar operand. */ +static void do_sat_addsub_vec(DisasContext *s, int esz, int rd, int rn, + TCGv_i64 val, bool u, bool d) +{ + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr dptr, nptr; + TCGv_i32 t32, desc; + TCGv_i64 t64; + + dptr = tcg_temp_new_ptr(); + nptr = tcg_temp_new_ptr(); + tcg_gen_addi_ptr(dptr, cpu_env, vec_full_reg_offset(s, rd)); + tcg_gen_addi_ptr(nptr, cpu_env, vec_full_reg_offset(s, rn)); + desc = tcg_const_i32(simd_desc(vsz, vsz, 0)); - /* Note that unpredicated load/store of vector/predicate registers - * are defined as a stream of bytes, which equates to little-endian - * operations on larger quantities. There is no nice way to force - * a little-endian load for aarch64_be-linux-user out of line. - * - * Attempt to keep code expansion to a minimum by limiting the - * amount of unrolling done. - */ - if (nparts <= 4) { - int i; + switch (esz) { + case MO_8: + t32 = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(t32, val); + if (d) { + tcg_gen_neg_i32(t32, t32); + } + if (u) { + gen_helper_sve_uqaddi_b(dptr, nptr, t32, desc); + } else { + gen_helper_sve_sqaddi_b(dptr, nptr, t32, desc); + } + tcg_temp_free_i32(t32); + break; - for (i = 0; i < len_align; i += 8) { - tcg_gen_addi_i64(addr, cpu_reg_sp(s, rn), imm + i); - tcg_gen_qemu_ld_i64(t0, addr, midx, MO_LEQ); - tcg_gen_st_i64(t0, cpu_env, vofs + i); + case MO_16: + t32 = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(t32, val); + if (d) { + tcg_gen_neg_i32(t32, t32); } - } else { - TCGLabel *loop = gen_new_label(); - TCGv_ptr tp, i = tcg_const_local_ptr(0); + if (u) { + gen_helper_sve_uqaddi_h(dptr, nptr, t32, desc); + } else { + gen_helper_sve_sqaddi_h(dptr, nptr, t32, desc); + } + tcg_temp_free_i32(t32); + break; - gen_set_label(loop); + case MO_32: + t64 = tcg_temp_new_i64(); + if (d) { + tcg_gen_neg_i64(t64, val); + } else { + tcg_gen_mov_i64(t64, val); + } + if (u) { + gen_helper_sve_uqaddi_s(dptr, nptr, t64, desc); + } else { + gen_helper_sve_sqaddi_s(dptr, nptr, t64, desc); + } + tcg_temp_free_i64(t64); + break; - /* Minimize the number of local temps that must be re-read from - * the stack each iteration. Instead, re-compute values other - * than the loop counter. - */ - tp = tcg_temp_new_ptr(); - tcg_gen_addi_ptr(tp, i, imm); - tcg_gen_extu_ptr_i64(addr, tp); - tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, rn)); + case MO_64: + if (u) { + if (d) { + gen_helper_sve_uqsubi_d(dptr, nptr, val, desc); + } else { + gen_helper_sve_uqaddi_d(dptr, nptr, val, desc); + } + } else if (d) { + t64 = tcg_temp_new_i64(); + tcg_gen_neg_i64(t64, val); + gen_helper_sve_sqaddi_d(dptr, nptr, t64, desc); + tcg_temp_free_i64(t64); + } else { + gen_helper_sve_sqaddi_d(dptr, nptr, val, desc); + } + break; - tcg_gen_qemu_ld_i64(t0, addr, midx, MO_LEQ); + default: + g_assert_not_reached(); + } - tcg_gen_add_ptr(tp, cpu_env, i); - tcg_gen_addi_ptr(i, i, 8); - tcg_gen_st_i64(t0, tp, vofs); - tcg_temp_free_ptr(tp); + tcg_temp_free_ptr(dptr); + tcg_temp_free_ptr(nptr); + tcg_temp_free_i32(desc); +} - tcg_gen_brcondi_ptr(TCG_COND_LTU, i, len_align, loop); - tcg_temp_free_ptr(i); +static bool trans_CNT_r(DisasContext *s, arg_CNT_r *a, uint32_t insn) +{ + if (sve_access_check(s)) { + unsigned fullsz = vec_full_reg_size(s); + unsigned numelem = decode_pred_count(fullsz, a->pat, a->esz); + tcg_gen_movi_i64(cpu_reg(s, a->rd), numelem * a->imm); } + return true; +} - /* Predicate register loads can be any multiple of 2. - * Note that we still store the entire 64-bit unit into cpu_env. - */ - if (len_remain) { - tcg_gen_addi_i64(addr, cpu_reg_sp(s, rn), imm + len_align); +static bool trans_INCDEC_r(DisasContext *s, arg_incdec_cnt *a, uint32_t insn) +{ + if (sve_access_check(s)) { + unsigned fullsz = vec_full_reg_size(s); + unsigned numelem = decode_pred_count(fullsz, a->pat, a->esz); + int inc = numelem * a->imm * (a->d ? -1 : 1); + TCGv_i64 reg = cpu_reg(s, a->rd); - switch (len_remain) { - case 2: - case 4: - case 8: - tcg_gen_qemu_ld_i64(t0, addr, midx, MO_LE | ctz32(len_remain)); - break; + tcg_gen_addi_i64(reg, reg, inc); + } + return true; +} - case 6: - t1 = tcg_temp_new_i64(); - tcg_gen_qemu_ld_i64(t0, addr, midx, MO_LEUL); - tcg_gen_addi_i64(addr, addr, 4); - tcg_gen_qemu_ld_i64(t1, addr, midx, MO_LEUW); - tcg_gen_deposit_i64(t0, t0, t1, 32, 32); - tcg_temp_free_i64(t1); - break; +static bool trans_SINCDEC_r_32(DisasContext *s, arg_incdec_cnt *a, + uint32_t insn) +{ + if (!sve_access_check(s)) { + return true; + } - default: - g_assert_not_reached(); + unsigned fullsz = vec_full_reg_size(s); + unsigned numelem = decode_pred_count(fullsz, a->pat, a->esz); + int inc = numelem * a->imm; + TCGv_i64 reg = cpu_reg(s, a->rd); + + /* Use normal 64-bit arithmetic to detect 32-bit overflow. */ + if (inc == 0) { + if (a->u) { + tcg_gen_ext32u_i64(reg, reg); + } else { + tcg_gen_ext32s_i64(reg, reg); } - tcg_gen_st_i64(t0, cpu_env, vofs + len_align); + } else { + TCGv_i64 t = tcg_const_i64(inc); + do_sat_addsub_32(reg, t, a->u, a->d); + tcg_temp_free_i64(t); } - tcg_temp_free_i64(addr); - tcg_temp_free_i64(t0); + return true; } -static bool trans_LDR_zri(DisasContext *s, arg_rri *a, uint32_t insn) +static bool trans_SINCDEC_r_64(DisasContext *s, arg_incdec_cnt *a, + uint32_t insn) +{ + if (!sve_access_check(s)) { + return true; + } + + unsigned fullsz = vec_full_reg_size(s); + unsigned numelem = decode_pred_count(fullsz, a->pat, a->esz); + int inc = numelem * a->imm; + TCGv_i64 reg = cpu_reg(s, a->rd); + + if (inc != 0) { + TCGv_i64 t = tcg_const_i64(inc); + do_sat_addsub_64(reg, t, a->u, a->d); + tcg_temp_free_i64(t); + } + return true; +} + +static bool trans_INCDEC_v(DisasContext *s, arg_incdec2_cnt *a, uint32_t insn) +{ + if (a->esz == 0) { + return false; + } + + unsigned fullsz = vec_full_reg_size(s); + unsigned numelem = decode_pred_count(fullsz, a->pat, a->esz); + int inc = numelem * a->imm; + + if (inc != 0) { + if (sve_access_check(s)) { + TCGv_i64 t = tcg_const_i64(a->d ? -inc : inc); + tcg_gen_gvec_adds(a->esz, vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + t, fullsz, fullsz); + tcg_temp_free_i64(t); + } + } else { + do_mov_z(s, a->rd, a->rn); + } + return true; +} + +static bool trans_SINCDEC_v(DisasContext *s, arg_incdec2_cnt *a, + uint32_t insn) +{ + if (a->esz == 0) { + return false; + } + + unsigned fullsz = vec_full_reg_size(s); + unsigned numelem = decode_pred_count(fullsz, a->pat, a->esz); + int inc = numelem * a->imm; + + if (inc != 0) { + if (sve_access_check(s)) { + TCGv_i64 t = tcg_const_i64(inc); + do_sat_addsub_vec(s, a->esz, a->rd, a->rn, t, a->u, a->d); + tcg_temp_free_i64(t); + } + } else { + do_mov_z(s, a->rd, a->rn); + } + return true; +} + +/* + *** SVE Bitwise Immediate Group + */ + +static bool do_zz_dbm(DisasContext *s, arg_rr_dbm *a, GVecGen2iFn *gvec_fn) { + uint64_t imm; + if (!logic_imm_decode_wmask(&imm, extract32(a->dbm, 12, 1), + extract32(a->dbm, 0, 6), + extract32(a->dbm, 6, 6))) { + return false; + } if (sve_access_check(s)) { - int size = vec_full_reg_size(s); - int off = vec_full_reg_offset(s, a->rd); - do_ldr(s, off, size, a->rn, a->imm * size); + unsigned vsz = vec_full_reg_size(s); + gvec_fn(MO_64, vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), imm, vsz, vsz); } return true; } -static bool trans_LDR_pri(DisasContext *s, arg_rri *a, uint32_t insn) +static bool trans_AND_zzi(DisasContext *s, arg_rr_dbm *a, uint32_t insn) +{ + return do_zz_dbm(s, a, tcg_gen_gvec_andi); +} + +static bool trans_ORR_zzi(DisasContext *s, arg_rr_dbm *a, uint32_t insn) +{ + return do_zz_dbm(s, a, tcg_gen_gvec_ori); +} + +static bool trans_EOR_zzi(DisasContext *s, arg_rr_dbm *a, uint32_t insn) +{ + return do_zz_dbm(s, a, tcg_gen_gvec_xori); +} + +static bool trans_DUPM(DisasContext *s, arg_DUPM *a, uint32_t insn) +{ + uint64_t imm; + if (!logic_imm_decode_wmask(&imm, extract32(a->dbm, 12, 1), + extract32(a->dbm, 0, 6), + extract32(a->dbm, 6, 6))) { + return false; + } + if (sve_access_check(s)) { + do_dupi_z(s, a->rd, imm); + } + return true; +} + +/* + *** SVE Integer Wide Immediate - Predicated Group + */ + +/* Implement all merging copies. This is used for CPY (immediate), + * FCPY, CPY (scalar), CPY (SIMD&FP scalar). + */ +static void do_cpy_m(DisasContext *s, int esz, int rd, int rn, int pg, + TCGv_i64 val) +{ + typedef void gen_cpy(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i64, TCGv_i32); + static gen_cpy * const fns[4] = { + gen_helper_sve_cpy_m_b, gen_helper_sve_cpy_m_h, + gen_helper_sve_cpy_m_s, gen_helper_sve_cpy_m_d, + }; + unsigned vsz = vec_full_reg_size(s); + TCGv_i32 desc = tcg_const_i32(simd_desc(vsz, vsz, 0)); + TCGv_ptr t_zd = tcg_temp_new_ptr(); + TCGv_ptr t_zn = tcg_temp_new_ptr(); + TCGv_ptr t_pg = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(t_zd, cpu_env, vec_full_reg_offset(s, rd)); + tcg_gen_addi_ptr(t_zn, cpu_env, vec_full_reg_offset(s, rn)); + tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg)); + + fns[esz](t_zd, t_zn, t_pg, val, desc); + + tcg_temp_free_ptr(t_zd); + tcg_temp_free_ptr(t_zn); + tcg_temp_free_ptr(t_pg); + tcg_temp_free_i32(desc); +} + +static bool trans_FCPY(DisasContext *s, arg_FCPY *a, uint32_t insn) { + if (a->esz == 0) { + return false; + } if (sve_access_check(s)) { - int size = pred_full_reg_size(s); - int off = pred_full_reg_offset(s, a->rd); - do_ldr(s, off, size, a->rn, a->imm * size); + /* Decode the VFP immediate. */ + uint64_t imm = vfp_expand_imm(a->esz, a->imm); + TCGv_i64 t_imm = tcg_const_i64(imm); + do_cpy_m(s, a->esz, a->rd, a->rn, a->pg, t_imm); + tcg_temp_free_i64(t_imm); + } + return true; +} + +static bool trans_CPY_m_i(DisasContext *s, arg_rpri_esz *a, uint32_t insn) +{ + if (a->esz == 0 && extract32(insn, 13, 1)) { + return false; + } + if (sve_access_check(s)) { + TCGv_i64 t_imm = tcg_const_i64(a->imm); + do_cpy_m(s, a->esz, a->rd, a->rn, a->pg, t_imm); + tcg_temp_free_i64(t_imm); + } + return true; +} + +static bool trans_CPY_z_i(DisasContext *s, arg_CPY_z_i *a, uint32_t insn) +{ + static gen_helper_gvec_2i * const fns[4] = { + gen_helper_sve_cpy_z_b, gen_helper_sve_cpy_z_h, + gen_helper_sve_cpy_z_s, gen_helper_sve_cpy_z_d, + }; + + if (a->esz == 0 && extract32(insn, 13, 1)) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_i64 t_imm = tcg_const_i64(a->imm); + tcg_gen_gvec_2i_ool(vec_full_reg_offset(s, a->rd), + pred_full_reg_offset(s, a->pg), + t_imm, vsz, vsz, 0, fns[a->esz]); + tcg_temp_free_i64(t_imm); + } + return true; +} + +/* + *** SVE Permute Extract Group + */ + +static bool trans_EXT(DisasContext *s, arg_EXT *a, uint32_t insn) +{ + if (!sve_access_check(s)) { + return true; + } + + unsigned vsz = vec_full_reg_size(s); + unsigned n_ofs = a->imm >= vsz ? 0 : a->imm; + unsigned n_siz = vsz - n_ofs; + unsigned d = vec_full_reg_offset(s, a->rd); + unsigned n = vec_full_reg_offset(s, a->rn); + unsigned m = vec_full_reg_offset(s, a->rm); + + /* Use host vector move insns if we have appropriate sizes + * and no unfortunate overlap. + */ + if (m != d + && n_ofs == size_for_gvec(n_ofs) + && n_siz == size_for_gvec(n_siz) + && (d != n || n_siz <= n_ofs)) { + tcg_gen_gvec_mov(0, d, n + n_ofs, n_siz, n_siz); + if (n_ofs != 0) { + tcg_gen_gvec_mov(0, d + n_siz, m, n_ofs, n_ofs); + } + } else { + tcg_gen_gvec_3_ool(d, n, m, vsz, vsz, n_ofs, gen_helper_sve_ext); + } + return true; +} + +/* + *** SVE Permute - Unpredicated Group + */ + +static bool trans_DUP_s(DisasContext *s, arg_DUP_s *a, uint32_t insn) +{ + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_dup_i64(a->esz, vec_full_reg_offset(s, a->rd), + vsz, vsz, cpu_reg_sp(s, a->rn)); + } + return true; +} + +static bool trans_DUP_x(DisasContext *s, arg_DUP_x *a, uint32_t insn) +{ + if ((a->imm & 0x1f) == 0) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + unsigned dofs = vec_full_reg_offset(s, a->rd); + unsigned esz, index; + + esz = ctz32(a->imm); + index = a->imm >> (esz + 1); + + if ((index << esz) < vsz) { + unsigned nofs = vec_reg_offset(s, a->rn, index, esz); + tcg_gen_gvec_dup_mem(esz, dofs, nofs, vsz, vsz); + } else { + tcg_gen_gvec_dup64i(dofs, vsz, vsz, 0); + } + } + return true; +} + +static void do_insr_i64(DisasContext *s, arg_rrr_esz *a, TCGv_i64 val) +{ + typedef void gen_insr(TCGv_ptr, TCGv_ptr, TCGv_i64, TCGv_i32); + static gen_insr * const fns[4] = { + gen_helper_sve_insr_b, gen_helper_sve_insr_h, + gen_helper_sve_insr_s, gen_helper_sve_insr_d, + }; + unsigned vsz = vec_full_reg_size(s); + TCGv_i32 desc = tcg_const_i32(simd_desc(vsz, vsz, 0)); + TCGv_ptr t_zd = tcg_temp_new_ptr(); + TCGv_ptr t_zn = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(t_zd, cpu_env, vec_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(t_zn, cpu_env, vec_full_reg_offset(s, a->rn)); + + fns[a->esz](t_zd, t_zn, val, desc); + + tcg_temp_free_ptr(t_zd); + tcg_temp_free_ptr(t_zn); + tcg_temp_free_i32(desc); +} + +static bool trans_INSR_f(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + if (sve_access_check(s)) { + TCGv_i64 t = tcg_temp_new_i64(); + tcg_gen_ld_i64(t, cpu_env, vec_reg_offset(s, a->rm, 0, MO_64)); + do_insr_i64(s, a, t); + tcg_temp_free_i64(t); + } + return true; +} + +static bool trans_INSR_r(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + if (sve_access_check(s)) { + do_insr_i64(s, a, cpu_reg(s, a->rm)); + } + return true; +} + +static bool trans_REV_v(DisasContext *s, arg_rr_esz *a, uint32_t insn) +{ + static gen_helper_gvec_2 * const fns[4] = { + gen_helper_sve_rev_b, gen_helper_sve_rev_h, + gen_helper_sve_rev_s, gen_helper_sve_rev_d + }; + + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_2_ool(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vsz, vsz, 0, fns[a->esz]); + } + return true; +} + +static bool trans_TBL(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + static gen_helper_gvec_3 * const fns[4] = { + gen_helper_sve_tbl_b, gen_helper_sve_tbl_h, + gen_helper_sve_tbl_s, gen_helper_sve_tbl_d + }; + + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_3_ool(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + vsz, vsz, 0, fns[a->esz]); + } + return true; +} + +static bool trans_UNPK(DisasContext *s, arg_UNPK *a, uint32_t insn) +{ + static gen_helper_gvec_2 * const fns[4][2] = { + { NULL, NULL }, + { gen_helper_sve_sunpk_h, gen_helper_sve_uunpk_h }, + { gen_helper_sve_sunpk_s, gen_helper_sve_uunpk_s }, + { gen_helper_sve_sunpk_d, gen_helper_sve_uunpk_d }, + }; + + if (a->esz == 0) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_2_ool(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn) + + (a->h ? vsz / 2 : 0), + vsz, vsz, 0, fns[a->esz][a->u]); + } + return true; +} + +/* + *** SVE Permute - Predicates Group + */ + +static bool do_perm_pred3(DisasContext *s, arg_rrr_esz *a, bool high_odd, + gen_helper_gvec_3 *fn) +{ + if (!sve_access_check(s)) { + return true; + } + + unsigned vsz = pred_full_reg_size(s); + + /* Predicate sizes may be smaller and cannot use simd_desc. + We cannot round up, as we do elsewhere, because we need + the exact size for ZIP2 and REV. We retain the style for + the other helpers for consistency. */ + TCGv_ptr t_d = tcg_temp_new_ptr(); + TCGv_ptr t_n = tcg_temp_new_ptr(); + TCGv_ptr t_m = tcg_temp_new_ptr(); + TCGv_i32 t_desc; + int desc; + + desc = vsz - 2; + desc = deposit32(desc, SIMD_DATA_SHIFT, 2, a->esz); + desc = deposit32(desc, SIMD_DATA_SHIFT + 2, 2, high_odd); + + tcg_gen_addi_ptr(t_d, cpu_env, pred_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(t_n, cpu_env, pred_full_reg_offset(s, a->rn)); + tcg_gen_addi_ptr(t_m, cpu_env, pred_full_reg_offset(s, a->rm)); + t_desc = tcg_const_i32(desc); + + fn(t_d, t_n, t_m, t_desc); + + tcg_temp_free_ptr(t_d); + tcg_temp_free_ptr(t_n); + tcg_temp_free_ptr(t_m); + tcg_temp_free_i32(t_desc); + return true; +} + +static bool do_perm_pred2(DisasContext *s, arg_rr_esz *a, bool high_odd, + gen_helper_gvec_2 *fn) +{ + if (!sve_access_check(s)) { + return true; + } + + unsigned vsz = pred_full_reg_size(s); + TCGv_ptr t_d = tcg_temp_new_ptr(); + TCGv_ptr t_n = tcg_temp_new_ptr(); + TCGv_i32 t_desc; + int desc; + + tcg_gen_addi_ptr(t_d, cpu_env, pred_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(t_n, cpu_env, pred_full_reg_offset(s, a->rn)); + + /* Predicate sizes may be smaller and cannot use simd_desc. + We cannot round up, as we do elsewhere, because we need + the exact size for ZIP2 and REV. We retain the style for + the other helpers for consistency. */ + + desc = vsz - 2; + desc = deposit32(desc, SIMD_DATA_SHIFT, 2, a->esz); + desc = deposit32(desc, SIMD_DATA_SHIFT + 2, 2, high_odd); + t_desc = tcg_const_i32(desc); + + fn(t_d, t_n, t_desc); + + tcg_temp_free_i32(t_desc); + tcg_temp_free_ptr(t_d); + tcg_temp_free_ptr(t_n); + return true; +} + +static bool trans_ZIP1_p(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + return do_perm_pred3(s, a, 0, gen_helper_sve_zip_p); +} + +static bool trans_ZIP2_p(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + return do_perm_pred3(s, a, 1, gen_helper_sve_zip_p); +} + +static bool trans_UZP1_p(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + return do_perm_pred3(s, a, 0, gen_helper_sve_uzp_p); +} + +static bool trans_UZP2_p(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + return do_perm_pred3(s, a, 1, gen_helper_sve_uzp_p); +} + +static bool trans_TRN1_p(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + return do_perm_pred3(s, a, 0, gen_helper_sve_trn_p); +} + +static bool trans_TRN2_p(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + return do_perm_pred3(s, a, 1, gen_helper_sve_trn_p); +} + +static bool trans_REV_p(DisasContext *s, arg_rr_esz *a, uint32_t insn) +{ + return do_perm_pred2(s, a, 0, gen_helper_sve_rev_p); +} + +static bool trans_PUNPKLO(DisasContext *s, arg_PUNPKLO *a, uint32_t insn) +{ + return do_perm_pred2(s, a, 0, gen_helper_sve_punpk_p); +} + +static bool trans_PUNPKHI(DisasContext *s, arg_PUNPKHI *a, uint32_t insn) +{ + return do_perm_pred2(s, a, 1, gen_helper_sve_punpk_p); +} + +/* + *** SVE Permute - Interleaving Group + */ + +static bool do_zip(DisasContext *s, arg_rrr_esz *a, bool high) +{ + static gen_helper_gvec_3 * const fns[4] = { + gen_helper_sve_zip_b, gen_helper_sve_zip_h, + gen_helper_sve_zip_s, gen_helper_sve_zip_d, + }; + + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + unsigned high_ofs = high ? vsz / 2 : 0; + tcg_gen_gvec_3_ool(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn) + high_ofs, + vec_full_reg_offset(s, a->rm) + high_ofs, + vsz, vsz, 0, fns[a->esz]); + } + return true; +} + +static bool do_zzz_data_ool(DisasContext *s, arg_rrr_esz *a, int data, + gen_helper_gvec_3 *fn) +{ + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_3_ool(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + vsz, vsz, data, fn); + } + return true; +} + +static bool trans_ZIP1_z(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + return do_zip(s, a, false); +} + +static bool trans_ZIP2_z(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + return do_zip(s, a, true); +} + +static gen_helper_gvec_3 * const uzp_fns[4] = { + gen_helper_sve_uzp_b, gen_helper_sve_uzp_h, + gen_helper_sve_uzp_s, gen_helper_sve_uzp_d, +}; + +static bool trans_UZP1_z(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + return do_zzz_data_ool(s, a, 0, uzp_fns[a->esz]); +} + +static bool trans_UZP2_z(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + return do_zzz_data_ool(s, a, 1 << a->esz, uzp_fns[a->esz]); +} + +static gen_helper_gvec_3 * const trn_fns[4] = { + gen_helper_sve_trn_b, gen_helper_sve_trn_h, + gen_helper_sve_trn_s, gen_helper_sve_trn_d, +}; + +static bool trans_TRN1_z(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + return do_zzz_data_ool(s, a, 0, trn_fns[a->esz]); +} + +static bool trans_TRN2_z(DisasContext *s, arg_rrr_esz *a, uint32_t insn) +{ + return do_zzz_data_ool(s, a, 1 << a->esz, trn_fns[a->esz]); +} + +/* + *** SVE Permute Vector - Predicated Group + */ + +static bool trans_COMPACT(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + static gen_helper_gvec_3 * const fns[4] = { + NULL, NULL, gen_helper_sve_compact_s, gen_helper_sve_compact_d + }; + return do_zpz_ool(s, a, fns[a->esz]); +} + +/* Call the helper that computes the ARM LastActiveElement pseudocode + * function, scaled by the element size. This includes the not found + * indication; e.g. not found for esz=3 is -8. + */ +static void find_last_active(DisasContext *s, TCGv_i32 ret, int esz, int pg) +{ + /* Predicate sizes may be smaller and cannot use simd_desc. We cannot + * round up, as we do elsewhere, because we need the exact size. + */ + TCGv_ptr t_p = tcg_temp_new_ptr(); + TCGv_i32 t_desc; + unsigned vsz = pred_full_reg_size(s); + unsigned desc; + + desc = vsz - 2; + desc = deposit32(desc, SIMD_DATA_SHIFT, 2, esz); + + tcg_gen_addi_ptr(t_p, cpu_env, pred_full_reg_offset(s, pg)); + t_desc = tcg_const_i32(desc); + + gen_helper_sve_last_active_element(ret, t_p, t_desc); + + tcg_temp_free_i32(t_desc); + tcg_temp_free_ptr(t_p); +} + +/* Increment LAST to the offset of the next element in the vector, + * wrapping around to 0. + */ +static void incr_last_active(DisasContext *s, TCGv_i32 last, int esz) +{ + unsigned vsz = vec_full_reg_size(s); + + tcg_gen_addi_i32(last, last, 1 << esz); + if (is_power_of_2(vsz)) { + tcg_gen_andi_i32(last, last, vsz - 1); + } else { + TCGv_i32 max = tcg_const_i32(vsz); + TCGv_i32 zero = tcg_const_i32(0); + tcg_gen_movcond_i32(TCG_COND_GEU, last, last, max, zero, last); + tcg_temp_free_i32(max); + tcg_temp_free_i32(zero); + } +} + +/* If LAST < 0, set LAST to the offset of the last element in the vector. */ +static void wrap_last_active(DisasContext *s, TCGv_i32 last, int esz) +{ + unsigned vsz = vec_full_reg_size(s); + + if (is_power_of_2(vsz)) { + tcg_gen_andi_i32(last, last, vsz - 1); + } else { + TCGv_i32 max = tcg_const_i32(vsz - (1 << esz)); + TCGv_i32 zero = tcg_const_i32(0); + tcg_gen_movcond_i32(TCG_COND_LT, last, last, zero, max, last); + tcg_temp_free_i32(max); + tcg_temp_free_i32(zero); + } +} + +/* Load an unsigned element of ESZ from BASE+OFS. */ +static TCGv_i64 load_esz(TCGv_ptr base, int ofs, int esz) +{ + TCGv_i64 r = tcg_temp_new_i64(); + + switch (esz) { + case 0: + tcg_gen_ld8u_i64(r, base, ofs); + break; + case 1: + tcg_gen_ld16u_i64(r, base, ofs); + break; + case 2: + tcg_gen_ld32u_i64(r, base, ofs); + break; + case 3: + tcg_gen_ld_i64(r, base, ofs); + break; + default: + g_assert_not_reached(); + } + return r; +} + +/* Load an unsigned element of ESZ from RM[LAST]. */ +static TCGv_i64 load_last_active(DisasContext *s, TCGv_i32 last, + int rm, int esz) +{ + TCGv_ptr p = tcg_temp_new_ptr(); + TCGv_i64 r; + + /* Convert offset into vector into offset into ENV. + * The final adjustment for the vector register base + * is added via constant offset to the load. + */ +#ifdef HOST_WORDS_BIGENDIAN + /* Adjust for element ordering. See vec_reg_offset. */ + if (esz < 3) { + tcg_gen_xori_i32(last, last, 8 - (1 << esz)); + } +#endif + tcg_gen_ext_i32_ptr(p, last); + tcg_gen_add_ptr(p, p, cpu_env); + + r = load_esz(p, vec_full_reg_offset(s, rm), esz); + tcg_temp_free_ptr(p); + + return r; +} + +/* Compute CLAST for a Zreg. */ +static bool do_clast_vector(DisasContext *s, arg_rprr_esz *a, bool before) +{ + TCGv_i32 last; + TCGLabel *over; + TCGv_i64 ele; + unsigned vsz, esz = a->esz; + + if (!sve_access_check(s)) { + return true; + } + + last = tcg_temp_local_new_i32(); + over = gen_new_label(); + + find_last_active(s, last, esz, a->pg); + + /* There is of course no movcond for a 2048-bit vector, + * so we must branch over the actual store. + */ + tcg_gen_brcondi_i32(TCG_COND_LT, last, 0, over); + + if (!before) { + incr_last_active(s, last, esz); + } + + ele = load_last_active(s, last, a->rm, esz); + tcg_temp_free_i32(last); + + vsz = vec_full_reg_size(s); + tcg_gen_gvec_dup_i64(esz, vec_full_reg_offset(s, a->rd), vsz, vsz, ele); + tcg_temp_free_i64(ele); + + /* If this insn used MOVPRFX, we may need a second move. */ + if (a->rd != a->rn) { + TCGLabel *done = gen_new_label(); + tcg_gen_br(done); + + gen_set_label(over); + do_mov_z(s, a->rd, a->rn); + + gen_set_label(done); + } else { + gen_set_label(over); + } + return true; +} + +static bool trans_CLASTA_z(DisasContext *s, arg_rprr_esz *a, uint32_t insn) +{ + return do_clast_vector(s, a, false); +} + +static bool trans_CLASTB_z(DisasContext *s, arg_rprr_esz *a, uint32_t insn) +{ + return do_clast_vector(s, a, true); +} + +/* Compute CLAST for a scalar. */ +static void do_clast_scalar(DisasContext *s, int esz, int pg, int rm, + bool before, TCGv_i64 reg_val) +{ + TCGv_i32 last = tcg_temp_new_i32(); + TCGv_i64 ele, cmp, zero; + + find_last_active(s, last, esz, pg); + + /* Extend the original value of last prior to incrementing. */ + cmp = tcg_temp_new_i64(); + tcg_gen_ext_i32_i64(cmp, last); + + if (!before) { + incr_last_active(s, last, esz); + } + + /* The conceit here is that while last < 0 indicates not found, after + * adjusting for cpu_env->vfp.zregs[rm], it is still a valid address + * from which we can load garbage. We then discard the garbage with + * a conditional move. + */ + ele = load_last_active(s, last, rm, esz); + tcg_temp_free_i32(last); + + zero = tcg_const_i64(0); + tcg_gen_movcond_i64(TCG_COND_GE, reg_val, cmp, zero, ele, reg_val); + + tcg_temp_free_i64(zero); + tcg_temp_free_i64(cmp); + tcg_temp_free_i64(ele); +} + +/* Compute CLAST for a Vreg. */ +static bool do_clast_fp(DisasContext *s, arg_rpr_esz *a, bool before) +{ + if (sve_access_check(s)) { + int esz = a->esz; + int ofs = vec_reg_offset(s, a->rd, 0, esz); + TCGv_i64 reg = load_esz(cpu_env, ofs, esz); + + do_clast_scalar(s, esz, a->pg, a->rn, before, reg); + write_fp_dreg(s, a->rd, reg); + tcg_temp_free_i64(reg); + } + return true; +} + +static bool trans_CLASTA_v(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_clast_fp(s, a, false); +} + +static bool trans_CLASTB_v(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_clast_fp(s, a, true); +} + +/* Compute CLAST for a Xreg. */ +static bool do_clast_general(DisasContext *s, arg_rpr_esz *a, bool before) +{ + TCGv_i64 reg; + + if (!sve_access_check(s)) { + return true; + } + + reg = cpu_reg(s, a->rd); + switch (a->esz) { + case 0: + tcg_gen_ext8u_i64(reg, reg); + break; + case 1: + tcg_gen_ext16u_i64(reg, reg); + break; + case 2: + tcg_gen_ext32u_i64(reg, reg); + break; + case 3: + break; + default: + g_assert_not_reached(); + } + + do_clast_scalar(s, a->esz, a->pg, a->rn, before, reg); + return true; +} + +static bool trans_CLASTA_r(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_clast_general(s, a, false); +} + +static bool trans_CLASTB_r(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_clast_general(s, a, true); +} + +/* Compute LAST for a scalar. */ +static TCGv_i64 do_last_scalar(DisasContext *s, int esz, + int pg, int rm, bool before) +{ + TCGv_i32 last = tcg_temp_new_i32(); + TCGv_i64 ret; + + find_last_active(s, last, esz, pg); + if (before) { + wrap_last_active(s, last, esz); + } else { + incr_last_active(s, last, esz); + } + + ret = load_last_active(s, last, rm, esz); + tcg_temp_free_i32(last); + return ret; +} + +/* Compute LAST for a Vreg. */ +static bool do_last_fp(DisasContext *s, arg_rpr_esz *a, bool before) +{ + if (sve_access_check(s)) { + TCGv_i64 val = do_last_scalar(s, a->esz, a->pg, a->rn, before); + write_fp_dreg(s, a->rd, val); + tcg_temp_free_i64(val); + } + return true; +} + +static bool trans_LASTA_v(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_last_fp(s, a, false); +} + +static bool trans_LASTB_v(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_last_fp(s, a, true); +} + +/* Compute LAST for a Xreg. */ +static bool do_last_general(DisasContext *s, arg_rpr_esz *a, bool before) +{ + if (sve_access_check(s)) { + TCGv_i64 val = do_last_scalar(s, a->esz, a->pg, a->rn, before); + tcg_gen_mov_i64(cpu_reg(s, a->rd), val); + tcg_temp_free_i64(val); + } + return true; +} + +static bool trans_LASTA_r(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_last_general(s, a, false); +} + +static bool trans_LASTB_r(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_last_general(s, a, true); +} + +static bool trans_CPY_m_r(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + if (sve_access_check(s)) { + do_cpy_m(s, a->esz, a->rd, a->rd, a->pg, cpu_reg_sp(s, a->rn)); + } + return true; +} + +static bool trans_CPY_m_v(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + if (sve_access_check(s)) { + int ofs = vec_reg_offset(s, a->rn, 0, a->esz); + TCGv_i64 t = load_esz(cpu_env, ofs, a->esz); + do_cpy_m(s, a->esz, a->rd, a->rd, a->pg, t); + tcg_temp_free_i64(t); + } + return true; +} + +static bool trans_REVB(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + static gen_helper_gvec_3 * const fns[4] = { + NULL, + gen_helper_sve_revb_h, + gen_helper_sve_revb_s, + gen_helper_sve_revb_d, + }; + return do_zpz_ool(s, a, fns[a->esz]); +} + +static bool trans_REVH(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + static gen_helper_gvec_3 * const fns[4] = { + NULL, + NULL, + gen_helper_sve_revh_s, + gen_helper_sve_revh_d, + }; + return do_zpz_ool(s, a, fns[a->esz]); +} + +static bool trans_REVW(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ool(s, a, a->esz == 3 ? gen_helper_sve_revw_d : NULL); +} + +static bool trans_RBIT(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + static gen_helper_gvec_3 * const fns[4] = { + gen_helper_sve_rbit_b, + gen_helper_sve_rbit_h, + gen_helper_sve_rbit_s, + gen_helper_sve_rbit_d, + }; + return do_zpz_ool(s, a, fns[a->esz]); +} + +static bool trans_SPLICE(DisasContext *s, arg_rprr_esz *a, uint32_t insn) +{ + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_4_ool(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + pred_full_reg_offset(s, a->pg), + vsz, vsz, a->esz, gen_helper_sve_splice); + } + return true; +} + +/* + *** SVE Integer Compare - Vectors Group + */ + +static bool do_ppzz_flags(DisasContext *s, arg_rprr_esz *a, + gen_helper_gvec_flags_4 *gen_fn) +{ + TCGv_ptr pd, zn, zm, pg; + unsigned vsz; + TCGv_i32 t; + + if (gen_fn == NULL) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + vsz = vec_full_reg_size(s); + t = tcg_const_i32(simd_desc(vsz, vsz, 0)); + pd = tcg_temp_new_ptr(); + zn = tcg_temp_new_ptr(); + zm = tcg_temp_new_ptr(); + pg = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(pd, cpu_env, pred_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(zn, cpu_env, vec_full_reg_offset(s, a->rn)); + tcg_gen_addi_ptr(zm, cpu_env, vec_full_reg_offset(s, a->rm)); + tcg_gen_addi_ptr(pg, cpu_env, pred_full_reg_offset(s, a->pg)); + + gen_fn(t, pd, zn, zm, pg, t); + + tcg_temp_free_ptr(pd); + tcg_temp_free_ptr(zn); + tcg_temp_free_ptr(zm); + tcg_temp_free_ptr(pg); + + do_pred_flags(t); + + tcg_temp_free_i32(t); + return true; +} + +#define DO_PPZZ(NAME, name) \ +static bool trans_##NAME##_ppzz(DisasContext *s, arg_rprr_esz *a, \ + uint32_t insn) \ +{ \ + static gen_helper_gvec_flags_4 * const fns[4] = { \ + gen_helper_sve_##name##_ppzz_b, gen_helper_sve_##name##_ppzz_h, \ + gen_helper_sve_##name##_ppzz_s, gen_helper_sve_##name##_ppzz_d, \ + }; \ + return do_ppzz_flags(s, a, fns[a->esz]); \ +} + +DO_PPZZ(CMPEQ, cmpeq) +DO_PPZZ(CMPNE, cmpne) +DO_PPZZ(CMPGT, cmpgt) +DO_PPZZ(CMPGE, cmpge) +DO_PPZZ(CMPHI, cmphi) +DO_PPZZ(CMPHS, cmphs) + +#undef DO_PPZZ + +#define DO_PPZW(NAME, name) \ +static bool trans_##NAME##_ppzw(DisasContext *s, arg_rprr_esz *a, \ + uint32_t insn) \ +{ \ + static gen_helper_gvec_flags_4 * const fns[4] = { \ + gen_helper_sve_##name##_ppzw_b, gen_helper_sve_##name##_ppzw_h, \ + gen_helper_sve_##name##_ppzw_s, NULL \ + }; \ + return do_ppzz_flags(s, a, fns[a->esz]); \ +} + +DO_PPZW(CMPEQ, cmpeq) +DO_PPZW(CMPNE, cmpne) +DO_PPZW(CMPGT, cmpgt) +DO_PPZW(CMPGE, cmpge) +DO_PPZW(CMPHI, cmphi) +DO_PPZW(CMPHS, cmphs) +DO_PPZW(CMPLT, cmplt) +DO_PPZW(CMPLE, cmple) +DO_PPZW(CMPLO, cmplo) +DO_PPZW(CMPLS, cmpls) + +#undef DO_PPZW + +/* + *** SVE Integer Compare - Immediate Groups + */ + +static bool do_ppzi_flags(DisasContext *s, arg_rpri_esz *a, + gen_helper_gvec_flags_3 *gen_fn) +{ + TCGv_ptr pd, zn, pg; + unsigned vsz; + TCGv_i32 t; + + if (gen_fn == NULL) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + vsz = vec_full_reg_size(s); + t = tcg_const_i32(simd_desc(vsz, vsz, a->imm)); + pd = tcg_temp_new_ptr(); + zn = tcg_temp_new_ptr(); + pg = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(pd, cpu_env, pred_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(zn, cpu_env, vec_full_reg_offset(s, a->rn)); + tcg_gen_addi_ptr(pg, cpu_env, pred_full_reg_offset(s, a->pg)); + + gen_fn(t, pd, zn, pg, t); + + tcg_temp_free_ptr(pd); + tcg_temp_free_ptr(zn); + tcg_temp_free_ptr(pg); + + do_pred_flags(t); + + tcg_temp_free_i32(t); + return true; +} + +#define DO_PPZI(NAME, name) \ +static bool trans_##NAME##_ppzi(DisasContext *s, arg_rpri_esz *a, \ + uint32_t insn) \ +{ \ + static gen_helper_gvec_flags_3 * const fns[4] = { \ + gen_helper_sve_##name##_ppzi_b, gen_helper_sve_##name##_ppzi_h, \ + gen_helper_sve_##name##_ppzi_s, gen_helper_sve_##name##_ppzi_d, \ + }; \ + return do_ppzi_flags(s, a, fns[a->esz]); \ +} + +DO_PPZI(CMPEQ, cmpeq) +DO_PPZI(CMPNE, cmpne) +DO_PPZI(CMPGT, cmpgt) +DO_PPZI(CMPGE, cmpge) +DO_PPZI(CMPHI, cmphi) +DO_PPZI(CMPHS, cmphs) +DO_PPZI(CMPLT, cmplt) +DO_PPZI(CMPLE, cmple) +DO_PPZI(CMPLO, cmplo) +DO_PPZI(CMPLS, cmpls) + +#undef DO_PPZI + +/* + *** SVE Partition Break Group + */ + +static bool do_brk3(DisasContext *s, arg_rprr_s *a, + gen_helper_gvec_4 *fn, gen_helper_gvec_flags_4 *fn_s) +{ + if (!sve_access_check(s)) { + return true; + } + + unsigned vsz = pred_full_reg_size(s); + + /* Predicate sizes may be smaller and cannot use simd_desc. */ + TCGv_ptr d = tcg_temp_new_ptr(); + TCGv_ptr n = tcg_temp_new_ptr(); + TCGv_ptr m = tcg_temp_new_ptr(); + TCGv_ptr g = tcg_temp_new_ptr(); + TCGv_i32 t = tcg_const_i32(vsz - 2); + + tcg_gen_addi_ptr(d, cpu_env, pred_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(n, cpu_env, pred_full_reg_offset(s, a->rn)); + tcg_gen_addi_ptr(m, cpu_env, pred_full_reg_offset(s, a->rm)); + tcg_gen_addi_ptr(g, cpu_env, pred_full_reg_offset(s, a->pg)); + + if (a->s) { + fn_s(t, d, n, m, g, t); + do_pred_flags(t); + } else { + fn(d, n, m, g, t); + } + tcg_temp_free_ptr(d); + tcg_temp_free_ptr(n); + tcg_temp_free_ptr(m); + tcg_temp_free_ptr(g); + tcg_temp_free_i32(t); + return true; +} + +static bool do_brk2(DisasContext *s, arg_rpr_s *a, + gen_helper_gvec_3 *fn, gen_helper_gvec_flags_3 *fn_s) +{ + if (!sve_access_check(s)) { + return true; + } + + unsigned vsz = pred_full_reg_size(s); + + /* Predicate sizes may be smaller and cannot use simd_desc. */ + TCGv_ptr d = tcg_temp_new_ptr(); + TCGv_ptr n = tcg_temp_new_ptr(); + TCGv_ptr g = tcg_temp_new_ptr(); + TCGv_i32 t = tcg_const_i32(vsz - 2); + + tcg_gen_addi_ptr(d, cpu_env, pred_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(n, cpu_env, pred_full_reg_offset(s, a->rn)); + tcg_gen_addi_ptr(g, cpu_env, pred_full_reg_offset(s, a->pg)); + + if (a->s) { + fn_s(t, d, n, g, t); + do_pred_flags(t); + } else { + fn(d, n, g, t); + } + tcg_temp_free_ptr(d); + tcg_temp_free_ptr(n); + tcg_temp_free_ptr(g); + tcg_temp_free_i32(t); + return true; +} + +static bool trans_BRKPA(DisasContext *s, arg_rprr_s *a, uint32_t insn) +{ + return do_brk3(s, a, gen_helper_sve_brkpa, gen_helper_sve_brkpas); +} + +static bool trans_BRKPB(DisasContext *s, arg_rprr_s *a, uint32_t insn) +{ + return do_brk3(s, a, gen_helper_sve_brkpb, gen_helper_sve_brkpbs); +} + +static bool trans_BRKA_m(DisasContext *s, arg_rpr_s *a, uint32_t insn) +{ + return do_brk2(s, a, gen_helper_sve_brka_m, gen_helper_sve_brkas_m); +} + +static bool trans_BRKB_m(DisasContext *s, arg_rpr_s *a, uint32_t insn) +{ + return do_brk2(s, a, gen_helper_sve_brkb_m, gen_helper_sve_brkbs_m); +} + +static bool trans_BRKA_z(DisasContext *s, arg_rpr_s *a, uint32_t insn) +{ + return do_brk2(s, a, gen_helper_sve_brka_z, gen_helper_sve_brkas_z); +} + +static bool trans_BRKB_z(DisasContext *s, arg_rpr_s *a, uint32_t insn) +{ + return do_brk2(s, a, gen_helper_sve_brkb_z, gen_helper_sve_brkbs_z); +} + +static bool trans_BRKN(DisasContext *s, arg_rpr_s *a, uint32_t insn) +{ + return do_brk2(s, a, gen_helper_sve_brkn, gen_helper_sve_brkns); +} + +/* + *** SVE Predicate Count Group + */ + +static void do_cntp(DisasContext *s, TCGv_i64 val, int esz, int pn, int pg) +{ + unsigned psz = pred_full_reg_size(s); + + if (psz <= 8) { + uint64_t psz_mask; + + tcg_gen_ld_i64(val, cpu_env, pred_full_reg_offset(s, pn)); + if (pn != pg) { + TCGv_i64 g = tcg_temp_new_i64(); + tcg_gen_ld_i64(g, cpu_env, pred_full_reg_offset(s, pg)); + tcg_gen_and_i64(val, val, g); + tcg_temp_free_i64(g); + } + + /* Reduce the pred_esz_masks value simply to reduce the + * size of the code generated here. + */ + psz_mask = MAKE_64BIT_MASK(0, psz * 8); + tcg_gen_andi_i64(val, val, pred_esz_masks[esz] & psz_mask); + + tcg_gen_ctpop_i64(val, val); + } else { + TCGv_ptr t_pn = tcg_temp_new_ptr(); + TCGv_ptr t_pg = tcg_temp_new_ptr(); + unsigned desc; + TCGv_i32 t_desc; + + desc = psz - 2; + desc = deposit32(desc, SIMD_DATA_SHIFT, 2, esz); + + tcg_gen_addi_ptr(t_pn, cpu_env, pred_full_reg_offset(s, pn)); + tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg)); + t_desc = tcg_const_i32(desc); + + gen_helper_sve_cntp(val, t_pn, t_pg, t_desc); + tcg_temp_free_ptr(t_pn); + tcg_temp_free_ptr(t_pg); + tcg_temp_free_i32(t_desc); + } +} + +static bool trans_CNTP(DisasContext *s, arg_CNTP *a, uint32_t insn) +{ + if (sve_access_check(s)) { + do_cntp(s, cpu_reg(s, a->rd), a->esz, a->rn, a->pg); + } + return true; +} + +static bool trans_INCDECP_r(DisasContext *s, arg_incdec_pred *a, + uint32_t insn) +{ + if (sve_access_check(s)) { + TCGv_i64 reg = cpu_reg(s, a->rd); + TCGv_i64 val = tcg_temp_new_i64(); + + do_cntp(s, val, a->esz, a->pg, a->pg); + if (a->d) { + tcg_gen_sub_i64(reg, reg, val); + } else { + tcg_gen_add_i64(reg, reg, val); + } + tcg_temp_free_i64(val); + } + return true; +} + +static bool trans_INCDECP_z(DisasContext *s, arg_incdec2_pred *a, + uint32_t insn) +{ + if (a->esz == 0) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_i64 val = tcg_temp_new_i64(); + GVecGen2sFn *gvec_fn = a->d ? tcg_gen_gvec_subs : tcg_gen_gvec_adds; + + do_cntp(s, val, a->esz, a->pg, a->pg); + gvec_fn(a->esz, vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), val, vsz, vsz); + } + return true; +} + +static bool trans_SINCDECP_r_32(DisasContext *s, arg_incdec_pred *a, + uint32_t insn) +{ + if (sve_access_check(s)) { + TCGv_i64 reg = cpu_reg(s, a->rd); + TCGv_i64 val = tcg_temp_new_i64(); + + do_cntp(s, val, a->esz, a->pg, a->pg); + do_sat_addsub_32(reg, val, a->u, a->d); + } + return true; +} + +static bool trans_SINCDECP_r_64(DisasContext *s, arg_incdec_pred *a, + uint32_t insn) +{ + if (sve_access_check(s)) { + TCGv_i64 reg = cpu_reg(s, a->rd); + TCGv_i64 val = tcg_temp_new_i64(); + + do_cntp(s, val, a->esz, a->pg, a->pg); + do_sat_addsub_64(reg, val, a->u, a->d); + } + return true; +} + +static bool trans_SINCDECP_z(DisasContext *s, arg_incdec2_pred *a, + uint32_t insn) +{ + if (a->esz == 0) { + return false; + } + if (sve_access_check(s)) { + TCGv_i64 val = tcg_temp_new_i64(); + do_cntp(s, val, a->esz, a->pg, a->pg); + do_sat_addsub_vec(s, a->esz, a->rd, a->rn, val, a->u, a->d); + } + return true; +} + +/* + *** SVE Integer Compare Scalars Group + */ + +static bool trans_CTERM(DisasContext *s, arg_CTERM *a, uint32_t insn) +{ + if (!sve_access_check(s)) { + return true; + } + + TCGCond cond = (a->ne ? TCG_COND_NE : TCG_COND_EQ); + TCGv_i64 rn = read_cpu_reg(s, a->rn, a->sf); + TCGv_i64 rm = read_cpu_reg(s, a->rm, a->sf); + TCGv_i64 cmp = tcg_temp_new_i64(); + + tcg_gen_setcond_i64(cond, cmp, rn, rm); + tcg_gen_extrl_i64_i32(cpu_NF, cmp); + tcg_temp_free_i64(cmp); + + /* VF = !NF & !CF. */ + tcg_gen_xori_i32(cpu_VF, cpu_NF, 1); + tcg_gen_andc_i32(cpu_VF, cpu_VF, cpu_CF); + + /* Both NF and VF actually look at bit 31. */ + tcg_gen_neg_i32(cpu_NF, cpu_NF); + tcg_gen_neg_i32(cpu_VF, cpu_VF); + return true; +} + +static bool trans_WHILE(DisasContext *s, arg_WHILE *a, uint32_t insn) +{ + if (!sve_access_check(s)) { + return true; + } + + TCGv_i64 op0 = read_cpu_reg(s, a->rn, 1); + TCGv_i64 op1 = read_cpu_reg(s, a->rm, 1); + TCGv_i64 t0 = tcg_temp_new_i64(); + TCGv_i64 t1 = tcg_temp_new_i64(); + TCGv_i32 t2, t3; + TCGv_ptr ptr; + unsigned desc, vsz = vec_full_reg_size(s); + TCGCond cond; + + if (!a->sf) { + if (a->u) { + tcg_gen_ext32u_i64(op0, op0); + tcg_gen_ext32u_i64(op1, op1); + } else { + tcg_gen_ext32s_i64(op0, op0); + tcg_gen_ext32s_i64(op1, op1); + } + } + + /* For the helper, compress the different conditions into a computation + * of how many iterations for which the condition is true. + * + * This is slightly complicated by 0 <= UINT64_MAX, which is nominally + * 2**64 iterations, overflowing to 0. Of course, predicate registers + * aren't that large, so any value >= predicate size is sufficient. + */ + tcg_gen_sub_i64(t0, op1, op0); + + /* t0 = MIN(op1 - op0, vsz). */ + tcg_gen_movi_i64(t1, vsz); + tcg_gen_umin_i64(t0, t0, t1); + if (a->eq) { + /* Equality means one more iteration. */ + tcg_gen_addi_i64(t0, t0, 1); + } + + /* t0 = (condition true ? t0 : 0). */ + cond = (a->u + ? (a->eq ? TCG_COND_LEU : TCG_COND_LTU) + : (a->eq ? TCG_COND_LE : TCG_COND_LT)); + tcg_gen_movi_i64(t1, 0); + tcg_gen_movcond_i64(cond, t0, op0, op1, t0, t1); + + t2 = tcg_temp_new_i32(); + tcg_gen_extrl_i64_i32(t2, t0); + tcg_temp_free_i64(t0); + tcg_temp_free_i64(t1); + + desc = (vsz / 8) - 2; + desc = deposit32(desc, SIMD_DATA_SHIFT, 2, a->esz); + t3 = tcg_const_i32(desc); + + ptr = tcg_temp_new_ptr(); + tcg_gen_addi_ptr(ptr, cpu_env, pred_full_reg_offset(s, a->rd)); + + gen_helper_sve_while(t2, ptr, t2, t3); + do_pred_flags(t2); + + tcg_temp_free_ptr(ptr); + tcg_temp_free_i32(t2); + tcg_temp_free_i32(t3); + return true; +} + +/* + *** SVE Integer Wide Immediate - Unpredicated Group + */ + +static bool trans_FDUP(DisasContext *s, arg_FDUP *a, uint32_t insn) +{ + if (a->esz == 0) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + int dofs = vec_full_reg_offset(s, a->rd); + uint64_t imm; + + /* Decode the VFP immediate. */ + imm = vfp_expand_imm(a->esz, a->imm); + imm = dup_const(a->esz, imm); + + tcg_gen_gvec_dup64i(dofs, vsz, vsz, imm); + } + return true; +} + +static bool trans_DUP_i(DisasContext *s, arg_DUP_i *a, uint32_t insn) +{ + if (a->esz == 0 && extract32(insn, 13, 1)) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + int dofs = vec_full_reg_offset(s, a->rd); + + tcg_gen_gvec_dup64i(dofs, vsz, vsz, dup_const(a->esz, a->imm)); + } + return true; +} + +static bool trans_ADD_zzi(DisasContext *s, arg_rri_esz *a, uint32_t insn) +{ + if (a->esz == 0 && extract32(insn, 13, 1)) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_addi(a->esz, vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), a->imm, vsz, vsz); + } + return true; +} + +static bool trans_SUB_zzi(DisasContext *s, arg_rri_esz *a, uint32_t insn) +{ + a->imm = -a->imm; + return trans_ADD_zzi(s, a, insn); +} + +static bool trans_SUBR_zzi(DisasContext *s, arg_rri_esz *a, uint32_t insn) +{ + static const GVecGen2s op[4] = { + { .fni8 = tcg_gen_vec_sub8_i64, + .fniv = tcg_gen_sub_vec, + .fno = gen_helper_sve_subri_b, + .opc = INDEX_op_sub_vec, + .vece = MO_8, + .scalar_first = true }, + { .fni8 = tcg_gen_vec_sub16_i64, + .fniv = tcg_gen_sub_vec, + .fno = gen_helper_sve_subri_h, + .opc = INDEX_op_sub_vec, + .vece = MO_16, + .scalar_first = true }, + { .fni4 = tcg_gen_sub_i32, + .fniv = tcg_gen_sub_vec, + .fno = gen_helper_sve_subri_s, + .opc = INDEX_op_sub_vec, + .vece = MO_32, + .scalar_first = true }, + { .fni8 = tcg_gen_sub_i64, + .fniv = tcg_gen_sub_vec, + .fno = gen_helper_sve_subri_d, + .opc = INDEX_op_sub_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64, + .scalar_first = true } + }; + + if (a->esz == 0 && extract32(insn, 13, 1)) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_i64 c = tcg_const_i64(a->imm); + tcg_gen_gvec_2s(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vsz, vsz, c, &op[a->esz]); + tcg_temp_free_i64(c); + } + return true; +} + +static bool trans_MUL_zzi(DisasContext *s, arg_rri_esz *a, uint32_t insn) +{ + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_muli(a->esz, vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), a->imm, vsz, vsz); + } + return true; +} + +static bool do_zzi_sat(DisasContext *s, arg_rri_esz *a, uint32_t insn, + bool u, bool d) +{ + if (a->esz == 0 && extract32(insn, 13, 1)) { + return false; + } + if (sve_access_check(s)) { + TCGv_i64 val = tcg_const_i64(a->imm); + do_sat_addsub_vec(s, a->esz, a->rd, a->rn, val, u, d); + tcg_temp_free_i64(val); + } + return true; +} + +static bool trans_SQADD_zzi(DisasContext *s, arg_rri_esz *a, uint32_t insn) +{ + return do_zzi_sat(s, a, insn, false, false); +} + +static bool trans_UQADD_zzi(DisasContext *s, arg_rri_esz *a, uint32_t insn) +{ + return do_zzi_sat(s, a, insn, true, false); +} + +static bool trans_SQSUB_zzi(DisasContext *s, arg_rri_esz *a, uint32_t insn) +{ + return do_zzi_sat(s, a, insn, false, true); +} + +static bool trans_UQSUB_zzi(DisasContext *s, arg_rri_esz *a, uint32_t insn) +{ + return do_zzi_sat(s, a, insn, true, true); +} + +static bool do_zzi_ool(DisasContext *s, arg_rri_esz *a, gen_helper_gvec_2i *fn) +{ + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_i64 c = tcg_const_i64(a->imm); + + tcg_gen_gvec_2i_ool(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + c, vsz, vsz, 0, fn); + tcg_temp_free_i64(c); + } + return true; +} + +#define DO_ZZI(NAME, name) \ +static bool trans_##NAME##_zzi(DisasContext *s, arg_rri_esz *a, \ + uint32_t insn) \ +{ \ + static gen_helper_gvec_2i * const fns[4] = { \ + gen_helper_sve_##name##i_b, gen_helper_sve_##name##i_h, \ + gen_helper_sve_##name##i_s, gen_helper_sve_##name##i_d, \ + }; \ + return do_zzi_ool(s, a, fns[a->esz]); \ +} + +DO_ZZI(SMAX, smax) +DO_ZZI(UMAX, umax) +DO_ZZI(SMIN, smin) +DO_ZZI(UMIN, umin) + +#undef DO_ZZI + +static bool trans_DOT_zzz(DisasContext *s, arg_DOT_zzz *a, uint32_t insn) +{ + static gen_helper_gvec_3 * const fns[2][2] = { + { gen_helper_gvec_sdot_b, gen_helper_gvec_sdot_h }, + { gen_helper_gvec_udot_b, gen_helper_gvec_udot_h } + }; + + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_3_ool(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + vsz, vsz, 0, fns[a->u][a->sz]); + } + return true; +} + +static bool trans_DOT_zzx(DisasContext *s, arg_DOT_zzx *a, uint32_t insn) +{ + static gen_helper_gvec_3 * const fns[2][2] = { + { gen_helper_gvec_sdot_idx_b, gen_helper_gvec_sdot_idx_h }, + { gen_helper_gvec_udot_idx_b, gen_helper_gvec_udot_idx_h } + }; + + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + tcg_gen_gvec_3_ool(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + vsz, vsz, a->index, fns[a->u][a->sz]); + } + return true; +} + + +/* + *** SVE Floating Point Multiply-Add Indexed Group + */ + +static bool trans_FMLA_zzxz(DisasContext *s, arg_FMLA_zzxz *a, uint32_t insn) +{ + static gen_helper_gvec_4_ptr * const fns[3] = { + gen_helper_gvec_fmla_idx_h, + gen_helper_gvec_fmla_idx_s, + gen_helper_gvec_fmla_idx_d, + }; + + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16); + tcg_gen_gvec_4_ptr(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + vec_full_reg_offset(s, a->ra), + status, vsz, vsz, (a->index << 1) | a->sub, + fns[a->esz - 1]); + tcg_temp_free_ptr(status); + } + return true; +} + +/* + *** SVE Floating Point Multiply Indexed Group + */ + +static bool trans_FMUL_zzx(DisasContext *s, arg_FMUL_zzx *a, uint32_t insn) +{ + static gen_helper_gvec_3_ptr * const fns[3] = { + gen_helper_gvec_fmul_idx_h, + gen_helper_gvec_fmul_idx_s, + gen_helper_gvec_fmul_idx_d, + }; + + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16); + tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + status, vsz, vsz, a->index, fns[a->esz - 1]); + tcg_temp_free_ptr(status); + } + return true; +} + +/* + *** SVE Floating Point Fast Reduction Group + */ + +typedef void gen_helper_fp_reduce(TCGv_i64, TCGv_ptr, TCGv_ptr, + TCGv_ptr, TCGv_i32); + +static void do_reduce(DisasContext *s, arg_rpr_esz *a, + gen_helper_fp_reduce *fn) +{ + unsigned vsz = vec_full_reg_size(s); + unsigned p2vsz = pow2ceil(vsz); + TCGv_i32 t_desc = tcg_const_i32(simd_desc(vsz, p2vsz, 0)); + TCGv_ptr t_zn, t_pg, status; + TCGv_i64 temp; + + temp = tcg_temp_new_i64(); + t_zn = tcg_temp_new_ptr(); + t_pg = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(t_zn, cpu_env, vec_full_reg_offset(s, a->rn)); + tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, a->pg)); + status = get_fpstatus_ptr(a->esz == MO_16); + + fn(temp, t_zn, t_pg, status, t_desc); + tcg_temp_free_ptr(t_zn); + tcg_temp_free_ptr(t_pg); + tcg_temp_free_ptr(status); + tcg_temp_free_i32(t_desc); + + write_fp_dreg(s, a->rd, temp); + tcg_temp_free_i64(temp); +} + +#define DO_VPZ(NAME, name) \ +static bool trans_##NAME(DisasContext *s, arg_rpr_esz *a, uint32_t insn) \ +{ \ + static gen_helper_fp_reduce * const fns[3] = { \ + gen_helper_sve_##name##_h, \ + gen_helper_sve_##name##_s, \ + gen_helper_sve_##name##_d, \ + }; \ + if (a->esz == 0) { \ + return false; \ + } \ + if (sve_access_check(s)) { \ + do_reduce(s, a, fns[a->esz - 1]); \ + } \ + return true; \ +} + +DO_VPZ(FADDV, faddv) +DO_VPZ(FMINNMV, fminnmv) +DO_VPZ(FMAXNMV, fmaxnmv) +DO_VPZ(FMINV, fminv) +DO_VPZ(FMAXV, fmaxv) + +/* + *** SVE Floating Point Unary Operations - Unpredicated Group + */ + +static void do_zz_fp(DisasContext *s, arg_rr_esz *a, gen_helper_gvec_2_ptr *fn) +{ + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16); + + tcg_gen_gvec_2_ptr(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + status, vsz, vsz, 0, fn); + tcg_temp_free_ptr(status); +} + +static bool trans_FRECPE(DisasContext *s, arg_rr_esz *a, uint32_t insn) +{ + static gen_helper_gvec_2_ptr * const fns[3] = { + gen_helper_gvec_frecpe_h, + gen_helper_gvec_frecpe_s, + gen_helper_gvec_frecpe_d, + }; + if (a->esz == 0) { + return false; + } + if (sve_access_check(s)) { + do_zz_fp(s, a, fns[a->esz - 1]); + } + return true; +} + +static bool trans_FRSQRTE(DisasContext *s, arg_rr_esz *a, uint32_t insn) +{ + static gen_helper_gvec_2_ptr * const fns[3] = { + gen_helper_gvec_frsqrte_h, + gen_helper_gvec_frsqrte_s, + gen_helper_gvec_frsqrte_d, + }; + if (a->esz == 0) { + return false; + } + if (sve_access_check(s)) { + do_zz_fp(s, a, fns[a->esz - 1]); + } + return true; +} + +/* + *** SVE Floating Point Compare with Zero Group + */ + +static void do_ppz_fp(DisasContext *s, arg_rpr_esz *a, + gen_helper_gvec_3_ptr *fn) +{ + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16); + + tcg_gen_gvec_3_ptr(pred_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + pred_full_reg_offset(s, a->pg), + status, vsz, vsz, 0, fn); + tcg_temp_free_ptr(status); +} + +#define DO_PPZ(NAME, name) \ +static bool trans_##NAME(DisasContext *s, arg_rpr_esz *a, uint32_t insn) \ +{ \ + static gen_helper_gvec_3_ptr * const fns[3] = { \ + gen_helper_sve_##name##_h, \ + gen_helper_sve_##name##_s, \ + gen_helper_sve_##name##_d, \ + }; \ + if (a->esz == 0) { \ + return false; \ + } \ + if (sve_access_check(s)) { \ + do_ppz_fp(s, a, fns[a->esz - 1]); \ + } \ + return true; \ +} + +DO_PPZ(FCMGE_ppz0, fcmge0) +DO_PPZ(FCMGT_ppz0, fcmgt0) +DO_PPZ(FCMLE_ppz0, fcmle0) +DO_PPZ(FCMLT_ppz0, fcmlt0) +DO_PPZ(FCMEQ_ppz0, fcmeq0) +DO_PPZ(FCMNE_ppz0, fcmne0) + +#undef DO_PPZ + +/* + *** SVE floating-point trig multiply-add coefficient + */ + +static bool trans_FTMAD(DisasContext *s, arg_FTMAD *a, uint32_t insn) +{ + static gen_helper_gvec_3_ptr * const fns[3] = { + gen_helper_sve_ftmad_h, + gen_helper_sve_ftmad_s, + gen_helper_sve_ftmad_d, + }; + + if (a->esz == 0) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16); + tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + status, vsz, vsz, a->imm, fns[a->esz - 1]); + tcg_temp_free_ptr(status); + } + return true; +} + +/* + *** SVE Floating Point Accumulating Reduction Group + */ + +static bool trans_FADDA(DisasContext *s, arg_rprr_esz *a, uint32_t insn) +{ + typedef void fadda_fn(TCGv_i64, TCGv_i64, TCGv_ptr, + TCGv_ptr, TCGv_ptr, TCGv_i32); + static fadda_fn * const fns[3] = { + gen_helper_sve_fadda_h, + gen_helper_sve_fadda_s, + gen_helper_sve_fadda_d, + }; + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr t_rm, t_pg, t_fpst; + TCGv_i64 t_val; + TCGv_i32 t_desc; + + if (a->esz == 0) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + t_val = load_esz(cpu_env, vec_reg_offset(s, a->rn, 0, a->esz), a->esz); + t_rm = tcg_temp_new_ptr(); + t_pg = tcg_temp_new_ptr(); + tcg_gen_addi_ptr(t_rm, cpu_env, vec_full_reg_offset(s, a->rm)); + tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, a->pg)); + t_fpst = get_fpstatus_ptr(a->esz == MO_16); + t_desc = tcg_const_i32(simd_desc(vsz, vsz, 0)); + + fns[a->esz - 1](t_val, t_val, t_rm, t_pg, t_fpst, t_desc); + + tcg_temp_free_i32(t_desc); + tcg_temp_free_ptr(t_fpst); + tcg_temp_free_ptr(t_pg); + tcg_temp_free_ptr(t_rm); + + write_fp_dreg(s, a->rd, t_val); + tcg_temp_free_i64(t_val); + return true; +} + +/* + *** SVE Floating Point Arithmetic - Unpredicated Group + */ + +static bool do_zzz_fp(DisasContext *s, arg_rrr_esz *a, + gen_helper_gvec_3_ptr *fn) +{ + if (fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16); + tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + status, vsz, vsz, 0, fn); + tcg_temp_free_ptr(status); + } + return true; +} + + +#define DO_FP3(NAME, name) \ +static bool trans_##NAME(DisasContext *s, arg_rrr_esz *a, uint32_t insn) \ +{ \ + static gen_helper_gvec_3_ptr * const fns[4] = { \ + NULL, gen_helper_gvec_##name##_h, \ + gen_helper_gvec_##name##_s, gen_helper_gvec_##name##_d \ + }; \ + return do_zzz_fp(s, a, fns[a->esz]); \ +} + +DO_FP3(FADD_zzz, fadd) +DO_FP3(FSUB_zzz, fsub) +DO_FP3(FMUL_zzz, fmul) +DO_FP3(FTSMUL, ftsmul) +DO_FP3(FRECPS, recps) +DO_FP3(FRSQRTS, rsqrts) + +#undef DO_FP3 + +/* + *** SVE Floating Point Arithmetic - Predicated Group + */ + +static bool do_zpzz_fp(DisasContext *s, arg_rprr_esz *a, + gen_helper_gvec_4_ptr *fn) +{ + if (fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16); + tcg_gen_gvec_4_ptr(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + pred_full_reg_offset(s, a->pg), + status, vsz, vsz, 0, fn); + tcg_temp_free_ptr(status); + } + return true; +} + +#define DO_FP3(NAME, name) \ +static bool trans_##NAME(DisasContext *s, arg_rprr_esz *a, uint32_t insn) \ +{ \ + static gen_helper_gvec_4_ptr * const fns[4] = { \ + NULL, gen_helper_sve_##name##_h, \ + gen_helper_sve_##name##_s, gen_helper_sve_##name##_d \ + }; \ + return do_zpzz_fp(s, a, fns[a->esz]); \ +} + +DO_FP3(FADD_zpzz, fadd) +DO_FP3(FSUB_zpzz, fsub) +DO_FP3(FMUL_zpzz, fmul) +DO_FP3(FMIN_zpzz, fmin) +DO_FP3(FMAX_zpzz, fmax) +DO_FP3(FMINNM_zpzz, fminnum) +DO_FP3(FMAXNM_zpzz, fmaxnum) +DO_FP3(FABD, fabd) +DO_FP3(FSCALE, fscalbn) +DO_FP3(FDIV, fdiv) +DO_FP3(FMULX, fmulx) + +#undef DO_FP3 + +typedef void gen_helper_sve_fp2scalar(TCGv_ptr, TCGv_ptr, TCGv_ptr, + TCGv_i64, TCGv_ptr, TCGv_i32); + +static void do_fp_scalar(DisasContext *s, int zd, int zn, int pg, bool is_fp16, + TCGv_i64 scalar, gen_helper_sve_fp2scalar *fn) +{ + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr t_zd, t_zn, t_pg, status; + TCGv_i32 desc; + + t_zd = tcg_temp_new_ptr(); + t_zn = tcg_temp_new_ptr(); + t_pg = tcg_temp_new_ptr(); + tcg_gen_addi_ptr(t_zd, cpu_env, vec_full_reg_offset(s, zd)); + tcg_gen_addi_ptr(t_zn, cpu_env, vec_full_reg_offset(s, zn)); + tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg)); + + status = get_fpstatus_ptr(is_fp16); + desc = tcg_const_i32(simd_desc(vsz, vsz, 0)); + fn(t_zd, t_zn, t_pg, scalar, status, desc); + + tcg_temp_free_i32(desc); + tcg_temp_free_ptr(status); + tcg_temp_free_ptr(t_pg); + tcg_temp_free_ptr(t_zn); + tcg_temp_free_ptr(t_zd); +} + +static void do_fp_imm(DisasContext *s, arg_rpri_esz *a, uint64_t imm, + gen_helper_sve_fp2scalar *fn) +{ + TCGv_i64 temp = tcg_const_i64(imm); + do_fp_scalar(s, a->rd, a->rn, a->pg, a->esz == MO_16, temp, fn); + tcg_temp_free_i64(temp); +} + +#define DO_FP_IMM(NAME, name, const0, const1) \ +static bool trans_##NAME##_zpzi(DisasContext *s, arg_rpri_esz *a, \ + uint32_t insn) \ +{ \ + static gen_helper_sve_fp2scalar * const fns[3] = { \ + gen_helper_sve_##name##_h, \ + gen_helper_sve_##name##_s, \ + gen_helper_sve_##name##_d \ + }; \ + static uint64_t const val[3][2] = { \ + { float16_##const0, float16_##const1 }, \ + { float32_##const0, float32_##const1 }, \ + { float64_##const0, float64_##const1 }, \ + }; \ + if (a->esz == 0) { \ + return false; \ + } \ + if (sve_access_check(s)) { \ + do_fp_imm(s, a, val[a->esz - 1][a->imm], fns[a->esz - 1]); \ + } \ + return true; \ +} + +#define float16_two make_float16(0x4000) +#define float32_two make_float32(0x40000000) +#define float64_two make_float64(0x4000000000000000ULL) + +DO_FP_IMM(FADD, fadds, half, one) +DO_FP_IMM(FSUB, fsubs, half, one) +DO_FP_IMM(FMUL, fmuls, half, two) +DO_FP_IMM(FSUBR, fsubrs, half, one) +DO_FP_IMM(FMAXNM, fmaxnms, zero, one) +DO_FP_IMM(FMINNM, fminnms, zero, one) +DO_FP_IMM(FMAX, fmaxs, zero, one) +DO_FP_IMM(FMIN, fmins, zero, one) + +#undef DO_FP_IMM + +static bool do_fp_cmp(DisasContext *s, arg_rprr_esz *a, + gen_helper_gvec_4_ptr *fn) +{ + if (fn == NULL) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16); + tcg_gen_gvec_4_ptr(pred_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + pred_full_reg_offset(s, a->pg), + status, vsz, vsz, 0, fn); + tcg_temp_free_ptr(status); + } + return true; +} + +#define DO_FPCMP(NAME, name) \ +static bool trans_##NAME##_ppzz(DisasContext *s, arg_rprr_esz *a, \ + uint32_t insn) \ +{ \ + static gen_helper_gvec_4_ptr * const fns[4] = { \ + NULL, gen_helper_sve_##name##_h, \ + gen_helper_sve_##name##_s, gen_helper_sve_##name##_d \ + }; \ + return do_fp_cmp(s, a, fns[a->esz]); \ +} + +DO_FPCMP(FCMGE, fcmge) +DO_FPCMP(FCMGT, fcmgt) +DO_FPCMP(FCMEQ, fcmeq) +DO_FPCMP(FCMNE, fcmne) +DO_FPCMP(FCMUO, fcmuo) +DO_FPCMP(FACGE, facge) +DO_FPCMP(FACGT, facgt) + +#undef DO_FPCMP + +static bool trans_FCADD(DisasContext *s, arg_FCADD *a, uint32_t insn) +{ + static gen_helper_gvec_4_ptr * const fns[3] = { + gen_helper_sve_fcadd_h, + gen_helper_sve_fcadd_s, + gen_helper_sve_fcadd_d + }; + + if (a->esz == 0) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16); + tcg_gen_gvec_4_ptr(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + pred_full_reg_offset(s, a->pg), + status, vsz, vsz, a->rot, fns[a->esz - 1]); + tcg_temp_free_ptr(status); + } + return true; +} + +typedef void gen_helper_sve_fmla(TCGv_env, TCGv_ptr, TCGv_i32); + +static bool do_fmla(DisasContext *s, arg_rprrr_esz *a, gen_helper_sve_fmla *fn) +{ + if (fn == NULL) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + unsigned vsz = vec_full_reg_size(s); + unsigned desc; + TCGv_i32 t_desc; + TCGv_ptr pg = tcg_temp_new_ptr(); + + /* We would need 7 operands to pass these arguments "properly". + * So we encode all the register numbers into the descriptor. + */ + desc = deposit32(a->rd, 5, 5, a->rn); + desc = deposit32(desc, 10, 5, a->rm); + desc = deposit32(desc, 15, 5, a->ra); + desc = simd_desc(vsz, vsz, desc); + + t_desc = tcg_const_i32(desc); + tcg_gen_addi_ptr(pg, cpu_env, pred_full_reg_offset(s, a->pg)); + fn(cpu_env, pg, t_desc); + tcg_temp_free_i32(t_desc); + tcg_temp_free_ptr(pg); + return true; +} + +#define DO_FMLA(NAME, name) \ +static bool trans_##NAME(DisasContext *s, arg_rprrr_esz *a, uint32_t insn) \ +{ \ + static gen_helper_sve_fmla * const fns[4] = { \ + NULL, gen_helper_sve_##name##_h, \ + gen_helper_sve_##name##_s, gen_helper_sve_##name##_d \ + }; \ + return do_fmla(s, a, fns[a->esz]); \ +} + +DO_FMLA(FMLA_zpzzz, fmla_zpzzz) +DO_FMLA(FMLS_zpzzz, fmls_zpzzz) +DO_FMLA(FNMLA_zpzzz, fnmla_zpzzz) +DO_FMLA(FNMLS_zpzzz, fnmls_zpzzz) + +#undef DO_FMLA + +static bool trans_FCMLA_zpzzz(DisasContext *s, + arg_FCMLA_zpzzz *a, uint32_t insn) +{ + static gen_helper_sve_fmla * const fns[3] = { + gen_helper_sve_fcmla_zpzzz_h, + gen_helper_sve_fcmla_zpzzz_s, + gen_helper_sve_fcmla_zpzzz_d, + }; + + if (a->esz == 0) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + unsigned desc; + TCGv_i32 t_desc; + TCGv_ptr pg = tcg_temp_new_ptr(); + + /* We would need 7 operands to pass these arguments "properly". + * So we encode all the register numbers into the descriptor. + */ + desc = deposit32(a->rd, 5, 5, a->rn); + desc = deposit32(desc, 10, 5, a->rm); + desc = deposit32(desc, 15, 5, a->ra); + desc = deposit32(desc, 20, 2, a->rot); + desc = sextract32(desc, 0, 22); + desc = simd_desc(vsz, vsz, desc); + + t_desc = tcg_const_i32(desc); + tcg_gen_addi_ptr(pg, cpu_env, pred_full_reg_offset(s, a->pg)); + fns[a->esz - 1](cpu_env, pg, t_desc); + tcg_temp_free_i32(t_desc); + tcg_temp_free_ptr(pg); + } + return true; +} + +static bool trans_FCMLA_zzxz(DisasContext *s, arg_FCMLA_zzxz *a, uint32_t insn) +{ + static gen_helper_gvec_3_ptr * const fns[2] = { + gen_helper_gvec_fcmlah_idx, + gen_helper_gvec_fcmlas_idx, + }; + + tcg_debug_assert(a->esz == 1 || a->esz == 2); + tcg_debug_assert(a->rd == a->ra); + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16); + tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + status, vsz, vsz, + a->index * 4 + a->rot, + fns[a->esz - 1]); + tcg_temp_free_ptr(status); + } + return true; +} + +/* + *** SVE Floating Point Unary Operations Predicated Group + */ + +static bool do_zpz_ptr(DisasContext *s, int rd, int rn, int pg, + bool is_fp16, gen_helper_gvec_3_ptr *fn) +{ + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr status = get_fpstatus_ptr(is_fp16); + tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + pred_full_reg_offset(s, pg), + status, vsz, vsz, 0, fn); + tcg_temp_free_ptr(status); + } + return true; +} + +static bool trans_FCVT_sh(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_fcvt_sh); +} + +static bool trans_FCVT_hs(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvt_hs); +} + +static bool trans_FCVT_dh(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_fcvt_dh); +} + +static bool trans_FCVT_hd(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvt_hd); +} + +static bool trans_FCVT_ds(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvt_ds); +} + +static bool trans_FCVT_sd(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvt_sd); +} + +static bool trans_FCVTZS_hh(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_fcvtzs_hh); +} + +static bool trans_FCVTZU_hh(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_fcvtzu_hh); +} + +static bool trans_FCVTZS_hs(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_fcvtzs_hs); +} + +static bool trans_FCVTZU_hs(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_fcvtzu_hs); +} + +static bool trans_FCVTZS_hd(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_fcvtzs_hd); +} + +static bool trans_FCVTZU_hd(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_fcvtzu_hd); +} + +static bool trans_FCVTZS_ss(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvtzs_ss); +} + +static bool trans_FCVTZU_ss(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvtzu_ss); +} + +static bool trans_FCVTZS_sd(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvtzs_sd); +} + +static bool trans_FCVTZU_sd(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvtzu_sd); +} + +static bool trans_FCVTZS_ds(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvtzs_ds); +} + +static bool trans_FCVTZU_ds(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvtzu_ds); +} + +static bool trans_FCVTZS_dd(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvtzs_dd); +} + +static bool trans_FCVTZU_dd(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvtzu_dd); +} + +static gen_helper_gvec_3_ptr * const frint_fns[3] = { + gen_helper_sve_frint_h, + gen_helper_sve_frint_s, + gen_helper_sve_frint_d +}; + +static bool trans_FRINTI(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + if (a->esz == 0) { + return false; + } + return do_zpz_ptr(s, a->rd, a->rn, a->pg, a->esz == MO_16, + frint_fns[a->esz - 1]); +} + +static bool trans_FRINTX(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + static gen_helper_gvec_3_ptr * const fns[3] = { + gen_helper_sve_frintx_h, + gen_helper_sve_frintx_s, + gen_helper_sve_frintx_d + }; + if (a->esz == 0) { + return false; + } + return do_zpz_ptr(s, a->rd, a->rn, a->pg, a->esz == MO_16, fns[a->esz - 1]); +} + +static bool do_frint_mode(DisasContext *s, arg_rpr_esz *a, int mode) +{ + if (a->esz == 0) { + return false; + } + if (sve_access_check(s)) { + unsigned vsz = vec_full_reg_size(s); + TCGv_i32 tmode = tcg_const_i32(mode); + TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16); + + gen_helper_set_rmode(tmode, tmode, status); + + tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + pred_full_reg_offset(s, a->pg), + status, vsz, vsz, 0, frint_fns[a->esz - 1]); + + gen_helper_set_rmode(tmode, tmode, status); + tcg_temp_free_i32(tmode); + tcg_temp_free_ptr(status); + } + return true; +} + +static bool trans_FRINTN(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_frint_mode(s, a, float_round_nearest_even); +} + +static bool trans_FRINTP(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_frint_mode(s, a, float_round_up); +} + +static bool trans_FRINTM(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_frint_mode(s, a, float_round_down); +} + +static bool trans_FRINTZ(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_frint_mode(s, a, float_round_to_zero); +} + +static bool trans_FRINTA(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_frint_mode(s, a, float_round_ties_away); +} + +static bool trans_FRECPX(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + static gen_helper_gvec_3_ptr * const fns[3] = { + gen_helper_sve_frecpx_h, + gen_helper_sve_frecpx_s, + gen_helper_sve_frecpx_d + }; + if (a->esz == 0) { + return false; + } + return do_zpz_ptr(s, a->rd, a->rn, a->pg, a->esz == MO_16, fns[a->esz - 1]); +} + +static bool trans_FSQRT(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + static gen_helper_gvec_3_ptr * const fns[3] = { + gen_helper_sve_fsqrt_h, + gen_helper_sve_fsqrt_s, + gen_helper_sve_fsqrt_d + }; + if (a->esz == 0) { + return false; + } + return do_zpz_ptr(s, a->rd, a->rn, a->pg, a->esz == MO_16, fns[a->esz - 1]); +} + +static bool trans_SCVTF_hh(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_scvt_hh); +} + +static bool trans_SCVTF_sh(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_scvt_sh); +} + +static bool trans_SCVTF_dh(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_scvt_dh); +} + +static bool trans_SCVTF_ss(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_scvt_ss); +} + +static bool trans_SCVTF_ds(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_scvt_ds); +} + +static bool trans_SCVTF_sd(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_scvt_sd); +} + +static bool trans_SCVTF_dd(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_scvt_dd); +} + +static bool trans_UCVTF_hh(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_ucvt_hh); +} + +static bool trans_UCVTF_sh(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_ucvt_sh); +} + +static bool trans_UCVTF_dh(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, true, gen_helper_sve_ucvt_dh); +} + +static bool trans_UCVTF_ss(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_ucvt_ss); +} + +static bool trans_UCVTF_ds(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_ucvt_ds); +} + +static bool trans_UCVTF_sd(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_ucvt_sd); +} + +static bool trans_UCVTF_dd(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_ucvt_dd); +} + +/* + *** SVE Memory - 32-bit Gather and Unsized Contiguous Group + */ + +/* Subroutine loading a vector register at VOFS of LEN bytes. + * The load should begin at the address Rn + IMM. + */ + +static void do_ldr(DisasContext *s, uint32_t vofs, uint32_t len, + int rn, int imm) +{ + uint32_t len_align = QEMU_ALIGN_DOWN(len, 8); + uint32_t len_remain = len % 8; + uint32_t nparts = len / 8 + ctpop8(len_remain); + int midx = get_mem_index(s); + TCGv_i64 addr, t0, t1; + + addr = tcg_temp_new_i64(); + t0 = tcg_temp_new_i64(); + + /* Note that unpredicated load/store of vector/predicate registers + * are defined as a stream of bytes, which equates to little-endian + * operations on larger quantities. There is no nice way to force + * a little-endian load for aarch64_be-linux-user out of line. + * + * Attempt to keep code expansion to a minimum by limiting the + * amount of unrolling done. + */ + if (nparts <= 4) { + int i; + + for (i = 0; i < len_align; i += 8) { + tcg_gen_addi_i64(addr, cpu_reg_sp(s, rn), imm + i); + tcg_gen_qemu_ld_i64(t0, addr, midx, MO_LEQ); + tcg_gen_st_i64(t0, cpu_env, vofs + i); + } + } else { + TCGLabel *loop = gen_new_label(); + TCGv_ptr tp, i = tcg_const_local_ptr(0); + + gen_set_label(loop); + + /* Minimize the number of local temps that must be re-read from + * the stack each iteration. Instead, re-compute values other + * than the loop counter. + */ + tp = tcg_temp_new_ptr(); + tcg_gen_addi_ptr(tp, i, imm); + tcg_gen_extu_ptr_i64(addr, tp); + tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, rn)); + + tcg_gen_qemu_ld_i64(t0, addr, midx, MO_LEQ); + + tcg_gen_add_ptr(tp, cpu_env, i); + tcg_gen_addi_ptr(i, i, 8); + tcg_gen_st_i64(t0, tp, vofs); + tcg_temp_free_ptr(tp); + + tcg_gen_brcondi_ptr(TCG_COND_LTU, i, len_align, loop); + tcg_temp_free_ptr(i); + } + + /* Predicate register loads can be any multiple of 2. + * Note that we still store the entire 64-bit unit into cpu_env. + */ + if (len_remain) { + tcg_gen_addi_i64(addr, cpu_reg_sp(s, rn), imm + len_align); + + switch (len_remain) { + case 2: + case 4: + case 8: + tcg_gen_qemu_ld_i64(t0, addr, midx, MO_LE | ctz32(len_remain)); + break; + + case 6: + t1 = tcg_temp_new_i64(); + tcg_gen_qemu_ld_i64(t0, addr, midx, MO_LEUL); + tcg_gen_addi_i64(addr, addr, 4); + tcg_gen_qemu_ld_i64(t1, addr, midx, MO_LEUW); + tcg_gen_deposit_i64(t0, t0, t1, 32, 32); + tcg_temp_free_i64(t1); + break; + + default: + g_assert_not_reached(); + } + tcg_gen_st_i64(t0, cpu_env, vofs + len_align); + } + tcg_temp_free_i64(addr); + tcg_temp_free_i64(t0); +} + +/* Similarly for stores. */ +static void do_str(DisasContext *s, uint32_t vofs, uint32_t len, + int rn, int imm) +{ + uint32_t len_align = QEMU_ALIGN_DOWN(len, 8); + uint32_t len_remain = len % 8; + uint32_t nparts = len / 8 + ctpop8(len_remain); + int midx = get_mem_index(s); + TCGv_i64 addr, t0; + + addr = tcg_temp_new_i64(); + t0 = tcg_temp_new_i64(); + + /* Note that unpredicated load/store of vector/predicate registers + * are defined as a stream of bytes, which equates to little-endian + * operations on larger quantities. There is no nice way to force + * a little-endian store for aarch64_be-linux-user out of line. + * + * Attempt to keep code expansion to a minimum by limiting the + * amount of unrolling done. + */ + if (nparts <= 4) { + int i; + + for (i = 0; i < len_align; i += 8) { + tcg_gen_ld_i64(t0, cpu_env, vofs + i); + tcg_gen_addi_i64(addr, cpu_reg_sp(s, rn), imm + i); + tcg_gen_qemu_st_i64(t0, addr, midx, MO_LEQ); + } + } else { + TCGLabel *loop = gen_new_label(); + TCGv_ptr t2, i = tcg_const_local_ptr(0); + + gen_set_label(loop); + + t2 = tcg_temp_new_ptr(); + tcg_gen_add_ptr(t2, cpu_env, i); + tcg_gen_ld_i64(t0, t2, vofs); + + /* Minimize the number of local temps that must be re-read from + * the stack each iteration. Instead, re-compute values other + * than the loop counter. + */ + tcg_gen_addi_ptr(t2, i, imm); + tcg_gen_extu_ptr_i64(addr, t2); + tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, rn)); + tcg_temp_free_ptr(t2); + + tcg_gen_qemu_st_i64(t0, addr, midx, MO_LEQ); + + tcg_gen_addi_ptr(i, i, 8); + + tcg_gen_brcondi_ptr(TCG_COND_LTU, i, len_align, loop); + tcg_temp_free_ptr(i); + } + + /* Predicate register stores can be any multiple of 2. */ + if (len_remain) { + tcg_gen_ld_i64(t0, cpu_env, vofs + len_align); + tcg_gen_addi_i64(addr, cpu_reg_sp(s, rn), imm + len_align); + + switch (len_remain) { + case 2: + case 4: + case 8: + tcg_gen_qemu_st_i64(t0, addr, midx, MO_LE | ctz32(len_remain)); + break; + + case 6: + tcg_gen_qemu_st_i64(t0, addr, midx, MO_LEUL); + tcg_gen_addi_i64(addr, addr, 4); + tcg_gen_shri_i64(t0, t0, 32); + tcg_gen_qemu_st_i64(t0, addr, midx, MO_LEUW); + break; + + default: + g_assert_not_reached(); + } + } + tcg_temp_free_i64(addr); + tcg_temp_free_i64(t0); +} + +static bool trans_LDR_zri(DisasContext *s, arg_rri *a, uint32_t insn) +{ + if (sve_access_check(s)) { + int size = vec_full_reg_size(s); + int off = vec_full_reg_offset(s, a->rd); + do_ldr(s, off, size, a->rn, a->imm * size); + } + return true; +} + +static bool trans_LDR_pri(DisasContext *s, arg_rri *a, uint32_t insn) +{ + if (sve_access_check(s)) { + int size = pred_full_reg_size(s); + int off = pred_full_reg_offset(s, a->rd); + do_ldr(s, off, size, a->rn, a->imm * size); + } + return true; +} + +static bool trans_STR_zri(DisasContext *s, arg_rri *a, uint32_t insn) +{ + if (sve_access_check(s)) { + int size = vec_full_reg_size(s); + int off = vec_full_reg_offset(s, a->rd); + do_str(s, off, size, a->rn, a->imm * size); + } + return true; +} + +static bool trans_STR_pri(DisasContext *s, arg_rri *a, uint32_t insn) +{ + if (sve_access_check(s)) { + int size = pred_full_reg_size(s); + int off = pred_full_reg_offset(s, a->rd); + do_str(s, off, size, a->rn, a->imm * size); + } + return true; +} + +/* + *** SVE Memory - Contiguous Load Group + */ + +/* The memory mode of the dtype. */ +static const TCGMemOp dtype_mop[16] = { + MO_UB, MO_UB, MO_UB, MO_UB, + MO_SL, MO_UW, MO_UW, MO_UW, + MO_SW, MO_SW, MO_UL, MO_UL, + MO_SB, MO_SB, MO_SB, MO_Q +}; + +#define dtype_msz(x) (dtype_mop[x] & MO_SIZE) + +/* The vector element size of dtype. */ +static const uint8_t dtype_esz[16] = { + 0, 1, 2, 3, + 3, 1, 2, 3, + 3, 2, 2, 3, + 3, 2, 1, 3 +}; + +static void do_mem_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr, + gen_helper_gvec_mem *fn) +{ + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr t_pg; + TCGv_i32 desc; + + /* For e.g. LD4, there are not enough arguments to pass all 4 + * registers as pointers, so encode the regno into the data field. + * For consistency, do this even for LD1. + */ + desc = tcg_const_i32(simd_desc(vsz, vsz, zt)); + t_pg = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg)); + fn(cpu_env, t_pg, addr, desc); + + tcg_temp_free_ptr(t_pg); + tcg_temp_free_i32(desc); +} + +static void do_ld_zpa(DisasContext *s, int zt, int pg, + TCGv_i64 addr, int dtype, int nreg) +{ + static gen_helper_gvec_mem * const fns[16][4] = { + { gen_helper_sve_ld1bb_r, gen_helper_sve_ld2bb_r, + gen_helper_sve_ld3bb_r, gen_helper_sve_ld4bb_r }, + { gen_helper_sve_ld1bhu_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1bsu_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1bdu_r, NULL, NULL, NULL }, + + { gen_helper_sve_ld1sds_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1hh_r, gen_helper_sve_ld2hh_r, + gen_helper_sve_ld3hh_r, gen_helper_sve_ld4hh_r }, + { gen_helper_sve_ld1hsu_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1hdu_r, NULL, NULL, NULL }, + + { gen_helper_sve_ld1hds_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1hss_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1ss_r, gen_helper_sve_ld2ss_r, + gen_helper_sve_ld3ss_r, gen_helper_sve_ld4ss_r }, + { gen_helper_sve_ld1sdu_r, NULL, NULL, NULL }, + + { gen_helper_sve_ld1bds_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1bss_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1bhs_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1dd_r, gen_helper_sve_ld2dd_r, + gen_helper_sve_ld3dd_r, gen_helper_sve_ld4dd_r }, + }; + gen_helper_gvec_mem *fn = fns[dtype][nreg]; + + /* While there are holes in the table, they are not + * accessible via the instruction encoding. + */ + assert(fn != NULL); + do_mem_zpa(s, zt, pg, addr, fn); +} + +static bool trans_LD_zprr(DisasContext *s, arg_rprr_load *a, uint32_t insn) +{ + if (a->rm == 31) { + return false; + } + if (sve_access_check(s)) { + TCGv_i64 addr = new_tmp_a64(s); + tcg_gen_muli_i64(addr, cpu_reg(s, a->rm), + (a->nreg + 1) << dtype_msz(a->dtype)); + tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn)); + do_ld_zpa(s, a->rd, a->pg, addr, a->dtype, a->nreg); + } + return true; +} + +static bool trans_LD_zpri(DisasContext *s, arg_rpri_load *a, uint32_t insn) +{ + if (sve_access_check(s)) { + int vsz = vec_full_reg_size(s); + int elements = vsz >> dtype_esz[a->dtype]; + TCGv_i64 addr = new_tmp_a64(s); + + tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), + (a->imm * elements * (a->nreg + 1)) + << dtype_msz(a->dtype)); + do_ld_zpa(s, a->rd, a->pg, addr, a->dtype, a->nreg); + } + return true; +} + +static bool trans_LDFF1_zprr(DisasContext *s, arg_rprr_load *a, uint32_t insn) +{ + static gen_helper_gvec_mem * const fns[16] = { + gen_helper_sve_ldff1bb_r, + gen_helper_sve_ldff1bhu_r, + gen_helper_sve_ldff1bsu_r, + gen_helper_sve_ldff1bdu_r, + + gen_helper_sve_ldff1sds_r, + gen_helper_sve_ldff1hh_r, + gen_helper_sve_ldff1hsu_r, + gen_helper_sve_ldff1hdu_r, + + gen_helper_sve_ldff1hds_r, + gen_helper_sve_ldff1hss_r, + gen_helper_sve_ldff1ss_r, + gen_helper_sve_ldff1sdu_r, + + gen_helper_sve_ldff1bds_r, + gen_helper_sve_ldff1bss_r, + gen_helper_sve_ldff1bhs_r, + gen_helper_sve_ldff1dd_r, + }; + + if (sve_access_check(s)) { + TCGv_i64 addr = new_tmp_a64(s); + tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), dtype_msz(a->dtype)); + tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn)); + do_mem_zpa(s, a->rd, a->pg, addr, fns[a->dtype]); + } + return true; +} + +static bool trans_LDNF1_zpri(DisasContext *s, arg_rpri_load *a, uint32_t insn) +{ + static gen_helper_gvec_mem * const fns[16] = { + gen_helper_sve_ldnf1bb_r, + gen_helper_sve_ldnf1bhu_r, + gen_helper_sve_ldnf1bsu_r, + gen_helper_sve_ldnf1bdu_r, + + gen_helper_sve_ldnf1sds_r, + gen_helper_sve_ldnf1hh_r, + gen_helper_sve_ldnf1hsu_r, + gen_helper_sve_ldnf1hdu_r, + + gen_helper_sve_ldnf1hds_r, + gen_helper_sve_ldnf1hss_r, + gen_helper_sve_ldnf1ss_r, + gen_helper_sve_ldnf1sdu_r, + + gen_helper_sve_ldnf1bds_r, + gen_helper_sve_ldnf1bss_r, + gen_helper_sve_ldnf1bhs_r, + gen_helper_sve_ldnf1dd_r, + }; + + if (sve_access_check(s)) { + int vsz = vec_full_reg_size(s); + int elements = vsz >> dtype_esz[a->dtype]; + int off = (a->imm * elements) << dtype_msz(a->dtype); + TCGv_i64 addr = new_tmp_a64(s); + + tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), off); + do_mem_zpa(s, a->rd, a->pg, addr, fns[a->dtype]); + } + return true; +} + +static void do_ldrq(DisasContext *s, int zt, int pg, TCGv_i64 addr, int msz) +{ + static gen_helper_gvec_mem * const fns[4] = { + gen_helper_sve_ld1bb_r, gen_helper_sve_ld1hh_r, + gen_helper_sve_ld1ss_r, gen_helper_sve_ld1dd_r, + }; + unsigned vsz = vec_full_reg_size(s); + TCGv_ptr t_pg; + TCGv_i32 desc; + + /* Load the first quadword using the normal predicated load helpers. */ + desc = tcg_const_i32(simd_desc(16, 16, zt)); + t_pg = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg)); + fns[msz](cpu_env, t_pg, addr, desc); + + tcg_temp_free_ptr(t_pg); + tcg_temp_free_i32(desc); + + /* Replicate that first quadword. */ + if (vsz > 16) { + unsigned dofs = vec_full_reg_offset(s, zt); + tcg_gen_gvec_dup_mem(4, dofs + 16, dofs, vsz - 16, vsz - 16); + } +} + +static bool trans_LD1RQ_zprr(DisasContext *s, arg_rprr_load *a, uint32_t insn) +{ + if (a->rm == 31) { + return false; + } + if (sve_access_check(s)) { + int msz = dtype_msz(a->dtype); + TCGv_i64 addr = new_tmp_a64(s); + tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), msz); + tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn)); + do_ldrq(s, a->rd, a->pg, addr, msz); + } + return true; +} + +static bool trans_LD1RQ_zpri(DisasContext *s, arg_rpri_load *a, uint32_t insn) +{ + if (sve_access_check(s)) { + TCGv_i64 addr = new_tmp_a64(s); + tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), a->imm * 16); + do_ldrq(s, a->rd, a->pg, addr, dtype_msz(a->dtype)); + } + return true; +} + +/* Load and broadcast element. */ +static bool trans_LD1R_zpri(DisasContext *s, arg_rpri_load *a, uint32_t insn) +{ + if (!sve_access_check(s)) { + return true; + } + + unsigned vsz = vec_full_reg_size(s); + unsigned psz = pred_full_reg_size(s); + unsigned esz = dtype_esz[a->dtype]; + TCGLabel *over = gen_new_label(); + TCGv_i64 temp; + + /* If the guarding predicate has no bits set, no load occurs. */ + if (psz <= 8) { + /* Reduce the pred_esz_masks value simply to reduce the + * size of the code generated here. + */ + uint64_t psz_mask = MAKE_64BIT_MASK(0, psz * 8); + temp = tcg_temp_new_i64(); + tcg_gen_ld_i64(temp, cpu_env, pred_full_reg_offset(s, a->pg)); + tcg_gen_andi_i64(temp, temp, pred_esz_masks[esz] & psz_mask); + tcg_gen_brcondi_i64(TCG_COND_EQ, temp, 0, over); + tcg_temp_free_i64(temp); + } else { + TCGv_i32 t32 = tcg_temp_new_i32(); + find_last_active(s, t32, esz, a->pg); + tcg_gen_brcondi_i32(TCG_COND_LT, t32, 0, over); + tcg_temp_free_i32(t32); + } + + /* Load the data. */ + temp = tcg_temp_new_i64(); + tcg_gen_addi_i64(temp, cpu_reg_sp(s, a->rn), a->imm << esz); + tcg_gen_qemu_ld_i64(temp, temp, get_mem_index(s), + s->be_data | dtype_mop[a->dtype]); + + /* Broadcast to *all* elements. */ + tcg_gen_gvec_dup_i64(esz, vec_full_reg_offset(s, a->rd), + vsz, vsz, temp); + tcg_temp_free_i64(temp); + + /* Zero the inactive elements. */ + gen_set_label(over); + do_movz_zpz(s, a->rd, a->rd, a->pg, esz); + return true; +} + +static void do_st_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr, + int msz, int esz, int nreg) +{ + static gen_helper_gvec_mem * const fn_single[4][4] = { + { gen_helper_sve_st1bb_r, gen_helper_sve_st1bh_r, + gen_helper_sve_st1bs_r, gen_helper_sve_st1bd_r }, + { NULL, gen_helper_sve_st1hh_r, + gen_helper_sve_st1hs_r, gen_helper_sve_st1hd_r }, + { NULL, NULL, + gen_helper_sve_st1ss_r, gen_helper_sve_st1sd_r }, + { NULL, NULL, NULL, gen_helper_sve_st1dd_r }, + }; + static gen_helper_gvec_mem * const fn_multiple[3][4] = { + { gen_helper_sve_st2bb_r, gen_helper_sve_st2hh_r, + gen_helper_sve_st2ss_r, gen_helper_sve_st2dd_r }, + { gen_helper_sve_st3bb_r, gen_helper_sve_st3hh_r, + gen_helper_sve_st3ss_r, gen_helper_sve_st3dd_r }, + { gen_helper_sve_st4bb_r, gen_helper_sve_st4hh_r, + gen_helper_sve_st4ss_r, gen_helper_sve_st4dd_r }, + }; + gen_helper_gvec_mem *fn; + + if (nreg == 0) { + /* ST1 */ + fn = fn_single[msz][esz]; + } else { + /* ST2, ST3, ST4 -- msz == esz, enforced by encoding */ + assert(msz == esz); + fn = fn_multiple[nreg - 1][msz]; + } + assert(fn != NULL); + do_mem_zpa(s, zt, pg, addr, fn); +} + +static bool trans_ST_zprr(DisasContext *s, arg_rprr_store *a, uint32_t insn) +{ + if (a->rm == 31 || a->msz > a->esz) { + return false; + } + if (sve_access_check(s)) { + TCGv_i64 addr = new_tmp_a64(s); + tcg_gen_muli_i64(addr, cpu_reg(s, a->rm), (a->nreg + 1) << a->msz); + tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn)); + do_st_zpa(s, a->rd, a->pg, addr, a->msz, a->esz, a->nreg); + } + return true; +} + +static bool trans_ST_zpri(DisasContext *s, arg_rpri_store *a, uint32_t insn) +{ + if (a->msz > a->esz) { + return false; + } + if (sve_access_check(s)) { + int vsz = vec_full_reg_size(s); + int elements = vsz >> a->esz; + TCGv_i64 addr = new_tmp_a64(s); + + tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), + (a->imm * elements * (a->nreg + 1)) << a->msz); + do_st_zpa(s, a->rd, a->pg, addr, a->msz, a->esz, a->nreg); + } + return true; +} + +/* + *** SVE gather loads / scatter stores + */ + +static void do_mem_zpz(DisasContext *s, int zt, int pg, int zm, int scale, + TCGv_i64 scalar, gen_helper_gvec_mem_scatter *fn) +{ + unsigned vsz = vec_full_reg_size(s); + TCGv_i32 desc = tcg_const_i32(simd_desc(vsz, vsz, scale)); + TCGv_ptr t_zm = tcg_temp_new_ptr(); + TCGv_ptr t_pg = tcg_temp_new_ptr(); + TCGv_ptr t_zt = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(t_pg, cpu_env, pred_full_reg_offset(s, pg)); + tcg_gen_addi_ptr(t_zm, cpu_env, vec_full_reg_offset(s, zm)); + tcg_gen_addi_ptr(t_zt, cpu_env, vec_full_reg_offset(s, zt)); + fn(cpu_env, t_zt, t_pg, t_zm, scalar, desc); + + tcg_temp_free_ptr(t_zt); + tcg_temp_free_ptr(t_zm); + tcg_temp_free_ptr(t_pg); + tcg_temp_free_i32(desc); +} + +/* Indexed by [ff][xs][u][msz]. */ +static gen_helper_gvec_mem_scatter * const gather_load_fn32[2][2][2][3] = { + { { { gen_helper_sve_ldbss_zsu, + gen_helper_sve_ldhss_zsu, + NULL, }, + { gen_helper_sve_ldbsu_zsu, + gen_helper_sve_ldhsu_zsu, + gen_helper_sve_ldssu_zsu, } }, + { { gen_helper_sve_ldbss_zss, + gen_helper_sve_ldhss_zss, + NULL, }, + { gen_helper_sve_ldbsu_zss, + gen_helper_sve_ldhsu_zss, + gen_helper_sve_ldssu_zss, } } }, + + { { { gen_helper_sve_ldffbss_zsu, + gen_helper_sve_ldffhss_zsu, + NULL, }, + { gen_helper_sve_ldffbsu_zsu, + gen_helper_sve_ldffhsu_zsu, + gen_helper_sve_ldffssu_zsu, } }, + { { gen_helper_sve_ldffbss_zss, + gen_helper_sve_ldffhss_zss, + NULL, }, + { gen_helper_sve_ldffbsu_zss, + gen_helper_sve_ldffhsu_zss, + gen_helper_sve_ldffssu_zss, } } } +}; + +/* Note that we overload xs=2 to indicate 64-bit offset. */ +static gen_helper_gvec_mem_scatter * const gather_load_fn64[2][3][2][4] = { + { { { gen_helper_sve_ldbds_zsu, + gen_helper_sve_ldhds_zsu, + gen_helper_sve_ldsds_zsu, + NULL, }, + { gen_helper_sve_ldbdu_zsu, + gen_helper_sve_ldhdu_zsu, + gen_helper_sve_ldsdu_zsu, + gen_helper_sve_ldddu_zsu, } }, + { { gen_helper_sve_ldbds_zss, + gen_helper_sve_ldhds_zss, + gen_helper_sve_ldsds_zss, + NULL, }, + { gen_helper_sve_ldbdu_zss, + gen_helper_sve_ldhdu_zss, + gen_helper_sve_ldsdu_zss, + gen_helper_sve_ldddu_zss, } }, + { { gen_helper_sve_ldbds_zd, + gen_helper_sve_ldhds_zd, + gen_helper_sve_ldsds_zd, + NULL, }, + { gen_helper_sve_ldbdu_zd, + gen_helper_sve_ldhdu_zd, + gen_helper_sve_ldsdu_zd, + gen_helper_sve_ldddu_zd, } } }, + + { { { gen_helper_sve_ldffbds_zsu, + gen_helper_sve_ldffhds_zsu, + gen_helper_sve_ldffsds_zsu, + NULL, }, + { gen_helper_sve_ldffbdu_zsu, + gen_helper_sve_ldffhdu_zsu, + gen_helper_sve_ldffsdu_zsu, + gen_helper_sve_ldffddu_zsu, } }, + { { gen_helper_sve_ldffbds_zss, + gen_helper_sve_ldffhds_zss, + gen_helper_sve_ldffsds_zss, + NULL, }, + { gen_helper_sve_ldffbdu_zss, + gen_helper_sve_ldffhdu_zss, + gen_helper_sve_ldffsdu_zss, + gen_helper_sve_ldffddu_zss, } }, + { { gen_helper_sve_ldffbds_zd, + gen_helper_sve_ldffhds_zd, + gen_helper_sve_ldffsds_zd, + NULL, }, + { gen_helper_sve_ldffbdu_zd, + gen_helper_sve_ldffhdu_zd, + gen_helper_sve_ldffsdu_zd, + gen_helper_sve_ldffddu_zd, } } } +}; + +static bool trans_LD1_zprz(DisasContext *s, arg_LD1_zprz *a, uint32_t insn) +{ + gen_helper_gvec_mem_scatter *fn = NULL; + + if (!sve_access_check(s)) { + return true; + } + + switch (a->esz) { + case MO_32: + fn = gather_load_fn32[a->ff][a->xs][a->u][a->msz]; + break; + case MO_64: + fn = gather_load_fn64[a->ff][a->xs][a->u][a->msz]; + break; + } + assert(fn != NULL); + + do_mem_zpz(s, a->rd, a->pg, a->rm, a->scale * a->msz, + cpu_reg_sp(s, a->rn), fn); + return true; +} + +static bool trans_LD1_zpiz(DisasContext *s, arg_LD1_zpiz *a, uint32_t insn) +{ + gen_helper_gvec_mem_scatter *fn = NULL; + TCGv_i64 imm; + + if (a->esz < a->msz || (a->esz == a->msz && !a->u)) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + switch (a->esz) { + case MO_32: + fn = gather_load_fn32[a->ff][0][a->u][a->msz]; + break; + case MO_64: + fn = gather_load_fn64[a->ff][2][a->u][a->msz]; + break; + } + assert(fn != NULL); + + /* Treat LD1_zpiz (zn[x] + imm) the same way as LD1_zprz (rn + zm[x]) + * by loading the immediate into the scalar parameter. + */ + imm = tcg_const_i64(a->imm << a->msz); + do_mem_zpz(s, a->rd, a->pg, a->rn, 0, imm, fn); + tcg_temp_free_i64(imm); + return true; +} + +/* Indexed by [xs][msz]. */ +static gen_helper_gvec_mem_scatter * const scatter_store_fn32[2][3] = { + { gen_helper_sve_stbs_zsu, + gen_helper_sve_sths_zsu, + gen_helper_sve_stss_zsu, }, + { gen_helper_sve_stbs_zss, + gen_helper_sve_sths_zss, + gen_helper_sve_stss_zss, }, +}; + +/* Note that we overload xs=2 to indicate 64-bit offset. */ +static gen_helper_gvec_mem_scatter * const scatter_store_fn64[3][4] = { + { gen_helper_sve_stbd_zsu, + gen_helper_sve_sthd_zsu, + gen_helper_sve_stsd_zsu, + gen_helper_sve_stdd_zsu, }, + { gen_helper_sve_stbd_zss, + gen_helper_sve_sthd_zss, + gen_helper_sve_stsd_zss, + gen_helper_sve_stdd_zss, }, + { gen_helper_sve_stbd_zd, + gen_helper_sve_sthd_zd, + gen_helper_sve_stsd_zd, + gen_helper_sve_stdd_zd, }, +}; + +static bool trans_ST1_zprz(DisasContext *s, arg_ST1_zprz *a, uint32_t insn) +{ + gen_helper_gvec_mem_scatter *fn; + + if (a->esz < a->msz || (a->msz == 0 && a->scale)) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + switch (a->esz) { + case MO_32: + fn = scatter_store_fn32[a->xs][a->msz]; + break; + case MO_64: + fn = scatter_store_fn64[a->xs][a->msz]; + break; + default: + g_assert_not_reached(); + } + do_mem_zpz(s, a->rd, a->pg, a->rm, a->scale * a->msz, + cpu_reg_sp(s, a->rn), fn); + return true; +} + +static bool trans_ST1_zpiz(DisasContext *s, arg_ST1_zpiz *a, uint32_t insn) +{ + gen_helper_gvec_mem_scatter *fn = NULL; + TCGv_i64 imm; + + if (a->esz < a->msz) { + return false; + } + if (!sve_access_check(s)) { + return true; + } + + switch (a->esz) { + case MO_32: + fn = scatter_store_fn32[0][a->msz]; + break; + case MO_64: + fn = scatter_store_fn64[2][a->msz]; + break; + } + assert(fn != NULL); + + /* Treat ST1_zpiz (zn[x] + imm) the same way as ST1_zprz (rn + zm[x]) + * by loading the immediate into the scalar parameter. + */ + imm = tcg_const_i64(a->imm << a->msz); + do_mem_zpz(s, a->rd, a->pg, a->rn, 0, imm, fn); + tcg_temp_free_i64(imm); + return true; +} + +/* + * Prefetches + */ + +static bool trans_PRF(DisasContext *s, arg_PRF *a, uint32_t insn) +{ + /* Prefetch is a nop within QEMU. */ + (void)sve_access_check(s); + return true; +} + +static bool trans_PRF_rr(DisasContext *s, arg_PRF_rr *a, uint32_t insn) +{ + if (a->rm == 31) { + return false; + } + /* Prefetch is a nop within QEMU. */ + (void)sve_access_check(s); + return true; +} + +/* + * Move Prefix + * + * TODO: The implementation so far could handle predicated merging movprfx. + * The helper functions as written take an extra source register to + * use in the operation, but the result is only written when predication + * succeeds. For unpredicated movprfx, we need to rearrange the helpers + * to allow the final write back to the destination to be unconditional. + * For predicated zeroing movprfx, we need to rearrange the helpers to + * allow the final write back to zero inactives. + * + * In the meantime, just emit the moves. + */ + +static bool trans_MOVPRFX(DisasContext *s, arg_MOVPRFX *a, uint32_t insn) +{ + return do_mov_z(s, a->rd, a->rn); +} + +static bool trans_MOVPRFX_m(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + if (sve_access_check(s)) { + do_sel_z(s, a->rd, a->rn, a->rd, a->pg, a->esz); + } + return true; +} + +static bool trans_MOVPRFX_z(DisasContext *s, arg_rpr_esz *a, uint32_t insn) +{ + if (sve_access_check(s)) { + do_movz_zpz(s, a->rd, a->rn, a->pg, a->esz); } return true; }