target/riscv/vector_helper.c

   1 /*
   2  * RISC-V Vector Extension Helpers for QEMU.
   3  *
   4  * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
   5  *
   6  * This program is free software; you can redistribute it and/or modify it
   7  * under the terms and conditions of the GNU General Public License,
   8  * version 2 or later, as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  13  * more details.
  14  *
  15  * You should have received a copy of the GNU General Public License along with
  16  * this program.  If not, see <http://www.gnu.org/licenses/>.
  17  */
  18
  19 #include "qemu/osdep.h"
  20 #include "qemu/host-utils.h"
  21 #include "qemu/bitops.h"
  22 #include "cpu.h"
  23 #include "exec/memop.h"
  24 #include "exec/exec-all.h"
  25 #include "exec/helper-proto.h"
  26 #include "fpu/softfloat.h"
  27 #include "tcg/tcg-gvec-desc.h"
  28 #include "internals.h"
  29 #include <math.h>
  30
  31 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
  32                             target_ulong s2)
  33 {
  34     int vlmax, vl;
  35     RISCVCPU *cpu = env_archcpu(env);
  36     uint64_t lmul = FIELD_EX64(s2, VTYPE, VLMUL);
  37     uint16_t sew = 8 << FIELD_EX64(s2, VTYPE, VSEW);
  38     uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
  39     int xlen = riscv_cpu_xlen(env);
  40     bool vill = (s2 >> (xlen - 1)) & 0x1;
  41     target_ulong reserved = s2 &
  42                             MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
  43                                             xlen - 1 - R_VTYPE_RESERVED_SHIFT);
  44
  45     if (lmul & 4) {
  46         /* Fractional LMUL. */
  47         if (lmul == 4 ||
  48             cpu->cfg.elen >> (8 - lmul) < sew) {
  49             vill = true;
  50         }
  51     }
  52
  53     if ((sew > cpu->cfg.elen)
  54         || vill
  55         || (ediv != 0)
  56         || (reserved != 0)) {
  57         /* only set vill bit. */
  58         env->vill = 1;
  59         env->vtype = 0;
  60         env->vl = 0;
  61         env->vstart = 0;
  62         return 0;
  63     }
  64
  65     vlmax = vext_get_vlmax(cpu, s2);
  66     if (s1 <= vlmax) {
  67         vl = s1;
  68     } else {
  69         vl = vlmax;
  70     }
  71     env->vl = vl;
  72     env->vtype = s2;
  73     env->vstart = 0;
  74     env->vill = 0;
  75     return vl;
  76 }
  77
/*
 * Note that vector data is stored in host-endian 64-bit chunks,
 * so addressing units smaller than that needs a host-endian fixup.
 *
 * Each H* macro takes an element index and yields the index to use when
 * indexing the host array.  On big-endian hosts the index is XORed with a
 * per-macro constant; on little-endian hosts it is returned unchanged.
 * H8 is the identity in both configurations.
 */
#if HOST_BIG_ENDIAN
#define H1(x)   ((x) ^ 7)
#define H1_2(x) ((x) ^ 6)
#define H1_4(x) ((x) ^ 4)
#define H2(x)   ((x) ^ 3)
#define H4(x)   ((x) ^ 1)
#define H8(x)   ((x))
#else
#define H1(x)   (x)
#define H1_2(x) (x)
#define H1_4(x) (x)
#define H2(x)   (x)
#define H4(x)   (x)
#define H8(x)   (x)
#endif
  97
  98 static inline uint32_t vext_nf(uint32_t desc)
  99 {
 100     return FIELD_EX32(simd_data(desc), VDATA, NF);
 101 }
 102
 103 static inline uint32_t vext_vm(uint32_t desc)
 104 {
 105     return FIELD_EX32(simd_data(desc), VDATA, VM);
 106 }
 107
 108 /*
 109  * Encode LMUL to lmul as following:
 110  *     LMUL    vlmul    lmul
 111  *      1       000       0
 112  *      2       001       1
 113  *      4       010       2
 114  *      8       011       3
 115  *      -       100       -
 116  *     1/8      101      -3
 117  *     1/4      110      -2
 118  *     1/2      111      -1
 119  */
 120 static inline int32_t vext_lmul(uint32_t desc)
 121 {
 122     return sextract32(FIELD_EX32(simd_data(desc), VDATA, LMUL), 0, 3);
 123 }
 124
 125 static inline uint32_t vext_vta(uint32_t desc)
 126 {
 127     return FIELD_EX32(simd_data(desc), VDATA, VTA);
 128 }
 129
 130 static inline uint32_t vext_vta_all_1s(uint32_t desc)
 131 {
 132     return FIELD_EX32(simd_data(desc), VDATA, VTA_ALL_1S);
 133 }
 134
 135 /*
 136  * Get the maximum number of elements can be operated.
 137  *
 138  * log2_esz: log2 of element size in bytes.
 139  */
 140 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
 141 {
 142     /*
 143      * As simd_desc support at most 2048 bytes, the max vlen is 1024 bits.
 144      * so vlen in bytes (vlenb) is encoded as maxsz.
 145      */
 146     uint32_t vlenb = simd_maxsz(desc);
 147
 148     /* Return VLMAX */
 149     int scale = vext_lmul(desc) - log2_esz;
 150     return scale < 0 ? vlenb >> -scale : vlenb << scale;
 151 }
 152
 153 /*
 154  * Get number of total elements, including prestart, body and tail elements.
 155  * Note that when LMUL < 1, the tail includes the elements past VLMAX that
 156  * are held in the same vector register.
 157  */
 158 static inline uint32_t vext_get_total_elems(CPURISCVState *env, uint32_t desc,
 159                                             uint32_t esz)
 160 {
 161     uint32_t vlenb = simd_maxsz(desc);
 162     uint32_t sew = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
 163     int8_t emul = ctzl(esz) - ctzl(sew) + vext_lmul(desc) < 0 ? 0 :
 164                   ctzl(esz) - ctzl(sew) + vext_lmul(desc);
 165     return (vlenb << emul) / esz;
 166 }
 167
 168 static inline target_ulong adjust_addr(CPURISCVState *env, target_ulong addr)
 169 {
 170     return (addr & env->cur_pmmask) | env->cur_pmbase;
 171 }
 172
 173 /*
 174  * This function checks watchpoint before real load operation.
 175  *
 176  * In softmmu mode, the TLB API probe_access is enough for watchpoint check.
 177  * In user mode, there is no watchpoint support now.
 178  *
 179  * It will trigger an exception if there is no mapping in TLB
 180  * and page table walk can't fill the TLB entry. Then the guest
 181  * software can return here after process the exception or never return.
 182  */
 183 static void probe_pages(CPURISCVState *env, target_ulong addr,
 184                         target_ulong len, uintptr_t ra,
 185                         MMUAccessType access_type)
 186 {
 187     target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
 188     target_ulong curlen = MIN(pagelen, len);
 189
 190     probe_access(env, adjust_addr(env, addr), curlen, access_type,
 191                  cpu_mmu_index(env, false), ra);
 192     if (len > curlen) {
 193         addr += curlen;
 194         curlen = len - curlen;
 195         probe_access(env, adjust_addr(env, addr), curlen, access_type,
 196                      cpu_mmu_index(env, false), ra);
 197     }
 198 }
 199
 200 /* set agnostic elements to 1s */
 201 static void vext_set_elems_1s(void *base, uint32_t is_agnostic, uint32_t cnt,
 202                               uint32_t tot)
 203 {
 204     if (is_agnostic == 0) {
 205         /* policy undisturbed */
 206         return;
 207     }
 208     if (tot - cnt == 0) {
 209         return ;
 210     }
 211     memset(base + cnt, -1, tot - cnt);
 212 }
 213
 214 static inline void vext_set_elem_mask(void *v0, int index,
 215                                       uint8_t value)
 216 {
 217     int idx = index / 64;
 218     int pos = index % 64;
 219     uint64_t old = ((uint64_t *)v0)[idx];
 220     ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
 221 }
 222
 223 /*
 224  * Earlier designs (pre-0.9) had a varying number of bits
 225  * per mask value (MLEN). In the 0.9 design, MLEN=1.
 226  * (Section 4.5)
 227  */
 228 static inline int vext_elem_mask(void *v0, int index)
 229 {
 230     int idx = index / 64;
 231     int pos = index  % 64;
 232     return (((uint64_t *)v0)[idx] >> pos) & 1;
 233 }
 234
/*
 * elements operations for load and store
 *
 * Signature shared by the per-element load/store callbacks generated below:
 *   env     - CPU state, forwarded to the guest memory accessor
 *   addr    - guest address of the element in memory
 *   idx     - index of the element inside the data pointed to by vd
 *   vd      - host pointer to the vector register data
 *   retaddr - host return address, forwarded to the guest memory accessor
 */
typedef void vext_ldst_elem_fn(CPURISCVState *env, target_ulong addr,
                               uint32_t idx, void *vd, uintptr_t retaddr);
 238
 239 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)            \
 240 static void NAME(CPURISCVState *env, abi_ptr addr,         \
 241                  uint32_t idx, void *vd, uintptr_t retaddr)\
 242 {                                                          \
 243     ETYPE *cur = ((ETYPE *)vd + H(idx));                   \
 244     *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);      \
 245 }                                                          \
 246
 247 GEN_VEXT_LD_ELEM(lde_b, int8_t,  H1, ldsb)
 248 GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
 249 GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
 250 GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
 251
 252 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)            \
 253 static void NAME(CPURISCVState *env, abi_ptr addr,         \
 254                  uint32_t idx, void *vd, uintptr_t retaddr)\
 255 {                                                          \
 256     ETYPE data = *((ETYPE *)vd + H(idx));                  \
 257     cpu_##STSUF##_data_ra(env, addr, data, retaddr);       \
 258 }
 259
 260 GEN_VEXT_ST_ELEM(ste_b, int8_t,  H1, stb)
 261 GEN_VEXT_ST_ELEM(ste_h, int16_t, H2, stw)
 262 GEN_VEXT_ST_ELEM(ste_w, int32_t, H4, stl)
 263 GEN_VEXT_ST_ELEM(ste_d, int64_t, H8, stq)
 264
 265 /*
 266  *** stride: access vector element from strided memory
 267  */
 268 static void
 269 vext_ldst_stride(void *vd, void *v0, target_ulong base,
 270                  target_ulong stride, CPURISCVState *env,
 271                  uint32_t desc, uint32_t vm,
 272                  vext_ldst_elem_fn *ldst_elem,
 273                  uint32_t log2_esz, uintptr_t ra)
 274 {
 275     uint32_t i, k;
 276     uint32_t nf = vext_nf(desc);
 277     uint32_t max_elems = vext_max_elems(desc, log2_esz);
 278     uint32_t esz = 1 << log2_esz;
 279     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
 280     uint32_t vta = vext_vta(desc);
 281
 282     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
 283         if (!vm && !vext_elem_mask(v0, i)) {
 284             continue;
 285         }
 286
 287         k = 0;
 288         while (k < nf) {
 289             target_ulong addr = base + stride * i + (k << log2_esz);
 290             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
 291             k++;
 292         }
 293     }
 294     env->vstart = 0;
 295     /* set tail elements to 1s */
 296     for (k = 0; k < nf; ++k) {
 297         vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
 298                           (k * max_elems + max_elems) * esz);
 299     }
 300     if (nf * max_elems % total_elems != 0) {
 301         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
 302         uint32_t registers_used =
 303             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
 304         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
 305                           registers_used * vlenb);
 306     }
 307 }
 308
 309 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)                        \
 310 void HELPER(NAME)(void *vd, void * v0, target_ulong base,               \
 311                   target_ulong stride, CPURISCVState *env,              \
 312                   uint32_t desc)                                        \
 313 {                                                                       \
 314     uint32_t vm = vext_vm(desc);                                        \
 315     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN,      \
 316                      ctzl(sizeof(ETYPE)), GETPC());                     \
 317 }
 318
 319 GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b)
 320 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h)
 321 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w)
 322 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d)
 323
 324 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
 325 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
 326                   target_ulong stride, CPURISCVState *env,              \
 327                   uint32_t desc)                                        \
 328 {                                                                       \
 329     uint32_t vm = vext_vm(desc);                                        \
 330     vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN,     \
 331                      ctzl(sizeof(ETYPE)), GETPC());                     \
 332 }
 333
 334 GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b)
 335 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h)
 336 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w)
 337 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
 338
 339 /*
 340  *** unit-stride: access elements stored contiguously in memory
 341  */
 342
 343 /* unmasked unit-stride load and store operation*/
 344 static void
 345 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
 346              vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
 347              uintptr_t ra)
 348 {
 349     uint32_t i, k;
 350     uint32_t nf = vext_nf(desc);
 351     uint32_t max_elems = vext_max_elems(desc, log2_esz);
 352     uint32_t esz = 1 << log2_esz;
 353     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
 354     uint32_t vta = vext_vta(desc);
 355
 356     /* load bytes from guest memory */
 357     for (i = env->vstart; i < evl; i++, env->vstart++) {
 358         k = 0;
 359         while (k < nf) {
 360             target_ulong addr = base + ((i * nf + k) << log2_esz);
 361             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
 362             k++;
 363         }
 364     }
 365     env->vstart = 0;
 366     /* set tail elements to 1s */
 367     for (k = 0; k < nf; ++k) {
 368         vext_set_elems_1s(vd, vta, (k * max_elems + evl) * esz,
 369                           (k * max_elems + max_elems) * esz);
 370     }
 371     if (nf * max_elems % total_elems != 0) {
 372         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
 373         uint32_t registers_used =
 374             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
 375         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
 376                           registers_used * vlenb);
 377     }
 378 }
 379
 380 /*
 381  * masked unit-stride load and store operation will be a special case of stride,
 382  * stride = NF * sizeof (MTYPE)
 383  */
 384
 385 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN)                            \
 386 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,         \
 387                          CPURISCVState *env, uint32_t desc)             \
 388 {                                                                       \
 389     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));             \
 390     vext_ldst_stride(vd, v0, base, stride, env, desc, false, LOAD_FN,   \
 391                      ctzl(sizeof(ETYPE)), GETPC());                     \
 392 }                                                                       \
 393                                                                         \
 394 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
 395                   CPURISCVState *env, uint32_t desc)                    \
 396 {                                                                       \
 397     vext_ldst_us(vd, base, env, desc, LOAD_FN,                          \
 398                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                \
 399 }
 400
 401 GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b)
 402 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h)
 403 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w)
 404 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d)
 405
 406 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN)                            \
 407 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
 408                          CPURISCVState *env, uint32_t desc)              \
 409 {                                                                        \
 410     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
 411     vext_ldst_stride(vd, v0, base, stride, env, desc, false, STORE_FN,   \
 412                      ctzl(sizeof(ETYPE)), GETPC());                      \
 413 }                                                                        \
 414                                                                          \
 415 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
 416                   CPURISCVState *env, uint32_t desc)                     \
 417 {                                                                        \
 418     vext_ldst_us(vd, base, env, desc, STORE_FN,                          \
 419                  ctzl(sizeof(ETYPE)), env->vl, GETPC());                 \
 420 }
 421
 422 GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b)
 423 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h)
 424 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w)
 425 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d)
 426
 427 /*
 428  *** unit stride mask load and store, EEW = 1
 429  */
 430 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
 431                     CPURISCVState *env, uint32_t desc)
 432 {
 433     /* evl = ceil(vl/8) */
 434     uint8_t evl = (env->vl + 7) >> 3;
 435     vext_ldst_us(vd, base, env, desc, lde_b,
 436                  0, evl, GETPC());
 437 }
 438
 439 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
 440                     CPURISCVState *env, uint32_t desc)
 441 {
 442     /* evl = ceil(vl/8) */
 443     uint8_t evl = (env->vl + 7) >> 3;
 444     vext_ldst_us(vd, base, env, desc, ste_b,
 445                  0, evl, GETPC());
 446 }
 447
 448 /*
 449  *** index: access vector element from indexed memory
 450  */
 451 typedef target_ulong vext_get_index_addr(target_ulong base,
 452         uint32_t idx, void *vs2);
 453
 454 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H)        \
 455 static target_ulong NAME(target_ulong base,            \
 456                          uint32_t idx, void *vs2)      \
 457 {                                                      \
 458     return (base + *((ETYPE *)vs2 + H(idx)));          \
 459 }
 460
 461 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t,  H1)
 462 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
 463 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
 464 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
 465
 466 static inline void
 467 vext_ldst_index(void *vd, void *v0, target_ulong base,
 468                 void *vs2, CPURISCVState *env, uint32_t desc,
 469                 vext_get_index_addr get_index_addr,
 470                 vext_ldst_elem_fn *ldst_elem,
 471                 uint32_t log2_esz, uintptr_t ra)
 472 {
 473     uint32_t i, k;
 474     uint32_t nf = vext_nf(desc);
 475     uint32_t vm = vext_vm(desc);
 476     uint32_t max_elems = vext_max_elems(desc, log2_esz);
 477     uint32_t esz = 1 << log2_esz;
 478     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
 479     uint32_t vta = vext_vta(desc);
 480
 481     /* load bytes from guest memory */
 482     for (i = env->vstart; i < env->vl; i++, env->vstart++) {
 483         if (!vm && !vext_elem_mask(v0, i)) {
 484             continue;
 485         }
 486
 487         k = 0;
 488         while (k < nf) {
 489             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
 490             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
 491             k++;
 492         }
 493     }
 494     env->vstart = 0;
 495     /* set tail elements to 1s */
 496     for (k = 0; k < nf; ++k) {
 497         vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
 498                           (k * max_elems + max_elems) * esz);
 499     }
 500     if (nf * max_elems % total_elems != 0) {
 501         uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
 502         uint32_t registers_used =
 503             ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
 504         vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
 505                           registers_used * vlenb);
 506     }
 507 }
 508
 509 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN)                  \
 510 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
 511                   void *vs2, CPURISCVState *env, uint32_t desc)            \
 512 {                                                                          \
 513     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,                \
 514                     LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
 515 }
 516
 517 GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b)
 518 GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h)
 519 GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w)
 520 GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d)
 521 GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b)
 522 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h)
 523 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w)
 524 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d)
 525 GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b)
 526 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h)
 527 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w)
 528 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d)
 529 GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b)
 530 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h)
 531 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w)
 532 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d)
 533
 534 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
 535 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
 536                   void *vs2, CPURISCVState *env, uint32_t desc)  \
 537 {                                                                \
 538     vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN,      \
 539                     STORE_FN, ctzl(sizeof(ETYPE)),               \
 540                     GETPC());                                    \
 541 }
 542
 543 GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b)
 544 GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h)
 545 GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w)
 546 GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d)
 547 GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b)
 548 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h)
 549 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w)
 550 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d)
 551 GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b)
 552 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h)
 553 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w)
 554 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d)
 555 GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b)
 556 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h)
 557 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w)
 558 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d)
 559
 560 /*
 *** unit-stride fault-only-first load instructions
 562  */
/*
 * Common implementation of the loads generated by GEN_VEXT_LDFF.
 *
 * Works in two passes.  The first pass probes the memory of every active
 * element in [vstart, vl); if an element other than element 0 cannot be
 * probed, env->vl is reduced to that element's index.  The second pass loads
 * the active elements in [vstart, vl) using the (possibly reduced) vl, and
 * the tail is then filled according to vta.
 *
 *   vd        - destination register data
 *   v0        - mask register data, consulted when vm is 0
 *   base      - guest base address
 *   desc      - descriptor carrying nf, vm, vta and sizing information
 *   ldst_elem - per-element load callback
 *   log2_esz  - log2 of the element size in bytes
 *   ra        - host return address forwarded to the memory accessors
 */
static inline void
vext_ldff(void *vd, void *v0, target_ulong base,
          CPURISCVState *env, uint32_t desc,
          vext_ldst_elem_fn *ldst_elem,
          uint32_t log2_esz, uintptr_t ra)
{
    void *host;
    /*
     * vl == 0 means "no truncation happened": element 0 never takes the
     * truncating path below, so 0 is free to act as the sentinel.
     */
    uint32_t i, k, vl = 0;
    uint32_t nf = vext_nf(desc);
    uint32_t vm = vext_vm(desc);
    uint32_t max_elems = vext_max_elems(desc, log2_esz);
    uint32_t esz = 1 << log2_esz;
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);
    uint32_t vta = vext_vta(desc);
    target_ulong addr, offset, remain;

    /* probe every access */
    for (i = env->vstart; i < env->vl; i++) {
        /* masked-off elements are neither probed nor loaded */
        if (!vm && !vext_elem_mask(v0, i)) {
            continue;
        }
        /* each element occupies nf fields of (1 << log2_esz) bytes */
        addr = adjust_addr(env, base + i * (nf << log2_esz));
        if (i == 0) {
            /* element 0 is probed unconditionally with probe_pages() */
            probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD);
        } else {
            /* if it triggers an exception, no need to check watchpoint */
            remain = nf << log2_esz;
            while (remain > 0) {
                /* bytes remaining in the page that contains addr */
                offset = -(addr | TARGET_PAGE_MASK);
                host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD,
                                         cpu_mmu_index(env, false));
                if (host) {
#ifdef CONFIG_USER_ONLY
                    /* user-only build: an unreadable range ends the load */
                    if (page_check_range(addr, offset, PAGE_READ) < 0) {
                        vl = i;
                        goto ProbeSuccess;
                    }
#else
                    probe_pages(env, addr, offset, ra, MMU_DATA_LOAD);
#endif
                } else {
                    /* no host address: stop here and shrink vl to i */
                    vl = i;
                    goto ProbeSuccess;
                }
                /* the element ends inside this page: done with it */
                if (remain <=  offset) {
                    break;
                }
                /* otherwise continue with the part on the next page */
                remain -= offset;
                addr = adjust_addr(env, addr + offset);
            }
        }
    }
ProbeSuccess:
    /* load bytes from guest memory */
    if (vl != 0) {
        env->vl = vl;
    }
    /* note: unlike the probing loop's peers, env->vstart is not advanced */
    for (i = env->vstart; i < env->vl; i++) {
        k = 0;
        if (!vm && !vext_elem_mask(v0, i)) {
            continue;
        }
        while (k < nf) {
            /* this declaration shadows the function-scope addr */
            target_ulong addr = base + ((i * nf + k) << log2_esz);
            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
            k++;
        }
    }
    env->vstart = 0;
    /* set tail elements to 1s */
    for (k = 0; k < nf; ++k) {
        vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
                          (k * max_elems + max_elems) * esz);
    }
    /* the fields may end mid-register: fill up to the register boundary */
    if (nf * max_elems % total_elems != 0) {
        uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
        uint32_t registers_used =
            ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
        vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
                          registers_used * vlenb);
    }
}
 645
 646 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN)               \
 647 void HELPER(NAME)(void *vd, void *v0, target_ulong base,  \
 648                   CPURISCVState *env, uint32_t desc)      \
 649 {                                                         \
 650     vext_ldff(vd, v0, base, env, desc, LOAD_FN,           \
 651               ctzl(sizeof(ETYPE)), GETPC());              \
 652 }
 653
 654 GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b)
 655 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h)
 656 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w)
 657 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d)
 658
 659 #define DO_SWAP(N, M) (M)
 660 #define DO_AND(N, M)  (N & M)
 661 #define DO_XOR(N, M)  (N ^ M)
 662 #define DO_OR(N, M)   (N | M)
 663 #define DO_ADD(N, M)  (N + M)
 664
 665 /* Signed min/max */
 666 #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
 667 #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
 668
 669 /* Unsigned min/max */
 670 #define DO_MAXU(N, M) DO_MAX((UMTYPE)N, (UMTYPE)M)
 671 #define DO_MINU(N, M) DO_MIN((UMTYPE)N, (UMTYPE)M)
 672
 673 /*
 674  *** load and store whole register instructions
 675  */
 676 static void
 677 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
 678                 vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uintptr_t ra)
 679 {
 680     uint32_t i, k, off, pos;
 681     uint32_t nf = vext_nf(desc);
 682     uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
 683     uint32_t max_elems = vlenb >> log2_esz;
 684
 685     k = env->vstart / max_elems;
 686     off = env->vstart % max_elems;
 687
 688     if (off) {
 689         /* load/store rest of elements of current segment pointed by vstart */
 690         for (pos = off; pos < max_elems; pos++, env->vstart++) {
 691             target_ulong addr = base + ((pos + k * max_elems) << log2_esz);
 692             ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd, ra);
 693         }
 694         k++;
 695     }
 696
 697     /* load/store elements for rest of segments */
 698     for (; k < nf; k++) {
 699         for (i = 0; i < max_elems; i++, env->vstart++) {
 700             target_ulong addr = base + ((i + k * max_elems) << log2_esz);
 701             ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
 702         }
 703     }
 704
 705     env->vstart = 0;
 706 }
 707
 708 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN)      \
 709 void HELPER(NAME)(void *vd, target_ulong base,       \
 710                   CPURISCVState *env, uint32_t desc) \
 711 {                                                    \
 712     vext_ldst_whole(vd, base, env, desc, LOAD_FN,    \
 713                     ctzl(sizeof(ETYPE)), GETPC());   \
 714 }
 715
 716 GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b)
 717 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h)
 718 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w)
 719 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d)
 720 GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b)
 721 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h)
 722 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w)
 723 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d)
 724 GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b)
 725 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h)
 726 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w)
 727 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d)
 728 GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b)
 729 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h)
 730 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w)
 731 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d)
 732
 733 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN)     \
 734 void HELPER(NAME)(void *vd, target_ulong base,       \
 735                   CPURISCVState *env, uint32_t desc) \
 736 {                                                    \
 737     vext_ldst_whole(vd, base, env, desc, STORE_FN,   \
 738                     ctzl(sizeof(ETYPE)), GETPC());   \
 739 }
 740
 741 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b)
 742 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b)
 743 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b)
 744 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b)
 745
 746 /*
 747  *** Vector Integer Arithmetic Instructions
 748  */
 749
/*
 * expand macro args before macro
 *
 * The extra level of indirection lets an argument that is itself a macro
 * expanding to a comma-separated list (such as the OP_* type lists below)
 * be expanded first, so that each item binds to its own parameter of the
 * target macro.
 */
#define RVVCALL(macro, ...)  macro(__VA_ARGS__)
 752
/*
 * (TD, T1, T2, TX1, TX2)
 *
 * Type lists consumed by the per-element operation generators:
 *   TD       - type of the destination element
 *   T1, T2   - types the vs1 / vs2 elements are read as
 *   TX1, TX2 - types the two operands are converted to before the operation
 *
 * Naming: the three letters give the signedness (S/U) of TD, T1 and T2 in
 * that order.  In the WOP_* lists TD is twice as wide as T1/T2; in the
 * NOP_* lists T2 is twice as wide as TD.  The _B/_H/_W/_D suffix follows
 * the width of T1 (8/16/32/64 bits).
 */
#define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
#define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
#define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
#define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
#define OP_UUU_B uint8_t, uint8_t, uint8_t, uint8_t, uint8_t
#define OP_UUU_H uint16_t, uint16_t, uint16_t, uint16_t, uint16_t
#define OP_UUU_W uint32_t, uint32_t, uint32_t, uint32_t, uint32_t
#define OP_UUU_D uint64_t, uint64_t, uint64_t, uint64_t, uint64_t
#define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
#define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
#define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
#define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
#define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
#define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
#define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
#define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
#define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
#define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
#define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
#define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
#define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
#define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
#define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
#define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
#define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
#define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
#define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
#define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
#define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
#define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
 784
 785 /* operation of two vector elements */
 786 typedef void opivv2_fn(void *vd, void *vs1, void *vs2, int i);
 787
 788 #define OPIVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)    \
 789 static void do_##NAME(void *vd, void *vs1, void *vs2, int i)    \
 790 {                                                               \
 791     TX1 s1 = *((T1 *)vs1 + HS1(i));                             \
 792     TX2 s2 = *((T2 *)vs2 + HS2(i));                             \
 793     *((TD *)vd + HD(i)) = OP(s2, s1);                           \
 794 }
 795 #define DO_SUB(N, M) (N - M)
 796 #define DO_RSUB(N, M) (M - N)
 797
/*
 * Per-element functions do_vadd_vv_* and do_vsub_vv_* for 8/16/32/64-bit
 * signed elements; destination and both sources use the same index macro.
 */
RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
 806
 807 static void do_vext_vv(void *vd, void *v0, void *vs1, void *vs2,
 808                        CPURISCVState *env, uint32_t desc,
 809                        opivv2_fn *fn, uint32_t esz)
 810 {
 811     uint32_t vm = vext_vm(desc);
 812     uint32_t vl = env->vl;
 813     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
 814     uint32_t vta = vext_vta(desc);
 815     uint32_t i;
 816
 817     for (i = env->vstart; i < vl; i++) {
 818         if (!vm && !vext_elem_mask(v0, i)) {
 819             continue;
 820         }
 821         fn(vd, vs1, vs2, i);
 822     }
 823     env->vstart = 0;
 824     /* set tail elements to 1s */
 825     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
 826 }
 827
 828 /* generate the helpers for OPIVV */
 829 #define GEN_VEXT_VV(NAME, ESZ)                            \
 830 void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
 831                   void *vs2, CPURISCVState *env,          \
 832                   uint32_t desc)                          \
 833 {                                                         \
 834     do_vext_vv(vd, v0, vs1, vs2, env, desc,               \
 835                do_##NAME, ESZ);                           \
 836 }
 837
/*
 * Helper entry points for vadd_vv_* and vsub_vv_*; the second argument is
 * the element size in bytes that GEN_VEXT_VV forwards to do_vext_vv().
 */
GEN_VEXT_VV(vadd_vv_b, 1)
GEN_VEXT_VV(vadd_vv_h, 2)
GEN_VEXT_VV(vadd_vv_w, 4)
GEN_VEXT_VV(vadd_vv_d, 8)
GEN_VEXT_VV(vsub_vv_b, 1)
GEN_VEXT_VV(vsub_vv_h, 2)
GEN_VEXT_VV(vsub_vv_w, 4)
GEN_VEXT_VV(vsub_vv_d, 8)
 846
 847 typedef void opivx2_fn(void *vd, target_long s1, void *vs2, int i);
 848
 849 /*
 850  * (T1)s1 gives the real operator type.
 851  * (TX1)(T1)s1 expands the operator type of widen or narrow operations.
 852  */
 853 #define OPIVX2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
 854 static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
 855 {                                                                   \
 856     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
 857     *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1);                      \
 858 }
 859
/*
 * Per-element functions do_vadd_vx_*, do_vsub_vx_* and do_vrsub_vx_* for
 * 8/16/32/64-bit signed elements.  vrsub uses DO_RSUB, i.e. the operands of
 * the subtraction are swapped with respect to vsub.
 */
RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
 872
 873 static void do_vext_vx(void *vd, void *v0, target_long s1, void *vs2,
 874                        CPURISCVState *env, uint32_t desc,
 875                        opivx2_fn fn, uint32_t esz)
 876 {
 877     uint32_t vm = vext_vm(desc);
 878     uint32_t vl = env->vl;
 879     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
 880     uint32_t vta = vext_vta(desc);
 881     uint32_t i;
 882
 883     for (i = env->vstart; i < vl; i++) {
 884         if (!vm && !vext_elem_mask(v0, i)) {
 885             continue;
 886         }
 887         fn(vd, s1, vs2, i);
 888     }
 889     env->vstart = 0;
 890     /* set tail elements to 1s */
 891     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
 892 }
 893
 894 /* generate the helpers for OPIVX */
 895 #define GEN_VEXT_VX(NAME, ESZ)                            \
 896 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
 897                   void *vs2, CPURISCVState *env,          \
 898                   uint32_t desc)                          \
 899 {                                                         \
 900     do_vext_vx(vd, v0, s1, vs2, env, desc,                \
 901                do_##NAME, ESZ);                           \
 902 }
 903
/*
 * Helper entry points for vadd_vx_*, vsub_vx_* and vrsub_vx_*; the second
 * argument is the element size in bytes that GEN_VEXT_VX forwards to
 * do_vext_vx().
 */
GEN_VEXT_VX(vadd_vx_b, 1)
GEN_VEXT_VX(vadd_vx_h, 2)
GEN_VEXT_VX(vadd_vx_w, 4)
GEN_VEXT_VX(vadd_vx_d, 8)
GEN_VEXT_VX(vsub_vx_b, 1)
GEN_VEXT_VX(vsub_vx_h, 2)
GEN_VEXT_VX(vsub_vx_w, 4)
GEN_VEXT_VX(vsub_vx_d, 8)
GEN_VEXT_VX(vrsub_vx_b, 1)
GEN_VEXT_VX(vrsub_vx_h, 2)
GEN_VEXT_VX(vrsub_vx_w, 4)
GEN_VEXT_VX(vrsub_vx_d, 8)
 916
 917 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
 918 {
 919     intptr_t oprsz = simd_oprsz(desc);
 920     intptr_t i;
 921
 922     for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
 923         *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
 924     }
 925 }
 926
 927 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
 928 {
 929     intptr_t oprsz = simd_oprsz(desc);
 930     intptr_t i;
 931
 932     for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
 933         *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
 934     }
 935 }
 936
 937 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
 938 {
 939     intptr_t oprsz = simd_oprsz(desc);
 940     intptr_t i;
 941
 942     for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
 943         *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
 944     }
 945 }
 946
 947 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
 948 {
 949     intptr_t oprsz = simd_oprsz(desc);
 950     intptr_t i;
 951
 952     for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
 953         *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
 954     }
 955 }
 956
 957 /* Vector Widening Integer Add/Subtract */
 958 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
 959 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
 960 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
 961 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
 962 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
 963 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
 964 #define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
 965 #define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
 966 #define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
 967 #define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
 968 #define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
 969 #define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
/*
 * Widening add/subtract, vector-vector forms with two narrow sources.
 * The destination index macro (first H argument) is the one for the
 * double-width element; both source index macros are for the narrow one.
 */
RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
 982 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
 983 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
 984 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
 985 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
 986 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
 987 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
 988 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
 989 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
 990 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
 991 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
 992 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
 993 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
/*
 * Helper entry points for the widening .vv and .wv add/subtract forms.
 * The size argument is that of the double-width destination element
 * (2/4/8 bytes for the _b/_h/_w variants).
 */
GEN_VEXT_VV(vwaddu_vv_b, 2)
GEN_VEXT_VV(vwaddu_vv_h, 4)
GEN_VEXT_VV(vwaddu_vv_w, 8)
GEN_VEXT_VV(vwsubu_vv_b, 2)
GEN_VEXT_VV(vwsubu_vv_h, 4)
GEN_VEXT_VV(vwsubu_vv_w, 8)
GEN_VEXT_VV(vwadd_vv_b, 2)
GEN_VEXT_VV(vwadd_vv_h, 4)
GEN_VEXT_VV(vwadd_vv_w, 8)
GEN_VEXT_VV(vwsub_vv_b, 2)
GEN_VEXT_VV(vwsub_vv_h, 4)
GEN_VEXT_VV(vwsub_vv_w, 8)
GEN_VEXT_VV(vwaddu_wv_b, 2)
GEN_VEXT_VV(vwaddu_wv_h, 4)
GEN_VEXT_VV(vwaddu_wv_w, 8)
GEN_VEXT_VV(vwsubu_wv_b, 2)
GEN_VEXT_VV(vwsubu_wv_h, 4)
GEN_VEXT_VV(vwsubu_wv_w, 8)
GEN_VEXT_VV(vwadd_wv_b, 2)
GEN_VEXT_VV(vwadd_wv_h, 4)
GEN_VEXT_VV(vwadd_wv_w, 8)
GEN_VEXT_VV(vwsub_wv_b, 2)
GEN_VEXT_VV(vwsub_wv_h, 4)
GEN_VEXT_VV(vwsub_wv_w, 8)

1018
/*
 * Widening add/subtract, vector-scalar forms with a narrow vs2.
 * The H arguments are (HD, HS2): the destination uses the index macro of
 * the double-width element, vs2 the one of the narrow element.
 */
RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
1031 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
1032 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
1033 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
1034 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
1035 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
1036 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
1037 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
1038 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
1039 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
1040 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
1041 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
1042 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
/*
 * Helper entry points for the widening .vx and .wx add/subtract forms.
 * The size argument is that of the double-width destination element
 * (2/4/8 bytes for the _b/_h/_w variants).
 */
GEN_VEXT_VX(vwaddu_vx_b, 2)
GEN_VEXT_VX(vwaddu_vx_h, 4)
GEN_VEXT_VX(vwaddu_vx_w, 8)
GEN_VEXT_VX(vwsubu_vx_b, 2)
GEN_VEXT_VX(vwsubu_vx_h, 4)
GEN_VEXT_VX(vwsubu_vx_w, 8)
GEN_VEXT_VX(vwadd_vx_b, 2)
GEN_VEXT_VX(vwadd_vx_h, 4)
GEN_VEXT_VX(vwadd_vx_w, 8)
GEN_VEXT_VX(vwsub_vx_b, 2)
GEN_VEXT_VX(vwsub_vx_h, 4)
GEN_VEXT_VX(vwsub_vx_w, 8)
GEN_VEXT_VX(vwaddu_wx_b, 2)
GEN_VEXT_VX(vwaddu_wx_h, 4)
GEN_VEXT_VX(vwaddu_wx_w, 8)
GEN_VEXT_VX(vwsubu_wx_b, 2)
GEN_VEXT_VX(vwsubu_wx_h, 4)
GEN_VEXT_VX(vwsubu_wx_w, 8)
GEN_VEXT_VX(vwadd_wx_b, 2)
GEN_VEXT_VX(vwadd_wx_h, 4)
GEN_VEXT_VX(vwadd_wx_w, 8)
GEN_VEXT_VX(vwsub_wx_b, 2)
GEN_VEXT_VX(vwsub_wx_h, 4)
GEN_VEXT_VX(vwsub_wx_w, 8)
1067
/* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
/*
 * N + M + C and N - M - C, where C is the carry/borrow-in.  Every argument
 * is parenthesized so that an expression argument cannot regroup the sum.
 */
#define DO_VADC(N, M, C) ((N) + (M) + (C))
#define DO_VSBC(N, M, C) ((N) - (M) - (C))
1071
/*
 * Generate an add-with-carry / subtract-with-borrow helper with two vector
 * sources.  For every element from env->vstart up to env->vl the bit of v0
 * is used as carry/borrow-in and DO_OP(vs2 element, vs1 element, carry) is
 * stored to vd; no element is skipped.  env->vstart is then reset and the
 * tail of vd is passed to vext_set_elems_1s().
 */
#define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP)              \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    const uint32_t esz = sizeof(ETYPE);                       \
    const uint32_t vl = env->vl;                              \
    const uint32_t elems =                                    \
        vext_get_total_elems(env, desc, esz);                 \
    const uint32_t vta = vext_vta(desc);                      \
    ETYPE *dst = vd;                                          \
    ETYPE *src1 = vs1;                                        \
    ETYPE *src2 = vs2;                                        \
    uint32_t idx = env->vstart;                               \
                                                              \
    while (idx < vl) {                                        \
        ETYPE lhs = src2[H(idx)];                             \
        ETYPE rhs = src1[H(idx)];                             \
        ETYPE cin = vext_elem_mask(v0, idx);                  \
                                                              \
        dst[H(idx)] = DO_OP(lhs, rhs, cin);                   \
        idx++;                                                \
    }                                                         \
    env->vstart = 0;                                          \
    /* set tail elements to 1s */                             \
    vext_set_elems_1s(vd, vta, vl * esz, elems * esz);        \
}
1094
/*
 * vadc/vsbc with a vector second operand, instantiated on unsigned
 * 8/16/32/64-bit elements.
 */
GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t,  H1, DO_VADC)
GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)

GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t,  H1, DO_VSBC)
GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
1104
/*
 * Generate an add-with-carry / subtract-with-borrow helper with a scalar
 * operand.  The scalar s1 is converted through target_long to ETYPE.  For
 * every element from env->vstart up to env->vl the bit of v0 is used as
 * carry/borrow-in and DO_OP(vs2 element, scalar, carry) is stored to vd;
 * no element is skipped.  env->vstart is then reset and the tail of vd is
 * passed to vext_set_elems_1s().
 */
#define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP)                         \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,        \
                  CPURISCVState *env, uint32_t desc)                     \
{                                                                        \
    const uint32_t esz = sizeof(ETYPE);                                  \
    const uint32_t vl = env->vl;                                         \
    const uint32_t elems = vext_get_total_elems(env, desc, esz);         \
    const uint32_t vta = vext_vta(desc);                                 \
    ETYPE scalar = (ETYPE)(target_long)s1;                               \
    ETYPE *dst = vd;                                                     \
    ETYPE *src2 = vs2;                                                   \
    uint32_t idx = env->vstart;                                          \
                                                                         \
    while (idx < vl) {                                                   \
        ETYPE lhs = src2[H(idx)];                                        \
        ETYPE cin = vext_elem_mask(v0, idx);                             \
                                                                         \
        dst[H(idx)] = DO_OP(lhs, scalar, cin);                           \
        idx++;                                                           \
    }                                                                    \
    env->vstart = 0;                                                     \
    /* set tail elements to 1s */                                        \
    vext_set_elems_1s(vd, vta, vl * esz, elems * esz);                   \
}
1125
/*
 * vadc/vsbc with a scalar second operand, instantiated on unsigned
 * 8/16/32/64-bit elements.
 */
GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t,  H1, DO_VADC)
GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)

GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t,  H1, DO_VSBC)
GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1135
/*
 * Carry-out / borrow-out predicates.
 *
 * DO_MADC: non-zero when N + M (+ 1 if C) does not fit in the type of N.
 * The sum is truncated to the type of N and compared with N: it wrapped
 * iff the truncated sum is smaller than N (or not larger, with carry-in).
 * DO_MSBC: non-zero when N - M (- 1 if C) needs a borrow, i.e. N < M, or
 * N <= M with borrow-in.
 *
 * All arguments are parenthesized so expression arguments keep their
 * grouping inside the comparisons.
 */
#define DO_MADC(N, M, C) ((C) ? (__typeof(N))((N) + (M) + 1) <= (N) :   \
                          (__typeof(N))((N) + (M)) < (N))
#define DO_MSBC(N, M, C) ((C) ? (N) <= (M) : (N) < (M))
1139
/*
 * Generate a carry-out / borrow-out helper with two vector sources.
 * For every element from env->vstart up to env->vl, mask bit i of vd is
 * set to DO_OP(vs2 element, vs1 element, carry), where carry is the v0 bit
 * when vext_vm(desc) is zero and 0 otherwise.  After the loop env->vstart
 * is reset; if vext_vta_all_1s(desc) is set, the remaining mask bits up to
 * cfg.vlen are written as 1.
 */
#define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP)             \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    const uint32_t vl = env->vl;                              \
    const uint32_t unmasked = vext_vm(desc);                  \
    const uint32_t mask_bits = env_archcpu(env)->cfg.vlen;    \
    const uint32_t fill_tail = vext_vta_all_1s(desc);         \
    ETYPE *src1 = vs1;                                        \
    ETYPE *src2 = vs2;                                        \
    uint32_t idx = env->vstart;                               \
                                                              \
    while (idx < vl) {                                        \
        ETYPE lhs = src2[H(idx)];                             \
        ETYPE rhs = src1[H(idx)];                             \
        ETYPE cin = !unmasked && vext_elem_mask(v0, idx);     \
                                                              \
        vext_set_elem_mask(vd, idx, DO_OP(lhs, rhs, cin));    \
        idx++;                                                \
    }                                                         \
    env->vstart = 0;                                          \
    /* mask destination register are always tail-agnostic */  \
    /* set tail elements to 1s */                             \
    while (fill_tail && idx < mask_bits) {                    \
        vext_set_elem_mask(vd, idx, 1);                       \
        idx++;                                                \
    }                                                         \
}
1165
/*
 * vmadc/vmsbc with a vector second operand, instantiated on unsigned
 * 8/16/32/64-bit elements.
 */
GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t,  H1, DO_MADC)
GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)

GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t,  H1, DO_MSBC)
GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1175
/*
 * Generate a carry-out / borrow-out helper with a scalar operand.
 * The scalar s1 is converted through target_long to ETYPE.  For every
 * element from env->vstart up to env->vl, mask bit i of vd is set to
 * DO_OP(vs2 element, scalar, carry), where carry is the v0 bit when
 * vext_vm(desc) is zero and 0 otherwise.  After the loop env->vstart is
 * reset; if vext_vta_all_1s(desc) is set, the remaining mask bits up to
 * cfg.vlen are written as 1.
 */
#define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP)               \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1,          \
                  void *vs2, CPURISCVState *env, uint32_t desc) \
{                                                               \
    const uint32_t vl = env->vl;                                \
    const uint32_t unmasked = vext_vm(desc);                    \
    const uint32_t mask_bits = env_archcpu(env)->cfg.vlen;      \
    const uint32_t fill_tail = vext_vta_all_1s(desc);           \
    ETYPE scalar = (ETYPE)(target_long)s1;                      \
    ETYPE *src2 = vs2;                                          \
    uint32_t idx = env->vstart;                                 \
                                                                \
    while (idx < vl) {                                          \
        ETYPE lhs = src2[H(idx)];                               \
        ETYPE cin = !unmasked && vext_elem_mask(v0, idx);       \
                                                                \
        vext_set_elem_mask(vd, idx, DO_OP(lhs, scalar, cin));   \
        idx++;                                                  \
    }                                                           \
    env->vstart = 0;                                            \
    /* mask destination register are always tail-agnostic */    \
    /* set tail elements to 1s */                               \
    while (fill_tail && idx < mask_bits) {                      \
        vext_set_elem_mask(vd, idx, 1);                         \
        idx++;                                                  \
    }                                                           \
}
1201
/*
 * vmadc/vmsbc with a scalar second operand, instantiated on unsigned
 * 8/16/32/64-bit elements.
 */
GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t,  H1, DO_MADC)
GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)

GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t,  H1, DO_MSBC)
GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1211
/* Vector Bitwise Logical Instructions */
/*
 * Vector-vector and/or/xor: first the per-element functions (OPIVV2 with
 * the same-width signed type lists), then the helper entry points with the
 * element size in bytes.
 */
RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
GEN_VEXT_VV(vand_vv_b, 1)
GEN_VEXT_VV(vand_vv_h, 2)
GEN_VEXT_VV(vand_vv_w, 4)
GEN_VEXT_VV(vand_vv_d, 8)
GEN_VEXT_VV(vor_vv_b, 1)
GEN_VEXT_VV(vor_vv_h, 2)
GEN_VEXT_VV(vor_vv_w, 4)
GEN_VEXT_VV(vor_vv_d, 8)
GEN_VEXT_VV(vxor_vv_b, 1)
GEN_VEXT_VV(vxor_vv_h, 2)
GEN_VEXT_VV(vxor_vv_w, 4)
GEN_VEXT_VV(vxor_vv_d, 8)
1237
/*
 * Vector-scalar and/or/xor: first the per-element functions (OPIVX2 with
 * the same-width signed type lists), then the helper entry points with the
 * element size in bytes.
 */
RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
GEN_VEXT_VX(vand_vx_b, 1)
GEN_VEXT_VX(vand_vx_h, 2)
GEN_VEXT_VX(vand_vx_w, 4)
GEN_VEXT_VX(vand_vx_d, 8)
GEN_VEXT_VX(vor_vx_b, 1)
GEN_VEXT_VX(vor_vx_h, 2)
GEN_VEXT_VX(vor_vx_w, 4)
GEN_VEXT_VX(vor_vx_d, 8)
GEN_VEXT_VX(vxor_vx_b, 1)
GEN_VEXT_VX(vxor_vx_h, 2)
GEN_VEXT_VX(vxor_vx_w, 4)
GEN_VEXT_VX(vxor_vx_d, 8)
1262
/* Vector Single-Width Bit Shift Instructions */
/*
 * Shift N left / right by M.  Both arguments are parenthesized, so an
 * expression passed as N is shifted as a whole.  Whether DO_SRL shifts in
 * zeros or sign bits follows from the type of N at the point of use.
 */
#define DO_SLL(N, M)  ((N) << (M))
#define DO_SRL(N, M)  ((N) >> (M))
1266
/*
 * Generate a shift helper with two vector operands.
 *
 * TS1/HS1 are the type and index macro of the shift-amount source vs1 and
 * of the destination vd; TS2/HS2 are those of the shifted source vs2.
 * For every element from env->vstart up to env->vl that is not masked off
 * (vext_vm(desc) non-zero, or v0 bit set), vd gets OP(vs2 element,
 * vs1 element & MASK).  env->vstart is then reset and the tail of vd is
 * passed to vext_set_elems_1s().
 */
#define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK)             \
void HELPER(NAME)(void *vd, void *v0, void *vs1,                          \
                  void *vs2, CPURISCVState *env, uint32_t desc)           \
{                                                                         \
    const uint32_t unmasked = vext_vm(desc);                              \
    const uint32_t vl = env->vl;                                          \
    const uint32_t esz = sizeof(TS1);                                     \
    const uint32_t elems = vext_get_total_elems(env, desc, esz);          \
    const uint32_t vta = vext_vta(desc);                                  \
    TS1 *dst = vd;                                                        \
    TS1 *amounts = vs1;                                                   \
    TS2 *values = vs2;                                                    \
    uint32_t idx;                                                         \
                                                                          \
    for (idx = env->vstart; idx < vl; idx++) {                            \
        if (unmasked || vext_elem_mask(v0, idx)) {                        \
            TS1 shamt = amounts[HS1(idx)];                                \
            TS2 value = values[HS2(idx)];                                 \
                                                                          \
            dst[HS1(idx)] = OP(value, shamt & MASK);                      \
        }                                                                 \
    }                                                                     \
    env->vstart = 0;                                                      \
    /* set tail elements to 1s */                                         \
    vext_set_elems_1s(vd, vta, vl * esz, elems * esz);                    \
}
1291
/*
 * Single-width vector-vector shifts.  MASK keeps the low 3/4/5/6 bits of
 * the shift amount for 8/16/32/64-bit elements.
 */
GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t,  uint8_t, H1, H1, DO_SLL, 0x7)
GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)

GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)

/*
 * vsra reads the shifted operand (TS2) as a signed type, so DO_SRL's >> is
 * applied to a signed value.
 * NOTE(review): presumably relies on >> of a negative value being an
 * arithmetic shift with the supported compilers - confirm.
 */
GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t,  int8_t, H1, H1, DO_SRL, 0x7)
GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1306
/*
 * Generate a shift helper with one vector and one scalar operand.
 *
 * TD/HD are the type and index macro of the destination vd; TS2/HS2 are
 * those of the shifted source vs2.  The shift amount is the scalar s1
 * masked with MASK.  For every element from env->vstart up to env->vl that
 * is not masked off (vext_vm(desc) non-zero, or v0 bit set), vd gets
 * OP(vs2 element, s1 & MASK).  env->vstart is then reset and the tail of
 * vd is passed to vext_set_elems_1s().
 */
#define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1,      \
        void *vs2, CPURISCVState *env, uint32_t desc)       \
{                                                           \
    const uint32_t unmasked = vext_vm(desc);                \
    const uint32_t vl = env->vl;                            \
    const uint32_t esz = sizeof(TD);                        \
    const uint32_t elems =                                  \
        vext_get_total_elems(env, desc, esz);               \
    const uint32_t vta = vext_vta(desc);                    \
    TD *dst = vd;                                           \
    TS2 *values = vs2;                                      \
    uint32_t idx;                                           \
                                                            \
    for (idx = env->vstart; idx < vl; idx++) {              \
        if (unmasked || vext_elem_mask(v0, idx)) {          \
            TS2 value = values[HS2(idx)];                   \
                                                            \
            dst[HD(idx)] = OP(value, s1 & MASK);            \
        }                                                   \
    }                                                       \
    env->vstart = 0;                                        \
    /* set tail elements to 1s */                           \
    vext_set_elems_1s(vd, vta, vl * esz, elems * esz);      \
}
1331
1332 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1333 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1334 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1335 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1336
1337 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1338 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1339 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1340 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1341
1342 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1343 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1344 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1345 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1346
/* Vector Narrowing Integer Right Shift Instructions */
/*
 * The shifted source (TS2/HS2) is the double-width element, the destination
 * the narrow one.  MASK keeps the low 4/5/6 bits of the shift amount, i.e.
 * the shift range of the wide source element.  The vnsra variants read the
 * source as a signed type.
 */
GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t,  uint16_t, H1, H2, DO_SRL, 0xf)
GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t,  int16_t, H1, H2, DO_SRL, 0xf)
GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1360
/* Vector Integer Comparison Instructions */
/*
 * Element comparison predicates; each evaluates to 1 or 0.  The arguments
 * are parenthesized so that an expression argument is compared as a whole.
 * Signed versus unsigned comparison follows from the operand types at the
 * point of use.
 */
#define DO_MSEQ(N, M) ((N) == (M))
#define DO_MSNE(N, M) ((N) != (M))
#define DO_MSLT(N, M) ((N) < (M))
#define DO_MSLE(N, M) ((N) <= (M))
#define DO_MSGT(N, M) ((N) > (M))
1367
/*
 * Generate a vector-vector compare helper writing a mask to vd.
 * For every element from env->vstart up to env->vl that is not masked off
 * (vext_vm(desc) non-zero, or v0 bit set), mask bit i of vd is set to
 * DO_OP(vs2 element, vs1 element); masked-off bits are left as they are.
 * After the loop env->vstart is reset; if vext_vta_all_1s(desc) is set,
 * the remaining mask bits up to cfg.vlen are written as 1.
 */
#define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP)                \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    const uint32_t unmasked = vext_vm(desc);                  \
    const uint32_t vl = env->vl;                              \
    const uint32_t mask_bits = env_archcpu(env)->cfg.vlen;    \
    const uint32_t fill_tail = vext_vta_all_1s(desc);         \
    ETYPE *src1 = vs1;                                        \
    ETYPE *src2 = vs2;                                        \
    uint32_t idx = env->vstart;                               \
                                                              \
    while (idx < vl) {                                        \
        if (unmasked || vext_elem_mask(v0, idx)) {            \
            ETYPE rhs = src1[H(idx)];                         \
            ETYPE lhs = src2[H(idx)];                         \
                                                              \
            vext_set_elem_mask(vd, idx, DO_OP(lhs, rhs));     \
        }                                                     \
        idx++;                                                \
    }                                                         \
    env->vstart = 0;                                          \
    /* mask destination register are always tail-agnostic */  \
    /* set tail elements to 1s */                             \
    while (fill_tail && idx < mask_bits) {                    \
        vext_set_elem_mask(vd, idx, 1);                       \
        idx++;                                                \
    }                                                         \
}
1395
1396 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t,  H1, DO_MSEQ)
1397 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1398 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1399 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1400
1401 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t,  H1, DO_MSNE)
1402 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1403 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1404 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1405
1406 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t,  H1, DO_MSLT)
1407 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1408 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1409 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1410
1411 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t,  H1, DO_MSLT)
1412 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1413 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1414 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1415
1416 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t,  H1, DO_MSLE)
1417 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1418 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1419 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1420
1421 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t,  H1, DO_MSLE)
1422 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1423 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1424 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1425
1426 #define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP)                      \
1427 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,   \
1428                   CPURISCVState *env, uint32_t desc)                \
1429 {                                                                   \
1430     uint32_t vm = vext_vm(desc);                                    \
1431     uint32_t vl = env->vl;                                          \
1432     uint32_t total_elems = env_archcpu(env)->cfg.vlen;              \
1433     uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
1434     uint32_t i;                                                     \
1435                                                                     \
1436     for (i = env->vstart; i < vl; i++) {                            \
1437         ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
1438         if (!vm && !vext_elem_mask(v0, i)) {                        \
1439             continue;                                               \
1440         }                                                           \
1441         vext_set_elem_mask(vd, i,                                   \
1442                 DO_OP(s2, (ETYPE)(target_long)s1));                 \
1443     }                                                               \
1444     env->vstart = 0;                                                \
1445     /* mask destination register are always tail-agnostic */        \
1446     /* set tail elements to 1s */                                   \
1447     if (vta_all_1s) {                                               \
1448         for (; i < total_elems; i++) {                              \
1449             vext_set_elem_mask(vd, i, 1);                           \
1450         }                                                           \
1451     }                                                               \
1452 }
1453
1454 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t,  H1, DO_MSEQ)
1455 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1456 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1457 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1458
1459 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t,  H1, DO_MSNE)
1460 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1461 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1462 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1463
1464 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t,  H1, DO_MSLT)
1465 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1466 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1467 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1468
1469 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t,  H1, DO_MSLT)
1470 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1471 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1472 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1473
1474 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t,  H1, DO_MSLE)
1475 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1476 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1477 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1478
1479 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t,  H1, DO_MSLE)
1480 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1481 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1482 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1483
1484 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t,  H1, DO_MSGT)
1485 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1486 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1487 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1488
1489 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t,  H1, DO_MSGT)
1490 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1491 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1492 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1493
1494 /* Vector Integer Min/Max Instructions */
1495 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1496 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1497 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1498 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1499 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1500 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1501 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1502 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1503 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1504 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1505 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1506 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1507 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1508 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1509 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1510 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1511 GEN_VEXT_VV(vminu_vv_b, 1)
1512 GEN_VEXT_VV(vminu_vv_h, 2)
1513 GEN_VEXT_VV(vminu_vv_w, 4)
1514 GEN_VEXT_VV(vminu_vv_d, 8)
1515 GEN_VEXT_VV(vmin_vv_b, 1)
1516 GEN_VEXT_VV(vmin_vv_h, 2)
1517 GEN_VEXT_VV(vmin_vv_w, 4)
1518 GEN_VEXT_VV(vmin_vv_d, 8)
1519 GEN_VEXT_VV(vmaxu_vv_b, 1)
1520 GEN_VEXT_VV(vmaxu_vv_h, 2)
1521 GEN_VEXT_VV(vmaxu_vv_w, 4)
1522 GEN_VEXT_VV(vmaxu_vv_d, 8)
1523 GEN_VEXT_VV(vmax_vv_b, 1)
1524 GEN_VEXT_VV(vmax_vv_h, 2)
1525 GEN_VEXT_VV(vmax_vv_w, 4)
1526 GEN_VEXT_VV(vmax_vv_d, 8)
1527
1528 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1529 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1530 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1531 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1532 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1533 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1534 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1535 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1536 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1537 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1538 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1539 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1540 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1541 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1542 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1543 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1544 GEN_VEXT_VX(vminu_vx_b, 1)
1545 GEN_VEXT_VX(vminu_vx_h, 2)
1546 GEN_VEXT_VX(vminu_vx_w, 4)
1547 GEN_VEXT_VX(vminu_vx_d, 8)
1548 GEN_VEXT_VX(vmin_vx_b, 1)
1549 GEN_VEXT_VX(vmin_vx_h, 2)
1550 GEN_VEXT_VX(vmin_vx_w, 4)
1551 GEN_VEXT_VX(vmin_vx_d, 8)
1552 GEN_VEXT_VX(vmaxu_vx_b, 1)
1553 GEN_VEXT_VX(vmaxu_vx_h, 2)
1554 GEN_VEXT_VX(vmaxu_vx_w, 4)
1555 GEN_VEXT_VX(vmaxu_vx_d, 8)
1556 GEN_VEXT_VX(vmax_vx_b, 1)
1557 GEN_VEXT_VX(vmax_vx_h, 2)
1558 GEN_VEXT_VX(vmax_vx_w, 4)
1559 GEN_VEXT_VX(vmax_vx_d, 8)
1560
/* Vector Single-Width Integer Multiply Instructions */

/*
 * Product of N and M in their (promoted) common type.  Arguments are
 * parenthesized so that compound expressions multiply as a unit.
 */
#define DO_MUL(N, M) ((N) * (M))
1563 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1564 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1565 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1566 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1567 GEN_VEXT_VV(vmul_vv_b, 1)
1568 GEN_VEXT_VV(vmul_vv_h, 2)
1569 GEN_VEXT_VV(vmul_vv_w, 4)
1570 GEN_VEXT_VV(vmul_vv_d, 8)
1571
/* Upper 8 bits of the signed 8x8 -> 16-bit product of s2 and s1. */
static int8_t do_mulh_b(int8_t s2, int8_t s1)
{
    /* the full product always fits in 16 bits */
    int16_t product = (int16_t)s2 * (int16_t)s1;

    return product >> 8;
}
1576
/* Upper 16 bits of the signed 16x16 -> 32-bit product of s2 and s1. */
static int16_t do_mulh_h(int16_t s2, int16_t s1)
{
    int32_t product = (int32_t)s2 * (int32_t)s1;

    return product >> 16;
}
1581
/* Upper 32 bits of the signed 32x32 -> 64-bit product of s2 and s1. */
static int32_t do_mulh_w(int32_t s2, int32_t s1)
{
    int64_t product = (int64_t)s2 * (int64_t)s1;

    return product >> 32;
}
1586
/*
 * Upper 64 bits of the signed 64x64 product of s2 and s1; muls64() delivers
 * the result as separate low and high 64-bit halves.
 */
static int64_t do_mulh_d(int64_t s2, int64_t s1)
{
    uint64_t low, high;

    muls64(&low, &high, s1, s2);
    return (int64_t)high;
}
1594
/* Upper 8 bits of the unsigned 8x8 -> 16-bit product of s2 and s1. */
static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
{
    /* 255 * 255 still fits in 16 bits */
    uint16_t product = (uint16_t)s2 * (uint16_t)s1;

    return product >> 8;
}
1599
/* Upper 16 bits of the unsigned 16x16 -> 32-bit product of s2 and s1. */
static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
{
    uint32_t product = (uint32_t)s2 * (uint32_t)s1;

    return product >> 16;
}
1604
/* Upper 32 bits of the unsigned 32x32 -> 64-bit product of s2 and s1. */
static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
{
    uint64_t product = (uint64_t)s2 * (uint64_t)s1;

    return product >> 32;
}
1609
/*
 * Upper 64 bits of the unsigned 64x64 product of s2 and s1; mulu64()
 * delivers the result as separate low and high 64-bit halves.
 */
static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
{
    uint64_t low, high;

    mulu64(&low, &high, s2, s1);
    return high;
}
1617
/* Upper 8 bits of the 16-bit product of signed s2 and unsigned s1. */
static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
{
    /* range is [-128 * 255, 127 * 255], which fits in 16 signed bits */
    int16_t product = (int16_t)s2 * (uint16_t)s1;

    return product >> 8;
}
1622
/* Upper 16 bits of the 32-bit product of signed s2 and unsigned s1. */
static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
{
    /*
     * The multiplication is carried out modulo 2 ** 32 in unsigned
     * arithmetic; bits 16..31 are then returned as a signed 16-bit value.
     */
    uint32_t product = (int32_t)s2 * (uint32_t)s1;

    return product >> 16;
}
1627
/* Upper 32 bits of the 64-bit product of signed s2 and unsigned s1. */
static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
{
    /*
     * The multiplication is carried out modulo 2 ** 64 in unsigned
     * arithmetic; bits 32..63 are then returned as a signed 32-bit value.
     */
    uint64_t product = (int64_t)s2 * (uint64_t)s1;

    return product >> 32;
}
1632
/*
 * Let  A  = signed operand,
 *      B  = unsigned operand,
 *      U  = A reinterpreted as an unsigned 64-bit value,
 *      P  = mulu64(U, B), unsigned product,
 *      SP = A * B, signed product.
 *
 * IF A < 0
 *      U  = 2 ** 64 + A
 *      P  = (2 ** 64 + A) * B
 *         = A * B + 2 ** 64 * B
 *      SP = P - 2 ** 64 * B
 * ELSE
 *      SP = P
 * THEN
 *      HI_P -= (A < 0 ? B : 0)
 */
1651
/*
 * Upper 64 bits of the product of signed s2 and unsigned s1.
 *
 * The product is first computed with mulu64(), which treats s2 as unsigned;
 * when s2 is negative the high half is then corrected by subtracting s1.
 */
static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
{
    uint64_t low, high;

    mulu64(&low, &high, s2, s1);

    if (s2 < 0) {
        high -= s1;
    }
    return high;
}
1661
1662 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1663 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1664 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1665 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1666 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1667 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1668 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1669 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1670 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1671 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1672 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1673 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1674 GEN_VEXT_VV(vmulh_vv_b, 1)
1675 GEN_VEXT_VV(vmulh_vv_h, 2)
1676 GEN_VEXT_VV(vmulh_vv_w, 4)
1677 GEN_VEXT_VV(vmulh_vv_d, 8)
1678 GEN_VEXT_VV(vmulhu_vv_b, 1)
1679 GEN_VEXT_VV(vmulhu_vv_h, 2)
1680 GEN_VEXT_VV(vmulhu_vv_w, 4)
1681 GEN_VEXT_VV(vmulhu_vv_d, 8)
1682 GEN_VEXT_VV(vmulhsu_vv_b, 1)
1683 GEN_VEXT_VV(vmulhsu_vv_h, 2)
1684 GEN_VEXT_VV(vmulhsu_vv_w, 4)
1685 GEN_VEXT_VV(vmulhsu_vv_d, 8)
1686
1687 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1688 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1689 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1690 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1691 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1692 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1693 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1694 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1695 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1696 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1697 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1698 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1699 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1700 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1701 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1702 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1703 GEN_VEXT_VX(vmul_vx_b, 1)
1704 GEN_VEXT_VX(vmul_vx_h, 2)
1705 GEN_VEXT_VX(vmul_vx_w, 4)
1706 GEN_VEXT_VX(vmul_vx_d, 8)
1707 GEN_VEXT_VX(vmulh_vx_b, 1)
1708 GEN_VEXT_VX(vmulh_vx_h, 2)
1709 GEN_VEXT_VX(vmulh_vx_w, 4)
1710 GEN_VEXT_VX(vmulh_vx_d, 8)
1711 GEN_VEXT_VX(vmulhu_vx_b, 1)
1712 GEN_VEXT_VX(vmulhu_vx_h, 2)
1713 GEN_VEXT_VX(vmulhu_vx_w, 4)
1714 GEN_VEXT_VX(vmulhu_vx_d, 8)
1715 GEN_VEXT_VX(vmulhsu_vx_b, 1)
1716 GEN_VEXT_VX(vmulhsu_vx_h, 2)
1717 GEN_VEXT_VX(vmulhsu_vx_w, 4)
1718 GEN_VEXT_VX(vmulhsu_vx_d, 8)
1719
/*
 * Vector Integer Divide Instructions
 *
 * Special cases handled by these macros:
 *   - divisor M == 0: the quotient is all ones (-1 converted to the type
 *     of N) and the remainder is the dividend N;
 *   - signed forms with N == -N and M == -1 (this matches the most negative
 *     value of a type that is not promoted, and also N == 0): the quotient
 *     is N and the remainder is 0, so the division itself is never
 *     evaluated for that operand pair.
 *
 * Arguments and the whole expansion are parenthesized so that compound
 * expressions can be passed safely.  N and M are evaluated more than once;
 * do not pass expressions with side effects.
 */
#define DO_DIVU(N, M) (unlikely((M) == 0) ? (__typeof(N))(-1) : (N) / (M))
#define DO_REMU(N, M) (unlikely((M) == 0) ? (N) : (N) % (M))
#define DO_DIV(N, M)  (unlikely((M) == 0) ? (__typeof(N))(-1) :           \
        unlikely(((N) == -(N)) && ((M) == (__typeof(N))(-1))) ?           \
        (N) : (N) / (M))
#define DO_REM(N, M)  (unlikely((M) == 0) ? (N) :                         \
        unlikely(((N) == -(N)) && ((M) == (__typeof(N))(-1))) ?           \
        0 : (N) % (M))
1727
1728 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1729 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1730 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1731 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1732 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1733 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1734 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1735 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1736 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1737 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1738 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1739 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1740 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1741 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1742 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1743 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1744 GEN_VEXT_VV(vdivu_vv_b, 1)
1745 GEN_VEXT_VV(vdivu_vv_h, 2)
1746 GEN_VEXT_VV(vdivu_vv_w, 4)
1747 GEN_VEXT_VV(vdivu_vv_d, 8)
1748 GEN_VEXT_VV(vdiv_vv_b, 1)
1749 GEN_VEXT_VV(vdiv_vv_h, 2)
1750 GEN_VEXT_VV(vdiv_vv_w, 4)
1751 GEN_VEXT_VV(vdiv_vv_d, 8)
1752 GEN_VEXT_VV(vremu_vv_b, 1)
1753 GEN_VEXT_VV(vremu_vv_h, 2)
1754 GEN_VEXT_VV(vremu_vv_w, 4)
1755 GEN_VEXT_VV(vremu_vv_d, 8)
1756 GEN_VEXT_VV(vrem_vv_b, 1)
1757 GEN_VEXT_VV(vrem_vv_h, 2)
1758 GEN_VEXT_VV(vrem_vv_w, 4)
1759 GEN_VEXT_VV(vrem_vv_d, 8)
1760
1761 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1762 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1763 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1764 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1765 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1766 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1767 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1768 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1769 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1770 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1771 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1772 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1773 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1774 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1775 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1776 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1777 GEN_VEXT_VX(vdivu_vx_b, 1)
1778 GEN_VEXT_VX(vdivu_vx_h, 2)
1779 GEN_VEXT_VX(vdivu_vx_w, 4)
1780 GEN_VEXT_VX(vdivu_vx_d, 8)
1781 GEN_VEXT_VX(vdiv_vx_b, 1)
1782 GEN_VEXT_VX(vdiv_vx_h, 2)
1783 GEN_VEXT_VX(vdiv_vx_w, 4)
1784 GEN_VEXT_VX(vdiv_vx_d, 8)
1785 GEN_VEXT_VX(vremu_vx_b, 1)
1786 GEN_VEXT_VX(vremu_vx_h, 2)
1787 GEN_VEXT_VX(vremu_vx_w, 4)
1788 GEN_VEXT_VX(vremu_vx_d, 8)
1789 GEN_VEXT_VX(vrem_vx_b, 1)
1790 GEN_VEXT_VX(vrem_vx_h, 2)
1791 GEN_VEXT_VX(vrem_vx_w, 4)
1792 GEN_VEXT_VX(vrem_vx_d, 8)
1793
1794 /* Vector Widening Integer Multiply Instructions */
1795 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1796 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1797 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1798 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1799 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1800 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1801 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1802 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1803 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1804 GEN_VEXT_VV(vwmul_vv_b, 2)
1805 GEN_VEXT_VV(vwmul_vv_h, 4)
1806 GEN_VEXT_VV(vwmul_vv_w, 8)
1807 GEN_VEXT_VV(vwmulu_vv_b, 2)
1808 GEN_VEXT_VV(vwmulu_vv_h, 4)
1809 GEN_VEXT_VV(vwmulu_vv_w, 8)
1810 GEN_VEXT_VV(vwmulsu_vv_b, 2)
1811 GEN_VEXT_VV(vwmulsu_vv_h, 4)
1812 GEN_VEXT_VV(vwmulsu_vv_w, 8)
1813
1814 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1815 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1816 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1817 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1818 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1819 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1820 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1821 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1822 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1823 GEN_VEXT_VX(vwmul_vx_b, 2)
1824 GEN_VEXT_VX(vwmul_vx_h, 4)
1825 GEN_VEXT_VX(vwmul_vx_w, 8)
1826 GEN_VEXT_VX(vwmulu_vx_b, 2)
1827 GEN_VEXT_VX(vwmulu_vx_h, 4)
1828 GEN_VEXT_VX(vwmulu_vx_w, 8)
1829 GEN_VEXT_VX(vwmulsu_vx_b, 2)
1830 GEN_VEXT_VX(vwmulsu_vx_h, 4)
1831 GEN_VEXT_VX(vwmulsu_vx_w, 8)
1832
/* Vector Single-Width Integer Multiply-Add Instructions */

/*
 * OPIVV3 - define do_<NAME>(), the per-element worker of a three-operand
 * vector-vector operation.
 *
 * Element i of vs1 (read as T1, held as TX1), element i of vs2 (read as T2,
 * held as TX2) and the current destination element (TD) are combined as
 * OP(s2, s1, d); the result overwrites the destination element.  HD, HS1
 * and HS2 map the logical index i to the storage index of each operand.
 */
#define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
{                                                                  \
    TD *dest = (TD *)vd + HD(i);                                   \
    TX1 s1 = ((T1 *)vs1)[HS1(i)];                                  \
    TX2 s2 = ((T2 *)vs2)[HS2(i)];                                  \
    TD acc = *dest;                                                \
                                                                   \
    *dest = OP(s2, s1, acc);                                       \
}
1842
/*
 * Multiply-add primitives.  In the OPIVV3/OPIVX3 workers N is the vs2
 * element, M is the vs1 element (or scalar) and D is the destination
 * element:
 *   DO_MACC   =   M * N  + D
 *   DO_NMSAC  = -(M * N) + D
 *   DO_MADD   =   M * D  + N
 *   DO_NMSUB  = -(M * D) + N
 * Arguments and the whole expansion are parenthesized so that compound
 * expressions are combined as units.
 */
#define DO_MACC(N, M, D) ((M) * (N) + (D))
#define DO_NMSAC(N, M, D) (-((M) * (N)) + (D))
#define DO_MADD(N, M, D) ((M) * (D) + (N))
#define DO_NMSUB(N, M, D) (-((M) * (D)) + (N))
1847 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1848 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1849 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1850 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1851 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1852 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1853 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1854 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1855 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1856 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1857 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1858 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1859 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1860 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1861 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1862 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1863 GEN_VEXT_VV(vmacc_vv_b, 1)
1864 GEN_VEXT_VV(vmacc_vv_h, 2)
1865 GEN_VEXT_VV(vmacc_vv_w, 4)
1866 GEN_VEXT_VV(vmacc_vv_d, 8)
1867 GEN_VEXT_VV(vnmsac_vv_b, 1)
1868 GEN_VEXT_VV(vnmsac_vv_h, 2)
1869 GEN_VEXT_VV(vnmsac_vv_w, 4)
1870 GEN_VEXT_VV(vnmsac_vv_d, 8)
1871 GEN_VEXT_VV(vmadd_vv_b, 1)
1872 GEN_VEXT_VV(vmadd_vv_h, 2)
1873 GEN_VEXT_VV(vmadd_vv_w, 4)
1874 GEN_VEXT_VV(vmadd_vv_d, 8)
1875 GEN_VEXT_VV(vnmsub_vv_b, 1)
1876 GEN_VEXT_VV(vnmsub_vv_h, 2)
1877 GEN_VEXT_VV(vnmsub_vv_w, 4)
1878 GEN_VEXT_VV(vnmsub_vv_d, 8)
1879
/*
 * OPIVX3 - define do_<NAME>(), the per-element worker of a three-operand
 * vector-scalar operation.
 *
 * The scalar s1 is narrowed to T1 and then converted to TX1; together with
 * element i of vs2 (read as T2, held as TX2) and the current destination
 * element (TD) it is combined as OP(s2, scalar, d), and the result
 * overwrites the destination element.  HD and HS2 map the logical index i
 * to the storage index of each vector operand.
 */
#define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
{                                                                   \
    TD *dest = (TD *)vd + HD(i);                                    \
    TX2 s2 = ((T2 *)vs2)[HS2(i)];                                   \
    TD acc = *dest;                                                 \
                                                                    \
    *dest = OP(s2, (TX1)(T1)s1, acc);                               \
}
1887
1888 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1889 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1890 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1891 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1892 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1893 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1894 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1895 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1896 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1897 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1898 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1899 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1900 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1901 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1902 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
1903 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
1904 GEN_VEXT_VX(vmacc_vx_b, 1)
1905 GEN_VEXT_VX(vmacc_vx_h, 2)
1906 GEN_VEXT_VX(vmacc_vx_w, 4)
1907 GEN_VEXT_VX(vmacc_vx_d, 8)
1908 GEN_VEXT_VX(vnmsac_vx_b, 1)
1909 GEN_VEXT_VX(vnmsac_vx_h, 2)
1910 GEN_VEXT_VX(vnmsac_vx_w, 4)
1911 GEN_VEXT_VX(vnmsac_vx_d, 8)
1912 GEN_VEXT_VX(vmadd_vx_b, 1)
1913 GEN_VEXT_VX(vmadd_vx_h, 2)
1914 GEN_VEXT_VX(vmadd_vx_w, 4)
1915 GEN_VEXT_VX(vmadd_vx_d, 8)
1916 GEN_VEXT_VX(vnmsub_vx_b, 1)
1917 GEN_VEXT_VX(vnmsub_vx_h, 2)
1918 GEN_VEXT_VX(vnmsub_vx_w, 4)
1919 GEN_VEXT_VX(vnmsub_vx_d, 8)
1920
1921 /* Vector Widening Integer Multiply-Add Instructions */
1922 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
1923 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
1924 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
1925 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
1926 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
1927 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
1928 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
1929 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
1930 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
1931 GEN_VEXT_VV(vwmaccu_vv_b, 2)
1932 GEN_VEXT_VV(vwmaccu_vv_h, 4)
1933 GEN_VEXT_VV(vwmaccu_vv_w, 8)
1934 GEN_VEXT_VV(vwmacc_vv_b, 2)
1935 GEN_VEXT_VV(vwmacc_vv_h, 4)
1936 GEN_VEXT_VV(vwmacc_vv_w, 8)
1937 GEN_VEXT_VV(vwmaccsu_vv_b, 2)
1938 GEN_VEXT_VV(vwmaccsu_vv_h, 4)
1939 GEN_VEXT_VV(vwmaccsu_vv_w, 8)
1940
1941 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
1942 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
1943 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
1944 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
1945 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
1946 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
1947 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
1948 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
1949 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
1950 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
1951 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
1952 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
1953 GEN_VEXT_VX(vwmaccu_vx_b, 2)
1954 GEN_VEXT_VX(vwmaccu_vx_h, 4)
1955 GEN_VEXT_VX(vwmaccu_vx_w, 8)
1956 GEN_VEXT_VX(vwmacc_vx_b, 2)
1957 GEN_VEXT_VX(vwmacc_vx_h, 4)
1958 GEN_VEXT_VX(vwmacc_vx_w, 8)
1959 GEN_VEXT_VX(vwmaccsu_vx_b, 2)
1960 GEN_VEXT_VX(vwmaccsu_vx_h, 4)
1961 GEN_VEXT_VX(vwmaccsu_vx_w, 8)
1962 GEN_VEXT_VX(vwmaccus_vx_b, 2)
1963 GEN_VEXT_VX(vwmaccus_vx_h, 4)
1964 GEN_VEXT_VX(vwmaccus_vx_w, 8)
1965
1966 /* Vector Integer Merge and Move Instructions */
1967 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H)                              \
1968 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env,           \
1969                   uint32_t desc)                                     \
1970 {                                                                    \
1971     uint32_t vl = env->vl;                                           \
1972     uint32_t esz = sizeof(ETYPE);                                    \
1973     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
1974     uint32_t vta = vext_vta(desc);                                   \
1975     uint32_t i;                                                      \
1976                                                                      \
1977     for (i = env->vstart; i < vl; i++) {                             \
1978         ETYPE s1 = *((ETYPE *)vs1 + H(i));                           \
1979         *((ETYPE *)vd + H(i)) = s1;                                  \
1980     }                                                                \
1981     env->vstart = 0;                                                 \
1982     /* set tail elements to 1s */                                    \
1983     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
1984 }
1985
1986 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t,  H1)
1987 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
1988 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
1989 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
1990
1991 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H)                              \
1992 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env,         \
1993                   uint32_t desc)                                     \
1994 {                                                                    \
1995     uint32_t vl = env->vl;                                           \
1996     uint32_t esz = sizeof(ETYPE);                                    \
1997     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
1998     uint32_t vta = vext_vta(desc);                                   \
1999     uint32_t i;                                                      \
2000                                                                      \
2001     for (i = env->vstart; i < vl; i++) {                             \
2002         *((ETYPE *)vd + H(i)) = (ETYPE)s1;                           \
2003     }                                                                \
2004     env->vstart = 0;                                                 \
2005     /* set tail elements to 1s */                                    \
2006     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2007 }
2008
2009 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t,  H1)
2010 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
2011 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
2012 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
2013
2014 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H)                           \
2015 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,          \
2016                   CPURISCVState *env, uint32_t desc)                 \
2017 {                                                                    \
2018     uint32_t vl = env->vl;                                           \
2019     uint32_t esz = sizeof(ETYPE);                                    \
2020     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2021     uint32_t vta = vext_vta(desc);                                   \
2022     uint32_t i;                                                      \
2023                                                                      \
2024     for (i = env->vstart; i < vl; i++) {                             \
2025         ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1);            \
2026         *((ETYPE *)vd + H(i)) = *(vt + H(i));                        \
2027     }                                                                \
2028     env->vstart = 0;                                                 \
2029     /* set tail elements to 1s */                                    \
2030     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2031 }
2032
2033 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t,  H1)
2034 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
2035 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
2036 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
2037
2038 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H)                           \
2039 void HELPER(NAME)(void *vd, void *v0, target_ulong s1,               \
2040                   void *vs2, CPURISCVState *env, uint32_t desc)      \
2041 {                                                                    \
2042     uint32_t vl = env->vl;                                           \
2043     uint32_t esz = sizeof(ETYPE);                                    \
2044     uint32_t total_elems = vext_get_total_elems(env, desc, esz);     \
2045     uint32_t vta = vext_vta(desc);                                   \
2046     uint32_t i;                                                      \
2047                                                                      \
2048     for (i = env->vstart; i < vl; i++) {                             \
2049         ETYPE s2 = *((ETYPE *)vs2 + H(i));                           \
2050         ETYPE d = (!vext_elem_mask(v0, i) ? s2 :                     \
2051                    (ETYPE)(target_long)s1);                          \
2052         *((ETYPE *)vd + H(i)) = d;                                   \
2053     }                                                                \
2054     env->vstart = 0;                                                 \
2055     /* set tail elements to 1s */                                    \
2056     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);         \
2057 }
2058
2059 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t,  H1)
2060 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
2061 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
2062 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
2063
2064 /*
2065  *** Vector Fixed-Point Arithmetic Instructions
2066  */
2067
2068 /* Vector Single-Width Saturating Add and Subtract */
2069
/*
 * Fixed-point instructions generally take a rounding mode and may saturate,
 * so the macros common to all of them are defined here.
 */
2074 typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
2075                           CPURISCVState *env, int vxrm);
2076
/*
 * OPIVV2_RM - define do_<NAME>(), an opivv2_rm_fn-shaped per-element worker.
 *
 * Element i of vs1 (read as T1, held as TX1) and element i of vs2 (read as
 * T2, held as TX2) are combined as OP(env, vxrm, s2, s1) and the result is
 * stored to element i of vd as TD.  HD, HS1 and HS2 map the logical index i
 * to the storage index of each operand.
 */
#define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
static inline void                                                  \
do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
          CPURISCVState *env, int vxrm)                             \
{                                                                   \
    TX1 s1 = ((T1 *)vs1)[HS1(i)];                                   \
    TX2 s2 = ((T2 *)vs2)[HS2(i)];                                   \
    TD *dest = (TD *)vd + HD(i);                                    \
                                                                    \
    *dest = OP(env, vxrm, s2, s1);                                  \
}
2086
2087 static inline void
2088 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
2089              CPURISCVState *env,
2090              uint32_t vl, uint32_t vm, int vxrm,
2091              opivv2_rm_fn *fn)
2092 {
2093     for (uint32_t i = env->vstart; i < vl; i++) {
2094         if (!vm && !vext_elem_mask(v0, i)) {
2095             continue;
2096         }
2097         fn(vd, vs1, vs2, i, env, vxrm);
2098     }
2099     env->vstart = 0;
2100 }
2101
2102 static inline void
2103 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
2104              CPURISCVState *env,
2105              uint32_t desc,
2106              opivv2_rm_fn *fn, uint32_t esz)
2107 {
2108     uint32_t vm = vext_vm(desc);
2109     uint32_t vl = env->vl;
2110     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2111     uint32_t vta = vext_vta(desc);
2112
2113     switch (env->vxrm) {
2114     case 0: /* rnu */
2115         vext_vv_rm_1(vd, v0, vs1, vs2,
2116                      env, vl, vm, 0, fn);
2117         break;
2118     case 1: /* rne */
2119         vext_vv_rm_1(vd, v0, vs1, vs2,
2120                      env, vl, vm, 1, fn);
2121         break;
2122     case 2: /* rdn */
2123         vext_vv_rm_1(vd, v0, vs1, vs2,
2124                      env, vl, vm, 2, fn);
2125         break;
2126     default: /* rod */
2127         vext_vv_rm_1(vd, v0, vs1, vs2,
2128                      env, vl, vm, 3, fn);
2129         break;
2130     }
2131     /* set tail elements to 1s */
2132     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2133 }
2134
/*
 * GEN_VEXT_VV_RM - generate the helper entry point of a fixed-point
 * instruction in OPIVV format.
 *
 * HELPER(NAME) forwards to vext_vv_rm_2() with do_<NAME> as the per-element
 * worker and ESZ as the element size in bytes.
 */
#define GEN_VEXT_VV_RM(NAME, ESZ)                               \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,     \
                  CPURISCVState *env, uint32_t desc)            \
{                                                               \
    opivv2_rm_fn *elem_fn = do_##NAME;                          \
                                                                \
    vext_vv_rm_2(vd, v0, vs1, vs2, env, desc, elem_fn, ESZ);    \
}
2143
2144 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2145 {
2146     uint8_t res = a + b;
2147     if (res < a) {
2148         res = UINT8_MAX;
2149         env->vxsat = 0x1;
2150     }
2151     return res;
2152 }
2153
2154 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2155                                uint16_t b)
2156 {
2157     uint16_t res = a + b;
2158     if (res < a) {
2159         res = UINT16_MAX;
2160         env->vxsat = 0x1;
2161     }
2162     return res;
2163 }
2164
2165 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2166                                uint32_t b)
2167 {
2168     uint32_t res = a + b;
2169     if (res < a) {
2170         res = UINT32_MAX;
2171         env->vxsat = 0x1;
2172     }
2173     return res;
2174 }
2175
2176 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2177                                uint64_t b)
2178 {
2179     uint64_t res = a + b;
2180     if (res < a) {
2181         res = UINT64_MAX;
2182         env->vxsat = 0x1;
2183     }
2184     return res;
2185 }
2186
/*
 * vsaddu_vv_{b,h,w,d}: each RVVCALL line is expected to expand OPIVV2_RM,
 * defining do_<name> around the matching saddu* routine; GEN_VEXT_VV_RM then
 * emits the helper entry point for element sizes of 1/2/4/8 bytes.
 */
RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2195
/*
 * Per-element worker for a vector-scalar fixed-point operation: combines the
 * scalar s1 with element i of vs2, writing the result into vd, with the
 * rounding mode passed explicitly in vxrm.
 */
typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
                          CPURISCVState *env, int vxrm);
2198
/*
 * OPIVX2_RM - define do_<NAME>(), the per-element worker of a
 * vector-scalar fixed-point operation.
 *
 * Element i of vs2 (stored as T2) is loaded and converted to TX2, the scalar
 * s1 is narrowed to T1 and then converted to TX1, and
 * OP(env, vxrm, <vs2 elem>, <scalar>) is stored into vd as TD.  HD/HS2 map
 * the element index to the index actually used for each register.
 */
#define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
static inline void                                                  \
do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
          CPURISCVState *env, int vxrm)                             \
{                                                                   \
    T2 *src2 = vs2;                                                 \
    TD *dst = vd;                                                   \
    TX2 op2 = src2[HS2(i)];                                         \
                                                                    \
    dst[HD(i)] = OP(env, vxrm, op2, (TX1)(T1)s1);                   \
}
2207
2208 static inline void
2209 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2210              CPURISCVState *env,
2211              uint32_t vl, uint32_t vm, int vxrm,
2212              opivx2_rm_fn *fn)
2213 {
2214     for (uint32_t i = env->vstart; i < vl; i++) {
2215         if (!vm && !vext_elem_mask(v0, i)) {
2216             continue;
2217         }
2218         fn(vd, s1, vs2, i, env, vxrm);
2219     }
2220     env->vstart = 0;
2221 }
2222
2223 static inline void
2224 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2225              CPURISCVState *env,
2226              uint32_t desc,
2227              opivx2_rm_fn *fn, uint32_t esz)
2228 {
2229     uint32_t vm = vext_vm(desc);
2230     uint32_t vl = env->vl;
2231     uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2232     uint32_t vta = vext_vta(desc);
2233
2234     switch (env->vxrm) {
2235     case 0: /* rnu */
2236         vext_vx_rm_1(vd, v0, s1, vs2,
2237                      env, vl, vm, 0, fn);
2238         break;
2239     case 1: /* rne */
2240         vext_vx_rm_1(vd, v0, s1, vs2,
2241                      env, vl, vm, 1, fn);
2242         break;
2243     case 2: /* rdn */
2244         vext_vx_rm_1(vd, v0, s1, vs2,
2245                      env, vl, vm, 2, fn);
2246         break;
2247     default: /* rod */
2248         vext_vx_rm_1(vd, v0, s1, vs2,
2249                      env, vl, vm, 3, fn);
2250         break;
2251     }
2252     /* set tail elements to 1s */
2253     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2254 }
2255
/*
 * GEN_VEXT_VX_RM - generate the helper entry point of a fixed-point
 * instruction in OPIVX format.
 *
 * HELPER(NAME) forwards to vext_vx_rm_2() with do_<NAME> as the per-element
 * worker and ESZ as the element size in bytes.  The scalar arrives as
 * target_ulong and is handed on to a target_long parameter.
 */
#define GEN_VEXT_VX_RM(NAME, ESZ)                         \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1,    \
        void *vs2, CPURISCVState *env, uint32_t desc)     \
{                                                         \
    opivx2_rm_fn *elem_fn = do_##NAME;                    \
                                                          \
    vext_vx_rm_2(vd, v0, s1, vs2, env, desc, elem_fn, ESZ); \
}
2264
/*
 * vsaddu_vx_{b,h,w,d}: each RVVCALL line is expected to expand OPIVX2_RM,
 * defining do_<name> around the matching saddu* routine; GEN_VEXT_VX_RM then
 * emits the helper entry point for element sizes of 1/2/4/8 bytes.
 */
RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2273
2274 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2275 {
2276     int8_t res = a + b;
2277     if ((res ^ a) & (res ^ b) & INT8_MIN) {
2278         res = a > 0 ? INT8_MAX : INT8_MIN;
2279         env->vxsat = 0x1;
2280     }
2281     return res;
2282 }
2283
2284 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2285 {
2286     int16_t res = a + b;
2287     if ((res ^ a) & (res ^ b) & INT16_MIN) {
2288         res = a > 0 ? INT16_MAX : INT16_MIN;
2289         env->vxsat = 0x1;
2290     }
2291     return res;
2292 }
2293
2294 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2295 {
2296     int32_t res = a + b;
2297     if ((res ^ a) & (res ^ b) & INT32_MIN) {
2298         res = a > 0 ? INT32_MAX : INT32_MIN;
2299         env->vxsat = 0x1;
2300     }
2301     return res;
2302 }
2303
2304 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2305 {
2306     int64_t res = a + b;
2307     if ((res ^ a) & (res ^ b) & INT64_MIN) {
2308         res = a > 0 ? INT64_MAX : INT64_MIN;
2309         env->vxsat = 0x1;
2310     }
2311     return res;
2312 }
2313
/*
 * vsadd_{vv,vx}_{b,h,w,d}: per-element workers built on the sadd* routines
 * plus their helper entry points, for element sizes of 1/2/4/8 bytes.
 */
RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
GEN_VEXT_VV_RM(vsadd_vv_b, 1)
GEN_VEXT_VV_RM(vsadd_vv_h, 2)
GEN_VEXT_VV_RM(vsadd_vv_w, 4)
GEN_VEXT_VV_RM(vsadd_vv_d, 8)

RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
GEN_VEXT_VX_RM(vsadd_vx_b, 1)
GEN_VEXT_VX_RM(vsadd_vx_h, 2)
GEN_VEXT_VX_RM(vsadd_vx_w, 4)
GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2331
2332 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2333 {
2334     uint8_t res = a - b;
2335     if (res > a) {
2336         res = 0;
2337         env->vxsat = 0x1;
2338     }
2339     return res;
2340 }
2341
2342 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2343                                uint16_t b)
2344 {
2345     uint16_t res = a - b;
2346     if (res > a) {
2347         res = 0;
2348         env->vxsat = 0x1;
2349     }
2350     return res;
2351 }
2352
2353 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2354                                uint32_t b)
2355 {
2356     uint32_t res = a - b;
2357     if (res > a) {
2358         res = 0;
2359         env->vxsat = 0x1;
2360     }
2361     return res;
2362 }
2363
2364 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2365                                uint64_t b)
2366 {
2367     uint64_t res = a - b;
2368     if (res > a) {
2369         res = 0;
2370         env->vxsat = 0x1;
2371     }
2372     return res;
2373 }
2374
/*
 * vssubu_{vv,vx}_{b,h,w,d}: per-element workers built on the ssubu* routines
 * plus their helper entry points, for element sizes of 1/2/4/8 bytes.
 */
RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
GEN_VEXT_VV_RM(vssubu_vv_b, 1)
GEN_VEXT_VV_RM(vssubu_vv_h, 2)
GEN_VEXT_VV_RM(vssubu_vv_w, 4)
GEN_VEXT_VV_RM(vssubu_vv_d, 8)

RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
GEN_VEXT_VX_RM(vssubu_vx_b, 1)
GEN_VEXT_VX_RM(vssubu_vx_h, 2)
GEN_VEXT_VX_RM(vssubu_vx_w, 4)
GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2392
2393 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2394 {
2395     int8_t res = a - b;
2396     if ((res ^ a) & (a ^ b) & INT8_MIN) {
2397         res = a >= 0 ? INT8_MAX : INT8_MIN;
2398         env->vxsat = 0x1;
2399     }
2400     return res;
2401 }
2402
2403 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2404 {
2405     int16_t res = a - b;
2406     if ((res ^ a) & (a ^ b) & INT16_MIN) {
2407         res = a >= 0 ? INT16_MAX : INT16_MIN;
2408         env->vxsat = 0x1;
2409     }
2410     return res;
2411 }
2412
2413 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2414 {
2415     int32_t res = a - b;
2416     if ((res ^ a) & (a ^ b) & INT32_MIN) {
2417         res = a >= 0 ? INT32_MAX : INT32_MIN;
2418         env->vxsat = 0x1;
2419     }
2420     return res;
2421 }
2422
2423 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2424 {
2425     int64_t res = a - b;
2426     if ((res ^ a) & (a ^ b) & INT64_MIN) {
2427         res = a >= 0 ? INT64_MAX : INT64_MIN;
2428         env->vxsat = 0x1;
2429     }
2430     return res;
2431 }
2432
/*
 * vssub_{vv,vx}_{b,h,w,d}: per-element workers built on the ssub* routines
 * plus their helper entry points, for element sizes of 1/2/4/8 bytes.
 */
RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
GEN_VEXT_VV_RM(vssub_vv_b, 1)
GEN_VEXT_VV_RM(vssub_vv_h, 2)
GEN_VEXT_VV_RM(vssub_vv_w, 4)
GEN_VEXT_VV_RM(vssub_vv_d, 8)

RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
GEN_VEXT_VX_RM(vssub_vx_b, 1)
GEN_VEXT_VX_RM(vssub_vx_h, 2)
GEN_VEXT_VX_RM(vssub_vx_w, 4)
GEN_VEXT_VX_RM(vssub_vx_d, 8)
2450
2451 /* Vector Single-Width Averaging Add and Subtract */
/*
 * Compute the rounding increment (0 or 1) to add to (v >> shift).
 *
 * @vxrm:  rounding mode: 0 = round-to-nearest-up, 1 = round-to-nearest-even,
 *         2 = round-down (truncate), 3 = round-to-odd.
 * @v:     the value being shifted, as a raw 64-bit pattern.
 * @shift: number of low bits being discarded.
 *
 * Returns 0 when shift is 0 or greater than 64.
 *
 * Bit 'shift' of v (the least significant bit that survives the shift) is
 * only extracted after the range check; for shift == 64 that bit does not
 * exist and is taken as 0, so extract64() is never asked for a field outside
 * the 64-bit word.
 */
static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
{
    uint8_t d, d1;
    uint64_t D1, D2;

    if (shift == 0 || shift > 64) {
        return 0;
    }

    /* d: lowest surviving bit; d1: most significant discarded bit. */
    d = shift < 64 ? extract64(v, shift, 1) : 0;
    d1 = extract64(v, shift - 1, 1);
    /* D1: all discarded bits. */
    D1 = extract64(v, 0, shift);
    if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
        return d1;
    } else if (vxrm == 1) { /* round-to-nearest-even */
        if (shift > 1) {
            /* D2: the discarded bits below d1. */
            D2 = extract64(v, 0, shift - 1);
            return d1 & ((D2 != 0) | d);
        } else {
            return d1 & d;
        }
    } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
        return !d & (D1 != 0);
    }
    return 0; /* round-down (truncate) */
}
2478
2479 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2480 {
2481     int64_t res = (int64_t)a + b;
2482     uint8_t round = get_round(vxrm, res, 1);
2483
2484     return (res >> 1) + round;
2485 }
2486
2487 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2488 {
2489     int64_t res = a + b;
2490     uint8_t round = get_round(vxrm, res, 1);
2491     int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2492
2493     /* With signed overflow, bit 64 is inverse of bit 63. */
2494     return ((res >> 1) ^ over) + round;
2495 }
2496
/*
 * vaadd_{vv,vx}_{b,h,w,d}: the 8/16/32-bit variants all use aadd32, the
 * 64-bit variant uses aadd64.
 */
RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
GEN_VEXT_VV_RM(vaadd_vv_b, 1)
GEN_VEXT_VV_RM(vaadd_vv_h, 2)
GEN_VEXT_VV_RM(vaadd_vv_w, 4)
GEN_VEXT_VV_RM(vaadd_vv_d, 8)

RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
GEN_VEXT_VX_RM(vaadd_vx_b, 1)
GEN_VEXT_VX_RM(vaadd_vx_h, 2)
GEN_VEXT_VX_RM(vaadd_vx_w, 4)
GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2514
2515 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2516                                uint32_t a, uint32_t b)
2517 {
2518     uint64_t res = (uint64_t)a + b;
2519     uint8_t round = get_round(vxrm, res, 1);
2520
2521     return (res >> 1) + round;
2522 }
2523
2524 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2525                                uint64_t a, uint64_t b)
2526 {
2527     uint64_t res = a + b;
2528     uint8_t round = get_round(vxrm, res, 1);
2529     uint64_t over = (uint64_t)(res < a) << 63;
2530
2531     return ((res >> 1) | over) + round;
2532 }
2533
/*
 * vaaddu_{vv,vx}_{b,h,w,d}: the 8/16/32-bit variants all use aaddu32, the
 * 64-bit variant uses aaddu64.
 */
RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
GEN_VEXT_VV_RM(vaaddu_vv_d, 8)

RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2551
2552 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2553 {
2554     int64_t res = (int64_t)a - b;
2555     uint8_t round = get_round(vxrm, res, 1);
2556
2557     return (res >> 1) + round;
2558 }
2559
2560 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2561 {
2562     int64_t res = (int64_t)a - b;
2563     uint8_t round = get_round(vxrm, res, 1);
2564     int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2565
2566     /* With signed overflow, bit 64 is inverse of bit 63. */
2567     return ((res >> 1) ^ over) + round;
2568 }
2569
/*
 * vasub_{vv,vx}_{b,h,w,d}: the 8/16/32-bit variants all use asub32, the
 * 64-bit variant uses asub64.
 */
RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
GEN_VEXT_VV_RM(vasub_vv_b, 1)
GEN_VEXT_VV_RM(vasub_vv_h, 2)
GEN_VEXT_VV_RM(vasub_vv_w, 4)
GEN_VEXT_VV_RM(vasub_vv_d, 8)

RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
GEN_VEXT_VX_RM(vasub_vx_b, 1)
GEN_VEXT_VX_RM(vasub_vx_h, 2)
GEN_VEXT_VX_RM(vasub_vx_w, 4)
GEN_VEXT_VX_RM(vasub_vx_d, 8)
2587
2588 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2589                                uint32_t a, uint32_t b)
2590 {
2591     int64_t res = (int64_t)a - b;
2592     uint8_t round = get_round(vxrm, res, 1);
2593
2594     return (res >> 1) + round;
2595 }
2596
2597 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2598                                uint64_t a, uint64_t b)
2599 {
2600     uint64_t res = (uint64_t)a - b;
2601     uint8_t round = get_round(vxrm, res, 1);
2602     uint64_t over = (uint64_t)(res > a) << 63;
2603
2604     return ((res >> 1) | over) + round;
2605 }
2606
/*
 * vasubu_{vv,vx}_{b,h,w,d}: the 8/16/32-bit variants all use asubu32, the
 * 64-bit variant uses asubu64.
 */
RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
GEN_VEXT_VV_RM(vasubu_vv_b, 1)
GEN_VEXT_VV_RM(vasubu_vv_h, 2)
GEN_VEXT_VV_RM(vasubu_vv_w, 4)
GEN_VEXT_VV_RM(vasubu_vv_d, 8)

RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
GEN_VEXT_VX_RM(vasubu_vx_b, 1)
GEN_VEXT_VX_RM(vasubu_vx_h, 2)
GEN_VEXT_VX_RM(vasubu_vx_w, 4)
GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2624
2625 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
2626 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2627 {
2628     uint8_t round;
2629     int16_t res;
2630
2631     res = (int16_t)a * (int16_t)b;
2632     round = get_round(vxrm, res, 7);
2633     res   = (res >> 7) + round;
2634
2635     if (res > INT8_MAX) {
2636         env->vxsat = 0x1;
2637         return INT8_MAX;
2638     } else if (res < INT8_MIN) {
2639         env->vxsat = 0x1;
2640         return INT8_MIN;
2641     } else {
2642         return res;
2643     }
2644 }
2645
2646 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2647 {
2648     uint8_t round;
2649     int32_t res;
2650
2651     res = (int32_t)a * (int32_t)b;
2652     round = get_round(vxrm, res, 15);
2653     res   = (res >> 15) + round;
2654
2655     if (res > INT16_MAX) {
2656         env->vxsat = 0x1;
2657         return INT16_MAX;
2658     } else if (res < INT16_MIN) {
2659         env->vxsat = 0x1;
2660         return INT16_MIN;
2661     } else {
2662         return res;
2663     }
2664 }
2665
2666 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2667 {
2668     uint8_t round;
2669     int64_t res;
2670
2671     res = (int64_t)a * (int64_t)b;
2672     round = get_round(vxrm, res, 31);
2673     res   = (res >> 31) + round;
2674
2675     if (res > INT32_MAX) {
2676         env->vxsat = 0x1;
2677         return INT32_MAX;
2678     } else if (res < INT32_MIN) {
2679         env->vxsat = 0x1;
2680         return INT32_MIN;
2681     } else {
2682         return res;
2683     }
2684 }
2685
2686 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2687 {
2688     uint8_t round;
2689     uint64_t hi_64, lo_64;
2690     int64_t res;
2691
2692     if (a == INT64_MIN && b == INT64_MIN) {
2693         env->vxsat = 1;
2694         return INT64_MAX;
2695     }
2696
2697     muls64(&lo_64, &hi_64, a, b);
2698     round = get_round(vxrm, lo_64, 63);
2699     /*
2700      * Cannot overflow, as there are always
2701      * 2 sign bits after multiply.
2702      */
2703     res = (hi_64 << 1) | (lo_64 >> 63);
2704     if (round) {
2705         if (res == INT64_MAX) {
2706             env->vxsat = 1;
2707         } else {
2708             res += 1;
2709         }
2710     }
2711     return res;
2712 }
2713
/*
 * vsmul_{vv,vx}_{b,h,w,d}: per-element workers built on the vsmul* routines
 * plus their helper entry points, for element sizes of 1/2/4/8 bytes.
 */
RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
GEN_VEXT_VV_RM(vsmul_vv_b, 1)
GEN_VEXT_VV_RM(vsmul_vv_h, 2)
GEN_VEXT_VV_RM(vsmul_vv_w, 4)
GEN_VEXT_VV_RM(vsmul_vv_d, 8)

RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
GEN_VEXT_VX_RM(vsmul_vx_b, 1)
GEN_VEXT_VX_RM(vsmul_vx_h, 2)
GEN_VEXT_VX_RM(vsmul_vx_w, 4)
GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2731
2732 /* Vector Single-Width Scaling Shift Instructions */
2733 static inline uint8_t
2734 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2735 {
2736     uint8_t round, shift = b & 0x7;
2737     uint8_t res;
2738
2739     round = get_round(vxrm, a, shift);
2740     res   = (a >> shift)  + round;
2741     return res;
2742 }
2743 static inline uint16_t
2744 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2745 {
2746     uint8_t round, shift = b & 0xf;
2747     uint16_t res;
2748
2749     round = get_round(vxrm, a, shift);
2750     res   = (a >> shift)  + round;
2751     return res;
2752 }
2753 static inline uint32_t
2754 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2755 {
2756     uint8_t round, shift = b & 0x1f;
2757     uint32_t res;
2758
2759     round = get_round(vxrm, a, shift);
2760     res   = (a >> shift)  + round;
2761     return res;
2762 }
2763 static inline uint64_t
2764 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2765 {
2766     uint8_t round, shift = b & 0x3f;
2767     uint64_t res;
2768
2769     round = get_round(vxrm, a, shift);
2770     res   = (a >> shift)  + round;
2771     return res;
2772 }
/*
 * vssrl_{vv,vx}_{b,h,w,d}: per-element workers built on the vssrl* routines
 * plus their helper entry points, for element sizes of 1/2/4/8 bytes.
 */
RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
GEN_VEXT_VV_RM(vssrl_vv_b, 1)
GEN_VEXT_VV_RM(vssrl_vv_h, 2)
GEN_VEXT_VV_RM(vssrl_vv_w, 4)
GEN_VEXT_VV_RM(vssrl_vv_d, 8)

RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
GEN_VEXT_VX_RM(vssrl_vx_b, 1)
GEN_VEXT_VX_RM(vssrl_vx_h, 2)
GEN_VEXT_VX_RM(vssrl_vx_w, 4)
GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2790
2791 static inline int8_t
2792 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2793 {
2794     uint8_t round, shift = b & 0x7;
2795     int8_t res;
2796
2797     round = get_round(vxrm, a, shift);
2798     res   = (a >> shift)  + round;
2799     return res;
2800 }
2801 static inline int16_t
2802 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2803 {
2804     uint8_t round, shift = b & 0xf;
2805     int16_t res;
2806
2807     round = get_round(vxrm, a, shift);
2808     res   = (a >> shift)  + round;
2809     return res;
2810 }
2811 static inline int32_t
2812 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2813 {
2814     uint8_t round, shift = b & 0x1f;
2815     int32_t res;
2816
2817     round = get_round(vxrm, a, shift);
2818     res   = (a >> shift)  + round;
2819     return res;
2820 }
2821 static inline int64_t
2822 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2823 {
2824     uint8_t round, shift = b & 0x3f;
2825     int64_t res;
2826
2827     round = get_round(vxrm, a, shift);
2828     res   = (a >> shift)  + round;
2829     return res;
2830 }
2831
/*
 * vssra_{vv,vx}_{b,h,w,d}: per-element workers built on the vssra* routines
 * plus their helper entry points, for element sizes of 1/2/4/8 bytes.
 */
RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
GEN_VEXT_VV_RM(vssra_vv_b, 1)
GEN_VEXT_VV_RM(vssra_vv_h, 2)
GEN_VEXT_VV_RM(vssra_vv_w, 4)
GEN_VEXT_VV_RM(vssra_vv_d, 8)

RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
GEN_VEXT_VX_RM(vssra_vx_b, 1)
GEN_VEXT_VX_RM(vssra_vx_h, 2)
GEN_VEXT_VX_RM(vssra_vx_w, 4)
GEN_VEXT_VX_RM(vssra_vx_d, 8)
2849
2850 /* Vector Narrowing Fixed-Point Clip Instructions */
2851 static inline int8_t
2852 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2853 {
2854     uint8_t round, shift = b & 0xf;
2855     int16_t res;
2856
2857     round = get_round(vxrm, a, shift);
2858     res   = (a >> shift)  + round;
2859     if (res > INT8_MAX) {
2860         env->vxsat = 0x1;
2861         return INT8_MAX;
2862     } else if (res < INT8_MIN) {
2863         env->vxsat = 0x1;
2864         return INT8_MIN;
2865     } else {
2866         return res;
2867     }
2868 }
2869
2870 static inline int16_t
2871 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2872 {
2873     uint8_t round, shift = b & 0x1f;
2874     int32_t res;
2875
2876     round = get_round(vxrm, a, shift);
2877     res   = (a >> shift)  + round;
2878     if (res > INT16_MAX) {
2879         env->vxsat = 0x1;
2880         return INT16_MAX;
2881     } else if (res < INT16_MIN) {
2882         env->vxsat = 0x1;
2883         return INT16_MIN;
2884     } else {
2885         return res;
2886     }
2887 }
2888
2889 static inline int32_t
2890 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2891 {
2892     uint8_t round, shift = b & 0x3f;
2893     int64_t res;
2894
2895     round = get_round(vxrm, a, shift);
2896     res   = (a >> shift)  + round;
2897     if (res > INT32_MAX) {
2898         env->vxsat = 0x1;
2899         return INT32_MAX;
2900     } else if (res < INT32_MIN) {
2901         env->vxsat = 0x1;
2902         return INT32_MIN;
2903     } else {
2904         return res;
2905     }
2906 }
2907
/*
 * vnclip_{wv,wx}_{b,h,w}: narrowing clips; the vs2 operand uses the index
 * mapping of the next wider element size (H2/H4/H8) while the destination
 * uses H1/H2/H4.
 */
RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
GEN_VEXT_VV_RM(vnclip_wv_b, 1)
GEN_VEXT_VV_RM(vnclip_wv_h, 2)
GEN_VEXT_VV_RM(vnclip_wv_w, 4)

RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
GEN_VEXT_VX_RM(vnclip_wx_b, 1)
GEN_VEXT_VX_RM(vnclip_wx_h, 2)
GEN_VEXT_VX_RM(vnclip_wx_w, 4)
2921
2922 static inline uint8_t
2923 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
2924 {
2925     uint8_t round, shift = b & 0xf;
2926     uint16_t res;
2927
2928     round = get_round(vxrm, a, shift);
2929     res   = (a >> shift)  + round;
2930     if (res > UINT8_MAX) {
2931         env->vxsat = 0x1;
2932         return UINT8_MAX;
2933     } else {
2934         return res;
2935     }
2936 }
2937
2938 static inline uint16_t
2939 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
2940 {
2941     uint8_t round, shift = b & 0x1f;
2942     uint32_t res;
2943
2944     round = get_round(vxrm, a, shift);
2945     res   = (a >> shift)  + round;
2946     if (res > UINT16_MAX) {
2947         env->vxsat = 0x1;
2948         return UINT16_MAX;
2949     } else {
2950         return res;
2951     }
2952 }
2953
2954 static inline uint32_t
2955 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
2956 {
2957     uint8_t round, shift = b & 0x3f;
2958     uint64_t res;
2959
2960     round = get_round(vxrm, a, shift);
2961     res   = (a >> shift)  + round;
2962     if (res > UINT32_MAX) {
2963         env->vxsat = 0x1;
2964         return UINT32_MAX;
2965     } else {
2966         return res;
2967     }
2968 }
2969
/*
 * vnclipu_{wv,wx}_{b,h,w}: unsigned narrowing clips; the vs2 operand uses
 * the index mapping of the next wider element size (H2/H4/H8) while the
 * destination uses H1/H2/H4.
 */
RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
GEN_VEXT_VV_RM(vnclipu_wv_w, 4)

RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
2983
/*
 *** Vector Floating-Point Arithmetic Instructions
 */
2987 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
/*
 * OPFVV2 - define do_<NAME>(), the per-element worker of a vector-vector
 * floating-point operation.
 *
 * Element i of vs1 (stored as T1) and of vs2 (stored as T2) is loaded and
 * converted to TX1/TX2; OP(<vs2 elem>, <vs1 elem>, &env->fp_status) is then
 * stored into vd as TD.  HD/HS1/HS2 map the element index to the index
 * actually used for each register.
 */
#define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
                      CPURISCVState *env)                      \
{                                                              \
    T1 *src1 = vs1;                                            \
    T2 *src2 = vs2;                                            \
    TD *dst = vd;                                              \
    TX1 op1 = src1[HS1(i)];                                    \
    TX2 op2 = src2[HS2(i)];                                    \
                                                               \
    dst[HD(i)] = OP(op2, op1, &env->fp_status);                \
}
2996
/*
 * GEN_VEXT_VV_ENV - generate the helper entry point HELPER(NAME) of a
 * vector-vector operation whose per-element worker do_<NAME> takes env.
 *
 * The helper walks elements [env->vstart, env->vl); when the operation is
 * masked (vm == 0) elements whose mask bit in v0 is clear are skipped.
 * Afterwards env->vstart is reset to 0 and vext_set_elems_1s() handles the
 * tail bytes [vl * ESZ, total_elems * ESZ).  ESZ is the element size in
 * bytes.
 */
#define GEN_VEXT_VV_ENV(NAME, ESZ)                        \
void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
                  void *vs2, CPURISCVState *env,          \
                  uint32_t desc)                          \
{                                                         \
    const uint32_t unmasked = vext_vm(desc);              \
    const uint32_t nr_active = env->vl;                   \
    const uint32_t nr_total =                             \
        vext_get_total_elems(env, desc, ESZ);             \
    const uint32_t tail_agnostic = vext_vta(desc);        \
    uint32_t idx;                                         \
                                                          \
    for (idx = env->vstart; idx < nr_active; idx++) {     \
        if (unmasked || vext_elem_mask(v0, idx)) {        \
            do_##NAME(vd, vs1, vs2, idx, env);            \
        }                                                 \
    }                                                     \
    env->vstart = 0;                                      \
    /* set tail elements to 1s */                         \
    vext_set_elems_1s(vd, tail_agnostic,                  \
                      nr_active * ESZ, nr_total * ESZ);   \
}
3020
3021 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
3022 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
3023 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
3024 GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
3025 GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
3026 GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
3027
/*
 * OPFVF2 - emit do_<NAME>(), the per-element body of a two-operand
 * vector-scalar floating-point operation.
 *
 * The scalar s1 is narrowed to T1 and then converted to TX1; element i of
 * vs2 is read as T2 at index HS2(i).  OP(vs2 element, scalar,
 * &env->fp_status) is stored into vd (as TD at index HD(i)).
 */
#define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
                      CPURISCVState *env)                      \
{                                                              \
    TD *dst = (TD *)vd + HD(i);                                \
    TX1 scalar = (TX1)(T1)s1;                                  \
    TX2 src2 = *((T2 *)vs2 + HS2(i));                          \
                                                               \
    *dst = OP(src2, scalar, &env->fp_status);                  \
}
3035
/*
 * GEN_VEXT_VF - emit HELPER(NAME) for a vector-scalar operation whose
 * per-element body do_<NAME>() takes the scalar s1 and env.
 *
 * do_<NAME>() is run for every element index from env->vstart up to (not
 * including) env->vl; an index is skipped when vext_vm(desc) is zero and
 * vext_elem_mask(v0, index) is zero.  Afterwards env->vstart is cleared and
 * the range vl * ESZ .. total_elems * ESZ is handed, with vext_vta(desc),
 * to vext_set_elems_1s() for tail handling.
 */
#define GEN_VEXT_VF(NAME, ESZ)                            \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1,        \
                  void *vs2, CPURISCVState *env,          \
                  uint32_t desc)                          \
{                                                         \
    uint32_t total_elems =                                \
        vext_get_total_elems(env, desc, ESZ);             \
    uint32_t vta = vext_vta(desc);                        \
    uint32_t vm = vext_vm(desc);                          \
    uint32_t vl = env->vl;                                \
    uint32_t elem = env->vstart;                          \
                                                          \
    while (elem < vl) {                                   \
        /* active when unmasked or the mask bit is set */ \
        if (vm || vext_elem_mask(v0, elem)) {             \
            do_##NAME(vd, s1, vs2, elem, env);            \
        }                                                 \
        elem++;                                           \
    }                                                     \
    env->vstart = 0;                                      \
    /* set tail elements to 1s */                         \
    vext_set_elems_1s(vd, vta, vl * ESZ,                  \
                      total_elems * ESZ);                 \
}
3059
3060 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
3061 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
3062 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
3063 GEN_VEXT_VF(vfadd_vf_h, 2)
3064 GEN_VEXT_VF(vfadd_vf_w, 4)
3065 GEN_VEXT_VF(vfadd_vf_d, 8)
3066
3067 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
3068 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
3069 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
3070 GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
3071 GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
3072 GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
3073 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
3074 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
3075 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
3076 GEN_VEXT_VF(vfsub_vf_h, 2)
3077 GEN_VEXT_VF(vfsub_vf_w, 4)
3078 GEN_VEXT_VF(vfsub_vf_d, 8)
3079
3080 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
3081 {
3082     return float16_sub(b, a, s);
3083 }
3084
3085 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
3086 {
3087     return float32_sub(b, a, s);
3088 }
3089
3090 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
3091 {
3092     return float64_sub(b, a, s);
3093 }
3094
3095 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
3096 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
3097 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
3098 GEN_VEXT_VF(vfrsub_vf_h, 2)
3099 GEN_VEXT_VF(vfrsub_vf_w, 4)
3100 GEN_VEXT_VF(vfrsub_vf_d, 8)
3101
3102 /* Vector Widening Floating-Point Add/Subtract Instructions */
3103 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
3104 {
3105     return float32_add(float16_to_float32(a, true, s),
3106             float16_to_float32(b, true, s), s);
3107 }
3108
3109 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
3110 {
3111     return float64_add(float32_to_float64(a, s),
3112             float32_to_float64(b, s), s);
3113
3114 }
3115
3116 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3117 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3118 GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
3119 GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
3120 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3121 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3122 GEN_VEXT_VF(vfwadd_vf_h, 4)
3123 GEN_VEXT_VF(vfwadd_vf_w, 8)
3124
3125 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3126 {
3127     return float32_sub(float16_to_float32(a, true, s),
3128             float16_to_float32(b, true, s), s);
3129 }
3130
3131 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3132 {
3133     return float64_sub(float32_to_float64(a, s),
3134             float32_to_float64(b, s), s);
3135
3136 }
3137
3138 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3139 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3140 GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
3141 GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
3142 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3143 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3144 GEN_VEXT_VF(vfwsub_vf_h, 4)
3145 GEN_VEXT_VF(vfwsub_vf_w, 8)
3146
3147 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3148 {
3149     return float32_add(a, float16_to_float32(b, true, s), s);
3150 }
3151
3152 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3153 {
3154     return float64_add(a, float32_to_float64(b, s), s);
3155 }
3156
3157 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3158 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3159 GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
3160 GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
3161 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3162 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3163 GEN_VEXT_VF(vfwadd_wf_h, 4)
3164 GEN_VEXT_VF(vfwadd_wf_w, 8)
3165
3166 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3167 {
3168     return float32_sub(a, float16_to_float32(b, true, s), s);
3169 }
3170
3171 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3172 {
3173     return float64_sub(a, float32_to_float64(b, s), s);
3174 }
3175
3176 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3177 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3178 GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
3179 GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
3180 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3181 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3182 GEN_VEXT_VF(vfwsub_wf_h, 4)
3183 GEN_VEXT_VF(vfwsub_wf_w, 8)
3184
3185 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3186 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3187 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3188 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3189 GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
3190 GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
3191 GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
3192 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3193 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3194 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3195 GEN_VEXT_VF(vfmul_vf_h, 2)
3196 GEN_VEXT_VF(vfmul_vf_w, 4)
3197 GEN_VEXT_VF(vfmul_vf_d, 8)
3198
3199 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3200 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3201 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3202 GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
3203 GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
3204 GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
3205 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3206 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3207 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3208 GEN_VEXT_VF(vfdiv_vf_h, 2)
3209 GEN_VEXT_VF(vfdiv_vf_w, 4)
3210 GEN_VEXT_VF(vfdiv_vf_d, 8)
3211
3212 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3213 {
3214     return float16_div(b, a, s);
3215 }
3216
3217 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3218 {
3219     return float32_div(b, a, s);
3220 }
3221
3222 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3223 {
3224     return float64_div(b, a, s);
3225 }
3226
3227 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3228 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3229 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3230 GEN_VEXT_VF(vfrdiv_vf_h, 2)
3231 GEN_VEXT_VF(vfrdiv_vf_w, 4)
3232 GEN_VEXT_VF(vfrdiv_vf_d, 8)
3233
3234 /* Vector Widening Floating-Point Multiply */
3235 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3236 {
3237     return float32_mul(float16_to_float32(a, true, s),
3238             float16_to_float32(b, true, s), s);
3239 }
3240
3241 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3242 {
3243     return float64_mul(float32_to_float64(a, s),
3244             float32_to_float64(b, s), s);
3245
3246 }
3247 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3248 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3249 GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
3250 GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
3251 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3252 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3253 GEN_VEXT_VF(vfwmul_vf_h, 4)
3254 GEN_VEXT_VF(vfwmul_vf_w, 8)
3255
3256 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
/*
 * OPFVV3 - emit do_<NAME>(), the per-element body of a three-operand
 * vector-vector floating-point operation.
 *
 * Besides the vs1 and vs2 elements, the current vd element is read and
 * passed as third operand: OP(vs2 element, vs1 element, vd element,
 * &env->fp_status).  The result overwrites that vd element.
 */
#define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
static void do_##NAME(void *vd, void *vs1, void *vs2, int i,       \
        CPURISCVState *env)                                        \
{                                                                  \
    TD *dst = (TD *)vd + HD(i);                                    \
    TX1 src1 = *((T1 *)vs1 + HS1(i));                              \
    TX2 src2 = *((T2 *)vs2 + HS2(i));                              \
    TD acc = *dst;                                                 \
                                                                   \
    *dst = OP(src2, src1, acc, &env->fp_status);                   \
}
3266
3267 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3268 {
3269     return float16_muladd(a, b, d, 0, s);
3270 }
3271
3272 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3273 {
3274     return float32_muladd(a, b, d, 0, s);
3275 }
3276
3277 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3278 {
3279     return float64_muladd(a, b, d, 0, s);
3280 }
3281
3282 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3283 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3284 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3285 GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
3286 GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
3287 GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3288
/*
 * OPFVF3 - emit do_<NAME>(), the per-element body of a three-operand
 * vector-scalar floating-point operation.
 *
 * The scalar s1 is narrowed to T1 and then converted to TX1; the current vd
 * element is read and passed as third operand: OP(vs2 element, scalar,
 * vd element, &env->fp_status).  The result overwrites that vd element.
 */
#define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)           \
static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i,    \
        CPURISCVState *env)                                       \
{                                                                 \
    TD *dst = (TD *)vd + HD(i);                                   \
    TX1 scalar = (TX1)(T1)s1;                                     \
    TX2 src2 = *((T2 *)vs2 + HS2(i));                             \
    TD acc = *dst;                                                \
                                                                  \
    *dst = OP(src2, scalar, acc, &env->fp_status);                \
}
3297
3298 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3299 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3300 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3301 GEN_VEXT_VF(vfmacc_vf_h, 2)
3302 GEN_VEXT_VF(vfmacc_vf_w, 4)
3303 GEN_VEXT_VF(vfmacc_vf_d, 8)
3304
3305 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3306 {
3307     return float16_muladd(a, b, d,
3308             float_muladd_negate_c | float_muladd_negate_product, s);
3309 }
3310
3311 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3312 {
3313     return float32_muladd(a, b, d,
3314             float_muladd_negate_c | float_muladd_negate_product, s);
3315 }
3316
3317 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3318 {
3319     return float64_muladd(a, b, d,
3320             float_muladd_negate_c | float_muladd_negate_product, s);
3321 }
3322
3323 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3324 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3325 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3326 GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
3327 GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
3328 GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
3329 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3330 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3331 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3332 GEN_VEXT_VF(vfnmacc_vf_h, 2)
3333 GEN_VEXT_VF(vfnmacc_vf_w, 4)
3334 GEN_VEXT_VF(vfnmacc_vf_d, 8)
3335
3336 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3337 {
3338     return float16_muladd(a, b, d, float_muladd_negate_c, s);
3339 }
3340
3341 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3342 {
3343     return float32_muladd(a, b, d, float_muladd_negate_c, s);
3344 }
3345
3346 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3347 {
3348     return float64_muladd(a, b, d, float_muladd_negate_c, s);
3349 }
3350
3351 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3352 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3353 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3354 GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
3355 GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
3356 GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
3357 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3358 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3359 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3360 GEN_VEXT_VF(vfmsac_vf_h, 2)
3361 GEN_VEXT_VF(vfmsac_vf_w, 4)
3362 GEN_VEXT_VF(vfmsac_vf_d, 8)
3363
3364 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3365 {
3366     return float16_muladd(a, b, d, float_muladd_negate_product, s);
3367 }
3368
3369 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3370 {
3371     return float32_muladd(a, b, d, float_muladd_negate_product, s);
3372 }
3373
3374 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3375 {
3376     return float64_muladd(a, b, d, float_muladd_negate_product, s);
3377 }
3378
3379 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3380 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3381 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3382 GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
3383 GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
3384 GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
3385 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3386 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3387 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3388 GEN_VEXT_VF(vfnmsac_vf_h, 2)
3389 GEN_VEXT_VF(vfnmsac_vf_w, 4)
3390 GEN_VEXT_VF(vfnmsac_vf_d, 8)
3391
3392 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3393 {
3394     return float16_muladd(d, b, a, 0, s);
3395 }
3396
3397 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3398 {
3399     return float32_muladd(d, b, a, 0, s);
3400 }
3401
3402 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3403 {
3404     return float64_muladd(d, b, a, 0, s);
3405 }
3406
3407 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3408 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3409 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3410 GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
3411 GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
3412 GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
3413 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3414 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3415 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3416 GEN_VEXT_VF(vfmadd_vf_h, 2)
3417 GEN_VEXT_VF(vfmadd_vf_w, 4)
3418 GEN_VEXT_VF(vfmadd_vf_d, 8)
3419
3420 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3421 {
3422     return float16_muladd(d, b, a,
3423             float_muladd_negate_c | float_muladd_negate_product, s);
3424 }
3425
3426 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3427 {
3428     return float32_muladd(d, b, a,
3429             float_muladd_negate_c | float_muladd_negate_product, s);
3430 }
3431
3432 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3433 {
3434     return float64_muladd(d, b, a,
3435             float_muladd_negate_c | float_muladd_negate_product, s);
3436 }
3437
3438 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3439 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3440 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3441 GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
3442 GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
3443 GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
3444 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3445 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3446 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3447 GEN_VEXT_VF(vfnmadd_vf_h, 2)
3448 GEN_VEXT_VF(vfnmadd_vf_w, 4)
3449 GEN_VEXT_VF(vfnmadd_vf_d, 8)
3450
3451 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3452 {
3453     return float16_muladd(d, b, a, float_muladd_negate_c, s);
3454 }
3455
3456 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3457 {
3458     return float32_muladd(d, b, a, float_muladd_negate_c, s);
3459 }
3460
3461 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3462 {
3463     return float64_muladd(d, b, a, float_muladd_negate_c, s);
3464 }
3465
3466 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3467 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3468 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3469 GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
3470 GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
3471 GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
3472 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3473 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3474 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3475 GEN_VEXT_VF(vfmsub_vf_h, 2)
3476 GEN_VEXT_VF(vfmsub_vf_w, 4)
3477 GEN_VEXT_VF(vfmsub_vf_d, 8)
3478
3479 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3480 {
3481     return float16_muladd(d, b, a, float_muladd_negate_product, s);
3482 }
3483
3484 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3485 {
3486     return float32_muladd(d, b, a, float_muladd_negate_product, s);
3487 }
3488
3489 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3490 {
3491     return float64_muladd(d, b, a, float_muladd_negate_product, s);
3492 }
3493
3494 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3495 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3496 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3497 GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
3498 GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
3499 GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
3500 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3501 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3502 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3503 GEN_VEXT_VF(vfnmsub_vf_h, 2)
3504 GEN_VEXT_VF(vfnmsub_vf_w, 4)
3505 GEN_VEXT_VF(vfnmsub_vf_d, 8)
3506
3507 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3508 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3509 {
3510     return float32_muladd(float16_to_float32(a, true, s),
3511                         float16_to_float32(b, true, s), d, 0, s);
3512 }
3513
3514 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3515 {
3516     return float64_muladd(float32_to_float64(a, s),
3517                         float32_to_float64(b, s), d, 0, s);
3518 }
3519
3520 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3521 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3522 GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
3523 GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
3524 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3525 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3526 GEN_VEXT_VF(vfwmacc_vf_h, 4)
3527 GEN_VEXT_VF(vfwmacc_vf_w, 8)
3528
3529 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3530 {
3531     return float32_muladd(float16_to_float32(a, true, s),
3532                         float16_to_float32(b, true, s), d,
3533                         float_muladd_negate_c | float_muladd_negate_product, s);
3534 }
3535
3536 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3537 {
3538     return float64_muladd(float32_to_float64(a, s),
3539                         float32_to_float64(b, s), d,
3540                         float_muladd_negate_c | float_muladd_negate_product, s);
3541 }
3542
3543 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3544 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3545 GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
3546 GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
3547 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3548 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3549 GEN_VEXT_VF(vfwnmacc_vf_h, 4)
3550 GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3551
3552 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3553 {
3554     return float32_muladd(float16_to_float32(a, true, s),
3555                         float16_to_float32(b, true, s), d,
3556                         float_muladd_negate_c, s);
3557 }
3558
3559 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3560 {
3561     return float64_muladd(float32_to_float64(a, s),
3562                         float32_to_float64(b, s), d,
3563                         float_muladd_negate_c, s);
3564 }
3565
3566 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3567 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3568 GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
3569 GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
3570 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3571 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3572 GEN_VEXT_VF(vfwmsac_vf_h, 4)
3573 GEN_VEXT_VF(vfwmsac_vf_w, 8)
3574
3575 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3576 {
3577     return float32_muladd(float16_to_float32(a, true, s),
3578                         float16_to_float32(b, true, s), d,
3579                         float_muladd_negate_product, s);
3580 }
3581
3582 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3583 {
3584     return float64_muladd(float32_to_float64(a, s),
3585                         float32_to_float64(b, s), d,
3586                         float_muladd_negate_product, s);
3587 }
3588
3589 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3590 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3591 GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
3592 GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
3593 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3594 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3595 GEN_VEXT_VF(vfwnmsac_vf_h, 4)
3596 GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3597
3598 /* Vector Floating-Point Square-Root Instruction */
/*
 * Type lists for single-source operations, in the order (TD, T2, TX2):
 * destination element type, source element type, and the type the source
 * element is held in before being passed to the operation.
 */
#define OP_UU_H     uint16_t, uint16_t, uint16_t
#define OP_UU_W     uint32_t, uint32_t, uint32_t
#define OP_UU_D     uint64_t, uint64_t, uint64_t
3603
/*
 * OPFVV1 - emit do_<NAME>(), the per-element body of a single-source
 * floating-point operation.
 *
 * Element i of vs2 is read as T2 at index HS2(i); OP(element,
 * &env->fp_status) is stored into vd (as TD at index HD(i)).
 */
#define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)        \
static void do_##NAME(void *vd, void *vs2, int i,      \
        CPURISCVState *env)                            \
{                                                      \
    TD *dst = (TD *)vd + HD(i);                        \
    TX2 src2 = *((T2 *)vs2 + HS2(i));                  \
                                                       \
    *dst = OP(src2, &env->fp_status);                  \
}
3611
/*
 * GEN_VEXT_V_ENV - emit HELPER(NAME) for a single-source operation whose
 * per-element body do_<NAME>() takes env.
 *
 * Returns immediately, leaving env->vstart untouched, when env->vl is zero.
 * Otherwise do_<NAME>() is run for every element index from env->vstart up
 * to (not including) env->vl; an index is skipped when vext_vm(desc) is
 * zero and vext_elem_mask(v0, index) is zero.  Afterwards env->vstart is
 * cleared and the range vl * ESZ .. total_elems * ESZ is handed, with
 * vext_vta(desc), to vext_set_elems_1s() for tail handling.
 */
#define GEN_VEXT_V_ENV(NAME, ESZ)                      \
void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
        CPURISCVState *env, uint32_t desc)             \
{                                                      \
    uint32_t total_elems =                             \
        vext_get_total_elems(env, desc, ESZ);          \
    uint32_t vta = vext_vta(desc);                     \
    uint32_t vm = vext_vm(desc);                       \
    uint32_t vl = env->vl;                             \
    uint32_t elem;                                     \
                                                       \
    if (vl == 0) {                                     \
        return;                                        \
    }                                                  \
    elem = env->vstart;                                \
    while (elem < vl) {                                \
        /* active when unmasked or mask bit is set */  \
        if (vm || vext_elem_mask(v0, elem)) {          \
            do_##NAME(vd, vs2, elem, env);             \
        }                                              \
        elem++;                                        \
    }                                                  \
    env->vstart = 0;                                   \
    vext_set_elems_1s(vd, vta, vl * ESZ,               \
                      total_elems * ESZ);              \
}
3636
3637 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3638 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3639 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3640 GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
3641 GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
3642 GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3643
/*
 * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
 *
 * Adapted from riscv-v-spec recip.c:
 * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
 */

/*
 * frsqrt7 - build the 7-bit reciprocal square-root estimate of @f.
 * @f:         raw bits of the operand, right-aligned in a uint64_t
 * @exp_size:  width of the exponent field, in bits
 * @frac_size: width of the fraction field, in bits
 *
 * Only the bit-field arithmetic is done here.  The frsqrt7_h/_s/_d wrappers
 * deal with NaN, zero, infinity and negative operands before calling, so
 * this function is only reached for positive normal or subnormal values.
 *
 * Returns the raw bits of the estimate, laid out like @f, with the sign bit
 * of @f copied through.
 */
static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
{
    /*
     * Estimate table, indexed by the low exponent bit followed by the top
     * (precision - 1) fraction bits.  Declared static so that it is not
     * re-initialised on every call.
     */
    static const uint8_t lookup_table[] = {
        52, 51, 50, 48, 47, 46, 44, 43,
        42, 41, 40, 39, 38, 36, 35, 34,
        33, 32, 31, 30, 30, 29, 28, 27,
        26, 25, 24, 23, 23, 22, 21, 20,
        19, 19, 18, 17, 16, 16, 15, 14,
        14, 13, 12, 12, 11, 10, 10, 9,
        9, 8, 7, 7, 6, 6, 5, 4,
        4, 3, 3, 2, 2, 1, 1, 0,
        127, 125, 123, 121, 119, 118, 116, 114,
        113, 111, 109, 108, 106, 105, 103, 102,
        100, 99, 97, 96, 95, 93, 92, 91,
        90, 88, 87, 86, 85, 84, 83, 82,
        80, 79, 78, 77, 76, 75, 74, 73,
        72, 71, 70, 70, 69, 68, 67, 66,
        65, 64, 63, 63, 62, 61, 60, 59,
        59, 58, 57, 56, 56, 55, 54, 53
    };
    const int precision = 7;

    uint64_t sign = extract64(f, frac_size + exp_size, 1);
    uint64_t exp = extract64(f, frac_size, exp_size);
    uint64_t frac = extract64(f, 0, frac_size);

    if (exp == 0 && frac != 0) { /* subnormal */
        /*
         * Normalize the subnormal: shift until the top fraction bit is set,
         * decrementing the exponent once per shift.  exp is unsigned, so it
         * wraps below zero; it is only used through its low bit and in the
         * modular out_exp expression below.
         */
        while (extract64(frac, frac_size - 1, 1) == 0) {
            exp--;
            frac <<= 1;
        }

        /* Drop the leading one, which becomes the implicit bit. */
        frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
    }

    int idx = ((exp & 1) << (precision - 1)) |
                (frac >> (frac_size - precision + 1));
    /* The table value becomes the top `precision` bits of the fraction. */
    uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
                            (frac_size - precision);
    /* ~exp equals -exp - 1 in modular arithmetic. */
    uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;

    uint64_t val = 0;
    val = deposit64(val, 0, frac_size, out_frac);
    val = deposit64(val, frac_size, exp_size, out_exp);
    val = deposit64(val, frac_size + exp_size, 1, sign);
    return val;
}
3698
3699 static float16 frsqrt7_h(float16 f, float_status *s)
3700 {
3701     int exp_size = 5, frac_size = 10;
3702     bool sign = float16_is_neg(f);
3703
3704     /*
3705      * frsqrt7(sNaN) = canonical NaN
3706      * frsqrt7(-inf) = canonical NaN
3707      * frsqrt7(-normal) = canonical NaN
3708      * frsqrt7(-subnormal) = canonical NaN
3709      */
3710     if (float16_is_signaling_nan(f, s) ||
3711             (float16_is_infinity(f) && sign) ||
3712             (float16_is_normal(f) && sign) ||
3713             (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3714         s->float_exception_flags |= float_flag_invalid;
3715         return float16_default_nan(s);
3716     }
3717
3718     /* frsqrt7(qNaN) = canonical NaN */
3719     if (float16_is_quiet_nan(f, s)) {
3720         return float16_default_nan(s);
3721     }
3722
3723     /* frsqrt7(+-0) = +-inf */
3724     if (float16_is_zero(f)) {
3725         s->float_exception_flags |= float_flag_divbyzero;
3726         return float16_set_sign(float16_infinity, sign);
3727     }
3728
3729     /* frsqrt7(+inf) = +0 */
3730     if (float16_is_infinity(f) && !sign) {
3731         return float16_set_sign(float16_zero, sign);
3732     }
3733
3734     /* +normal, +subnormal */
3735     uint64_t val = frsqrt7(f, exp_size, frac_size);
3736     return make_float16(val);
3737 }
3738
3739 static float32 frsqrt7_s(float32 f, float_status *s)
3740 {
3741     int exp_size = 8, frac_size = 23;
3742     bool sign = float32_is_neg(f);
3743
3744     /*
3745      * frsqrt7(sNaN) = canonical NaN
3746      * frsqrt7(-inf) = canonical NaN
3747      * frsqrt7(-normal) = canonical NaN
3748      * frsqrt7(-subnormal) = canonical NaN
3749      */
3750     if (float32_is_signaling_nan(f, s) ||
3751             (float32_is_infinity(f) && sign) ||
3752             (float32_is_normal(f) && sign) ||
3753             (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3754         s->float_exception_flags |= float_flag_invalid;
3755         return float32_default_nan(s);
3756     }
3757
3758     /* frsqrt7(qNaN) = canonical NaN */
3759     if (float32_is_quiet_nan(f, s)) {
3760         return float32_default_nan(s);
3761     }
3762
3763     /* frsqrt7(+-0) = +-inf */
3764     if (float32_is_zero(f)) {
3765         s->float_exception_flags |= float_flag_divbyzero;
3766         return float32_set_sign(float32_infinity, sign);
3767     }
3768
3769     /* frsqrt7(+inf) = +0 */
3770     if (float32_is_infinity(f) && !sign) {
3771         return float32_set_sign(float32_zero, sign);
3772     }
3773
3774     /* +normal, +subnormal */
3775     uint64_t val = frsqrt7(f, exp_size, frac_size);
3776     return make_float32(val);
3777 }
3778
3779 static float64 frsqrt7_d(float64 f, float_status *s)
3780 {
3781     int exp_size = 11, frac_size = 52;
3782     bool sign = float64_is_neg(f);
3783
3784     /*
3785      * frsqrt7(sNaN) = canonical NaN
3786      * frsqrt7(-inf) = canonical NaN
3787      * frsqrt7(-normal) = canonical NaN
3788      * frsqrt7(-subnormal) = canonical NaN
3789      */
3790     if (float64_is_signaling_nan(f, s) ||
3791             (float64_is_infinity(f) && sign) ||
3792             (float64_is_normal(f) && sign) ||
3793             (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3794         s->float_exception_flags |= float_flag_invalid;
3795         return float64_default_nan(s);
3796     }
3797
3798     /* frsqrt7(qNaN) = canonical NaN */
3799     if (float64_is_quiet_nan(f, s)) {
3800         return float64_default_nan(s);
3801     }
3802
3803     /* frsqrt7(+-0) = +-inf */
3804     if (float64_is_zero(f)) {
3805         s->float_exception_flags |= float_flag_divbyzero;
3806         return float64_set_sign(float64_infinity, sign);
3807     }
3808
3809     /* frsqrt7(+inf) = +0 */
3810     if (float64_is_infinity(f) && !sign) {
3811         return float64_set_sign(float64_zero, sign);
3812     }
3813
3814     /* +normal, +subnormal */
3815     uint64_t val = frsqrt7(f, exp_size, frac_size);
3816     return make_float64(val);
3817 }
3818
/*
 * vfrsqrt7.v: bind the frsqrt7_h/_s/_d element operations to 16-, 32- and
 * 64-bit elements and emit the corresponding helpers.
 * NOTE(review): the second GEN_VEXT_V_ENV argument looks like the element
 * size in bytes -- confirm against the macro definition.
 */
RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3825
3826 /*
3827  * Vector Floating-Point Reciprocal Estimate Instruction
3828  *
3829  * Adapted from riscv-v-spec recip.c:
3830  * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3831  */
3832 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3833                       float_status *s)
3834 {
3835     uint64_t sign = extract64(f, frac_size + exp_size, 1);
3836     uint64_t exp = extract64(f, frac_size, exp_size);
3837     uint64_t frac = extract64(f, 0, frac_size);
3838
3839     const uint8_t lookup_table[] = {
3840         127, 125, 123, 121, 119, 117, 116, 114,
3841         112, 110, 109, 107, 105, 104, 102, 100,
3842         99, 97, 96, 94, 93, 91, 90, 88,
3843         87, 85, 84, 83, 81, 80, 79, 77,
3844         76, 75, 74, 72, 71, 70, 69, 68,
3845         66, 65, 64, 63, 62, 61, 60, 59,
3846         58, 57, 56, 55, 54, 53, 52, 51,
3847         50, 49, 48, 47, 46, 45, 44, 43,
3848         42, 41, 40, 40, 39, 38, 37, 36,
3849         35, 35, 34, 33, 32, 31, 31, 30,
3850         29, 28, 28, 27, 26, 25, 25, 24,
3851         23, 23, 22, 21, 21, 20, 19, 19,
3852         18, 17, 17, 16, 15, 15, 14, 14,
3853         13, 12, 12, 11, 11, 10, 9, 9,
3854         8, 8, 7, 7, 6, 5, 5, 4,
3855         4, 3, 3, 2, 2, 1, 1, 0
3856     };
3857     const int precision = 7;
3858
3859     if (exp == 0 && frac != 0) { /* subnormal */
3860         /* Normalize the subnormal. */
3861         while (extract64(frac, frac_size - 1, 1) == 0) {
3862             exp--;
3863             frac <<= 1;
3864         }
3865
3866         frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3867
3868         if (exp != 0 && exp != UINT64_MAX) {
3869             /*
3870              * Overflow to inf or max value of same sign,
3871              * depending on sign and rounding mode.
3872              */
3873             s->float_exception_flags |= (float_flag_inexact |
3874                                          float_flag_overflow);
3875
3876             if ((s->float_rounding_mode == float_round_to_zero) ||
3877                 ((s->float_rounding_mode == float_round_down) && !sign) ||
3878                 ((s->float_rounding_mode == float_round_up) && sign)) {
3879                 /* Return greatest/negative finite value. */
3880                 return (sign << (exp_size + frac_size)) |
3881                     (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
3882             } else {
3883                 /* Return +-inf. */
3884                 return (sign << (exp_size + frac_size)) |
3885                     MAKE_64BIT_MASK(frac_size, exp_size);
3886             }
3887         }
3888     }
3889
3890     int idx = frac >> (frac_size - precision);
3891     uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3892                             (frac_size - precision);
3893     uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
3894
3895     if (out_exp == 0 || out_exp == UINT64_MAX) {
3896         /*
3897          * The result is subnormal, but don't raise the underflow exception,
3898          * because there's no additional loss of precision.
3899          */
3900         out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
3901         if (out_exp == UINT64_MAX) {
3902             out_frac >>= 1;
3903             out_exp = 0;
3904         }
3905     }
3906
3907     uint64_t val = 0;
3908     val = deposit64(val, 0, frac_size, out_frac);
3909     val = deposit64(val, frac_size, exp_size, out_exp);
3910     val = deposit64(val, frac_size + exp_size, 1, sign);
3911     return val;
3912 }
3913
3914 static float16 frec7_h(float16 f, float_status *s)
3915 {
3916     int exp_size = 5, frac_size = 10;
3917     bool sign = float16_is_neg(f);
3918
3919     /* frec7(+-inf) = +-0 */
3920     if (float16_is_infinity(f)) {
3921         return float16_set_sign(float16_zero, sign);
3922     }
3923
3924     /* frec7(+-0) = +-inf */
3925     if (float16_is_zero(f)) {
3926         s->float_exception_flags |= float_flag_divbyzero;
3927         return float16_set_sign(float16_infinity, sign);
3928     }
3929
3930     /* frec7(sNaN) = canonical NaN */
3931     if (float16_is_signaling_nan(f, s)) {
3932         s->float_exception_flags |= float_flag_invalid;
3933         return float16_default_nan(s);
3934     }
3935
3936     /* frec7(qNaN) = canonical NaN */
3937     if (float16_is_quiet_nan(f, s)) {
3938         return float16_default_nan(s);
3939     }
3940
3941     /* +-normal, +-subnormal */
3942     uint64_t val = frec7(f, exp_size, frac_size, s);
3943     return make_float16(val);
3944 }
3945
3946 static float32 frec7_s(float32 f, float_status *s)
3947 {
3948     int exp_size = 8, frac_size = 23;
3949     bool sign = float32_is_neg(f);
3950
3951     /* frec7(+-inf) = +-0 */
3952     if (float32_is_infinity(f)) {
3953         return float32_set_sign(float32_zero, sign);
3954     }
3955
3956     /* frec7(+-0) = +-inf */
3957     if (float32_is_zero(f)) {
3958         s->float_exception_flags |= float_flag_divbyzero;
3959         return float32_set_sign(float32_infinity, sign);
3960     }
3961
3962     /* frec7(sNaN) = canonical NaN */
3963     if (float32_is_signaling_nan(f, s)) {
3964         s->float_exception_flags |= float_flag_invalid;
3965         return float32_default_nan(s);
3966     }
3967
3968     /* frec7(qNaN) = canonical NaN */
3969     if (float32_is_quiet_nan(f, s)) {
3970         return float32_default_nan(s);
3971     }
3972
3973     /* +-normal, +-subnormal */
3974     uint64_t val = frec7(f, exp_size, frac_size, s);
3975     return make_float32(val);
3976 }
3977
3978 static float64 frec7_d(float64 f, float_status *s)
3979 {
3980     int exp_size = 11, frac_size = 52;
3981     bool sign = float64_is_neg(f);
3982
3983     /* frec7(+-inf) = +-0 */
3984     if (float64_is_infinity(f)) {
3985         return float64_set_sign(float64_zero, sign);
3986     }
3987
3988     /* frec7(+-0) = +-inf */
3989     if (float64_is_zero(f)) {
3990         s->float_exception_flags |= float_flag_divbyzero;
3991         return float64_set_sign(float64_infinity, sign);
3992     }
3993
3994     /* frec7(sNaN) = canonical NaN */
3995     if (float64_is_signaling_nan(f, s)) {
3996         s->float_exception_flags |= float_flag_invalid;
3997         return float64_default_nan(s);
3998     }
3999
4000     /* frec7(qNaN) = canonical NaN */
4001     if (float64_is_quiet_nan(f, s)) {
4002         return float64_default_nan(s);
4003     }
4004
4005     /* +-normal, +-subnormal */
4006     uint64_t val = frec7(f, exp_size, frac_size, s);
4007     return make_float64(val);
4008 }
4009
/*
 * vfrec7.v: bind the frec7_h/_s/_d element operations to 16-, 32- and
 * 64-bit elements and emit the corresponding helpers.
 * NOTE(review): the second GEN_VEXT_V_ENV argument looks like the element
 * size in bytes -- confirm against the macro definition.
 */
RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
GEN_VEXT_V_ENV(vfrec7_v_h, 2)
GEN_VEXT_V_ENV(vfrec7_v_w, 4)
GEN_VEXT_V_ENV(vfrec7_v_d, 8)
4016
/*
 * Vector Floating-Point MIN/MAX Instructions
 *
 * vfmin uses floatN_minimum_number and vfmax uses floatN_maximum_number
 * as the element operation.  Each is instantiated in vector-vector (_vv)
 * and vector-scalar (_vf) form for 16-, 32- and 64-bit elements.
 */
/* vfmin.vv / vfmin.vf */
RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
GEN_VEXT_VF(vfmin_vf_h, 2)
GEN_VEXT_VF(vfmin_vf_w, 4)
GEN_VEXT_VF(vfmin_vf_d, 8)

/* vfmax.vv / vfmax.vf */
RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
GEN_VEXT_VF(vfmax_vf_h, 2)
GEN_VEXT_VF(vfmax_vf_w, 4)
GEN_VEXT_VF(vfmax_vf_d, 8)
4043
4044 /* Vector Floating-Point Sign-Injection Instructions */
4045 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
4046 {
4047     return deposit64(b, 0, 15, a);
4048 }
4049
4050 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
4051 {
4052     return deposit64(b, 0, 31, a);
4053 }
4054
4055 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
4056 {
4057     return deposit64(b, 0, 63, a);
4058 }
4059
/*
 * vfsgnj.vv / vfsgnj.vf: instantiate the fsgnj16/32/64 element operations
 * in vector-vector and vector-scalar form for 16-, 32- and 64-bit elements.
 */
RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
GEN_VEXT_VF(vfsgnj_vf_h, 2)
GEN_VEXT_VF(vfsgnj_vf_w, 4)
GEN_VEXT_VF(vfsgnj_vf_d, 8)
4072
4073 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
4074 {
4075     return deposit64(~b, 0, 15, a);
4076 }
4077
4078 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
4079 {
4080     return deposit64(~b, 0, 31, a);
4081 }
4082
4083 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
4084 {
4085     return deposit64(~b, 0, 63, a);
4086 }
4087
/*
 * vfsgnjn.vv / vfsgnjn.vf: instantiate the fsgnjn16/32/64 element
 * operations in vector-vector and vector-scalar form for 16-, 32- and
 * 64-bit elements.
 */
RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
GEN_VEXT_VF(vfsgnjn_vf_h, 2)
GEN_VEXT_VF(vfsgnjn_vf_w, 4)
GEN_VEXT_VF(vfsgnjn_vf_d, 8)
4100
4101 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4102 {
4103     return deposit64(b ^ a, 0, 15, a);
4104 }
4105
4106 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4107 {
4108     return deposit64(b ^ a, 0, 31, a);
4109 }
4110
4111 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4112 {
4113     return deposit64(b ^ a, 0, 63, a);
4114 }
4115
/*
 * vfsgnjx.vv / vfsgnjx.vf: instantiate the fsgnjx16/32/64 element
 * operations in vector-vector and vector-scalar form for 16-, 32- and
 * 64-bit elements.
 */
RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
GEN_VEXT_VF(vfsgnjx_vf_h, 2)
GEN_VEXT_VF(vfsgnjx_vf_w, 4)
GEN_VEXT_VF(vfsgnjx_vf_d, 8)
4128
4129 /* Vector Floating-Point Compare Instructions */
/*
 * Generate helper NAME for a vector-vector floating-point compare.
 *
 * For each element index i in [env->vstart, env->vl) the helper loads
 * vs1[i] and vs2[i] as ETYPE (index adjusted by H) and writes
 * DO_OP(s2, s1, &env->fp_status) to mask bit i of vd.  When vm is clear,
 * elements whose v0 mask bit is clear are skipped and their destination
 * bit is left unchanged.  env->vstart is reset to 0 afterwards.  If
 * vext_vta_all_1s(desc) is non-zero, the mask bits from where the loop
 * stopped up to cfg.vlen are set to 1.
 */
#define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)            \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,   \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    uint32_t vm = vext_vm(desc);                              \
    uint32_t vl = env->vl;                                    \
    uint32_t total_elems = env_archcpu(env)->cfg.vlen;        \
    uint32_t vta_all_1s = vext_vta_all_1s(desc);              \
    uint32_t i;                                               \
                                                              \
    for (i = env->vstart; i < vl; i++) {                      \
        ETYPE s1 = *((ETYPE *)vs1 + H(i));                    \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
        if (!vm && !vext_elem_mask(v0, i)) {                  \
            continue;                                         \
        }                                                     \
        vext_set_elem_mask(vd, i,                             \
                           DO_OP(s2, s1, &env->fp_status));   \
    }                                                         \
    env->vstart = 0;                                          \
    /* mask destination register are always tail-agnostic */  \
    /* set tail elements to 1s */                             \
    if (vta_all_1s) {                                         \
        for (; i < total_elems; i++) {                        \
            vext_set_elem_mask(vd, i, 1);                     \
        }                                                     \
    }                                                         \
}
4158
/* vmfeq.vv: element-wise equality using the quiet floatN_eq_quiet compare */
GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4162
/*
 * Generate helper NAME for a vector-scalar floating-point compare.
 *
 * Same structure as the vector-vector form, except that the second operand
 * is the scalar s1, truncated to ETYPE: for each element index i in
 * [env->vstart, env->vl) that is not masked off, mask bit i of vd is set
 * to DO_OP(vs2[i], (ETYPE)s1, &env->fp_status).  env->vstart is reset to 0
 * afterwards, and if vext_vta_all_1s(desc) is non-zero the remaining mask
 * bits up to cfg.vlen are set to 1.
 */
#define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
                  CPURISCVState *env, uint32_t desc)                \
{                                                                   \
    uint32_t vm = vext_vm(desc);                                    \
    uint32_t vl = env->vl;                                          \
    uint32_t total_elems = env_archcpu(env)->cfg.vlen;              \
    uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
    uint32_t i;                                                     \
                                                                    \
    for (i = env->vstart; i < vl; i++) {                            \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                          \
        if (!vm && !vext_elem_mask(v0, i)) {                        \
            continue;                                               \
        }                                                           \
        vext_set_elem_mask(vd, i,                                   \
                           DO_OP(s2, (ETYPE)s1, &env->fp_status));  \
    }                                                               \
    env->vstart = 0;                                                \
    /* mask destination register are always tail-agnostic */        \
    /* set tail elements to 1s */                                   \
    if (vta_all_1s) {                                               \
        for (; i < total_elems; i++) {                              \
            vext_set_elem_mask(vd, i, 1);                           \
        }                                                           \
    }                                                               \
}
4190
/* vmfeq.vf: equality against a scalar using the quiet floatN_eq_quiet */
GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4194
4195 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4196 {
4197     FloatRelation compare = float16_compare_quiet(a, b, s);
4198     return compare != float_relation_equal;
4199 }
4200
4201 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4202 {
4203     FloatRelation compare = float32_compare_quiet(a, b, s);
4204     return compare != float_relation_equal;
4205 }
4206
4207 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4208 {
4209     FloatRelation compare = float64_compare_quiet(a, b, s);
4210     return compare != float_relation_equal;
4211 }
4212
/* vmfne.vv / vmfne.vf: not-equal, built on the vmfneN helpers */
GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)

/* vmflt.vv / vmflt.vf: less-than, using floatN_lt */
GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)

/* vmfle.vv / vmfle.vf: less-than-or-equal, using floatN_le */
GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4233
4234 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4235 {
4236     FloatRelation compare = float16_compare(a, b, s);
4237     return compare == float_relation_greater;
4238 }
4239
4240 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4241 {
4242     FloatRelation compare = float32_compare(a, b, s);
4243     return compare == float_relation_greater;
4244 }
4245
4246 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4247 {
4248     FloatRelation compare = float64_compare(a, b, s);
4249     return compare == float_relation_greater;
4250 }
4251
/* vmfgt.vf: greater-than against a scalar (only the _vf form is generated) */
GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4255
4256 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4257 {
4258     FloatRelation compare = float16_compare(a, b, s);
4259     return compare == float_relation_greater ||
4260            compare == float_relation_equal;
4261 }
4262
4263 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4264 {
4265     FloatRelation compare = float32_compare(a, b, s);
4266     return compare == float_relation_greater ||
4267            compare == float_relation_equal;
4268 }
4269
4270 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4271 {
4272     FloatRelation compare = float64_compare(a, b, s);
4273     return compare == float_relation_greater ||
4274            compare == float_relation_equal;
4275 }
4276
/* vmfge.vf: greater-or-equal against a scalar (only the _vf form here) */
GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
4280
4281 /* Vector Floating-Point Classify Instruction */
/*
 * Define do_NAME(vd, vs2, i): load element i of vs2 as T2 (index adjusted
 * by HS2) into a TX2 temporary, apply the one-argument OP to it and store
 * the result as TD into element i of vd (index adjusted by HD).
 */
#define OPIVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
static void do_##NAME(void *vd, void *vs2, int i)      \
{                                                      \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                    \
    *((TD *)vd + HD(i)) = OP(s2);                      \
}
4288
/*
 * Generate helper NAME for a unary vector operation that needs no
 * floating-point status.
 *
 * Calls do_NAME(vd, vs2, i) for every element index i in
 * [env->vstart, env->vl), skipping elements whose v0 mask bit is clear
 * when vm is clear.  env->vstart is reset to 0 afterwards and the tail,
 * from byte offset vl * ESZ to total_elems * ESZ, is handed to
 * vext_set_elems_1s() together with the vta setting.  ESZ is the
 * destination element size in bytes.
 */
#define GEN_VEXT_V(NAME, ESZ)                          \
void HELPER(NAME)(void *vd, void *v0, void *vs2,       \
                  CPURISCVState *env, uint32_t desc)   \
{                                                      \
    uint32_t vm = vext_vm(desc);                       \
    uint32_t vl = env->vl;                             \
    uint32_t total_elems =                             \
        vext_get_total_elems(env, desc, ESZ);          \
    uint32_t vta = vext_vta(desc);                     \
    uint32_t i;                                        \
                                                       \
    for (i = env->vstart; i < vl; i++) {               \
        if (!vm && !vext_elem_mask(v0, i)) {           \
            continue;                                  \
        }                                              \
        do_##NAME(vd, vs2, i);                         \
    }                                                  \
    env->vstart = 0;                                   \
    /* set tail elements to 1s */                      \
    vext_set_elems_1s(vd, vta, vl * ESZ,               \
                      total_elems * ESZ);              \
}
4311
4312 target_ulong fclass_h(uint64_t frs1)
4313 {
4314     float16 f = frs1;
4315     bool sign = float16_is_neg(f);
4316
4317     if (float16_is_infinity(f)) {
4318         return sign ? 1 << 0 : 1 << 7;
4319     } else if (float16_is_zero(f)) {
4320         return sign ? 1 << 3 : 1 << 4;
4321     } else if (float16_is_zero_or_denormal(f)) {
4322         return sign ? 1 << 2 : 1 << 5;
4323     } else if (float16_is_any_nan(f)) {
4324         float_status s = { }; /* for snan_bit_is_one */
4325         return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4326     } else {
4327         return sign ? 1 << 1 : 1 << 6;
4328     }
4329 }
4330
4331 target_ulong fclass_s(uint64_t frs1)
4332 {
4333     float32 f = frs1;
4334     bool sign = float32_is_neg(f);
4335
4336     if (float32_is_infinity(f)) {
4337         return sign ? 1 << 0 : 1 << 7;
4338     } else if (float32_is_zero(f)) {
4339         return sign ? 1 << 3 : 1 << 4;
4340     } else if (float32_is_zero_or_denormal(f)) {
4341         return sign ? 1 << 2 : 1 << 5;
4342     } else if (float32_is_any_nan(f)) {
4343         float_status s = { }; /* for snan_bit_is_one */
4344         return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4345     } else {
4346         return sign ? 1 << 1 : 1 << 6;
4347     }
4348 }
4349
4350 target_ulong fclass_d(uint64_t frs1)
4351 {
4352     float64 f = frs1;
4353     bool sign = float64_is_neg(f);
4354
4355     if (float64_is_infinity(f)) {
4356         return sign ? 1 << 0 : 1 << 7;
4357     } else if (float64_is_zero(f)) {
4358         return sign ? 1 << 3 : 1 << 4;
4359     } else if (float64_is_zero_or_denormal(f)) {
4360         return sign ? 1 << 2 : 1 << 5;
4361     } else if (float64_is_any_nan(f)) {
4362         float_status s = { }; /* for snan_bit_is_one */
4363         return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4364     } else {
4365         return sign ? 1 << 1 : 1 << 6;
4366     }
4367 }
4368
/*
 * vfclass.v: element operations built from fclass_h/_s/_d via OPIVV1, and
 * the helpers built via GEN_VEXT_V with element sizes of 2, 4 and 8 bytes.
 */
RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
GEN_VEXT_V(vfclass_v_h, 2)
GEN_VEXT_V(vfclass_v_w, 4)
GEN_VEXT_V(vfclass_v_d, 8)
4375
4376 /* Vector Floating-Point Merge Instruction */
4377
/*
 * Generate helper NAME for a vector/scalar merge.
 *
 * For each element index i in [env->vstart, env->vl): when vm is clear and
 * the v0 mask bit for i is clear, vd[i] receives vs2[i]; otherwise vd[i]
 * receives the scalar s1 (converted to ETYPE by the assignment).
 * env->vstart is reset to 0 afterwards and the tail, from byte offset
 * vl * esz to total_elems * esz, is handed to vext_set_elems_1s() together
 * with the vta setting.
 */
#define GEN_VFMERGE_VF(NAME, ETYPE, H)                        \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    uint32_t vm = vext_vm(desc);                              \
    uint32_t vl = env->vl;                                    \
    uint32_t esz = sizeof(ETYPE);                             \
    uint32_t total_elems =                                    \
        vext_get_total_elems(env, desc, esz);                 \
    uint32_t vta = vext_vta(desc);                            \
    uint32_t i;                                               \
                                                              \
    for (i = env->vstart; i < vl; i++) {                      \
        ETYPE s2 = *((ETYPE *)vs2 + H(i));                    \
        *((ETYPE *)vd + H(i))                                 \
          = (!vm && !vext_elem_mask(v0, i) ? s2 : s1);        \
    }                                                         \
    env->vstart = 0;                                          \
    /* set tail elements to 1s */                             \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);  \
}
4399
/* vfmerge.vfm helpers for 16-, 32- and 64-bit elements */
GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
4403
/*
 * Single-Width Floating-Point/Integer Type-Convert Instructions
 *
 * Source and destination elements have the same width; each conversion is
 * instantiated for 16-, 32- and 64-bit elements using the matching
 * softfloat conversion routine as the element operation.
 */
/* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)

/* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)

/* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)

/* vfcvt.f.x.v vd, vs2, vm # Convert signed integer to float. */
RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4436
/*
 * Widening Floating-Point/Integer Type-Convert Instructions
 *
 * The destination element is twice as wide as the source element.
 */
/*
 * (TD, T2, TX2): destination element type, source element type, and the
 * type the source value is held in when passed to the operation.
 */
#define WOP_UU_B uint16_t, uint8_t,  uint8_t
#define WOP_UU_H uint32_t, uint16_t, uint16_t
#define WOP_UU_W uint64_t, uint32_t, uint32_t
/* vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.*/
RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)

/* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)

/* vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float */
RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)

/* vfwcvt.f.x.v vd, vs2, vm # Convert signed integer to double-width float. */
RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4469
4470 /*
4471  * vfwcvt.f.f.v vd, vs2, vm
4472  * Convert single-width float to double-width float.
4473  */
4474 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4475 {
4476     return float16_to_float32(a, true, s);
4477 }
4478
/* vfwcvt.f.f.v: 16->32 bit via vfwcvtffv16, 32->64 bit via float32_to_float64 */
RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
4483
/*
 * Narrowing Floating-Point/Integer Type-Convert Instructions
 *
 * The source element is twice as wide as the destination element.
 */
/*
 * (TD, T2, TX2): destination element type, source element type, and the
 * type the source value is held in when passed to the operation.
 */
#define NOP_UU_B uint8_t,  uint16_t, uint32_t
#define NOP_UU_H uint16_t, uint32_t, uint32_t
#define NOP_UU_W uint32_t, uint64_t, uint64_t
/* vfncvt.xu.f.w vd, vs2, vm # Convert double-width float to unsigned integer. */
RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)

/* vfncvt.x.f.w vd, vs2, vm # Convert double-width float to signed integer. */
RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)

/* vfncvt.f.xu.w vd, vs2, vm # Convert double-width unsigned integer to float */
RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)

/* vfncvt.f.x.w vd, vs2, vm # Convert double-width signed integer to float. */
RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4516
4517 /* vfncvt.f.f.v vd, vs2, vm # Convert double float to single-width float. */
4518 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4519 {
4520     return float32_to_float16(a, true, s);
4521 }
4522
/*
 * Instantiate the narrowing float-to-float conversion helpers: the _h variant
 * converts through vfncvtffv16(), the _w variant calls float64_to_float32()
 * directly.
 */
RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)
4527
4528 /*
4529  *** Vector Reduction Operations
4530  */
4531 /* Vector Single-Width Integer Reduction Instructions */
/*
 * Generate the integer reduction helper NAME.
 *   TD, TS2 - accumulator/destination type and vs2 element type
 *   HD, HS2 - index macros applied to TD and TS2 element numbers
 *   OP      - binary operation folded over the active elements
 * The accumulator starts as element 0 of vs1; every active vs2 element in
 * [vstart, vl) is cast to TD and folded in; the result lands in vd[0].
 */
#define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)          \
void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
        void *vs2, CPURISCVState *env, uint32_t desc)     \
{                                                         \
    const uint32_t vm = vext_vm(desc);                    \
    const uint32_t vl = env->vl;                          \
    const uint32_t esz = sizeof(TD);                      \
    const uint32_t vlenb = simd_maxsz(desc);              \
    const uint32_t vta = vext_vta(desc);                  \
    TD acc = *((TD *)vs1 + HD(0));                        \
    uint32_t idx;                                         \
                                                          \
    for (idx = env->vstart; idx < vl; idx++) {            \
        /* masked-off elements do not contribute */       \
        if (vm || vext_elem_mask(v0, idx)) {              \
            TS2 elem = *((TS2 *)vs2 + HS2(idx));          \
            acc = OP(acc, (TD)elem);                      \
        }                                                 \
    }                                                     \
    *((TD *)vd + HD(0)) = acc;                            \
    env->vstart = 0;                                      \
    /* set tail elements to 1s */                         \
    vext_set_elems_1s(vd, vta, esz, vlenb);               \
}
4556
/*
 * Integer reduction helpers.  Each line instantiates GEN_VEXT_RED with
 * (NAME, TD, TS2, HD, HS2, OP); the signedness of TD/TS2 is what separates
 * the signed and unsigned max/min variants, which share DO_MAX / DO_MIN.
 */
/* vd[0] = sum(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredsum_vs_b, int8_t,  int8_t,  H1, H1, DO_ADD)
GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)

/* vd[0] = maxu(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredmaxu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MAX)
GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)

/* vd[0] = max(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredmax_vs_b, int8_t,  int8_t,  H1, H1, DO_MAX)
GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)

/* vd[0] = minu(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredminu_vs_b, uint8_t,  uint8_t,  H1, H1, DO_MIN)
GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)

/* vd[0] = min(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredmin_vs_b, int8_t,  int8_t,  H1, H1, DO_MIN)
GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)

/* vd[0] = and(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredand_vs_b, int8_t,  int8_t,  H1, H1, DO_AND)
GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)

/* vd[0] = or(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredor_vs_b, int8_t,  int8_t,  H1, H1, DO_OR)
GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)

/* vd[0] = xor(vs1[0], vs2[*]) */
GEN_VEXT_RED(vredxor_vs_b, int8_t,  int8_t,  H1, H1, DO_XOR)
GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)

/* Vector Widening Integer Reduction Instructions */
/*
 * Here TD is twice as wide as TS2; the (TD) cast inside GEN_VEXT_RED widens
 * each source element before it is added.
 */
/* signed sum reduction into double-width accumulator */
GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t,  H2, H1, DO_ADD)
GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)

/* Unsigned sum reduction into double-width accumulator */
GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t,  H2, H1, DO_ADD)
GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4615
4616 /* Vector Single-Width Floating-Point Reduction Instructions */
/*
 * Generate the floating-point reduction helper NAME.  Same shape as the
 * integer reduction generator, except that OP takes a third argument, the
 * float status at &env->fp_status.
 *   TD, TS2 - accumulator/destination type and vs2 element type
 *   HD, HS2 - index macros applied to TD and TS2 element numbers
 */
#define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)          \
void HELPER(NAME)(void *vd, void *v0, void *vs1,           \
                  void *vs2, CPURISCVState *env,           \
                  uint32_t desc)                           \
{                                                          \
    const uint32_t vm = vext_vm(desc);                     \
    const uint32_t vl = env->vl;                           \
    const uint32_t esz = sizeof(TD);                       \
    const uint32_t vlenb = simd_maxsz(desc);               \
    const uint32_t vta = vext_vta(desc);                   \
    TD acc = *((TD *)vs1 + HD(0));                         \
    uint32_t idx;                                          \
                                                           \
    for (idx = env->vstart; idx < vl; idx++) {             \
        /* masked-off elements do not contribute */        \
        if (vm || vext_elem_mask(v0, idx)) {               \
            TS2 elem = *((TS2 *)vs2 + HS2(idx));           \
            acc = OP(acc, (TD)elem, &env->fp_status);      \
        }                                                  \
    }                                                      \
    *((TD *)vd + HD(0)) = acc;                             \
    env->vstart = 0;                                       \
    /* set tail elements to 1s */                          \
    vext_set_elems_1s(vd, vta, esz, vlenb);                \
}
4642
/*
 * Floating-point reduction helpers.  Each line instantiates GEN_VEXT_FRED
 * with (NAME, TD, TS2, HD, HS2, OP); the values are carried as raw unsigned
 * integers of the element width and handed to the named softfloat routine.
 */
/* Unordered sum */
GEN_VEXT_FRED(vfredsum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
GEN_VEXT_FRED(vfredsum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
GEN_VEXT_FRED(vfredsum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)

/* Maximum value */
GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2, float16_maximum_number)
GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4, float32_maximum_number)
GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8, float64_maximum_number)

/* Minimum value */
GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2, float16_minimum_number)
GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4, float32_minimum_number)
GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8, float64_minimum_number)
4657
4658 /* Vector Widening Floating-Point Reduction Instructions */
4659 /* Unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4660 void HELPER(vfwredsum_vs_h)(void *vd, void *v0, void *vs1,
4661                             void *vs2, CPURISCVState *env, uint32_t desc)
4662 {
4663     uint32_t vm = vext_vm(desc);
4664     uint32_t vl = env->vl;
4665     uint32_t esz = sizeof(uint32_t);
4666     uint32_t vlenb = simd_maxsz(desc);
4667     uint32_t vta = vext_vta(desc);
4668     uint32_t i;
4669     uint32_t s1 =  *((uint32_t *)vs1 + H4(0));
4670
4671     for (i = env->vstart; i < vl; i++) {
4672         uint16_t s2 = *((uint16_t *)vs2 + H2(i));
4673         if (!vm && !vext_elem_mask(v0, i)) {
4674             continue;
4675         }
4676         s1 = float32_add(s1, float16_to_float32(s2, true, &env->fp_status),
4677                          &env->fp_status);
4678     }
4679     *((uint32_t *)vd + H4(0)) = s1;
4680     env->vstart = 0;
4681     /* set tail elements to 1s */
4682     vext_set_elems_1s(vd, vta, esz, vlenb);
4683 }
4684
4685 void HELPER(vfwredsum_vs_w)(void *vd, void *v0, void *vs1,
4686                             void *vs2, CPURISCVState *env, uint32_t desc)
4687 {
4688     uint32_t vm = vext_vm(desc);
4689     uint32_t vl = env->vl;
4690     uint32_t esz = sizeof(uint64_t);
4691     uint32_t vlenb = simd_maxsz(desc);
4692     uint32_t vta = vext_vta(desc);
4693     uint32_t i;
4694     uint64_t s1 =  *((uint64_t *)vs1);
4695
4696     for (i = env->vstart; i < vl; i++) {
4697         uint32_t s2 = *((uint32_t *)vs2 + H4(i));
4698         if (!vm && !vext_elem_mask(v0, i)) {
4699             continue;
4700         }
4701         s1 = float64_add(s1, float32_to_float64(s2, &env->fp_status),
4702                          &env->fp_status);
4703     }
4704     *((uint64_t *)vd) = s1;
4705     env->vstart = 0;
4706     /* set tail elements to 1s */
4707     vext_set_elems_1s(vd, vta, esz, vlenb);
4708 }
4709
4710 /*
4711  *** Vector Mask Operations
4712  */
4713 /* Vector Mask-Register Logical Instructions */
/*
 * Generate the mask-register logical helper NAME.
 * For every bit index in [vstart, vl) the result bit is OP(vs2 bit, vs1 bit).
 * v0 is accepted but not consulted.  When vta_all_1s is set, the remaining
 * bits up to cfg.vlen are written as 1.
 */
#define GEN_VEXT_MASK_VV(NAME, OP)                        \
void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
                  void *vs2, CPURISCVState *env,          \
                  uint32_t desc)                          \
{                                                         \
    const uint32_t vl = env->vl;                          \
    const uint32_t total_elems =                          \
        env_archcpu(env)->cfg.vlen;                       \
    const uint32_t vta_all_1s = vext_vta_all_1s(desc);    \
    uint32_t idx = env->vstart;                           \
    int bit1, bit2;                                       \
                                                          \
    while (idx < vl) {                                    \
        bit1 = vext_elem_mask(vs1, idx);                  \
        bit2 = vext_elem_mask(vs2, idx);                  \
        vext_set_elem_mask(vd, idx, OP(bit2, bit1));      \
        idx++;                                            \
    }                                                     \
    env->vstart = 0;                                      \
    /*                                                    \
     * Mask destination registers are always              \
     * tail-agnostic: set tail elements to 1s.            \
     */                                                   \
    if (vta_all_1s) {                                     \
        while (idx < total_elems) {                       \
            vext_set_elem_mask(vd, idx, 1);               \
            idx++;                                        \
        }                                                 \
    }                                                     \
}
4741
/*
 * Single-bit logical operations for the mask-register instructions.
 * Operands are expected to be 0 or 1; the result is 0 or 1.  Every argument
 * is parenthesized so that a compound expression passed as N or M is
 * evaluated as a unit.
 */
#define DO_NAND(N, M)  (!((N) & (M)))
#define DO_ANDNOT(N, M)  ((N) & !(M))
#define DO_NOR(N, M)  (!((N) | (M)))
#define DO_ORNOT(N, M)  ((N) | !(M))
#define DO_XNOR(N, M)  (!((N) ^ (M)))
4747
/*
 * Mask-register logical helpers.  GEN_VEXT_MASK_VV evaluates OP(vs2 bit,
 * vs1 bit), so for the asymmetric operations (DO_ANDNOT, DO_ORNOT) it is
 * the vs1 bit that gets inverted.
 */
GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4756
4757 /* Vector count population in mask vcpop */
4758 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4759                              uint32_t desc)
4760 {
4761     target_ulong cnt = 0;
4762     uint32_t vm = vext_vm(desc);
4763     uint32_t vl = env->vl;
4764     int i;
4765
4766     for (i = env->vstart; i < vl; i++) {
4767         if (vm || vext_elem_mask(v0, i)) {
4768             if (vext_elem_mask(vs2, i)) {
4769                 cnt++;
4770             }
4771         }
4772     }
4773     env->vstart = 0;
4774     return cnt;
4775 }
4776
4777 /* vfirst find-first-set mask bit*/
4778 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4779                               uint32_t desc)
4780 {
4781     uint32_t vm = vext_vm(desc);
4782     uint32_t vl = env->vl;
4783     int i;
4784
4785     for (i = env->vstart; i < vl; i++) {
4786         if (vm || vext_elem_mask(v0, i)) {
4787             if (vext_elem_mask(vs2, i)) {
4788                 return i;
4789             }
4790         }
4791     }
4792     env->vstart = 0;
4793     return -1LL;
4794 }
4795
/* Selects which of the three set-mask flavours vmsetm() implements. */
enum set_mask_type {
    ONLY_FIRST    = 1,  /* used by the vmsof_m helper */
    INCLUDE_FIRST = 2,  /* used by the vmsif_m helper */
    BEFORE_FIRST  = 3,  /* used by the vmsbf_m helper */
};
4801
4802 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4803                    uint32_t desc, enum set_mask_type type)
4804 {
4805     uint32_t vm = vext_vm(desc);
4806     uint32_t vl = env->vl;
4807     uint32_t total_elems = env_archcpu(env)->cfg.vlen;
4808     uint32_t vta_all_1s = vext_vta_all_1s(desc);
4809     int i;
4810     bool first_mask_bit = false;
4811
4812     for (i = env->vstart; i < vl; i++) {
4813         if (!vm && !vext_elem_mask(v0, i)) {
4814             continue;
4815         }
4816         /* write a zero to all following active elements */
4817         if (first_mask_bit) {
4818             vext_set_elem_mask(vd, i, 0);
4819             continue;
4820         }
4821         if (vext_elem_mask(vs2, i)) {
4822             first_mask_bit = true;
4823             if (type == BEFORE_FIRST) {
4824                 vext_set_elem_mask(vd, i, 0);
4825             } else {
4826                 vext_set_elem_mask(vd, i, 1);
4827             }
4828         } else {
4829             if (type == ONLY_FIRST) {
4830                 vext_set_elem_mask(vd, i, 0);
4831             } else {
4832                 vext_set_elem_mask(vd, i, 1);
4833             }
4834         }
4835     }
4836     env->vstart = 0;
4837     /* mask destination register are always tail-agnostic */
4838     /* set tail elements to 1s */
4839     if (vta_all_1s) {
4840         for (; i < total_elems; i++) {
4841             vext_set_elem_mask(vd, i, 1);
4842         }
4843     }
4844 }
4845
/*
 * vmsbf.m helper: runs vmsetm() in BEFORE_FIRST mode, i.e. active elements
 * ahead of the first set vs2 bit become 1, that element and all later active
 * elements become 0.
 */
void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
                     uint32_t desc)
{
    vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
}
4851
/*
 * vmsif.m helper: runs vmsetm() in INCLUDE_FIRST mode, i.e. active elements
 * up to and including the first set vs2 bit become 1, later active elements
 * become 0.
 */
void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
                     uint32_t desc)
{
    vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
}
4857
/*
 * vmsof.m helper: runs vmsetm() in ONLY_FIRST mode, i.e. only the active
 * element holding the first set vs2 bit becomes 1, every other active
 * element becomes 0.
 */
void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
                     uint32_t desc)
{
    vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
}
4863
/* Vector Iota Instruction */
/*
 * Generate the viota.m helper NAME for element type ETYPE (H is the index
 * macro for that element width).  Each active destination element in
 * [vstart, vl) receives the number of set vs2 mask bits seen at earlier
 * active positions; inactive elements are skipped and do not advance the
 * count.
 */
#define GEN_VEXT_VIOTA_M(NAME, ETYPE, H)                                  \
void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env,      \
                  uint32_t desc)                                          \
{                                                                         \
    const uint32_t vm = vext_vm(desc);                                    \
    const uint32_t vl = env->vl;                                          \
    const uint32_t esz = sizeof(ETYPE);                                   \
    const uint32_t total_elems = vext_get_total_elems(env, desc, esz);    \
    const uint32_t vta = vext_vta(desc);                                  \
    uint32_t running = 0;                                                 \
    int idx;                                                              \
                                                                          \
    for (idx = env->vstart; idx < vl; idx++) {                            \
        if (vm || vext_elem_mask(v0, idx)) {                              \
            /* store the count before looking at this element's bit */    \
            *((ETYPE *)vd + H(idx)) = running;                            \
            if (vext_elem_mask(vs2, idx)) {                               \
                running += 1;                                             \
            }                                                             \
        }                                                                 \
    }                                                                     \
    env->vstart = 0;                                                      \
    /* set tail elements to 1s */                                         \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
}
4890
/* viota.m helpers, one per destination element width (8/16/32/64 bits). */
GEN_VEXT_VIOTA_M(viota_m_b, uint8_t,  H1)
GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
4895
/* Vector Element Index Instruction */
/*
 * Generate the vid.v helper NAME for element type ETYPE (H is the index
 * macro for that element width): every active destination element in
 * [vstart, vl) is set to its own element index.
 */
#define GEN_VEXT_VID_V(NAME, ETYPE, H)                                    \
void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc)  \
{                                                                         \
    const uint32_t vm = vext_vm(desc);                                    \
    const uint32_t vl = env->vl;                                          \
    const uint32_t esz = sizeof(ETYPE);                                   \
    const uint32_t total_elems = vext_get_total_elems(env, desc, esz);    \
    const uint32_t vta = vext_vta(desc);                                  \
    int idx;                                                              \
                                                                          \
    for (idx = env->vstart; idx < vl; idx++) {                            \
        if (vm || vext_elem_mask(v0, idx)) {                              \
            *((ETYPE *)vd + H(idx)) = idx;                                \
        }                                                                 \
    }                                                                     \
    env->vstart = 0;                                                      \
    /* set tail elements to 1s */                                         \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
}
4917
/* vid.v helpers, one per destination element width (8/16/32/64 bits). */
GEN_VEXT_VID_V(vid_v_b, uint8_t,  H1)
GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
4922
4923 /*
4924  *** Vector Permutation Instructions
4925  */
4926
4927 /* Vector Slide Instructions */
/*
 * Generate the vslideup.vx helper NAME for element type ETYPE (H is the
 * index macro for that element width).
 *
 * For every active i in [max(vstart, s1), vl): vd[i] = vs2[i - s1].
 * Destination elements below the slide offset are never written.
 * vstart is cleared once the element loop has finished, as the other
 * helpers in this file do.
 */
#define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H)                              \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
                  CPURISCVState *env, uint32_t desc)                      \
{                                                                         \
    uint32_t vm = vext_vm(desc);                                          \
    uint32_t vl = env->vl;                                                \
    uint32_t esz = sizeof(ETYPE);                                         \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
    uint32_t vta = vext_vta(desc);                                        \
    target_ulong offset = s1, i_min, i;                                   \
                                                                          \
    /* start at the offset so that i - offset cannot wrap */              \
    i_min = MAX(env->vstart, offset);                                     \
    for (i = i_min; i < vl; i++) {                                        \
        if (!vm && !vext_elem_mask(v0, i)) {                              \
            continue;                                                     \
        }                                                                 \
        *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset));          \
    }                                                                     \
    env->vstart = 0;                                                      \
    /* set tail elements to 1s */                                         \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
}
4949
/*
 * vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i]
 * One helper per element width (8/16/32/64 bits).
 */
GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t,  H1)
GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
4955
/*
 * Generate the vslidedown.vx helper NAME for element type ETYPE (H is the
 * index macro for that element width).
 *
 * Active elements in [vstart, split) are copied from vs2[i + s1]; active
 * elements in [split, vl) are zeroed because their source index would be at
 * or beyond vlmax.  split is min(vlmax - s1, vl) (0 when s1 >= vlmax),
 * raised to vstart if that is larger.
 */
#define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H)                            \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
                  CPURISCVState *env, uint32_t desc)                      \
{                                                                         \
    const uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));     \
    const uint32_t vm = vext_vm(desc);                                    \
    const uint32_t vl = env->vl;                                          \
    const uint32_t esz = sizeof(ETYPE);                                   \
    const uint32_t total_elems = vext_get_total_elems(env, desc, esz);    \
    const uint32_t vta = vext_vta(desc);                                  \
    /* how many destination elements still have a source below vlmax */   \
    const target_ulong in_range = s1 < vlmax ? vlmax - s1 : 0;            \
    target_ulong split, idx;                                              \
                                                                          \
    split = MAX(MIN(in_range, vl), env->vstart);                          \
    for (idx = env->vstart; idx < split; ++idx) {                         \
        if (!vm && !vext_elem_mask(v0, idx)) {                            \
            continue;                                                     \
        }                                                                 \
        *((ETYPE *)vd + H(idx)) = *((ETYPE *)vs2 + H(idx + s1));          \
    }                                                                     \
                                                                          \
    for (idx = split; idx < vl; ++idx) {                                  \
        if (!vm && !vext_elem_mask(v0, idx)) {                            \
            continue;                                                     \
        }                                                                 \
        *((ETYPE *)vd + H(idx)) = 0;                                      \
    }                                                                     \
                                                                          \
    env->vstart = 0;                                                      \
    /* set tail elements to 1s */                                         \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
}
4985
/*
 * vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1]
 * One helper per element width (8/16/32/64 bits).
 */
GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t,  H1)
GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
4991
/*
 * Generate the static worker vslide1up_<BITWIDTH>() that the vslide1up.vx
 * and vfslide1up.vf helpers forward to.
 *
 * For every active i in [vstart, vl): vd[0] = s1, vd[i] = vs2[i - 1].
 *
 * s1 is declared uint64_t because the floating-point wrappers hand over a
 * uint64_t scalar; taking it as target_ulong would narrow that value first
 * whenever target_ulong is narrower than 64 bits.  The integer wrappers pass
 * a target_ulong, which converts to uint64_t without loss.  The store into
 * an ETYPE element still truncates to the element width.
 */
#define GEN_VEXT_VSLIE1UP(BITWIDTH, H)                                      \
static void vslide1up_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
                     void *vs2, CPURISCVState *env, uint32_t desc)          \
{                                                                           \
    typedef uint##BITWIDTH##_t ETYPE;                                       \
    uint32_t vm = vext_vm(desc);                                            \
    uint32_t vl = env->vl;                                                  \
    uint32_t esz = sizeof(ETYPE);                                           \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);            \
    uint32_t vta = vext_vta(desc);                                          \
    uint32_t i;                                                             \
                                                                            \
    for (i = env->vstart; i < vl; i++) {                                    \
        if (!vm && !vext_elem_mask(v0, i)) {                                \
            continue;                                                       \
        }                                                                   \
        if (i == 0) {                                                       \
            /* the scalar fills the slot vacated at the bottom */           \
            *((ETYPE *)vd + H(i)) = s1;                                     \
        } else {                                                            \
            *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1));             \
        }                                                                   \
    }                                                                       \
    env->vstart = 0;                                                        \
    /* set tail elements to 1s */                                           \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                \
}
5018
/* Emit vslide1up_8(), vslide1up_16(), vslide1up_32() and vslide1up_64(). */
GEN_VEXT_VSLIE1UP(8,  H1)
GEN_VEXT_VSLIE1UP(16, H2)
GEN_VEXT_VSLIE1UP(32, H4)
GEN_VEXT_VSLIE1UP(64, H8)
5023
/*
 * Emit the public helper NAME for vslide1up.vx; it only forwards its
 * arguments to the static vslide1up_<BITWIDTH>() worker.
 */
#define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH)                     \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
                  CPURISCVState *env, uint32_t desc)              \
{                                                                 \
    vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);             \
}

/* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
5036
/*
 * Generate the static worker vslide1down_<BITWIDTH>() that the
 * vslide1down.vx and vfslide1down.vf helpers forward to.
 *
 * For every active i in [vstart, vl): vd[vl - 1] = s1, vd[i] = vs2[i + 1].
 *
 * s1 is declared uint64_t because the floating-point wrappers hand over a
 * uint64_t scalar; taking it as target_ulong would narrow that value first
 * whenever target_ulong is narrower than 64 bits.  The integer wrappers pass
 * a target_ulong, which converts to uint64_t without loss.  The store into
 * an ETYPE element still truncates to the element width.
 */
#define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H)                                     \
static void vslide1down_##BITWIDTH(void *vd, void *v0, uint64_t s1,           \
                       void *vs2, CPURISCVState *env, uint32_t desc)          \
{                                                                             \
    typedef uint##BITWIDTH##_t ETYPE;                                         \
    uint32_t vm = vext_vm(desc);                                              \
    uint32_t vl = env->vl;                                                    \
    uint32_t esz = sizeof(ETYPE);                                             \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);              \
    uint32_t vta = vext_vta(desc);                                            \
    uint32_t i;                                                               \
                                                                              \
    for (i = env->vstart; i < vl; i++) {                                      \
        if (!vm && !vext_elem_mask(v0, i)) {                                  \
            continue;                                                         \
        }                                                                     \
        if (i == vl - 1) {                                                    \
            /* the scalar fills the slot vacated at the top */                \
            *((ETYPE *)vd + H(i)) = s1;                                       \
        } else {                                                              \
            *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1));               \
        }                                                                     \
    }                                                                         \
    env->vstart = 0;                                                          \
    /* set tail elements to 1s */                                             \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);                  \
}
5063
/*
 * Emit vslide1down_8(), vslide1down_16(), vslide1down_32() and
 * vslide1down_64().
 */
GEN_VEXT_VSLIDE1DOWN(8,  H1)
GEN_VEXT_VSLIDE1DOWN(16, H2)
GEN_VEXT_VSLIDE1DOWN(32, H4)
GEN_VEXT_VSLIDE1DOWN(64, H8)
5068
/*
 * Emit the public helper NAME for vslide1down.vx; it only forwards its
 * arguments to the static vslide1down_<BITWIDTH>() worker.
 */
#define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH)                   \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
                  CPURISCVState *env, uint32_t desc)              \
{                                                                 \
    vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);           \
}

/* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
5081
/* Vector Floating-Point Slide Instructions */
/*
 * Emit the public helper NAME for vfslide1up.vf.  The scalar arrives as a
 * uint64_t and is forwarded, with the other arguments, to the same
 * vslide1up_<BITWIDTH>() worker the integer form uses.
 */
#define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH)                \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc);         \
}

/* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
5094
/*
 * Emit the public helper NAME for vfslide1down.vf.  The scalar arrives as a
 * uint64_t and is forwarded, with the other arguments, to the same
 * vslide1down_<BITWIDTH>() worker the integer form uses.
 */
#define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH)              \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
                  CPURISCVState *env, uint32_t desc)          \
{                                                             \
    vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc);       \
}

/* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
5106
/* Vector Register Gather Instruction */
/*
 * Generate the vrgather helper NAME.
 *   TS1, HS1 - index element type (read from vs1) and its index macro
 *   TS2, HS2 - data element type (vs2 and vd) and its index macro
 * For every active i in [vstart, vl): vd[i] = vs2[vs1[i]], or 0 when the
 * index read from vs1 is at or beyond vlmax.
 */
#define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2)                    \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
                  CPURISCVState *env, uint32_t desc)                      \
{                                                                         \
    const uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2)));       \
    const uint32_t vm = vext_vm(desc);                                    \
    const uint32_t vl = env->vl;                                          \
    const uint32_t esz = sizeof(TS2);                                     \
    const uint32_t total_elems = vext_get_total_elems(env, desc, esz);    \
    const uint32_t vta = vext_vta(desc);                                  \
    uint32_t idx;                                                         \
                                                                          \
    for (idx = env->vstart; idx < vl; idx++) {                            \
        if (vm || vext_elem_mask(v0, idx)) {                              \
            uint64_t src = *((TS1 *)vs1 + HS1(idx));                      \
            TS2 value = 0;                                                \
                                                                          \
            /* out-of-range indices yield zero */                         \
            if (src < vlmax) {                                            \
                value = *((TS2 *)vs2 + HS2(src));                         \
            }                                                             \
            *((TS2 *)vd + HS2(idx)) = value;                              \
        }                                                                 \
    }                                                                     \
    env->vstart = 0;                                                      \
    /* set tail elements to 1s */                                         \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
}
5136
5137 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
5138 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t,  uint8_t,  H1, H1)
5139 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
5140 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
5141 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
5142
5143 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t,  H2, H1)
5144 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5145 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5146 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
5147
5148 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H)                              \
5149 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2,         \
5150                   CPURISCVState *env, uint32_t desc)                      \
5151 {                                                                         \
5152     uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE)));           \
5153     uint32_t vm = vext_vm(desc);                                          \
5154     uint32_t vl = env->vl;                                                \
5155     uint32_t esz = sizeof(ETYPE);                                         \
5156     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5157     uint32_t vta = vext_vta(desc);                                        \
5158     uint64_t index = s1;                                                  \
5159     uint32_t i;                                                           \
5160                                                                           \
5161     for (i = env->vstart; i < vl; i++) {                                  \
5162         if (!vm && !vext_elem_mask(v0, i)) {                              \
5163             continue;                                                     \
5164         }                                                                 \
5165         if (index >= vlmax) {                                             \
5166             *((ETYPE *)vd + H(i)) = 0;                                    \
5167         } else {                                                          \
5168             *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index));           \
5169         }                                                                 \
5170     }                                                                     \
5171     env->vstart = 0;                                                      \
5172     /* set tail elements to 1s */                                         \
5173     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5174 }
5175
5176 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[rs1] */
5177 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t,  H1)
5178 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5179 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5180 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
5181
5182 /* Vector Compress Instruction */
5183 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H)                             \
5184 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,               \
5185                   CPURISCVState *env, uint32_t desc)                      \
5186 {                                                                         \
5187     uint32_t vl = env->vl;                                                \
5188     uint32_t esz = sizeof(ETYPE);                                         \
5189     uint32_t total_elems = vext_get_total_elems(env, desc, esz);          \
5190     uint32_t vta = vext_vta(desc);                                        \
5191     uint32_t num = 0, i;                                                  \
5192                                                                           \
5193     for (i = env->vstart; i < vl; i++) {                                  \
5194         if (!vext_elem_mask(vs1, i)) {                                    \
5195             continue;                                                     \
5196         }                                                                 \
5197         *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i));                 \
5198         num++;                                                            \
5199     }                                                                     \
5200     env->vstart = 0;                                                      \
5201     /* set tail elements to 1s */                                         \
5202     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
5203 }
5204
5205 /* Compress into vd elements of vs2 where vs1 is enabled */
5206 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t,  H1)
5207 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5208 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5209 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
5210
5211 /* Vector Whole Register Move */
5212 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5213 {
5214     /* EEW = SEW */
5215     uint32_t maxsz = simd_maxsz(desc);
5216     uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5217     uint32_t startb = env->vstart * sewb;
5218     uint32_t i = startb;
5219
5220     memcpy((uint8_t *)vd + H1(i),
5221            (uint8_t *)vs2 + H1(i),
5222            maxsz - startb);
5223
5224     env->vstart = 0;
5225 }
5226
5227 /* Vector Integer Extension */
5228 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1)            \
5229 void HELPER(NAME)(void *vd, void *v0, void *vs2,                 \
5230                   CPURISCVState *env, uint32_t desc)             \
5231 {                                                                \
5232     uint32_t vl = env->vl;                                       \
5233     uint32_t vm = vext_vm(desc);                                 \
5234     uint32_t esz = sizeof(ETYPE);                                \
5235     uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5236     uint32_t vta = vext_vta(desc);                               \
5237     uint32_t i;                                                  \
5238                                                                  \
5239     for (i = env->vstart; i < vl; i++) {                         \
5240         if (!vm && !vext_elem_mask(v0, i)) {                     \
5241             continue;                                            \
5242         }                                                        \
5243         *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i));       \
5244     }                                                            \
5245     env->vstart = 0;                                             \
5246     /* set tail elements to 1s */                                \
5247     vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);     \
5248 }
5249
5250 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t,  H2, H1)
5251 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5252 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5253 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t,  H4, H1)
5254 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5255 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t,  H8, H1)
5256
5257 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t,  H2, H1)
5258 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5259 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5260 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t,  H4, H1)
5261 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5262 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t,  H8, H1)