]> git.proxmox.com Git - mirror_qemu.git/blob - target/riscv/vector_helper.c
Merge patch series "target/riscv: some vector_helper.c cleanups"
[mirror_qemu.git] / target / riscv / vector_helper.c
1 /*
2 * RISC-V Vector Extension Helpers for QEMU.
3 *
4 * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2 or later, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "exec/helper-proto.h"
26 #include "fpu/softfloat.h"
27 #include "tcg/tcg-gvec-desc.h"
28 #include "internals.h"
29 #include <math.h>
30
31 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
32 target_ulong s2)
33 {
34 int vlmax, vl;
35 RISCVCPU *cpu = env_archcpu(env);
36 uint64_t lmul = FIELD_EX64(s2, VTYPE, VLMUL);
37 uint16_t sew = 8 << FIELD_EX64(s2, VTYPE, VSEW);
38 uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
39 int xlen = riscv_cpu_xlen(env);
40 bool vill = (s2 >> (xlen - 1)) & 0x1;
41 target_ulong reserved = s2 &
42 MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
43 xlen - 1 - R_VTYPE_RESERVED_SHIFT);
44
45 if (lmul & 4) {
46 /* Fractional LMUL. */
47 if (lmul == 4 ||
48 cpu->cfg.elen >> (8 - lmul) < sew) {
49 vill = true;
50 }
51 }
52
53 if ((sew > cpu->cfg.elen)
54 || vill
55 || (ediv != 0)
56 || (reserved != 0)) {
57 /* only set vill bit. */
58 env->vill = 1;
59 env->vtype = 0;
60 env->vl = 0;
61 env->vstart = 0;
62 return 0;
63 }
64
65 vlmax = vext_get_vlmax(cpu, s2);
66 if (s1 <= vlmax) {
67 vl = s1;
68 } else {
69 vl = vlmax;
70 }
71 env->vl = vl;
72 env->vtype = s2;
73 env->vstart = 0;
74 env->vill = 0;
75 return vl;
76 }
77
/*
 * Vector data is stored in host-endian 64-bit chunks, so indexing units
 * narrower than 64 bits needs a fixup on big-endian hosts.  The helpers
 * below apply H1/H2/H4/H8 to the index of 1/2/4/8-byte elements; on
 * little-endian hosts every macro is the identity.
 */
#if !HOST_BIG_ENDIAN
#define H1(x)   (x)
#define H1_2(x) (x)
#define H1_4(x) (x)
#define H2(x)   (x)
#define H4(x)   (x)
#define H8(x)   (x)
#else
#define H1(x)   ((x) ^ 7)
#define H1_2(x) ((x) ^ 6)
#define H1_4(x) ((x) ^ 4)
#define H2(x)   ((x) ^ 3)
#define H4(x)   ((x) ^ 1)
#define H8(x)   ((x))
#endif
97
98 static inline uint32_t vext_nf(uint32_t desc)
99 {
100 return FIELD_EX32(simd_data(desc), VDATA, NF);
101 }
102
103 static inline uint32_t vext_vm(uint32_t desc)
104 {
105 return FIELD_EX32(simd_data(desc), VDATA, VM);
106 }
107
108 /*
109 * Encode LMUL to lmul as following:
110 * LMUL vlmul lmul
111 * 1 000 0
112 * 2 001 1
113 * 4 010 2
114 * 8 011 3
115 * - 100 -
116 * 1/8 101 -3
117 * 1/4 110 -2
118 * 1/2 111 -1
119 */
120 static inline int32_t vext_lmul(uint32_t desc)
121 {
122 return sextract32(FIELD_EX32(simd_data(desc), VDATA, LMUL), 0, 3);
123 }
124
125 static inline uint32_t vext_vta(uint32_t desc)
126 {
127 return FIELD_EX32(simd_data(desc), VDATA, VTA);
128 }
129
130 static inline uint32_t vext_vma(uint32_t desc)
131 {
132 return FIELD_EX32(simd_data(desc), VDATA, VMA);
133 }
134
135 static inline uint32_t vext_vta_all_1s(uint32_t desc)
136 {
137 return FIELD_EX32(simd_data(desc), VDATA, VTA_ALL_1S);
138 }
139
/*
 * Get the maximum number of elements that can be operated on (VLMAX).
 *
 * log2_esz: log2 of element size in bytes.
 *
 * As simd_desc supports at most 2048 bytes, the max vlen is 1024 bits,
 * so vlen in bytes (vlenb) is encoded as maxsz in the descriptor.
 */
static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
{
    uint32_t vlenb = simd_maxsz(desc);
    /* VLMAX = vlenb * 2^(lmul - log2_esz); a negative shift means divide. */
    int shift = vext_lmul(desc) - log2_esz;

    if (shift < 0) {
        return vlenb >> -shift;
    }
    return vlenb << shift;
}
157
158 /*
159 * Get number of total elements, including prestart, body and tail elements.
160 * Note that when LMUL < 1, the tail includes the elements past VLMAX that
161 * are held in the same vector register.
162 */
163 static inline uint32_t vext_get_total_elems(CPURISCVState *env, uint32_t desc,
164 uint32_t esz)
165 {
166 uint32_t vlenb = simd_maxsz(desc);
167 uint32_t sew = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
168 int8_t emul = ctzl(esz) - ctzl(sew) + vext_lmul(desc) < 0 ? 0 :
169 ctzl(esz) - ctzl(sew) + vext_lmul(desc);
170 return (vlenb << emul) / esz;
171 }
172
173 static inline target_ulong adjust_addr(CPURISCVState *env, target_ulong addr)
174 {
175 return (addr & env->cur_pmmask) | env->cur_pmbase;
176 }
177
178 /*
179 * This function checks watchpoint before real load operation.
180 *
181 * In softmmu mode, the TLB API probe_access is enough for watchpoint check.
182 * In user mode, there is no watchpoint support now.
183 *
184 * It will trigger an exception if there is no mapping in TLB
185 * and page table walk can't fill the TLB entry. Then the guest
186 * software can return here after process the exception or never return.
187 */
188 static void probe_pages(CPURISCVState *env, target_ulong addr,
189 target_ulong len, uintptr_t ra,
190 MMUAccessType access_type)
191 {
192 target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
193 target_ulong curlen = MIN(pagelen, len);
194
195 probe_access(env, adjust_addr(env, addr), curlen, access_type,
196 cpu_mmu_index(env, false), ra);
197 if (len > curlen) {
198 addr += curlen;
199 curlen = len - curlen;
200 probe_access(env, adjust_addr(env, addr), curlen, access_type,
201 cpu_mmu_index(env, false), ra);
202 }
203 }
204
/*
 * Set agnostic elements to 1s.
 *
 * base:        start of the destination register data
 * is_agnostic: 0 selects the "undisturbed" policy (nothing is written)
 * cnt:         byte offset of the first byte to fill
 * tot:         byte offset one past the last byte to fill
 *
 * Bytes [cnt, tot) are filled with 0xff.  An empty or inverted range
 * (cnt >= tot) is a no-op; the comparison is done before subtracting so
 * that an inverted range cannot wrap around into a huge memset length.
 */
static void vext_set_elems_1s(void *base, uint32_t is_agnostic, uint32_t cnt,
                              uint32_t tot)
{
    if (is_agnostic == 0) {
        /* policy undisturbed */
        return;
    }
    if (cnt >= tot) {
        return;
    }
    /* Byte arithmetic on a typed pointer rather than on void *. */
    memset((uint8_t *)base + cnt, 0xff, tot - cnt);
}
218
/*
 * Write the mask bit of element 'index' in the mask register v0, which is
 * accessed as an array of 64-bit words with one bit per element.
 */
static inline void vext_set_elem_mask(void *v0, int index,
                                      uint8_t value)
{
    uint64_t *mask = v0;
    int word = index / 64;
    int bit = index % 64;

    mask[word] = deposit64(mask[word], bit, 1, value);
}
227
/*
 * Earlier designs (pre-0.9) had a varying number of bits
 * per mask value (MLEN). In the 0.9 design, MLEN=1.
 * (Section 4.5)
 *
 * Return the mask bit (0 or 1) of element 'index' in v0, which is
 * accessed as an array of 64-bit words.
 */
static inline int vext_elem_mask(void *v0, int index)
{
    const uint64_t *mask = v0;
    int word = index / 64;
    int bit = index % 64;

    return (mask[word] >> bit) & 1;
}
239
240 /* elements operations for load and store */
241 typedef void vext_ldst_elem_fn(CPURISCVState *env, target_ulong addr,
242 uint32_t idx, void *vd, uintptr_t retaddr);
243
244 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF) \
245 static void NAME(CPURISCVState *env, abi_ptr addr, \
246 uint32_t idx, void *vd, uintptr_t retaddr)\
247 { \
248 ETYPE *cur = ((ETYPE *)vd + H(idx)); \
249 *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr); \
250 } \
251
252 GEN_VEXT_LD_ELEM(lde_b, int8_t, H1, ldsb)
253 GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
254 GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
255 GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
256
257 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF) \
258 static void NAME(CPURISCVState *env, abi_ptr addr, \
259 uint32_t idx, void *vd, uintptr_t retaddr)\
260 { \
261 ETYPE data = *((ETYPE *)vd + H(idx)); \
262 cpu_##STSUF##_data_ra(env, addr, data, retaddr); \
263 }
264
265 GEN_VEXT_ST_ELEM(ste_b, int8_t, H1, stb)
266 GEN_VEXT_ST_ELEM(ste_h, int16_t, H2, stw)
267 GEN_VEXT_ST_ELEM(ste_w, int32_t, H4, stl)
268 GEN_VEXT_ST_ELEM(ste_d, int64_t, H8, stq)
269
270 static void vext_set_tail_elems_1s(CPURISCVState *env, target_ulong vl,
271 void *vd, uint32_t desc, uint32_t nf,
272 uint32_t esz, uint32_t max_elems)
273 {
274 uint32_t total_elems = vext_get_total_elems(env, desc, esz);
275 uint32_t vlenb = riscv_cpu_cfg(env)->vlen >> 3;
276 uint32_t vta = vext_vta(desc);
277 uint32_t registers_used;
278 int k;
279
280 for (k = 0; k < nf; ++k) {
281 vext_set_elems_1s(vd, vta, (k * max_elems + vl) * esz,
282 (k * max_elems + max_elems) * esz);
283 }
284
285 if (nf * max_elems % total_elems != 0) {
286 registers_used = ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
287 vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
288 registers_used * vlenb);
289 }
290 }
291
292 /*
293 *** stride: access vector element from strided memory
294 */
295 static void
296 vext_ldst_stride(void *vd, void *v0, target_ulong base,
297 target_ulong stride, CPURISCVState *env,
298 uint32_t desc, uint32_t vm,
299 vext_ldst_elem_fn *ldst_elem,
300 uint32_t log2_esz, uintptr_t ra)
301 {
302 uint32_t i, k;
303 uint32_t nf = vext_nf(desc);
304 uint32_t max_elems = vext_max_elems(desc, log2_esz);
305 uint32_t esz = 1 << log2_esz;
306 uint32_t vma = vext_vma(desc);
307
308 for (i = env->vstart; i < env->vl; i++, env->vstart++) {
309 k = 0;
310 while (k < nf) {
311 if (!vm && !vext_elem_mask(v0, i)) {
312 /* set masked-off elements to 1s */
313 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
314 (i + k * max_elems + 1) * esz);
315 k++;
316 continue;
317 }
318 target_ulong addr = base + stride * i + (k << log2_esz);
319 ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
320 k++;
321 }
322 }
323 env->vstart = 0;
324
325 vext_set_tail_elems_1s(env, env->vl, vd, desc, nf, esz, max_elems);
326 }
327
328 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN) \
329 void HELPER(NAME)(void *vd, void * v0, target_ulong base, \
330 target_ulong stride, CPURISCVState *env, \
331 uint32_t desc) \
332 { \
333 uint32_t vm = vext_vm(desc); \
334 vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN, \
335 ctzl(sizeof(ETYPE)), GETPC()); \
336 }
337
338 GEN_VEXT_LD_STRIDE(vlse8_v, int8_t, lde_b)
339 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h)
340 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w)
341 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d)
342
343 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN) \
344 void HELPER(NAME)(void *vd, void *v0, target_ulong base, \
345 target_ulong stride, CPURISCVState *env, \
346 uint32_t desc) \
347 { \
348 uint32_t vm = vext_vm(desc); \
349 vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN, \
350 ctzl(sizeof(ETYPE)), GETPC()); \
351 }
352
353 GEN_VEXT_ST_STRIDE(vsse8_v, int8_t, ste_b)
354 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h)
355 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w)
356 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
357
358 /*
359 *** unit-stride: access elements stored contiguously in memory
360 */
361
362 /* unmasked unit-stride load and store operation*/
363 static void
364 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
365 vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
366 uintptr_t ra)
367 {
368 uint32_t i, k;
369 uint32_t nf = vext_nf(desc);
370 uint32_t max_elems = vext_max_elems(desc, log2_esz);
371 uint32_t esz = 1 << log2_esz;
372
373 /* load bytes from guest memory */
374 for (i = env->vstart; i < evl; i++, env->vstart++) {
375 k = 0;
376 while (k < nf) {
377 target_ulong addr = base + ((i * nf + k) << log2_esz);
378 ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
379 k++;
380 }
381 }
382 env->vstart = 0;
383
384 vext_set_tail_elems_1s(env, evl, vd, desc, nf, esz, max_elems);
385 }
386
387 /*
388 * masked unit-stride load and store operation will be a special case of stride,
389 * stride = NF * sizeof (MTYPE)
390 */
391
392 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN) \
393 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base, \
394 CPURISCVState *env, uint32_t desc) \
395 { \
396 uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE)); \
397 vext_ldst_stride(vd, v0, base, stride, env, desc, false, LOAD_FN, \
398 ctzl(sizeof(ETYPE)), GETPC()); \
399 } \
400 \
401 void HELPER(NAME)(void *vd, void *v0, target_ulong base, \
402 CPURISCVState *env, uint32_t desc) \
403 { \
404 vext_ldst_us(vd, base, env, desc, LOAD_FN, \
405 ctzl(sizeof(ETYPE)), env->vl, GETPC()); \
406 }
407
408 GEN_VEXT_LD_US(vle8_v, int8_t, lde_b)
409 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h)
410 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w)
411 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d)
412
413 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN) \
414 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base, \
415 CPURISCVState *env, uint32_t desc) \
416 { \
417 uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE)); \
418 vext_ldst_stride(vd, v0, base, stride, env, desc, false, STORE_FN, \
419 ctzl(sizeof(ETYPE)), GETPC()); \
420 } \
421 \
422 void HELPER(NAME)(void *vd, void *v0, target_ulong base, \
423 CPURISCVState *env, uint32_t desc) \
424 { \
425 vext_ldst_us(vd, base, env, desc, STORE_FN, \
426 ctzl(sizeof(ETYPE)), env->vl, GETPC()); \
427 }
428
429 GEN_VEXT_ST_US(vse8_v, int8_t, ste_b)
430 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h)
431 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w)
432 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d)
433
434 /*
435 *** unit stride mask load and store, EEW = 1
436 */
437 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
438 CPURISCVState *env, uint32_t desc)
439 {
440 /* evl = ceil(vl/8) */
441 uint8_t evl = (env->vl + 7) >> 3;
442 vext_ldst_us(vd, base, env, desc, lde_b,
443 0, evl, GETPC());
444 }
445
446 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
447 CPURISCVState *env, uint32_t desc)
448 {
449 /* evl = ceil(vl/8) */
450 uint8_t evl = (env->vl + 7) >> 3;
451 vext_ldst_us(vd, base, env, desc, ste_b,
452 0, evl, GETPC());
453 }
454
455 /*
456 *** index: access vector element from indexed memory
457 */
458 typedef target_ulong vext_get_index_addr(target_ulong base,
459 uint32_t idx, void *vs2);
460
461 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H) \
462 static target_ulong NAME(target_ulong base, \
463 uint32_t idx, void *vs2) \
464 { \
465 return (base + *((ETYPE *)vs2 + H(idx))); \
466 }
467
468 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t, H1)
469 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
470 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
471 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
472
473 static inline void
474 vext_ldst_index(void *vd, void *v0, target_ulong base,
475 void *vs2, CPURISCVState *env, uint32_t desc,
476 vext_get_index_addr get_index_addr,
477 vext_ldst_elem_fn *ldst_elem,
478 uint32_t log2_esz, uintptr_t ra)
479 {
480 uint32_t i, k;
481 uint32_t nf = vext_nf(desc);
482 uint32_t vm = vext_vm(desc);
483 uint32_t max_elems = vext_max_elems(desc, log2_esz);
484 uint32_t esz = 1 << log2_esz;
485 uint32_t vma = vext_vma(desc);
486
487 /* load bytes from guest memory */
488 for (i = env->vstart; i < env->vl; i++, env->vstart++) {
489 k = 0;
490 while (k < nf) {
491 if (!vm && !vext_elem_mask(v0, i)) {
492 /* set masked-off elements to 1s */
493 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
494 (i + k * max_elems + 1) * esz);
495 k++;
496 continue;
497 }
498 abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
499 ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
500 k++;
501 }
502 }
503 env->vstart = 0;
504
505 vext_set_tail_elems_1s(env, env->vl, vd, desc, nf, esz, max_elems);
506 }
507
508 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN) \
509 void HELPER(NAME)(void *vd, void *v0, target_ulong base, \
510 void *vs2, CPURISCVState *env, uint32_t desc) \
511 { \
512 vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN, \
513 LOAD_FN, ctzl(sizeof(ETYPE)), GETPC()); \
514 }
515
516 GEN_VEXT_LD_INDEX(vlxei8_8_v, int8_t, idx_b, lde_b)
517 GEN_VEXT_LD_INDEX(vlxei8_16_v, int16_t, idx_b, lde_h)
518 GEN_VEXT_LD_INDEX(vlxei8_32_v, int32_t, idx_b, lde_w)
519 GEN_VEXT_LD_INDEX(vlxei8_64_v, int64_t, idx_b, lde_d)
520 GEN_VEXT_LD_INDEX(vlxei16_8_v, int8_t, idx_h, lde_b)
521 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h)
522 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w)
523 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d)
524 GEN_VEXT_LD_INDEX(vlxei32_8_v, int8_t, idx_w, lde_b)
525 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h)
526 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w)
527 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d)
528 GEN_VEXT_LD_INDEX(vlxei64_8_v, int8_t, idx_d, lde_b)
529 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h)
530 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w)
531 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d)
532
533 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN) \
534 void HELPER(NAME)(void *vd, void *v0, target_ulong base, \
535 void *vs2, CPURISCVState *env, uint32_t desc) \
536 { \
537 vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN, \
538 STORE_FN, ctzl(sizeof(ETYPE)), \
539 GETPC()); \
540 }
541
542 GEN_VEXT_ST_INDEX(vsxei8_8_v, int8_t, idx_b, ste_b)
543 GEN_VEXT_ST_INDEX(vsxei8_16_v, int16_t, idx_b, ste_h)
544 GEN_VEXT_ST_INDEX(vsxei8_32_v, int32_t, idx_b, ste_w)
545 GEN_VEXT_ST_INDEX(vsxei8_64_v, int64_t, idx_b, ste_d)
546 GEN_VEXT_ST_INDEX(vsxei16_8_v, int8_t, idx_h, ste_b)
547 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h)
548 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w)
549 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d)
550 GEN_VEXT_ST_INDEX(vsxei32_8_v, int8_t, idx_w, ste_b)
551 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h)
552 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w)
553 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d)
554 GEN_VEXT_ST_INDEX(vsxei64_8_v, int8_t, idx_d, ste_b)
555 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h)
556 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w)
557 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d)
558
559 /*
560 *** unit-stride fault-only-fisrt load instructions
561 */
562 static inline void
563 vext_ldff(void *vd, void *v0, target_ulong base,
564 CPURISCVState *env, uint32_t desc,
565 vext_ldst_elem_fn *ldst_elem,
566 uint32_t log2_esz, uintptr_t ra)
567 {
568 void *host;
569 uint32_t i, k, vl = 0;
570 uint32_t nf = vext_nf(desc);
571 uint32_t vm = vext_vm(desc);
572 uint32_t max_elems = vext_max_elems(desc, log2_esz);
573 uint32_t esz = 1 << log2_esz;
574 uint32_t vma = vext_vma(desc);
575 target_ulong addr, offset, remain;
576
577 /* probe every access*/
578 for (i = env->vstart; i < env->vl; i++) {
579 if (!vm && !vext_elem_mask(v0, i)) {
580 continue;
581 }
582 addr = adjust_addr(env, base + i * (nf << log2_esz));
583 if (i == 0) {
584 probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD);
585 } else {
586 /* if it triggers an exception, no need to check watchpoint */
587 remain = nf << log2_esz;
588 while (remain > 0) {
589 offset = -(addr | TARGET_PAGE_MASK);
590 host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD,
591 cpu_mmu_index(env, false));
592 if (host) {
593 #ifdef CONFIG_USER_ONLY
594 if (page_check_range(addr, offset, PAGE_READ) < 0) {
595 vl = i;
596 goto ProbeSuccess;
597 }
598 #else
599 probe_pages(env, addr, offset, ra, MMU_DATA_LOAD);
600 #endif
601 } else {
602 vl = i;
603 goto ProbeSuccess;
604 }
605 if (remain <= offset) {
606 break;
607 }
608 remain -= offset;
609 addr = adjust_addr(env, addr + offset);
610 }
611 }
612 }
613 ProbeSuccess:
614 /* load bytes from guest memory */
615 if (vl != 0) {
616 env->vl = vl;
617 }
618 for (i = env->vstart; i < env->vl; i++) {
619 k = 0;
620 while (k < nf) {
621 if (!vm && !vext_elem_mask(v0, i)) {
622 /* set masked-off elements to 1s */
623 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
624 (i + k * max_elems + 1) * esz);
625 k++;
626 continue;
627 }
628 target_ulong addr = base + ((i * nf + k) << log2_esz);
629 ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
630 k++;
631 }
632 }
633 env->vstart = 0;
634
635 vext_set_tail_elems_1s(env, env->vl, vd, desc, nf, esz, max_elems);
636 }
637
638 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN) \
639 void HELPER(NAME)(void *vd, void *v0, target_ulong base, \
640 CPURISCVState *env, uint32_t desc) \
641 { \
642 vext_ldff(vd, v0, base, env, desc, LOAD_FN, \
643 ctzl(sizeof(ETYPE)), GETPC()); \
644 }
645
646 GEN_VEXT_LDFF(vle8ff_v, int8_t, lde_b)
647 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h)
648 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w)
649 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d)
650
/*
 * Element-wise binary operations, OP(N, M).
 *
 * Every argument is parenthesised so that an expression argument such as
 * "a << 2" or "x | y" keeps its own grouping after expansion.
 */
#define DO_SWAP(N, M) (M)
#define DO_AND(N, M)  ((N) & (M))
#define DO_XOR(N, M)  ((N) ^ (M))
#define DO_OR(N, M)   ((N) | (M))
#define DO_ADD(N, M)  ((N) + (M))

/* Signed min/max */
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))

/*
 * Unsigned min/max: UMTYPE must name the unsigned element type at the
 * point of use.  The cast applies to the whole argument expression.
 */
#define DO_MAXU(N, M) DO_MAX((UMTYPE)(N), (UMTYPE)(M))
#define DO_MINU(N, M) DO_MIN((UMTYPE)(N), (UMTYPE)(M))
664
665 /*
666 *** load and store whole register instructions
667 */
668 static void
669 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
670 vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uintptr_t ra)
671 {
672 uint32_t i, k, off, pos;
673 uint32_t nf = vext_nf(desc);
674 uint32_t vlenb = riscv_cpu_cfg(env)->vlen >> 3;
675 uint32_t max_elems = vlenb >> log2_esz;
676
677 k = env->vstart / max_elems;
678 off = env->vstart % max_elems;
679
680 if (off) {
681 /* load/store rest of elements of current segment pointed by vstart */
682 for (pos = off; pos < max_elems; pos++, env->vstart++) {
683 target_ulong addr = base + ((pos + k * max_elems) << log2_esz);
684 ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd, ra);
685 }
686 k++;
687 }
688
689 /* load/store elements for rest of segments */
690 for (; k < nf; k++) {
691 for (i = 0; i < max_elems; i++, env->vstart++) {
692 target_ulong addr = base + ((i + k * max_elems) << log2_esz);
693 ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
694 }
695 }
696
697 env->vstart = 0;
698 }
699
700 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN) \
701 void HELPER(NAME)(void *vd, target_ulong base, \
702 CPURISCVState *env, uint32_t desc) \
703 { \
704 vext_ldst_whole(vd, base, env, desc, LOAD_FN, \
705 ctzl(sizeof(ETYPE)), GETPC()); \
706 }
707
708 GEN_VEXT_LD_WHOLE(vl1re8_v, int8_t, lde_b)
709 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h)
710 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w)
711 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d)
712 GEN_VEXT_LD_WHOLE(vl2re8_v, int8_t, lde_b)
713 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h)
714 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w)
715 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d)
716 GEN_VEXT_LD_WHOLE(vl4re8_v, int8_t, lde_b)
717 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h)
718 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w)
719 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d)
720 GEN_VEXT_LD_WHOLE(vl8re8_v, int8_t, lde_b)
721 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h)
722 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w)
723 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d)
724
725 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN) \
726 void HELPER(NAME)(void *vd, target_ulong base, \
727 CPURISCVState *env, uint32_t desc) \
728 { \
729 vext_ldst_whole(vd, base, env, desc, STORE_FN, \
730 ctzl(sizeof(ETYPE)), GETPC()); \
731 }
732
733 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b)
734 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b)
735 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b)
736 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b)
737
/*
 * Vector Integer Arithmetic Instructions
 */

/*
 * Expand macro args before macro: lets one of the type lists below be
 * passed as a single argument and still arrive as five parameters.
 */
#define RVVCALL(macro, ...)  macro(__VA_ARGS__)

/*
 * Type lists, in the order (TD, T1, T2, TX1, TX2): destination type,
 * source 1 and source 2 element types, and the types sources 1 and 2 are
 * converted to before the operation.
 */

/* Same-width operands and result. */
#define OP_SSS_B    int8_t, int8_t, int8_t, int8_t, int8_t
#define OP_SSS_H    int16_t, int16_t, int16_t, int16_t, int16_t
#define OP_SSS_W    int32_t, int32_t, int32_t, int32_t, int32_t
#define OP_SSS_D    int64_t, int64_t, int64_t, int64_t, int64_t
#define OP_UUU_B    uint8_t, uint8_t, uint8_t, uint8_t, uint8_t
#define OP_UUU_H    uint16_t, uint16_t, uint16_t, uint16_t, uint16_t
#define OP_UUU_W    uint32_t, uint32_t, uint32_t, uint32_t, uint32_t
#define OP_UUU_D    uint64_t, uint64_t, uint64_t, uint64_t, uint64_t
#define OP_SUS_B    int8_t, uint8_t, int8_t, uint8_t, int8_t
#define OP_SUS_H    int16_t, uint16_t, int16_t, uint16_t, int16_t
#define OP_SUS_W    int32_t, uint32_t, int32_t, uint32_t, int32_t
#define OP_SUS_D    int64_t, uint64_t, int64_t, uint64_t, int64_t

/* Widening: destination is twice as wide as the sources. */
#define WOP_UUU_B   uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
#define WOP_UUU_H   uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
#define WOP_UUU_W   uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
#define WOP_SSS_B   int16_t, int8_t, int8_t, int16_t, int16_t
#define WOP_SSS_H   int32_t, int16_t, int16_t, int32_t, int32_t
#define WOP_SSS_W   int64_t, int32_t, int32_t, int64_t, int64_t
#define WOP_SUS_B   int16_t, uint8_t, int8_t, uint16_t, int16_t
#define WOP_SUS_H   int32_t, uint16_t, int16_t, uint32_t, int32_t
#define WOP_SUS_W   int64_t, uint32_t, int32_t, uint64_t, int64_t
#define WOP_SSU_B   int16_t, int8_t, uint8_t, int16_t, uint16_t
#define WOP_SSU_H   int32_t, int16_t, uint16_t, int32_t, uint32_t
#define WOP_SSU_W   int64_t, int32_t, uint32_t, int64_t, uint64_t

/* Narrowing: source 2 is twice as wide as the destination. */
#define NOP_SSS_B   int8_t, int8_t, int16_t, int8_t, int16_t
#define NOP_SSS_H   int16_t, int16_t, int32_t, int16_t, int32_t
#define NOP_SSS_W   int32_t, int32_t, int64_t, int32_t, int64_t
#define NOP_UUU_B   uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
#define NOP_UUU_H   uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
#define NOP_UUU_W   uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
776
/* Operation on element i of two source vectors, result written to vd. */
typedef void opivv2_fn(void *vd, void *vs1, void *vs2, int i);

/*
 * Define do_NAME(): read element i of vs1 (as T1, converted to TX1) and
 * of vs2 (as T2, converted to TX2), and store OP(s2, s1) into element i
 * of vd (as TD).  HD/HS1/HS2 are the index fixups for each register.
 */
#define OPIVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)    \
static void do_##NAME(void *vd, void *vs1, void *vs2, int i)    \
{                                                               \
    TX1 s1 = ((T1 *)vs1)[HS1(i)];                               \
    TX2 s2 = ((T2 *)vs2)[HS2(i)];                               \
    ((TD *)vd)[HD(i)] = OP(s2, s1);                             \
}
/*
 * Subtraction and reverse subtraction.  Arguments are parenthesised so
 * that an expression argument (e.g. "a + b") is subtracted as a whole.
 */
#define DO_SUB(N, M)  ((N) - (M))
#define DO_RSUB(N, M) ((M) - (N))
789
790 RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
791 RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
792 RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
793 RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
794 RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
795 RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
796 RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
797 RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
798
799 static void do_vext_vv(void *vd, void *v0, void *vs1, void *vs2,
800 CPURISCVState *env, uint32_t desc,
801 opivv2_fn *fn, uint32_t esz)
802 {
803 uint32_t vm = vext_vm(desc);
804 uint32_t vl = env->vl;
805 uint32_t total_elems = vext_get_total_elems(env, desc, esz);
806 uint32_t vta = vext_vta(desc);
807 uint32_t vma = vext_vma(desc);
808 uint32_t i;
809
810 for (i = env->vstart; i < vl; i++) {
811 if (!vm && !vext_elem_mask(v0, i)) {
812 /* set masked-off elements to 1s */
813 vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
814 continue;
815 }
816 fn(vd, vs1, vs2, i);
817 }
818 env->vstart = 0;
819 /* set tail elements to 1s */
820 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
821 }
822
823 /* generate the helpers for OPIVV */
824 #define GEN_VEXT_VV(NAME, ESZ) \
825 void HELPER(NAME)(void *vd, void *v0, void *vs1, \
826 void *vs2, CPURISCVState *env, \
827 uint32_t desc) \
828 { \
829 do_vext_vv(vd, v0, vs1, vs2, env, desc, \
830 do_##NAME, ESZ); \
831 }
832
833 GEN_VEXT_VV(vadd_vv_b, 1)
834 GEN_VEXT_VV(vadd_vv_h, 2)
835 GEN_VEXT_VV(vadd_vv_w, 4)
836 GEN_VEXT_VV(vadd_vv_d, 8)
837 GEN_VEXT_VV(vsub_vv_b, 1)
838 GEN_VEXT_VV(vsub_vv_h, 2)
839 GEN_VEXT_VV(vsub_vv_w, 4)
840 GEN_VEXT_VV(vsub_vv_d, 8)
841
842 typedef void opivx2_fn(void *vd, target_long s1, void *vs2, int i);
843
844 /*
845 * (T1)s1 gives the real operator type.
846 * (TX1)(T1)s1 expands the operator type of widen or narrow operations.
847 */
848 #define OPIVX2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP) \
849 static void do_##NAME(void *vd, target_long s1, void *vs2, int i) \
850 { \
851 TX2 s2 = *((T2 *)vs2 + HS2(i)); \
852 *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1); \
853 }
854
855 RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
856 RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
857 RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
858 RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
859 RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
860 RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
861 RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
862 RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
863 RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
864 RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
865 RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
866 RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
867
868 static void do_vext_vx(void *vd, void *v0, target_long s1, void *vs2,
869 CPURISCVState *env, uint32_t desc,
870 opivx2_fn fn, uint32_t esz)
871 {
872 uint32_t vm = vext_vm(desc);
873 uint32_t vl = env->vl;
874 uint32_t total_elems = vext_get_total_elems(env, desc, esz);
875 uint32_t vta = vext_vta(desc);
876 uint32_t vma = vext_vma(desc);
877 uint32_t i;
878
879 for (i = env->vstart; i < vl; i++) {
880 if (!vm && !vext_elem_mask(v0, i)) {
881 /* set masked-off elements to 1s */
882 vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
883 continue;
884 }
885 fn(vd, s1, vs2, i);
886 }
887 env->vstart = 0;
888 /* set tail elements to 1s */
889 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
890 }
891
892 /* generate the helpers for OPIVX */
893 #define GEN_VEXT_VX(NAME, ESZ) \
894 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, \
895 void *vs2, CPURISCVState *env, \
896 uint32_t desc) \
897 { \
898 do_vext_vx(vd, v0, s1, vs2, env, desc, \
899 do_##NAME, ESZ); \
900 }
901
902 GEN_VEXT_VX(vadd_vx_b, 1)
903 GEN_VEXT_VX(vadd_vx_h, 2)
904 GEN_VEXT_VX(vadd_vx_w, 4)
905 GEN_VEXT_VX(vadd_vx_d, 8)
906 GEN_VEXT_VX(vsub_vx_b, 1)
907 GEN_VEXT_VX(vsub_vx_h, 2)
908 GEN_VEXT_VX(vsub_vx_w, 4)
909 GEN_VEXT_VX(vsub_vx_d, 8)
910 GEN_VEXT_VX(vrsub_vx_b, 1)
911 GEN_VEXT_VX(vrsub_vx_h, 2)
912 GEN_VEXT_VX(vrsub_vx_w, 4)
913 GEN_VEXT_VX(vrsub_vx_d, 8)
914
915 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
916 {
917 intptr_t oprsz = simd_oprsz(desc);
918 intptr_t i;
919
920 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
921 *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
922 }
923 }
924
925 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
926 {
927 intptr_t oprsz = simd_oprsz(desc);
928 intptr_t i;
929
930 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
931 *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
932 }
933 }
934
935 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
936 {
937 intptr_t oprsz = simd_oprsz(desc);
938 intptr_t i;
939
940 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
941 *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
942 }
943 }
944
945 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
946 {
947 intptr_t oprsz = simd_oprsz(desc);
948 intptr_t i;
949
950 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
951 *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
952 }
953 }
954
/*
 * Vector Widening Integer Add/Subtract
 *
 * WOP_UUU_* and WOP_SSS_* are already defined with the other
 * (TD, T1, T2, TX1, TX2) type lists earlier in this file, so they are not
 * repeated here.  The lists below are for the ".wv" forms, where source 2
 * is already as wide as the destination.
 */
#define WOP_WUUU_B  uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
#define WOP_WUUU_H  uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
#define WOP_WUUU_W  uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
#define WOP_WSSS_B  int16_t, int8_t, int16_t, int16_t, int16_t
#define WOP_WSSS_H  int32_t, int16_t, int32_t, int32_t, int32_t
#define WOP_WSSS_W  int64_t, int32_t, int64_t, int64_t, int64_t
968 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
969 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
970 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
971 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
972 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
973 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
974 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
975 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
976 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
977 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
978 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
979 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
980 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
981 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
982 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
983 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
984 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
985 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
986 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
987 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
988 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
989 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
990 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
991 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
992 GEN_VEXT_VV(vwaddu_vv_b, 2)
993 GEN_VEXT_VV(vwaddu_vv_h, 4)
994 GEN_VEXT_VV(vwaddu_vv_w, 8)
995 GEN_VEXT_VV(vwsubu_vv_b, 2)
996 GEN_VEXT_VV(vwsubu_vv_h, 4)
997 GEN_VEXT_VV(vwsubu_vv_w, 8)
998 GEN_VEXT_VV(vwadd_vv_b, 2)
999 GEN_VEXT_VV(vwadd_vv_h, 4)
1000 GEN_VEXT_VV(vwadd_vv_w, 8)
1001 GEN_VEXT_VV(vwsub_vv_b, 2)
1002 GEN_VEXT_VV(vwsub_vv_h, 4)
1003 GEN_VEXT_VV(vwsub_vv_w, 8)
1004 GEN_VEXT_VV(vwaddu_wv_b, 2)
1005 GEN_VEXT_VV(vwaddu_wv_h, 4)
1006 GEN_VEXT_VV(vwaddu_wv_w, 8)
1007 GEN_VEXT_VV(vwsubu_wv_b, 2)
1008 GEN_VEXT_VV(vwsubu_wv_h, 4)
1009 GEN_VEXT_VV(vwsubu_wv_w, 8)
1010 GEN_VEXT_VV(vwadd_wv_b, 2)
1011 GEN_VEXT_VV(vwadd_wv_h, 4)
1012 GEN_VEXT_VV(vwadd_wv_w, 8)
1013 GEN_VEXT_VV(vwsub_wv_b, 2)
1014 GEN_VEXT_VV(vwsub_wv_h, 4)
1015 GEN_VEXT_VV(vwsub_wv_w, 8)
1016
1017 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
1018 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
1019 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
1020 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
1021 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
1022 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
1023 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
1024 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
1025 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
1026 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
1027 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
1028 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
1029 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
1030 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
1031 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
1032 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
1033 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
1034 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
1035 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
1036 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
1037 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
1038 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
1039 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
1040 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
1041 GEN_VEXT_VX(vwaddu_vx_b, 2)
1042 GEN_VEXT_VX(vwaddu_vx_h, 4)
1043 GEN_VEXT_VX(vwaddu_vx_w, 8)
1044 GEN_VEXT_VX(vwsubu_vx_b, 2)
1045 GEN_VEXT_VX(vwsubu_vx_h, 4)
1046 GEN_VEXT_VX(vwsubu_vx_w, 8)
1047 GEN_VEXT_VX(vwadd_vx_b, 2)
1048 GEN_VEXT_VX(vwadd_vx_h, 4)
1049 GEN_VEXT_VX(vwadd_vx_w, 8)
1050 GEN_VEXT_VX(vwsub_vx_b, 2)
1051 GEN_VEXT_VX(vwsub_vx_h, 4)
1052 GEN_VEXT_VX(vwsub_vx_w, 8)
1053 GEN_VEXT_VX(vwaddu_wx_b, 2)
1054 GEN_VEXT_VX(vwaddu_wx_h, 4)
1055 GEN_VEXT_VX(vwaddu_wx_w, 8)
1056 GEN_VEXT_VX(vwsubu_wx_b, 2)
1057 GEN_VEXT_VX(vwsubu_wx_h, 4)
1058 GEN_VEXT_VX(vwsubu_wx_w, 8)
1059 GEN_VEXT_VX(vwadd_wx_b, 2)
1060 GEN_VEXT_VX(vwadd_wx_h, 4)
1061 GEN_VEXT_VX(vwadd_wx_w, 8)
1062 GEN_VEXT_VX(vwsub_wx_b, 2)
1063 GEN_VEXT_VX(vwsub_wx_h, 4)
1064 GEN_VEXT_VX(vwsub_wx_w, 8)
1065
/* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
/*
 * N and M are the two operands, C is the carry/borrow input.  Every
 * argument is parenthesized so that compound expressions expand safely.
 */
#define DO_VADC(N, M, C) ((N) + (M) + (C))
#define DO_VSBC(N, M, C) ((N) - (M) - (C))
1069
1070 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP) \
1071 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \
1072 CPURISCVState *env, uint32_t desc) \
1073 { \
1074 uint32_t vl = env->vl; \
1075 uint32_t esz = sizeof(ETYPE); \
1076 uint32_t total_elems = \
1077 vext_get_total_elems(env, desc, esz); \
1078 uint32_t vta = vext_vta(desc); \
1079 uint32_t i; \
1080 \
1081 for (i = env->vstart; i < vl; i++) { \
1082 ETYPE s1 = *((ETYPE *)vs1 + H(i)); \
1083 ETYPE s2 = *((ETYPE *)vs2 + H(i)); \
1084 ETYPE carry = vext_elem_mask(v0, i); \
1085 \
1086 *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry); \
1087 } \
1088 env->vstart = 0; \
1089 /* set tail elements to 1s */ \
1090 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
1091 }
1092
1093 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t, H1, DO_VADC)
1094 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
1095 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
1096 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
1097
1098 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t, H1, DO_VSBC)
1099 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
1100 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
1101 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
1102
1103 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP) \
1104 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
1105 CPURISCVState *env, uint32_t desc) \
1106 { \
1107 uint32_t vl = env->vl; \
1108 uint32_t esz = sizeof(ETYPE); \
1109 uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
1110 uint32_t vta = vext_vta(desc); \
1111 uint32_t i; \
1112 \
1113 for (i = env->vstart; i < vl; i++) { \
1114 ETYPE s2 = *((ETYPE *)vs2 + H(i)); \
1115 ETYPE carry = vext_elem_mask(v0, i); \
1116 \
1117 *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
1118 } \
1119 env->vstart = 0; \
1120 /* set tail elements to 1s */ \
1121 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
1122 }
1123
1124 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t, H1, DO_VADC)
1125 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
1126 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
1127 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
1128
1129 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t, H1, DO_VSBC)
1130 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
1131 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
1132 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1133
/*
 * Carry-out of N + M + C and borrow-out of N - M - C, evaluated in the
 * type of N (the sum is truncated back to that type before comparing).
 * Every argument is parenthesized so that compound expressions expand
 * safely.
 */
#define DO_MADC(N, M, C) ((C) ? (__typeof(N))((N) + (M) + 1) <= (N) : \
                                (__typeof(N))((N) + (M)) < (N))
#define DO_MSBC(N, M, C) ((C) ? (N) <= (M) : (N) < (M))
1137
1138 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP) \
1139 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \
1140 CPURISCVState *env, uint32_t desc) \
1141 { \
1142 uint32_t vl = env->vl; \
1143 uint32_t vm = vext_vm(desc); \
1144 uint32_t total_elems = riscv_cpu_cfg(env)->vlen; \
1145 uint32_t vta_all_1s = vext_vta_all_1s(desc); \
1146 uint32_t i; \
1147 \
1148 for (i = env->vstart; i < vl; i++) { \
1149 ETYPE s1 = *((ETYPE *)vs1 + H(i)); \
1150 ETYPE s2 = *((ETYPE *)vs2 + H(i)); \
1151 ETYPE carry = !vm && vext_elem_mask(v0, i); \
1152 vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry)); \
1153 } \
1154 env->vstart = 0; \
1155 /* mask destination register are always tail-agnostic */ \
1156 /* set tail elements to 1s */ \
1157 if (vta_all_1s) { \
1158 for (; i < total_elems; i++) { \
1159 vext_set_elem_mask(vd, i, 1); \
1160 } \
1161 } \
1162 }
1163
1164 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t, H1, DO_MADC)
1165 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
1166 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
1167 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
1168
1169 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t, H1, DO_MSBC)
1170 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
1171 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
1172 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1173
1174 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP) \
1175 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, \
1176 void *vs2, CPURISCVState *env, uint32_t desc) \
1177 { \
1178 uint32_t vl = env->vl; \
1179 uint32_t vm = vext_vm(desc); \
1180 uint32_t total_elems = riscv_cpu_cfg(env)->vlen; \
1181 uint32_t vta_all_1s = vext_vta_all_1s(desc); \
1182 uint32_t i; \
1183 \
1184 for (i = env->vstart; i < vl; i++) { \
1185 ETYPE s2 = *((ETYPE *)vs2 + H(i)); \
1186 ETYPE carry = !vm && vext_elem_mask(v0, i); \
1187 vext_set_elem_mask(vd, i, \
1188 DO_OP(s2, (ETYPE)(target_long)s1, carry)); \
1189 } \
1190 env->vstart = 0; \
1191 /* mask destination register are always tail-agnostic */ \
1192 /* set tail elements to 1s */ \
1193 if (vta_all_1s) { \
1194 for (; i < total_elems; i++) { \
1195 vext_set_elem_mask(vd, i, 1); \
1196 } \
1197 } \
1198 }
1199
1200 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t, H1, DO_MADC)
1201 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1202 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1203 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1204
1205 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t, H1, DO_MSBC)
1206 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1207 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1208 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1209
1210 /* Vector Bitwise Logical Instructions */
1211 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1212 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1213 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1214 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1215 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1216 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1217 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1218 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1219 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1220 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1221 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1222 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1223 GEN_VEXT_VV(vand_vv_b, 1)
1224 GEN_VEXT_VV(vand_vv_h, 2)
1225 GEN_VEXT_VV(vand_vv_w, 4)
1226 GEN_VEXT_VV(vand_vv_d, 8)
1227 GEN_VEXT_VV(vor_vv_b, 1)
1228 GEN_VEXT_VV(vor_vv_h, 2)
1229 GEN_VEXT_VV(vor_vv_w, 4)
1230 GEN_VEXT_VV(vor_vv_d, 8)
1231 GEN_VEXT_VV(vxor_vv_b, 1)
1232 GEN_VEXT_VV(vxor_vv_h, 2)
1233 GEN_VEXT_VV(vxor_vv_w, 4)
1234 GEN_VEXT_VV(vxor_vv_d, 8)
1235
1236 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1237 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1238 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1239 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1240 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1241 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1242 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1243 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1244 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1245 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1246 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1247 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1248 GEN_VEXT_VX(vand_vx_b, 1)
1249 GEN_VEXT_VX(vand_vx_h, 2)
1250 GEN_VEXT_VX(vand_vx_w, 4)
1251 GEN_VEXT_VX(vand_vx_d, 8)
1252 GEN_VEXT_VX(vor_vx_b, 1)
1253 GEN_VEXT_VX(vor_vx_h, 2)
1254 GEN_VEXT_VX(vor_vx_w, 4)
1255 GEN_VEXT_VX(vor_vx_d, 8)
1256 GEN_VEXT_VX(vxor_vx_b, 1)
1257 GEN_VEXT_VX(vxor_vx_h, 2)
1258 GEN_VEXT_VX(vxor_vx_w, 4)
1259 GEN_VEXT_VX(vxor_vx_d, 8)
1260
/* Vector Single-Width Bit Shift Instructions */
/*
 * N is the value, M the shift amount.  Whether DO_SRL shifts in zeros or
 * sign bits follows from the signedness of N at the point of use.  Both
 * arguments are parenthesized so that compound expressions expand
 * safely.
 */
#define DO_SLL(N, M)  ((N) << (M))
#define DO_SRL(N, M)  ((N) >> (M))
1264
1265 /* generate the helpers for shift instructions with two vector operators */
1266 #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK) \
1267 void HELPER(NAME)(void *vd, void *v0, void *vs1, \
1268 void *vs2, CPURISCVState *env, uint32_t desc) \
1269 { \
1270 uint32_t vm = vext_vm(desc); \
1271 uint32_t vl = env->vl; \
1272 uint32_t esz = sizeof(TS1); \
1273 uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
1274 uint32_t vta = vext_vta(desc); \
1275 uint32_t vma = vext_vma(desc); \
1276 uint32_t i; \
1277 \
1278 for (i = env->vstart; i < vl; i++) { \
1279 if (!vm && !vext_elem_mask(v0, i)) { \
1280 /* set masked-off elements to 1s */ \
1281 vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz); \
1282 continue; \
1283 } \
1284 TS1 s1 = *((TS1 *)vs1 + HS1(i)); \
1285 TS2 s2 = *((TS2 *)vs2 + HS2(i)); \
1286 *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK); \
1287 } \
1288 env->vstart = 0; \
1289 /* set tail elements to 1s */ \
1290 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
1291 }
1292
1293 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t, uint8_t, H1, H1, DO_SLL, 0x7)
1294 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1295 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1296 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1297
1298 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1299 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1300 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1301 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1302
1303 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t, int8_t, H1, H1, DO_SRL, 0x7)
1304 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1305 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1306 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1307
/*
 * Generate a shift helper taking the value from a vector (vs2, type TS2)
 * and the shift amount from the scalar s1.  TD is the destination
 * element type.  For each element i in [vstart, vl):
 *  - if the element is active (vm set, or vext_elem_mask(v0, i) set) the
 *    destination receives OP(vs2[i], s1 & MASK);
 *  - otherwise the element's byte range is passed to
 *    vext_set_elems_1s() together with the vma setting.
 * vstart is reset to 0 afterwards and the tail range is passed to
 * vext_set_elems_1s() together with the vta setting.
 */
#define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK)               \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1,                    \
                  void *vs2, CPURISCVState *env, uint32_t desc)           \
{                                                                         \
    const uint32_t vm = vext_vm(desc);                                    \
    const uint32_t vl = env->vl;                                          \
    const uint32_t esz = sizeof(TD);                                      \
    const uint32_t total_elems = vext_get_total_elems(env, desc, esz);    \
    const uint32_t vta = vext_vta(desc);                                  \
    const uint32_t vma = vext_vma(desc);                                  \
    TD *dst = vd;                                                         \
    TS2 *value = vs2;                                                     \
    uint32_t idx;                                                         \
                                                                          \
    for (idx = env->vstart; idx < vl; idx++) {                            \
        if (vm || vext_elem_mask(v0, idx)) {                              \
            TS2 s2 = value[HS2(idx)];                                     \
                                                                          \
            dst[HD(idx)] = OP(s2, s1 & MASK);                             \
        } else {                                                          \
            /* masked-off element */                                      \
            vext_set_elems_1s(vd, vma, idx * esz, (idx + 1) * esz);       \
        }                                                                 \
    }                                                                     \
    env->vstart = 0;                                                      \
    /* tail elements: bytes [vl * esz, total_elems * esz) */              \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);              \
}
1336
1337 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1338 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1339 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1340 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1341
1342 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1343 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1344 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1345 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1346
1347 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1348 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1349 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1350 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1351
1352 /* Vector Narrowing Integer Right Shift Instructions */
1353 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1354 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1355 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1356 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t, int16_t, H1, H2, DO_SRL, 0xf)
1357 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1358 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1359 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1360 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1361 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1362 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1363 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1364 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1365
/* Vector Integer Comparison Instructions */
/*
 * Each macro yields 1 when the comparison holds and 0 otherwise.  Both
 * arguments are parenthesized so that compound expressions expand
 * safely.
 */
#define DO_MSEQ(N, M) ((N) == (M))
#define DO_MSNE(N, M) ((N) != (M))
#define DO_MSLT(N, M) ((N) < (M))
#define DO_MSLE(N, M) ((N) <= (M))
#define DO_MSGT(N, M) ((N) > (M))
1372
1373 #define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP) \
1374 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \
1375 CPURISCVState *env, uint32_t desc) \
1376 { \
1377 uint32_t vm = vext_vm(desc); \
1378 uint32_t vl = env->vl; \
1379 uint32_t total_elems = riscv_cpu_cfg(env)->vlen; \
1380 uint32_t vta_all_1s = vext_vta_all_1s(desc); \
1381 uint32_t vma = vext_vma(desc); \
1382 uint32_t i; \
1383 \
1384 for (i = env->vstart; i < vl; i++) { \
1385 ETYPE s1 = *((ETYPE *)vs1 + H(i)); \
1386 ETYPE s2 = *((ETYPE *)vs2 + H(i)); \
1387 if (!vm && !vext_elem_mask(v0, i)) { \
1388 /* set masked-off elements to 1s */ \
1389 if (vma) { \
1390 vext_set_elem_mask(vd, i, 1); \
1391 } \
1392 continue; \
1393 } \
1394 vext_set_elem_mask(vd, i, DO_OP(s2, s1)); \
1395 } \
1396 env->vstart = 0; \
1397 /* mask destination register are always tail-agnostic */ \
1398 /* set tail elements to 1s */ \
1399 if (vta_all_1s) { \
1400 for (; i < total_elems; i++) { \
1401 vext_set_elem_mask(vd, i, 1); \
1402 } \
1403 } \
1404 }
1405
1406 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t, H1, DO_MSEQ)
1407 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1408 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1409 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1410
1411 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t, H1, DO_MSNE)
1412 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1413 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1414 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1415
1416 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t, H1, DO_MSLT)
1417 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1418 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1419 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1420
1421 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t, H1, DO_MSLT)
1422 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1423 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1424 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1425
1426 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t, H1, DO_MSLE)
1427 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1428 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1429 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1430
1431 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t, H1, DO_MSLE)
1432 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1433 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1434 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1435
1436 #define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP) \
1437 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
1438 CPURISCVState *env, uint32_t desc) \
1439 { \
1440 uint32_t vm = vext_vm(desc); \
1441 uint32_t vl = env->vl; \
1442 uint32_t total_elems = riscv_cpu_cfg(env)->vlen; \
1443 uint32_t vta_all_1s = vext_vta_all_1s(desc); \
1444 uint32_t vma = vext_vma(desc); \
1445 uint32_t i; \
1446 \
1447 for (i = env->vstart; i < vl; i++) { \
1448 ETYPE s2 = *((ETYPE *)vs2 + H(i)); \
1449 if (!vm && !vext_elem_mask(v0, i)) { \
1450 /* set masked-off elements to 1s */ \
1451 if (vma) { \
1452 vext_set_elem_mask(vd, i, 1); \
1453 } \
1454 continue; \
1455 } \
1456 vext_set_elem_mask(vd, i, \
1457 DO_OP(s2, (ETYPE)(target_long)s1)); \
1458 } \
1459 env->vstart = 0; \
1460 /* mask destination register are always tail-agnostic */ \
1461 /* set tail elements to 1s */ \
1462 if (vta_all_1s) { \
1463 for (; i < total_elems; i++) { \
1464 vext_set_elem_mask(vd, i, 1); \
1465 } \
1466 } \
1467 }
1468
1469 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t, H1, DO_MSEQ)
1470 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1471 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1472 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1473
1474 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t, H1, DO_MSNE)
1475 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1476 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1477 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1478
1479 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t, H1, DO_MSLT)
1480 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1481 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1482 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1483
1484 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t, H1, DO_MSLT)
1485 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1486 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1487 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1488
1489 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t, H1, DO_MSLE)
1490 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1491 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1492 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1493
1494 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t, H1, DO_MSLE)
1495 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1496 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1497 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1498
1499 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t, H1, DO_MSGT)
1500 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1501 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1502 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1503
1504 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t, H1, DO_MSGT)
1505 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1506 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1507 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1508
1509 /* Vector Integer Min/Max Instructions */
1510 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1511 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1512 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1513 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1514 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1515 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1516 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1517 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1518 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1519 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1520 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1521 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1522 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1523 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1524 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1525 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1526 GEN_VEXT_VV(vminu_vv_b, 1)
1527 GEN_VEXT_VV(vminu_vv_h, 2)
1528 GEN_VEXT_VV(vminu_vv_w, 4)
1529 GEN_VEXT_VV(vminu_vv_d, 8)
1530 GEN_VEXT_VV(vmin_vv_b, 1)
1531 GEN_VEXT_VV(vmin_vv_h, 2)
1532 GEN_VEXT_VV(vmin_vv_w, 4)
1533 GEN_VEXT_VV(vmin_vv_d, 8)
1534 GEN_VEXT_VV(vmaxu_vv_b, 1)
1535 GEN_VEXT_VV(vmaxu_vv_h, 2)
1536 GEN_VEXT_VV(vmaxu_vv_w, 4)
1537 GEN_VEXT_VV(vmaxu_vv_d, 8)
1538 GEN_VEXT_VV(vmax_vv_b, 1)
1539 GEN_VEXT_VV(vmax_vv_h, 2)
1540 GEN_VEXT_VV(vmax_vv_w, 4)
1541 GEN_VEXT_VV(vmax_vv_d, 8)
1542
1543 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1544 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1545 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1546 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1547 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1548 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1549 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1550 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1551 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1552 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1553 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1554 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1555 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1556 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1557 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1558 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1559 GEN_VEXT_VX(vminu_vx_b, 1)
1560 GEN_VEXT_VX(vminu_vx_h, 2)
1561 GEN_VEXT_VX(vminu_vx_w, 4)
1562 GEN_VEXT_VX(vminu_vx_d, 8)
1563 GEN_VEXT_VX(vmin_vx_b, 1)
1564 GEN_VEXT_VX(vmin_vx_h, 2)
1565 GEN_VEXT_VX(vmin_vx_w, 4)
1566 GEN_VEXT_VX(vmin_vx_d, 8)
1567 GEN_VEXT_VX(vmaxu_vx_b, 1)
1568 GEN_VEXT_VX(vmaxu_vx_h, 2)
1569 GEN_VEXT_VX(vmaxu_vx_w, 4)
1570 GEN_VEXT_VX(vmaxu_vx_d, 8)
1571 GEN_VEXT_VX(vmax_vx_b, 1)
1572 GEN_VEXT_VX(vmax_vx_h, 2)
1573 GEN_VEXT_VX(vmax_vx_w, 4)
1574 GEN_VEXT_VX(vmax_vx_d, 8)
1575
/* Vector Single-Width Integer Multiply Instructions */
/*
 * Product of N and M; both arguments are parenthesized so that compound
 * expressions expand safely.
 */
#define DO_MUL(N, M) ((N) * (M))
1578 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1579 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1580 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1581 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1582 GEN_VEXT_VV(vmul_vv_b, 1)
1583 GEN_VEXT_VV(vmul_vv_h, 2)
1584 GEN_VEXT_VV(vmul_vv_w, 4)
1585 GEN_VEXT_VV(vmul_vv_d, 8)
1586
/* High 8 bits of the 16-bit signed product s2 * s1. */
static int8_t do_mulh_b(int8_t s2, int8_t s1)
{
    /* an 8x8 signed product always fits in 16 bits */
    int16_t product = (int16_t)s2 * (int16_t)s1;

    return product >> 8;
}
1591
/* High 16 bits of the 32-bit signed product s2 * s1. */
static int16_t do_mulh_h(int16_t s2, int16_t s1)
{
    /* a 16x16 signed product always fits in 32 bits */
    int32_t product = (int32_t)s2 * (int32_t)s1;

    return product >> 16;
}
1596
/* High 32 bits of the 64-bit signed product s2 * s1. */
static int32_t do_mulh_w(int32_t s2, int32_t s1)
{
    /* a 32x32 signed product always fits in 64 bits */
    int64_t product = (int64_t)s2 * (int64_t)s1;

    return product >> 32;
}
1601
/*
 * High 64 bits of the signed product of s2 and s1, as reported by
 * muls64() through its second output argument.
 */
static int64_t do_mulh_d(int64_t s2, int64_t s1)
{
    uint64_t lower;
    uint64_t upper;

    muls64(&lower, &upper, s1, s2);

    return upper;
}
1609
/* High 8 bits of the 16-bit unsigned product s2 * s1. */
static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
{
    /* an 8x8 unsigned product always fits in 16 bits */
    uint16_t product = (uint16_t)s2 * (uint16_t)s1;

    return product >> 8;
}
1614
/* High 16 bits of the 32-bit unsigned product s2 * s1. */
static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
{
    uint32_t product = (uint32_t)s2 * (uint32_t)s1;

    return product >> 16;
}
1619
/* High 32 bits of the 64-bit unsigned product s2 * s1. */
static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
{
    uint64_t product = (uint64_t)s2 * (uint64_t)s1;

    return product >> 32;
}
1624
/*
 * High 64 bits of the unsigned product of s2 and s1, as reported by
 * mulu64() through its second output argument.
 */
static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
{
    uint64_t lower;
    uint64_t upper;

    mulu64(&lower, &upper, s2, s1);

    return upper;
}
1632
/* High 8 bits of the 16-bit product of signed s2 and unsigned s1. */
static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
{
    /* both operands promote to int, so the product is computed signed */
    int product = (int16_t)s2 * (uint16_t)s1;

    return product >> 8;
}
1637
/* High 16 bits of the 32-bit product of signed s2 and unsigned s1. */
static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
{
    /*
     * Mixing int32_t with uint32_t makes the multiplication unsigned;
     * its low 32 bits equal those of the signed product.
     */
    uint32_t product = (uint32_t)(int32_t)s2 * (uint32_t)s1;

    return product >> 16;
}
1642
/* High 32 bits of the 64-bit product of signed s2 and unsigned s1. */
static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
{
    /*
     * Mixing int64_t with uint64_t makes the multiplication unsigned;
     * its low 64 bits equal those of the signed product.
     */
    uint64_t product = (uint64_t)(int64_t)s2 * (uint64_t)s1;

    return product >> 32;
}
1647
/*
 * High 64 bits of the product of a signed and an unsigned operand.
 *
 * Let A = signed operand (s2),
 *     B = unsigned operand (s1),
 *     P = unsigned 128-bit product computed by mulu64(),
 *     SP = the wanted signed product A * B.
 *
 * mulu64() sees a negative A as the unsigned value A + 2 ** 64, so
 *     IF A < 0
 *         P  = (A + 2 ** 64) * B
 *            = A * B + 2 ** 64 * B
 *         SP = P - 2 ** 64 * B
 *     ELSE
 *         SP = P
 * Subtracting 2 ** 64 * B only touches the high half, hence
 *     HI_P -= (A < 0 ? B : 0)
 */
static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
{
    uint64_t lower;
    uint64_t upper;

    mulu64(&lower, &upper, s2, s1);

    if (s2 < 0) {
        upper -= s1;
    }
    return upper;
}
1676
1677 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1678 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1679 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1680 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1681 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1682 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1683 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1684 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1685 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1686 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1687 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1688 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1689 GEN_VEXT_VV(vmulh_vv_b, 1)
1690 GEN_VEXT_VV(vmulh_vv_h, 2)
1691 GEN_VEXT_VV(vmulh_vv_w, 4)
1692 GEN_VEXT_VV(vmulh_vv_d, 8)
1693 GEN_VEXT_VV(vmulhu_vv_b, 1)
1694 GEN_VEXT_VV(vmulhu_vv_h, 2)
1695 GEN_VEXT_VV(vmulhu_vv_w, 4)
1696 GEN_VEXT_VV(vmulhu_vv_d, 8)
1697 GEN_VEXT_VV(vmulhsu_vv_b, 1)
1698 GEN_VEXT_VV(vmulhsu_vv_h, 2)
1699 GEN_VEXT_VV(vmulhsu_vv_w, 4)
1700 GEN_VEXT_VV(vmulhsu_vv_d, 8)
1701
1702 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1703 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1704 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1705 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1706 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1707 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1708 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1709 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1710 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1711 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1712 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1713 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1714 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1715 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1716 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1717 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1718 GEN_VEXT_VX(vmul_vx_b, 1)
1719 GEN_VEXT_VX(vmul_vx_h, 2)
1720 GEN_VEXT_VX(vmul_vx_w, 4)
1721 GEN_VEXT_VX(vmul_vx_d, 8)
1722 GEN_VEXT_VX(vmulh_vx_b, 1)
1723 GEN_VEXT_VX(vmulh_vx_h, 2)
1724 GEN_VEXT_VX(vmulh_vx_w, 4)
1725 GEN_VEXT_VX(vmulh_vx_d, 8)
1726 GEN_VEXT_VX(vmulhu_vx_b, 1)
1727 GEN_VEXT_VX(vmulhu_vx_h, 2)
1728 GEN_VEXT_VX(vmulhu_vx_w, 4)
1729 GEN_VEXT_VX(vmulhu_vx_d, 8)
1730 GEN_VEXT_VX(vmulhsu_vx_b, 1)
1731 GEN_VEXT_VX(vmulhsu_vx_h, 2)
1732 GEN_VEXT_VX(vmulhsu_vx_w, 4)
1733 GEN_VEXT_VX(vmulhsu_vx_d, 8)
1734
/* Vector Integer Divide Instructions */
/*
 * N is the dividend, M the divisor.
 *  - M == 0: the quotient is all ones in the type of N, the remainder
 *    is N.
 *  - signed only, M == -1 and N == -N: the quotient is N and the
 *    remainder is 0, so the host division is not executed for that
 *    case.
 * NOTE(review): for 32/64-bit N the "N == -N" test negates the most
 * negative value, which only wraps when signed overflow is defined
 * (e.g. built with -fwrapv) - confirm the build flags.
 * Every argument is parenthesized so that compound expressions expand
 * safely.
 */
#define DO_DIVU(N, M) (unlikely((M) == 0) ? (__typeof(N))(-1) : (N) / (M))
#define DO_REMU(N, M) (unlikely((M) == 0) ? (N) : (N) % (M))
#define DO_DIV(N, M)  (unlikely((M) == 0) ? (__typeof(N))(-1) : \
        unlikely(((N) == -(N)) && ((M) == (__typeof(N))(-1))) ? \
        (N) : (N) / (M))
#define DO_REM(N, M)  (unlikely((M) == 0) ? (N) : \
        unlikely(((N) == -(N)) && ((M) == (__typeof(N))(-1))) ? \
        0 : (N) % (M))
1742
1743 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1744 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1745 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1746 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1747 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1748 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1749 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1750 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1751 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1752 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1753 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1754 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1755 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1756 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1757 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1758 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1759 GEN_VEXT_VV(vdivu_vv_b, 1)
1760 GEN_VEXT_VV(vdivu_vv_h, 2)
1761 GEN_VEXT_VV(vdivu_vv_w, 4)
1762 GEN_VEXT_VV(vdivu_vv_d, 8)
1763 GEN_VEXT_VV(vdiv_vv_b, 1)
1764 GEN_VEXT_VV(vdiv_vv_h, 2)
1765 GEN_VEXT_VV(vdiv_vv_w, 4)
1766 GEN_VEXT_VV(vdiv_vv_d, 8)
1767 GEN_VEXT_VV(vremu_vv_b, 1)
1768 GEN_VEXT_VV(vremu_vv_h, 2)
1769 GEN_VEXT_VV(vremu_vv_w, 4)
1770 GEN_VEXT_VV(vremu_vv_d, 8)
1771 GEN_VEXT_VV(vrem_vv_b, 1)
1772 GEN_VEXT_VV(vrem_vv_h, 2)
1773 GEN_VEXT_VV(vrem_vv_w, 4)
1774 GEN_VEXT_VV(vrem_vv_d, 8)
1775
1776 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1777 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1778 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1779 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1780 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1781 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1782 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1783 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1784 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1785 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1786 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1787 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1788 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1789 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1790 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1791 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1792 GEN_VEXT_VX(vdivu_vx_b, 1)
1793 GEN_VEXT_VX(vdivu_vx_h, 2)
1794 GEN_VEXT_VX(vdivu_vx_w, 4)
1795 GEN_VEXT_VX(vdivu_vx_d, 8)
1796 GEN_VEXT_VX(vdiv_vx_b, 1)
1797 GEN_VEXT_VX(vdiv_vx_h, 2)
1798 GEN_VEXT_VX(vdiv_vx_w, 4)
1799 GEN_VEXT_VX(vdiv_vx_d, 8)
1800 GEN_VEXT_VX(vremu_vx_b, 1)
1801 GEN_VEXT_VX(vremu_vx_h, 2)
1802 GEN_VEXT_VX(vremu_vx_w, 4)
1803 GEN_VEXT_VX(vremu_vx_d, 8)
1804 GEN_VEXT_VX(vrem_vx_b, 1)
1805 GEN_VEXT_VX(vrem_vx_h, 2)
1806 GEN_VEXT_VX(vrem_vx_w, 4)
1807 GEN_VEXT_VX(vrem_vx_d, 8)
1808
1809 /* Vector Widening Integer Multiply Instructions */
1810 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1811 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1812 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1813 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1814 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1815 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1816 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1817 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1818 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1819 GEN_VEXT_VV(vwmul_vv_b, 2)
1820 GEN_VEXT_VV(vwmul_vv_h, 4)
1821 GEN_VEXT_VV(vwmul_vv_w, 8)
1822 GEN_VEXT_VV(vwmulu_vv_b, 2)
1823 GEN_VEXT_VV(vwmulu_vv_h, 4)
1824 GEN_VEXT_VV(vwmulu_vv_w, 8)
1825 GEN_VEXT_VV(vwmulsu_vv_b, 2)
1826 GEN_VEXT_VV(vwmulsu_vv_h, 4)
1827 GEN_VEXT_VV(vwmulsu_vv_w, 8)
1828
1829 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1830 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1831 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1832 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1833 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1834 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1835 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1836 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1837 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1838 GEN_VEXT_VX(vwmul_vx_b, 2)
1839 GEN_VEXT_VX(vwmul_vx_h, 4)
1840 GEN_VEXT_VX(vwmul_vx_w, 8)
1841 GEN_VEXT_VX(vwmulu_vx_b, 2)
1842 GEN_VEXT_VX(vwmulu_vx_h, 4)
1843 GEN_VEXT_VX(vwmulu_vx_w, 8)
1844 GEN_VEXT_VX(vwmulsu_vx_b, 2)
1845 GEN_VEXT_VX(vwmulsu_vx_h, 4)
1846 GEN_VEXT_VX(vwmulsu_vx_w, 8)
1847
1848 /* Vector Single-Width Integer Multiply-Add Instructions */
/*
 * OPIVV3 - emit do_<NAME>(), the per-element body of a three-operand
 * vector-vector operation.
 *
 * Element i of vs1 and vs2 is loaded as T1/T2 and held as TX1/TX2, the
 * current destination element is loaded as TD, and OP(s2, s1, d) is stored
 * back into the same destination slot.  HD/HS1/HS2 translate the logical
 * index i into the element offset used for each register.
 */
#define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)       \
static void do_##NAME(void *vd, void *vs1, void *vs2, int i)       \
{                                                                  \
    TD *dest = (TD *)vd + HD(i);                                   \
    TX1 src1 = *((T1 *)vs1 + HS1(i));                              \
    TX2 src2 = *((T2 *)vs2 + HS2(i));                              \
    TD acc = *dest;                                                \
                                                                   \
    *dest = OP(src2, src1, acc);                                   \
}
1857
/*
 * Multiply-add building blocks.  N and M are the two source operands and D
 * is the current destination value:
 *   DO_MACC:   (M * N) + D
 *   DO_NMSAC: -(M * N) + D
 *   DO_MADD:   (M * D) + N
 *   DO_NMSUB: -(M * D) + N
 * Every argument is parenthesized so compound expressions keep their value.
 */
#define DO_MACC(N, M, D) ((M) * (N) + (D))
#define DO_NMSAC(N, M, D) (-((M) * (N)) + (D))
#define DO_MADD(N, M, D) ((M) * (D) + (N))
#define DO_NMSUB(N, M, D) (-((M) * (D)) + (N))
1862 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1863 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1864 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1865 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1866 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1867 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1868 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1869 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1870 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1871 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1872 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1873 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1874 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1875 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1876 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1877 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1878 GEN_VEXT_VV(vmacc_vv_b, 1)
1879 GEN_VEXT_VV(vmacc_vv_h, 2)
1880 GEN_VEXT_VV(vmacc_vv_w, 4)
1881 GEN_VEXT_VV(vmacc_vv_d, 8)
1882 GEN_VEXT_VV(vnmsac_vv_b, 1)
1883 GEN_VEXT_VV(vnmsac_vv_h, 2)
1884 GEN_VEXT_VV(vnmsac_vv_w, 4)
1885 GEN_VEXT_VV(vnmsac_vv_d, 8)
1886 GEN_VEXT_VV(vmadd_vv_b, 1)
1887 GEN_VEXT_VV(vmadd_vv_h, 2)
1888 GEN_VEXT_VV(vmadd_vv_w, 4)
1889 GEN_VEXT_VV(vmadd_vv_d, 8)
1890 GEN_VEXT_VV(vnmsub_vv_b, 1)
1891 GEN_VEXT_VV(vnmsub_vv_h, 2)
1892 GEN_VEXT_VV(vnmsub_vv_w, 4)
1893 GEN_VEXT_VV(vnmsub_vv_d, 8)
1894
/*
 * OPIVX3 - emit do_<NAME>(), the per-element body of a three-operand
 * vector-scalar operation.
 *
 * The scalar s1 is narrowed to T1 and then converted to TX1; element i of
 * vs2 is loaded as T2 and held as TX2; the current destination element is
 * loaded as TD.  OP(s2, scalar, d) is stored back into the same destination
 * slot.  HD/HS2 translate the logical index i into the element offset.
 */
#define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)             \
static void do_##NAME(void *vd, target_long s1, void *vs2, int i)   \
{                                                                   \
    TD *dest = (TD *)vd + HD(i);                                    \
    TX2 src2 = *((T2 *)vs2 + HS2(i));                               \
    TX1 scalar = (TX1)(T1)s1;                                       \
    TD acc = *dest;                                                 \
                                                                    \
    *dest = OP(src2, scalar, acc);                                  \
}
1902
1903 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1904 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1905 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1906 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1907 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1908 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1909 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1910 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1911 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1912 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1913 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1914 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1915 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1916 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1917 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
1918 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
1919 GEN_VEXT_VX(vmacc_vx_b, 1)
1920 GEN_VEXT_VX(vmacc_vx_h, 2)
1921 GEN_VEXT_VX(vmacc_vx_w, 4)
1922 GEN_VEXT_VX(vmacc_vx_d, 8)
1923 GEN_VEXT_VX(vnmsac_vx_b, 1)
1924 GEN_VEXT_VX(vnmsac_vx_h, 2)
1925 GEN_VEXT_VX(vnmsac_vx_w, 4)
1926 GEN_VEXT_VX(vnmsac_vx_d, 8)
1927 GEN_VEXT_VX(vmadd_vx_b, 1)
1928 GEN_VEXT_VX(vmadd_vx_h, 2)
1929 GEN_VEXT_VX(vmadd_vx_w, 4)
1930 GEN_VEXT_VX(vmadd_vx_d, 8)
1931 GEN_VEXT_VX(vnmsub_vx_b, 1)
1932 GEN_VEXT_VX(vnmsub_vx_h, 2)
1933 GEN_VEXT_VX(vnmsub_vx_w, 4)
1934 GEN_VEXT_VX(vnmsub_vx_d, 8)
1935
1936 /* Vector Widening Integer Multiply-Add Instructions */
1937 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
1938 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
1939 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
1940 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
1941 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
1942 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
1943 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
1944 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
1945 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
1946 GEN_VEXT_VV(vwmaccu_vv_b, 2)
1947 GEN_VEXT_VV(vwmaccu_vv_h, 4)
1948 GEN_VEXT_VV(vwmaccu_vv_w, 8)
1949 GEN_VEXT_VV(vwmacc_vv_b, 2)
1950 GEN_VEXT_VV(vwmacc_vv_h, 4)
1951 GEN_VEXT_VV(vwmacc_vv_w, 8)
1952 GEN_VEXT_VV(vwmaccsu_vv_b, 2)
1953 GEN_VEXT_VV(vwmaccsu_vv_h, 4)
1954 GEN_VEXT_VV(vwmaccsu_vv_w, 8)
1955
1956 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
1957 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
1958 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
1959 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
1960 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
1961 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
1962 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
1963 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
1964 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
1965 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
1966 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
1967 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
1968 GEN_VEXT_VX(vwmaccu_vx_b, 2)
1969 GEN_VEXT_VX(vwmaccu_vx_h, 4)
1970 GEN_VEXT_VX(vwmaccu_vx_w, 8)
1971 GEN_VEXT_VX(vwmacc_vx_b, 2)
1972 GEN_VEXT_VX(vwmacc_vx_h, 4)
1973 GEN_VEXT_VX(vwmacc_vx_w, 8)
1974 GEN_VEXT_VX(vwmaccsu_vx_b, 2)
1975 GEN_VEXT_VX(vwmaccsu_vx_h, 4)
1976 GEN_VEXT_VX(vwmaccsu_vx_w, 8)
1977 GEN_VEXT_VX(vwmaccus_vx_b, 2)
1978 GEN_VEXT_VX(vwmaccus_vx_h, 4)
1979 GEN_VEXT_VX(vwmaccus_vx_w, 8)
1980
1981 /* Vector Integer Merge and Move Instructions */
1982 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H) \
1983 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env, \
1984 uint32_t desc) \
1985 { \
1986 uint32_t vl = env->vl; \
1987 uint32_t esz = sizeof(ETYPE); \
1988 uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
1989 uint32_t vta = vext_vta(desc); \
1990 uint32_t i; \
1991 \
1992 for (i = env->vstart; i < vl; i++) { \
1993 ETYPE s1 = *((ETYPE *)vs1 + H(i)); \
1994 *((ETYPE *)vd + H(i)) = s1; \
1995 } \
1996 env->vstart = 0; \
1997 /* set tail elements to 1s */ \
1998 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
1999 }
2000
2001 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t, H1)
2002 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
2003 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
2004 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
2005
2006 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H) \
2007 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env, \
2008 uint32_t desc) \
2009 { \
2010 uint32_t vl = env->vl; \
2011 uint32_t esz = sizeof(ETYPE); \
2012 uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
2013 uint32_t vta = vext_vta(desc); \
2014 uint32_t i; \
2015 \
2016 for (i = env->vstart; i < vl; i++) { \
2017 *((ETYPE *)vd + H(i)) = (ETYPE)s1; \
2018 } \
2019 env->vstart = 0; \
2020 /* set tail elements to 1s */ \
2021 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
2022 }
2023
2024 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t, H1)
2025 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
2026 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
2027 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
2028
2029 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H) \
2030 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \
2031 CPURISCVState *env, uint32_t desc) \
2032 { \
2033 uint32_t vl = env->vl; \
2034 uint32_t esz = sizeof(ETYPE); \
2035 uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
2036 uint32_t vta = vext_vta(desc); \
2037 uint32_t i; \
2038 \
2039 for (i = env->vstart; i < vl; i++) { \
2040 ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1); \
2041 *((ETYPE *)vd + H(i)) = *(vt + H(i)); \
2042 } \
2043 env->vstart = 0; \
2044 /* set tail elements to 1s */ \
2045 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
2046 }
2047
2048 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t, H1)
2049 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
2050 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
2051 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
2052
2053 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H) \
2054 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, \
2055 void *vs2, CPURISCVState *env, uint32_t desc) \
2056 { \
2057 uint32_t vl = env->vl; \
2058 uint32_t esz = sizeof(ETYPE); \
2059 uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
2060 uint32_t vta = vext_vta(desc); \
2061 uint32_t i; \
2062 \
2063 for (i = env->vstart; i < vl; i++) { \
2064 ETYPE s2 = *((ETYPE *)vs2 + H(i)); \
2065 ETYPE d = (!vext_elem_mask(v0, i) ? s2 : \
2066 (ETYPE)(target_long)s1); \
2067 *((ETYPE *)vd + H(i)) = d; \
2068 } \
2069 env->vstart = 0; \
2070 /* set tail elements to 1s */ \
2071 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
2072 }
2073
2074 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t, H1)
2075 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
2076 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
2077 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
2078
2079 /*
2080 *** Vector Fixed-Point Arithmetic Instructions
2081 */
2082
2083 /* Vector Single-Width Saturating Add and Subtract */
2084
/*
 * Most fixed-point instructions involve a rounding mode and/or saturation,
 * so the common macros for them are defined here.
 */
2089 typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
2090 CPURISCVState *env, int vxrm);
2091
/*
 * OPIVV2_RM - emit do_<NAME>(), the per-element body of a two-operand
 * fixed-point vector-vector operation.
 *
 * Element i of vs1 and vs2 is loaded as T1/T2 and held as TX1/TX2, then
 * OP(env, vxrm, s2, s1) is stored into element i of vd as TD.
 * HD/HS1/HS2 translate the logical index i into the element offset.
 */
#define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
static inline void                                                  \
do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
          CPURISCVState *env, int vxrm)                             \
{                                                                   \
    TD *dest = (TD *)vd + HD(i);                                    \
    TX1 src1 = *((T1 *)vs1 + HS1(i));                               \
    TX2 src2 = *((T2 *)vs2 + HS2(i));                               \
                                                                    \
    *dest = OP(env, vxrm, src2, src1);                              \
}
2101
2102 static inline void
2103 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
2104 CPURISCVState *env,
2105 uint32_t vl, uint32_t vm, int vxrm,
2106 opivv2_rm_fn *fn, uint32_t vma, uint32_t esz)
2107 {
2108 for (uint32_t i = env->vstart; i < vl; i++) {
2109 if (!vm && !vext_elem_mask(v0, i)) {
2110 /* set masked-off elements to 1s */
2111 vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2112 continue;
2113 }
2114 fn(vd, vs1, vs2, i, env, vxrm);
2115 }
2116 env->vstart = 0;
2117 }
2118
2119 static inline void
2120 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
2121 CPURISCVState *env,
2122 uint32_t desc,
2123 opivv2_rm_fn *fn, uint32_t esz)
2124 {
2125 uint32_t vm = vext_vm(desc);
2126 uint32_t vl = env->vl;
2127 uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2128 uint32_t vta = vext_vta(desc);
2129 uint32_t vma = vext_vma(desc);
2130
2131 switch (env->vxrm) {
2132 case 0: /* rnu */
2133 vext_vv_rm_1(vd, v0, vs1, vs2,
2134 env, vl, vm, 0, fn, vma, esz);
2135 break;
2136 case 1: /* rne */
2137 vext_vv_rm_1(vd, v0, vs1, vs2,
2138 env, vl, vm, 1, fn, vma, esz);
2139 break;
2140 case 2: /* rdn */
2141 vext_vv_rm_1(vd, v0, vs1, vs2,
2142 env, vl, vm, 2, fn, vma, esz);
2143 break;
2144 default: /* rod */
2145 vext_vv_rm_1(vd, v0, vs1, vs2,
2146 env, vl, vm, 3, fn, vma, esz);
2147 break;
2148 }
2149 /* set tail elements to 1s */
2150 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2151 }
2152
/*
 * Generate the helper entry point for a fixed-point instruction in OPIVV
 * format: it forwards to vext_vv_rm_2() with do_<NAME> as the per-element
 * callback and ESZ as the element size.
 */
#define GEN_VEXT_VV_RM(NAME, ESZ)                                   \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,         \
                  CPURISCVState *env, uint32_t desc)                \
{                                                                   \
    vext_vv_rm_2(vd, v0, vs1, vs2, env, desc, do_##NAME, ESZ);      \
}
2161
2162 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2163 {
2164 uint8_t res = a + b;
2165 if (res < a) {
2166 res = UINT8_MAX;
2167 env->vxsat = 0x1;
2168 }
2169 return res;
2170 }
2171
2172 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2173 uint16_t b)
2174 {
2175 uint16_t res = a + b;
2176 if (res < a) {
2177 res = UINT16_MAX;
2178 env->vxsat = 0x1;
2179 }
2180 return res;
2181 }
2182
2183 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2184 uint32_t b)
2185 {
2186 uint32_t res = a + b;
2187 if (res < a) {
2188 res = UINT32_MAX;
2189 env->vxsat = 0x1;
2190 }
2191 return res;
2192 }
2193
2194 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2195 uint64_t b)
2196 {
2197 uint64_t res = a + b;
2198 if (res < a) {
2199 res = UINT64_MAX;
2200 env->vxsat = 0x1;
2201 }
2202 return res;
2203 }
2204
2205 RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2206 RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2207 RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2208 RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2209 GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
2210 GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
2211 GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
2212 GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2213
2214 typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2215 CPURISCVState *env, int vxrm);
2216
/*
 * OPIVX2_RM - emit do_<NAME>(), the per-element body of a two-operand
 * fixed-point vector-scalar operation.
 *
 * Element i of vs2 is loaded as T2 and held as TX2; the scalar s1 is
 * narrowed to T1 and converted to TX1.  OP(env, vxrm, s2, scalar) is stored
 * into element i of vd as TD.  HD/HS2 translate the logical index i into
 * the element offset.
 */
#define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)          \
static inline void                                                  \
do_##NAME(void *vd, target_long s1, void *vs2, int i,               \
          CPURISCVState *env, int vxrm)                             \
{                                                                   \
    TD *dest = (TD *)vd + HD(i);                                    \
    TX2 src2 = *((T2 *)vs2 + HS2(i));                               \
                                                                    \
    *dest = OP(env, vxrm, src2, (TX1)(T1)s1);                       \
}
2225
2226 static inline void
2227 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2228 CPURISCVState *env,
2229 uint32_t vl, uint32_t vm, int vxrm,
2230 opivx2_rm_fn *fn, uint32_t vma, uint32_t esz)
2231 {
2232 for (uint32_t i = env->vstart; i < vl; i++) {
2233 if (!vm && !vext_elem_mask(v0, i)) {
2234 /* set masked-off elements to 1s */
2235 vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
2236 continue;
2237 }
2238 fn(vd, s1, vs2, i, env, vxrm);
2239 }
2240 env->vstart = 0;
2241 }
2242
2243 static inline void
2244 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2245 CPURISCVState *env,
2246 uint32_t desc,
2247 opivx2_rm_fn *fn, uint32_t esz)
2248 {
2249 uint32_t vm = vext_vm(desc);
2250 uint32_t vl = env->vl;
2251 uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2252 uint32_t vta = vext_vta(desc);
2253 uint32_t vma = vext_vma(desc);
2254
2255 switch (env->vxrm) {
2256 case 0: /* rnu */
2257 vext_vx_rm_1(vd, v0, s1, vs2,
2258 env, vl, vm, 0, fn, vma, esz);
2259 break;
2260 case 1: /* rne */
2261 vext_vx_rm_1(vd, v0, s1, vs2,
2262 env, vl, vm, 1, fn, vma, esz);
2263 break;
2264 case 2: /* rdn */
2265 vext_vx_rm_1(vd, v0, s1, vs2,
2266 env, vl, vm, 2, fn, vma, esz);
2267 break;
2268 default: /* rod */
2269 vext_vx_rm_1(vd, v0, s1, vs2,
2270 env, vl, vm, 3, fn, vma, esz);
2271 break;
2272 }
2273 /* set tail elements to 1s */
2274 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2275 }
2276
/*
 * Generate the helper entry point for a fixed-point instruction in OPIVX
 * format: it forwards to vext_vx_rm_2() with do_<NAME> as the per-element
 * callback and ESZ as the element size.
 */
#define GEN_VEXT_VX_RM(NAME, ESZ)                                   \
void HELPER(NAME)(void *vd, void *v0, target_ulong s1,              \
                  void *vs2, CPURISCVState *env, uint32_t desc)     \
{                                                                   \
    vext_vx_rm_2(vd, v0, s1, vs2, env, desc, do_##NAME, ESZ);       \
}
2285
2286 RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2287 RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2288 RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2289 RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2290 GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
2291 GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
2292 GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
2293 GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2294
2295 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2296 {
2297 int8_t res = a + b;
2298 if ((res ^ a) & (res ^ b) & INT8_MIN) {
2299 res = a > 0 ? INT8_MAX : INT8_MIN;
2300 env->vxsat = 0x1;
2301 }
2302 return res;
2303 }
2304
2305 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2306 {
2307 int16_t res = a + b;
2308 if ((res ^ a) & (res ^ b) & INT16_MIN) {
2309 res = a > 0 ? INT16_MAX : INT16_MIN;
2310 env->vxsat = 0x1;
2311 }
2312 return res;
2313 }
2314
2315 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2316 {
2317 int32_t res = a + b;
2318 if ((res ^ a) & (res ^ b) & INT32_MIN) {
2319 res = a > 0 ? INT32_MAX : INT32_MIN;
2320 env->vxsat = 0x1;
2321 }
2322 return res;
2323 }
2324
2325 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2326 {
2327 int64_t res = a + b;
2328 if ((res ^ a) & (res ^ b) & INT64_MIN) {
2329 res = a > 0 ? INT64_MAX : INT64_MIN;
2330 env->vxsat = 0x1;
2331 }
2332 return res;
2333 }
2334
2335 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2336 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2337 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2338 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2339 GEN_VEXT_VV_RM(vsadd_vv_b, 1)
2340 GEN_VEXT_VV_RM(vsadd_vv_h, 2)
2341 GEN_VEXT_VV_RM(vsadd_vv_w, 4)
2342 GEN_VEXT_VV_RM(vsadd_vv_d, 8)
2343
2344 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2345 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2346 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2347 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2348 GEN_VEXT_VX_RM(vsadd_vx_b, 1)
2349 GEN_VEXT_VX_RM(vsadd_vx_h, 2)
2350 GEN_VEXT_VX_RM(vsadd_vx_w, 4)
2351 GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2352
2353 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2354 {
2355 uint8_t res = a - b;
2356 if (res > a) {
2357 res = 0;
2358 env->vxsat = 0x1;
2359 }
2360 return res;
2361 }
2362
2363 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2364 uint16_t b)
2365 {
2366 uint16_t res = a - b;
2367 if (res > a) {
2368 res = 0;
2369 env->vxsat = 0x1;
2370 }
2371 return res;
2372 }
2373
2374 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2375 uint32_t b)
2376 {
2377 uint32_t res = a - b;
2378 if (res > a) {
2379 res = 0;
2380 env->vxsat = 0x1;
2381 }
2382 return res;
2383 }
2384
2385 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2386 uint64_t b)
2387 {
2388 uint64_t res = a - b;
2389 if (res > a) {
2390 res = 0;
2391 env->vxsat = 0x1;
2392 }
2393 return res;
2394 }
2395
2396 RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2397 RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2398 RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2399 RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2400 GEN_VEXT_VV_RM(vssubu_vv_b, 1)
2401 GEN_VEXT_VV_RM(vssubu_vv_h, 2)
2402 GEN_VEXT_VV_RM(vssubu_vv_w, 4)
2403 GEN_VEXT_VV_RM(vssubu_vv_d, 8)
2404
2405 RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2406 RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2407 RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2408 RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2409 GEN_VEXT_VX_RM(vssubu_vx_b, 1)
2410 GEN_VEXT_VX_RM(vssubu_vx_h, 2)
2411 GEN_VEXT_VX_RM(vssubu_vx_w, 4)
2412 GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2413
2414 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2415 {
2416 int8_t res = a - b;
2417 if ((res ^ a) & (a ^ b) & INT8_MIN) {
2418 res = a >= 0 ? INT8_MAX : INT8_MIN;
2419 env->vxsat = 0x1;
2420 }
2421 return res;
2422 }
2423
2424 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2425 {
2426 int16_t res = a - b;
2427 if ((res ^ a) & (a ^ b) & INT16_MIN) {
2428 res = a >= 0 ? INT16_MAX : INT16_MIN;
2429 env->vxsat = 0x1;
2430 }
2431 return res;
2432 }
2433
2434 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2435 {
2436 int32_t res = a - b;
2437 if ((res ^ a) & (a ^ b) & INT32_MIN) {
2438 res = a >= 0 ? INT32_MAX : INT32_MIN;
2439 env->vxsat = 0x1;
2440 }
2441 return res;
2442 }
2443
2444 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2445 {
2446 int64_t res = a - b;
2447 if ((res ^ a) & (a ^ b) & INT64_MIN) {
2448 res = a >= 0 ? INT64_MAX : INT64_MIN;
2449 env->vxsat = 0x1;
2450 }
2451 return res;
2452 }
2453
2454 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2455 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2456 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2457 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2458 GEN_VEXT_VV_RM(vssub_vv_b, 1)
2459 GEN_VEXT_VV_RM(vssub_vv_h, 2)
2460 GEN_VEXT_VV_RM(vssub_vv_w, 4)
2461 GEN_VEXT_VV_RM(vssub_vv_d, 8)
2462
2463 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2464 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2465 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2466 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2467 GEN_VEXT_VX_RM(vssub_vx_b, 1)
2468 GEN_VEXT_VX_RM(vssub_vx_h, 2)
2469 GEN_VEXT_VX_RM(vssub_vx_w, 4)
2470 GEN_VEXT_VX_RM(vssub_vx_d, 8)
2471
2472 /* Vector Single-Width Averaging Add and Subtract */
/*
 * Compute the rounding increment (0 or 1) to add after shifting v right by
 * shift bits, according to rounding mode vxrm:
 *   0: round-to-nearest-up    1: round-to-nearest-even
 *   2: round-down (truncate)  3: round-to-odd
 *
 * Returns 0 when shift is 0 or is not a valid bit position of v (>= 64).
 * The range check is done before any bit is extracted so that extract64()
 * is never called with an out-of-range start position.
 */
static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
{
    uint8_t d, d1;
    uint64_t D1, D2;

    if (shift == 0 || shift >= 64) {
        return 0;
    }

    d = extract64(v, shift, 1);      /* lowest bit kept by the shift */
    d1 = extract64(v, shift - 1, 1); /* highest bit shifted out */
    D1 = extract64(v, 0, shift);     /* all bits shifted out */
    if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
        return d1;
    } else if (vxrm == 1) { /* round-to-nearest-even */
        if (shift > 1) {
            /* bits shifted out below d1 */
            D2 = extract64(v, 0, shift - 1);
            return d1 & ((D2 != 0) | d);
        } else {
            return d1 & d;
        }
    } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
        return !d & (D1 != 0);
    }
    return 0; /* round-down (truncate) */
}
2499
2500 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2501 {
2502 int64_t res = (int64_t)a + b;
2503 uint8_t round = get_round(vxrm, res, 1);
2504
2505 return (res >> 1) + round;
2506 }
2507
2508 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2509 {
2510 int64_t res = a + b;
2511 uint8_t round = get_round(vxrm, res, 1);
2512 int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2513
2514 /* With signed overflow, bit 64 is inverse of bit 63. */
2515 return ((res >> 1) ^ over) + round;
2516 }
2517
2518 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2519 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2520 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2521 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2522 GEN_VEXT_VV_RM(vaadd_vv_b, 1)
2523 GEN_VEXT_VV_RM(vaadd_vv_h, 2)
2524 GEN_VEXT_VV_RM(vaadd_vv_w, 4)
2525 GEN_VEXT_VV_RM(vaadd_vv_d, 8)
2526
2527 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2528 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2529 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2530 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2531 GEN_VEXT_VX_RM(vaadd_vx_b, 1)
2532 GEN_VEXT_VX_RM(vaadd_vx_h, 2)
2533 GEN_VEXT_VX_RM(vaadd_vx_w, 4)
2534 GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2535
2536 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2537 uint32_t a, uint32_t b)
2538 {
2539 uint64_t res = (uint64_t)a + b;
2540 uint8_t round = get_round(vxrm, res, 1);
2541
2542 return (res >> 1) + round;
2543 }
2544
2545 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2546 uint64_t a, uint64_t b)
2547 {
2548 uint64_t res = a + b;
2549 uint8_t round = get_round(vxrm, res, 1);
2550 uint64_t over = (uint64_t)(res < a) << 63;
2551
2552 return ((res >> 1) | over) + round;
2553 }
2554
2555 RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2556 RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2557 RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2558 RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2559 GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
2560 GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
2561 GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
2562 GEN_VEXT_VV_RM(vaaddu_vv_d, 8)
2563
2564 RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2565 RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2566 RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2567 RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2568 GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
2569 GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
2570 GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
2571 GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2572
2573 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2574 {
2575 int64_t res = (int64_t)a - b;
2576 uint8_t round = get_round(vxrm, res, 1);
2577
2578 return (res >> 1) + round;
2579 }
2580
2581 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2582 {
2583 int64_t res = (int64_t)a - b;
2584 uint8_t round = get_round(vxrm, res, 1);
2585 int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2586
2587 /* With signed overflow, bit 64 is inverse of bit 63. */
2588 return ((res >> 1) ^ over) + round;
2589 }
2590
2591 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2592 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2593 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2594 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2595 GEN_VEXT_VV_RM(vasub_vv_b, 1)
2596 GEN_VEXT_VV_RM(vasub_vv_h, 2)
2597 GEN_VEXT_VV_RM(vasub_vv_w, 4)
2598 GEN_VEXT_VV_RM(vasub_vv_d, 8)
2599
2600 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2601 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2602 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2603 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2604 GEN_VEXT_VX_RM(vasub_vx_b, 1)
2605 GEN_VEXT_VX_RM(vasub_vx_h, 2)
2606 GEN_VEXT_VX_RM(vasub_vx_w, 4)
2607 GEN_VEXT_VX_RM(vasub_vx_d, 8)
2608
2609 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2610 uint32_t a, uint32_t b)
2611 {
2612 int64_t res = (int64_t)a - b;
2613 uint8_t round = get_round(vxrm, res, 1);
2614
2615 return (res >> 1) + round;
2616 }
2617
2618 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2619 uint64_t a, uint64_t b)
2620 {
2621 uint64_t res = (uint64_t)a - b;
2622 uint8_t round = get_round(vxrm, res, 1);
2623 uint64_t over = (uint64_t)(res > a) << 63;
2624
2625 return ((res >> 1) | over) + round;
2626 }
2627
2628 RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2629 RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2630 RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2631 RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2632 GEN_VEXT_VV_RM(vasubu_vv_b, 1)
2633 GEN_VEXT_VV_RM(vasubu_vv_h, 2)
2634 GEN_VEXT_VV_RM(vasubu_vv_w, 4)
2635 GEN_VEXT_VV_RM(vasubu_vv_d, 8)
2636
2637 RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2638 RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2639 RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2640 RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2641 GEN_VEXT_VX_RM(vasubu_vx_b, 1)
2642 GEN_VEXT_VX_RM(vasubu_vx_h, 2)
2643 GEN_VEXT_VX_RM(vasubu_vx_w, 4)
2644 GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2645
2646 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
2647 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2648 {
2649 uint8_t round;
2650 int16_t res;
2651
2652 res = (int16_t)a * (int16_t)b;
2653 round = get_round(vxrm, res, 7);
2654 res = (res >> 7) + round;
2655
2656 if (res > INT8_MAX) {
2657 env->vxsat = 0x1;
2658 return INT8_MAX;
2659 } else if (res < INT8_MIN) {
2660 env->vxsat = 0x1;
2661 return INT8_MIN;
2662 } else {
2663 return res;
2664 }
2665 }
2666
2667 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2668 {
2669 uint8_t round;
2670 int32_t res;
2671
2672 res = (int32_t)a * (int32_t)b;
2673 round = get_round(vxrm, res, 15);
2674 res = (res >> 15) + round;
2675
2676 if (res > INT16_MAX) {
2677 env->vxsat = 0x1;
2678 return INT16_MAX;
2679 } else if (res < INT16_MIN) {
2680 env->vxsat = 0x1;
2681 return INT16_MIN;
2682 } else {
2683 return res;
2684 }
2685 }
2686
2687 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2688 {
2689 uint8_t round;
2690 int64_t res;
2691
2692 res = (int64_t)a * (int64_t)b;
2693 round = get_round(vxrm, res, 31);
2694 res = (res >> 31) + round;
2695
2696 if (res > INT32_MAX) {
2697 env->vxsat = 0x1;
2698 return INT32_MAX;
2699 } else if (res < INT32_MIN) {
2700 env->vxsat = 0x1;
2701 return INT32_MIN;
2702 } else {
2703 return res;
2704 }
2705 }
2706
2707 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2708 {
2709 uint8_t round;
2710 uint64_t hi_64, lo_64;
2711 int64_t res;
2712
2713 if (a == INT64_MIN && b == INT64_MIN) {
2714 env->vxsat = 1;
2715 return INT64_MAX;
2716 }
2717
2718 muls64(&lo_64, &hi_64, a, b);
2719 round = get_round(vxrm, lo_64, 63);
2720 /*
2721 * Cannot overflow, as there are always
2722 * 2 sign bits after multiply.
2723 */
2724 res = (hi_64 << 1) | (lo_64 >> 63);
2725 if (round) {
2726 if (res == INT64_MAX) {
2727 env->vxsat = 1;
2728 } else {
2729 res += 1;
2730 }
2731 }
2732 return res;
2733 }
2734
2735 RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2736 RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2737 RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2738 RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2739 GEN_VEXT_VV_RM(vsmul_vv_b, 1)
2740 GEN_VEXT_VV_RM(vsmul_vv_h, 2)
2741 GEN_VEXT_VV_RM(vsmul_vv_w, 4)
2742 GEN_VEXT_VV_RM(vsmul_vv_d, 8)
2743
2744 RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2745 RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2746 RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2747 RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2748 GEN_VEXT_VX_RM(vsmul_vx_b, 1)
2749 GEN_VEXT_VX_RM(vsmul_vx_h, 2)
2750 GEN_VEXT_VX_RM(vsmul_vx_w, 4)
2751 GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2752
2753 /* Vector Single-Width Scaling Shift Instructions */
2754 static inline uint8_t
2755 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2756 {
2757 uint8_t round, shift = b & 0x7;
2758 uint8_t res;
2759
2760 round = get_round(vxrm, a, shift);
2761 res = (a >> shift) + round;
2762 return res;
2763 }
2764 static inline uint16_t
2765 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2766 {
2767 uint8_t round, shift = b & 0xf;
2768
2769 round = get_round(vxrm, a, shift);
2770 return (a >> shift) + round;
2771 }
2772 static inline uint32_t
2773 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2774 {
2775 uint8_t round, shift = b & 0x1f;
2776
2777 round = get_round(vxrm, a, shift);
2778 return (a >> shift) + round;
2779 }
2780 static inline uint64_t
2781 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2782 {
2783 uint8_t round, shift = b & 0x3f;
2784
2785 round = get_round(vxrm, a, shift);
2786 return (a >> shift) + round;
2787 }
2788 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2789 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2790 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2791 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2792 GEN_VEXT_VV_RM(vssrl_vv_b, 1)
2793 GEN_VEXT_VV_RM(vssrl_vv_h, 2)
2794 GEN_VEXT_VV_RM(vssrl_vv_w, 4)
2795 GEN_VEXT_VV_RM(vssrl_vv_d, 8)
2796
2797 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2798 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2799 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2800 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2801 GEN_VEXT_VX_RM(vssrl_vx_b, 1)
2802 GEN_VEXT_VX_RM(vssrl_vx_h, 2)
2803 GEN_VEXT_VX_RM(vssrl_vx_w, 4)
2804 GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2805
2806 static inline int8_t
2807 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2808 {
2809 uint8_t round, shift = b & 0x7;
2810
2811 round = get_round(vxrm, a, shift);
2812 return (a >> shift) + round;
2813 }
2814 static inline int16_t
2815 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2816 {
2817 uint8_t round, shift = b & 0xf;
2818
2819 round = get_round(vxrm, a, shift);
2820 return (a >> shift) + round;
2821 }
2822 static inline int32_t
2823 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2824 {
2825 uint8_t round, shift = b & 0x1f;
2826
2827 round = get_round(vxrm, a, shift);
2828 return (a >> shift) + round;
2829 }
2830 static inline int64_t
2831 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2832 {
2833 uint8_t round, shift = b & 0x3f;
2834
2835 round = get_round(vxrm, a, shift);
2836 return (a >> shift) + round;
2837 }
2838
2839 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2840 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2841 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2842 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2843 GEN_VEXT_VV_RM(vssra_vv_b, 1)
2844 GEN_VEXT_VV_RM(vssra_vv_h, 2)
2845 GEN_VEXT_VV_RM(vssra_vv_w, 4)
2846 GEN_VEXT_VV_RM(vssra_vv_d, 8)
2847
2848 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2849 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2850 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2851 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2852 GEN_VEXT_VX_RM(vssra_vx_b, 1)
2853 GEN_VEXT_VX_RM(vssra_vx_h, 2)
2854 GEN_VEXT_VX_RM(vssra_vx_w, 4)
2855 GEN_VEXT_VX_RM(vssra_vx_d, 8)
2856
2857 /* Vector Narrowing Fixed-Point Clip Instructions */
2858 static inline int8_t
2859 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2860 {
2861 uint8_t round, shift = b & 0xf;
2862 int16_t res;
2863
2864 round = get_round(vxrm, a, shift);
2865 res = (a >> shift) + round;
2866 if (res > INT8_MAX) {
2867 env->vxsat = 0x1;
2868 return INT8_MAX;
2869 } else if (res < INT8_MIN) {
2870 env->vxsat = 0x1;
2871 return INT8_MIN;
2872 } else {
2873 return res;
2874 }
2875 }
2876
2877 static inline int16_t
2878 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2879 {
2880 uint8_t round, shift = b & 0x1f;
2881 int32_t res;
2882
2883 round = get_round(vxrm, a, shift);
2884 res = (a >> shift) + round;
2885 if (res > INT16_MAX) {
2886 env->vxsat = 0x1;
2887 return INT16_MAX;
2888 } else if (res < INT16_MIN) {
2889 env->vxsat = 0x1;
2890 return INT16_MIN;
2891 } else {
2892 return res;
2893 }
2894 }
2895
2896 static inline int32_t
2897 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2898 {
2899 uint8_t round, shift = b & 0x3f;
2900 int64_t res;
2901
2902 round = get_round(vxrm, a, shift);
2903 res = (a >> shift) + round;
2904 if (res > INT32_MAX) {
2905 env->vxsat = 0x1;
2906 return INT32_MAX;
2907 } else if (res < INT32_MIN) {
2908 env->vxsat = 0x1;
2909 return INT32_MIN;
2910 } else {
2911 return res;
2912 }
2913 }
2914
2915 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
2916 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
2917 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
2918 GEN_VEXT_VV_RM(vnclip_wv_b, 1)
2919 GEN_VEXT_VV_RM(vnclip_wv_h, 2)
2920 GEN_VEXT_VV_RM(vnclip_wv_w, 4)
2921
2922 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
2923 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
2924 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
2925 GEN_VEXT_VX_RM(vnclip_wx_b, 1)
2926 GEN_VEXT_VX_RM(vnclip_wx_h, 2)
2927 GEN_VEXT_VX_RM(vnclip_wx_w, 4)
2928
2929 static inline uint8_t
2930 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
2931 {
2932 uint8_t round, shift = b & 0xf;
2933 uint16_t res;
2934
2935 round = get_round(vxrm, a, shift);
2936 res = (a >> shift) + round;
2937 if (res > UINT8_MAX) {
2938 env->vxsat = 0x1;
2939 return UINT8_MAX;
2940 } else {
2941 return res;
2942 }
2943 }
2944
2945 static inline uint16_t
2946 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
2947 {
2948 uint8_t round, shift = b & 0x1f;
2949 uint32_t res;
2950
2951 round = get_round(vxrm, a, shift);
2952 res = (a >> shift) + round;
2953 if (res > UINT16_MAX) {
2954 env->vxsat = 0x1;
2955 return UINT16_MAX;
2956 } else {
2957 return res;
2958 }
2959 }
2960
2961 static inline uint32_t
2962 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
2963 {
2964 uint8_t round, shift = b & 0x3f;
2965 uint64_t res;
2966
2967 round = get_round(vxrm, a, shift);
2968 res = (a >> shift) + round;
2969 if (res > UINT32_MAX) {
2970 env->vxsat = 0x1;
2971 return UINT32_MAX;
2972 } else {
2973 return res;
2974 }
2975 }
2976
2977 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
2978 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
2979 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
2980 GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
2981 GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
2982 GEN_VEXT_VV_RM(vnclipu_wv_w, 4)
2983
2984 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
2985 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
2986 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
2987 GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
2988 GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
2989 GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
2990
/*
 *** Vector Floating-Point Arithmetic Instructions
 */
2994 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
/*
 * OPFVV2 - define do_<NAME>(), the per-element body of a two-operand
 * vector-vector floating-point operation.
 *
 * Element @i of vs1 (element type T1, index mapped through HS1) and of
 * vs2 (element type T2, index mapped through HS2) are loaded as TX1 and
 * TX2 values; OP(s2, s1, &env->fp_status) is then stored to element @i
 * of vd (element type TD, index mapped through HD).  Note that the vs2
 * element is OP's first argument.
 */
#define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
                      CPURISCVState *env)                      \
{                                                              \
    T1 *src1 = vs1;                                            \
    T2 *src2 = vs2;                                            \
    TD *dest = vd;                                             \
    TX1 s1 = src1[HS1(i)];                                     \
    TX2 s2 = src2[HS2(i)];                                     \
                                                               \
    dest[HD(i)] = OP(s2, s1, &env->fp_status);                 \
}
3003
/*
 * GEN_VEXT_VV_ENV - define the TCG helper HELPER(NAME) for a
 * vector-vector operation whose element body do_<NAME>() needs @env.
 *
 * Elements from env->vstart up to (not including) env->vl are visited.
 * When the descriptor's vm field is zero and the v0 mask bit of an
 * element is clear, the element's bytes [i * ESZ, (i + 1) * ESZ) are
 * handed to vext_set_elems_1s() with the vma setting; otherwise
 * do_<NAME>() is run for it.  Afterwards env->vstart is reset to 0 and
 * the bytes from vl * ESZ to total_elems * ESZ are handed to
 * vext_set_elems_1s() with the vta setting.
 *
 * ESZ is the destination element size in bytes.
 */
#define GEN_VEXT_VV_ENV(NAME, ESZ)                              \
void HELPER(NAME)(void *vd, void *v0, void *vs1,                \
                  void *vs2, CPURISCVState *env,                \
                  uint32_t desc)                                \
{                                                               \
    const uint32_t unmasked = vext_vm(desc);                    \
    const uint32_t vl = env->vl;                                \
    const uint32_t total_elems =                                \
        vext_get_total_elems(env, desc, ESZ);                   \
    const uint32_t vta = vext_vta(desc);                        \
    const uint32_t vma = vext_vma(desc);                        \
    uint32_t elem;                                              \
                                                                \
    for (elem = env->vstart; elem < vl; elem++) {               \
        if (unmasked || vext_elem_mask(v0, elem)) {             \
            do_##NAME(vd, vs1, vs2, elem, env);                 \
        } else {                                                \
            /* set masked-off elements to 1s */                 \
            vext_set_elems_1s(vd, vma, elem * ESZ,              \
                              (elem + 1) * ESZ);                \
        }                                                       \
    }                                                           \
    env->vstart = 0;                                            \
    /* set tail elements to 1s */                               \
    vext_set_elems_1s(vd, vta, vl * ESZ,                        \
                      total_elems * ESZ);                       \
}
3031
3032 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
3033 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
3034 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
3035 GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
3036 GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
3037 GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
3038
/*
 * OPFVF2 - define do_<NAME>(), the per-element body of a two-operand
 * vector-scalar floating-point operation.
 *
 * The scalar arrives as a uint64_t and is narrowed with (TX1)(T1)s1.
 * Element @i of vs2 (element type T2, index mapped through HS2) is
 * loaded as a TX2 value; OP(s2, scalar, &env->fp_status) is stored to
 * element @i of vd (element type TD, index mapped through HD).
 */
#define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
                      CPURISCVState *env)                      \
{                                                              \
    T2 *src2 = vs2;                                            \
    TD *dest = vd;                                             \
    TX2 s2 = src2[HS2(i)];                                     \
    TX1 scalar = (TX1)(T1)s1;                                  \
                                                               \
    dest[HD(i)] = OP(s2, scalar, &env->fp_status);             \
}
3046
/*
 * GEN_VEXT_VF - define the TCG helper HELPER(NAME) for a vector-scalar
 * floating-point operation whose element body is do_<NAME>().
 *
 * Elements from env->vstart up to (not including) env->vl are visited.
 * When the descriptor's vm field is zero and the v0 mask bit of an
 * element is clear, the element's bytes [i * ESZ, (i + 1) * ESZ) are
 * handed to vext_set_elems_1s() with the vma setting; otherwise
 * do_<NAME>() is run for it with the scalar @s1.  Afterwards
 * env->vstart is reset to 0 and the bytes from vl * ESZ to
 * total_elems * ESZ are handed to vext_set_elems_1s() with the vta
 * setting.
 *
 * ESZ is the destination element size in bytes.
 */
#define GEN_VEXT_VF(NAME, ESZ)                                  \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1,              \
                  void *vs2, CPURISCVState *env,                \
                  uint32_t desc)                                \
{                                                               \
    const uint32_t unmasked = vext_vm(desc);                    \
    const uint32_t vl = env->vl;                                \
    const uint32_t total_elems =                                \
        vext_get_total_elems(env, desc, ESZ);                   \
    const uint32_t vta = vext_vta(desc);                        \
    const uint32_t vma = vext_vma(desc);                        \
    uint32_t elem;                                              \
                                                                \
    for (elem = env->vstart; elem < vl; elem++) {               \
        if (unmasked || vext_elem_mask(v0, elem)) {             \
            do_##NAME(vd, s1, vs2, elem, env);                  \
        } else {                                                \
            /* set masked-off elements to 1s */                 \
            vext_set_elems_1s(vd, vma, elem * ESZ,              \
                              (elem + 1) * ESZ);                \
        }                                                       \
    }                                                           \
    env->vstart = 0;                                            \
    /* set tail elements to 1s */                               \
    vext_set_elems_1s(vd, vta, vl * ESZ,                        \
                      total_elems * ESZ);                       \
}
3074
3075 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
3076 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
3077 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
3078 GEN_VEXT_VF(vfadd_vf_h, 2)
3079 GEN_VEXT_VF(vfadd_vf_w, 4)
3080 GEN_VEXT_VF(vfadd_vf_d, 8)
3081
3082 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
3083 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
3084 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
3085 GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
3086 GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
3087 GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
3088 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
3089 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
3090 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
3091 GEN_VEXT_VF(vfsub_vf_h, 2)
3092 GEN_VEXT_VF(vfsub_vf_w, 4)
3093 GEN_VEXT_VF(vfsub_vf_d, 8)
3094
3095 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
3096 {
3097 return float16_sub(b, a, s);
3098 }
3099
3100 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
3101 {
3102 return float32_sub(b, a, s);
3103 }
3104
3105 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
3106 {
3107 return float64_sub(b, a, s);
3108 }
3109
3110 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
3111 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
3112 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
3113 GEN_VEXT_VF(vfrsub_vf_h, 2)
3114 GEN_VEXT_VF(vfrsub_vf_w, 4)
3115 GEN_VEXT_VF(vfrsub_vf_d, 8)
3116
3117 /* Vector Widening Floating-Point Add/Subtract Instructions */
3118 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
3119 {
3120 return float32_add(float16_to_float32(a, true, s),
3121 float16_to_float32(b, true, s), s);
3122 }
3123
3124 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
3125 {
3126 return float64_add(float32_to_float64(a, s),
3127 float32_to_float64(b, s), s);
3128
3129 }
3130
3131 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3132 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3133 GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
3134 GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
3135 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3136 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3137 GEN_VEXT_VF(vfwadd_vf_h, 4)
3138 GEN_VEXT_VF(vfwadd_vf_w, 8)
3139
3140 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3141 {
3142 return float32_sub(float16_to_float32(a, true, s),
3143 float16_to_float32(b, true, s), s);
3144 }
3145
3146 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3147 {
3148 return float64_sub(float32_to_float64(a, s),
3149 float32_to_float64(b, s), s);
3150
3151 }
3152
3153 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3154 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3155 GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
3156 GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
3157 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3158 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3159 GEN_VEXT_VF(vfwsub_vf_h, 4)
3160 GEN_VEXT_VF(vfwsub_vf_w, 8)
3161
3162 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3163 {
3164 return float32_add(a, float16_to_float32(b, true, s), s);
3165 }
3166
3167 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3168 {
3169 return float64_add(a, float32_to_float64(b, s), s);
3170 }
3171
3172 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3173 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3174 GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
3175 GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
3176 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3177 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3178 GEN_VEXT_VF(vfwadd_wf_h, 4)
3179 GEN_VEXT_VF(vfwadd_wf_w, 8)
3180
3181 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3182 {
3183 return float32_sub(a, float16_to_float32(b, true, s), s);
3184 }
3185
3186 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3187 {
3188 return float64_sub(a, float32_to_float64(b, s), s);
3189 }
3190
3191 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3192 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3193 GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
3194 GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
3195 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3196 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3197 GEN_VEXT_VF(vfwsub_wf_h, 4)
3198 GEN_VEXT_VF(vfwsub_wf_w, 8)
3199
3200 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3201 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3202 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3203 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3204 GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
3205 GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
3206 GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
3207 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3208 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3209 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3210 GEN_VEXT_VF(vfmul_vf_h, 2)
3211 GEN_VEXT_VF(vfmul_vf_w, 4)
3212 GEN_VEXT_VF(vfmul_vf_d, 8)
3213
3214 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3215 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3216 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3217 GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
3218 GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
3219 GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
3220 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3221 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3222 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3223 GEN_VEXT_VF(vfdiv_vf_h, 2)
3224 GEN_VEXT_VF(vfdiv_vf_w, 4)
3225 GEN_VEXT_VF(vfdiv_vf_d, 8)
3226
3227 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3228 {
3229 return float16_div(b, a, s);
3230 }
3231
3232 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3233 {
3234 return float32_div(b, a, s);
3235 }
3236
3237 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3238 {
3239 return float64_div(b, a, s);
3240 }
3241
3242 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3243 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3244 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3245 GEN_VEXT_VF(vfrdiv_vf_h, 2)
3246 GEN_VEXT_VF(vfrdiv_vf_w, 4)
3247 GEN_VEXT_VF(vfrdiv_vf_d, 8)
3248
3249 /* Vector Widening Floating-Point Multiply */
3250 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3251 {
3252 return float32_mul(float16_to_float32(a, true, s),
3253 float16_to_float32(b, true, s), s);
3254 }
3255
3256 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3257 {
3258 return float64_mul(float32_to_float64(a, s),
3259 float32_to_float64(b, s), s);
3260
3261 }
3262 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3263 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3264 GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
3265 GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
3266 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3267 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3268 GEN_VEXT_VF(vfwmul_vf_h, 4)
3269 GEN_VEXT_VF(vfwmul_vf_w, 8)
3270
3271 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
/*
 * OPFVV3 - define do_<NAME>(), the per-element body of a three-operand
 * vector-vector floating-point operation that also reads vd.
 *
 * Element @i of vs1 and vs2 are loaded as TX1/TX2 values and the
 * current destination element is loaded as a TD value; the destination
 * element is then overwritten with OP(s2, s1, d, &env->fp_status).
 */
#define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)   \
static void do_##NAME(void *vd, void *vs1, void *vs2, int i,   \
                      CPURISCVState *env)                      \
{                                                              \
    T1 *src1 = vs1;                                            \
    T2 *src2 = vs2;                                            \
    TD *slot = (TD *)vd + HD(i);                               \
    TX1 s1 = src1[HS1(i)];                                     \
    TX2 s2 = src2[HS2(i)];                                     \
    TD d = *slot;                                              \
                                                               \
    *slot = OP(s2, s1, d, &env->fp_status);                    \
}
3281
3282 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3283 {
3284 return float16_muladd(a, b, d, 0, s);
3285 }
3286
3287 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3288 {
3289 return float32_muladd(a, b, d, 0, s);
3290 }
3291
3292 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3293 {
3294 return float64_muladd(a, b, d, 0, s);
3295 }
3296
3297 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3298 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3299 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3300 GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
3301 GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
3302 GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3303
/*
 * OPFVF3 - define do_<NAME>(), the per-element body of a three-operand
 * vector-scalar floating-point operation that also reads vd.
 *
 * The scalar arrives as a uint64_t and is narrowed with (TX1)(T1)s1.
 * Element @i of vs2 is loaded as a TX2 value and the current
 * destination element as a TD value; the destination element is then
 * overwritten with OP(s2, scalar, d, &env->fp_status).
 */
#define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP)        \
static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
                      CPURISCVState *env)                      \
{                                                              \
    T2 *src2 = vs2;                                            \
    TD *slot = (TD *)vd + HD(i);                               \
    TX2 s2 = src2[HS2(i)];                                     \
    TD d = *slot;                                              \
    TX1 scalar = (TX1)(T1)s1;                                  \
                                                               \
    *slot = OP(s2, scalar, d, &env->fp_status);                \
}
3312
3313 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3314 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3315 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3316 GEN_VEXT_VF(vfmacc_vf_h, 2)
3317 GEN_VEXT_VF(vfmacc_vf_w, 4)
3318 GEN_VEXT_VF(vfmacc_vf_d, 8)
3319
3320 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3321 {
3322 return float16_muladd(a, b, d,
3323 float_muladd_negate_c | float_muladd_negate_product, s);
3324 }
3325
3326 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3327 {
3328 return float32_muladd(a, b, d,
3329 float_muladd_negate_c | float_muladd_negate_product, s);
3330 }
3331
3332 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3333 {
3334 return float64_muladd(a, b, d,
3335 float_muladd_negate_c | float_muladd_negate_product, s);
3336 }
3337
3338 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3339 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3340 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3341 GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
3342 GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
3343 GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
3344 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3345 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3346 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3347 GEN_VEXT_VF(vfnmacc_vf_h, 2)
3348 GEN_VEXT_VF(vfnmacc_vf_w, 4)
3349 GEN_VEXT_VF(vfnmacc_vf_d, 8)
3350
3351 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3352 {
3353 return float16_muladd(a, b, d, float_muladd_negate_c, s);
3354 }
3355
3356 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3357 {
3358 return float32_muladd(a, b, d, float_muladd_negate_c, s);
3359 }
3360
3361 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3362 {
3363 return float64_muladd(a, b, d, float_muladd_negate_c, s);
3364 }
3365
3366 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3367 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3368 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3369 GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
3370 GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
3371 GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
3372 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3373 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3374 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3375 GEN_VEXT_VF(vfmsac_vf_h, 2)
3376 GEN_VEXT_VF(vfmsac_vf_w, 4)
3377 GEN_VEXT_VF(vfmsac_vf_d, 8)
3378
3379 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3380 {
3381 return float16_muladd(a, b, d, float_muladd_negate_product, s);
3382 }
3383
3384 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3385 {
3386 return float32_muladd(a, b, d, float_muladd_negate_product, s);
3387 }
3388
3389 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3390 {
3391 return float64_muladd(a, b, d, float_muladd_negate_product, s);
3392 }
3393
3394 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3395 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3396 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3397 GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
3398 GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
3399 GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
3400 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3401 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3402 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3403 GEN_VEXT_VF(vfnmsac_vf_h, 2)
3404 GEN_VEXT_VF(vfnmsac_vf_w, 4)
3405 GEN_VEXT_VF(vfnmsac_vf_d, 8)
3406
3407 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3408 {
3409 return float16_muladd(d, b, a, 0, s);
3410 }
3411
3412 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3413 {
3414 return float32_muladd(d, b, a, 0, s);
3415 }
3416
3417 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3418 {
3419 return float64_muladd(d, b, a, 0, s);
3420 }
3421
3422 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3423 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3424 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3425 GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
3426 GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
3427 GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
3428 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3429 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3430 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3431 GEN_VEXT_VF(vfmadd_vf_h, 2)
3432 GEN_VEXT_VF(vfmadd_vf_w, 4)
3433 GEN_VEXT_VF(vfmadd_vf_d, 8)
3434
3435 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3436 {
3437 return float16_muladd(d, b, a,
3438 float_muladd_negate_c | float_muladd_negate_product, s);
3439 }
3440
3441 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3442 {
3443 return float32_muladd(d, b, a,
3444 float_muladd_negate_c | float_muladd_negate_product, s);
3445 }
3446
3447 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3448 {
3449 return float64_muladd(d, b, a,
3450 float_muladd_negate_c | float_muladd_negate_product, s);
3451 }
3452
3453 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3454 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3455 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3456 GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
3457 GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
3458 GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
3459 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3460 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3461 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3462 GEN_VEXT_VF(vfnmadd_vf_h, 2)
3463 GEN_VEXT_VF(vfnmadd_vf_w, 4)
3464 GEN_VEXT_VF(vfnmadd_vf_d, 8)
3465
3466 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3467 {
3468 return float16_muladd(d, b, a, float_muladd_negate_c, s);
3469 }
3470
3471 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3472 {
3473 return float32_muladd(d, b, a, float_muladd_negate_c, s);
3474 }
3475
3476 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3477 {
3478 return float64_muladd(d, b, a, float_muladd_negate_c, s);
3479 }
3480
3481 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3482 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3483 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3484 GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
3485 GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
3486 GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
3487 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3488 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3489 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3490 GEN_VEXT_VF(vfmsub_vf_h, 2)
3491 GEN_VEXT_VF(vfmsub_vf_w, 4)
3492 GEN_VEXT_VF(vfmsub_vf_d, 8)
3493
3494 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3495 {
3496 return float16_muladd(d, b, a, float_muladd_negate_product, s);
3497 }
3498
3499 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3500 {
3501 return float32_muladd(d, b, a, float_muladd_negate_product, s);
3502 }
3503
3504 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3505 {
3506 return float64_muladd(d, b, a, float_muladd_negate_product, s);
3507 }
3508
3509 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3510 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3511 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3512 GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
3513 GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
3514 GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
3515 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3516 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3517 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3518 GEN_VEXT_VF(vfnmsub_vf_h, 2)
3519 GEN_VEXT_VF(vfnmsub_vf_w, 4)
3520 GEN_VEXT_VF(vfnmsub_vf_d, 8)
3521
3522 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3523 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3524 {
3525 return float32_muladd(float16_to_float32(a, true, s),
3526 float16_to_float32(b, true, s), d, 0, s);
3527 }
3528
3529 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3530 {
3531 return float64_muladd(float32_to_float64(a, s),
3532 float32_to_float64(b, s), d, 0, s);
3533 }
3534
3535 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3536 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3537 GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
3538 GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
3539 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3540 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3541 GEN_VEXT_VF(vfwmacc_vf_h, 4)
3542 GEN_VEXT_VF(vfwmacc_vf_w, 8)
3543
3544 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3545 {
3546 return float32_muladd(float16_to_float32(a, true, s),
3547 float16_to_float32(b, true, s), d,
3548 float_muladd_negate_c | float_muladd_negate_product, s);
3549 }
3550
3551 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3552 {
3553 return float64_muladd(float32_to_float64(a, s),
3554 float32_to_float64(b, s), d,
3555 float_muladd_negate_c | float_muladd_negate_product, s);
3556 }
3557
3558 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3559 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3560 GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
3561 GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
3562 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3563 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3564 GEN_VEXT_VF(vfwnmacc_vf_h, 4)
3565 GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3566
3567 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3568 {
3569 return float32_muladd(float16_to_float32(a, true, s),
3570 float16_to_float32(b, true, s), d,
3571 float_muladd_negate_c, s);
3572 }
3573
3574 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3575 {
3576 return float64_muladd(float32_to_float64(a, s),
3577 float32_to_float64(b, s), d,
3578 float_muladd_negate_c, s);
3579 }
3580
3581 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3582 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3583 GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
3584 GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
3585 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3586 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3587 GEN_VEXT_VF(vfwmsac_vf_h, 4)
3588 GEN_VEXT_VF(vfwmsac_vf_w, 8)
3589
3590 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3591 {
3592 return float32_muladd(float16_to_float32(a, true, s),
3593 float16_to_float32(b, true, s), d,
3594 float_muladd_negate_product, s);
3595 }
3596
3597 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3598 {
3599 return float64_muladd(float32_to_float64(a, s),
3600 float32_to_float64(b, s), d,
3601 float_muladd_negate_product, s);
3602 }
3603
3604 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3605 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3606 GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
3607 GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
3608 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3609 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3610 GEN_VEXT_VF(vfwnmsac_vf_h, 4)
3611 GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3612
3613 /* Vector Floating-Point Square-Root Instruction */
/*
 * Type triples (TD, T2, TX2) for single-width unary operations:
 * destination type, source type as stored, source type as passed to the op.
 */
#define OP_UU_H     uint16_t, uint16_t, uint16_t
#define OP_UU_W     uint32_t, uint32_t, uint32_t
#define OP_UU_D     uint64_t, uint64_t, uint64_t
3618
/*
 * Define do_<NAME>(): the per-element worker of a unary floating-point op.
 * Element i of vs2 is read as T2 (index adjusted by HS2), held as TX2,
 * passed to OP together with env->fp_status, and the result is stored as TD
 * into element i of vd (index adjusted by HD).
 */
#define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP)             \
static void do_##NAME(void *vd, void *vs2, int i,          \
                      CPURISCVState *env)                  \
{                                                          \
    T2 *src = (T2 *)vs2 + HS2(i);                          \
    TD *dst = (TD *)vd + HD(i);                            \
    TX2 s2 = *src;                                         \
                                                           \
    *dst = OP(s2, &env->fp_status);                        \
}
3626
/*
 * Define HELPER(NAME) for a unary vector op whose worker do_<NAME>() takes
 * the CPU state.  ESZ scales element indices into the offsets passed to
 * vext_set_elems_1s().
 *
 * Elements in [vstart, vl) are visited: an element is processed by
 * do_<NAME>() when vm is set or its v0 mask bit is set; otherwise its span
 * is handed to vext_set_elems_1s() together with vma.  Afterwards vstart is
 * cleared and the span [vl, total_elems) is handed to vext_set_elems_1s()
 * together with vta.  When vl is zero the helper returns without touching
 * anything, vstart included.
 */
#define GEN_VEXT_V_ENV(NAME, ESZ)                                  \
void HELPER(NAME)(void *vd, void *v0, void *vs2,                   \
                  CPURISCVState *env, uint32_t desc)               \
{                                                                  \
    uint32_t vm = vext_vm(desc);                                   \
    uint32_t vl = env->vl;                                         \
    uint32_t total_elems = vext_get_total_elems(env, desc, ESZ);   \
    uint32_t vta = vext_vta(desc);                                 \
    uint32_t vma = vext_vma(desc);                                 \
    uint32_t idx;                                                  \
                                                                   \
    if (vl == 0) {                                                 \
        return;                                                    \
    }                                                              \
    for (idx = env->vstart; idx < vl; idx++) {                     \
        if (vm || vext_elem_mask(v0, idx)) {                       \
            do_##NAME(vd, vs2, idx, env);                          \
        } else {                                                   \
            /* masked-off element: set to 1s (subject to vma) */   \
            vext_set_elems_1s(vd, vma, idx * ESZ,                  \
                              (idx + 1) * ESZ);                    \
        }                                                          \
    }                                                              \
    env->vstart = 0;                                               \
    /* tail elements: set to 1s (subject to vta) */                \
    vext_set_elems_1s(vd, vta, vl * ESZ,                           \
                      total_elems * ESZ);                          \
}
3655
3656 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3657 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3658 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3659 GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
3660 GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
3661 GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3662
/*
 * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
 *
 * Adapted from riscv-v-spec recip.c:
 * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
 *
 * Core of the 7-bit estimate, working on the raw encoding.
 *
 * @f:         raw bits of the operand (sign, exponent, fraction)
 * @exp_size:  width of the exponent field in bits
 * @frac_size: width of the fraction field in bits
 *
 * Returns the raw bits of the estimate in the same format.  The callers in
 * this file (frsqrt7_h/_s/_d) filter out NaN, zero, infinity and negative
 * operands before calling, so only positive normal and subnormal values
 * reach this function.
 */
static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
{
    /*
     * Indexed by { exp[0], frac[top 6 bits] }.  Kept in static storage so
     * the table is not rebuilt on every call.
     */
    static const uint8_t lookup_table[] = {
        52, 51, 50, 48, 47, 46, 44, 43,
        42, 41, 40, 39, 38, 36, 35, 34,
        33, 32, 31, 30, 30, 29, 28, 27,
        26, 25, 24, 23, 23, 22, 21, 20,
        19, 19, 18, 17, 16, 16, 15, 14,
        14, 13, 12, 12, 11, 10, 10, 9,
        9, 8, 7, 7, 6, 6, 5, 4,
        4, 3, 3, 2, 2, 1, 1, 0,
        127, 125, 123, 121, 119, 118, 116, 114,
        113, 111, 109, 108, 106, 105, 103, 102,
        100, 99, 97, 96, 95, 93, 92, 91,
        90, 88, 87, 86, 85, 84, 83, 82,
        80, 79, 78, 77, 76, 75, 74, 73,
        72, 71, 70, 70, 69, 68, 67, 66,
        65, 64, 63, 63, 62, 61, 60, 59,
        59, 58, 57, 56, 56, 55, 54, 53
    };
    const int precision = 7;

    uint64_t sign = extract64(f, frac_size + exp_size, 1);
    uint64_t exp = extract64(f, frac_size, exp_size);
    uint64_t frac = extract64(f, 0, frac_size);

    if (exp == 0 && frac != 0) { /* subnormal */
        /*
         * Normalize the subnormal: shift until the top fraction bit is set.
         * exp is unsigned and starts at 0, so it wraps below zero here; the
         * (exp & 1) and ~exp uses further down rely on that wrap-around.
         */
        while (extract64(frac, frac_size - 1, 1) == 0) {
            exp--;
            frac <<= 1;
        }

        /* Drop the now-implicit leading one. */
        frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
    }

    int idx = ((exp & 1) << (precision - 1)) |
              (frac >> (frac_size - precision + 1));
    uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
                        (frac_size - precision);
    uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;

    uint64_t val = 0;
    val = deposit64(val, 0, frac_size, out_frac);
    val = deposit64(val, frac_size, exp_size, out_exp);
    val = deposit64(val, frac_size + exp_size, 1, sign);
    return val;
}
3717
3718 static float16 frsqrt7_h(float16 f, float_status *s)
3719 {
3720 int exp_size = 5, frac_size = 10;
3721 bool sign = float16_is_neg(f);
3722
3723 /*
3724 * frsqrt7(sNaN) = canonical NaN
3725 * frsqrt7(-inf) = canonical NaN
3726 * frsqrt7(-normal) = canonical NaN
3727 * frsqrt7(-subnormal) = canonical NaN
3728 */
3729 if (float16_is_signaling_nan(f, s) ||
3730 (float16_is_infinity(f) && sign) ||
3731 (float16_is_normal(f) && sign) ||
3732 (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3733 s->float_exception_flags |= float_flag_invalid;
3734 return float16_default_nan(s);
3735 }
3736
3737 /* frsqrt7(qNaN) = canonical NaN */
3738 if (float16_is_quiet_nan(f, s)) {
3739 return float16_default_nan(s);
3740 }
3741
3742 /* frsqrt7(+-0) = +-inf */
3743 if (float16_is_zero(f)) {
3744 s->float_exception_flags |= float_flag_divbyzero;
3745 return float16_set_sign(float16_infinity, sign);
3746 }
3747
3748 /* frsqrt7(+inf) = +0 */
3749 if (float16_is_infinity(f) && !sign) {
3750 return float16_set_sign(float16_zero, sign);
3751 }
3752
3753 /* +normal, +subnormal */
3754 uint64_t val = frsqrt7(f, exp_size, frac_size);
3755 return make_float16(val);
3756 }
3757
3758 static float32 frsqrt7_s(float32 f, float_status *s)
3759 {
3760 int exp_size = 8, frac_size = 23;
3761 bool sign = float32_is_neg(f);
3762
3763 /*
3764 * frsqrt7(sNaN) = canonical NaN
3765 * frsqrt7(-inf) = canonical NaN
3766 * frsqrt7(-normal) = canonical NaN
3767 * frsqrt7(-subnormal) = canonical NaN
3768 */
3769 if (float32_is_signaling_nan(f, s) ||
3770 (float32_is_infinity(f) && sign) ||
3771 (float32_is_normal(f) && sign) ||
3772 (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3773 s->float_exception_flags |= float_flag_invalid;
3774 return float32_default_nan(s);
3775 }
3776
3777 /* frsqrt7(qNaN) = canonical NaN */
3778 if (float32_is_quiet_nan(f, s)) {
3779 return float32_default_nan(s);
3780 }
3781
3782 /* frsqrt7(+-0) = +-inf */
3783 if (float32_is_zero(f)) {
3784 s->float_exception_flags |= float_flag_divbyzero;
3785 return float32_set_sign(float32_infinity, sign);
3786 }
3787
3788 /* frsqrt7(+inf) = +0 */
3789 if (float32_is_infinity(f) && !sign) {
3790 return float32_set_sign(float32_zero, sign);
3791 }
3792
3793 /* +normal, +subnormal */
3794 uint64_t val = frsqrt7(f, exp_size, frac_size);
3795 return make_float32(val);
3796 }
3797
3798 static float64 frsqrt7_d(float64 f, float_status *s)
3799 {
3800 int exp_size = 11, frac_size = 52;
3801 bool sign = float64_is_neg(f);
3802
3803 /*
3804 * frsqrt7(sNaN) = canonical NaN
3805 * frsqrt7(-inf) = canonical NaN
3806 * frsqrt7(-normal) = canonical NaN
3807 * frsqrt7(-subnormal) = canonical NaN
3808 */
3809 if (float64_is_signaling_nan(f, s) ||
3810 (float64_is_infinity(f) && sign) ||
3811 (float64_is_normal(f) && sign) ||
3812 (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3813 s->float_exception_flags |= float_flag_invalid;
3814 return float64_default_nan(s);
3815 }
3816
3817 /* frsqrt7(qNaN) = canonical NaN */
3818 if (float64_is_quiet_nan(f, s)) {
3819 return float64_default_nan(s);
3820 }
3821
3822 /* frsqrt7(+-0) = +-inf */
3823 if (float64_is_zero(f)) {
3824 s->float_exception_flags |= float_flag_divbyzero;
3825 return float64_set_sign(float64_infinity, sign);
3826 }
3827
3828 /* frsqrt7(+inf) = +0 */
3829 if (float64_is_infinity(f) && !sign) {
3830 return float64_set_sign(float64_zero, sign);
3831 }
3832
3833 /* +normal, +subnormal */
3834 uint64_t val = frsqrt7(f, exp_size, frac_size);
3835 return make_float64(val);
3836 }
3837
3838 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3839 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3840 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3841 GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
3842 GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
3843 GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3844
3845 /*
3846 * Vector Floating-Point Reciprocal Estimate Instruction
3847 *
3848 * Adapted from riscv-v-spec recip.c:
3849 * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3850 */
3851 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3852 float_status *s)
3853 {
3854 uint64_t sign = extract64(f, frac_size + exp_size, 1);
3855 uint64_t exp = extract64(f, frac_size, exp_size);
3856 uint64_t frac = extract64(f, 0, frac_size);
3857
3858 const uint8_t lookup_table[] = {
3859 127, 125, 123, 121, 119, 117, 116, 114,
3860 112, 110, 109, 107, 105, 104, 102, 100,
3861 99, 97, 96, 94, 93, 91, 90, 88,
3862 87, 85, 84, 83, 81, 80, 79, 77,
3863 76, 75, 74, 72, 71, 70, 69, 68,
3864 66, 65, 64, 63, 62, 61, 60, 59,
3865 58, 57, 56, 55, 54, 53, 52, 51,
3866 50, 49, 48, 47, 46, 45, 44, 43,
3867 42, 41, 40, 40, 39, 38, 37, 36,
3868 35, 35, 34, 33, 32, 31, 31, 30,
3869 29, 28, 28, 27, 26, 25, 25, 24,
3870 23, 23, 22, 21, 21, 20, 19, 19,
3871 18, 17, 17, 16, 15, 15, 14, 14,
3872 13, 12, 12, 11, 11, 10, 9, 9,
3873 8, 8, 7, 7, 6, 5, 5, 4,
3874 4, 3, 3, 2, 2, 1, 1, 0
3875 };
3876 const int precision = 7;
3877
3878 if (exp == 0 && frac != 0) { /* subnormal */
3879 /* Normalize the subnormal. */
3880 while (extract64(frac, frac_size - 1, 1) == 0) {
3881 exp--;
3882 frac <<= 1;
3883 }
3884
3885 frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3886
3887 if (exp != 0 && exp != UINT64_MAX) {
3888 /*
3889 * Overflow to inf or max value of same sign,
3890 * depending on sign and rounding mode.
3891 */
3892 s->float_exception_flags |= (float_flag_inexact |
3893 float_flag_overflow);
3894
3895 if ((s->float_rounding_mode == float_round_to_zero) ||
3896 ((s->float_rounding_mode == float_round_down) && !sign) ||
3897 ((s->float_rounding_mode == float_round_up) && sign)) {
3898 /* Return greatest/negative finite value. */
3899 return (sign << (exp_size + frac_size)) |
3900 (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
3901 } else {
3902 /* Return +-inf. */
3903 return (sign << (exp_size + frac_size)) |
3904 MAKE_64BIT_MASK(frac_size, exp_size);
3905 }
3906 }
3907 }
3908
3909 int idx = frac >> (frac_size - precision);
3910 uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3911 (frac_size - precision);
3912 uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
3913
3914 if (out_exp == 0 || out_exp == UINT64_MAX) {
3915 /*
3916 * The result is subnormal, but don't raise the underflow exception,
3917 * because there's no additional loss of precision.
3918 */
3919 out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
3920 if (out_exp == UINT64_MAX) {
3921 out_frac >>= 1;
3922 out_exp = 0;
3923 }
3924 }
3925
3926 uint64_t val = 0;
3927 val = deposit64(val, 0, frac_size, out_frac);
3928 val = deposit64(val, frac_size, exp_size, out_exp);
3929 val = deposit64(val, frac_size + exp_size, 1, sign);
3930 return val;
3931 }
3932
3933 static float16 frec7_h(float16 f, float_status *s)
3934 {
3935 int exp_size = 5, frac_size = 10;
3936 bool sign = float16_is_neg(f);
3937
3938 /* frec7(+-inf) = +-0 */
3939 if (float16_is_infinity(f)) {
3940 return float16_set_sign(float16_zero, sign);
3941 }
3942
3943 /* frec7(+-0) = +-inf */
3944 if (float16_is_zero(f)) {
3945 s->float_exception_flags |= float_flag_divbyzero;
3946 return float16_set_sign(float16_infinity, sign);
3947 }
3948
3949 /* frec7(sNaN) = canonical NaN */
3950 if (float16_is_signaling_nan(f, s)) {
3951 s->float_exception_flags |= float_flag_invalid;
3952 return float16_default_nan(s);
3953 }
3954
3955 /* frec7(qNaN) = canonical NaN */
3956 if (float16_is_quiet_nan(f, s)) {
3957 return float16_default_nan(s);
3958 }
3959
3960 /* +-normal, +-subnormal */
3961 uint64_t val = frec7(f, exp_size, frac_size, s);
3962 return make_float16(val);
3963 }
3964
3965 static float32 frec7_s(float32 f, float_status *s)
3966 {
3967 int exp_size = 8, frac_size = 23;
3968 bool sign = float32_is_neg(f);
3969
3970 /* frec7(+-inf) = +-0 */
3971 if (float32_is_infinity(f)) {
3972 return float32_set_sign(float32_zero, sign);
3973 }
3974
3975 /* frec7(+-0) = +-inf */
3976 if (float32_is_zero(f)) {
3977 s->float_exception_flags |= float_flag_divbyzero;
3978 return float32_set_sign(float32_infinity, sign);
3979 }
3980
3981 /* frec7(sNaN) = canonical NaN */
3982 if (float32_is_signaling_nan(f, s)) {
3983 s->float_exception_flags |= float_flag_invalid;
3984 return float32_default_nan(s);
3985 }
3986
3987 /* frec7(qNaN) = canonical NaN */
3988 if (float32_is_quiet_nan(f, s)) {
3989 return float32_default_nan(s);
3990 }
3991
3992 /* +-normal, +-subnormal */
3993 uint64_t val = frec7(f, exp_size, frac_size, s);
3994 return make_float32(val);
3995 }
3996
3997 static float64 frec7_d(float64 f, float_status *s)
3998 {
3999 int exp_size = 11, frac_size = 52;
4000 bool sign = float64_is_neg(f);
4001
4002 /* frec7(+-inf) = +-0 */
4003 if (float64_is_infinity(f)) {
4004 return float64_set_sign(float64_zero, sign);
4005 }
4006
4007 /* frec7(+-0) = +-inf */
4008 if (float64_is_zero(f)) {
4009 s->float_exception_flags |= float_flag_divbyzero;
4010 return float64_set_sign(float64_infinity, sign);
4011 }
4012
4013 /* frec7(sNaN) = canonical NaN */
4014 if (float64_is_signaling_nan(f, s)) {
4015 s->float_exception_flags |= float_flag_invalid;
4016 return float64_default_nan(s);
4017 }
4018
4019 /* frec7(qNaN) = canonical NaN */
4020 if (float64_is_quiet_nan(f, s)) {
4021 return float64_default_nan(s);
4022 }
4023
4024 /* +-normal, +-subnormal */
4025 uint64_t val = frec7(f, exp_size, frac_size, s);
4026 return make_float64(val);
4027 }
4028
4029 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
4030 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
4031 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
4032 GEN_VEXT_V_ENV(vfrec7_v_h, 2)
4033 GEN_VEXT_V_ENV(vfrec7_v_w, 4)
4034 GEN_VEXT_V_ENV(vfrec7_v_d, 8)
4035
4036 /* Vector Floating-Point MIN/MAX Instructions */
4037 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
4038 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
4039 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
4040 GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
4041 GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
4042 GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
4043 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
4044 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
4045 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
4046 GEN_VEXT_VF(vfmin_vf_h, 2)
4047 GEN_VEXT_VF(vfmin_vf_w, 4)
4048 GEN_VEXT_VF(vfmin_vf_d, 8)
4049
4050 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
4051 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
4052 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
4053 GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
4054 GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
4055 GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
4056 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
4057 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
4058 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
4059 GEN_VEXT_VF(vfmax_vf_h, 2)
4060 GEN_VEXT_VF(vfmax_vf_w, 4)
4061 GEN_VEXT_VF(vfmax_vf_d, 8)
4062
4063 /* Vector Floating-Point Sign-Injection Instructions */
4064 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
4065 {
4066 return deposit64(b, 0, 15, a);
4067 }
4068
4069 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
4070 {
4071 return deposit64(b, 0, 31, a);
4072 }
4073
4074 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
4075 {
4076 return deposit64(b, 0, 63, a);
4077 }
4078
4079 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
4080 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
4081 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
4082 GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
4083 GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
4084 GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
4085 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
4086 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
4087 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
4088 GEN_VEXT_VF(vfsgnj_vf_h, 2)
4089 GEN_VEXT_VF(vfsgnj_vf_w, 4)
4090 GEN_VEXT_VF(vfsgnj_vf_d, 8)
4091
4092 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
4093 {
4094 return deposit64(~b, 0, 15, a);
4095 }
4096
4097 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
4098 {
4099 return deposit64(~b, 0, 31, a);
4100 }
4101
4102 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
4103 {
4104 return deposit64(~b, 0, 63, a);
4105 }
4106
4107 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
4108 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
4109 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
4110 GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
4111 GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
4112 GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
4113 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
4114 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
4115 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
4116 GEN_VEXT_VF(vfsgnjn_vf_h, 2)
4117 GEN_VEXT_VF(vfsgnjn_vf_w, 4)
4118 GEN_VEXT_VF(vfsgnjn_vf_d, 8)
4119
4120 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4121 {
4122 return deposit64(b ^ a, 0, 15, a);
4123 }
4124
4125 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4126 {
4127 return deposit64(b ^ a, 0, 31, a);
4128 }
4129
4130 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4131 {
4132 return deposit64(b ^ a, 0, 63, a);
4133 }
4134
4135 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
4136 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
4137 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
4138 GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
4139 GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
4140 GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
4141 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
4142 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
4143 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
4144 GEN_VEXT_VF(vfsgnjx_vf_h, 2)
4145 GEN_VEXT_VF(vfsgnjx_vf_w, 4)
4146 GEN_VEXT_VF(vfsgnjx_vf_d, 8)
4147
4148 /* Vector Floating-Point Compare Instructions */
/*
 * Define HELPER(NAME) for a vector-vector floating-point compare that
 * writes a mask register.
 *
 * For each element in [vstart, vl): when vm is set or the element's v0 mask
 * bit is set, mask bit i of vd receives DO_OP(vs2[i], vs1[i], fp_status);
 * otherwise the bit is set to 1 if vma is set and left alone if not.
 * vstart is then cleared and, when vta_all_1s is set, the remaining bits up
 * to riscv_cpu_cfg(env)->vlen are set to 1.
 */
#define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP)                \
void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2,       \
                  CPURISCVState *env, uint32_t desc)              \
{                                                                 \
    uint32_t vm = vext_vm(desc);                                  \
    uint32_t vl = env->vl;                                        \
    uint32_t total_elems = riscv_cpu_cfg(env)->vlen;              \
    uint32_t vta_all_1s = vext_vta_all_1s(desc);                  \
    uint32_t vma = vext_vma(desc);                                \
    uint32_t pos;                                                 \
                                                                  \
    for (pos = env->vstart; pos < vl; pos++) {                    \
        ETYPE s1 = *((ETYPE *)vs1 + H(pos));                      \
        ETYPE s2 = *((ETYPE *)vs2 + H(pos));                      \
                                                                  \
        if (vm || vext_elem_mask(v0, pos)) {                      \
            vext_set_elem_mask(vd, pos,                           \
                               DO_OP(s2, s1, &env->fp_status));   \
        } else if (vma) {                                         \
            /* masked-off element: set to 1 */                    \
            vext_set_elem_mask(vd, pos, 1);                       \
        }                                                         \
    }                                                             \
    env->vstart = 0;                                              \
    /* mask destination registers are always tail-agnostic: */    \
    /* set the tail elements to 1s when requested           */    \
    for (; vta_all_1s && pos < total_elems; pos++) {              \
        vext_set_elem_mask(vd, pos, 1);                           \
    }                                                             \
}
4182
4183 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4184 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4185 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4186
/*
 * Define HELPER(NAME) for a vector-scalar floating-point compare that
 * writes a mask register.
 *
 * For each element in [vstart, vl): when vm is set or the element's v0 mask
 * bit is set, mask bit i of vd receives
 * DO_OP(vs2[i], (ETYPE)s1, fp_status); otherwise the bit is set to 1 if vma
 * is set and left alone if not.  vstart is then cleared and, when
 * vta_all_1s is set, the remaining bits up to riscv_cpu_cfg(env)->vlen are
 * set to 1.
 */
#define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP)                      \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
                  CPURISCVState *env, uint32_t desc)                \
{                                                                   \
    uint32_t vm = vext_vm(desc);                                    \
    uint32_t vl = env->vl;                                          \
    uint32_t total_elems = riscv_cpu_cfg(env)->vlen;                \
    uint32_t vta_all_1s = vext_vta_all_1s(desc);                    \
    uint32_t vma = vext_vma(desc);                                  \
    uint32_t pos;                                                   \
                                                                    \
    for (pos = env->vstart; pos < vl; pos++) {                      \
        ETYPE s2 = *((ETYPE *)vs2 + H(pos));                        \
                                                                    \
        if (vm || vext_elem_mask(v0, pos)) {                        \
            vext_set_elem_mask(vd, pos,                             \
                               DO_OP(s2, (ETYPE)s1,                 \
                                     &env->fp_status));             \
        } else if (vma) {                                           \
            /* masked-off element: set to 1 */                      \
            vext_set_elem_mask(vd, pos, 1);                         \
        }                                                           \
    }                                                               \
    env->vstart = 0;                                                \
    /* mask destination registers are always tail-agnostic: */      \
    /* set the tail elements to 1s when requested           */      \
    for (; vta_all_1s && pos < total_elems; pos++) {                \
        vext_set_elem_mask(vd, pos, 1);                             \
    }                                                               \
}
4219
4220 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4221 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4222 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4223
4224 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4225 {
4226 FloatRelation compare = float16_compare_quiet(a, b, s);
4227 return compare != float_relation_equal;
4228 }
4229
4230 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4231 {
4232 FloatRelation compare = float32_compare_quiet(a, b, s);
4233 return compare != float_relation_equal;
4234 }
4235
4236 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4237 {
4238 FloatRelation compare = float64_compare_quiet(a, b, s);
4239 return compare != float_relation_equal;
4240 }
4241
4242 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4243 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4244 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4245 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4246 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4247 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
4248
4249 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4250 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4251 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4252 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4253 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4254 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4255
4256 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4257 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4258 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4259 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4260 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4261 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4262
4263 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4264 {
4265 FloatRelation compare = float16_compare(a, b, s);
4266 return compare == float_relation_greater;
4267 }
4268
4269 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4270 {
4271 FloatRelation compare = float32_compare(a, b, s);
4272 return compare == float_relation_greater;
4273 }
4274
4275 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4276 {
4277 FloatRelation compare = float64_compare(a, b, s);
4278 return compare == float_relation_greater;
4279 }
4280
4281 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4282 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4283 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4284
4285 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4286 {
4287 FloatRelation compare = float16_compare(a, b, s);
4288 return compare == float_relation_greater ||
4289 compare == float_relation_equal;
4290 }
4291
4292 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4293 {
4294 FloatRelation compare = float32_compare(a, b, s);
4295 return compare == float_relation_greater ||
4296 compare == float_relation_equal;
4297 }
4298
4299 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4300 {
4301 FloatRelation compare = float64_compare(a, b, s);
4302 return compare == float_relation_greater ||
4303 compare == float_relation_equal;
4304 }
4305
4306 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4307 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4308 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
4309
4310 /* Vector Floating-Point Classify Instruction */
/*
 * Define do_<NAME>(): the per-element worker of a unary op that needs no
 * CPU state.  Element i of vs2 is read as T2 (index adjusted by HS2), held
 * as TX2, passed to OP, and the result is stored as TD into element i of vd
 * (index adjusted by HD).
 */
#define OPIVV1(NAME, TD, T2, TX2, HD, HS2, OP)         \
static void do_##NAME(void *vd, void *vs2, int i)      \
{                                                      \
    T2 *src = (T2 *)vs2 + HS2(i);                      \
    TD *dst = (TD *)vd + HD(i);                        \
    TX2 s2 = *src;                                     \
                                                       \
    *dst = OP(s2);                                     \
}
4317
/*
 * Define HELPER(NAME) for a unary vector op whose worker do_<NAME>() does
 * not take the CPU state.  ESZ scales element indices into the offsets
 * passed to vext_set_elems_1s().
 *
 * Elements in [vstart, vl) are visited: an element is processed by
 * do_<NAME>() when vm is set or its v0 mask bit is set; otherwise its span
 * is handed to vext_set_elems_1s() together with vma.  Afterwards vstart is
 * cleared and the span [vl, total_elems) is handed to vext_set_elems_1s()
 * together with vta.
 */
#define GEN_VEXT_V(NAME, ESZ)                                      \
void HELPER(NAME)(void *vd, void *v0, void *vs2,                   \
                  CPURISCVState *env, uint32_t desc)               \
{                                                                  \
    uint32_t vm = vext_vm(desc);                                   \
    uint32_t vl = env->vl;                                         \
    uint32_t total_elems = vext_get_total_elems(env, desc, ESZ);   \
    uint32_t vta = vext_vta(desc);                                 \
    uint32_t vma = vext_vma(desc);                                 \
    uint32_t idx;                                                  \
                                                                   \
    for (idx = env->vstart; idx < vl; idx++) {                     \
        if (vm || vext_elem_mask(v0, idx)) {                       \
            do_##NAME(vd, vs2, idx);                               \
        } else {                                                   \
            /* masked-off element: set to 1s (subject to vma) */   \
            vext_set_elems_1s(vd, vma, idx * ESZ,                  \
                              (idx + 1) * ESZ);                    \
        }                                                          \
    }                                                              \
    env->vstart = 0;                                               \
    /* tail elements: set to 1s (subject to vta) */                \
    vext_set_elems_1s(vd, vta, vl * ESZ,                           \
                      total_elems * ESZ);                          \
}
4344
4345 target_ulong fclass_h(uint64_t frs1)
4346 {
4347 float16 f = frs1;
4348 bool sign = float16_is_neg(f);
4349
4350 if (float16_is_infinity(f)) {
4351 return sign ? 1 << 0 : 1 << 7;
4352 } else if (float16_is_zero(f)) {
4353 return sign ? 1 << 3 : 1 << 4;
4354 } else if (float16_is_zero_or_denormal(f)) {
4355 return sign ? 1 << 2 : 1 << 5;
4356 } else if (float16_is_any_nan(f)) {
4357 float_status s = { }; /* for snan_bit_is_one */
4358 return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4359 } else {
4360 return sign ? 1 << 1 : 1 << 6;
4361 }
4362 }
4363
4364 target_ulong fclass_s(uint64_t frs1)
4365 {
4366 float32 f = frs1;
4367 bool sign = float32_is_neg(f);
4368
4369 if (float32_is_infinity(f)) {
4370 return sign ? 1 << 0 : 1 << 7;
4371 } else if (float32_is_zero(f)) {
4372 return sign ? 1 << 3 : 1 << 4;
4373 } else if (float32_is_zero_or_denormal(f)) {
4374 return sign ? 1 << 2 : 1 << 5;
4375 } else if (float32_is_any_nan(f)) {
4376 float_status s = { }; /* for snan_bit_is_one */
4377 return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4378 } else {
4379 return sign ? 1 << 1 : 1 << 6;
4380 }
4381 }
4382
4383 target_ulong fclass_d(uint64_t frs1)
4384 {
4385 float64 f = frs1;
4386 bool sign = float64_is_neg(f);
4387
4388 if (float64_is_infinity(f)) {
4389 return sign ? 1 << 0 : 1 << 7;
4390 } else if (float64_is_zero(f)) {
4391 return sign ? 1 << 3 : 1 << 4;
4392 } else if (float64_is_zero_or_denormal(f)) {
4393 return sign ? 1 << 2 : 1 << 5;
4394 } else if (float64_is_any_nan(f)) {
4395 float_status s = { }; /* for snan_bit_is_one */
4396 return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4397 } else {
4398 return sign ? 1 << 1 : 1 << 6;
4399 }
4400 }
4401
4402 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4403 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4404 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4405 GEN_VEXT_V(vfclass_v_h, 2)
4406 GEN_VEXT_V(vfclass_v_w, 4)
4407 GEN_VEXT_V(vfclass_v_d, 8)
4408
4409 /* Vector Floating-Point Merge Instruction */
4410
/*
 * Define HELPER(NAME) for the floating-point merge.
 *
 * For each element in [vstart, vl): vd[i] receives the scalar s1 when vm is
 * set or the element's v0 mask bit is set, and vs2[i] otherwise.  vstart is
 * then cleared and the span [vl, total_elems), scaled by sizeof(ETYPE), is
 * handed to vext_set_elems_1s() together with vta.
 */
#define GEN_VFMERGE_VF(NAME, ETYPE, H)                              \
void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2,       \
                  CPURISCVState *env, uint32_t desc)                \
{                                                                   \
    uint32_t vm = vext_vm(desc);                                    \
    uint32_t vl = env->vl;                                          \
    uint32_t esz = sizeof(ETYPE);                                   \
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);    \
    uint32_t vta = vext_vta(desc);                                  \
    uint32_t idx;                                                   \
                                                                    \
    for (idx = env->vstart; idx < vl; idx++) {                      \
        ETYPE s2 = *((ETYPE *)vs2 + H(idx));                        \
        ETYPE *dst = (ETYPE *)vd + H(idx);                          \
                                                                    \
        *dst = (vm || vext_elem_mask(v0, idx)) ? s1 : s2;           \
    }                                                               \
    env->vstart = 0;                                                \
    /* tail elements: set to 1s (subject to vta) */                 \
    vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);        \
}
4432
4433 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4434 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4435 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
4436
4437 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
4438 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4439 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4440 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4441 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4442 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
4443 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
4444 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)
4445
4446 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4447 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4448 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4449 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4450 GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
4451 GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
4452 GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)
4453
4454 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4455 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4456 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4457 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4458 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
4459 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
4460 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)
4461
4462 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4463 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4464 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4465 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4466 GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
4467 GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
4468 GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4469
4470 /* Widening Floating-Point/Integer Type-Convert Instructions */
/*
 * Type triples (TD, T2, TX2) for widening unary operations: the destination
 * type is twice as wide as the source type.
 */
#define WOP_UU_B    uint16_t, uint8_t,  uint8_t
#define WOP_UU_H    uint32_t, uint16_t, uint16_t
#define WOP_UU_W    uint64_t, uint32_t, uint32_t
4475 /* vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer.*/
4476 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4477 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4478 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
4479 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)
4480
4481 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4482 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4483 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4484 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
4485 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)
4486
4487 /* vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float */
4488 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4489 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4490 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4491 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
4492 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
4493 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)
4494
4495 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4496 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4497 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4498 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4499 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
4500 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
4501 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4502
4503 /*
4504 * vfwcvt.f.f.v vd, vs2, vm
4505 * Convert single-width float to double-width float.
4506 */
4507 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4508 {
4509 return float16_to_float32(a, true, s);
4510 }
4511
4512 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4513 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4514 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
4515 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
4516
4517 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4518 /* (TD, T2, TX2) */
4519 #define NOP_UU_B uint8_t, uint16_t, uint32_t
4520 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4521 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4522 /* vfncvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4523 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4524 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4525 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4526 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
4527 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
4528 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)
4529
4530 /* vfncvt.x.f.v vd, vs2, vm # Convert double-width float to signed integer. */
4531 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4532 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4533 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4534 GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
4535 GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
4536 GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)
4537
4538 /* vfncvt.f.xu.v vd, vs2, vm # Convert double-width unsigned integer to float */
4539 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4540 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4541 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
4542 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)
4543
4544 /* vfncvt.f.x.v vd, vs2, vm # Convert double-width integer to float. */
4545 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4546 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4547 GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
4548 GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4549
4550 /* vfncvt.f.f.v vd, vs2, vm # Convert double float to single-width float. */
4551 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4552 {
4553 return float32_to_float16(a, true, s);
4554 }
4555
4556 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4557 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4558 GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
4559 GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)
4560
4561 /*
4562 *** Vector Reduction Operations
4563 */
/* Vector Single-Width Integer Reduction Instructions */
/*
 * Generate an integer reduction helper.
 *
 * The accumulator starts as element 0 of vs1. Every element of vs2 in
 * [vstart, vl) that is active (vm set, or its bit set in v0) is cast to TD
 * and folded into the accumulator with OP. The result is stored in
 * element 0 of vd and vstart is cleared.
 */
#define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP)                      \
void HELPER(NAME)(void *vd, void *v0, void *vs1,                      \
                  void *vs2, CPURISCVState *env, uint32_t desc)       \
{                                                                     \
    uint32_t vm = vext_vm(desc);                                      \
    uint32_t vl = env->vl;                                            \
    uint32_t esz = sizeof(TD);                                        \
    uint32_t vlenb = simd_maxsz(desc);                                \
    uint32_t vta = vext_vta(desc);                                    \
    uint32_t idx;                                                     \
    TD acc = ((TD *)vs1)[HD(0)];                                      \
                                                                      \
    for (idx = env->vstart; idx < vl; idx++) {                        \
        if (vm || vext_elem_mask(v0, idx)) {                          \
            TS2 elem = ((TS2 *)vs2)[HS2(idx)];                        \
            acc = OP(acc, (TD)elem);                                  \
        }                                                             \
    }                                                                 \
    ((TD *)vd)[HD(0)] = acc;                                          \
    env->vstart = 0;                                                  \
    /* set tail elements to 1s */                                     \
    vext_set_elems_1s(vd, vta, esz, vlenb);                           \
}
4589
4590 /* vd[0] = sum(vs1[0], vs2[*]) */
4591 GEN_VEXT_RED(vredsum_vs_b, int8_t, int8_t, H1, H1, DO_ADD)
4592 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4593 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4594 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
4595
4596 /* vd[0] = maxu(vs1[0], vs2[*]) */
4597 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t, uint8_t, H1, H1, DO_MAX)
4598 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4599 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4600 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4601
4602 /* vd[0] = max(vs1[0], vs2[*]) */
4603 GEN_VEXT_RED(vredmax_vs_b, int8_t, int8_t, H1, H1, DO_MAX)
4604 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4605 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4606 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4607
4608 /* vd[0] = minu(vs1[0], vs2[*]) */
4609 GEN_VEXT_RED(vredminu_vs_b, uint8_t, uint8_t, H1, H1, DO_MIN)
4610 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4611 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4612 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4613
4614 /* vd[0] = min(vs1[0], vs2[*]) */
4615 GEN_VEXT_RED(vredmin_vs_b, int8_t, int8_t, H1, H1, DO_MIN)
4616 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4617 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4618 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4619
4620 /* vd[0] = and(vs1[0], vs2[*]) */
4621 GEN_VEXT_RED(vredand_vs_b, int8_t, int8_t, H1, H1, DO_AND)
4622 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4623 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4624 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4625
4626 /* vd[0] = or(vs1[0], vs2[*]) */
4627 GEN_VEXT_RED(vredor_vs_b, int8_t, int8_t, H1, H1, DO_OR)
4628 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4629 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4630 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4631
4632 /* vd[0] = xor(vs1[0], vs2[*]) */
4633 GEN_VEXT_RED(vredxor_vs_b, int8_t, int8_t, H1, H1, DO_XOR)
4634 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4635 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4636 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4637
4638 /* Vector Widening Integer Reduction Instructions */
4639 /* signed sum reduction into double-width accumulator */
4640 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t, H2, H1, DO_ADD)
4641 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4642 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4643
4644 /* Unsigned sum reduction into double-width accumulator */
4645 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t, H2, H1, DO_ADD)
4646 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4647 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4648
/* Vector Single-Width Floating-Point Reduction Instructions */
/*
 * Generate a floating-point reduction helper.
 *
 * Same shape as the integer reduction: the accumulator is seeded from
 * element 0 of vs1, each active element of vs2 in [vstart, vl) is folded
 * in by calling OP with &env->fp_status, and the result lands in element 0
 * of vd.
 */
#define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP)                     \
void HELPER(NAME)(void *vd, void *v0, void *vs1,                      \
                  void *vs2, CPURISCVState *env,                      \
                  uint32_t desc)                                      \
{                                                                     \
    uint32_t vm = vext_vm(desc);                                      \
    uint32_t vl = env->vl;                                            \
    uint32_t esz = sizeof(TD);                                        \
    uint32_t vlenb = simd_maxsz(desc);                                \
    uint32_t vta = vext_vta(desc);                                    \
    uint32_t idx;                                                     \
    TD acc = ((TD *)vs1)[HD(0)];                                      \
                                                                      \
    for (idx = env->vstart; idx < vl; idx++) {                        \
        if (vm || vext_elem_mask(v0, idx)) {                          \
            TS2 elem = ((TS2 *)vs2)[HS2(idx)];                        \
            acc = OP(acc, (TD)elem, &env->fp_status);                 \
        }                                                             \
    }                                                                 \
    ((TD *)vd)[HD(0)] = acc;                                          \
    env->vstart = 0;                                                  \
    /* set tail elements to 1s */                                     \
    vext_set_elems_1s(vd, vta, esz, vlenb);                           \
}
4675
4676 /* Unordered sum */
4677 GEN_VEXT_FRED(vfredusum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4678 GEN_VEXT_FRED(vfredusum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4679 GEN_VEXT_FRED(vfredusum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4680
4681 /* Ordered sum */
4682 GEN_VEXT_FRED(vfredosum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4683 GEN_VEXT_FRED(vfredosum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4684 GEN_VEXT_FRED(vfredosum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4685
4686 /* Maximum value */
4687 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2, float16_maximum_number)
4688 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4, float32_maximum_number)
4689 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8, float64_maximum_number)
4690
4691 /* Minimum value */
4692 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2, float16_minimum_number)
4693 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4, float32_minimum_number)
4694 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8, float64_minimum_number)
4695
4696 /* Vector Widening Floating-Point Add Instructions */
4697 static uint32_t fwadd16(uint32_t a, uint16_t b, float_status *s)
4698 {
4699 return float32_add(a, float16_to_float32(b, true, s), s);
4700 }
4701
4702 static uint64_t fwadd32(uint64_t a, uint32_t b, float_status *s)
4703 {
4704 return float64_add(a, float32_to_float64(b, s), s);
4705 }
4706
4707 /* Vector Widening Floating-Point Reduction Instructions */
4708 /* Ordered/unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
4709 GEN_VEXT_FRED(vfwredusum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4710 GEN_VEXT_FRED(vfwredusum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4711 GEN_VEXT_FRED(vfwredosum_vs_h, uint32_t, uint16_t, H4, H2, fwadd16)
4712 GEN_VEXT_FRED(vfwredosum_vs_w, uint64_t, uint32_t, H8, H4, fwadd32)
4713
4714 /*
4715 *** Vector Mask Operations
4716 */
/* Vector Mask-Register Logical Instructions */
/*
 * Generate a mask-register logical helper: for each bit index in
 * [vstart, vl), vd[i] = OP(vs2[i], vs1[i]). v0 is accepted but not read.
 */
#define GEN_VEXT_MASK_VV(NAME, OP)                                    \
void HELPER(NAME)(void *vd, void *v0, void *vs1,                      \
                  void *vs2, CPURISCVState *env,                      \
                  uint32_t desc)                                      \
{                                                                     \
    uint32_t vl = env->vl;                                            \
    uint32_t total_elems = riscv_cpu_cfg(env)->vlen;                  \
    uint32_t vta_all_1s = vext_vta_all_1s(desc);                      \
    uint32_t bit;                                                     \
                                                                      \
    for (bit = env->vstart; bit < vl; bit++) {                        \
        int rhs = vext_elem_mask(vs1, bit);                           \
        int lhs = vext_elem_mask(vs2, bit);                           \
                                                                      \
        vext_set_elem_mask(vd, bit, OP(lhs, rhs));                    \
    }                                                                 \
    env->vstart = 0;                                                  \
    /*                                                                \
     * Mask destination registers are always tail-agnostic;           \
     * the tail is filled with 1s only when vta_all_1s is set.        \
     */                                                               \
    for (; vta_all_1s && bit < total_elems; bit++) {                  \
        vext_set_elem_mask(vd, bit, 1);                               \
    }                                                                 \
}
4745
/*
 * Negated/complemented logical operators for the mask instructions.
 * Logical NOT ('!') is used, so each result is 0 or 1. Every argument is
 * parenthesized so that compound expressions (e.g. "a | b") are treated as
 * a single operand instead of being re-associated by operator precedence.
 */
#define DO_NAND(N, M)   (!((N) & (M)))
#define DO_ANDNOT(N, M) ((N) & !(M))
#define DO_NOR(N, M)    (!((N) | (M)))
#define DO_ORNOT(N, M)  ((N) | !(M))
#define DO_XNOR(N, M)   (!((N) ^ (M)))
4751
4752 GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4753 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4754 GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4755 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4756 GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4757 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4758 GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4759 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4760
4761 /* Vector count population in mask vcpop */
4762 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4763 uint32_t desc)
4764 {
4765 target_ulong cnt = 0;
4766 uint32_t vm = vext_vm(desc);
4767 uint32_t vl = env->vl;
4768 int i;
4769
4770 for (i = env->vstart; i < vl; i++) {
4771 if (vm || vext_elem_mask(v0, i)) {
4772 if (vext_elem_mask(vs2, i)) {
4773 cnt++;
4774 }
4775 }
4776 }
4777 env->vstart = 0;
4778 return cnt;
4779 }
4780
4781 /* vfirst find-first-set mask bit*/
4782 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4783 uint32_t desc)
4784 {
4785 uint32_t vm = vext_vm(desc);
4786 uint32_t vl = env->vl;
4787 int i;
4788
4789 for (i = env->vstart; i < vl; i++) {
4790 if (vm || vext_elem_mask(v0, i)) {
4791 if (vext_elem_mask(vs2, i)) {
4792 return i;
4793 }
4794 }
4795 }
4796 env->vstart = 0;
4797 return -1LL;
4798 }
4799
/* Selects which of the set-before/including/only-first variants vmsetm runs. */
enum set_mask_type {
    ONLY_FIRST    = 1,  /* vmsof.m */
    INCLUDE_FIRST = 2,  /* vmsif.m */
    BEFORE_FIRST  = 3,  /* vmsbf.m */
};
4805
4806 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4807 uint32_t desc, enum set_mask_type type)
4808 {
4809 uint32_t vm = vext_vm(desc);
4810 uint32_t vl = env->vl;
4811 uint32_t total_elems = riscv_cpu_cfg(env)->vlen;
4812 uint32_t vta_all_1s = vext_vta_all_1s(desc);
4813 uint32_t vma = vext_vma(desc);
4814 int i;
4815 bool first_mask_bit = false;
4816
4817 for (i = env->vstart; i < vl; i++) {
4818 if (!vm && !vext_elem_mask(v0, i)) {
4819 /* set masked-off elements to 1s */
4820 if (vma) {
4821 vext_set_elem_mask(vd, i, 1);
4822 }
4823 continue;
4824 }
4825 /* write a zero to all following active elements */
4826 if (first_mask_bit) {
4827 vext_set_elem_mask(vd, i, 0);
4828 continue;
4829 }
4830 if (vext_elem_mask(vs2, i)) {
4831 first_mask_bit = true;
4832 if (type == BEFORE_FIRST) {
4833 vext_set_elem_mask(vd, i, 0);
4834 } else {
4835 vext_set_elem_mask(vd, i, 1);
4836 }
4837 } else {
4838 if (type == ONLY_FIRST) {
4839 vext_set_elem_mask(vd, i, 0);
4840 } else {
4841 vext_set_elem_mask(vd, i, 1);
4842 }
4843 }
4844 }
4845 env->vstart = 0;
4846 /* mask destination register are always tail-agnostic */
4847 /* set tail elements to 1s */
4848 if (vta_all_1s) {
4849 for (; i < total_elems; i++) {
4850 vext_set_elem_mask(vd, i, 1);
4851 }
4852 }
4853 }
4854
4855 void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4856 uint32_t desc)
4857 {
4858 vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4859 }
4860
4861 void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4862 uint32_t desc)
4863 {
4864 vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
4865 }
4866
4867 void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4868 uint32_t desc)
4869 {
4870 vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
4871 }
4872
4873 /* Vector Iota Instruction */
4874 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H) \
4875 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env, \
4876 uint32_t desc) \
4877 { \
4878 uint32_t vm = vext_vm(desc); \
4879 uint32_t vl = env->vl; \
4880 uint32_t esz = sizeof(ETYPE); \
4881 uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
4882 uint32_t vta = vext_vta(desc); \
4883 uint32_t vma = vext_vma(desc); \
4884 uint32_t sum = 0; \
4885 int i; \
4886 \
4887 for (i = env->vstart; i < vl; i++) { \
4888 if (!vm && !vext_elem_mask(v0, i)) { \
4889 /* set masked-off elements to 1s */ \
4890 vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz); \
4891 continue; \
4892 } \
4893 *((ETYPE *)vd + H(i)) = sum; \
4894 if (vext_elem_mask(vs2, i)) { \
4895 sum++; \
4896 } \
4897 } \
4898 env->vstart = 0; \
4899 /* set tail elements to 1s */ \
4900 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
4901 }
4902
4903 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t, H1)
4904 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
4905 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
4906 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
4907
4908 /* Vector Element Index Instruction */
4909 #define GEN_VEXT_VID_V(NAME, ETYPE, H) \
4910 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc) \
4911 { \
4912 uint32_t vm = vext_vm(desc); \
4913 uint32_t vl = env->vl; \
4914 uint32_t esz = sizeof(ETYPE); \
4915 uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
4916 uint32_t vta = vext_vta(desc); \
4917 uint32_t vma = vext_vma(desc); \
4918 int i; \
4919 \
4920 for (i = env->vstart; i < vl; i++) { \
4921 if (!vm && !vext_elem_mask(v0, i)) { \
4922 /* set masked-off elements to 1s */ \
4923 vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz); \
4924 continue; \
4925 } \
4926 *((ETYPE *)vd + H(i)) = i; \
4927 } \
4928 env->vstart = 0; \
4929 /* set tail elements to 1s */ \
4930 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
4931 }
4932
4933 GEN_VEXT_VID_V(vid_v_b, uint8_t, H1)
4934 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
4935 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
4936 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
4937
4938 /*
4939 *** Vector Permutation Instructions
4940 */
4941
4942 /* Vector Slide Instructions */
4943 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H) \
4944 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
4945 CPURISCVState *env, uint32_t desc) \
4946 { \
4947 uint32_t vm = vext_vm(desc); \
4948 uint32_t vl = env->vl; \
4949 uint32_t esz = sizeof(ETYPE); \
4950 uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
4951 uint32_t vta = vext_vta(desc); \
4952 uint32_t vma = vext_vma(desc); \
4953 target_ulong offset = s1, i_min, i; \
4954 \
4955 i_min = MAX(env->vstart, offset); \
4956 for (i = i_min; i < vl; i++) { \
4957 if (!vm && !vext_elem_mask(v0, i)) { \
4958 /* set masked-off elements to 1s */ \
4959 vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz); \
4960 continue; \
4961 } \
4962 *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset)); \
4963 } \
4964 /* set tail elements to 1s */ \
4965 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
4966 }
4967
4968 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
4969 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t, H1)
4970 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
4971 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
4972 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
4973
4974 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H) \
4975 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
4976 CPURISCVState *env, uint32_t desc) \
4977 { \
4978 uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE))); \
4979 uint32_t vm = vext_vm(desc); \
4980 uint32_t vl = env->vl; \
4981 uint32_t esz = sizeof(ETYPE); \
4982 uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
4983 uint32_t vta = vext_vta(desc); \
4984 uint32_t vma = vext_vma(desc); \
4985 target_ulong i_max, i; \
4986 \
4987 i_max = MAX(MIN(s1 < vlmax ? vlmax - s1 : 0, vl), env->vstart); \
4988 for (i = env->vstart; i < i_max; ++i) { \
4989 if (!vm && !vext_elem_mask(v0, i)) { \
4990 /* set masked-off elements to 1s */ \
4991 vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz); \
4992 continue; \
4993 } \
4994 *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1)); \
4995 } \
4996 \
4997 for (i = i_max; i < vl; ++i) { \
4998 if (vm || vext_elem_mask(v0, i)) { \
4999 *((ETYPE *)vd + H(i)) = 0; \
5000 } \
5001 } \
5002 \
5003 env->vstart = 0; \
5004 /* set tail elements to 1s */ \
5005 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
5006 }
5007
5008 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
5009 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t, H1)
5010 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
5011 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
5012 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
5013
5014 #define GEN_VEXT_VSLIE1UP(BITWIDTH, H) \
5015 static void vslide1up_##BITWIDTH(void *vd, void *v0, uint64_t s1, \
5016 void *vs2, CPURISCVState *env, uint32_t desc) \
5017 { \
5018 typedef uint##BITWIDTH##_t ETYPE; \
5019 uint32_t vm = vext_vm(desc); \
5020 uint32_t vl = env->vl; \
5021 uint32_t esz = sizeof(ETYPE); \
5022 uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5023 uint32_t vta = vext_vta(desc); \
5024 uint32_t vma = vext_vma(desc); \
5025 uint32_t i; \
5026 \
5027 for (i = env->vstart; i < vl; i++) { \
5028 if (!vm && !vext_elem_mask(v0, i)) { \
5029 /* set masked-off elements to 1s */ \
5030 vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz); \
5031 continue; \
5032 } \
5033 if (i == 0) { \
5034 *((ETYPE *)vd + H(i)) = s1; \
5035 } else { \
5036 *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1)); \
5037 } \
5038 } \
5039 env->vstart = 0; \
5040 /* set tail elements to 1s */ \
5041 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
5042 }
5043
5044 GEN_VEXT_VSLIE1UP(8, H1)
5045 GEN_VEXT_VSLIE1UP(16, H2)
5046 GEN_VEXT_VSLIE1UP(32, H4)
5047 GEN_VEXT_VSLIE1UP(64, H8)
5048
5049 #define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH) \
5050 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5051 CPURISCVState *env, uint32_t desc) \
5052 { \
5053 vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc); \
5054 }
5055
5056 /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
5057 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
5058 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
5059 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
5060 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
5061
5062 #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H) \
5063 static void vslide1down_##BITWIDTH(void *vd, void *v0, uint64_t s1, \
5064 void *vs2, CPURISCVState *env, uint32_t desc) \
5065 { \
5066 typedef uint##BITWIDTH##_t ETYPE; \
5067 uint32_t vm = vext_vm(desc); \
5068 uint32_t vl = env->vl; \
5069 uint32_t esz = sizeof(ETYPE); \
5070 uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5071 uint32_t vta = vext_vta(desc); \
5072 uint32_t vma = vext_vma(desc); \
5073 uint32_t i; \
5074 \
5075 for (i = env->vstart; i < vl; i++) { \
5076 if (!vm && !vext_elem_mask(v0, i)) { \
5077 /* set masked-off elements to 1s */ \
5078 vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz); \
5079 continue; \
5080 } \
5081 if (i == vl - 1) { \
5082 *((ETYPE *)vd + H(i)) = s1; \
5083 } else { \
5084 *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1)); \
5085 } \
5086 } \
5087 env->vstart = 0; \
5088 /* set tail elements to 1s */ \
5089 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
5090 }
5091
5092 GEN_VEXT_VSLIDE1DOWN(8, H1)
5093 GEN_VEXT_VSLIDE1DOWN(16, H2)
5094 GEN_VEXT_VSLIDE1DOWN(32, H4)
5095 GEN_VEXT_VSLIDE1DOWN(64, H8)
5096
5097 #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH) \
5098 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5099 CPURISCVState *env, uint32_t desc) \
5100 { \
5101 vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc); \
5102 }
5103
5104 /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
5105 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
5106 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
5107 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
5108 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
5109
5110 /* Vector Floating-Point Slide Instructions */
5111 #define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH) \
5112 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5113 CPURISCVState *env, uint32_t desc) \
5114 { \
5115 vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc); \
5116 }
5117
5118 /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
5119 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
5120 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
5121 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
5122
5123 #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH) \
5124 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5125 CPURISCVState *env, uint32_t desc) \
5126 { \
5127 vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc); \
5128 }
5129
5130 /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
5131 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
5132 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
5133 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
5134
5135 /* Vector Register Gather Instruction */
5136 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2) \
5137 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \
5138 CPURISCVState *env, uint32_t desc) \
5139 { \
5140 uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2))); \
5141 uint32_t vm = vext_vm(desc); \
5142 uint32_t vl = env->vl; \
5143 uint32_t esz = sizeof(TS2); \
5144 uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5145 uint32_t vta = vext_vta(desc); \
5146 uint32_t vma = vext_vma(desc); \
5147 uint64_t index; \
5148 uint32_t i; \
5149 \
5150 for (i = env->vstart; i < vl; i++) { \
5151 if (!vm && !vext_elem_mask(v0, i)) { \
5152 /* set masked-off elements to 1s */ \
5153 vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz); \
5154 continue; \
5155 } \
5156 index = *((TS1 *)vs1 + HS1(i)); \
5157 if (index >= vlmax) { \
5158 *((TS2 *)vd + HS2(i)) = 0; \
5159 } else { \
5160 *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index)); \
5161 } \
5162 } \
5163 env->vstart = 0; \
5164 /* set tail elements to 1s */ \
5165 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
5166 }
5167
5168 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
5169 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t, uint8_t, H1, H1)
5170 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
5171 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
5172 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
5173
5174 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t, H2, H1)
5175 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5176 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5177 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
5178
5179 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H) \
5180 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5181 CPURISCVState *env, uint32_t desc) \
5182 { \
5183 uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE))); \
5184 uint32_t vm = vext_vm(desc); \
5185 uint32_t vl = env->vl; \
5186 uint32_t esz = sizeof(ETYPE); \
5187 uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5188 uint32_t vta = vext_vta(desc); \
5189 uint32_t vma = vext_vma(desc); \
5190 uint64_t index = s1; \
5191 uint32_t i; \
5192 \
5193 for (i = env->vstart; i < vl; i++) { \
5194 if (!vm && !vext_elem_mask(v0, i)) { \
5195 /* set masked-off elements to 1s */ \
5196 vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz); \
5197 continue; \
5198 } \
5199 if (index >= vlmax) { \
5200 *((ETYPE *)vd + H(i)) = 0; \
5201 } else { \
5202 *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index)); \
5203 } \
5204 } \
5205 env->vstart = 0; \
5206 /* set tail elements to 1s */ \
5207 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
5208 }
5209
5210 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[rs1] */
5211 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t, H1)
5212 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5213 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5214 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
5215
5216 /* Vector Compress Instruction */
5217 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H) \
5218 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \
5219 CPURISCVState *env, uint32_t desc) \
5220 { \
5221 uint32_t vl = env->vl; \
5222 uint32_t esz = sizeof(ETYPE); \
5223 uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5224 uint32_t vta = vext_vta(desc); \
5225 uint32_t num = 0, i; \
5226 \
5227 for (i = env->vstart; i < vl; i++) { \
5228 if (!vext_elem_mask(vs1, i)) { \
5229 continue; \
5230 } \
5231 *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i)); \
5232 num++; \
5233 } \
5234 env->vstart = 0; \
5235 /* set tail elements to 1s */ \
5236 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
5237 }
5238
5239 /* Compress into vd elements of vs2 where vs1 is enabled */
5240 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t, H1)
5241 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5242 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5243 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
5244
5245 /* Vector Whole Register Move */
5246 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5247 {
5248 /* EEW = SEW */
5249 uint32_t maxsz = simd_maxsz(desc);
5250 uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5251 uint32_t startb = env->vstart * sewb;
5252 uint32_t i = startb;
5253
5254 memcpy((uint8_t *)vd + H1(i),
5255 (uint8_t *)vs2 + H1(i),
5256 maxsz - startb);
5257
5258 env->vstart = 0;
5259 }
5260
5261 /* Vector Integer Extension */
5262 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1) \
5263 void HELPER(NAME)(void *vd, void *v0, void *vs2, \
5264 CPURISCVState *env, uint32_t desc) \
5265 { \
5266 uint32_t vl = env->vl; \
5267 uint32_t vm = vext_vm(desc); \
5268 uint32_t esz = sizeof(ETYPE); \
5269 uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5270 uint32_t vta = vext_vta(desc); \
5271 uint32_t vma = vext_vma(desc); \
5272 uint32_t i; \
5273 \
5274 for (i = env->vstart; i < vl; i++) { \
5275 if (!vm && !vext_elem_mask(v0, i)) { \
5276 /* set masked-off elements to 1s */ \
5277 vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz); \
5278 continue; \
5279 } \
5280 *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i)); \
5281 } \
5282 env->vstart = 0; \
5283 /* set tail elements to 1s */ \
5284 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
5285 }
5286
5287 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t, H2, H1)
5288 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5289 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5290 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t, H4, H1)
5291 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5292 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t, H8, H1)
5293
5294 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t, H2, H1)
5295 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5296 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5297 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t, H4, H1)
5298 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5299 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t, H8, H1)