1 /*
2 * RISC-V Vector Extension Helpers for QEMU.
3 *
4 * Copyright (c) 2020 T-Head Semiconductor Co., Ltd. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2 or later, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18
19 #include "qemu/osdep.h"
20 #include "qemu/host-utils.h"
21 #include "qemu/bitops.h"
22 #include "cpu.h"
23 #include "exec/memop.h"
24 #include "exec/exec-all.h"
25 #include "exec/helper-proto.h"
26 #include "fpu/softfloat.h"
27 #include "tcg/tcg-gvec-desc.h"
28 #include "internals.h"
29 #include <math.h>
30
31 target_ulong HELPER(vsetvl)(CPURISCVState *env, target_ulong s1,
32 target_ulong s2)
33 {
34 int vlmax, vl;
35 RISCVCPU *cpu = env_archcpu(env);
36 uint64_t lmul = FIELD_EX64(s2, VTYPE, VLMUL);
37 uint16_t sew = 8 << FIELD_EX64(s2, VTYPE, VSEW);
38 uint8_t ediv = FIELD_EX64(s2, VTYPE, VEDIV);
39 int xlen = riscv_cpu_xlen(env);
40 bool vill = (s2 >> (xlen - 1)) & 0x1;
41 target_ulong reserved = s2 &
42 MAKE_64BIT_MASK(R_VTYPE_RESERVED_SHIFT,
43 xlen - 1 - R_VTYPE_RESERVED_SHIFT);
44
45 if (lmul & 4) {
46 /* Fractional LMUL. */
47 if (lmul == 4 ||
48 cpu->cfg.elen >> (8 - lmul) < sew) {
49 vill = true;
50 }
51 }
52
53 if ((sew > cpu->cfg.elen)
54 || vill
55 || (ediv != 0)
56 || (reserved != 0)) {
57 /* only set vill bit. */
58 env->vill = 1;
59 env->vtype = 0;
60 env->vl = 0;
61 env->vstart = 0;
62 return 0;
63 }
64
65 vlmax = vext_get_vlmax(cpu, s2);
66 if (s1 <= vlmax) {
67 vl = s1;
68 } else {
69 vl = vlmax;
70 }
71 env->vl = vl;
72 env->vtype = s2;
73 env->vstart = 0;
74 env->vill = 0;
75 return vl;
76 }
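/*
 * Worked example (editor's addition, assuming VLEN = 128): for SEW = 32 and
 * LMUL = 2, vext_get_vlmax() yields VLMAX = 8, so a vsetvl request with
 * AVL = 10 sets vl = 8, while AVL = 5 sets vl = 5. An unsupported vtype
 * (e.g. SEW > ELEN or a reserved bit set) only sets vill and clears
 * vtype/vl, as in the code above.
 */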
77
78 /*
79 * Note that vector data is stored in host-endian 64-bit chunks,
80 * so addressing units smaller than that needs a host-endian fixup.
81 */
82 #if HOST_BIG_ENDIAN
83 #define H1(x) ((x) ^ 7)
84 #define H1_2(x) ((x) ^ 6)
85 #define H1_4(x) ((x) ^ 4)
86 #define H2(x) ((x) ^ 3)
87 #define H4(x) ((x) ^ 1)
88 #define H8(x) ((x))
89 #else
90 #define H1(x) (x)
91 #define H1_2(x) (x)
92 #define H1_4(x) (x)
93 #define H2(x) (x)
94 #define H4(x) (x)
95 #define H8(x) (x)
96 #endif
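/*
 * Illustrative example (editor's addition, not part of the original source):
 * on a big-endian host, byte element 0 of a 64-bit chunk lives at host byte
 * offset 7, so H1(0) == 7, H1(1) == 6, ..., while a 16-bit element index is
 * flipped within the chunk as H2(0) == 3, H2(1) == 2, etc. On a
 * little-endian host all H* macros are the identity.
 */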
97
98 static inline uint32_t vext_nf(uint32_t desc)
99 {
100 return FIELD_EX32(simd_data(desc), VDATA, NF);
101 }
102
103 static inline uint32_t vext_vm(uint32_t desc)
104 {
105 return FIELD_EX32(simd_data(desc), VDATA, VM);
106 }
107
108 /*
109 * Encode LMUL to lmul as following:
110 * LMUL vlmul lmul
111 * 1 000 0
112 * 2 001 1
113 * 4 010 2
114 * 8 011 3
115 * - 100 -
116 * 1/8 101 -3
117 * 1/4 110 -2
118 * 1/2 111 -1
119 */
120 static inline int32_t vext_lmul(uint32_t desc)
121 {
122 return sextract32(FIELD_EX32(simd_data(desc), VDATA, LMUL), 0, 3);
123 }
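/*
 * Worked example (editor's addition): sextract32() sign-extends the 3-bit
 * field, so 0b011 yields lmul = 3 (LMUL = 8) and 0b111 yields lmul = -1
 * (LMUL = 1/2), matching the table above.
 */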
124
125 static inline uint32_t vext_vta(uint32_t desc)
126 {
127 return FIELD_EX32(simd_data(desc), VDATA, VTA);
128 }
129
130 static inline uint32_t vext_vma(uint32_t desc)
131 {
132 return FIELD_EX32(simd_data(desc), VDATA, VMA);
133 }
134
135 static inline uint32_t vext_vta_all_1s(uint32_t desc)
136 {
137 return FIELD_EX32(simd_data(desc), VDATA, VTA_ALL_1S);
138 }
139
140 /*
141 * Get the maximum number of elements that can be operated on.
142 *
143 * log2_esz: log2 of element size in bytes.
144 */
145 static inline uint32_t vext_max_elems(uint32_t desc, uint32_t log2_esz)
146 {
147 /*
148 * As simd_desc supports at most 2048 bytes, the maximum vlen is 1024 bits,
149 * so vlen in bytes (vlenb) is encoded as maxsz.
150 */
151 uint32_t vlenb = simd_maxsz(desc);
152
153 /* Return VLMAX */
154 int scale = vext_lmul(desc) - log2_esz;
155 return scale < 0 ? vlenb >> -scale : vlenb << scale;
156 }
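/*
 * Worked example (editor's addition, assuming VLEN = 128, i.e. vlenb = 16):
 * with LMUL = 2 (lmul = 1) and SEW = 16 (log2_esz = 1), scale = 0 and
 * VLMAX = 16; with LMUL = 1/4 (lmul = -2) and SEW = 8 (log2_esz = 0),
 * scale = -2 and VLMAX = vlenb >> 2 = 4. Both agree with
 * VLMAX = LMUL * VLEN / SEW.
 */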
157
158 /*
159 * Get the total number of elements, including prestart, body and tail elements.
160 * Note that when LMUL < 1, the tail includes the elements past VLMAX that
161 * are held in the same vector register.
162 */
163 static inline uint32_t vext_get_total_elems(CPURISCVState *env, uint32_t desc,
164 uint32_t esz)
165 {
166 uint32_t vlenb = simd_maxsz(desc);
167 uint32_t sew = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
168 int8_t emul = ctzl(esz) - ctzl(sew) + vext_lmul(desc) < 0 ? 0 :
169 ctzl(esz) - ctzl(sew) + vext_lmul(desc);
170 return (vlenb << emul) / esz;
171 }
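/*
 * Worked example (editor's addition, assuming vlenb = 16): with SEW = 32
 * (sew = 4), esz = 4 and lmul = 1 (LMUL = 2), emul = 1 and the register
 * group holds (16 << 1) / 4 = 8 elements in total. For a narrower EEW such
 * as esz = 1 with the same vtype, ctzl(esz) - ctzl(sew) + lmul is negative,
 * emul is clamped to 0, and the total spans the whole single register:
 * 16 / 1 = 16 elements.
 */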
172
173 static inline target_ulong adjust_addr(CPURISCVState *env, target_ulong addr)
174 {
175 return (addr & env->cur_pmmask) | env->cur_pmbase;
176 }
177
178 /*
179 * This function checks the watchpoint before the real load operation.
180 *
181 * In softmmu mode, the TLB API probe_access is enough for the watchpoint check.
182 * In user mode, there is currently no watchpoint support.
183 *
184 * It will trigger an exception if there is no mapping in the TLB
185 * and the page table walk can't fill the TLB entry. Then the guest
186 * software can return here after processing the exception, or never return.
187 */
188 static void probe_pages(CPURISCVState *env, target_ulong addr,
189 target_ulong len, uintptr_t ra,
190 MMUAccessType access_type)
191 {
192 target_ulong pagelen = -(addr | TARGET_PAGE_MASK);
193 target_ulong curlen = MIN(pagelen, len);
194
195 probe_access(env, adjust_addr(env, addr), curlen, access_type,
196 cpu_mmu_index(env, false), ra);
197 if (len > curlen) {
198 addr += curlen;
199 curlen = len - curlen;
200 probe_access(env, adjust_addr(env, addr), curlen, access_type,
201 cpu_mmu_index(env, false), ra);
202 }
203 }
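/*
 * Worked example (editor's addition, assuming 4 KiB target pages): for
 * addr = 0x1ff8 and len = 16, pagelen = 8, so the first probe covers the
 * 8 bytes up to the page boundary and the second probe covers the remaining
 * 8 bytes starting at 0x2000 on the following page.
 */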
204
205 /* set agnostic elements to 1s */
206 static void vext_set_elems_1s(void *base, uint32_t is_agnostic, uint32_t cnt,
207 uint32_t tot)
208 {
209 if (is_agnostic == 0) {
210 /* policy undisturbed */
211 return;
212 }
213 if (tot - cnt == 0) {
214 return;
215 }
216 memset(base + cnt, -1, tot - cnt);
217 }
218
219 static inline void vext_set_elem_mask(void *v0, int index,
220 uint8_t value)
221 {
222 int idx = index / 64;
223 int pos = index % 64;
224 uint64_t old = ((uint64_t *)v0)[idx];
225 ((uint64_t *)v0)[idx] = deposit64(old, pos, 1, value);
226 }
227
228 /*
229 * Earlier designs (pre-0.9) had a varying number of bits
230 * per mask value (MLEN). In the 0.9 design, MLEN=1.
231 * (Section 4.5)
232 */
233 static inline int vext_elem_mask(void *v0, int index)
234 {
235 int idx = index / 64;
236 int pos = index % 64;
237 return (((uint64_t *)v0)[idx] >> pos) & 1;
238 }
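/*
 * Editor's note: with MLEN = 1 the mask bit for element i is simply
 * bit (i % 64) of 64-bit word (i / 64) of v0, e.g. element 70 is bit 6
 * of word 1.
 */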
239
240 /* elements operations for load and store */
241 typedef void vext_ldst_elem_fn(CPURISCVState *env, target_ulong addr,
242 uint32_t idx, void *vd, uintptr_t retaddr);
243
244 #define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF) \
245 static void NAME(CPURISCVState *env, abi_ptr addr, \
246 uint32_t idx, void *vd, uintptr_t retaddr)\
247 { \
248 ETYPE *cur = ((ETYPE *)vd + H(idx)); \
249 *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr); \
250 } \
251
252 GEN_VEXT_LD_ELEM(lde_b, int8_t, H1, ldsb)
253 GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
254 GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
255 GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
256
257 #define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF) \
258 static void NAME(CPURISCVState *env, abi_ptr addr, \
259 uint32_t idx, void *vd, uintptr_t retaddr)\
260 { \
261 ETYPE data = *((ETYPE *)vd + H(idx)); \
262 cpu_##STSUF##_data_ra(env, addr, data, retaddr); \
263 }
264
265 GEN_VEXT_ST_ELEM(ste_b, int8_t, H1, stb)
266 GEN_VEXT_ST_ELEM(ste_h, int16_t, H2, stw)
267 GEN_VEXT_ST_ELEM(ste_w, int32_t, H4, stl)
268 GEN_VEXT_ST_ELEM(ste_d, int64_t, H8, stq)
269
270 /*
271 *** stride: access vector element from strided memory
272 */
273 static void
274 vext_ldst_stride(void *vd, void *v0, target_ulong base,
275 target_ulong stride, CPURISCVState *env,
276 uint32_t desc, uint32_t vm,
277 vext_ldst_elem_fn *ldst_elem,
278 uint32_t log2_esz, uintptr_t ra)
279 {
280 uint32_t i, k;
281 uint32_t nf = vext_nf(desc);
282 uint32_t max_elems = vext_max_elems(desc, log2_esz);
283 uint32_t esz = 1 << log2_esz;
284 uint32_t total_elems = vext_get_total_elems(env, desc, esz);
285 uint32_t vta = vext_vta(desc);
286 uint32_t vma = vext_vma(desc);
287
288 for (i = env->vstart; i < env->vl; i++, env->vstart++) {
289 k = 0;
290 while (k < nf) {
291 if (!vm && !vext_elem_mask(v0, i)) {
292 /* set masked-off elements to 1s */
293 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
294 (i + k * max_elems + 1) * esz);
295 k++;
296 continue;
297 }
298 target_ulong addr = base + stride * i + (k << log2_esz);
299 ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
300 k++;
301 }
302 }
303 env->vstart = 0;
304 /* set tail elements to 1s */
305 for (k = 0; k < nf; ++k) {
306 vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
307 (k * max_elems + max_elems) * esz);
308 }
309 if (nf * max_elems % total_elems != 0) {
310 uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
311 uint32_t registers_used =
312 ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
313 vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
314 registers_used * vlenb);
315 }
316 }
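/*
 * Illustrative address computation (editor's addition): for a strided load
 * such as vlse32.v with NF = 1, element i is fetched from base + i * stride;
 * for a segment variant with NF > 1, field k of element i is fetched from
 * base + i * stride + k * esz, which is the
 * "base + stride * i + (k << log2_esz)" expression above.
 */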
317
318 #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN) \
319 void HELPER(NAME)(void *vd, void * v0, target_ulong base, \
320 target_ulong stride, CPURISCVState *env, \
321 uint32_t desc) \
322 { \
323 uint32_t vm = vext_vm(desc); \
324 vext_ldst_stride(vd, v0, base, stride, env, desc, vm, LOAD_FN, \
325 ctzl(sizeof(ETYPE)), GETPC()); \
326 }
327
328 GEN_VEXT_LD_STRIDE(vlse8_v, int8_t, lde_b)
329 GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h)
330 GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w)
331 GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d)
332
333 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN) \
334 void HELPER(NAME)(void *vd, void *v0, target_ulong base, \
335 target_ulong stride, CPURISCVState *env, \
336 uint32_t desc) \
337 { \
338 uint32_t vm = vext_vm(desc); \
339 vext_ldst_stride(vd, v0, base, stride, env, desc, vm, STORE_FN, \
340 ctzl(sizeof(ETYPE)), GETPC()); \
341 }
342
343 GEN_VEXT_ST_STRIDE(vsse8_v, int8_t, ste_b)
344 GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h)
345 GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w)
346 GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
347
348 /*
349 *** unit-stride: access elements stored contiguously in memory
350 */
351
352 /* unmasked unit-stride load and store operations */
353 static void
354 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
355 vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
356 uintptr_t ra)
357 {
358 uint32_t i, k;
359 uint32_t nf = vext_nf(desc);
360 uint32_t max_elems = vext_max_elems(desc, log2_esz);
361 uint32_t esz = 1 << log2_esz;
362 uint32_t total_elems = vext_get_total_elems(env, desc, esz);
363 uint32_t vta = vext_vta(desc);
364
365 /* load bytes from guest memory */
366 for (i = env->vstart; i < evl; i++, env->vstart++) {
367 k = 0;
368 while (k < nf) {
369 target_ulong addr = base + ((i * nf + k) << log2_esz);
370 ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
371 k++;
372 }
373 }
374 env->vstart = 0;
375 /* set tail elements to 1s */
376 for (k = 0; k < nf; ++k) {
377 vext_set_elems_1s(vd, vta, (k * max_elems + evl) * esz,
378 (k * max_elems + max_elems) * esz);
379 }
380 if (nf * max_elems % total_elems != 0) {
381 uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
382 uint32_t registers_used =
383 ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
384 vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
385 registers_used * vlenb);
386 }
387 }
388
389 /*
390 * A masked unit-stride load or store is handled as a special case of the
391 * strided operation, with stride = NF * sizeof(MTYPE).
392 */
393
394 #define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN) \
395 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base, \
396 CPURISCVState *env, uint32_t desc) \
397 { \
398 uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE)); \
399 vext_ldst_stride(vd, v0, base, stride, env, desc, false, LOAD_FN, \
400 ctzl(sizeof(ETYPE)), GETPC()); \
401 } \
402 \
403 void HELPER(NAME)(void *vd, void *v0, target_ulong base, \
404 CPURISCVState *env, uint32_t desc) \
405 { \
406 vext_ldst_us(vd, base, env, desc, LOAD_FN, \
407 ctzl(sizeof(ETYPE)), env->vl, GETPC()); \
408 }
409
410 GEN_VEXT_LD_US(vle8_v, int8_t, lde_b)
411 GEN_VEXT_LD_US(vle16_v, int16_t, lde_h)
412 GEN_VEXT_LD_US(vle32_v, int32_t, lde_w)
413 GEN_VEXT_LD_US(vle64_v, int64_t, lde_d)
414
415 #define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN) \
416 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base, \
417 CPURISCVState *env, uint32_t desc) \
418 { \
419 uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE)); \
420 vext_ldst_stride(vd, v0, base, stride, env, desc, false, STORE_FN, \
421 ctzl(sizeof(ETYPE)), GETPC()); \
422 } \
423 \
424 void HELPER(NAME)(void *vd, void *v0, target_ulong base, \
425 CPURISCVState *env, uint32_t desc) \
426 { \
427 vext_ldst_us(vd, base, env, desc, STORE_FN, \
428 ctzl(sizeof(ETYPE)), env->vl, GETPC()); \
429 }
430
431 GEN_VEXT_ST_US(vse8_v, int8_t, ste_b)
432 GEN_VEXT_ST_US(vse16_v, int16_t, ste_h)
433 GEN_VEXT_ST_US(vse32_v, int32_t, ste_w)
434 GEN_VEXT_ST_US(vse64_v, int64_t, ste_d)
435
436 /*
437 *** unit stride mask load and store, EEW = 1
438 */
439 void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
440 CPURISCVState *env, uint32_t desc)
441 {
442 /* evl = ceil(vl/8) */
443 uint8_t evl = (env->vl + 7) >> 3;
444 vext_ldst_us(vd, base, env, desc, lde_b,
445 0, evl, GETPC());
446 }
447
448 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
449 CPURISCVState *env, uint32_t desc)
450 {
451 /* evl = ceil(vl/8) */
452 uint8_t evl = (env->vl + 7) >> 3;
453 vext_ldst_us(vd, base, env, desc, ste_b,
454 0, evl, GETPC());
455 }
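/*
 * Worked example (editor's addition): vlm.v/vsm.v always use EEW = 8, so for
 * vl = 17 the effective length is evl = (17 + 7) >> 3 = 3 and exactly three
 * mask bytes are transferred.
 */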
456
457 /*
458 *** index: access vector element from indexed memory
459 */
460 typedef target_ulong vext_get_index_addr(target_ulong base,
461 uint32_t idx, void *vs2);
462
463 #define GEN_VEXT_GET_INDEX_ADDR(NAME, ETYPE, H) \
464 static target_ulong NAME(target_ulong base, \
465 uint32_t idx, void *vs2) \
466 { \
467 return (base + *((ETYPE *)vs2 + H(idx))); \
468 }
469
470 GEN_VEXT_GET_INDEX_ADDR(idx_b, uint8_t, H1)
471 GEN_VEXT_GET_INDEX_ADDR(idx_h, uint16_t, H2)
472 GEN_VEXT_GET_INDEX_ADDR(idx_w, uint32_t, H4)
473 GEN_VEXT_GET_INDEX_ADDR(idx_d, uint64_t, H8)
474
475 static inline void
476 vext_ldst_index(void *vd, void *v0, target_ulong base,
477 void *vs2, CPURISCVState *env, uint32_t desc,
478 vext_get_index_addr get_index_addr,
479 vext_ldst_elem_fn *ldst_elem,
480 uint32_t log2_esz, uintptr_t ra)
481 {
482 uint32_t i, k;
483 uint32_t nf = vext_nf(desc);
484 uint32_t vm = vext_vm(desc);
485 uint32_t max_elems = vext_max_elems(desc, log2_esz);
486 uint32_t esz = 1 << log2_esz;
487 uint32_t total_elems = vext_get_total_elems(env, desc, esz);
488 uint32_t vta = vext_vta(desc);
489 uint32_t vma = vext_vma(desc);
490
491 /* load bytes from guest memory */
492 for (i = env->vstart; i < env->vl; i++, env->vstart++) {
493 k = 0;
494 while (k < nf) {
495 if (!vm && !vext_elem_mask(v0, i)) {
496 /* set masked-off elements to 1s */
497 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
498 (i + k * max_elems + 1) * esz);
499 k++;
500 continue;
501 }
502 abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
503 ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
504 k++;
505 }
506 }
507 env->vstart = 0;
508 /* set tail elements to 1s */
509 for (k = 0; k < nf; ++k) {
510 vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
511 (k * max_elems + max_elems) * esz);
512 }
513 if (nf * max_elems % total_elems != 0) {
514 uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
515 uint32_t registers_used =
516 ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
517 vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
518 registers_used * vlenb);
519 }
520 }
521
522 #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN) \
523 void HELPER(NAME)(void *vd, void *v0, target_ulong base, \
524 void *vs2, CPURISCVState *env, uint32_t desc) \
525 { \
526 vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN, \
527 LOAD_FN, ctzl(sizeof(ETYPE)), GETPC()); \
528 }
529
530 GEN_VEXT_LD_INDEX(vlxei8_8_v, int8_t, idx_b, lde_b)
531 GEN_VEXT_LD_INDEX(vlxei8_16_v, int16_t, idx_b, lde_h)
532 GEN_VEXT_LD_INDEX(vlxei8_32_v, int32_t, idx_b, lde_w)
533 GEN_VEXT_LD_INDEX(vlxei8_64_v, int64_t, idx_b, lde_d)
534 GEN_VEXT_LD_INDEX(vlxei16_8_v, int8_t, idx_h, lde_b)
535 GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h)
536 GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w)
537 GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d)
538 GEN_VEXT_LD_INDEX(vlxei32_8_v, int8_t, idx_w, lde_b)
539 GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h)
540 GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w)
541 GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d)
542 GEN_VEXT_LD_INDEX(vlxei64_8_v, int8_t, idx_d, lde_b)
543 GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h)
544 GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w)
545 GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d)
546
547 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN) \
548 void HELPER(NAME)(void *vd, void *v0, target_ulong base, \
549 void *vs2, CPURISCVState *env, uint32_t desc) \
550 { \
551 vext_ldst_index(vd, v0, base, vs2, env, desc, INDEX_FN, \
552 STORE_FN, ctzl(sizeof(ETYPE)), \
553 GETPC()); \
554 }
555
556 GEN_VEXT_ST_INDEX(vsxei8_8_v, int8_t, idx_b, ste_b)
557 GEN_VEXT_ST_INDEX(vsxei8_16_v, int16_t, idx_b, ste_h)
558 GEN_VEXT_ST_INDEX(vsxei8_32_v, int32_t, idx_b, ste_w)
559 GEN_VEXT_ST_INDEX(vsxei8_64_v, int64_t, idx_b, ste_d)
560 GEN_VEXT_ST_INDEX(vsxei16_8_v, int8_t, idx_h, ste_b)
561 GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h)
562 GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w)
563 GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d)
564 GEN_VEXT_ST_INDEX(vsxei32_8_v, int8_t, idx_w, ste_b)
565 GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h)
566 GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w)
567 GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d)
568 GEN_VEXT_ST_INDEX(vsxei64_8_v, int8_t, idx_d, ste_b)
569 GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h)
570 GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w)
571 GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d)
572
573 /*
574 *** unit-stride fault-only-first load instructions
575 */
576 static inline void
577 vext_ldff(void *vd, void *v0, target_ulong base,
578 CPURISCVState *env, uint32_t desc,
579 vext_ldst_elem_fn *ldst_elem,
580 uint32_t log2_esz, uintptr_t ra)
581 {
582 void *host;
583 uint32_t i, k, vl = 0;
584 uint32_t nf = vext_nf(desc);
585 uint32_t vm = vext_vm(desc);
586 uint32_t max_elems = vext_max_elems(desc, log2_esz);
587 uint32_t esz = 1 << log2_esz;
588 uint32_t total_elems = vext_get_total_elems(env, desc, esz);
589 uint32_t vta = vext_vta(desc);
590 uint32_t vma = vext_vma(desc);
591 target_ulong addr, offset, remain;
592
593 /* probe every access */
594 for (i = env->vstart; i < env->vl; i++) {
595 if (!vm && !vext_elem_mask(v0, i)) {
596 continue;
597 }
598 addr = adjust_addr(env, base + i * (nf << log2_esz));
599 if (i == 0) {
600 probe_pages(env, addr, nf << log2_esz, ra, MMU_DATA_LOAD);
601 } else {
602 /* if it triggers an exception, no need to check watchpoint */
603 remain = nf << log2_esz;
604 while (remain > 0) {
605 offset = -(addr | TARGET_PAGE_MASK);
606 host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD,
607 cpu_mmu_index(env, false));
608 if (host) {
609 #ifdef CONFIG_USER_ONLY
610 if (page_check_range(addr, offset, PAGE_READ) < 0) {
611 vl = i;
612 goto ProbeSuccess;
613 }
614 #else
615 probe_pages(env, addr, offset, ra, MMU_DATA_LOAD);
616 #endif
617 } else {
618 vl = i;
619 goto ProbeSuccess;
620 }
621 if (remain <= offset) {
622 break;
623 }
624 remain -= offset;
625 addr = adjust_addr(env, addr + offset);
626 }
627 }
628 }
629 ProbeSuccess:
630 /* load bytes from guest memory */
631 if (vl != 0) {
632 env->vl = vl;
633 }
634 for (i = env->vstart; i < env->vl; i++) {
635 k = 0;
636 while (k < nf) {
637 if (!vm && !vext_elem_mask(v0, i)) {
638 /* set masked-off elements to 1s */
639 vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
640 (i + k * max_elems + 1) * esz);
641 k++;
642 continue;
643 }
644 target_ulong addr = base + ((i * nf + k) << log2_esz);
645 ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
646 k++;
647 }
648 }
649 env->vstart = 0;
650 /* set tail elements to 1s */
651 for (k = 0; k < nf; ++k) {
652 vext_set_elems_1s(vd, vta, (k * max_elems + env->vl) * esz,
653 (k * max_elems + max_elems) * esz);
654 }
655 if (nf * max_elems % total_elems != 0) {
656 uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
657 uint32_t registers_used =
658 ((nf * max_elems) * esz + (vlenb - 1)) / vlenb;
659 vext_set_elems_1s(vd, vta, (nf * max_elems) * esz,
660 registers_used * vlenb);
661 }
662 }
663
664 #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN) \
665 void HELPER(NAME)(void *vd, void *v0, target_ulong base, \
666 CPURISCVState *env, uint32_t desc) \
667 { \
668 vext_ldff(vd, v0, base, env, desc, LOAD_FN, \
669 ctzl(sizeof(ETYPE)), GETPC()); \
670 }
671
672 GEN_VEXT_LDFF(vle8ff_v, int8_t, lde_b)
673 GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h)
674 GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w)
675 GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d)
676
677 #define DO_SWAP(N, M) (M)
678 #define DO_AND(N, M) (N & M)
679 #define DO_XOR(N, M) (N ^ M)
680 #define DO_OR(N, M) (N | M)
681 #define DO_ADD(N, M) (N + M)
682
683 /* Signed min/max */
684 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
685 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
686
687 /* Unsigned min/max */
688 #define DO_MAXU(N, M) DO_MAX((UMTYPE)N, (UMTYPE)M)
689 #define DO_MINU(N, M) DO_MIN((UMTYPE)N, (UMTYPE)M)
690
691 /*
692 *** load and store whole register instructions
693 */
694 static void
695 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
696 vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uintptr_t ra)
697 {
698 uint32_t i, k, off, pos;
699 uint32_t nf = vext_nf(desc);
700 uint32_t vlenb = env_archcpu(env)->cfg.vlen >> 3;
701 uint32_t max_elems = vlenb >> log2_esz;
702
703 k = env->vstart / max_elems;
704 off = env->vstart % max_elems;
705
706 if (off) {
707 /* load/store the rest of the elements of the current segment pointed to by vstart */
708 for (pos = off; pos < max_elems; pos++, env->vstart++) {
709 target_ulong addr = base + ((pos + k * max_elems) << log2_esz);
710 ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd, ra);
711 }
712 k++;
713 }
714
715 /* load/store elements for the rest of the segments */
716 for (; k < nf; k++) {
717 for (i = 0; i < max_elems; i++, env->vstart++) {
718 target_ulong addr = base + ((i + k * max_elems) << log2_esz);
719 ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
720 }
721 }
722
723 env->vstart = 0;
724 }
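/*
 * Worked example (editor's addition, assuming VLEN = 128, vlenb = 16):
 * vl2re32.v has nf = 2 and max_elems = 16 >> 2 = 4, so it transfers
 * 2 * 16 = 32 bytes. If vstart is non-zero, the first loop finishes the
 * partially completed segment before the second loop handles the remaining
 * whole segments.
 */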
725
726 #define GEN_VEXT_LD_WHOLE(NAME, ETYPE, LOAD_FN) \
727 void HELPER(NAME)(void *vd, target_ulong base, \
728 CPURISCVState *env, uint32_t desc) \
729 { \
730 vext_ldst_whole(vd, base, env, desc, LOAD_FN, \
731 ctzl(sizeof(ETYPE)), GETPC()); \
732 }
733
734 GEN_VEXT_LD_WHOLE(vl1re8_v, int8_t, lde_b)
735 GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h)
736 GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w)
737 GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d)
738 GEN_VEXT_LD_WHOLE(vl2re8_v, int8_t, lde_b)
739 GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h)
740 GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w)
741 GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d)
742 GEN_VEXT_LD_WHOLE(vl4re8_v, int8_t, lde_b)
743 GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h)
744 GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w)
745 GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d)
746 GEN_VEXT_LD_WHOLE(vl8re8_v, int8_t, lde_b)
747 GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h)
748 GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w)
749 GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d)
750
751 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN) \
752 void HELPER(NAME)(void *vd, target_ulong base, \
753 CPURISCVState *env, uint32_t desc) \
754 { \
755 vext_ldst_whole(vd, base, env, desc, STORE_FN, \
756 ctzl(sizeof(ETYPE)), GETPC()); \
757 }
758
759 GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b)
760 GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b)
761 GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b)
762 GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b)
763
764 /*
765 *** Vector Integer Arithmetic Instructions
766 */
767
768 /* expand macro args before macro */
769 #define RVVCALL(macro, ...) macro(__VA_ARGS__)
770
771 /* (TD, T1, T2, TX1, TX2) */
772 #define OP_SSS_B int8_t, int8_t, int8_t, int8_t, int8_t
773 #define OP_SSS_H int16_t, int16_t, int16_t, int16_t, int16_t
774 #define OP_SSS_W int32_t, int32_t, int32_t, int32_t, int32_t
775 #define OP_SSS_D int64_t, int64_t, int64_t, int64_t, int64_t
776 #define OP_UUU_B uint8_t, uint8_t, uint8_t, uint8_t, uint8_t
777 #define OP_UUU_H uint16_t, uint16_t, uint16_t, uint16_t, uint16_t
778 #define OP_UUU_W uint32_t, uint32_t, uint32_t, uint32_t, uint32_t
779 #define OP_UUU_D uint64_t, uint64_t, uint64_t, uint64_t, uint64_t
780 #define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
781 #define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
782 #define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
783 #define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
784 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
785 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
786 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
787 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
788 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
789 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
790 #define WOP_SUS_B int16_t, uint8_t, int8_t, uint16_t, int16_t
791 #define WOP_SUS_H int32_t, uint16_t, int16_t, uint32_t, int32_t
792 #define WOP_SUS_W int64_t, uint32_t, int32_t, uint64_t, int64_t
793 #define WOP_SSU_B int16_t, int8_t, uint8_t, int16_t, uint16_t
794 #define WOP_SSU_H int32_t, int16_t, uint16_t, int32_t, uint32_t
795 #define WOP_SSU_W int64_t, int32_t, uint32_t, int64_t, uint64_t
796 #define NOP_SSS_B int8_t, int8_t, int16_t, int8_t, int16_t
797 #define NOP_SSS_H int16_t, int16_t, int32_t, int16_t, int32_t
798 #define NOP_SSS_W int32_t, int32_t, int64_t, int32_t, int64_t
799 #define NOP_UUU_B uint8_t, uint8_t, uint16_t, uint8_t, uint16_t
800 #define NOP_UUU_H uint16_t, uint16_t, uint32_t, uint16_t, uint32_t
801 #define NOP_UUU_W uint32_t, uint32_t, uint64_t, uint32_t, uint64_t
802
803 /* operation of two vector elements */
804 typedef void opivv2_fn(void *vd, void *vs1, void *vs2, int i);
805
806 #define OPIVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP) \
807 static void do_##NAME(void *vd, void *vs1, void *vs2, int i) \
808 { \
809 TX1 s1 = *((T1 *)vs1 + HS1(i)); \
810 TX2 s2 = *((T2 *)vs2 + HS2(i)); \
811 *((TD *)vd + HD(i)) = OP(s2, s1); \
812 }
813 #define DO_SUB(N, M) (N - M)
814 #define DO_RSUB(N, M) (M - N)
815
816 RVVCALL(OPIVV2, vadd_vv_b, OP_SSS_B, H1, H1, H1, DO_ADD)
817 RVVCALL(OPIVV2, vadd_vv_h, OP_SSS_H, H2, H2, H2, DO_ADD)
818 RVVCALL(OPIVV2, vadd_vv_w, OP_SSS_W, H4, H4, H4, DO_ADD)
819 RVVCALL(OPIVV2, vadd_vv_d, OP_SSS_D, H8, H8, H8, DO_ADD)
820 RVVCALL(OPIVV2, vsub_vv_b, OP_SSS_B, H1, H1, H1, DO_SUB)
821 RVVCALL(OPIVV2, vsub_vv_h, OP_SSS_H, H2, H2, H2, DO_SUB)
822 RVVCALL(OPIVV2, vsub_vv_w, OP_SSS_W, H4, H4, H4, DO_SUB)
823 RVVCALL(OPIVV2, vsub_vv_d, OP_SSS_D, H8, H8, H8, DO_SUB)
824
825 static void do_vext_vv(void *vd, void *v0, void *vs1, void *vs2,
826 CPURISCVState *env, uint32_t desc,
827 opivv2_fn *fn, uint32_t esz)
828 {
829 uint32_t vm = vext_vm(desc);
830 uint32_t vl = env->vl;
831 uint32_t total_elems = vext_get_total_elems(env, desc, esz);
832 uint32_t vta = vext_vta(desc);
833 uint32_t vma = vext_vma(desc);
834 uint32_t i;
835
836 for (i = env->vstart; i < vl; i++) {
837 if (!vm && !vext_elem_mask(v0, i)) {
838 /* set masked-off elements to 1s */
839 vext_set_elems_1s(vd, vma, i * esz, (i + 1) * esz);
840 continue;
841 }
842 fn(vd, vs1, vs2, i);
843 }
844 env->vstart = 0;
845 /* set tail elements to 1s */
846 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
847 }
848
849 /* generate the helpers for OPIVV */
850 #define GEN_VEXT_VV(NAME, ESZ) \
851 void HELPER(NAME)(void *vd, void *v0, void *vs1, \
852 void *vs2, CPURISCVState *env, \
853 uint32_t desc) \
854 { \
855 do_vext_vv(vd, v0, vs1, vs2, env, desc, \
856 do_##NAME, ESZ); \
857 }
858
859 GEN_VEXT_VV(vadd_vv_b, 1)
860 GEN_VEXT_VV(vadd_vv_h, 2)
861 GEN_VEXT_VV(vadd_vv_w, 4)
862 GEN_VEXT_VV(vadd_vv_d, 8)
863 GEN_VEXT_VV(vsub_vv_b, 1)
864 GEN_VEXT_VV(vsub_vv_h, 2)
865 GEN_VEXT_VV(vsub_vv_w, 4)
866 GEN_VEXT_VV(vsub_vv_d, 8)
867
868 typedef void opivx2_fn(void *vd, target_long s1, void *vs2, int i);
869
870 /*
871 * (T1)s1 gives the real operand type.
872 * (TX1)(T1)s1 expands the operand type for widening or narrowing operations.
873 */
874 #define OPIVX2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP) \
875 static void do_##NAME(void *vd, target_long s1, void *vs2, int i) \
876 { \
877 TX2 s2 = *((T2 *)vs2 + HS2(i)); \
878 *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1); \
879 }
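/*
 * Worked example (editor's addition): for vwadd_vx_h, WOP_SSS_H gives
 * T1 = int16_t and TX1 = int32_t, so a scalar s1 = 0xffff is first truncated
 * to (int16_t)-1 and then sign-extended to (int32_t)-1 before the widening
 * add, which is exactly the (TX1)(T1)s1 cast chain above.
 */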
880
881 RVVCALL(OPIVX2, vadd_vx_b, OP_SSS_B, H1, H1, DO_ADD)
882 RVVCALL(OPIVX2, vadd_vx_h, OP_SSS_H, H2, H2, DO_ADD)
883 RVVCALL(OPIVX2, vadd_vx_w, OP_SSS_W, H4, H4, DO_ADD)
884 RVVCALL(OPIVX2, vadd_vx_d, OP_SSS_D, H8, H8, DO_ADD)
885 RVVCALL(OPIVX2, vsub_vx_b, OP_SSS_B, H1, H1, DO_SUB)
886 RVVCALL(OPIVX2, vsub_vx_h, OP_SSS_H, H2, H2, DO_SUB)
887 RVVCALL(OPIVX2, vsub_vx_w, OP_SSS_W, H4, H4, DO_SUB)
888 RVVCALL(OPIVX2, vsub_vx_d, OP_SSS_D, H8, H8, DO_SUB)
889 RVVCALL(OPIVX2, vrsub_vx_b, OP_SSS_B, H1, H1, DO_RSUB)
890 RVVCALL(OPIVX2, vrsub_vx_h, OP_SSS_H, H2, H2, DO_RSUB)
891 RVVCALL(OPIVX2, vrsub_vx_w, OP_SSS_W, H4, H4, DO_RSUB)
892 RVVCALL(OPIVX2, vrsub_vx_d, OP_SSS_D, H8, H8, DO_RSUB)
893
894 static void do_vext_vx(void *vd, void *v0, target_long s1, void *vs2,
895 CPURISCVState *env, uint32_t desc,
896 opivx2_fn fn, uint32_t esz)
897 {
898 uint32_t vm = vext_vm(desc);
899 uint32_t vl = env->vl;
900 uint32_t total_elems = vext_get_total_elems(env, desc, esz);
901 uint32_t vta = vext_vta(desc);
902 uint32_t i;
903
904 for (i = env->vstart; i < vl; i++) {
905 if (!vm && !vext_elem_mask(v0, i)) {
906 continue;
907 }
908 fn(vd, s1, vs2, i);
909 }
910 env->vstart = 0;
911 /* set tail elements to 1s */
912 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
913 }
914
915 /* generate the helpers for OPIVX */
916 #define GEN_VEXT_VX(NAME, ESZ) \
917 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, \
918 void *vs2, CPURISCVState *env, \
919 uint32_t desc) \
920 { \
921 do_vext_vx(vd, v0, s1, vs2, env, desc, \
922 do_##NAME, ESZ); \
923 }
924
925 GEN_VEXT_VX(vadd_vx_b, 1)
926 GEN_VEXT_VX(vadd_vx_h, 2)
927 GEN_VEXT_VX(vadd_vx_w, 4)
928 GEN_VEXT_VX(vadd_vx_d, 8)
929 GEN_VEXT_VX(vsub_vx_b, 1)
930 GEN_VEXT_VX(vsub_vx_h, 2)
931 GEN_VEXT_VX(vsub_vx_w, 4)
932 GEN_VEXT_VX(vsub_vx_d, 8)
933 GEN_VEXT_VX(vrsub_vx_b, 1)
934 GEN_VEXT_VX(vrsub_vx_h, 2)
935 GEN_VEXT_VX(vrsub_vx_w, 4)
936 GEN_VEXT_VX(vrsub_vx_d, 8)
937
938 void HELPER(vec_rsubs8)(void *d, void *a, uint64_t b, uint32_t desc)
939 {
940 intptr_t oprsz = simd_oprsz(desc);
941 intptr_t i;
942
943 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
944 *(uint8_t *)(d + i) = (uint8_t)b - *(uint8_t *)(a + i);
945 }
946 }
947
948 void HELPER(vec_rsubs16)(void *d, void *a, uint64_t b, uint32_t desc)
949 {
950 intptr_t oprsz = simd_oprsz(desc);
951 intptr_t i;
952
953 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
954 *(uint16_t *)(d + i) = (uint16_t)b - *(uint16_t *)(a + i);
955 }
956 }
957
958 void HELPER(vec_rsubs32)(void *d, void *a, uint64_t b, uint32_t desc)
959 {
960 intptr_t oprsz = simd_oprsz(desc);
961 intptr_t i;
962
963 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
964 *(uint32_t *)(d + i) = (uint32_t)b - *(uint32_t *)(a + i);
965 }
966 }
967
968 void HELPER(vec_rsubs64)(void *d, void *a, uint64_t b, uint32_t desc)
969 {
970 intptr_t oprsz = simd_oprsz(desc);
971 intptr_t i;
972
973 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
974 *(uint64_t *)(d + i) = b - *(uint64_t *)(a + i);
975 }
976 }
977
978 /* Vector Widening Integer Add/Subtract */
979 #define WOP_UUU_B uint16_t, uint8_t, uint8_t, uint16_t, uint16_t
980 #define WOP_UUU_H uint32_t, uint16_t, uint16_t, uint32_t, uint32_t
981 #define WOP_UUU_W uint64_t, uint32_t, uint32_t, uint64_t, uint64_t
982 #define WOP_SSS_B int16_t, int8_t, int8_t, int16_t, int16_t
983 #define WOP_SSS_H int32_t, int16_t, int16_t, int32_t, int32_t
984 #define WOP_SSS_W int64_t, int32_t, int32_t, int64_t, int64_t
985 #define WOP_WUUU_B uint16_t, uint8_t, uint16_t, uint16_t, uint16_t
986 #define WOP_WUUU_H uint32_t, uint16_t, uint32_t, uint32_t, uint32_t
987 #define WOP_WUUU_W uint64_t, uint32_t, uint64_t, uint64_t, uint64_t
988 #define WOP_WSSS_B int16_t, int8_t, int16_t, int16_t, int16_t
989 #define WOP_WSSS_H int32_t, int16_t, int32_t, int32_t, int32_t
990 #define WOP_WSSS_W int64_t, int32_t, int64_t, int64_t, int64_t
991 RVVCALL(OPIVV2, vwaddu_vv_b, WOP_UUU_B, H2, H1, H1, DO_ADD)
992 RVVCALL(OPIVV2, vwaddu_vv_h, WOP_UUU_H, H4, H2, H2, DO_ADD)
993 RVVCALL(OPIVV2, vwaddu_vv_w, WOP_UUU_W, H8, H4, H4, DO_ADD)
994 RVVCALL(OPIVV2, vwsubu_vv_b, WOP_UUU_B, H2, H1, H1, DO_SUB)
995 RVVCALL(OPIVV2, vwsubu_vv_h, WOP_UUU_H, H4, H2, H2, DO_SUB)
996 RVVCALL(OPIVV2, vwsubu_vv_w, WOP_UUU_W, H8, H4, H4, DO_SUB)
997 RVVCALL(OPIVV2, vwadd_vv_b, WOP_SSS_B, H2, H1, H1, DO_ADD)
998 RVVCALL(OPIVV2, vwadd_vv_h, WOP_SSS_H, H4, H2, H2, DO_ADD)
999 RVVCALL(OPIVV2, vwadd_vv_w, WOP_SSS_W, H8, H4, H4, DO_ADD)
1000 RVVCALL(OPIVV2, vwsub_vv_b, WOP_SSS_B, H2, H1, H1, DO_SUB)
1001 RVVCALL(OPIVV2, vwsub_vv_h, WOP_SSS_H, H4, H2, H2, DO_SUB)
1002 RVVCALL(OPIVV2, vwsub_vv_w, WOP_SSS_W, H8, H4, H4, DO_SUB)
1003 RVVCALL(OPIVV2, vwaddu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_ADD)
1004 RVVCALL(OPIVV2, vwaddu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_ADD)
1005 RVVCALL(OPIVV2, vwaddu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_ADD)
1006 RVVCALL(OPIVV2, vwsubu_wv_b, WOP_WUUU_B, H2, H1, H1, DO_SUB)
1007 RVVCALL(OPIVV2, vwsubu_wv_h, WOP_WUUU_H, H4, H2, H2, DO_SUB)
1008 RVVCALL(OPIVV2, vwsubu_wv_w, WOP_WUUU_W, H8, H4, H4, DO_SUB)
1009 RVVCALL(OPIVV2, vwadd_wv_b, WOP_WSSS_B, H2, H1, H1, DO_ADD)
1010 RVVCALL(OPIVV2, vwadd_wv_h, WOP_WSSS_H, H4, H2, H2, DO_ADD)
1011 RVVCALL(OPIVV2, vwadd_wv_w, WOP_WSSS_W, H8, H4, H4, DO_ADD)
1012 RVVCALL(OPIVV2, vwsub_wv_b, WOP_WSSS_B, H2, H1, H1, DO_SUB)
1013 RVVCALL(OPIVV2, vwsub_wv_h, WOP_WSSS_H, H4, H2, H2, DO_SUB)
1014 RVVCALL(OPIVV2, vwsub_wv_w, WOP_WSSS_W, H8, H4, H4, DO_SUB)
1015 GEN_VEXT_VV(vwaddu_vv_b, 2)
1016 GEN_VEXT_VV(vwaddu_vv_h, 4)
1017 GEN_VEXT_VV(vwaddu_vv_w, 8)
1018 GEN_VEXT_VV(vwsubu_vv_b, 2)
1019 GEN_VEXT_VV(vwsubu_vv_h, 4)
1020 GEN_VEXT_VV(vwsubu_vv_w, 8)
1021 GEN_VEXT_VV(vwadd_vv_b, 2)
1022 GEN_VEXT_VV(vwadd_vv_h, 4)
1023 GEN_VEXT_VV(vwadd_vv_w, 8)
1024 GEN_VEXT_VV(vwsub_vv_b, 2)
1025 GEN_VEXT_VV(vwsub_vv_h, 4)
1026 GEN_VEXT_VV(vwsub_vv_w, 8)
1027 GEN_VEXT_VV(vwaddu_wv_b, 2)
1028 GEN_VEXT_VV(vwaddu_wv_h, 4)
1029 GEN_VEXT_VV(vwaddu_wv_w, 8)
1030 GEN_VEXT_VV(vwsubu_wv_b, 2)
1031 GEN_VEXT_VV(vwsubu_wv_h, 4)
1032 GEN_VEXT_VV(vwsubu_wv_w, 8)
1033 GEN_VEXT_VV(vwadd_wv_b, 2)
1034 GEN_VEXT_VV(vwadd_wv_h, 4)
1035 GEN_VEXT_VV(vwadd_wv_w, 8)
1036 GEN_VEXT_VV(vwsub_wv_b, 2)
1037 GEN_VEXT_VV(vwsub_wv_h, 4)
1038 GEN_VEXT_VV(vwsub_wv_w, 8)
1039
1040 RVVCALL(OPIVX2, vwaddu_vx_b, WOP_UUU_B, H2, H1, DO_ADD)
1041 RVVCALL(OPIVX2, vwaddu_vx_h, WOP_UUU_H, H4, H2, DO_ADD)
1042 RVVCALL(OPIVX2, vwaddu_vx_w, WOP_UUU_W, H8, H4, DO_ADD)
1043 RVVCALL(OPIVX2, vwsubu_vx_b, WOP_UUU_B, H2, H1, DO_SUB)
1044 RVVCALL(OPIVX2, vwsubu_vx_h, WOP_UUU_H, H4, H2, DO_SUB)
1045 RVVCALL(OPIVX2, vwsubu_vx_w, WOP_UUU_W, H8, H4, DO_SUB)
1046 RVVCALL(OPIVX2, vwadd_vx_b, WOP_SSS_B, H2, H1, DO_ADD)
1047 RVVCALL(OPIVX2, vwadd_vx_h, WOP_SSS_H, H4, H2, DO_ADD)
1048 RVVCALL(OPIVX2, vwadd_vx_w, WOP_SSS_W, H8, H4, DO_ADD)
1049 RVVCALL(OPIVX2, vwsub_vx_b, WOP_SSS_B, H2, H1, DO_SUB)
1050 RVVCALL(OPIVX2, vwsub_vx_h, WOP_SSS_H, H4, H2, DO_SUB)
1051 RVVCALL(OPIVX2, vwsub_vx_w, WOP_SSS_W, H8, H4, DO_SUB)
1052 RVVCALL(OPIVX2, vwaddu_wx_b, WOP_WUUU_B, H2, H1, DO_ADD)
1053 RVVCALL(OPIVX2, vwaddu_wx_h, WOP_WUUU_H, H4, H2, DO_ADD)
1054 RVVCALL(OPIVX2, vwaddu_wx_w, WOP_WUUU_W, H8, H4, DO_ADD)
1055 RVVCALL(OPIVX2, vwsubu_wx_b, WOP_WUUU_B, H2, H1, DO_SUB)
1056 RVVCALL(OPIVX2, vwsubu_wx_h, WOP_WUUU_H, H4, H2, DO_SUB)
1057 RVVCALL(OPIVX2, vwsubu_wx_w, WOP_WUUU_W, H8, H4, DO_SUB)
1058 RVVCALL(OPIVX2, vwadd_wx_b, WOP_WSSS_B, H2, H1, DO_ADD)
1059 RVVCALL(OPIVX2, vwadd_wx_h, WOP_WSSS_H, H4, H2, DO_ADD)
1060 RVVCALL(OPIVX2, vwadd_wx_w, WOP_WSSS_W, H8, H4, DO_ADD)
1061 RVVCALL(OPIVX2, vwsub_wx_b, WOP_WSSS_B, H2, H1, DO_SUB)
1062 RVVCALL(OPIVX2, vwsub_wx_h, WOP_WSSS_H, H4, H2, DO_SUB)
1063 RVVCALL(OPIVX2, vwsub_wx_w, WOP_WSSS_W, H8, H4, DO_SUB)
1064 GEN_VEXT_VX(vwaddu_vx_b, 2)
1065 GEN_VEXT_VX(vwaddu_vx_h, 4)
1066 GEN_VEXT_VX(vwaddu_vx_w, 8)
1067 GEN_VEXT_VX(vwsubu_vx_b, 2)
1068 GEN_VEXT_VX(vwsubu_vx_h, 4)
1069 GEN_VEXT_VX(vwsubu_vx_w, 8)
1070 GEN_VEXT_VX(vwadd_vx_b, 2)
1071 GEN_VEXT_VX(vwadd_vx_h, 4)
1072 GEN_VEXT_VX(vwadd_vx_w, 8)
1073 GEN_VEXT_VX(vwsub_vx_b, 2)
1074 GEN_VEXT_VX(vwsub_vx_h, 4)
1075 GEN_VEXT_VX(vwsub_vx_w, 8)
1076 GEN_VEXT_VX(vwaddu_wx_b, 2)
1077 GEN_VEXT_VX(vwaddu_wx_h, 4)
1078 GEN_VEXT_VX(vwaddu_wx_w, 8)
1079 GEN_VEXT_VX(vwsubu_wx_b, 2)
1080 GEN_VEXT_VX(vwsubu_wx_h, 4)
1081 GEN_VEXT_VX(vwsubu_wx_w, 8)
1082 GEN_VEXT_VX(vwadd_wx_b, 2)
1083 GEN_VEXT_VX(vwadd_wx_h, 4)
1084 GEN_VEXT_VX(vwadd_wx_w, 8)
1085 GEN_VEXT_VX(vwsub_wx_b, 2)
1086 GEN_VEXT_VX(vwsub_wx_h, 4)
1087 GEN_VEXT_VX(vwsub_wx_w, 8)
1088
1089 /* Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions */
1090 #define DO_VADC(N, M, C) (N + M + C)
1091 #define DO_VSBC(N, M, C) (N - M - C)
1092
1093 #define GEN_VEXT_VADC_VVM(NAME, ETYPE, H, DO_OP) \
1094 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \
1095 CPURISCVState *env, uint32_t desc) \
1096 { \
1097 uint32_t vl = env->vl; \
1098 uint32_t esz = sizeof(ETYPE); \
1099 uint32_t total_elems = \
1100 vext_get_total_elems(env, desc, esz); \
1101 uint32_t vta = vext_vta(desc); \
1102 uint32_t i; \
1103 \
1104 for (i = env->vstart; i < vl; i++) { \
1105 ETYPE s1 = *((ETYPE *)vs1 + H(i)); \
1106 ETYPE s2 = *((ETYPE *)vs2 + H(i)); \
1107 ETYPE carry = vext_elem_mask(v0, i); \
1108 \
1109 *((ETYPE *)vd + H(i)) = DO_OP(s2, s1, carry); \
1110 } \
1111 env->vstart = 0; \
1112 /* set tail elements to 1s */ \
1113 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
1114 }
1115
1116 GEN_VEXT_VADC_VVM(vadc_vvm_b, uint8_t, H1, DO_VADC)
1117 GEN_VEXT_VADC_VVM(vadc_vvm_h, uint16_t, H2, DO_VADC)
1118 GEN_VEXT_VADC_VVM(vadc_vvm_w, uint32_t, H4, DO_VADC)
1119 GEN_VEXT_VADC_VVM(vadc_vvm_d, uint64_t, H8, DO_VADC)
1120
1121 GEN_VEXT_VADC_VVM(vsbc_vvm_b, uint8_t, H1, DO_VSBC)
1122 GEN_VEXT_VADC_VVM(vsbc_vvm_h, uint16_t, H2, DO_VSBC)
1123 GEN_VEXT_VADC_VVM(vsbc_vvm_w, uint32_t, H4, DO_VSBC)
1124 GEN_VEXT_VADC_VVM(vsbc_vvm_d, uint64_t, H8, DO_VSBC)
1125
1126 #define GEN_VEXT_VADC_VXM(NAME, ETYPE, H, DO_OP) \
1127 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
1128 CPURISCVState *env, uint32_t desc) \
1129 { \
1130 uint32_t vl = env->vl; \
1131 uint32_t esz = sizeof(ETYPE); \
1132 uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
1133 uint32_t vta = vext_vta(desc); \
1134 uint32_t i; \
1135 \
1136 for (i = env->vstart; i < vl; i++) { \
1137 ETYPE s2 = *((ETYPE *)vs2 + H(i)); \
1138 ETYPE carry = vext_elem_mask(v0, i); \
1139 \
1140 *((ETYPE *)vd + H(i)) = DO_OP(s2, (ETYPE)(target_long)s1, carry);\
1141 } \
1142 env->vstart = 0; \
1143 /* set tail elements to 1s */ \
1144 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
1145 }
1146
1147 GEN_VEXT_VADC_VXM(vadc_vxm_b, uint8_t, H1, DO_VADC)
1148 GEN_VEXT_VADC_VXM(vadc_vxm_h, uint16_t, H2, DO_VADC)
1149 GEN_VEXT_VADC_VXM(vadc_vxm_w, uint32_t, H4, DO_VADC)
1150 GEN_VEXT_VADC_VXM(vadc_vxm_d, uint64_t, H8, DO_VADC)
1151
1152 GEN_VEXT_VADC_VXM(vsbc_vxm_b, uint8_t, H1, DO_VSBC)
1153 GEN_VEXT_VADC_VXM(vsbc_vxm_h, uint16_t, H2, DO_VSBC)
1154 GEN_VEXT_VADC_VXM(vsbc_vxm_w, uint32_t, H4, DO_VSBC)
1155 GEN_VEXT_VADC_VXM(vsbc_vxm_d, uint64_t, H8, DO_VSBC)
1156
1157 #define DO_MADC(N, M, C) (C ? (__typeof(N))(N + M + 1) <= N : \
1158 (__typeof(N))(N + M) < N)
1159 #define DO_MSBC(N, M, C) (C ? N <= M : N < M)
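/*
 * Worked example (editor's addition) for the carry/borrow-out macros at
 * 8 bits: DO_MADC(200, 100, 0) evaluates (uint8_t)(200 + 100) = 44 < 200,
 * so the carry out is 1; DO_MADC(255, 0, 1) evaluates
 * (uint8_t)(255 + 0 + 1) = 0 <= 255, again 1. DO_MSBC reports a borrow
 * when N < M (or N <= M if a borrow comes in).
 */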
1160
1161 #define GEN_VEXT_VMADC_VVM(NAME, ETYPE, H, DO_OP) \
1162 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \
1163 CPURISCVState *env, uint32_t desc) \
1164 { \
1165 uint32_t vl = env->vl; \
1166 uint32_t vm = vext_vm(desc); \
1167 uint32_t total_elems = env_archcpu(env)->cfg.vlen; \
1168 uint32_t vta_all_1s = vext_vta_all_1s(desc); \
1169 uint32_t i; \
1170 \
1171 for (i = env->vstart; i < vl; i++) { \
1172 ETYPE s1 = *((ETYPE *)vs1 + H(i)); \
1173 ETYPE s2 = *((ETYPE *)vs2 + H(i)); \
1174 ETYPE carry = !vm && vext_elem_mask(v0, i); \
1175 vext_set_elem_mask(vd, i, DO_OP(s2, s1, carry)); \
1176 } \
1177 env->vstart = 0; \
1178 /* the mask destination register is always tail-agnostic */ \
1179 /* set tail elements to 1s */ \
1180 if (vta_all_1s) { \
1181 for (; i < total_elems; i++) { \
1182 vext_set_elem_mask(vd, i, 1); \
1183 } \
1184 } \
1185 }
1186
1187 GEN_VEXT_VMADC_VVM(vmadc_vvm_b, uint8_t, H1, DO_MADC)
1188 GEN_VEXT_VMADC_VVM(vmadc_vvm_h, uint16_t, H2, DO_MADC)
1189 GEN_VEXT_VMADC_VVM(vmadc_vvm_w, uint32_t, H4, DO_MADC)
1190 GEN_VEXT_VMADC_VVM(vmadc_vvm_d, uint64_t, H8, DO_MADC)
1191
1192 GEN_VEXT_VMADC_VVM(vmsbc_vvm_b, uint8_t, H1, DO_MSBC)
1193 GEN_VEXT_VMADC_VVM(vmsbc_vvm_h, uint16_t, H2, DO_MSBC)
1194 GEN_VEXT_VMADC_VVM(vmsbc_vvm_w, uint32_t, H4, DO_MSBC)
1195 GEN_VEXT_VMADC_VVM(vmsbc_vvm_d, uint64_t, H8, DO_MSBC)
1196
1197 #define GEN_VEXT_VMADC_VXM(NAME, ETYPE, H, DO_OP) \
1198 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, \
1199 void *vs2, CPURISCVState *env, uint32_t desc) \
1200 { \
1201 uint32_t vl = env->vl; \
1202 uint32_t vm = vext_vm(desc); \
1203 uint32_t total_elems = env_archcpu(env)->cfg.vlen; \
1204 uint32_t vta_all_1s = vext_vta_all_1s(desc); \
1205 uint32_t i; \
1206 \
1207 for (i = env->vstart; i < vl; i++) { \
1208 ETYPE s2 = *((ETYPE *)vs2 + H(i)); \
1209 ETYPE carry = !vm && vext_elem_mask(v0, i); \
1210 vext_set_elem_mask(vd, i, \
1211 DO_OP(s2, (ETYPE)(target_long)s1, carry)); \
1212 } \
1213 env->vstart = 0; \
1214 /* the mask destination register is always tail-agnostic */ \
1215 /* set tail elements to 1s */ \
1216 if (vta_all_1s) { \
1217 for (; i < total_elems; i++) { \
1218 vext_set_elem_mask(vd, i, 1); \
1219 } \
1220 } \
1221 }
1222
1223 GEN_VEXT_VMADC_VXM(vmadc_vxm_b, uint8_t, H1, DO_MADC)
1224 GEN_VEXT_VMADC_VXM(vmadc_vxm_h, uint16_t, H2, DO_MADC)
1225 GEN_VEXT_VMADC_VXM(vmadc_vxm_w, uint32_t, H4, DO_MADC)
1226 GEN_VEXT_VMADC_VXM(vmadc_vxm_d, uint64_t, H8, DO_MADC)
1227
1228 GEN_VEXT_VMADC_VXM(vmsbc_vxm_b, uint8_t, H1, DO_MSBC)
1229 GEN_VEXT_VMADC_VXM(vmsbc_vxm_h, uint16_t, H2, DO_MSBC)
1230 GEN_VEXT_VMADC_VXM(vmsbc_vxm_w, uint32_t, H4, DO_MSBC)
1231 GEN_VEXT_VMADC_VXM(vmsbc_vxm_d, uint64_t, H8, DO_MSBC)
1232
1233 /* Vector Bitwise Logical Instructions */
1234 RVVCALL(OPIVV2, vand_vv_b, OP_SSS_B, H1, H1, H1, DO_AND)
1235 RVVCALL(OPIVV2, vand_vv_h, OP_SSS_H, H2, H2, H2, DO_AND)
1236 RVVCALL(OPIVV2, vand_vv_w, OP_SSS_W, H4, H4, H4, DO_AND)
1237 RVVCALL(OPIVV2, vand_vv_d, OP_SSS_D, H8, H8, H8, DO_AND)
1238 RVVCALL(OPIVV2, vor_vv_b, OP_SSS_B, H1, H1, H1, DO_OR)
1239 RVVCALL(OPIVV2, vor_vv_h, OP_SSS_H, H2, H2, H2, DO_OR)
1240 RVVCALL(OPIVV2, vor_vv_w, OP_SSS_W, H4, H4, H4, DO_OR)
1241 RVVCALL(OPIVV2, vor_vv_d, OP_SSS_D, H8, H8, H8, DO_OR)
1242 RVVCALL(OPIVV2, vxor_vv_b, OP_SSS_B, H1, H1, H1, DO_XOR)
1243 RVVCALL(OPIVV2, vxor_vv_h, OP_SSS_H, H2, H2, H2, DO_XOR)
1244 RVVCALL(OPIVV2, vxor_vv_w, OP_SSS_W, H4, H4, H4, DO_XOR)
1245 RVVCALL(OPIVV2, vxor_vv_d, OP_SSS_D, H8, H8, H8, DO_XOR)
1246 GEN_VEXT_VV(vand_vv_b, 1)
1247 GEN_VEXT_VV(vand_vv_h, 2)
1248 GEN_VEXT_VV(vand_vv_w, 4)
1249 GEN_VEXT_VV(vand_vv_d, 8)
1250 GEN_VEXT_VV(vor_vv_b, 1)
1251 GEN_VEXT_VV(vor_vv_h, 2)
1252 GEN_VEXT_VV(vor_vv_w, 4)
1253 GEN_VEXT_VV(vor_vv_d, 8)
1254 GEN_VEXT_VV(vxor_vv_b, 1)
1255 GEN_VEXT_VV(vxor_vv_h, 2)
1256 GEN_VEXT_VV(vxor_vv_w, 4)
1257 GEN_VEXT_VV(vxor_vv_d, 8)
1258
1259 RVVCALL(OPIVX2, vand_vx_b, OP_SSS_B, H1, H1, DO_AND)
1260 RVVCALL(OPIVX2, vand_vx_h, OP_SSS_H, H2, H2, DO_AND)
1261 RVVCALL(OPIVX2, vand_vx_w, OP_SSS_W, H4, H4, DO_AND)
1262 RVVCALL(OPIVX2, vand_vx_d, OP_SSS_D, H8, H8, DO_AND)
1263 RVVCALL(OPIVX2, vor_vx_b, OP_SSS_B, H1, H1, DO_OR)
1264 RVVCALL(OPIVX2, vor_vx_h, OP_SSS_H, H2, H2, DO_OR)
1265 RVVCALL(OPIVX2, vor_vx_w, OP_SSS_W, H4, H4, DO_OR)
1266 RVVCALL(OPIVX2, vor_vx_d, OP_SSS_D, H8, H8, DO_OR)
1267 RVVCALL(OPIVX2, vxor_vx_b, OP_SSS_B, H1, H1, DO_XOR)
1268 RVVCALL(OPIVX2, vxor_vx_h, OP_SSS_H, H2, H2, DO_XOR)
1269 RVVCALL(OPIVX2, vxor_vx_w, OP_SSS_W, H4, H4, DO_XOR)
1270 RVVCALL(OPIVX2, vxor_vx_d, OP_SSS_D, H8, H8, DO_XOR)
1271 GEN_VEXT_VX(vand_vx_b, 1)
1272 GEN_VEXT_VX(vand_vx_h, 2)
1273 GEN_VEXT_VX(vand_vx_w, 4)
1274 GEN_VEXT_VX(vand_vx_d, 8)
1275 GEN_VEXT_VX(vor_vx_b, 1)
1276 GEN_VEXT_VX(vor_vx_h, 2)
1277 GEN_VEXT_VX(vor_vx_w, 4)
1278 GEN_VEXT_VX(vor_vx_d, 8)
1279 GEN_VEXT_VX(vxor_vx_b, 1)
1280 GEN_VEXT_VX(vxor_vx_h, 2)
1281 GEN_VEXT_VX(vxor_vx_w, 4)
1282 GEN_VEXT_VX(vxor_vx_d, 8)
1283
1284 /* Vector Single-Width Bit Shift Instructions */
1285 #define DO_SLL(N, M) (N << (M))
1286 #define DO_SRL(N, M) (N >> (M))
1287
1288 /* generate the helpers for shift instructions with two vector operands */
1289 #define GEN_VEXT_SHIFT_VV(NAME, TS1, TS2, HS1, HS2, OP, MASK) \
1290 void HELPER(NAME)(void *vd, void *v0, void *vs1, \
1291 void *vs2, CPURISCVState *env, uint32_t desc) \
1292 { \
1293 uint32_t vm = vext_vm(desc); \
1294 uint32_t vl = env->vl; \
1295 uint32_t esz = sizeof(TS1); \
1296 uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
1297 uint32_t vta = vext_vta(desc); \
1298 uint32_t i; \
1299 \
1300 for (i = env->vstart; i < vl; i++) { \
1301 if (!vm && !vext_elem_mask(v0, i)) { \
1302 continue; \
1303 } \
1304 TS1 s1 = *((TS1 *)vs1 + HS1(i)); \
1305 TS2 s2 = *((TS2 *)vs2 + HS2(i)); \
1306 *((TS1 *)vd + HS1(i)) = OP(s2, s1 & MASK); \
1307 } \
1308 env->vstart = 0; \
1309 /* set tail elements to 1s */ \
1310 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
1311 }
1312
1313 GEN_VEXT_SHIFT_VV(vsll_vv_b, uint8_t, uint8_t, H1, H1, DO_SLL, 0x7)
1314 GEN_VEXT_SHIFT_VV(vsll_vv_h, uint16_t, uint16_t, H2, H2, DO_SLL, 0xf)
1315 GEN_VEXT_SHIFT_VV(vsll_vv_w, uint32_t, uint32_t, H4, H4, DO_SLL, 0x1f)
1316 GEN_VEXT_SHIFT_VV(vsll_vv_d, uint64_t, uint64_t, H8, H8, DO_SLL, 0x3f)
1317
1318 GEN_VEXT_SHIFT_VV(vsrl_vv_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1319 GEN_VEXT_SHIFT_VV(vsrl_vv_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1320 GEN_VEXT_SHIFT_VV(vsrl_vv_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1321 GEN_VEXT_SHIFT_VV(vsrl_vv_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1322
1323 GEN_VEXT_SHIFT_VV(vsra_vv_b, uint8_t, int8_t, H1, H1, DO_SRL, 0x7)
1324 GEN_VEXT_SHIFT_VV(vsra_vv_h, uint16_t, int16_t, H2, H2, DO_SRL, 0xf)
1325 GEN_VEXT_SHIFT_VV(vsra_vv_w, uint32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1326 GEN_VEXT_SHIFT_VV(vsra_vv_d, uint64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1327
1328 /* generate the helpers for shift instructions with one vector and one scalar */
1329 #define GEN_VEXT_SHIFT_VX(NAME, TD, TS2, HD, HS2, OP, MASK) \
1330 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, \
1331 void *vs2, CPURISCVState *env, uint32_t desc) \
1332 { \
1333 uint32_t vm = vext_vm(desc); \
1334 uint32_t vl = env->vl; \
1335 uint32_t esz = sizeof(TD); \
1336 uint32_t total_elems = \
1337 vext_get_total_elems(env, desc, esz); \
1338 uint32_t vta = vext_vta(desc); \
1339 uint32_t i; \
1340 \
1341 for (i = env->vstart; i < vl; i++) { \
1342 if (!vm && !vext_elem_mask(v0, i)) { \
1343 continue; \
1344 } \
1345 TS2 s2 = *((TS2 *)vs2 + HS2(i)); \
1346 *((TD *)vd + HD(i)) = OP(s2, s1 & MASK); \
1347 } \
1348 env->vstart = 0; \
1349 /* set tail elements to 1s */ \
1350 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);\
1351 }
1352
1353 GEN_VEXT_SHIFT_VX(vsll_vx_b, uint8_t, int8_t, H1, H1, DO_SLL, 0x7)
1354 GEN_VEXT_SHIFT_VX(vsll_vx_h, uint16_t, int16_t, H2, H2, DO_SLL, 0xf)
1355 GEN_VEXT_SHIFT_VX(vsll_vx_w, uint32_t, int32_t, H4, H4, DO_SLL, 0x1f)
1356 GEN_VEXT_SHIFT_VX(vsll_vx_d, uint64_t, int64_t, H8, H8, DO_SLL, 0x3f)
1357
1358 GEN_VEXT_SHIFT_VX(vsrl_vx_b, uint8_t, uint8_t, H1, H1, DO_SRL, 0x7)
1359 GEN_VEXT_SHIFT_VX(vsrl_vx_h, uint16_t, uint16_t, H2, H2, DO_SRL, 0xf)
1360 GEN_VEXT_SHIFT_VX(vsrl_vx_w, uint32_t, uint32_t, H4, H4, DO_SRL, 0x1f)
1361 GEN_VEXT_SHIFT_VX(vsrl_vx_d, uint64_t, uint64_t, H8, H8, DO_SRL, 0x3f)
1362
1363 GEN_VEXT_SHIFT_VX(vsra_vx_b, int8_t, int8_t, H1, H1, DO_SRL, 0x7)
1364 GEN_VEXT_SHIFT_VX(vsra_vx_h, int16_t, int16_t, H2, H2, DO_SRL, 0xf)
1365 GEN_VEXT_SHIFT_VX(vsra_vx_w, int32_t, int32_t, H4, H4, DO_SRL, 0x1f)
1366 GEN_VEXT_SHIFT_VX(vsra_vx_d, int64_t, int64_t, H8, H8, DO_SRL, 0x3f)
1367
1368 /* Vector Narrowing Integer Right Shift Instructions */
1369 GEN_VEXT_SHIFT_VV(vnsrl_wv_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1370 GEN_VEXT_SHIFT_VV(vnsrl_wv_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1371 GEN_VEXT_SHIFT_VV(vnsrl_wv_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1372 GEN_VEXT_SHIFT_VV(vnsra_wv_b, uint8_t, int16_t, H1, H2, DO_SRL, 0xf)
1373 GEN_VEXT_SHIFT_VV(vnsra_wv_h, uint16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1374 GEN_VEXT_SHIFT_VV(vnsra_wv_w, uint32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1375 GEN_VEXT_SHIFT_VX(vnsrl_wx_b, uint8_t, uint16_t, H1, H2, DO_SRL, 0xf)
1376 GEN_VEXT_SHIFT_VX(vnsrl_wx_h, uint16_t, uint32_t, H2, H4, DO_SRL, 0x1f)
1377 GEN_VEXT_SHIFT_VX(vnsrl_wx_w, uint32_t, uint64_t, H4, H8, DO_SRL, 0x3f)
1378 GEN_VEXT_SHIFT_VX(vnsra_wx_b, int8_t, int16_t, H1, H2, DO_SRL, 0xf)
1379 GEN_VEXT_SHIFT_VX(vnsra_wx_h, int16_t, int32_t, H2, H4, DO_SRL, 0x1f)
1380 GEN_VEXT_SHIFT_VX(vnsra_wx_w, int32_t, int64_t, H4, H8, DO_SRL, 0x3f)
1381
1382 /* Vector Integer Comparison Instructions */
1383 #define DO_MSEQ(N, M) (N == M)
1384 #define DO_MSNE(N, M) (N != M)
1385 #define DO_MSLT(N, M) (N < M)
1386 #define DO_MSLE(N, M) (N <= M)
1387 #define DO_MSGT(N, M) (N > M)
1388
1389 #define GEN_VEXT_CMP_VV(NAME, ETYPE, H, DO_OP) \
1390 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \
1391 CPURISCVState *env, uint32_t desc) \
1392 { \
1393 uint32_t vm = vext_vm(desc); \
1394 uint32_t vl = env->vl; \
1395 uint32_t total_elems = env_archcpu(env)->cfg.vlen; \
1396 uint32_t vta_all_1s = vext_vta_all_1s(desc); \
1397 uint32_t i; \
1398 \
1399 for (i = env->vstart; i < vl; i++) { \
1400 ETYPE s1 = *((ETYPE *)vs1 + H(i)); \
1401 ETYPE s2 = *((ETYPE *)vs2 + H(i)); \
1402 if (!vm && !vext_elem_mask(v0, i)) { \
1403 continue; \
1404 } \
1405 vext_set_elem_mask(vd, i, DO_OP(s2, s1)); \
1406 } \
1407 env->vstart = 0; \
1408 /* the mask destination register is always tail-agnostic */ \
1409 /* set tail elements to 1s */ \
1410 if (vta_all_1s) { \
1411 for (; i < total_elems; i++) { \
1412 vext_set_elem_mask(vd, i, 1); \
1413 } \
1414 } \
1415 }
1416
1417 GEN_VEXT_CMP_VV(vmseq_vv_b, uint8_t, H1, DO_MSEQ)
1418 GEN_VEXT_CMP_VV(vmseq_vv_h, uint16_t, H2, DO_MSEQ)
1419 GEN_VEXT_CMP_VV(vmseq_vv_w, uint32_t, H4, DO_MSEQ)
1420 GEN_VEXT_CMP_VV(vmseq_vv_d, uint64_t, H8, DO_MSEQ)
1421
1422 GEN_VEXT_CMP_VV(vmsne_vv_b, uint8_t, H1, DO_MSNE)
1423 GEN_VEXT_CMP_VV(vmsne_vv_h, uint16_t, H2, DO_MSNE)
1424 GEN_VEXT_CMP_VV(vmsne_vv_w, uint32_t, H4, DO_MSNE)
1425 GEN_VEXT_CMP_VV(vmsne_vv_d, uint64_t, H8, DO_MSNE)
1426
1427 GEN_VEXT_CMP_VV(vmsltu_vv_b, uint8_t, H1, DO_MSLT)
1428 GEN_VEXT_CMP_VV(vmsltu_vv_h, uint16_t, H2, DO_MSLT)
1429 GEN_VEXT_CMP_VV(vmsltu_vv_w, uint32_t, H4, DO_MSLT)
1430 GEN_VEXT_CMP_VV(vmsltu_vv_d, uint64_t, H8, DO_MSLT)
1431
1432 GEN_VEXT_CMP_VV(vmslt_vv_b, int8_t, H1, DO_MSLT)
1433 GEN_VEXT_CMP_VV(vmslt_vv_h, int16_t, H2, DO_MSLT)
1434 GEN_VEXT_CMP_VV(vmslt_vv_w, int32_t, H4, DO_MSLT)
1435 GEN_VEXT_CMP_VV(vmslt_vv_d, int64_t, H8, DO_MSLT)
1436
1437 GEN_VEXT_CMP_VV(vmsleu_vv_b, uint8_t, H1, DO_MSLE)
1438 GEN_VEXT_CMP_VV(vmsleu_vv_h, uint16_t, H2, DO_MSLE)
1439 GEN_VEXT_CMP_VV(vmsleu_vv_w, uint32_t, H4, DO_MSLE)
1440 GEN_VEXT_CMP_VV(vmsleu_vv_d, uint64_t, H8, DO_MSLE)
1441
1442 GEN_VEXT_CMP_VV(vmsle_vv_b, int8_t, H1, DO_MSLE)
1443 GEN_VEXT_CMP_VV(vmsle_vv_h, int16_t, H2, DO_MSLE)
1444 GEN_VEXT_CMP_VV(vmsle_vv_w, int32_t, H4, DO_MSLE)
1445 GEN_VEXT_CMP_VV(vmsle_vv_d, int64_t, H8, DO_MSLE)
1446
1447 #define GEN_VEXT_CMP_VX(NAME, ETYPE, H, DO_OP) \
1448 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
1449 CPURISCVState *env, uint32_t desc) \
1450 { \
1451 uint32_t vm = vext_vm(desc); \
1452 uint32_t vl = env->vl; \
1453 uint32_t total_elems = env_archcpu(env)->cfg.vlen; \
1454 uint32_t vta_all_1s = vext_vta_all_1s(desc); \
1455 uint32_t i; \
1456 \
1457 for (i = env->vstart; i < vl; i++) { \
1458 ETYPE s2 = *((ETYPE *)vs2 + H(i)); \
1459 if (!vm && !vext_elem_mask(v0, i)) { \
1460 continue; \
1461 } \
1462 vext_set_elem_mask(vd, i, \
1463 DO_OP(s2, (ETYPE)(target_long)s1)); \
1464 } \
1465 env->vstart = 0; \
1466 /* the mask destination register is always tail-agnostic */ \
1467 /* set tail elements to 1s */ \
1468 if (vta_all_1s) { \
1469 for (; i < total_elems; i++) { \
1470 vext_set_elem_mask(vd, i, 1); \
1471 } \
1472 } \
1473 }
1474
1475 GEN_VEXT_CMP_VX(vmseq_vx_b, uint8_t, H1, DO_MSEQ)
1476 GEN_VEXT_CMP_VX(vmseq_vx_h, uint16_t, H2, DO_MSEQ)
1477 GEN_VEXT_CMP_VX(vmseq_vx_w, uint32_t, H4, DO_MSEQ)
1478 GEN_VEXT_CMP_VX(vmseq_vx_d, uint64_t, H8, DO_MSEQ)
1479
1480 GEN_VEXT_CMP_VX(vmsne_vx_b, uint8_t, H1, DO_MSNE)
1481 GEN_VEXT_CMP_VX(vmsne_vx_h, uint16_t, H2, DO_MSNE)
1482 GEN_VEXT_CMP_VX(vmsne_vx_w, uint32_t, H4, DO_MSNE)
1483 GEN_VEXT_CMP_VX(vmsne_vx_d, uint64_t, H8, DO_MSNE)
1484
1485 GEN_VEXT_CMP_VX(vmsltu_vx_b, uint8_t, H1, DO_MSLT)
1486 GEN_VEXT_CMP_VX(vmsltu_vx_h, uint16_t, H2, DO_MSLT)
1487 GEN_VEXT_CMP_VX(vmsltu_vx_w, uint32_t, H4, DO_MSLT)
1488 GEN_VEXT_CMP_VX(vmsltu_vx_d, uint64_t, H8, DO_MSLT)
1489
1490 GEN_VEXT_CMP_VX(vmslt_vx_b, int8_t, H1, DO_MSLT)
1491 GEN_VEXT_CMP_VX(vmslt_vx_h, int16_t, H2, DO_MSLT)
1492 GEN_VEXT_CMP_VX(vmslt_vx_w, int32_t, H4, DO_MSLT)
1493 GEN_VEXT_CMP_VX(vmslt_vx_d, int64_t, H8, DO_MSLT)
1494
1495 GEN_VEXT_CMP_VX(vmsleu_vx_b, uint8_t, H1, DO_MSLE)
1496 GEN_VEXT_CMP_VX(vmsleu_vx_h, uint16_t, H2, DO_MSLE)
1497 GEN_VEXT_CMP_VX(vmsleu_vx_w, uint32_t, H4, DO_MSLE)
1498 GEN_VEXT_CMP_VX(vmsleu_vx_d, uint64_t, H8, DO_MSLE)
1499
1500 GEN_VEXT_CMP_VX(vmsle_vx_b, int8_t, H1, DO_MSLE)
1501 GEN_VEXT_CMP_VX(vmsle_vx_h, int16_t, H2, DO_MSLE)
1502 GEN_VEXT_CMP_VX(vmsle_vx_w, int32_t, H4, DO_MSLE)
1503 GEN_VEXT_CMP_VX(vmsle_vx_d, int64_t, H8, DO_MSLE)
1504
1505 GEN_VEXT_CMP_VX(vmsgtu_vx_b, uint8_t, H1, DO_MSGT)
1506 GEN_VEXT_CMP_VX(vmsgtu_vx_h, uint16_t, H2, DO_MSGT)
1507 GEN_VEXT_CMP_VX(vmsgtu_vx_w, uint32_t, H4, DO_MSGT)
1508 GEN_VEXT_CMP_VX(vmsgtu_vx_d, uint64_t, H8, DO_MSGT)
1509
1510 GEN_VEXT_CMP_VX(vmsgt_vx_b, int8_t, H1, DO_MSGT)
1511 GEN_VEXT_CMP_VX(vmsgt_vx_h, int16_t, H2, DO_MSGT)
1512 GEN_VEXT_CMP_VX(vmsgt_vx_w, int32_t, H4, DO_MSGT)
1513 GEN_VEXT_CMP_VX(vmsgt_vx_d, int64_t, H8, DO_MSGT)
1514
1515 /* Vector Integer Min/Max Instructions */
1516 RVVCALL(OPIVV2, vminu_vv_b, OP_UUU_B, H1, H1, H1, DO_MIN)
1517 RVVCALL(OPIVV2, vminu_vv_h, OP_UUU_H, H2, H2, H2, DO_MIN)
1518 RVVCALL(OPIVV2, vminu_vv_w, OP_UUU_W, H4, H4, H4, DO_MIN)
1519 RVVCALL(OPIVV2, vminu_vv_d, OP_UUU_D, H8, H8, H8, DO_MIN)
1520 RVVCALL(OPIVV2, vmin_vv_b, OP_SSS_B, H1, H1, H1, DO_MIN)
1521 RVVCALL(OPIVV2, vmin_vv_h, OP_SSS_H, H2, H2, H2, DO_MIN)
1522 RVVCALL(OPIVV2, vmin_vv_w, OP_SSS_W, H4, H4, H4, DO_MIN)
1523 RVVCALL(OPIVV2, vmin_vv_d, OP_SSS_D, H8, H8, H8, DO_MIN)
1524 RVVCALL(OPIVV2, vmaxu_vv_b, OP_UUU_B, H1, H1, H1, DO_MAX)
1525 RVVCALL(OPIVV2, vmaxu_vv_h, OP_UUU_H, H2, H2, H2, DO_MAX)
1526 RVVCALL(OPIVV2, vmaxu_vv_w, OP_UUU_W, H4, H4, H4, DO_MAX)
1527 RVVCALL(OPIVV2, vmaxu_vv_d, OP_UUU_D, H8, H8, H8, DO_MAX)
1528 RVVCALL(OPIVV2, vmax_vv_b, OP_SSS_B, H1, H1, H1, DO_MAX)
1529 RVVCALL(OPIVV2, vmax_vv_h, OP_SSS_H, H2, H2, H2, DO_MAX)
1530 RVVCALL(OPIVV2, vmax_vv_w, OP_SSS_W, H4, H4, H4, DO_MAX)
1531 RVVCALL(OPIVV2, vmax_vv_d, OP_SSS_D, H8, H8, H8, DO_MAX)
1532 GEN_VEXT_VV(vminu_vv_b, 1)
1533 GEN_VEXT_VV(vminu_vv_h, 2)
1534 GEN_VEXT_VV(vminu_vv_w, 4)
1535 GEN_VEXT_VV(vminu_vv_d, 8)
1536 GEN_VEXT_VV(vmin_vv_b, 1)
1537 GEN_VEXT_VV(vmin_vv_h, 2)
1538 GEN_VEXT_VV(vmin_vv_w, 4)
1539 GEN_VEXT_VV(vmin_vv_d, 8)
1540 GEN_VEXT_VV(vmaxu_vv_b, 1)
1541 GEN_VEXT_VV(vmaxu_vv_h, 2)
1542 GEN_VEXT_VV(vmaxu_vv_w, 4)
1543 GEN_VEXT_VV(vmaxu_vv_d, 8)
1544 GEN_VEXT_VV(vmax_vv_b, 1)
1545 GEN_VEXT_VV(vmax_vv_h, 2)
1546 GEN_VEXT_VV(vmax_vv_w, 4)
1547 GEN_VEXT_VV(vmax_vv_d, 8)
1548
1549 RVVCALL(OPIVX2, vminu_vx_b, OP_UUU_B, H1, H1, DO_MIN)
1550 RVVCALL(OPIVX2, vminu_vx_h, OP_UUU_H, H2, H2, DO_MIN)
1551 RVVCALL(OPIVX2, vminu_vx_w, OP_UUU_W, H4, H4, DO_MIN)
1552 RVVCALL(OPIVX2, vminu_vx_d, OP_UUU_D, H8, H8, DO_MIN)
1553 RVVCALL(OPIVX2, vmin_vx_b, OP_SSS_B, H1, H1, DO_MIN)
1554 RVVCALL(OPIVX2, vmin_vx_h, OP_SSS_H, H2, H2, DO_MIN)
1555 RVVCALL(OPIVX2, vmin_vx_w, OP_SSS_W, H4, H4, DO_MIN)
1556 RVVCALL(OPIVX2, vmin_vx_d, OP_SSS_D, H8, H8, DO_MIN)
1557 RVVCALL(OPIVX2, vmaxu_vx_b, OP_UUU_B, H1, H1, DO_MAX)
1558 RVVCALL(OPIVX2, vmaxu_vx_h, OP_UUU_H, H2, H2, DO_MAX)
1559 RVVCALL(OPIVX2, vmaxu_vx_w, OP_UUU_W, H4, H4, DO_MAX)
1560 RVVCALL(OPIVX2, vmaxu_vx_d, OP_UUU_D, H8, H8, DO_MAX)
1561 RVVCALL(OPIVX2, vmax_vx_b, OP_SSS_B, H1, H1, DO_MAX)
1562 RVVCALL(OPIVX2, vmax_vx_h, OP_SSS_H, H2, H2, DO_MAX)
1563 RVVCALL(OPIVX2, vmax_vx_w, OP_SSS_W, H4, H4, DO_MAX)
1564 RVVCALL(OPIVX2, vmax_vx_d, OP_SSS_D, H8, H8, DO_MAX)
1565 GEN_VEXT_VX(vminu_vx_b, 1)
1566 GEN_VEXT_VX(vminu_vx_h, 2)
1567 GEN_VEXT_VX(vminu_vx_w, 4)
1568 GEN_VEXT_VX(vminu_vx_d, 8)
1569 GEN_VEXT_VX(vmin_vx_b, 1)
1570 GEN_VEXT_VX(vmin_vx_h, 2)
1571 GEN_VEXT_VX(vmin_vx_w, 4)
1572 GEN_VEXT_VX(vmin_vx_d, 8)
1573 GEN_VEXT_VX(vmaxu_vx_b, 1)
1574 GEN_VEXT_VX(vmaxu_vx_h, 2)
1575 GEN_VEXT_VX(vmaxu_vx_w, 4)
1576 GEN_VEXT_VX(vmaxu_vx_d, 8)
1577 GEN_VEXT_VX(vmax_vx_b, 1)
1578 GEN_VEXT_VX(vmax_vx_h, 2)
1579 GEN_VEXT_VX(vmax_vx_w, 4)
1580 GEN_VEXT_VX(vmax_vx_d, 8)
1581
1582 /* Vector Single-Width Integer Multiply Instructions */
1583 #define DO_MUL(N, M) (N * M)
1584 RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
1585 RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
1586 RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
1587 RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
1588 GEN_VEXT_VV(vmul_vv_b, 1)
1589 GEN_VEXT_VV(vmul_vv_h, 2)
1590 GEN_VEXT_VV(vmul_vv_w, 4)
1591 GEN_VEXT_VV(vmul_vv_d, 8)
1592
1593 static int8_t do_mulh_b(int8_t s2, int8_t s1)
1594 {
1595 return (int16_t)s2 * (int16_t)s1 >> 8;
1596 }
1597
1598 static int16_t do_mulh_h(int16_t s2, int16_t s1)
1599 {
1600 return (int32_t)s2 * (int32_t)s1 >> 16;
1601 }
1602
1603 static int32_t do_mulh_w(int32_t s2, int32_t s1)
1604 {
1605 return (int64_t)s2 * (int64_t)s1 >> 32;
1606 }
1607
1608 static int64_t do_mulh_d(int64_t s2, int64_t s1)
1609 {
1610 uint64_t hi_64, lo_64;
1611
1612 muls64(&lo_64, &hi_64, s1, s2);
1613 return hi_64;
1614 }
1615
1616 static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
1617 {
1618 return (uint16_t)s2 * (uint16_t)s1 >> 8;
1619 }
1620
1621 static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
1622 {
1623 return (uint32_t)s2 * (uint32_t)s1 >> 16;
1624 }
1625
1626 static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
1627 {
1628 return (uint64_t)s2 * (uint64_t)s1 >> 32;
1629 }
1630
1631 static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
1632 {
1633 uint64_t hi_64, lo_64;
1634
1635 mulu64(&lo_64, &hi_64, s2, s1);
1636 return hi_64;
1637 }
1638
1639 static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
1640 {
1641 return (int16_t)s2 * (uint16_t)s1 >> 8;
1642 }
1643
1644 static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
1645 {
1646 return (int32_t)s2 * (uint32_t)s1 >> 16;
1647 }
1648
1649 static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
1650 {
1651 return (int64_t)s2 * (uint64_t)s1 >> 32;
1652 }
1653
1654 /*
1655 * Let  A = signed operand,
1656 *      B = unsigned operand
1657 *      P = mulu64(A, B), unsigned product
1658 *
1659 * LET  X = 2 ** 64 - A, 2's complement of A
1660 *      SP = signed product
1661 * THEN
1662 *      IF A < 0
1663 *          SP = -X * B
1664 *             = -(2 ** 64 - A) * B
1665 *             = A * B - 2 ** 64 * B
1666 *             = P - 2 ** 64 * B
1667 *      ELSE
1668 *          SP = P
1669 * THEN
1670 *      HI_P -= (A < 0 ? B : 0)
1671 */
1672
1673 static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
1674 {
1675 uint64_t hi_64, lo_64;
1676
1677 mulu64(&lo_64, &hi_64, s2, s1);
1678
1679 hi_64 -= s2 < 0 ? s1 : 0;
1680 return hi_64;
1681 }
1682
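/*
 * Illustrative sketch, not part of the upstream helper set: the same
 * "subtract B from the high half when A is negative" fixup at 8-bit
 * width, where the exact wide product is cheap to check against.
 * mulhsu8_fixup() and check_mulhsu8_fixup() are hypothetical names;
 * like the helpers above, the code relies on two's-complement wrapping
 * of the narrow unsigned arithmetic.
 */
static inline int8_t mulhsu8_fixup(int8_t a, uint8_t b)
{
    uint16_t p = (uint8_t)a * (uint16_t)b;  /* unsigned product of the raw bits */
    uint8_t hi = p >> 8;

    hi -= a < 0 ? b : 0;                    /* HI_P -= (A < 0 ? B : 0) */
    return (int8_t)hi;
}

static inline void check_mulhsu8_fixup(void)
{
    assert(mulhsu8_fixup(-1, 255) == do_mulhsu_b(-1, 255));  /* both -1 */
    assert(mulhsu8_fixup(-128, 3) == do_mulhsu_b(-128, 3));  /* both -2 */
}
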
1683 RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
1684 RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
1685 RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
1686 RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
1687 RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
1688 RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
1689 RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
1690 RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
1691 RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
1692 RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
1693 RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
1694 RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
1695 GEN_VEXT_VV(vmulh_vv_b, 1)
1696 GEN_VEXT_VV(vmulh_vv_h, 2)
1697 GEN_VEXT_VV(vmulh_vv_w, 4)
1698 GEN_VEXT_VV(vmulh_vv_d, 8)
1699 GEN_VEXT_VV(vmulhu_vv_b, 1)
1700 GEN_VEXT_VV(vmulhu_vv_h, 2)
1701 GEN_VEXT_VV(vmulhu_vv_w, 4)
1702 GEN_VEXT_VV(vmulhu_vv_d, 8)
1703 GEN_VEXT_VV(vmulhsu_vv_b, 1)
1704 GEN_VEXT_VV(vmulhsu_vv_h, 2)
1705 GEN_VEXT_VV(vmulhsu_vv_w, 4)
1706 GEN_VEXT_VV(vmulhsu_vv_d, 8)
1707
1708 RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
1709 RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
1710 RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
1711 RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
1712 RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
1713 RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
1714 RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
1715 RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
1716 RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
1717 RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
1718 RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
1719 RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
1720 RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
1721 RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
1722 RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
1723 RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
1724 GEN_VEXT_VX(vmul_vx_b, 1)
1725 GEN_VEXT_VX(vmul_vx_h, 2)
1726 GEN_VEXT_VX(vmul_vx_w, 4)
1727 GEN_VEXT_VX(vmul_vx_d, 8)
1728 GEN_VEXT_VX(vmulh_vx_b, 1)
1729 GEN_VEXT_VX(vmulh_vx_h, 2)
1730 GEN_VEXT_VX(vmulh_vx_w, 4)
1731 GEN_VEXT_VX(vmulh_vx_d, 8)
1732 GEN_VEXT_VX(vmulhu_vx_b, 1)
1733 GEN_VEXT_VX(vmulhu_vx_h, 2)
1734 GEN_VEXT_VX(vmulhu_vx_w, 4)
1735 GEN_VEXT_VX(vmulhu_vx_d, 8)
1736 GEN_VEXT_VX(vmulhsu_vx_b, 1)
1737 GEN_VEXT_VX(vmulhsu_vx_h, 2)
1738 GEN_VEXT_VX(vmulhsu_vx_w, 4)
1739 GEN_VEXT_VX(vmulhsu_vx_d, 8)
1740
1741 /* Vector Integer Divide Instructions */
1742 #define DO_DIVU(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) : N / M)
1743 #define DO_REMU(N, M) (unlikely(M == 0) ? N : N % M)
1744 #define DO_DIV(N, M) (unlikely(M == 0) ? (__typeof(N))(-1) :\
1745 unlikely((N == -N) && (M == (__typeof(N))(-1))) ? N : N / M)
1746 #define DO_REM(N, M) (unlikely(M == 0) ? N :\
1747 unlikely((N == -N) && (M == (__typeof(N))(-1))) ? 0 : N % M)
1748
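/*
 * Illustrative only (check_div_special_cases() is a hypothetical name,
 * not an upstream helper): the special results the macros above encode
 * for RVV.  Note that "(N == -N)" is how the macros detect N == INT_MIN,
 * since negating the minimum value wraps back to itself in
 * two's-complement arithmetic.
 */
static inline void check_div_special_cases(void)
{
    uint32_t u = 5, zu = 0;
    int32_t s = INT32_MIN, m1 = -1;

    assert(DO_DIVU(u, zu) == UINT32_MAX);   /* x / 0  -> all ones      */
    assert(DO_REMU(u, zu) == u);            /* x % 0  -> x             */
    assert(DO_DIV(s, m1) == INT32_MIN);     /* INT_MIN / -1 -> INT_MIN */
    assert(DO_REM(s, m1) == 0);             /* INT_MIN % -1 -> 0       */
}
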
1749 RVVCALL(OPIVV2, vdivu_vv_b, OP_UUU_B, H1, H1, H1, DO_DIVU)
1750 RVVCALL(OPIVV2, vdivu_vv_h, OP_UUU_H, H2, H2, H2, DO_DIVU)
1751 RVVCALL(OPIVV2, vdivu_vv_w, OP_UUU_W, H4, H4, H4, DO_DIVU)
1752 RVVCALL(OPIVV2, vdivu_vv_d, OP_UUU_D, H8, H8, H8, DO_DIVU)
1753 RVVCALL(OPIVV2, vdiv_vv_b, OP_SSS_B, H1, H1, H1, DO_DIV)
1754 RVVCALL(OPIVV2, vdiv_vv_h, OP_SSS_H, H2, H2, H2, DO_DIV)
1755 RVVCALL(OPIVV2, vdiv_vv_w, OP_SSS_W, H4, H4, H4, DO_DIV)
1756 RVVCALL(OPIVV2, vdiv_vv_d, OP_SSS_D, H8, H8, H8, DO_DIV)
1757 RVVCALL(OPIVV2, vremu_vv_b, OP_UUU_B, H1, H1, H1, DO_REMU)
1758 RVVCALL(OPIVV2, vremu_vv_h, OP_UUU_H, H2, H2, H2, DO_REMU)
1759 RVVCALL(OPIVV2, vremu_vv_w, OP_UUU_W, H4, H4, H4, DO_REMU)
1760 RVVCALL(OPIVV2, vremu_vv_d, OP_UUU_D, H8, H8, H8, DO_REMU)
1761 RVVCALL(OPIVV2, vrem_vv_b, OP_SSS_B, H1, H1, H1, DO_REM)
1762 RVVCALL(OPIVV2, vrem_vv_h, OP_SSS_H, H2, H2, H2, DO_REM)
1763 RVVCALL(OPIVV2, vrem_vv_w, OP_SSS_W, H4, H4, H4, DO_REM)
1764 RVVCALL(OPIVV2, vrem_vv_d, OP_SSS_D, H8, H8, H8, DO_REM)
1765 GEN_VEXT_VV(vdivu_vv_b, 1)
1766 GEN_VEXT_VV(vdivu_vv_h, 2)
1767 GEN_VEXT_VV(vdivu_vv_w, 4)
1768 GEN_VEXT_VV(vdivu_vv_d, 8)
1769 GEN_VEXT_VV(vdiv_vv_b, 1)
1770 GEN_VEXT_VV(vdiv_vv_h, 2)
1771 GEN_VEXT_VV(vdiv_vv_w, 4)
1772 GEN_VEXT_VV(vdiv_vv_d, 8)
1773 GEN_VEXT_VV(vremu_vv_b, 1)
1774 GEN_VEXT_VV(vremu_vv_h, 2)
1775 GEN_VEXT_VV(vremu_vv_w, 4)
1776 GEN_VEXT_VV(vremu_vv_d, 8)
1777 GEN_VEXT_VV(vrem_vv_b, 1)
1778 GEN_VEXT_VV(vrem_vv_h, 2)
1779 GEN_VEXT_VV(vrem_vv_w, 4)
1780 GEN_VEXT_VV(vrem_vv_d, 8)
1781
1782 RVVCALL(OPIVX2, vdivu_vx_b, OP_UUU_B, H1, H1, DO_DIVU)
1783 RVVCALL(OPIVX2, vdivu_vx_h, OP_UUU_H, H2, H2, DO_DIVU)
1784 RVVCALL(OPIVX2, vdivu_vx_w, OP_UUU_W, H4, H4, DO_DIVU)
1785 RVVCALL(OPIVX2, vdivu_vx_d, OP_UUU_D, H8, H8, DO_DIVU)
1786 RVVCALL(OPIVX2, vdiv_vx_b, OP_SSS_B, H1, H1, DO_DIV)
1787 RVVCALL(OPIVX2, vdiv_vx_h, OP_SSS_H, H2, H2, DO_DIV)
1788 RVVCALL(OPIVX2, vdiv_vx_w, OP_SSS_W, H4, H4, DO_DIV)
1789 RVVCALL(OPIVX2, vdiv_vx_d, OP_SSS_D, H8, H8, DO_DIV)
1790 RVVCALL(OPIVX2, vremu_vx_b, OP_UUU_B, H1, H1, DO_REMU)
1791 RVVCALL(OPIVX2, vremu_vx_h, OP_UUU_H, H2, H2, DO_REMU)
1792 RVVCALL(OPIVX2, vremu_vx_w, OP_UUU_W, H4, H4, DO_REMU)
1793 RVVCALL(OPIVX2, vremu_vx_d, OP_UUU_D, H8, H8, DO_REMU)
1794 RVVCALL(OPIVX2, vrem_vx_b, OP_SSS_B, H1, H1, DO_REM)
1795 RVVCALL(OPIVX2, vrem_vx_h, OP_SSS_H, H2, H2, DO_REM)
1796 RVVCALL(OPIVX2, vrem_vx_w, OP_SSS_W, H4, H4, DO_REM)
1797 RVVCALL(OPIVX2, vrem_vx_d, OP_SSS_D, H8, H8, DO_REM)
1798 GEN_VEXT_VX(vdivu_vx_b, 1)
1799 GEN_VEXT_VX(vdivu_vx_h, 2)
1800 GEN_VEXT_VX(vdivu_vx_w, 4)
1801 GEN_VEXT_VX(vdivu_vx_d, 8)
1802 GEN_VEXT_VX(vdiv_vx_b, 1)
1803 GEN_VEXT_VX(vdiv_vx_h, 2)
1804 GEN_VEXT_VX(vdiv_vx_w, 4)
1805 GEN_VEXT_VX(vdiv_vx_d, 8)
1806 GEN_VEXT_VX(vremu_vx_b, 1)
1807 GEN_VEXT_VX(vremu_vx_h, 2)
1808 GEN_VEXT_VX(vremu_vx_w, 4)
1809 GEN_VEXT_VX(vremu_vx_d, 8)
1810 GEN_VEXT_VX(vrem_vx_b, 1)
1811 GEN_VEXT_VX(vrem_vx_h, 2)
1812 GEN_VEXT_VX(vrem_vx_w, 4)
1813 GEN_VEXT_VX(vrem_vx_d, 8)
1814
1815 /* Vector Widening Integer Multiply Instructions */
1816 RVVCALL(OPIVV2, vwmul_vv_b, WOP_SSS_B, H2, H1, H1, DO_MUL)
1817 RVVCALL(OPIVV2, vwmul_vv_h, WOP_SSS_H, H4, H2, H2, DO_MUL)
1818 RVVCALL(OPIVV2, vwmul_vv_w, WOP_SSS_W, H8, H4, H4, DO_MUL)
1819 RVVCALL(OPIVV2, vwmulu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MUL)
1820 RVVCALL(OPIVV2, vwmulu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MUL)
1821 RVVCALL(OPIVV2, vwmulu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MUL)
1822 RVVCALL(OPIVV2, vwmulsu_vv_b, WOP_SUS_B, H2, H1, H1, DO_MUL)
1823 RVVCALL(OPIVV2, vwmulsu_vv_h, WOP_SUS_H, H4, H2, H2, DO_MUL)
1824 RVVCALL(OPIVV2, vwmulsu_vv_w, WOP_SUS_W, H8, H4, H4, DO_MUL)
1825 GEN_VEXT_VV(vwmul_vv_b, 2)
1826 GEN_VEXT_VV(vwmul_vv_h, 4)
1827 GEN_VEXT_VV(vwmul_vv_w, 8)
1828 GEN_VEXT_VV(vwmulu_vv_b, 2)
1829 GEN_VEXT_VV(vwmulu_vv_h, 4)
1830 GEN_VEXT_VV(vwmulu_vv_w, 8)
1831 GEN_VEXT_VV(vwmulsu_vv_b, 2)
1832 GEN_VEXT_VV(vwmulsu_vv_h, 4)
1833 GEN_VEXT_VV(vwmulsu_vv_w, 8)
1834
1835 RVVCALL(OPIVX2, vwmul_vx_b, WOP_SSS_B, H2, H1, DO_MUL)
1836 RVVCALL(OPIVX2, vwmul_vx_h, WOP_SSS_H, H4, H2, DO_MUL)
1837 RVVCALL(OPIVX2, vwmul_vx_w, WOP_SSS_W, H8, H4, DO_MUL)
1838 RVVCALL(OPIVX2, vwmulu_vx_b, WOP_UUU_B, H2, H1, DO_MUL)
1839 RVVCALL(OPIVX2, vwmulu_vx_h, WOP_UUU_H, H4, H2, DO_MUL)
1840 RVVCALL(OPIVX2, vwmulu_vx_w, WOP_UUU_W, H8, H4, DO_MUL)
1841 RVVCALL(OPIVX2, vwmulsu_vx_b, WOP_SUS_B, H2, H1, DO_MUL)
1842 RVVCALL(OPIVX2, vwmulsu_vx_h, WOP_SUS_H, H4, H2, DO_MUL)
1843 RVVCALL(OPIVX2, vwmulsu_vx_w, WOP_SUS_W, H8, H4, DO_MUL)
1844 GEN_VEXT_VX(vwmul_vx_b, 2)
1845 GEN_VEXT_VX(vwmul_vx_h, 4)
1846 GEN_VEXT_VX(vwmul_vx_w, 8)
1847 GEN_VEXT_VX(vwmulu_vx_b, 2)
1848 GEN_VEXT_VX(vwmulu_vx_h, 4)
1849 GEN_VEXT_VX(vwmulu_vx_w, 8)
1850 GEN_VEXT_VX(vwmulsu_vx_b, 2)
1851 GEN_VEXT_VX(vwmulsu_vx_h, 4)
1852 GEN_VEXT_VX(vwmulsu_vx_w, 8)
1853
1854 /* Vector Single-Width Integer Multiply-Add Instructions */
1855 #define OPIVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP) \
1856 static void do_##NAME(void *vd, void *vs1, void *vs2, int i) \
1857 { \
1858 TX1 s1 = *((T1 *)vs1 + HS1(i)); \
1859 TX2 s2 = *((T2 *)vs2 + HS2(i)); \
1860 TD d = *((TD *)vd + HD(i)); \
1861 *((TD *)vd + HD(i)) = OP(s2, s1, d); \
1862 }
1863
1864 #define DO_MACC(N, M, D) (M * N + D)
1865 #define DO_NMSAC(N, M, D) (-(M * N) + D)
1866 #define DO_MADD(N, M, D) (M * D + N)
1867 #define DO_NMSUB(N, M, D) (-(M * D) + N)
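
/*
 * Illustrative only (check_macc_forms() is a hypothetical name): with
 * OP(s2, s1, d) as invoked by the OPIVV3/OPIVX3 expanders, the macros
 * above map to
 *   vmacc:  vd = vs1 * vs2 + vd
 *   vnmsac: vd = -(vs1 * vs2) + vd
 *   vmadd:  vd = vs1 * vd + vs2
 *   vnmsub: vd = -(vs1 * vd) + vs2
 */
static inline void check_macc_forms(void)
{
    /* s2 (vs2) = 3, s1 (vs1) = 5, d (old vd) = 7 */
    assert(DO_MACC(3, 5, 7) == 22);     /* 5 * 3 + 7    */
    assert(DO_NMSAC(3, 5, 7) == -8);    /* -(5 * 3) + 7 */
    assert(DO_MADD(3, 5, 7) == 38);     /* 5 * 7 + 3    */
    assert(DO_NMSUB(3, 5, 7) == -32);   /* -(5 * 7) + 3 */
}
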
1868 RVVCALL(OPIVV3, vmacc_vv_b, OP_SSS_B, H1, H1, H1, DO_MACC)
1869 RVVCALL(OPIVV3, vmacc_vv_h, OP_SSS_H, H2, H2, H2, DO_MACC)
1870 RVVCALL(OPIVV3, vmacc_vv_w, OP_SSS_W, H4, H4, H4, DO_MACC)
1871 RVVCALL(OPIVV3, vmacc_vv_d, OP_SSS_D, H8, H8, H8, DO_MACC)
1872 RVVCALL(OPIVV3, vnmsac_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSAC)
1873 RVVCALL(OPIVV3, vnmsac_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSAC)
1874 RVVCALL(OPIVV3, vnmsac_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSAC)
1875 RVVCALL(OPIVV3, vnmsac_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSAC)
1876 RVVCALL(OPIVV3, vmadd_vv_b, OP_SSS_B, H1, H1, H1, DO_MADD)
1877 RVVCALL(OPIVV3, vmadd_vv_h, OP_SSS_H, H2, H2, H2, DO_MADD)
1878 RVVCALL(OPIVV3, vmadd_vv_w, OP_SSS_W, H4, H4, H4, DO_MADD)
1879 RVVCALL(OPIVV3, vmadd_vv_d, OP_SSS_D, H8, H8, H8, DO_MADD)
1880 RVVCALL(OPIVV3, vnmsub_vv_b, OP_SSS_B, H1, H1, H1, DO_NMSUB)
1881 RVVCALL(OPIVV3, vnmsub_vv_h, OP_SSS_H, H2, H2, H2, DO_NMSUB)
1882 RVVCALL(OPIVV3, vnmsub_vv_w, OP_SSS_W, H4, H4, H4, DO_NMSUB)
1883 RVVCALL(OPIVV3, vnmsub_vv_d, OP_SSS_D, H8, H8, H8, DO_NMSUB)
1884 GEN_VEXT_VV(vmacc_vv_b, 1)
1885 GEN_VEXT_VV(vmacc_vv_h, 2)
1886 GEN_VEXT_VV(vmacc_vv_w, 4)
1887 GEN_VEXT_VV(vmacc_vv_d, 8)
1888 GEN_VEXT_VV(vnmsac_vv_b, 1)
1889 GEN_VEXT_VV(vnmsac_vv_h, 2)
1890 GEN_VEXT_VV(vnmsac_vv_w, 4)
1891 GEN_VEXT_VV(vnmsac_vv_d, 8)
1892 GEN_VEXT_VV(vmadd_vv_b, 1)
1893 GEN_VEXT_VV(vmadd_vv_h, 2)
1894 GEN_VEXT_VV(vmadd_vv_w, 4)
1895 GEN_VEXT_VV(vmadd_vv_d, 8)
1896 GEN_VEXT_VV(vnmsub_vv_b, 1)
1897 GEN_VEXT_VV(vnmsub_vv_h, 2)
1898 GEN_VEXT_VV(vnmsub_vv_w, 4)
1899 GEN_VEXT_VV(vnmsub_vv_d, 8)
1900
1901 #define OPIVX3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP) \
1902 static void do_##NAME(void *vd, target_long s1, void *vs2, int i) \
1903 { \
1904 TX2 s2 = *((T2 *)vs2 + HS2(i)); \
1905 TD d = *((TD *)vd + HD(i)); \
1906 *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d); \
1907 }
1908
1909 RVVCALL(OPIVX3, vmacc_vx_b, OP_SSS_B, H1, H1, DO_MACC)
1910 RVVCALL(OPIVX3, vmacc_vx_h, OP_SSS_H, H2, H2, DO_MACC)
1911 RVVCALL(OPIVX3, vmacc_vx_w, OP_SSS_W, H4, H4, DO_MACC)
1912 RVVCALL(OPIVX3, vmacc_vx_d, OP_SSS_D, H8, H8, DO_MACC)
1913 RVVCALL(OPIVX3, vnmsac_vx_b, OP_SSS_B, H1, H1, DO_NMSAC)
1914 RVVCALL(OPIVX3, vnmsac_vx_h, OP_SSS_H, H2, H2, DO_NMSAC)
1915 RVVCALL(OPIVX3, vnmsac_vx_w, OP_SSS_W, H4, H4, DO_NMSAC)
1916 RVVCALL(OPIVX3, vnmsac_vx_d, OP_SSS_D, H8, H8, DO_NMSAC)
1917 RVVCALL(OPIVX3, vmadd_vx_b, OP_SSS_B, H1, H1, DO_MADD)
1918 RVVCALL(OPIVX3, vmadd_vx_h, OP_SSS_H, H2, H2, DO_MADD)
1919 RVVCALL(OPIVX3, vmadd_vx_w, OP_SSS_W, H4, H4, DO_MADD)
1920 RVVCALL(OPIVX3, vmadd_vx_d, OP_SSS_D, H8, H8, DO_MADD)
1921 RVVCALL(OPIVX3, vnmsub_vx_b, OP_SSS_B, H1, H1, DO_NMSUB)
1922 RVVCALL(OPIVX3, vnmsub_vx_h, OP_SSS_H, H2, H2, DO_NMSUB)
1923 RVVCALL(OPIVX3, vnmsub_vx_w, OP_SSS_W, H4, H4, DO_NMSUB)
1924 RVVCALL(OPIVX3, vnmsub_vx_d, OP_SSS_D, H8, H8, DO_NMSUB)
1925 GEN_VEXT_VX(vmacc_vx_b, 1)
1926 GEN_VEXT_VX(vmacc_vx_h, 2)
1927 GEN_VEXT_VX(vmacc_vx_w, 4)
1928 GEN_VEXT_VX(vmacc_vx_d, 8)
1929 GEN_VEXT_VX(vnmsac_vx_b, 1)
1930 GEN_VEXT_VX(vnmsac_vx_h, 2)
1931 GEN_VEXT_VX(vnmsac_vx_w, 4)
1932 GEN_VEXT_VX(vnmsac_vx_d, 8)
1933 GEN_VEXT_VX(vmadd_vx_b, 1)
1934 GEN_VEXT_VX(vmadd_vx_h, 2)
1935 GEN_VEXT_VX(vmadd_vx_w, 4)
1936 GEN_VEXT_VX(vmadd_vx_d, 8)
1937 GEN_VEXT_VX(vnmsub_vx_b, 1)
1938 GEN_VEXT_VX(vnmsub_vx_h, 2)
1939 GEN_VEXT_VX(vnmsub_vx_w, 4)
1940 GEN_VEXT_VX(vnmsub_vx_d, 8)
1941
1942 /* Vector Widening Integer Multiply-Add Instructions */
1943 RVVCALL(OPIVV3, vwmaccu_vv_b, WOP_UUU_B, H2, H1, H1, DO_MACC)
1944 RVVCALL(OPIVV3, vwmaccu_vv_h, WOP_UUU_H, H4, H2, H2, DO_MACC)
1945 RVVCALL(OPIVV3, vwmaccu_vv_w, WOP_UUU_W, H8, H4, H4, DO_MACC)
1946 RVVCALL(OPIVV3, vwmacc_vv_b, WOP_SSS_B, H2, H1, H1, DO_MACC)
1947 RVVCALL(OPIVV3, vwmacc_vv_h, WOP_SSS_H, H4, H2, H2, DO_MACC)
1948 RVVCALL(OPIVV3, vwmacc_vv_w, WOP_SSS_W, H8, H4, H4, DO_MACC)
1949 RVVCALL(OPIVV3, vwmaccsu_vv_b, WOP_SSU_B, H2, H1, H1, DO_MACC)
1950 RVVCALL(OPIVV3, vwmaccsu_vv_h, WOP_SSU_H, H4, H2, H2, DO_MACC)
1951 RVVCALL(OPIVV3, vwmaccsu_vv_w, WOP_SSU_W, H8, H4, H4, DO_MACC)
1952 GEN_VEXT_VV(vwmaccu_vv_b, 2)
1953 GEN_VEXT_VV(vwmaccu_vv_h, 4)
1954 GEN_VEXT_VV(vwmaccu_vv_w, 8)
1955 GEN_VEXT_VV(vwmacc_vv_b, 2)
1956 GEN_VEXT_VV(vwmacc_vv_h, 4)
1957 GEN_VEXT_VV(vwmacc_vv_w, 8)
1958 GEN_VEXT_VV(vwmaccsu_vv_b, 2)
1959 GEN_VEXT_VV(vwmaccsu_vv_h, 4)
1960 GEN_VEXT_VV(vwmaccsu_vv_w, 8)
1961
1962 RVVCALL(OPIVX3, vwmaccu_vx_b, WOP_UUU_B, H2, H1, DO_MACC)
1963 RVVCALL(OPIVX3, vwmaccu_vx_h, WOP_UUU_H, H4, H2, DO_MACC)
1964 RVVCALL(OPIVX3, vwmaccu_vx_w, WOP_UUU_W, H8, H4, DO_MACC)
1965 RVVCALL(OPIVX3, vwmacc_vx_b, WOP_SSS_B, H2, H1, DO_MACC)
1966 RVVCALL(OPIVX3, vwmacc_vx_h, WOP_SSS_H, H4, H2, DO_MACC)
1967 RVVCALL(OPIVX3, vwmacc_vx_w, WOP_SSS_W, H8, H4, DO_MACC)
1968 RVVCALL(OPIVX3, vwmaccsu_vx_b, WOP_SSU_B, H2, H1, DO_MACC)
1969 RVVCALL(OPIVX3, vwmaccsu_vx_h, WOP_SSU_H, H4, H2, DO_MACC)
1970 RVVCALL(OPIVX3, vwmaccsu_vx_w, WOP_SSU_W, H8, H4, DO_MACC)
1971 RVVCALL(OPIVX3, vwmaccus_vx_b, WOP_SUS_B, H2, H1, DO_MACC)
1972 RVVCALL(OPIVX3, vwmaccus_vx_h, WOP_SUS_H, H4, H2, DO_MACC)
1973 RVVCALL(OPIVX3, vwmaccus_vx_w, WOP_SUS_W, H8, H4, DO_MACC)
1974 GEN_VEXT_VX(vwmaccu_vx_b, 2)
1975 GEN_VEXT_VX(vwmaccu_vx_h, 4)
1976 GEN_VEXT_VX(vwmaccu_vx_w, 8)
1977 GEN_VEXT_VX(vwmacc_vx_b, 2)
1978 GEN_VEXT_VX(vwmacc_vx_h, 4)
1979 GEN_VEXT_VX(vwmacc_vx_w, 8)
1980 GEN_VEXT_VX(vwmaccsu_vx_b, 2)
1981 GEN_VEXT_VX(vwmaccsu_vx_h, 4)
1982 GEN_VEXT_VX(vwmaccsu_vx_w, 8)
1983 GEN_VEXT_VX(vwmaccus_vx_b, 2)
1984 GEN_VEXT_VX(vwmaccus_vx_h, 4)
1985 GEN_VEXT_VX(vwmaccus_vx_w, 8)
1986
1987 /* Vector Integer Merge and Move Instructions */
1988 #define GEN_VEXT_VMV_VV(NAME, ETYPE, H) \
1989 void HELPER(NAME)(void *vd, void *vs1, CPURISCVState *env, \
1990 uint32_t desc) \
1991 { \
1992 uint32_t vl = env->vl; \
1993 uint32_t esz = sizeof(ETYPE); \
1994 uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
1995 uint32_t vta = vext_vta(desc); \
1996 uint32_t i; \
1997 \
1998 for (i = env->vstart; i < vl; i++) { \
1999 ETYPE s1 = *((ETYPE *)vs1 + H(i)); \
2000 *((ETYPE *)vd + H(i)) = s1; \
2001 } \
2002 env->vstart = 0; \
2003 /* set tail elements to 1s */ \
2004 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
2005 }
2006
2007 GEN_VEXT_VMV_VV(vmv_v_v_b, int8_t, H1)
2008 GEN_VEXT_VMV_VV(vmv_v_v_h, int16_t, H2)
2009 GEN_VEXT_VMV_VV(vmv_v_v_w, int32_t, H4)
2010 GEN_VEXT_VMV_VV(vmv_v_v_d, int64_t, H8)
2011
2012 #define GEN_VEXT_VMV_VX(NAME, ETYPE, H) \
2013 void HELPER(NAME)(void *vd, uint64_t s1, CPURISCVState *env, \
2014 uint32_t desc) \
2015 { \
2016 uint32_t vl = env->vl; \
2017 uint32_t esz = sizeof(ETYPE); \
2018 uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
2019 uint32_t vta = vext_vta(desc); \
2020 uint32_t i; \
2021 \
2022 for (i = env->vstart; i < vl; i++) { \
2023 *((ETYPE *)vd + H(i)) = (ETYPE)s1; \
2024 } \
2025 env->vstart = 0; \
2026 /* set tail elements to 1s */ \
2027 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
2028 }
2029
2030 GEN_VEXT_VMV_VX(vmv_v_x_b, int8_t, H1)
2031 GEN_VEXT_VMV_VX(vmv_v_x_h, int16_t, H2)
2032 GEN_VEXT_VMV_VX(vmv_v_x_w, int32_t, H4)
2033 GEN_VEXT_VMV_VX(vmv_v_x_d, int64_t, H8)
2034
2035 #define GEN_VEXT_VMERGE_VV(NAME, ETYPE, H) \
2036 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \
2037 CPURISCVState *env, uint32_t desc) \
2038 { \
2039 uint32_t vl = env->vl; \
2040 uint32_t esz = sizeof(ETYPE); \
2041 uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
2042 uint32_t vta = vext_vta(desc); \
2043 uint32_t i; \
2044 \
2045 for (i = env->vstart; i < vl; i++) { \
2046 ETYPE *vt = (!vext_elem_mask(v0, i) ? vs2 : vs1); \
2047 *((ETYPE *)vd + H(i)) = *(vt + H(i)); \
2048 } \
2049 env->vstart = 0; \
2050 /* set tail elements to 1s */ \
2051 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
2052 }
2053
2054 GEN_VEXT_VMERGE_VV(vmerge_vvm_b, int8_t, H1)
2055 GEN_VEXT_VMERGE_VV(vmerge_vvm_h, int16_t, H2)
2056 GEN_VEXT_VMERGE_VV(vmerge_vvm_w, int32_t, H4)
2057 GEN_VEXT_VMERGE_VV(vmerge_vvm_d, int64_t, H8)
2058
2059 #define GEN_VEXT_VMERGE_VX(NAME, ETYPE, H) \
2060 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, \
2061 void *vs2, CPURISCVState *env, uint32_t desc) \
2062 { \
2063 uint32_t vl = env->vl; \
2064 uint32_t esz = sizeof(ETYPE); \
2065 uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
2066 uint32_t vta = vext_vta(desc); \
2067 uint32_t i; \
2068 \
2069 for (i = env->vstart; i < vl; i++) { \
2070 ETYPE s2 = *((ETYPE *)vs2 + H(i)); \
2071 ETYPE d = (!vext_elem_mask(v0, i) ? s2 : \
2072 (ETYPE)(target_long)s1); \
2073 *((ETYPE *)vd + H(i)) = d; \
2074 } \
2075 env->vstart = 0; \
2076 /* set tail elements to 1s */ \
2077 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
2078 }
2079
2080 GEN_VEXT_VMERGE_VX(vmerge_vxm_b, int8_t, H1)
2081 GEN_VEXT_VMERGE_VX(vmerge_vxm_h, int16_t, H2)
2082 GEN_VEXT_VMERGE_VX(vmerge_vxm_w, int32_t, H4)
2083 GEN_VEXT_VMERGE_VX(vmerge_vxm_d, int64_t, H8)
2084
2085 /*
2086 *** Vector Fixed-Point Arithmetic Instructions
2087 */
2088
2089 /* Vector Single-Width Saturating Add and Subtract */
2090
2091 /*
2092 * Since fixed-point instructions take a rounding mode and may saturate,
2093 * define the common macros for fixed-point arithmetic here.
2094 */
2095 typedef void opivv2_rm_fn(void *vd, void *vs1, void *vs2, int i,
2096 CPURISCVState *env, int vxrm);
2097
2098 #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP) \
2099 static inline void \
2100 do_##NAME(void *vd, void *vs1, void *vs2, int i, \
2101 CPURISCVState *env, int vxrm) \
2102 { \
2103 TX1 s1 = *((T1 *)vs1 + HS1(i)); \
2104 TX2 s2 = *((T2 *)vs2 + HS2(i)); \
2105 *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1); \
2106 }
2107
2108 static inline void
2109 vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
2110 CPURISCVState *env,
2111 uint32_t vl, uint32_t vm, int vxrm,
2112 opivv2_rm_fn *fn)
2113 {
2114 for (uint32_t i = env->vstart; i < vl; i++) {
2115 if (!vm && !vext_elem_mask(v0, i)) {
2116 continue;
2117 }
2118 fn(vd, vs1, vs2, i, env, vxrm);
2119 }
2120 env->vstart = 0;
2121 }
2122
2123 static inline void
2124 vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2,
2125 CPURISCVState *env,
2126 uint32_t desc,
2127 opivv2_rm_fn *fn, uint32_t esz)
2128 {
2129 uint32_t vm = vext_vm(desc);
2130 uint32_t vl = env->vl;
2131 uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2132 uint32_t vta = vext_vta(desc);
2133
2134 switch (env->vxrm) {
2135 case 0: /* rnu */
2136 vext_vv_rm_1(vd, v0, vs1, vs2,
2137 env, vl, vm, 0, fn);
2138 break;
2139 case 1: /* rne */
2140 vext_vv_rm_1(vd, v0, vs1, vs2,
2141 env, vl, vm, 1, fn);
2142 break;
2143 case 2: /* rdn */
2144 vext_vv_rm_1(vd, v0, vs1, vs2,
2145 env, vl, vm, 2, fn);
2146 break;
2147 default: /* rod */
2148 vext_vv_rm_1(vd, v0, vs1, vs2,
2149 env, vl, vm, 3, fn);
2150 break;
2151 }
2152 /* set tail elements to 1s */
2153 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2154 }
2155
2156 /* generate helpers for fixed point instructions with OPIVV format */
2157 #define GEN_VEXT_VV_RM(NAME, ESZ) \
2158 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \
2159 CPURISCVState *env, uint32_t desc) \
2160 { \
2161 vext_vv_rm_2(vd, v0, vs1, vs2, env, desc, \
2162 do_##NAME, ESZ); \
2163 }
2164
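/*
 * For example (rough expansion, for orientation only): the pair
 *     RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
 *     GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
 * below produces a per-element do_vsaddu_vv_b() wrapping saddu8(), plus
 * a helper_vsaddu_vv_b() that forwards to vext_vv_rm_2() with esz = 1.
 */
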
2165 static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2166 {
2167 uint8_t res = a + b;
2168 if (res < a) {
2169 res = UINT8_MAX;
2170 env->vxsat = 0x1;
2171 }
2172 return res;
2173 }
2174
2175 static inline uint16_t saddu16(CPURISCVState *env, int vxrm, uint16_t a,
2176 uint16_t b)
2177 {
2178 uint16_t res = a + b;
2179 if (res < a) {
2180 res = UINT16_MAX;
2181 env->vxsat = 0x1;
2182 }
2183 return res;
2184 }
2185
2186 static inline uint32_t saddu32(CPURISCVState *env, int vxrm, uint32_t a,
2187 uint32_t b)
2188 {
2189 uint32_t res = a + b;
2190 if (res < a) {
2191 res = UINT32_MAX;
2192 env->vxsat = 0x1;
2193 }
2194 return res;
2195 }
2196
2197 static inline uint64_t saddu64(CPURISCVState *env, int vxrm, uint64_t a,
2198 uint64_t b)
2199 {
2200 uint64_t res = a + b;
2201 if (res < a) {
2202 res = UINT64_MAX;
2203 env->vxsat = 0x1;
2204 }
2205 return res;
2206 }
2207
2208 RVVCALL(OPIVV2_RM, vsaddu_vv_b, OP_UUU_B, H1, H1, H1, saddu8)
2209 RVVCALL(OPIVV2_RM, vsaddu_vv_h, OP_UUU_H, H2, H2, H2, saddu16)
2210 RVVCALL(OPIVV2_RM, vsaddu_vv_w, OP_UUU_W, H4, H4, H4, saddu32)
2211 RVVCALL(OPIVV2_RM, vsaddu_vv_d, OP_UUU_D, H8, H8, H8, saddu64)
2212 GEN_VEXT_VV_RM(vsaddu_vv_b, 1)
2213 GEN_VEXT_VV_RM(vsaddu_vv_h, 2)
2214 GEN_VEXT_VV_RM(vsaddu_vv_w, 4)
2215 GEN_VEXT_VV_RM(vsaddu_vv_d, 8)
2216
2217 typedef void opivx2_rm_fn(void *vd, target_long s1, void *vs2, int i,
2218 CPURISCVState *env, int vxrm);
2219
2220 #define OPIVX2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP) \
2221 static inline void \
2222 do_##NAME(void *vd, target_long s1, void *vs2, int i, \
2223 CPURISCVState *env, int vxrm) \
2224 { \
2225 TX2 s2 = *((T2 *)vs2 + HS2(i)); \
2226 *((TD *)vd + HD(i)) = OP(env, vxrm, s2, (TX1)(T1)s1); \
2227 }
2228
2229 static inline void
2230 vext_vx_rm_1(void *vd, void *v0, target_long s1, void *vs2,
2231 CPURISCVState *env,
2232 uint32_t vl, uint32_t vm, int vxrm,
2233 opivx2_rm_fn *fn)
2234 {
2235 for (uint32_t i = env->vstart; i < vl; i++) {
2236 if (!vm && !vext_elem_mask(v0, i)) {
2237 continue;
2238 }
2239 fn(vd, s1, vs2, i, env, vxrm);
2240 }
2241 env->vstart = 0;
2242 }
2243
2244 static inline void
2245 vext_vx_rm_2(void *vd, void *v0, target_long s1, void *vs2,
2246 CPURISCVState *env,
2247 uint32_t desc,
2248 opivx2_rm_fn *fn, uint32_t esz)
2249 {
2250 uint32_t vm = vext_vm(desc);
2251 uint32_t vl = env->vl;
2252 uint32_t total_elems = vext_get_total_elems(env, desc, esz);
2253 uint32_t vta = vext_vta(desc);
2254
2255 switch (env->vxrm) {
2256 case 0: /* rnu */
2257 vext_vx_rm_1(vd, v0, s1, vs2,
2258 env, vl, vm, 0, fn);
2259 break;
2260 case 1: /* rne */
2261 vext_vx_rm_1(vd, v0, s1, vs2,
2262 env, vl, vm, 1, fn);
2263 break;
2264 case 2: /* rdn */
2265 vext_vx_rm_1(vd, v0, s1, vs2,
2266 env, vl, vm, 2, fn);
2267 break;
2268 default: /* rod */
2269 vext_vx_rm_1(vd, v0, s1, vs2,
2270 env, vl, vm, 3, fn);
2271 break;
2272 }
2273 /* set tail elements to 1s */
2274 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz);
2275 }
2276
2277 /* generate helpers for fixed point instructions with OPIVX format */
2278 #define GEN_VEXT_VX_RM(NAME, ESZ) \
2279 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, \
2280 void *vs2, CPURISCVState *env, uint32_t desc) \
2281 { \
2282 vext_vx_rm_2(vd, v0, s1, vs2, env, desc, \
2283 do_##NAME, ESZ); \
2284 }
2285
2286 RVVCALL(OPIVX2_RM, vsaddu_vx_b, OP_UUU_B, H1, H1, saddu8)
2287 RVVCALL(OPIVX2_RM, vsaddu_vx_h, OP_UUU_H, H2, H2, saddu16)
2288 RVVCALL(OPIVX2_RM, vsaddu_vx_w, OP_UUU_W, H4, H4, saddu32)
2289 RVVCALL(OPIVX2_RM, vsaddu_vx_d, OP_UUU_D, H8, H8, saddu64)
2290 GEN_VEXT_VX_RM(vsaddu_vx_b, 1)
2291 GEN_VEXT_VX_RM(vsaddu_vx_h, 2)
2292 GEN_VEXT_VX_RM(vsaddu_vx_w, 4)
2293 GEN_VEXT_VX_RM(vsaddu_vx_d, 8)
2294
2295 static inline int8_t sadd8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2296 {
2297 int8_t res = a + b;
2298 if ((res ^ a) & (res ^ b) & INT8_MIN) {
2299 res = a > 0 ? INT8_MAX : INT8_MIN;
2300 env->vxsat = 0x1;
2301 }
2302 return res;
2303 }
2304
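/*
 * Illustrative only (check_sadd8_saturation() is a hypothetical name;
 * it needs a valid env because saturation sets vxsat): signed overflow
 * occurred iff the sign of the result differs from the sign of both
 * operands, which is what the (res ^ a) & (res ^ b) test above checks.
 */
static inline void check_sadd8_saturation(CPURISCVState *env)
{
    assert(sadd8(env, 0, 100, 100) == INT8_MAX);    /* 200 saturates  */
    assert(sadd8(env, 0, 100, -100) == 0);          /* no overflow    */
    assert(sadd8(env, 0, -100, -100) == INT8_MIN);  /* -200 saturates */
}
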
2305 static inline int16_t sadd16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2306 {
2307 int16_t res = a + b;
2308 if ((res ^ a) & (res ^ b) & INT16_MIN) {
2309 res = a > 0 ? INT16_MAX : INT16_MIN;
2310 env->vxsat = 0x1;
2311 }
2312 return res;
2313 }
2314
2315 static inline int32_t sadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2316 {
2317 int32_t res = a + b;
2318 if ((res ^ a) & (res ^ b) & INT32_MIN) {
2319 res = a > 0 ? INT32_MAX : INT32_MIN;
2320 env->vxsat = 0x1;
2321 }
2322 return res;
2323 }
2324
2325 static inline int64_t sadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2326 {
2327 int64_t res = a + b;
2328 if ((res ^ a) & (res ^ b) & INT64_MIN) {
2329 res = a > 0 ? INT64_MAX : INT64_MIN;
2330 env->vxsat = 0x1;
2331 }
2332 return res;
2333 }
2334
2335 RVVCALL(OPIVV2_RM, vsadd_vv_b, OP_SSS_B, H1, H1, H1, sadd8)
2336 RVVCALL(OPIVV2_RM, vsadd_vv_h, OP_SSS_H, H2, H2, H2, sadd16)
2337 RVVCALL(OPIVV2_RM, vsadd_vv_w, OP_SSS_W, H4, H4, H4, sadd32)
2338 RVVCALL(OPIVV2_RM, vsadd_vv_d, OP_SSS_D, H8, H8, H8, sadd64)
2339 GEN_VEXT_VV_RM(vsadd_vv_b, 1)
2340 GEN_VEXT_VV_RM(vsadd_vv_h, 2)
2341 GEN_VEXT_VV_RM(vsadd_vv_w, 4)
2342 GEN_VEXT_VV_RM(vsadd_vv_d, 8)
2343
2344 RVVCALL(OPIVX2_RM, vsadd_vx_b, OP_SSS_B, H1, H1, sadd8)
2345 RVVCALL(OPIVX2_RM, vsadd_vx_h, OP_SSS_H, H2, H2, sadd16)
2346 RVVCALL(OPIVX2_RM, vsadd_vx_w, OP_SSS_W, H4, H4, sadd32)
2347 RVVCALL(OPIVX2_RM, vsadd_vx_d, OP_SSS_D, H8, H8, sadd64)
2348 GEN_VEXT_VX_RM(vsadd_vx_b, 1)
2349 GEN_VEXT_VX_RM(vsadd_vx_h, 2)
2350 GEN_VEXT_VX_RM(vsadd_vx_w, 4)
2351 GEN_VEXT_VX_RM(vsadd_vx_d, 8)
2352
2353 static inline uint8_t ssubu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2354 {
2355 uint8_t res = a - b;
2356 if (res > a) {
2357 res = 0;
2358 env->vxsat = 0x1;
2359 }
2360 return res;
2361 }
2362
2363 static inline uint16_t ssubu16(CPURISCVState *env, int vxrm, uint16_t a,
2364 uint16_t b)
2365 {
2366 uint16_t res = a - b;
2367 if (res > a) {
2368 res = 0;
2369 env->vxsat = 0x1;
2370 }
2371 return res;
2372 }
2373
2374 static inline uint32_t ssubu32(CPURISCVState *env, int vxrm, uint32_t a,
2375 uint32_t b)
2376 {
2377 uint32_t res = a - b;
2378 if (res > a) {
2379 res = 0;
2380 env->vxsat = 0x1;
2381 }
2382 return res;
2383 }
2384
2385 static inline uint64_t ssubu64(CPURISCVState *env, int vxrm, uint64_t a,
2386 uint64_t b)
2387 {
2388 uint64_t res = a - b;
2389 if (res > a) {
2390 res = 0;
2391 env->vxsat = 0x1;
2392 }
2393 return res;
2394 }
2395
2396 RVVCALL(OPIVV2_RM, vssubu_vv_b, OP_UUU_B, H1, H1, H1, ssubu8)
2397 RVVCALL(OPIVV2_RM, vssubu_vv_h, OP_UUU_H, H2, H2, H2, ssubu16)
2398 RVVCALL(OPIVV2_RM, vssubu_vv_w, OP_UUU_W, H4, H4, H4, ssubu32)
2399 RVVCALL(OPIVV2_RM, vssubu_vv_d, OP_UUU_D, H8, H8, H8, ssubu64)
2400 GEN_VEXT_VV_RM(vssubu_vv_b, 1)
2401 GEN_VEXT_VV_RM(vssubu_vv_h, 2)
2402 GEN_VEXT_VV_RM(vssubu_vv_w, 4)
2403 GEN_VEXT_VV_RM(vssubu_vv_d, 8)
2404
2405 RVVCALL(OPIVX2_RM, vssubu_vx_b, OP_UUU_B, H1, H1, ssubu8)
2406 RVVCALL(OPIVX2_RM, vssubu_vx_h, OP_UUU_H, H2, H2, ssubu16)
2407 RVVCALL(OPIVX2_RM, vssubu_vx_w, OP_UUU_W, H4, H4, ssubu32)
2408 RVVCALL(OPIVX2_RM, vssubu_vx_d, OP_UUU_D, H8, H8, ssubu64)
2409 GEN_VEXT_VX_RM(vssubu_vx_b, 1)
2410 GEN_VEXT_VX_RM(vssubu_vx_h, 2)
2411 GEN_VEXT_VX_RM(vssubu_vx_w, 4)
2412 GEN_VEXT_VX_RM(vssubu_vx_d, 8)
2413
2414 static inline int8_t ssub8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2415 {
2416 int8_t res = a - b;
2417 if ((res ^ a) & (a ^ b) & INT8_MIN) {
2418 res = a >= 0 ? INT8_MAX : INT8_MIN;
2419 env->vxsat = 0x1;
2420 }
2421 return res;
2422 }
2423
2424 static inline int16_t ssub16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2425 {
2426 int16_t res = a - b;
2427 if ((res ^ a) & (a ^ b) & INT16_MIN) {
2428 res = a >= 0 ? INT16_MAX : INT16_MIN;
2429 env->vxsat = 0x1;
2430 }
2431 return res;
2432 }
2433
2434 static inline int32_t ssub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2435 {
2436 int32_t res = a - b;
2437 if ((res ^ a) & (a ^ b) & INT32_MIN) {
2438 res = a >= 0 ? INT32_MAX : INT32_MIN;
2439 env->vxsat = 0x1;
2440 }
2441 return res;
2442 }
2443
2444 static inline int64_t ssub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2445 {
2446 int64_t res = a - b;
2447 if ((res ^ a) & (a ^ b) & INT64_MIN) {
2448 res = a >= 0 ? INT64_MAX : INT64_MIN;
2449 env->vxsat = 0x1;
2450 }
2451 return res;
2452 }
2453
2454 RVVCALL(OPIVV2_RM, vssub_vv_b, OP_SSS_B, H1, H1, H1, ssub8)
2455 RVVCALL(OPIVV2_RM, vssub_vv_h, OP_SSS_H, H2, H2, H2, ssub16)
2456 RVVCALL(OPIVV2_RM, vssub_vv_w, OP_SSS_W, H4, H4, H4, ssub32)
2457 RVVCALL(OPIVV2_RM, vssub_vv_d, OP_SSS_D, H8, H8, H8, ssub64)
2458 GEN_VEXT_VV_RM(vssub_vv_b, 1)
2459 GEN_VEXT_VV_RM(vssub_vv_h, 2)
2460 GEN_VEXT_VV_RM(vssub_vv_w, 4)
2461 GEN_VEXT_VV_RM(vssub_vv_d, 8)
2462
2463 RVVCALL(OPIVX2_RM, vssub_vx_b, OP_SSS_B, H1, H1, ssub8)
2464 RVVCALL(OPIVX2_RM, vssub_vx_h, OP_SSS_H, H2, H2, ssub16)
2465 RVVCALL(OPIVX2_RM, vssub_vx_w, OP_SSS_W, H4, H4, ssub32)
2466 RVVCALL(OPIVX2_RM, vssub_vx_d, OP_SSS_D, H8, H8, ssub64)
2467 GEN_VEXT_VX_RM(vssub_vx_b, 1)
2468 GEN_VEXT_VX_RM(vssub_vx_h, 2)
2469 GEN_VEXT_VX_RM(vssub_vx_w, 4)
2470 GEN_VEXT_VX_RM(vssub_vx_d, 8)
2471
2472 /* Vector Single-Width Averaging Add and Subtract */
2473 static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
2474 {
2475 uint8_t d = extract64(v, shift, 1);
2476 uint8_t d1;
2477 uint64_t D1, D2;
2478
2479 if (shift == 0 || shift > 64) {
2480 return 0;
2481 }
2482
2483 d1 = extract64(v, shift - 1, 1);
2484 D1 = extract64(v, 0, shift);
2485 if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
2486 return d1;
2487 } else if (vxrm == 1) { /* round-to-nearest-even */
2488 if (shift > 1) {
2489 D2 = extract64(v, 0, shift - 1);
2490 return d1 & ((D2 != 0) | d);
2491 } else {
2492 return d1 & d;
2493 }
2494 } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
2495 return !d & (D1 != 0);
2496 }
2497 return 0; /* round-down (truncate) */
2498 }
2499
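/*
 * Illustrative only (check_get_round_modes() is a hypothetical name):
 * shifting v = 0b1011 (i.e. 2.75 when the low two bits are fractional)
 * right by 2 under each vxrm value.
 */
static inline void check_get_round_modes(void)
{
    assert(((11 >> 2) + get_round(0, 11, 2)) == 3);  /* rnu: 2.75 -> 3 */
    assert(((11 >> 2) + get_round(1, 11, 2)) == 3);  /* rne: 2.75 -> 3 */
    assert(((11 >> 2) + get_round(2, 11, 2)) == 2);  /* rdn: 2.75 -> 2 */
    assert(((11 >> 2) + get_round(3, 11, 2)) == 3);  /* rod: 2.75 -> 3 */
}
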
2500 static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2501 {
2502 int64_t res = (int64_t)a + b;
2503 uint8_t round = get_round(vxrm, res, 1);
2504
2505 return (res >> 1) + round;
2506 }
2507
2508 static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2509 {
2510 int64_t res = a + b;
2511 uint8_t round = get_round(vxrm, res, 1);
2512 int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
2513
2514 /* With signed overflow, bit 64 is inverse of bit 63. */
2515 return ((res >> 1) ^ over) + round;
2516 }
2517
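/*
 * Illustrative 8-bit sketch of the trick used in aadd64() above
 * (aadd8_sketch() is a hypothetical name; rounding is omitted, so this
 * is the average rounded toward minus infinity): when a + b overflows,
 * the true bit 8 of the sum is the inverse of bit 7 of the wrapped
 * result, so XOR-ing the overflow mask back in after the shift
 * reconstructs the correct average, e.g. aadd8_sketch(100, 100) == 100.
 */
static inline int8_t aadd8_sketch(int8_t a, int8_t b)
{
    int8_t res = a + b;                              /* may wrap */
    int8_t over = (res ^ a) & (res ^ b) & INT8_MIN;  /* sign-bit mask on overflow */

    return (res >> 1) ^ over;
}
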
2518 RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
2519 RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
2520 RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
2521 RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
2522 GEN_VEXT_VV_RM(vaadd_vv_b, 1)
2523 GEN_VEXT_VV_RM(vaadd_vv_h, 2)
2524 GEN_VEXT_VV_RM(vaadd_vv_w, 4)
2525 GEN_VEXT_VV_RM(vaadd_vv_d, 8)
2526
2527 RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
2528 RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
2529 RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
2530 RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
2531 GEN_VEXT_VX_RM(vaadd_vx_b, 1)
2532 GEN_VEXT_VX_RM(vaadd_vx_h, 2)
2533 GEN_VEXT_VX_RM(vaadd_vx_w, 4)
2534 GEN_VEXT_VX_RM(vaadd_vx_d, 8)
2535
2536 static inline uint32_t aaddu32(CPURISCVState *env, int vxrm,
2537 uint32_t a, uint32_t b)
2538 {
2539 uint64_t res = (uint64_t)a + b;
2540 uint8_t round = get_round(vxrm, res, 1);
2541
2542 return (res >> 1) + round;
2543 }
2544
2545 static inline uint64_t aaddu64(CPURISCVState *env, int vxrm,
2546 uint64_t a, uint64_t b)
2547 {
2548 uint64_t res = a + b;
2549 uint8_t round = get_round(vxrm, res, 1);
2550 uint64_t over = (uint64_t)(res < a) << 63;
2551
2552 return ((res >> 1) | over) + round;
2553 }
2554
2555 RVVCALL(OPIVV2_RM, vaaddu_vv_b, OP_UUU_B, H1, H1, H1, aaddu32)
2556 RVVCALL(OPIVV2_RM, vaaddu_vv_h, OP_UUU_H, H2, H2, H2, aaddu32)
2557 RVVCALL(OPIVV2_RM, vaaddu_vv_w, OP_UUU_W, H4, H4, H4, aaddu32)
2558 RVVCALL(OPIVV2_RM, vaaddu_vv_d, OP_UUU_D, H8, H8, H8, aaddu64)
2559 GEN_VEXT_VV_RM(vaaddu_vv_b, 1)
2560 GEN_VEXT_VV_RM(vaaddu_vv_h, 2)
2561 GEN_VEXT_VV_RM(vaaddu_vv_w, 4)
2562 GEN_VEXT_VV_RM(vaaddu_vv_d, 8)
2563
2564 RVVCALL(OPIVX2_RM, vaaddu_vx_b, OP_UUU_B, H1, H1, aaddu32)
2565 RVVCALL(OPIVX2_RM, vaaddu_vx_h, OP_UUU_H, H2, H2, aaddu32)
2566 RVVCALL(OPIVX2_RM, vaaddu_vx_w, OP_UUU_W, H4, H4, aaddu32)
2567 RVVCALL(OPIVX2_RM, vaaddu_vx_d, OP_UUU_D, H8, H8, aaddu64)
2568 GEN_VEXT_VX_RM(vaaddu_vx_b, 1)
2569 GEN_VEXT_VX_RM(vaaddu_vx_h, 2)
2570 GEN_VEXT_VX_RM(vaaddu_vx_w, 4)
2571 GEN_VEXT_VX_RM(vaaddu_vx_d, 8)
2572
2573 static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2574 {
2575 int64_t res = (int64_t)a - b;
2576 uint8_t round = get_round(vxrm, res, 1);
2577
2578 return (res >> 1) + round;
2579 }
2580
2581 static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2582 {
2583 int64_t res = (int64_t)a - b;
2584 uint8_t round = get_round(vxrm, res, 1);
2585 int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
2586
2587 /* With signed overflow, bit 64 is inverse of bit 63. */
2588 return ((res >> 1) ^ over) + round;
2589 }
2590
2591 RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
2592 RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
2593 RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
2594 RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
2595 GEN_VEXT_VV_RM(vasub_vv_b, 1)
2596 GEN_VEXT_VV_RM(vasub_vv_h, 2)
2597 GEN_VEXT_VV_RM(vasub_vv_w, 4)
2598 GEN_VEXT_VV_RM(vasub_vv_d, 8)
2599
2600 RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
2601 RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
2602 RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
2603 RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
2604 GEN_VEXT_VX_RM(vasub_vx_b, 1)
2605 GEN_VEXT_VX_RM(vasub_vx_h, 2)
2606 GEN_VEXT_VX_RM(vasub_vx_w, 4)
2607 GEN_VEXT_VX_RM(vasub_vx_d, 8)
2608
2609 static inline uint32_t asubu32(CPURISCVState *env, int vxrm,
2610 uint32_t a, uint32_t b)
2611 {
2612 int64_t res = (int64_t)a - b;
2613 uint8_t round = get_round(vxrm, res, 1);
2614
2615 return (res >> 1) + round;
2616 }
2617
2618 static inline uint64_t asubu64(CPURISCVState *env, int vxrm,
2619 uint64_t a, uint64_t b)
2620 {
2621 uint64_t res = (uint64_t)a - b;
2622 uint8_t round = get_round(vxrm, res, 1);
2623 uint64_t over = (uint64_t)(res > a) << 63;
2624
2625 return ((res >> 1) | over) + round;
2626 }
2627
2628 RVVCALL(OPIVV2_RM, vasubu_vv_b, OP_UUU_B, H1, H1, H1, asubu32)
2629 RVVCALL(OPIVV2_RM, vasubu_vv_h, OP_UUU_H, H2, H2, H2, asubu32)
2630 RVVCALL(OPIVV2_RM, vasubu_vv_w, OP_UUU_W, H4, H4, H4, asubu32)
2631 RVVCALL(OPIVV2_RM, vasubu_vv_d, OP_UUU_D, H8, H8, H8, asubu64)
2632 GEN_VEXT_VV_RM(vasubu_vv_b, 1)
2633 GEN_VEXT_VV_RM(vasubu_vv_h, 2)
2634 GEN_VEXT_VV_RM(vasubu_vv_w, 4)
2635 GEN_VEXT_VV_RM(vasubu_vv_d, 8)
2636
2637 RVVCALL(OPIVX2_RM, vasubu_vx_b, OP_UUU_B, H1, H1, asubu32)
2638 RVVCALL(OPIVX2_RM, vasubu_vx_h, OP_UUU_H, H2, H2, asubu32)
2639 RVVCALL(OPIVX2_RM, vasubu_vx_w, OP_UUU_W, H4, H4, asubu32)
2640 RVVCALL(OPIVX2_RM, vasubu_vx_d, OP_UUU_D, H8, H8, asubu64)
2641 GEN_VEXT_VX_RM(vasubu_vx_b, 1)
2642 GEN_VEXT_VX_RM(vasubu_vx_h, 2)
2643 GEN_VEXT_VX_RM(vasubu_vx_w, 4)
2644 GEN_VEXT_VX_RM(vasubu_vx_d, 8)
2645
2646 /* Vector Single-Width Fractional Multiply with Rounding and Saturation */
2647 static inline int8_t vsmul8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2648 {
2649 uint8_t round;
2650 int16_t res;
2651
2652 res = (int16_t)a * (int16_t)b;
2653 round = get_round(vxrm, res, 7);
2654 res = (res >> 7) + round;
2655
2656 if (res > INT8_MAX) {
2657 env->vxsat = 0x1;
2658 return INT8_MAX;
2659 } else if (res < INT8_MIN) {
2660 env->vxsat = 0x1;
2661 return INT8_MIN;
2662 } else {
2663 return res;
2664 }
2665 }
2666
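/*
 * Illustrative only (check_vsmul8_q7() is a hypothetical name): the
 * vsmul* helpers implement a signed fixed-point multiply where inputs
 * and result carry an implicit scale of 2^-(SEW-1).  For SEW = 8 that
 * is Q7 format, so -0.5 * -0.5 = +0.25:
 */
static inline void check_vsmul8_q7(CPURISCVState *env)
{
    int8_t half_neg = -64;   /* -0.5 in Q7 */

    /* rdn (truncating) rounding: (-64 * -64) >> 7 = 32 = +0.25 in Q7 */
    assert(vsmul8(env, 2, half_neg, half_neg) == 32);
}
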
2667 static int16_t vsmul16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2668 {
2669 uint8_t round;
2670 int32_t res;
2671
2672 res = (int32_t)a * (int32_t)b;
2673 round = get_round(vxrm, res, 15);
2674 res = (res >> 15) + round;
2675
2676 if (res > INT16_MAX) {
2677 env->vxsat = 0x1;
2678 return INT16_MAX;
2679 } else if (res < INT16_MIN) {
2680 env->vxsat = 0x1;
2681 return INT16_MIN;
2682 } else {
2683 return res;
2684 }
2685 }
2686
2687 static int32_t vsmul32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2688 {
2689 uint8_t round;
2690 int64_t res;
2691
2692 res = (int64_t)a * (int64_t)b;
2693 round = get_round(vxrm, res, 31);
2694 res = (res >> 31) + round;
2695
2696 if (res > INT32_MAX) {
2697 env->vxsat = 0x1;
2698 return INT32_MAX;
2699 } else if (res < INT32_MIN) {
2700 env->vxsat = 0x1;
2701 return INT32_MIN;
2702 } else {
2703 return res;
2704 }
2705 }
2706
2707 static int64_t vsmul64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2708 {
2709 uint8_t round;
2710 uint64_t hi_64, lo_64;
2711 int64_t res;
2712
2713 if (a == INT64_MIN && b == INT64_MIN) {
2714 env->vxsat = 1;
2715 return INT64_MAX;
2716 }
2717
2718 muls64(&lo_64, &hi_64, a, b);
2719 round = get_round(vxrm, lo_64, 63);
2720 /*
2721 * Cannot overflow, as there are always
2722 * 2 sign bits after multiply.
2723 */
2724 res = (hi_64 << 1) | (lo_64 >> 63);
2725 if (round) {
2726 if (res == INT64_MAX) {
2727 env->vxsat = 1;
2728 } else {
2729 res += 1;
2730 }
2731 }
2732 return res;
2733 }
2734
2735 RVVCALL(OPIVV2_RM, vsmul_vv_b, OP_SSS_B, H1, H1, H1, vsmul8)
2736 RVVCALL(OPIVV2_RM, vsmul_vv_h, OP_SSS_H, H2, H2, H2, vsmul16)
2737 RVVCALL(OPIVV2_RM, vsmul_vv_w, OP_SSS_W, H4, H4, H4, vsmul32)
2738 RVVCALL(OPIVV2_RM, vsmul_vv_d, OP_SSS_D, H8, H8, H8, vsmul64)
2739 GEN_VEXT_VV_RM(vsmul_vv_b, 1)
2740 GEN_VEXT_VV_RM(vsmul_vv_h, 2)
2741 GEN_VEXT_VV_RM(vsmul_vv_w, 4)
2742 GEN_VEXT_VV_RM(vsmul_vv_d, 8)
2743
2744 RVVCALL(OPIVX2_RM, vsmul_vx_b, OP_SSS_B, H1, H1, vsmul8)
2745 RVVCALL(OPIVX2_RM, vsmul_vx_h, OP_SSS_H, H2, H2, vsmul16)
2746 RVVCALL(OPIVX2_RM, vsmul_vx_w, OP_SSS_W, H4, H4, vsmul32)
2747 RVVCALL(OPIVX2_RM, vsmul_vx_d, OP_SSS_D, H8, H8, vsmul64)
2748 GEN_VEXT_VX_RM(vsmul_vx_b, 1)
2749 GEN_VEXT_VX_RM(vsmul_vx_h, 2)
2750 GEN_VEXT_VX_RM(vsmul_vx_w, 4)
2751 GEN_VEXT_VX_RM(vsmul_vx_d, 8)
2752
2753 /* Vector Single-Width Scaling Shift Instructions */
2754 static inline uint8_t
2755 vssrl8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
2756 {
2757 uint8_t round, shift = b & 0x7;
2758 uint8_t res;
2759
2760 round = get_round(vxrm, a, shift);
2761 res = (a >> shift) + round;
2762 return res;
2763 }
2764 static inline uint16_t
2765 vssrl16(CPURISCVState *env, int vxrm, uint16_t a, uint16_t b)
2766 {
2767 uint8_t round, shift = b & 0xf;
2768 uint16_t res;
2769
2770 round = get_round(vxrm, a, shift);
2771 res = (a >> shift) + round;
2772 return res;
2773 }
2774 static inline uint32_t
2775 vssrl32(CPURISCVState *env, int vxrm, uint32_t a, uint32_t b)
2776 {
2777 uint8_t round, shift = b & 0x1f;
2778 uint32_t res;
2779
2780 round = get_round(vxrm, a, shift);
2781 res = (a >> shift) + round;
2782 return res;
2783 }
2784 static inline uint64_t
2785 vssrl64(CPURISCVState *env, int vxrm, uint64_t a, uint64_t b)
2786 {
2787 uint8_t round, shift = b & 0x3f;
2788 uint64_t res;
2789
2790 round = get_round(vxrm, a, shift);
2791 res = (a >> shift) + round;
2792 return res;
2793 }
2794 RVVCALL(OPIVV2_RM, vssrl_vv_b, OP_UUU_B, H1, H1, H1, vssrl8)
2795 RVVCALL(OPIVV2_RM, vssrl_vv_h, OP_UUU_H, H2, H2, H2, vssrl16)
2796 RVVCALL(OPIVV2_RM, vssrl_vv_w, OP_UUU_W, H4, H4, H4, vssrl32)
2797 RVVCALL(OPIVV2_RM, vssrl_vv_d, OP_UUU_D, H8, H8, H8, vssrl64)
2798 GEN_VEXT_VV_RM(vssrl_vv_b, 1)
2799 GEN_VEXT_VV_RM(vssrl_vv_h, 2)
2800 GEN_VEXT_VV_RM(vssrl_vv_w, 4)
2801 GEN_VEXT_VV_RM(vssrl_vv_d, 8)
2802
2803 RVVCALL(OPIVX2_RM, vssrl_vx_b, OP_UUU_B, H1, H1, vssrl8)
2804 RVVCALL(OPIVX2_RM, vssrl_vx_h, OP_UUU_H, H2, H2, vssrl16)
2805 RVVCALL(OPIVX2_RM, vssrl_vx_w, OP_UUU_W, H4, H4, vssrl32)
2806 RVVCALL(OPIVX2_RM, vssrl_vx_d, OP_UUU_D, H8, H8, vssrl64)
2807 GEN_VEXT_VX_RM(vssrl_vx_b, 1)
2808 GEN_VEXT_VX_RM(vssrl_vx_h, 2)
2809 GEN_VEXT_VX_RM(vssrl_vx_w, 4)
2810 GEN_VEXT_VX_RM(vssrl_vx_d, 8)
2811
2812 static inline int8_t
2813 vssra8(CPURISCVState *env, int vxrm, int8_t a, int8_t b)
2814 {
2815 uint8_t round, shift = b & 0x7;
2816 int8_t res;
2817
2818 round = get_round(vxrm, a, shift);
2819 res = (a >> shift) + round;
2820 return res;
2821 }
2822 static inline int16_t
2823 vssra16(CPURISCVState *env, int vxrm, int16_t a, int16_t b)
2824 {
2825 uint8_t round, shift = b & 0xf;
2826 int16_t res;
2827
2828 round = get_round(vxrm, a, shift);
2829 res = (a >> shift) + round;
2830 return res;
2831 }
2832 static inline int32_t
2833 vssra32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
2834 {
2835 uint8_t round, shift = b & 0x1f;
2836 int32_t res;
2837
2838 round = get_round(vxrm, a, shift);
2839 res = (a >> shift) + round;
2840 return res;
2841 }
2842 static inline int64_t
2843 vssra64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
2844 {
2845 uint8_t round, shift = b & 0x3f;
2846 int64_t res;
2847
2848 round = get_round(vxrm, a, shift);
2849 res = (a >> shift) + round;
2850 return res;
2851 }
2852
2853 RVVCALL(OPIVV2_RM, vssra_vv_b, OP_SSS_B, H1, H1, H1, vssra8)
2854 RVVCALL(OPIVV2_RM, vssra_vv_h, OP_SSS_H, H2, H2, H2, vssra16)
2855 RVVCALL(OPIVV2_RM, vssra_vv_w, OP_SSS_W, H4, H4, H4, vssra32)
2856 RVVCALL(OPIVV2_RM, vssra_vv_d, OP_SSS_D, H8, H8, H8, vssra64)
2857 GEN_VEXT_VV_RM(vssra_vv_b, 1)
2858 GEN_VEXT_VV_RM(vssra_vv_h, 2)
2859 GEN_VEXT_VV_RM(vssra_vv_w, 4)
2860 GEN_VEXT_VV_RM(vssra_vv_d, 8)
2861
2862 RVVCALL(OPIVX2_RM, vssra_vx_b, OP_SSS_B, H1, H1, vssra8)
2863 RVVCALL(OPIVX2_RM, vssra_vx_h, OP_SSS_H, H2, H2, vssra16)
2864 RVVCALL(OPIVX2_RM, vssra_vx_w, OP_SSS_W, H4, H4, vssra32)
2865 RVVCALL(OPIVX2_RM, vssra_vx_d, OP_SSS_D, H8, H8, vssra64)
2866 GEN_VEXT_VX_RM(vssra_vx_b, 1)
2867 GEN_VEXT_VX_RM(vssra_vx_h, 2)
2868 GEN_VEXT_VX_RM(vssra_vx_w, 4)
2869 GEN_VEXT_VX_RM(vssra_vx_d, 8)
2870
2871 /* Vector Narrowing Fixed-Point Clip Instructions */
2872 static inline int8_t
2873 vnclip8(CPURISCVState *env, int vxrm, int16_t a, int8_t b)
2874 {
2875 uint8_t round, shift = b & 0xf;
2876 int16_t res;
2877
2878 round = get_round(vxrm, a, shift);
2879 res = (a >> shift) + round;
2880 if (res > INT8_MAX) {
2881 env->vxsat = 0x1;
2882 return INT8_MAX;
2883 } else if (res < INT8_MIN) {
2884 env->vxsat = 0x1;
2885 return INT8_MIN;
2886 } else {
2887 return res;
2888 }
2889 }
2890
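/*
 * Illustrative only (check_vnclip8_example() is a hypothetical name; it
 * needs a valid env because clipping sets vxsat): narrowing the 16-bit
 * value 0x0500 (1280) with a scaling shift of 4 gives 80, which fits in
 * int8_t, while a shift of 1 gives 640 and clips to INT8_MAX.
 */
static inline void check_vnclip8_example(CPURISCVState *env)
{
    assert(vnclip8(env, 2, 0x0500, 4) == 80);        /* 1280 >> 4     */
    assert(vnclip8(env, 2, 0x0500, 1) == INT8_MAX);  /* 640 saturates */
}
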
2891 static inline int16_t
2892 vnclip16(CPURISCVState *env, int vxrm, int32_t a, int16_t b)
2893 {
2894 uint8_t round, shift = b & 0x1f;
2895 int32_t res;
2896
2897 round = get_round(vxrm, a, shift);
2898 res = (a >> shift) + round;
2899 if (res > INT16_MAX) {
2900 env->vxsat = 0x1;
2901 return INT16_MAX;
2902 } else if (res < INT16_MIN) {
2903 env->vxsat = 0x1;
2904 return INT16_MIN;
2905 } else {
2906 return res;
2907 }
2908 }
2909
2910 static inline int32_t
2911 vnclip32(CPURISCVState *env, int vxrm, int64_t a, int32_t b)
2912 {
2913 uint8_t round, shift = b & 0x3f;
2914 int64_t res;
2915
2916 round = get_round(vxrm, a, shift);
2917 res = (a >> shift) + round;
2918 if (res > INT32_MAX) {
2919 env->vxsat = 0x1;
2920 return INT32_MAX;
2921 } else if (res < INT32_MIN) {
2922 env->vxsat = 0x1;
2923 return INT32_MIN;
2924 } else {
2925 return res;
2926 }
2927 }
2928
2929 RVVCALL(OPIVV2_RM, vnclip_wv_b, NOP_SSS_B, H1, H2, H1, vnclip8)
2930 RVVCALL(OPIVV2_RM, vnclip_wv_h, NOP_SSS_H, H2, H4, H2, vnclip16)
2931 RVVCALL(OPIVV2_RM, vnclip_wv_w, NOP_SSS_W, H4, H8, H4, vnclip32)
2932 GEN_VEXT_VV_RM(vnclip_wv_b, 1)
2933 GEN_VEXT_VV_RM(vnclip_wv_h, 2)
2934 GEN_VEXT_VV_RM(vnclip_wv_w, 4)
2935
2936 RVVCALL(OPIVX2_RM, vnclip_wx_b, NOP_SSS_B, H1, H2, vnclip8)
2937 RVVCALL(OPIVX2_RM, vnclip_wx_h, NOP_SSS_H, H2, H4, vnclip16)
2938 RVVCALL(OPIVX2_RM, vnclip_wx_w, NOP_SSS_W, H4, H8, vnclip32)
2939 GEN_VEXT_VX_RM(vnclip_wx_b, 1)
2940 GEN_VEXT_VX_RM(vnclip_wx_h, 2)
2941 GEN_VEXT_VX_RM(vnclip_wx_w, 4)
2942
2943 static inline uint8_t
2944 vnclipu8(CPURISCVState *env, int vxrm, uint16_t a, uint8_t b)
2945 {
2946 uint8_t round, shift = b & 0xf;
2947 uint16_t res;
2948
2949 round = get_round(vxrm, a, shift);
2950 res = (a >> shift) + round;
2951 if (res > UINT8_MAX) {
2952 env->vxsat = 0x1;
2953 return UINT8_MAX;
2954 } else {
2955 return res;
2956 }
2957 }
2958
2959 static inline uint16_t
2960 vnclipu16(CPURISCVState *env, int vxrm, uint32_t a, uint16_t b)
2961 {
2962 uint8_t round, shift = b & 0x1f;
2963 uint32_t res;
2964
2965 round = get_round(vxrm, a, shift);
2966 res = (a >> shift) + round;
2967 if (res > UINT16_MAX) {
2968 env->vxsat = 0x1;
2969 return UINT16_MAX;
2970 } else {
2971 return res;
2972 }
2973 }
2974
2975 static inline uint32_t
2976 vnclipu32(CPURISCVState *env, int vxrm, uint64_t a, uint32_t b)
2977 {
2978 uint8_t round, shift = b & 0x3f;
2979 uint64_t res;
2980
2981 round = get_round(vxrm, a, shift);
2982 res = (a >> shift) + round;
2983 if (res > UINT32_MAX) {
2984 env->vxsat = 0x1;
2985 return UINT32_MAX;
2986 } else {
2987 return res;
2988 }
2989 }
2990
2991 RVVCALL(OPIVV2_RM, vnclipu_wv_b, NOP_UUU_B, H1, H2, H1, vnclipu8)
2992 RVVCALL(OPIVV2_RM, vnclipu_wv_h, NOP_UUU_H, H2, H4, H2, vnclipu16)
2993 RVVCALL(OPIVV2_RM, vnclipu_wv_w, NOP_UUU_W, H4, H8, H4, vnclipu32)
2994 GEN_VEXT_VV_RM(vnclipu_wv_b, 1)
2995 GEN_VEXT_VV_RM(vnclipu_wv_h, 2)
2996 GEN_VEXT_VV_RM(vnclipu_wv_w, 4)
2997
2998 RVVCALL(OPIVX2_RM, vnclipu_wx_b, NOP_UUU_B, H1, H2, vnclipu8)
2999 RVVCALL(OPIVX2_RM, vnclipu_wx_h, NOP_UUU_H, H2, H4, vnclipu16)
3000 RVVCALL(OPIVX2_RM, vnclipu_wx_w, NOP_UUU_W, H4, H8, vnclipu32)
3001 GEN_VEXT_VX_RM(vnclipu_wx_b, 1)
3002 GEN_VEXT_VX_RM(vnclipu_wx_h, 2)
3003 GEN_VEXT_VX_RM(vnclipu_wx_w, 4)
3004
3005 /*
3006 *** Vector Floating-Point Arithmetic Instructions
3007 */
3008 /* Vector Single-Width Floating-Point Add/Subtract Instructions */
3009 #define OPFVV2(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP) \
3010 static void do_##NAME(void *vd, void *vs1, void *vs2, int i, \
3011 CPURISCVState *env) \
3012 { \
3013 TX1 s1 = *((T1 *)vs1 + HS1(i)); \
3014 TX2 s2 = *((T2 *)vs2 + HS2(i)); \
3015 *((TD *)vd + HD(i)) = OP(s2, s1, &env->fp_status); \
3016 }
3017
3018 #define GEN_VEXT_VV_ENV(NAME, ESZ) \
3019 void HELPER(NAME)(void *vd, void *v0, void *vs1, \
3020 void *vs2, CPURISCVState *env, \
3021 uint32_t desc) \
3022 { \
3023 uint32_t vm = vext_vm(desc); \
3024 uint32_t vl = env->vl; \
3025 uint32_t total_elems = \
3026 vext_get_total_elems(env, desc, ESZ); \
3027 uint32_t vta = vext_vta(desc); \
3028 uint32_t i; \
3029 \
3030 for (i = env->vstart; i < vl; i++) { \
3031 if (!vm && !vext_elem_mask(v0, i)) { \
3032 continue; \
3033 } \
3034 do_##NAME(vd, vs1, vs2, i, env); \
3035 } \
3036 env->vstart = 0; \
3037 /* set tail elements to 1s */ \
3038 vext_set_elems_1s(vd, vta, vl * ESZ, \
3039 total_elems * ESZ); \
3040 }
3041
3042 RVVCALL(OPFVV2, vfadd_vv_h, OP_UUU_H, H2, H2, H2, float16_add)
3043 RVVCALL(OPFVV2, vfadd_vv_w, OP_UUU_W, H4, H4, H4, float32_add)
3044 RVVCALL(OPFVV2, vfadd_vv_d, OP_UUU_D, H8, H8, H8, float64_add)
3045 GEN_VEXT_VV_ENV(vfadd_vv_h, 2)
3046 GEN_VEXT_VV_ENV(vfadd_vv_w, 4)
3047 GEN_VEXT_VV_ENV(vfadd_vv_d, 8)
3048
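/*
 * Illustrative only (check_float16_add_encoding() is a hypothetical
 * name): the per-element ops receive raw IEEE bit patterns and defer
 * all FP semantics to softfloat, e.g. adding the half-precision
 * encodings of 1.0 (0x3c00) and 2.0 (0x4000) yields 3.0 (0x4200).
 */
static inline void check_float16_add_encoding(CPURISCVState *env)
{
    assert(float16_add(0x3c00, 0x4000, &env->fp_status) == 0x4200);
}
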
3049 #define OPFVF2(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP) \
3050 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
3051 CPURISCVState *env) \
3052 { \
3053 TX2 s2 = *((T2 *)vs2 + HS2(i)); \
3054 *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, &env->fp_status);\
3055 }
3056
3057 #define GEN_VEXT_VF(NAME, ESZ) \
3058 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, \
3059 void *vs2, CPURISCVState *env, \
3060 uint32_t desc) \
3061 { \
3062 uint32_t vm = vext_vm(desc); \
3063 uint32_t vl = env->vl; \
3064 uint32_t total_elems = \
3065 vext_get_total_elems(env, desc, ESZ); \
3066 uint32_t vta = vext_vta(desc); \
3067 uint32_t i; \
3068 \
3069 for (i = env->vstart; i < vl; i++) { \
3070 if (!vm && !vext_elem_mask(v0, i)) { \
3071 continue; \
3072 } \
3073 do_##NAME(vd, s1, vs2, i, env); \
3074 } \
3075 env->vstart = 0; \
3076 /* set tail elements to 1s */ \
3077 vext_set_elems_1s(vd, vta, vl * ESZ, \
3078 total_elems * ESZ); \
3079 }
3080
3081 RVVCALL(OPFVF2, vfadd_vf_h, OP_UUU_H, H2, H2, float16_add)
3082 RVVCALL(OPFVF2, vfadd_vf_w, OP_UUU_W, H4, H4, float32_add)
3083 RVVCALL(OPFVF2, vfadd_vf_d, OP_UUU_D, H8, H8, float64_add)
3084 GEN_VEXT_VF(vfadd_vf_h, 2)
3085 GEN_VEXT_VF(vfadd_vf_w, 4)
3086 GEN_VEXT_VF(vfadd_vf_d, 8)
3087
3088 RVVCALL(OPFVV2, vfsub_vv_h, OP_UUU_H, H2, H2, H2, float16_sub)
3089 RVVCALL(OPFVV2, vfsub_vv_w, OP_UUU_W, H4, H4, H4, float32_sub)
3090 RVVCALL(OPFVV2, vfsub_vv_d, OP_UUU_D, H8, H8, H8, float64_sub)
3091 GEN_VEXT_VV_ENV(vfsub_vv_h, 2)
3092 GEN_VEXT_VV_ENV(vfsub_vv_w, 4)
3093 GEN_VEXT_VV_ENV(vfsub_vv_d, 8)
3094 RVVCALL(OPFVF2, vfsub_vf_h, OP_UUU_H, H2, H2, float16_sub)
3095 RVVCALL(OPFVF2, vfsub_vf_w, OP_UUU_W, H4, H4, float32_sub)
3096 RVVCALL(OPFVF2, vfsub_vf_d, OP_UUU_D, H8, H8, float64_sub)
3097 GEN_VEXT_VF(vfsub_vf_h, 2)
3098 GEN_VEXT_VF(vfsub_vf_w, 4)
3099 GEN_VEXT_VF(vfsub_vf_d, 8)
3100
3101 static uint16_t float16_rsub(uint16_t a, uint16_t b, float_status *s)
3102 {
3103 return float16_sub(b, a, s);
3104 }
3105
3106 static uint32_t float32_rsub(uint32_t a, uint32_t b, float_status *s)
3107 {
3108 return float32_sub(b, a, s);
3109 }
3110
3111 static uint64_t float64_rsub(uint64_t a, uint64_t b, float_status *s)
3112 {
3113 return float64_sub(b, a, s);
3114 }
3115
3116 RVVCALL(OPFVF2, vfrsub_vf_h, OP_UUU_H, H2, H2, float16_rsub)
3117 RVVCALL(OPFVF2, vfrsub_vf_w, OP_UUU_W, H4, H4, float32_rsub)
3118 RVVCALL(OPFVF2, vfrsub_vf_d, OP_UUU_D, H8, H8, float64_rsub)
3119 GEN_VEXT_VF(vfrsub_vf_h, 2)
3120 GEN_VEXT_VF(vfrsub_vf_w, 4)
3121 GEN_VEXT_VF(vfrsub_vf_d, 8)
3122
3123 /* Vector Widening Floating-Point Add/Subtract Instructions */
3124 static uint32_t vfwadd16(uint16_t a, uint16_t b, float_status *s)
3125 {
3126 return float32_add(float16_to_float32(a, true, s),
3127 float16_to_float32(b, true, s), s);
3128 }
3129
3130 static uint64_t vfwadd32(uint32_t a, uint32_t b, float_status *s)
3131 {
3132 return float64_add(float32_to_float64(a, s),
3133 float32_to_float64(b, s), s);
3134
3135 }
3136
3137 RVVCALL(OPFVV2, vfwadd_vv_h, WOP_UUU_H, H4, H2, H2, vfwadd16)
3138 RVVCALL(OPFVV2, vfwadd_vv_w, WOP_UUU_W, H8, H4, H4, vfwadd32)
3139 GEN_VEXT_VV_ENV(vfwadd_vv_h, 4)
3140 GEN_VEXT_VV_ENV(vfwadd_vv_w, 8)
3141 RVVCALL(OPFVF2, vfwadd_vf_h, WOP_UUU_H, H4, H2, vfwadd16)
3142 RVVCALL(OPFVF2, vfwadd_vf_w, WOP_UUU_W, H8, H4, vfwadd32)
3143 GEN_VEXT_VF(vfwadd_vf_h, 4)
3144 GEN_VEXT_VF(vfwadd_vf_w, 8)
3145
3146 static uint32_t vfwsub16(uint16_t a, uint16_t b, float_status *s)
3147 {
3148 return float32_sub(float16_to_float32(a, true, s),
3149 float16_to_float32(b, true, s), s);
3150 }
3151
3152 static uint64_t vfwsub32(uint32_t a, uint32_t b, float_status *s)
3153 {
3154 return float64_sub(float32_to_float64(a, s),
3155 float32_to_float64(b, s), s);
3156
3157 }
3158
3159 RVVCALL(OPFVV2, vfwsub_vv_h, WOP_UUU_H, H4, H2, H2, vfwsub16)
3160 RVVCALL(OPFVV2, vfwsub_vv_w, WOP_UUU_W, H8, H4, H4, vfwsub32)
3161 GEN_VEXT_VV_ENV(vfwsub_vv_h, 4)
3162 GEN_VEXT_VV_ENV(vfwsub_vv_w, 8)
3163 RVVCALL(OPFVF2, vfwsub_vf_h, WOP_UUU_H, H4, H2, vfwsub16)
3164 RVVCALL(OPFVF2, vfwsub_vf_w, WOP_UUU_W, H8, H4, vfwsub32)
3165 GEN_VEXT_VF(vfwsub_vf_h, 4)
3166 GEN_VEXT_VF(vfwsub_vf_w, 8)
3167
3168 static uint32_t vfwaddw16(uint32_t a, uint16_t b, float_status *s)
3169 {
3170 return float32_add(a, float16_to_float32(b, true, s), s);
3171 }
3172
3173 static uint64_t vfwaddw32(uint64_t a, uint32_t b, float_status *s)
3174 {
3175 return float64_add(a, float32_to_float64(b, s), s);
3176 }
3177
3178 RVVCALL(OPFVV2, vfwadd_wv_h, WOP_WUUU_H, H4, H2, H2, vfwaddw16)
3179 RVVCALL(OPFVV2, vfwadd_wv_w, WOP_WUUU_W, H8, H4, H4, vfwaddw32)
3180 GEN_VEXT_VV_ENV(vfwadd_wv_h, 4)
3181 GEN_VEXT_VV_ENV(vfwadd_wv_w, 8)
3182 RVVCALL(OPFVF2, vfwadd_wf_h, WOP_WUUU_H, H4, H2, vfwaddw16)
3183 RVVCALL(OPFVF2, vfwadd_wf_w, WOP_WUUU_W, H8, H4, vfwaddw32)
3184 GEN_VEXT_VF(vfwadd_wf_h, 4)
3185 GEN_VEXT_VF(vfwadd_wf_w, 8)
3186
3187 static uint32_t vfwsubw16(uint32_t a, uint16_t b, float_status *s)
3188 {
3189 return float32_sub(a, float16_to_float32(b, true, s), s);
3190 }
3191
3192 static uint64_t vfwsubw32(uint64_t a, uint32_t b, float_status *s)
3193 {
3194 return float64_sub(a, float32_to_float64(b, s), s);
3195 }
3196
3197 RVVCALL(OPFVV2, vfwsub_wv_h, WOP_WUUU_H, H4, H2, H2, vfwsubw16)
3198 RVVCALL(OPFVV2, vfwsub_wv_w, WOP_WUUU_W, H8, H4, H4, vfwsubw32)
3199 GEN_VEXT_VV_ENV(vfwsub_wv_h, 4)
3200 GEN_VEXT_VV_ENV(vfwsub_wv_w, 8)
3201 RVVCALL(OPFVF2, vfwsub_wf_h, WOP_WUUU_H, H4, H2, vfwsubw16)
3202 RVVCALL(OPFVF2, vfwsub_wf_w, WOP_WUUU_W, H8, H4, vfwsubw32)
3203 GEN_VEXT_VF(vfwsub_wf_h, 4)
3204 GEN_VEXT_VF(vfwsub_wf_w, 8)
3205
3206 /* Vector Single-Width Floating-Point Multiply/Divide Instructions */
3207 RVVCALL(OPFVV2, vfmul_vv_h, OP_UUU_H, H2, H2, H2, float16_mul)
3208 RVVCALL(OPFVV2, vfmul_vv_w, OP_UUU_W, H4, H4, H4, float32_mul)
3209 RVVCALL(OPFVV2, vfmul_vv_d, OP_UUU_D, H8, H8, H8, float64_mul)
3210 GEN_VEXT_VV_ENV(vfmul_vv_h, 2)
3211 GEN_VEXT_VV_ENV(vfmul_vv_w, 4)
3212 GEN_VEXT_VV_ENV(vfmul_vv_d, 8)
3213 RVVCALL(OPFVF2, vfmul_vf_h, OP_UUU_H, H2, H2, float16_mul)
3214 RVVCALL(OPFVF2, vfmul_vf_w, OP_UUU_W, H4, H4, float32_mul)
3215 RVVCALL(OPFVF2, vfmul_vf_d, OP_UUU_D, H8, H8, float64_mul)
3216 GEN_VEXT_VF(vfmul_vf_h, 2)
3217 GEN_VEXT_VF(vfmul_vf_w, 4)
3218 GEN_VEXT_VF(vfmul_vf_d, 8)
3219
3220 RVVCALL(OPFVV2, vfdiv_vv_h, OP_UUU_H, H2, H2, H2, float16_div)
3221 RVVCALL(OPFVV2, vfdiv_vv_w, OP_UUU_W, H4, H4, H4, float32_div)
3222 RVVCALL(OPFVV2, vfdiv_vv_d, OP_UUU_D, H8, H8, H8, float64_div)
3223 GEN_VEXT_VV_ENV(vfdiv_vv_h, 2)
3224 GEN_VEXT_VV_ENV(vfdiv_vv_w, 4)
3225 GEN_VEXT_VV_ENV(vfdiv_vv_d, 8)
3226 RVVCALL(OPFVF2, vfdiv_vf_h, OP_UUU_H, H2, H2, float16_div)
3227 RVVCALL(OPFVF2, vfdiv_vf_w, OP_UUU_W, H4, H4, float32_div)
3228 RVVCALL(OPFVF2, vfdiv_vf_d, OP_UUU_D, H8, H8, float64_div)
3229 GEN_VEXT_VF(vfdiv_vf_h, 2)
3230 GEN_VEXT_VF(vfdiv_vf_w, 4)
3231 GEN_VEXT_VF(vfdiv_vf_d, 8)
3232
3233 static uint16_t float16_rdiv(uint16_t a, uint16_t b, float_status *s)
3234 {
3235 return float16_div(b, a, s);
3236 }
3237
3238 static uint32_t float32_rdiv(uint32_t a, uint32_t b, float_status *s)
3239 {
3240 return float32_div(b, a, s);
3241 }
3242
3243 static uint64_t float64_rdiv(uint64_t a, uint64_t b, float_status *s)
3244 {
3245 return float64_div(b, a, s);
3246 }
3247
3248 RVVCALL(OPFVF2, vfrdiv_vf_h, OP_UUU_H, H2, H2, float16_rdiv)
3249 RVVCALL(OPFVF2, vfrdiv_vf_w, OP_UUU_W, H4, H4, float32_rdiv)
3250 RVVCALL(OPFVF2, vfrdiv_vf_d, OP_UUU_D, H8, H8, float64_rdiv)
3251 GEN_VEXT_VF(vfrdiv_vf_h, 2)
3252 GEN_VEXT_VF(vfrdiv_vf_w, 4)
3253 GEN_VEXT_VF(vfrdiv_vf_d, 8)
3254
3255 /* Vector Widening Floating-Point Multiply */
3256 static uint32_t vfwmul16(uint16_t a, uint16_t b, float_status *s)
3257 {
3258 return float32_mul(float16_to_float32(a, true, s),
3259 float16_to_float32(b, true, s), s);
3260 }
3261
3262 static uint64_t vfwmul32(uint32_t a, uint32_t b, float_status *s)
3263 {
3264 return float64_mul(float32_to_float64(a, s),
3265 float32_to_float64(b, s), s);
3267 }
3268 RVVCALL(OPFVV2, vfwmul_vv_h, WOP_UUU_H, H4, H2, H2, vfwmul16)
3269 RVVCALL(OPFVV2, vfwmul_vv_w, WOP_UUU_W, H8, H4, H4, vfwmul32)
3270 GEN_VEXT_VV_ENV(vfwmul_vv_h, 4)
3271 GEN_VEXT_VV_ENV(vfwmul_vv_w, 8)
3272 RVVCALL(OPFVF2, vfwmul_vf_h, WOP_UUU_H, H4, H2, vfwmul16)
3273 RVVCALL(OPFVF2, vfwmul_vf_w, WOP_UUU_W, H8, H4, vfwmul32)
3274 GEN_VEXT_VF(vfwmul_vf_h, 4)
3275 GEN_VEXT_VF(vfwmul_vf_w, 8)
3276
3277 /* Vector Single-Width Floating-Point Fused Multiply-Add Instructions */
3278 #define OPFVV3(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP) \
3279 static void do_##NAME(void *vd, void *vs1, void *vs2, int i, \
3280 CPURISCVState *env) \
3281 { \
3282 TX1 s1 = *((T1 *)vs1 + HS1(i)); \
3283 TX2 s2 = *((T2 *)vs2 + HS2(i)); \
3284 TD d = *((TD *)vd + HD(i)); \
3285 *((TD *)vd + HD(i)) = OP(s2, s1, d, &env->fp_status); \
3286 }
3287
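/*
 * do_##NAME above hands the scalar helpers their operands as
 * OP(vs2, vs1, vd), so fmacc computes vs1 * vs2 + vd, fnmacc computes
 * -(vs1 * vs2) - vd, fmsac computes vs1 * vs2 - vd and fnmsac computes
 * -(vs1 * vs2) + vd, each via the float*_muladd negation flags.  The
 * *madd/*msub variants below instead multiply vd by vs1 and
 * add/subtract vs2.
 */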
3288 static uint16_t fmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3289 {
3290 return float16_muladd(a, b, d, 0, s);
3291 }
3292
3293 static uint32_t fmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3294 {
3295 return float32_muladd(a, b, d, 0, s);
3296 }
3297
3298 static uint64_t fmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3299 {
3300 return float64_muladd(a, b, d, 0, s);
3301 }
3302
3303 RVVCALL(OPFVV3, vfmacc_vv_h, OP_UUU_H, H2, H2, H2, fmacc16)
3304 RVVCALL(OPFVV3, vfmacc_vv_w, OP_UUU_W, H4, H4, H4, fmacc32)
3305 RVVCALL(OPFVV3, vfmacc_vv_d, OP_UUU_D, H8, H8, H8, fmacc64)
3306 GEN_VEXT_VV_ENV(vfmacc_vv_h, 2)
3307 GEN_VEXT_VV_ENV(vfmacc_vv_w, 4)
3308 GEN_VEXT_VV_ENV(vfmacc_vv_d, 8)
3309
3310 #define OPFVF3(NAME, TD, T1, T2, TX1, TX2, HD, HS2, OP) \
3311 static void do_##NAME(void *vd, uint64_t s1, void *vs2, int i, \
3312 CPURISCVState *env) \
3313 { \
3314 TX2 s2 = *((T2 *)vs2 + HS2(i)); \
3315 TD d = *((TD *)vd + HD(i)); \
3316 *((TD *)vd + HD(i)) = OP(s2, (TX1)(T1)s1, d, &env->fp_status);\
3317 }
3318
3319 RVVCALL(OPFVF3, vfmacc_vf_h, OP_UUU_H, H2, H2, fmacc16)
3320 RVVCALL(OPFVF3, vfmacc_vf_w, OP_UUU_W, H4, H4, fmacc32)
3321 RVVCALL(OPFVF3, vfmacc_vf_d, OP_UUU_D, H8, H8, fmacc64)
3322 GEN_VEXT_VF(vfmacc_vf_h, 2)
3323 GEN_VEXT_VF(vfmacc_vf_w, 4)
3324 GEN_VEXT_VF(vfmacc_vf_d, 8)
3325
3326 static uint16_t fnmacc16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3327 {
3328 return float16_muladd(a, b, d,
3329 float_muladd_negate_c | float_muladd_negate_product, s);
3330 }
3331
3332 static uint32_t fnmacc32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3333 {
3334 return float32_muladd(a, b, d,
3335 float_muladd_negate_c | float_muladd_negate_product, s);
3336 }
3337
3338 static uint64_t fnmacc64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3339 {
3340 return float64_muladd(a, b, d,
3341 float_muladd_negate_c | float_muladd_negate_product, s);
3342 }
3343
3344 RVVCALL(OPFVV3, vfnmacc_vv_h, OP_UUU_H, H2, H2, H2, fnmacc16)
3345 RVVCALL(OPFVV3, vfnmacc_vv_w, OP_UUU_W, H4, H4, H4, fnmacc32)
3346 RVVCALL(OPFVV3, vfnmacc_vv_d, OP_UUU_D, H8, H8, H8, fnmacc64)
3347 GEN_VEXT_VV_ENV(vfnmacc_vv_h, 2)
3348 GEN_VEXT_VV_ENV(vfnmacc_vv_w, 4)
3349 GEN_VEXT_VV_ENV(vfnmacc_vv_d, 8)
3350 RVVCALL(OPFVF3, vfnmacc_vf_h, OP_UUU_H, H2, H2, fnmacc16)
3351 RVVCALL(OPFVF3, vfnmacc_vf_w, OP_UUU_W, H4, H4, fnmacc32)
3352 RVVCALL(OPFVF3, vfnmacc_vf_d, OP_UUU_D, H8, H8, fnmacc64)
3353 GEN_VEXT_VF(vfnmacc_vf_h, 2)
3354 GEN_VEXT_VF(vfnmacc_vf_w, 4)
3355 GEN_VEXT_VF(vfnmacc_vf_d, 8)
3356
3357 static uint16_t fmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3358 {
3359 return float16_muladd(a, b, d, float_muladd_negate_c, s);
3360 }
3361
3362 static uint32_t fmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3363 {
3364 return float32_muladd(a, b, d, float_muladd_negate_c, s);
3365 }
3366
3367 static uint64_t fmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3368 {
3369 return float64_muladd(a, b, d, float_muladd_negate_c, s);
3370 }
3371
3372 RVVCALL(OPFVV3, vfmsac_vv_h, OP_UUU_H, H2, H2, H2, fmsac16)
3373 RVVCALL(OPFVV3, vfmsac_vv_w, OP_UUU_W, H4, H4, H4, fmsac32)
3374 RVVCALL(OPFVV3, vfmsac_vv_d, OP_UUU_D, H8, H8, H8, fmsac64)
3375 GEN_VEXT_VV_ENV(vfmsac_vv_h, 2)
3376 GEN_VEXT_VV_ENV(vfmsac_vv_w, 4)
3377 GEN_VEXT_VV_ENV(vfmsac_vv_d, 8)
3378 RVVCALL(OPFVF3, vfmsac_vf_h, OP_UUU_H, H2, H2, fmsac16)
3379 RVVCALL(OPFVF3, vfmsac_vf_w, OP_UUU_W, H4, H4, fmsac32)
3380 RVVCALL(OPFVF3, vfmsac_vf_d, OP_UUU_D, H8, H8, fmsac64)
3381 GEN_VEXT_VF(vfmsac_vf_h, 2)
3382 GEN_VEXT_VF(vfmsac_vf_w, 4)
3383 GEN_VEXT_VF(vfmsac_vf_d, 8)
3384
3385 static uint16_t fnmsac16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3386 {
3387 return float16_muladd(a, b, d, float_muladd_negate_product, s);
3388 }
3389
3390 static uint32_t fnmsac32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3391 {
3392 return float32_muladd(a, b, d, float_muladd_negate_product, s);
3393 }
3394
3395 static uint64_t fnmsac64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3396 {
3397 return float64_muladd(a, b, d, float_muladd_negate_product, s);
3398 }
3399
3400 RVVCALL(OPFVV3, vfnmsac_vv_h, OP_UUU_H, H2, H2, H2, fnmsac16)
3401 RVVCALL(OPFVV3, vfnmsac_vv_w, OP_UUU_W, H4, H4, H4, fnmsac32)
3402 RVVCALL(OPFVV3, vfnmsac_vv_d, OP_UUU_D, H8, H8, H8, fnmsac64)
3403 GEN_VEXT_VV_ENV(vfnmsac_vv_h, 2)
3404 GEN_VEXT_VV_ENV(vfnmsac_vv_w, 4)
3405 GEN_VEXT_VV_ENV(vfnmsac_vv_d, 8)
3406 RVVCALL(OPFVF3, vfnmsac_vf_h, OP_UUU_H, H2, H2, fnmsac16)
3407 RVVCALL(OPFVF3, vfnmsac_vf_w, OP_UUU_W, H4, H4, fnmsac32)
3408 RVVCALL(OPFVF3, vfnmsac_vf_d, OP_UUU_D, H8, H8, fnmsac64)
3409 GEN_VEXT_VF(vfnmsac_vf_h, 2)
3410 GEN_VEXT_VF(vfnmsac_vf_w, 4)
3411 GEN_VEXT_VF(vfnmsac_vf_d, 8)
3412
3413 static uint16_t fmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3414 {
3415 return float16_muladd(d, b, a, 0, s);
3416 }
3417
3418 static uint32_t fmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3419 {
3420 return float32_muladd(d, b, a, 0, s);
3421 }
3422
3423 static uint64_t fmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3424 {
3425 return float64_muladd(d, b, a, 0, s);
3426 }
3427
3428 RVVCALL(OPFVV3, vfmadd_vv_h, OP_UUU_H, H2, H2, H2, fmadd16)
3429 RVVCALL(OPFVV3, vfmadd_vv_w, OP_UUU_W, H4, H4, H4, fmadd32)
3430 RVVCALL(OPFVV3, vfmadd_vv_d, OP_UUU_D, H8, H8, H8, fmadd64)
3431 GEN_VEXT_VV_ENV(vfmadd_vv_h, 2)
3432 GEN_VEXT_VV_ENV(vfmadd_vv_w, 4)
3433 GEN_VEXT_VV_ENV(vfmadd_vv_d, 8)
3434 RVVCALL(OPFVF3, vfmadd_vf_h, OP_UUU_H, H2, H2, fmadd16)
3435 RVVCALL(OPFVF3, vfmadd_vf_w, OP_UUU_W, H4, H4, fmadd32)
3436 RVVCALL(OPFVF3, vfmadd_vf_d, OP_UUU_D, H8, H8, fmadd64)
3437 GEN_VEXT_VF(vfmadd_vf_h, 2)
3438 GEN_VEXT_VF(vfmadd_vf_w, 4)
3439 GEN_VEXT_VF(vfmadd_vf_d, 8)
3440
3441 static uint16_t fnmadd16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3442 {
3443 return float16_muladd(d, b, a,
3444 float_muladd_negate_c | float_muladd_negate_product, s);
3445 }
3446
3447 static uint32_t fnmadd32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3448 {
3449 return float32_muladd(d, b, a,
3450 float_muladd_negate_c | float_muladd_negate_product, s);
3451 }
3452
3453 static uint64_t fnmadd64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3454 {
3455 return float64_muladd(d, b, a,
3456 float_muladd_negate_c | float_muladd_negate_product, s);
3457 }
3458
3459 RVVCALL(OPFVV3, vfnmadd_vv_h, OP_UUU_H, H2, H2, H2, fnmadd16)
3460 RVVCALL(OPFVV3, vfnmadd_vv_w, OP_UUU_W, H4, H4, H4, fnmadd32)
3461 RVVCALL(OPFVV3, vfnmadd_vv_d, OP_UUU_D, H8, H8, H8, fnmadd64)
3462 GEN_VEXT_VV_ENV(vfnmadd_vv_h, 2)
3463 GEN_VEXT_VV_ENV(vfnmadd_vv_w, 4)
3464 GEN_VEXT_VV_ENV(vfnmadd_vv_d, 8)
3465 RVVCALL(OPFVF3, vfnmadd_vf_h, OP_UUU_H, H2, H2, fnmadd16)
3466 RVVCALL(OPFVF3, vfnmadd_vf_w, OP_UUU_W, H4, H4, fnmadd32)
3467 RVVCALL(OPFVF3, vfnmadd_vf_d, OP_UUU_D, H8, H8, fnmadd64)
3468 GEN_VEXT_VF(vfnmadd_vf_h, 2)
3469 GEN_VEXT_VF(vfnmadd_vf_w, 4)
3470 GEN_VEXT_VF(vfnmadd_vf_d, 8)
3471
3472 static uint16_t fmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3473 {
3474 return float16_muladd(d, b, a, float_muladd_negate_c, s);
3475 }
3476
3477 static uint32_t fmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3478 {
3479 return float32_muladd(d, b, a, float_muladd_negate_c, s);
3480 }
3481
3482 static uint64_t fmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3483 {
3484 return float64_muladd(d, b, a, float_muladd_negate_c, s);
3485 }
3486
3487 RVVCALL(OPFVV3, vfmsub_vv_h, OP_UUU_H, H2, H2, H2, fmsub16)
3488 RVVCALL(OPFVV3, vfmsub_vv_w, OP_UUU_W, H4, H4, H4, fmsub32)
3489 RVVCALL(OPFVV3, vfmsub_vv_d, OP_UUU_D, H8, H8, H8, fmsub64)
3490 GEN_VEXT_VV_ENV(vfmsub_vv_h, 2)
3491 GEN_VEXT_VV_ENV(vfmsub_vv_w, 4)
3492 GEN_VEXT_VV_ENV(vfmsub_vv_d, 8)
3493 RVVCALL(OPFVF3, vfmsub_vf_h, OP_UUU_H, H2, H2, fmsub16)
3494 RVVCALL(OPFVF3, vfmsub_vf_w, OP_UUU_W, H4, H4, fmsub32)
3495 RVVCALL(OPFVF3, vfmsub_vf_d, OP_UUU_D, H8, H8, fmsub64)
3496 GEN_VEXT_VF(vfmsub_vf_h, 2)
3497 GEN_VEXT_VF(vfmsub_vf_w, 4)
3498 GEN_VEXT_VF(vfmsub_vf_d, 8)
3499
3500 static uint16_t fnmsub16(uint16_t a, uint16_t b, uint16_t d, float_status *s)
3501 {
3502 return float16_muladd(d, b, a, float_muladd_negate_product, s);
3503 }
3504
3505 static uint32_t fnmsub32(uint32_t a, uint32_t b, uint32_t d, float_status *s)
3506 {
3507 return float32_muladd(d, b, a, float_muladd_negate_product, s);
3508 }
3509
3510 static uint64_t fnmsub64(uint64_t a, uint64_t b, uint64_t d, float_status *s)
3511 {
3512 return float64_muladd(d, b, a, float_muladd_negate_product, s);
3513 }
3514
3515 RVVCALL(OPFVV3, vfnmsub_vv_h, OP_UUU_H, H2, H2, H2, fnmsub16)
3516 RVVCALL(OPFVV3, vfnmsub_vv_w, OP_UUU_W, H4, H4, H4, fnmsub32)
3517 RVVCALL(OPFVV3, vfnmsub_vv_d, OP_UUU_D, H8, H8, H8, fnmsub64)
3518 GEN_VEXT_VV_ENV(vfnmsub_vv_h, 2)
3519 GEN_VEXT_VV_ENV(vfnmsub_vv_w, 4)
3520 GEN_VEXT_VV_ENV(vfnmsub_vv_d, 8)
3521 RVVCALL(OPFVF3, vfnmsub_vf_h, OP_UUU_H, H2, H2, fnmsub16)
3522 RVVCALL(OPFVF3, vfnmsub_vf_w, OP_UUU_W, H4, H4, fnmsub32)
3523 RVVCALL(OPFVF3, vfnmsub_vf_d, OP_UUU_D, H8, H8, fnmsub64)
3524 GEN_VEXT_VF(vfnmsub_vf_h, 2)
3525 GEN_VEXT_VF(vfnmsub_vf_w, 4)
3526 GEN_VEXT_VF(vfnmsub_vf_d, 8)
3527
3528 /* Vector Widening Floating-Point Fused Multiply-Add Instructions */
3529 static uint32_t fwmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3530 {
3531 return float32_muladd(float16_to_float32(a, true, s),
3532 float16_to_float32(b, true, s), d, 0, s);
3533 }
3534
3535 static uint64_t fwmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3536 {
3537 return float64_muladd(float32_to_float64(a, s),
3538 float32_to_float64(b, s), d, 0, s);
3539 }
3540
3541 RVVCALL(OPFVV3, vfwmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwmacc16)
3542 RVVCALL(OPFVV3, vfwmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwmacc32)
3543 GEN_VEXT_VV_ENV(vfwmacc_vv_h, 4)
3544 GEN_VEXT_VV_ENV(vfwmacc_vv_w, 8)
3545 RVVCALL(OPFVF3, vfwmacc_vf_h, WOP_UUU_H, H4, H2, fwmacc16)
3546 RVVCALL(OPFVF3, vfwmacc_vf_w, WOP_UUU_W, H8, H4, fwmacc32)
3547 GEN_VEXT_VF(vfwmacc_vf_h, 4)
3548 GEN_VEXT_VF(vfwmacc_vf_w, 8)
3549
3550 static uint32_t fwnmacc16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3551 {
3552 return float32_muladd(float16_to_float32(a, true, s),
3553 float16_to_float32(b, true, s), d,
3554 float_muladd_negate_c | float_muladd_negate_product, s);
3555 }
3556
3557 static uint64_t fwnmacc32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3558 {
3559 return float64_muladd(float32_to_float64(a, s),
3560 float32_to_float64(b, s), d,
3561 float_muladd_negate_c | float_muladd_negate_product, s);
3562 }
3563
3564 RVVCALL(OPFVV3, vfwnmacc_vv_h, WOP_UUU_H, H4, H2, H2, fwnmacc16)
3565 RVVCALL(OPFVV3, vfwnmacc_vv_w, WOP_UUU_W, H8, H4, H4, fwnmacc32)
3566 GEN_VEXT_VV_ENV(vfwnmacc_vv_h, 4)
3567 GEN_VEXT_VV_ENV(vfwnmacc_vv_w, 8)
3568 RVVCALL(OPFVF3, vfwnmacc_vf_h, WOP_UUU_H, H4, H2, fwnmacc16)
3569 RVVCALL(OPFVF3, vfwnmacc_vf_w, WOP_UUU_W, H8, H4, fwnmacc32)
3570 GEN_VEXT_VF(vfwnmacc_vf_h, 4)
3571 GEN_VEXT_VF(vfwnmacc_vf_w, 8)
3572
3573 static uint32_t fwmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3574 {
3575 return float32_muladd(float16_to_float32(a, true, s),
3576 float16_to_float32(b, true, s), d,
3577 float_muladd_negate_c, s);
3578 }
3579
3580 static uint64_t fwmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3581 {
3582 return float64_muladd(float32_to_float64(a, s),
3583 float32_to_float64(b, s), d,
3584 float_muladd_negate_c, s);
3585 }
3586
3587 RVVCALL(OPFVV3, vfwmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwmsac16)
3588 RVVCALL(OPFVV3, vfwmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwmsac32)
3589 GEN_VEXT_VV_ENV(vfwmsac_vv_h, 4)
3590 GEN_VEXT_VV_ENV(vfwmsac_vv_w, 8)
3591 RVVCALL(OPFVF3, vfwmsac_vf_h, WOP_UUU_H, H4, H2, fwmsac16)
3592 RVVCALL(OPFVF3, vfwmsac_vf_w, WOP_UUU_W, H8, H4, fwmsac32)
3593 GEN_VEXT_VF(vfwmsac_vf_h, 4)
3594 GEN_VEXT_VF(vfwmsac_vf_w, 8)
3595
3596 static uint32_t fwnmsac16(uint16_t a, uint16_t b, uint32_t d, float_status *s)
3597 {
3598 return float32_muladd(float16_to_float32(a, true, s),
3599 float16_to_float32(b, true, s), d,
3600 float_muladd_negate_product, s);
3601 }
3602
3603 static uint64_t fwnmsac32(uint32_t a, uint32_t b, uint64_t d, float_status *s)
3604 {
3605 return float64_muladd(float32_to_float64(a, s),
3606 float32_to_float64(b, s), d,
3607 float_muladd_negate_product, s);
3608 }
3609
3610 RVVCALL(OPFVV3, vfwnmsac_vv_h, WOP_UUU_H, H4, H2, H2, fwnmsac16)
3611 RVVCALL(OPFVV3, vfwnmsac_vv_w, WOP_UUU_W, H8, H4, H4, fwnmsac32)
3612 GEN_VEXT_VV_ENV(vfwnmsac_vv_h, 4)
3613 GEN_VEXT_VV_ENV(vfwnmsac_vv_w, 8)
3614 RVVCALL(OPFVF3, vfwnmsac_vf_h, WOP_UUU_H, H4, H2, fwnmsac16)
3615 RVVCALL(OPFVF3, vfwnmsac_vf_w, WOP_UUU_W, H8, H4, fwnmsac32)
3616 GEN_VEXT_VF(vfwnmsac_vf_h, 4)
3617 GEN_VEXT_VF(vfwnmsac_vf_w, 8)
3618
3619 /* Vector Floating-Point Square-Root Instruction */
3620 /* (TD, T2, TX2) */
3621 #define OP_UU_H uint16_t, uint16_t, uint16_t
3622 #define OP_UU_W uint32_t, uint32_t, uint32_t
3623 #define OP_UU_D uint64_t, uint64_t, uint64_t
3624
3625 #define OPFVV1(NAME, TD, T2, TX2, HD, HS2, OP) \
3626 static void do_##NAME(void *vd, void *vs2, int i, \
3627 CPURISCVState *env) \
3628 { \
3629 TX2 s2 = *((T2 *)vs2 + HS2(i)); \
3630 *((TD *)vd + HD(i)) = OP(s2, &env->fp_status); \
3631 }
3632
3633 #define GEN_VEXT_V_ENV(NAME, ESZ) \
3634 void HELPER(NAME)(void *vd, void *v0, void *vs2, \
3635 CPURISCVState *env, uint32_t desc) \
3636 { \
3637 uint32_t vm = vext_vm(desc); \
3638 uint32_t vl = env->vl; \
3639 uint32_t total_elems = \
3640 vext_get_total_elems(env, desc, ESZ); \
3641 uint32_t vta = vext_vta(desc); \
3642 uint32_t i; \
3643 \
3644 if (vl == 0) { \
3645 return; \
3646 } \
3647 for (i = env->vstart; i < vl; i++) { \
3648 if (!vm && !vext_elem_mask(v0, i)) { \
3649 continue; \
3650 } \
3651 do_##NAME(vd, vs2, i, env); \
3652 } \
3653 env->vstart = 0; \
3654 vext_set_elems_1s(vd, vta, vl * ESZ, \
3655 total_elems * ESZ); \
3656 }
3657
3658 RVVCALL(OPFVV1, vfsqrt_v_h, OP_UU_H, H2, H2, float16_sqrt)
3659 RVVCALL(OPFVV1, vfsqrt_v_w, OP_UU_W, H4, H4, float32_sqrt)
3660 RVVCALL(OPFVV1, vfsqrt_v_d, OP_UU_D, H8, H8, float64_sqrt)
3661 GEN_VEXT_V_ENV(vfsqrt_v_h, 2)
3662 GEN_VEXT_V_ENV(vfsqrt_v_w, 4)
3663 GEN_VEXT_V_ENV(vfsqrt_v_d, 8)
3664
3665 /*
3666 * Vector Floating-Point Reciprocal Square-Root Estimate Instruction
3667 *
3668 * Adapted from riscv-v-spec recip.c:
3669 * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3670 */
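/*
 * 7-bit estimate: the index combines the low bit of the (possibly
 * re-normalized) exponent with the top six fraction bits, the table
 * entry supplies the top seven fraction bits of the result, and the
 * output exponent works out to (3 * bias - 1 - exp) / 2.
 */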
3671 static uint64_t frsqrt7(uint64_t f, int exp_size, int frac_size)
3672 {
3673 uint64_t sign = extract64(f, frac_size + exp_size, 1);
3674 uint64_t exp = extract64(f, frac_size, exp_size);
3675 uint64_t frac = extract64(f, 0, frac_size);
3676
3677 const uint8_t lookup_table[] = {
3678 52, 51, 50, 48, 47, 46, 44, 43,
3679 42, 41, 40, 39, 38, 36, 35, 34,
3680 33, 32, 31, 30, 30, 29, 28, 27,
3681 26, 25, 24, 23, 23, 22, 21, 20,
3682 19, 19, 18, 17, 16, 16, 15, 14,
3683 14, 13, 12, 12, 11, 10, 10, 9,
3684 9, 8, 7, 7, 6, 6, 5, 4,
3685 4, 3, 3, 2, 2, 1, 1, 0,
3686 127, 125, 123, 121, 119, 118, 116, 114,
3687 113, 111, 109, 108, 106, 105, 103, 102,
3688 100, 99, 97, 96, 95, 93, 92, 91,
3689 90, 88, 87, 86, 85, 84, 83, 82,
3690 80, 79, 78, 77, 76, 75, 74, 73,
3691 72, 71, 70, 70, 69, 68, 67, 66,
3692 65, 64, 63, 63, 62, 61, 60, 59,
3693 59, 58, 57, 56, 56, 55, 54, 53
3694 };
3695 const int precision = 7;
3696
3697 if (exp == 0 && frac != 0) { /* subnormal */
3698 /* Normalize the subnormal. */
3699 while (extract64(frac, frac_size - 1, 1) == 0) {
3700 exp--;
3701 frac <<= 1;
3702 }
3703
3704 frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3705 }
3706
3707 int idx = ((exp & 1) << (precision - 1)) |
3708 (frac >> (frac_size - precision + 1));
3709 uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3710 (frac_size - precision);
3711 uint64_t out_exp = (3 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp) / 2;
3712
3713 uint64_t val = 0;
3714 val = deposit64(val, 0, frac_size, out_frac);
3715 val = deposit64(val, frac_size, exp_size, out_exp);
3716 val = deposit64(val, frac_size + exp_size, 1, sign);
3717 return val;
3718 }
3719
3720 static float16 frsqrt7_h(float16 f, float_status *s)
3721 {
3722 int exp_size = 5, frac_size = 10;
3723 bool sign = float16_is_neg(f);
3724
3725 /*
3726 * frsqrt7(sNaN) = canonical NaN
3727 * frsqrt7(-inf) = canonical NaN
3728 * frsqrt7(-normal) = canonical NaN
3729 * frsqrt7(-subnormal) = canonical NaN
3730 */
3731 if (float16_is_signaling_nan(f, s) ||
3732 (float16_is_infinity(f) && sign) ||
3733 (float16_is_normal(f) && sign) ||
3734 (float16_is_zero_or_denormal(f) && !float16_is_zero(f) && sign)) {
3735 s->float_exception_flags |= float_flag_invalid;
3736 return float16_default_nan(s);
3737 }
3738
3739 /* frsqrt7(qNaN) = canonical NaN */
3740 if (float16_is_quiet_nan(f, s)) {
3741 return float16_default_nan(s);
3742 }
3743
3744 /* frsqrt7(+-0) = +-inf */
3745 if (float16_is_zero(f)) {
3746 s->float_exception_flags |= float_flag_divbyzero;
3747 return float16_set_sign(float16_infinity, sign);
3748 }
3749
3750 /* frsqrt7(+inf) = +0 */
3751 if (float16_is_infinity(f) && !sign) {
3752 return float16_set_sign(float16_zero, sign);
3753 }
3754
3755 /* +normal, +subnormal */
3756 uint64_t val = frsqrt7(f, exp_size, frac_size);
3757 return make_float16(val);
3758 }
3759
3760 static float32 frsqrt7_s(float32 f, float_status *s)
3761 {
3762 int exp_size = 8, frac_size = 23;
3763 bool sign = float32_is_neg(f);
3764
3765 /*
3766 * frsqrt7(sNaN) = canonical NaN
3767 * frsqrt7(-inf) = canonical NaN
3768 * frsqrt7(-normal) = canonical NaN
3769 * frsqrt7(-subnormal) = canonical NaN
3770 */
3771 if (float32_is_signaling_nan(f, s) ||
3772 (float32_is_infinity(f) && sign) ||
3773 (float32_is_normal(f) && sign) ||
3774 (float32_is_zero_or_denormal(f) && !float32_is_zero(f) && sign)) {
3775 s->float_exception_flags |= float_flag_invalid;
3776 return float32_default_nan(s);
3777 }
3778
3779 /* frsqrt7(qNaN) = canonical NaN */
3780 if (float32_is_quiet_nan(f, s)) {
3781 return float32_default_nan(s);
3782 }
3783
3784 /* frsqrt7(+-0) = +-inf */
3785 if (float32_is_zero(f)) {
3786 s->float_exception_flags |= float_flag_divbyzero;
3787 return float32_set_sign(float32_infinity, sign);
3788 }
3789
3790 /* frsqrt7(+inf) = +0 */
3791 if (float32_is_infinity(f) && !sign) {
3792 return float32_set_sign(float32_zero, sign);
3793 }
3794
3795 /* +normal, +subnormal */
3796 uint64_t val = frsqrt7(f, exp_size, frac_size);
3797 return make_float32(val);
3798 }
3799
3800 static float64 frsqrt7_d(float64 f, float_status *s)
3801 {
3802 int exp_size = 11, frac_size = 52;
3803 bool sign = float64_is_neg(f);
3804
3805 /*
3806 * frsqrt7(sNaN) = canonical NaN
3807 * frsqrt7(-inf) = canonical NaN
3808 * frsqrt7(-normal) = canonical NaN
3809 * frsqrt7(-subnormal) = canonical NaN
3810 */
3811 if (float64_is_signaling_nan(f, s) ||
3812 (float64_is_infinity(f) && sign) ||
3813 (float64_is_normal(f) && sign) ||
3814 (float64_is_zero_or_denormal(f) && !float64_is_zero(f) && sign)) {
3815 s->float_exception_flags |= float_flag_invalid;
3816 return float64_default_nan(s);
3817 }
3818
3819 /* frsqrt7(qNaN) = canonical NaN */
3820 if (float64_is_quiet_nan(f, s)) {
3821 return float64_default_nan(s);
3822 }
3823
3824 /* frsqrt7(+-0) = +-inf */
3825 if (float64_is_zero(f)) {
3826 s->float_exception_flags |= float_flag_divbyzero;
3827 return float64_set_sign(float64_infinity, sign);
3828 }
3829
3830 /* frsqrt7(+inf) = +0 */
3831 if (float64_is_infinity(f) && !sign) {
3832 return float64_set_sign(float64_zero, sign);
3833 }
3834
3835 /* +normal, +subnormal */
3836 uint64_t val = frsqrt7(f, exp_size, frac_size);
3837 return make_float64(val);
3838 }
3839
3840 RVVCALL(OPFVV1, vfrsqrt7_v_h, OP_UU_H, H2, H2, frsqrt7_h)
3841 RVVCALL(OPFVV1, vfrsqrt7_v_w, OP_UU_W, H4, H4, frsqrt7_s)
3842 RVVCALL(OPFVV1, vfrsqrt7_v_d, OP_UU_D, H8, H8, frsqrt7_d)
3843 GEN_VEXT_V_ENV(vfrsqrt7_v_h, 2)
3844 GEN_VEXT_V_ENV(vfrsqrt7_v_w, 4)
3845 GEN_VEXT_V_ENV(vfrsqrt7_v_d, 8)
3846
3847 /*
3848 * Vector Floating-Point Reciprocal Estimate Instruction
3849 *
3850 * Adapted from riscv-v-spec recip.c:
3851 * https://github.com/riscv/riscv-v-spec/blob/master/recip.c
3852 */
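/*
 * 7-bit estimate: the top seven fraction bits index the table, the
 * entry supplies the top seven fraction bits of the result, and the
 * output exponent works out to 2 * bias - 1 - exp.  Subnormal inputs
 * are first re-normalized; if that would make the reciprocal overflow,
 * the result saturates to infinity or the largest finite value
 * according to the rounding mode, and subnormal results are shifted
 * back into denormal form.
 */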
3853 static uint64_t frec7(uint64_t f, int exp_size, int frac_size,
3854 float_status *s)
3855 {
3856 uint64_t sign = extract64(f, frac_size + exp_size, 1);
3857 uint64_t exp = extract64(f, frac_size, exp_size);
3858 uint64_t frac = extract64(f, 0, frac_size);
3859
3860 const uint8_t lookup_table[] = {
3861 127, 125, 123, 121, 119, 117, 116, 114,
3862 112, 110, 109, 107, 105, 104, 102, 100,
3863 99, 97, 96, 94, 93, 91, 90, 88,
3864 87, 85, 84, 83, 81, 80, 79, 77,
3865 76, 75, 74, 72, 71, 70, 69, 68,
3866 66, 65, 64, 63, 62, 61, 60, 59,
3867 58, 57, 56, 55, 54, 53, 52, 51,
3868 50, 49, 48, 47, 46, 45, 44, 43,
3869 42, 41, 40, 40, 39, 38, 37, 36,
3870 35, 35, 34, 33, 32, 31, 31, 30,
3871 29, 28, 28, 27, 26, 25, 25, 24,
3872 23, 23, 22, 21, 21, 20, 19, 19,
3873 18, 17, 17, 16, 15, 15, 14, 14,
3874 13, 12, 12, 11, 11, 10, 9, 9,
3875 8, 8, 7, 7, 6, 5, 5, 4,
3876 4, 3, 3, 2, 2, 1, 1, 0
3877 };
3878 const int precision = 7;
3879
3880 if (exp == 0 && frac != 0) { /* subnormal */
3881 /* Normalize the subnormal. */
3882 while (extract64(frac, frac_size - 1, 1) == 0) {
3883 exp--;
3884 frac <<= 1;
3885 }
3886
3887 frac = (frac << 1) & MAKE_64BIT_MASK(0, frac_size);
3888
3889 if (exp != 0 && exp != UINT64_MAX) {
3890 /*
3891 * Overflow to inf or max value of same sign,
3892 * depending on sign and rounding mode.
3893 */
3894 s->float_exception_flags |= (float_flag_inexact |
3895 float_flag_overflow);
3896
3897 if ((s->float_rounding_mode == float_round_to_zero) ||
3898 ((s->float_rounding_mode == float_round_down) && !sign) ||
3899 ((s->float_rounding_mode == float_round_up) && sign)) {
3900 /* Return the largest-magnitude finite value of the same sign. */
3901 return (sign << (exp_size + frac_size)) |
3902 (MAKE_64BIT_MASK(frac_size, exp_size) - 1);
3903 } else {
3904 /* Return +-inf. */
3905 return (sign << (exp_size + frac_size)) |
3906 MAKE_64BIT_MASK(frac_size, exp_size);
3907 }
3908 }
3909 }
3910
3911 int idx = frac >> (frac_size - precision);
3912 uint64_t out_frac = (uint64_t)(lookup_table[idx]) <<
3913 (frac_size - precision);
3914 uint64_t out_exp = 2 * MAKE_64BIT_MASK(0, exp_size - 1) + ~exp;
3915
3916 if (out_exp == 0 || out_exp == UINT64_MAX) {
3917 /*
3918 * The result is subnormal, but don't raise the underflow exception,
3919 * because there's no additional loss of precision.
3920 */
3921 out_frac = (out_frac >> 1) | MAKE_64BIT_MASK(frac_size - 1, 1);
3922 if (out_exp == UINT64_MAX) {
3923 out_frac >>= 1;
3924 out_exp = 0;
3925 }
3926 }
3927
3928 uint64_t val = 0;
3929 val = deposit64(val, 0, frac_size, out_frac);
3930 val = deposit64(val, frac_size, exp_size, out_exp);
3931 val = deposit64(val, frac_size + exp_size, 1, sign);
3932 return val;
3933 }
3934
3935 static float16 frec7_h(float16 f, float_status *s)
3936 {
3937 int exp_size = 5, frac_size = 10;
3938 bool sign = float16_is_neg(f);
3939
3940 /* frec7(+-inf) = +-0 */
3941 if (float16_is_infinity(f)) {
3942 return float16_set_sign(float16_zero, sign);
3943 }
3944
3945 /* frec7(+-0) = +-inf */
3946 if (float16_is_zero(f)) {
3947 s->float_exception_flags |= float_flag_divbyzero;
3948 return float16_set_sign(float16_infinity, sign);
3949 }
3950
3951 /* frec7(sNaN) = canonical NaN */
3952 if (float16_is_signaling_nan(f, s)) {
3953 s->float_exception_flags |= float_flag_invalid;
3954 return float16_default_nan(s);
3955 }
3956
3957 /* frec7(qNaN) = canonical NaN */
3958 if (float16_is_quiet_nan(f, s)) {
3959 return float16_default_nan(s);
3960 }
3961
3962 /* +-normal, +-subnormal */
3963 uint64_t val = frec7(f, exp_size, frac_size, s);
3964 return make_float16(val);
3965 }
3966
3967 static float32 frec7_s(float32 f, float_status *s)
3968 {
3969 int exp_size = 8, frac_size = 23;
3970 bool sign = float32_is_neg(f);
3971
3972 /* frec7(+-inf) = +-0 */
3973 if (float32_is_infinity(f)) {
3974 return float32_set_sign(float32_zero, sign);
3975 }
3976
3977 /* frec7(+-0) = +-inf */
3978 if (float32_is_zero(f)) {
3979 s->float_exception_flags |= float_flag_divbyzero;
3980 return float32_set_sign(float32_infinity, sign);
3981 }
3982
3983 /* frec7(sNaN) = canonical NaN */
3984 if (float32_is_signaling_nan(f, s)) {
3985 s->float_exception_flags |= float_flag_invalid;
3986 return float32_default_nan(s);
3987 }
3988
3989 /* frec7(qNaN) = canonical NaN */
3990 if (float32_is_quiet_nan(f, s)) {
3991 return float32_default_nan(s);
3992 }
3993
3994 /* +-normal, +-subnormal */
3995 uint64_t val = frec7(f, exp_size, frac_size, s);
3996 return make_float32(val);
3997 }
3998
3999 static float64 frec7_d(float64 f, float_status *s)
4000 {
4001 int exp_size = 11, frac_size = 52;
4002 bool sign = float64_is_neg(f);
4003
4004 /* frec7(+-inf) = +-0 */
4005 if (float64_is_infinity(f)) {
4006 return float64_set_sign(float64_zero, sign);
4007 }
4008
4009 /* frec7(+-0) = +-inf */
4010 if (float64_is_zero(f)) {
4011 s->float_exception_flags |= float_flag_divbyzero;
4012 return float64_set_sign(float64_infinity, sign);
4013 }
4014
4015 /* frec7(sNaN) = canonical NaN */
4016 if (float64_is_signaling_nan(f, s)) {
4017 s->float_exception_flags |= float_flag_invalid;
4018 return float64_default_nan(s);
4019 }
4020
4021 /* frec7(qNaN) = canonical NaN */
4022 if (float64_is_quiet_nan(f, s)) {
4023 return float64_default_nan(s);
4024 }
4025
4026 /* +-normal, +-subnormal */
4027 uint64_t val = frec7(f, exp_size, frac_size, s);
4028 return make_float64(val);
4029 }
4030
4031 RVVCALL(OPFVV1, vfrec7_v_h, OP_UU_H, H2, H2, frec7_h)
4032 RVVCALL(OPFVV1, vfrec7_v_w, OP_UU_W, H4, H4, frec7_s)
4033 RVVCALL(OPFVV1, vfrec7_v_d, OP_UU_D, H8, H8, frec7_d)
4034 GEN_VEXT_V_ENV(vfrec7_v_h, 2)
4035 GEN_VEXT_V_ENV(vfrec7_v_w, 4)
4036 GEN_VEXT_V_ENV(vfrec7_v_d, 8)
4037
4038 /* Vector Floating-Point MIN/MAX Instructions */
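/*
 * vfmin/vfmax use the softfloat *_minimum_number/*_maximum_number
 * operations, which return the numerical operand when exactly one
 * input is a NaN, matching the IEEE 754-2019 minimumNumber /
 * maximumNumber behaviour these instructions require.
 */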
4039 RVVCALL(OPFVV2, vfmin_vv_h, OP_UUU_H, H2, H2, H2, float16_minimum_number)
4040 RVVCALL(OPFVV2, vfmin_vv_w, OP_UUU_W, H4, H4, H4, float32_minimum_number)
4041 RVVCALL(OPFVV2, vfmin_vv_d, OP_UUU_D, H8, H8, H8, float64_minimum_number)
4042 GEN_VEXT_VV_ENV(vfmin_vv_h, 2)
4043 GEN_VEXT_VV_ENV(vfmin_vv_w, 4)
4044 GEN_VEXT_VV_ENV(vfmin_vv_d, 8)
4045 RVVCALL(OPFVF2, vfmin_vf_h, OP_UUU_H, H2, H2, float16_minimum_number)
4046 RVVCALL(OPFVF2, vfmin_vf_w, OP_UUU_W, H4, H4, float32_minimum_number)
4047 RVVCALL(OPFVF2, vfmin_vf_d, OP_UUU_D, H8, H8, float64_minimum_number)
4048 GEN_VEXT_VF(vfmin_vf_h, 2)
4049 GEN_VEXT_VF(vfmin_vf_w, 4)
4050 GEN_VEXT_VF(vfmin_vf_d, 8)
4051
4052 RVVCALL(OPFVV2, vfmax_vv_h, OP_UUU_H, H2, H2, H2, float16_maximum_number)
4053 RVVCALL(OPFVV2, vfmax_vv_w, OP_UUU_W, H4, H4, H4, float32_maximum_number)
4054 RVVCALL(OPFVV2, vfmax_vv_d, OP_UUU_D, H8, H8, H8, float64_maximum_number)
4055 GEN_VEXT_VV_ENV(vfmax_vv_h, 2)
4056 GEN_VEXT_VV_ENV(vfmax_vv_w, 4)
4057 GEN_VEXT_VV_ENV(vfmax_vv_d, 8)
4058 RVVCALL(OPFVF2, vfmax_vf_h, OP_UUU_H, H2, H2, float16_maximum_number)
4059 RVVCALL(OPFVF2, vfmax_vf_w, OP_UUU_W, H4, H4, float32_maximum_number)
4060 RVVCALL(OPFVF2, vfmax_vf_d, OP_UUU_D, H8, H8, float64_maximum_number)
4061 GEN_VEXT_VF(vfmax_vf_h, 2)
4062 GEN_VEXT_VF(vfmax_vf_w, 4)
4063 GEN_VEXT_VF(vfmax_vf_d, 8)
4064
4065 /* Vector Floating-Point Sign-Injection Instructions */
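/*
 * Sign injection is pure bit manipulation: the result keeps the
 * magnitude bits of the first helper operand and takes its sign bit
 * from the second operand (inverted for fsgnjn, XORed with the first
 * operand's sign for fsgnjx).  No FP exception flags are raised.
 */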
4066 static uint16_t fsgnj16(uint16_t a, uint16_t b, float_status *s)
4067 {
4068 return deposit64(b, 0, 15, a);
4069 }
4070
4071 static uint32_t fsgnj32(uint32_t a, uint32_t b, float_status *s)
4072 {
4073 return deposit64(b, 0, 31, a);
4074 }
4075
4076 static uint64_t fsgnj64(uint64_t a, uint64_t b, float_status *s)
4077 {
4078 return deposit64(b, 0, 63, a);
4079 }
4080
4081 RVVCALL(OPFVV2, vfsgnj_vv_h, OP_UUU_H, H2, H2, H2, fsgnj16)
4082 RVVCALL(OPFVV2, vfsgnj_vv_w, OP_UUU_W, H4, H4, H4, fsgnj32)
4083 RVVCALL(OPFVV2, vfsgnj_vv_d, OP_UUU_D, H8, H8, H8, fsgnj64)
4084 GEN_VEXT_VV_ENV(vfsgnj_vv_h, 2)
4085 GEN_VEXT_VV_ENV(vfsgnj_vv_w, 4)
4086 GEN_VEXT_VV_ENV(vfsgnj_vv_d, 8)
4087 RVVCALL(OPFVF2, vfsgnj_vf_h, OP_UUU_H, H2, H2, fsgnj16)
4088 RVVCALL(OPFVF2, vfsgnj_vf_w, OP_UUU_W, H4, H4, fsgnj32)
4089 RVVCALL(OPFVF2, vfsgnj_vf_d, OP_UUU_D, H8, H8, fsgnj64)
4090 GEN_VEXT_VF(vfsgnj_vf_h, 2)
4091 GEN_VEXT_VF(vfsgnj_vf_w, 4)
4092 GEN_VEXT_VF(vfsgnj_vf_d, 8)
4093
4094 static uint16_t fsgnjn16(uint16_t a, uint16_t b, float_status *s)
4095 {
4096 return deposit64(~b, 0, 15, a);
4097 }
4098
4099 static uint32_t fsgnjn32(uint32_t a, uint32_t b, float_status *s)
4100 {
4101 return deposit64(~b, 0, 31, a);
4102 }
4103
4104 static uint64_t fsgnjn64(uint64_t a, uint64_t b, float_status *s)
4105 {
4106 return deposit64(~b, 0, 63, a);
4107 }
4108
4109 RVVCALL(OPFVV2, vfsgnjn_vv_h, OP_UUU_H, H2, H2, H2, fsgnjn16)
4110 RVVCALL(OPFVV2, vfsgnjn_vv_w, OP_UUU_W, H4, H4, H4, fsgnjn32)
4111 RVVCALL(OPFVV2, vfsgnjn_vv_d, OP_UUU_D, H8, H8, H8, fsgnjn64)
4112 GEN_VEXT_VV_ENV(vfsgnjn_vv_h, 2)
4113 GEN_VEXT_VV_ENV(vfsgnjn_vv_w, 4)
4114 GEN_VEXT_VV_ENV(vfsgnjn_vv_d, 8)
4115 RVVCALL(OPFVF2, vfsgnjn_vf_h, OP_UUU_H, H2, H2, fsgnjn16)
4116 RVVCALL(OPFVF2, vfsgnjn_vf_w, OP_UUU_W, H4, H4, fsgnjn32)
4117 RVVCALL(OPFVF2, vfsgnjn_vf_d, OP_UUU_D, H8, H8, fsgnjn64)
4118 GEN_VEXT_VF(vfsgnjn_vf_h, 2)
4119 GEN_VEXT_VF(vfsgnjn_vf_w, 4)
4120 GEN_VEXT_VF(vfsgnjn_vf_d, 8)
4121
4122 static uint16_t fsgnjx16(uint16_t a, uint16_t b, float_status *s)
4123 {
4124 return deposit64(b ^ a, 0, 15, a);
4125 }
4126
4127 static uint32_t fsgnjx32(uint32_t a, uint32_t b, float_status *s)
4128 {
4129 return deposit64(b ^ a, 0, 31, a);
4130 }
4131
4132 static uint64_t fsgnjx64(uint64_t a, uint64_t b, float_status *s)
4133 {
4134 return deposit64(b ^ a, 0, 63, a);
4135 }
4136
4137 RVVCALL(OPFVV2, vfsgnjx_vv_h, OP_UUU_H, H2, H2, H2, fsgnjx16)
4138 RVVCALL(OPFVV2, vfsgnjx_vv_w, OP_UUU_W, H4, H4, H4, fsgnjx32)
4139 RVVCALL(OPFVV2, vfsgnjx_vv_d, OP_UUU_D, H8, H8, H8, fsgnjx64)
4140 GEN_VEXT_VV_ENV(vfsgnjx_vv_h, 2)
4141 GEN_VEXT_VV_ENV(vfsgnjx_vv_w, 4)
4142 GEN_VEXT_VV_ENV(vfsgnjx_vv_d, 8)
4143 RVVCALL(OPFVF2, vfsgnjx_vf_h, OP_UUU_H, H2, H2, fsgnjx16)
4144 RVVCALL(OPFVF2, vfsgnjx_vf_w, OP_UUU_W, H4, H4, fsgnjx32)
4145 RVVCALL(OPFVF2, vfsgnjx_vf_d, OP_UUU_D, H8, H8, fsgnjx64)
4146 GEN_VEXT_VF(vfsgnjx_vf_h, 2)
4147 GEN_VEXT_VF(vfsgnjx_vf_w, 4)
4148 GEN_VEXT_VF(vfsgnjx_vf_d, 8)
4149
4150 /* Vector Floating-Point Compare Instructions */
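/*
 * vmfeq/vmfne use quiet comparisons (only a signaling NaN raises the
 * invalid flag), while vmflt/vmfle/vmfgt/vmfge use signaling
 * comparisons that raise invalid for any NaN operand, as the
 * instructions require.
 */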
4151 #define GEN_VEXT_CMP_VV_ENV(NAME, ETYPE, H, DO_OP) \
4152 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \
4153 CPURISCVState *env, uint32_t desc) \
4154 { \
4155 uint32_t vm = vext_vm(desc); \
4156 uint32_t vl = env->vl; \
4157 uint32_t total_elems = env_archcpu(env)->cfg.vlen; \
4158 uint32_t vta_all_1s = vext_vta_all_1s(desc); \
4159 uint32_t i; \
4160 \
4161 for (i = env->vstart; i < vl; i++) { \
4162 ETYPE s1 = *((ETYPE *)vs1 + H(i)); \
4163 ETYPE s2 = *((ETYPE *)vs2 + H(i)); \
4164 if (!vm && !vext_elem_mask(v0, i)) { \
4165 continue; \
4166 } \
4167 vext_set_elem_mask(vd, i, \
4168 DO_OP(s2, s1, &env->fp_status)); \
4169 } \
4170 env->vstart = 0; \
4171 /* mask destination register is always tail-agnostic */ \
4172 /* set tail elements to 1s */ \
4173 if (vta_all_1s) { \
4174 for (; i < total_elems; i++) { \
4175 vext_set_elem_mask(vd, i, 1); \
4176 } \
4177 } \
4178 }
4179
4180 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_h, uint16_t, H2, float16_eq_quiet)
4181 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_w, uint32_t, H4, float32_eq_quiet)
4182 GEN_VEXT_CMP_VV_ENV(vmfeq_vv_d, uint64_t, H8, float64_eq_quiet)
4183
4184 #define GEN_VEXT_CMP_VF(NAME, ETYPE, H, DO_OP) \
4185 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4186 CPURISCVState *env, uint32_t desc) \
4187 { \
4188 uint32_t vm = vext_vm(desc); \
4189 uint32_t vl = env->vl; \
4190 uint32_t total_elems = env_archcpu(env)->cfg.vlen; \
4191 uint32_t vta_all_1s = vext_vta_all_1s(desc); \
4192 uint32_t i; \
4193 \
4194 for (i = env->vstart; i < vl; i++) { \
4195 ETYPE s2 = *((ETYPE *)vs2 + H(i)); \
4196 if (!vm && !vext_elem_mask(v0, i)) { \
4197 continue; \
4198 } \
4199 vext_set_elem_mask(vd, i, \
4200 DO_OP(s2, (ETYPE)s1, &env->fp_status)); \
4201 } \
4202 env->vstart = 0; \
4203 /* mask destination register is always tail-agnostic */ \
4204 /* set tail elements to 1s */ \
4205 if (vta_all_1s) { \
4206 for (; i < total_elems; i++) { \
4207 vext_set_elem_mask(vd, i, 1); \
4208 } \
4209 } \
4210 }
4211
4212 GEN_VEXT_CMP_VF(vmfeq_vf_h, uint16_t, H2, float16_eq_quiet)
4213 GEN_VEXT_CMP_VF(vmfeq_vf_w, uint32_t, H4, float32_eq_quiet)
4214 GEN_VEXT_CMP_VF(vmfeq_vf_d, uint64_t, H8, float64_eq_quiet)
4215
4216 static bool vmfne16(uint16_t a, uint16_t b, float_status *s)
4217 {
4218 FloatRelation compare = float16_compare_quiet(a, b, s);
4219 return compare != float_relation_equal;
4220 }
4221
4222 static bool vmfne32(uint32_t a, uint32_t b, float_status *s)
4223 {
4224 FloatRelation compare = float32_compare_quiet(a, b, s);
4225 return compare != float_relation_equal;
4226 }
4227
4228 static bool vmfne64(uint64_t a, uint64_t b, float_status *s)
4229 {
4230 FloatRelation compare = float64_compare_quiet(a, b, s);
4231 return compare != float_relation_equal;
4232 }
4233
4234 GEN_VEXT_CMP_VV_ENV(vmfne_vv_h, uint16_t, H2, vmfne16)
4235 GEN_VEXT_CMP_VV_ENV(vmfne_vv_w, uint32_t, H4, vmfne32)
4236 GEN_VEXT_CMP_VV_ENV(vmfne_vv_d, uint64_t, H8, vmfne64)
4237 GEN_VEXT_CMP_VF(vmfne_vf_h, uint16_t, H2, vmfne16)
4238 GEN_VEXT_CMP_VF(vmfne_vf_w, uint32_t, H4, vmfne32)
4239 GEN_VEXT_CMP_VF(vmfne_vf_d, uint64_t, H8, vmfne64)
4240
4241 GEN_VEXT_CMP_VV_ENV(vmflt_vv_h, uint16_t, H2, float16_lt)
4242 GEN_VEXT_CMP_VV_ENV(vmflt_vv_w, uint32_t, H4, float32_lt)
4243 GEN_VEXT_CMP_VV_ENV(vmflt_vv_d, uint64_t, H8, float64_lt)
4244 GEN_VEXT_CMP_VF(vmflt_vf_h, uint16_t, H2, float16_lt)
4245 GEN_VEXT_CMP_VF(vmflt_vf_w, uint32_t, H4, float32_lt)
4246 GEN_VEXT_CMP_VF(vmflt_vf_d, uint64_t, H8, float64_lt)
4247
4248 GEN_VEXT_CMP_VV_ENV(vmfle_vv_h, uint16_t, H2, float16_le)
4249 GEN_VEXT_CMP_VV_ENV(vmfle_vv_w, uint32_t, H4, float32_le)
4250 GEN_VEXT_CMP_VV_ENV(vmfle_vv_d, uint64_t, H8, float64_le)
4251 GEN_VEXT_CMP_VF(vmfle_vf_h, uint16_t, H2, float16_le)
4252 GEN_VEXT_CMP_VF(vmfle_vf_w, uint32_t, H4, float32_le)
4253 GEN_VEXT_CMP_VF(vmfle_vf_d, uint64_t, H8, float64_le)
4254
4255 static bool vmfgt16(uint16_t a, uint16_t b, float_status *s)
4256 {
4257 FloatRelation compare = float16_compare(a, b, s);
4258 return compare == float_relation_greater;
4259 }
4260
4261 static bool vmfgt32(uint32_t a, uint32_t b, float_status *s)
4262 {
4263 FloatRelation compare = float32_compare(a, b, s);
4264 return compare == float_relation_greater;
4265 }
4266
4267 static bool vmfgt64(uint64_t a, uint64_t b, float_status *s)
4268 {
4269 FloatRelation compare = float64_compare(a, b, s);
4270 return compare == float_relation_greater;
4271 }
4272
4273 GEN_VEXT_CMP_VF(vmfgt_vf_h, uint16_t, H2, vmfgt16)
4274 GEN_VEXT_CMP_VF(vmfgt_vf_w, uint32_t, H4, vmfgt32)
4275 GEN_VEXT_CMP_VF(vmfgt_vf_d, uint64_t, H8, vmfgt64)
4276
4277 static bool vmfge16(uint16_t a, uint16_t b, float_status *s)
4278 {
4279 FloatRelation compare = float16_compare(a, b, s);
4280 return compare == float_relation_greater ||
4281 compare == float_relation_equal;
4282 }
4283
4284 static bool vmfge32(uint32_t a, uint32_t b, float_status *s)
4285 {
4286 FloatRelation compare = float32_compare(a, b, s);
4287 return compare == float_relation_greater ||
4288 compare == float_relation_equal;
4289 }
4290
4291 static bool vmfge64(uint64_t a, uint64_t b, float_status *s)
4292 {
4293 FloatRelation compare = float64_compare(a, b, s);
4294 return compare == float_relation_greater ||
4295 compare == float_relation_equal;
4296 }
4297
4298 GEN_VEXT_CMP_VF(vmfge_vf_h, uint16_t, H2, vmfge16)
4299 GEN_VEXT_CMP_VF(vmfge_vf_w, uint32_t, H4, vmfge32)
4300 GEN_VEXT_CMP_VF(vmfge_vf_d, uint64_t, H8, vmfge64)
4301
4302 /* Vector Floating-Point Classify Instruction */
4303 #define OPIVV1(NAME, TD, T2, TX2, HD, HS2, OP) \
4304 static void do_##NAME(void *vd, void *vs2, int i) \
4305 { \
4306 TX2 s2 = *((T2 *)vs2 + HS2(i)); \
4307 *((TD *)vd + HD(i)) = OP(s2); \
4308 }
4309
4310 #define GEN_VEXT_V(NAME, ESZ) \
4311 void HELPER(NAME)(void *vd, void *v0, void *vs2, \
4312 CPURISCVState *env, uint32_t desc) \
4313 { \
4314 uint32_t vm = vext_vm(desc); \
4315 uint32_t vl = env->vl; \
4316 uint32_t total_elems = \
4317 vext_get_total_elems(env, desc, ESZ); \
4318 uint32_t vta = vext_vta(desc); \
4319 uint32_t i; \
4320 \
4321 for (i = env->vstart; i < vl; i++) { \
4322 if (!vm && !vext_elem_mask(v0, i)) { \
4323 continue; \
4324 } \
4325 do_##NAME(vd, vs2, i); \
4326 } \
4327 env->vstart = 0; \
4328 /* set tail elements to 1s */ \
4329 vext_set_elems_1s(vd, vta, vl * ESZ, \
4330 total_elems * ESZ); \
4331 }
4332
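/*
 * fclass_* return the standard RISC-V FCLASS bit positions:
 * bit 0 -inf, 1 -normal, 2 -subnormal, 3 -0, 4 +0, 5 +subnormal,
 * 6 +normal, 7 +inf, 8 signaling NaN, 9 quiet NaN.
 */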
4333 target_ulong fclass_h(uint64_t frs1)
4334 {
4335 float16 f = frs1;
4336 bool sign = float16_is_neg(f);
4337
4338 if (float16_is_infinity(f)) {
4339 return sign ? 1 << 0 : 1 << 7;
4340 } else if (float16_is_zero(f)) {
4341 return sign ? 1 << 3 : 1 << 4;
4342 } else if (float16_is_zero_or_denormal(f)) {
4343 return sign ? 1 << 2 : 1 << 5;
4344 } else if (float16_is_any_nan(f)) {
4345 float_status s = { }; /* for snan_bit_is_one */
4346 return float16_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4347 } else {
4348 return sign ? 1 << 1 : 1 << 6;
4349 }
4350 }
4351
4352 target_ulong fclass_s(uint64_t frs1)
4353 {
4354 float32 f = frs1;
4355 bool sign = float32_is_neg(f);
4356
4357 if (float32_is_infinity(f)) {
4358 return sign ? 1 << 0 : 1 << 7;
4359 } else if (float32_is_zero(f)) {
4360 return sign ? 1 << 3 : 1 << 4;
4361 } else if (float32_is_zero_or_denormal(f)) {
4362 return sign ? 1 << 2 : 1 << 5;
4363 } else if (float32_is_any_nan(f)) {
4364 float_status s = { }; /* for snan_bit_is_one */
4365 return float32_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4366 } else {
4367 return sign ? 1 << 1 : 1 << 6;
4368 }
4369 }
4370
4371 target_ulong fclass_d(uint64_t frs1)
4372 {
4373 float64 f = frs1;
4374 bool sign = float64_is_neg(f);
4375
4376 if (float64_is_infinity(f)) {
4377 return sign ? 1 << 0 : 1 << 7;
4378 } else if (float64_is_zero(f)) {
4379 return sign ? 1 << 3 : 1 << 4;
4380 } else if (float64_is_zero_or_denormal(f)) {
4381 return sign ? 1 << 2 : 1 << 5;
4382 } else if (float64_is_any_nan(f)) {
4383 float_status s = { }; /* for snan_bit_is_one */
4384 return float64_is_quiet_nan(f, &s) ? 1 << 9 : 1 << 8;
4385 } else {
4386 return sign ? 1 << 1 : 1 << 6;
4387 }
4388 }
4389
4390 RVVCALL(OPIVV1, vfclass_v_h, OP_UU_H, H2, H2, fclass_h)
4391 RVVCALL(OPIVV1, vfclass_v_w, OP_UU_W, H4, H4, fclass_s)
4392 RVVCALL(OPIVV1, vfclass_v_d, OP_UU_D, H8, H8, fclass_d)
4393 GEN_VEXT_V(vfclass_v_h, 2)
4394 GEN_VEXT_V(vfclass_v_w, 4)
4395 GEN_VEXT_V(vfclass_v_d, 8)
4396
4397 /* Vector Floating-Point Merge Instruction */
4398
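/*
 * vfmerge.vfm copies the f-register scalar into elements whose mask
 * bit is set and copies the corresponding vs2 element otherwise; it is
 * a plain bit move, so no FP flags are raised.
 */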
4399 #define GEN_VFMERGE_VF(NAME, ETYPE, H) \
4400 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
4401 CPURISCVState *env, uint32_t desc) \
4402 { \
4403 uint32_t vm = vext_vm(desc); \
4404 uint32_t vl = env->vl; \
4405 uint32_t esz = sizeof(ETYPE); \
4406 uint32_t total_elems = \
4407 vext_get_total_elems(env, desc, esz); \
4408 uint32_t vta = vext_vta(desc); \
4409 uint32_t i; \
4410 \
4411 for (i = env->vstart; i < vl; i++) { \
4412 ETYPE s2 = *((ETYPE *)vs2 + H(i)); \
4413 *((ETYPE *)vd + H(i)) \
4414 = (!vm && !vext_elem_mask(v0, i) ? s2 : s1); \
4415 } \
4416 env->vstart = 0; \
4417 /* set tail elements to 1s */ \
4418 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
4419 }
4420
4421 GEN_VFMERGE_VF(vfmerge_vfm_h, int16_t, H2)
4422 GEN_VFMERGE_VF(vfmerge_vfm_w, int32_t, H4)
4423 GEN_VFMERGE_VF(vfmerge_vfm_d, int64_t, H8)
4424
4425 /* Single-Width Floating-Point/Integer Type-Convert Instructions */
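/*
 * All conversion helpers below round according to the rounding mode
 * currently held in env->fp_status and accumulate the usual softfloat
 * exception flags there.
 */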
4426 /* vfcvt.xu.f.v vd, vs2, vm # Convert float to unsigned integer. */
4427 RVVCALL(OPFVV1, vfcvt_xu_f_v_h, OP_UU_H, H2, H2, float16_to_uint16)
4428 RVVCALL(OPFVV1, vfcvt_xu_f_v_w, OP_UU_W, H4, H4, float32_to_uint32)
4429 RVVCALL(OPFVV1, vfcvt_xu_f_v_d, OP_UU_D, H8, H8, float64_to_uint64)
4430 GEN_VEXT_V_ENV(vfcvt_xu_f_v_h, 2)
4431 GEN_VEXT_V_ENV(vfcvt_xu_f_v_w, 4)
4432 GEN_VEXT_V_ENV(vfcvt_xu_f_v_d, 8)
4433
4434 /* vfcvt.x.f.v vd, vs2, vm # Convert float to signed integer. */
4435 RVVCALL(OPFVV1, vfcvt_x_f_v_h, OP_UU_H, H2, H2, float16_to_int16)
4436 RVVCALL(OPFVV1, vfcvt_x_f_v_w, OP_UU_W, H4, H4, float32_to_int32)
4437 RVVCALL(OPFVV1, vfcvt_x_f_v_d, OP_UU_D, H8, H8, float64_to_int64)
4438 GEN_VEXT_V_ENV(vfcvt_x_f_v_h, 2)
4439 GEN_VEXT_V_ENV(vfcvt_x_f_v_w, 4)
4440 GEN_VEXT_V_ENV(vfcvt_x_f_v_d, 8)
4441
4442 /* vfcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to float. */
4443 RVVCALL(OPFVV1, vfcvt_f_xu_v_h, OP_UU_H, H2, H2, uint16_to_float16)
4444 RVVCALL(OPFVV1, vfcvt_f_xu_v_w, OP_UU_W, H4, H4, uint32_to_float32)
4445 RVVCALL(OPFVV1, vfcvt_f_xu_v_d, OP_UU_D, H8, H8, uint64_to_float64)
4446 GEN_VEXT_V_ENV(vfcvt_f_xu_v_h, 2)
4447 GEN_VEXT_V_ENV(vfcvt_f_xu_v_w, 4)
4448 GEN_VEXT_V_ENV(vfcvt_f_xu_v_d, 8)
4449
4450 /* vfcvt.f.x.v vd, vs2, vm # Convert integer to float. */
4451 RVVCALL(OPFVV1, vfcvt_f_x_v_h, OP_UU_H, H2, H2, int16_to_float16)
4452 RVVCALL(OPFVV1, vfcvt_f_x_v_w, OP_UU_W, H4, H4, int32_to_float32)
4453 RVVCALL(OPFVV1, vfcvt_f_x_v_d, OP_UU_D, H8, H8, int64_to_float64)
4454 GEN_VEXT_V_ENV(vfcvt_f_x_v_h, 2)
4455 GEN_VEXT_V_ENV(vfcvt_f_x_v_w, 4)
4456 GEN_VEXT_V_ENV(vfcvt_f_x_v_d, 8)
4457
4458 /* Widening Floating-Point/Integer Type-Convert Instructions */
4459 /* (TD, T2, TX2) */
4460 #define WOP_UU_B uint16_t, uint8_t, uint8_t
4461 #define WOP_UU_H uint32_t, uint16_t, uint16_t
4462 #define WOP_UU_W uint64_t, uint32_t, uint32_t
4463 /* vfwcvt.xu.f.v vd, vs2, vm # Convert float to double-width unsigned integer. */
4464 RVVCALL(OPFVV1, vfwcvt_xu_f_v_h, WOP_UU_H, H4, H2, float16_to_uint32)
4465 RVVCALL(OPFVV1, vfwcvt_xu_f_v_w, WOP_UU_W, H8, H4, float32_to_uint64)
4466 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_h, 4)
4467 GEN_VEXT_V_ENV(vfwcvt_xu_f_v_w, 8)
4468
4469 /* vfwcvt.x.f.v vd, vs2, vm # Convert float to double-width signed integer. */
4470 RVVCALL(OPFVV1, vfwcvt_x_f_v_h, WOP_UU_H, H4, H2, float16_to_int32)
4471 RVVCALL(OPFVV1, vfwcvt_x_f_v_w, WOP_UU_W, H8, H4, float32_to_int64)
4472 GEN_VEXT_V_ENV(vfwcvt_x_f_v_h, 4)
4473 GEN_VEXT_V_ENV(vfwcvt_x_f_v_w, 8)
4474
4475 /* vfwcvt.f.xu.v vd, vs2, vm # Convert unsigned integer to double-width float */
4476 RVVCALL(OPFVV1, vfwcvt_f_xu_v_b, WOP_UU_B, H2, H1, uint8_to_float16)
4477 RVVCALL(OPFVV1, vfwcvt_f_xu_v_h, WOP_UU_H, H4, H2, uint16_to_float32)
4478 RVVCALL(OPFVV1, vfwcvt_f_xu_v_w, WOP_UU_W, H8, H4, uint32_to_float64)
4479 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_b, 2)
4480 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_h, 4)
4481 GEN_VEXT_V_ENV(vfwcvt_f_xu_v_w, 8)
4482
4483 /* vfwcvt.f.x.v vd, vs2, vm # Convert integer to double-width float. */
4484 RVVCALL(OPFVV1, vfwcvt_f_x_v_b, WOP_UU_B, H2, H1, int8_to_float16)
4485 RVVCALL(OPFVV1, vfwcvt_f_x_v_h, WOP_UU_H, H4, H2, int16_to_float32)
4486 RVVCALL(OPFVV1, vfwcvt_f_x_v_w, WOP_UU_W, H8, H4, int32_to_float64)
4487 GEN_VEXT_V_ENV(vfwcvt_f_x_v_b, 2)
4488 GEN_VEXT_V_ENV(vfwcvt_f_x_v_h, 4)
4489 GEN_VEXT_V_ENV(vfwcvt_f_x_v_w, 8)
4490
4491 /*
4492 * vfwcvt.f.f.v vd, vs2, vm
4493 * Convert single-width float to double-width float.
4494 */
4495 static uint32_t vfwcvtffv16(uint16_t a, float_status *s)
4496 {
4497 return float16_to_float32(a, true, s);
4498 }
4499
4500 RVVCALL(OPFVV1, vfwcvt_f_f_v_h, WOP_UU_H, H4, H2, vfwcvtffv16)
4501 RVVCALL(OPFVV1, vfwcvt_f_f_v_w, WOP_UU_W, H8, H4, float32_to_float64)
4502 GEN_VEXT_V_ENV(vfwcvt_f_f_v_h, 4)
4503 GEN_VEXT_V_ENV(vfwcvt_f_f_v_w, 8)
4504
4505 /* Narrowing Floating-Point/Integer Type-Convert Instructions */
4506 /* (TD, T2, TX2) */
4507 #define NOP_UU_B uint8_t, uint16_t, uint32_t
4508 #define NOP_UU_H uint16_t, uint32_t, uint32_t
4509 #define NOP_UU_W uint32_t, uint64_t, uint64_t
4510 /* vfncvt.xu.f.v vd, vs2, vm # Convert double-width float to unsigned integer. */
4511 RVVCALL(OPFVV1, vfncvt_xu_f_w_b, NOP_UU_B, H1, H2, float16_to_uint8)
4512 RVVCALL(OPFVV1, vfncvt_xu_f_w_h, NOP_UU_H, H2, H4, float32_to_uint16)
4513 RVVCALL(OPFVV1, vfncvt_xu_f_w_w, NOP_UU_W, H4, H8, float64_to_uint32)
4514 GEN_VEXT_V_ENV(vfncvt_xu_f_w_b, 1)
4515 GEN_VEXT_V_ENV(vfncvt_xu_f_w_h, 2)
4516 GEN_VEXT_V_ENV(vfncvt_xu_f_w_w, 4)
4517
4518 /* vfncvt.x.f.v vd, vs2, vm # Convert double-width float to signed integer. */
4519 RVVCALL(OPFVV1, vfncvt_x_f_w_b, NOP_UU_B, H1, H2, float16_to_int8)
4520 RVVCALL(OPFVV1, vfncvt_x_f_w_h, NOP_UU_H, H2, H4, float32_to_int16)
4521 RVVCALL(OPFVV1, vfncvt_x_f_w_w, NOP_UU_W, H4, H8, float64_to_int32)
4522 GEN_VEXT_V_ENV(vfncvt_x_f_w_b, 1)
4523 GEN_VEXT_V_ENV(vfncvt_x_f_w_h, 2)
4524 GEN_VEXT_V_ENV(vfncvt_x_f_w_w, 4)
4525
4526 /* vfncvt.f.xu.v vd, vs2, vm # Convert double-width unsigned integer to float */
4527 RVVCALL(OPFVV1, vfncvt_f_xu_w_h, NOP_UU_H, H2, H4, uint32_to_float16)
4528 RVVCALL(OPFVV1, vfncvt_f_xu_w_w, NOP_UU_W, H4, H8, uint64_to_float32)
4529 GEN_VEXT_V_ENV(vfncvt_f_xu_w_h, 2)
4530 GEN_VEXT_V_ENV(vfncvt_f_xu_w_w, 4)
4531
4532 /* vfncvt.f.x.v vd, vs2, vm # Convert double-width integer to float. */
4533 RVVCALL(OPFVV1, vfncvt_f_x_w_h, NOP_UU_H, H2, H4, int32_to_float16)
4534 RVVCALL(OPFVV1, vfncvt_f_x_w_w, NOP_UU_W, H4, H8, int64_to_float32)
4535 GEN_VEXT_V_ENV(vfncvt_f_x_w_h, 2)
4536 GEN_VEXT_V_ENV(vfncvt_f_x_w_w, 4)
4537
4538 /* vfncvt.f.f.v vd, vs2, vm # Convert double-width float to single-width float. */
4539 static uint16_t vfncvtffv16(uint32_t a, float_status *s)
4540 {
4541 return float32_to_float16(a, true, s);
4542 }
4543
4544 RVVCALL(OPFVV1, vfncvt_f_f_w_h, NOP_UU_H, H2, H4, vfncvtffv16)
4545 RVVCALL(OPFVV1, vfncvt_f_f_w_w, NOP_UU_W, H4, H8, float64_to_float32)
4546 GEN_VEXT_V_ENV(vfncvt_f_f_w_h, 2)
4547 GEN_VEXT_V_ENV(vfncvt_f_f_w_w, 4)
4548
4549 /*
4550 *** Vector Reduction Operations
4551 */
4552 /* Vector Single-Width Integer Reduction Instructions */
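/*
 * Each reduction seeds its accumulator from element 0 of vs1, folds in
 * the active elements of vs2 and writes the scalar result back to
 * element 0 of vd; the rest of vd is treated as tail.
 */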
4553 #define GEN_VEXT_RED(NAME, TD, TS2, HD, HS2, OP) \
4554 void HELPER(NAME)(void *vd, void *v0, void *vs1, \
4555 void *vs2, CPURISCVState *env, uint32_t desc) \
4556 { \
4557 uint32_t vm = vext_vm(desc); \
4558 uint32_t vl = env->vl; \
4559 uint32_t esz = sizeof(TD); \
4560 uint32_t vlenb = simd_maxsz(desc); \
4561 uint32_t vta = vext_vta(desc); \
4562 uint32_t i; \
4563 TD s1 = *((TD *)vs1 + HD(0)); \
4564 \
4565 for (i = env->vstart; i < vl; i++) { \
4566 TS2 s2 = *((TS2 *)vs2 + HS2(i)); \
4567 if (!vm && !vext_elem_mask(v0, i)) { \
4568 continue; \
4569 } \
4570 s1 = OP(s1, (TD)s2); \
4571 } \
4572 *((TD *)vd + HD(0)) = s1; \
4573 env->vstart = 0; \
4574 /* set tail elements to 1s */ \
4575 vext_set_elems_1s(vd, vta, esz, vlenb); \
4576 }
4577
4578 /* vd[0] = sum(vs1[0], vs2[*]) */
4579 GEN_VEXT_RED(vredsum_vs_b, int8_t, int8_t, H1, H1, DO_ADD)
4580 GEN_VEXT_RED(vredsum_vs_h, int16_t, int16_t, H2, H2, DO_ADD)
4581 GEN_VEXT_RED(vredsum_vs_w, int32_t, int32_t, H4, H4, DO_ADD)
4582 GEN_VEXT_RED(vredsum_vs_d, int64_t, int64_t, H8, H8, DO_ADD)
4583
4584 /* vd[0] = maxu(vs1[0], vs2[*]) */
4585 GEN_VEXT_RED(vredmaxu_vs_b, uint8_t, uint8_t, H1, H1, DO_MAX)
4586 GEN_VEXT_RED(vredmaxu_vs_h, uint16_t, uint16_t, H2, H2, DO_MAX)
4587 GEN_VEXT_RED(vredmaxu_vs_w, uint32_t, uint32_t, H4, H4, DO_MAX)
4588 GEN_VEXT_RED(vredmaxu_vs_d, uint64_t, uint64_t, H8, H8, DO_MAX)
4589
4590 /* vd[0] = max(vs1[0], vs2[*]) */
4591 GEN_VEXT_RED(vredmax_vs_b, int8_t, int8_t, H1, H1, DO_MAX)
4592 GEN_VEXT_RED(vredmax_vs_h, int16_t, int16_t, H2, H2, DO_MAX)
4593 GEN_VEXT_RED(vredmax_vs_w, int32_t, int32_t, H4, H4, DO_MAX)
4594 GEN_VEXT_RED(vredmax_vs_d, int64_t, int64_t, H8, H8, DO_MAX)
4595
4596 /* vd[0] = minu(vs1[0], vs2[*]) */
4597 GEN_VEXT_RED(vredminu_vs_b, uint8_t, uint8_t, H1, H1, DO_MIN)
4598 GEN_VEXT_RED(vredminu_vs_h, uint16_t, uint16_t, H2, H2, DO_MIN)
4599 GEN_VEXT_RED(vredminu_vs_w, uint32_t, uint32_t, H4, H4, DO_MIN)
4600 GEN_VEXT_RED(vredminu_vs_d, uint64_t, uint64_t, H8, H8, DO_MIN)
4601
4602 /* vd[0] = min(vs1[0], vs2[*]) */
4603 GEN_VEXT_RED(vredmin_vs_b, int8_t, int8_t, H1, H1, DO_MIN)
4604 GEN_VEXT_RED(vredmin_vs_h, int16_t, int16_t, H2, H2, DO_MIN)
4605 GEN_VEXT_RED(vredmin_vs_w, int32_t, int32_t, H4, H4, DO_MIN)
4606 GEN_VEXT_RED(vredmin_vs_d, int64_t, int64_t, H8, H8, DO_MIN)
4607
4608 /* vd[0] = and(vs1[0], vs2[*]) */
4609 GEN_VEXT_RED(vredand_vs_b, int8_t, int8_t, H1, H1, DO_AND)
4610 GEN_VEXT_RED(vredand_vs_h, int16_t, int16_t, H2, H2, DO_AND)
4611 GEN_VEXT_RED(vredand_vs_w, int32_t, int32_t, H4, H4, DO_AND)
4612 GEN_VEXT_RED(vredand_vs_d, int64_t, int64_t, H8, H8, DO_AND)
4613
4614 /* vd[0] = or(vs1[0], vs2[*]) */
4615 GEN_VEXT_RED(vredor_vs_b, int8_t, int8_t, H1, H1, DO_OR)
4616 GEN_VEXT_RED(vredor_vs_h, int16_t, int16_t, H2, H2, DO_OR)
4617 GEN_VEXT_RED(vredor_vs_w, int32_t, int32_t, H4, H4, DO_OR)
4618 GEN_VEXT_RED(vredor_vs_d, int64_t, int64_t, H8, H8, DO_OR)
4619
4620 /* vd[0] = xor(vs1[0], vs2[*]) */
4621 GEN_VEXT_RED(vredxor_vs_b, int8_t, int8_t, H1, H1, DO_XOR)
4622 GEN_VEXT_RED(vredxor_vs_h, int16_t, int16_t, H2, H2, DO_XOR)
4623 GEN_VEXT_RED(vredxor_vs_w, int32_t, int32_t, H4, H4, DO_XOR)
4624 GEN_VEXT_RED(vredxor_vs_d, int64_t, int64_t, H8, H8, DO_XOR)
4625
4626 /* Vector Widening Integer Reduction Instructions */
4627 /* Signed sum reduction into double-width accumulator */
4628 GEN_VEXT_RED(vwredsum_vs_b, int16_t, int8_t, H2, H1, DO_ADD)
4629 GEN_VEXT_RED(vwredsum_vs_h, int32_t, int16_t, H4, H2, DO_ADD)
4630 GEN_VEXT_RED(vwredsum_vs_w, int64_t, int32_t, H8, H4, DO_ADD)
4631
4632 /* Unsigned sum reduction into double-width accumulator */
4633 GEN_VEXT_RED(vwredsumu_vs_b, uint16_t, uint8_t, H2, H1, DO_ADD)
4634 GEN_VEXT_RED(vwredsumu_vs_h, uint32_t, uint16_t, H4, H2, DO_ADD)
4635 GEN_VEXT_RED(vwredsumu_vs_w, uint64_t, uint32_t, H8, H4, DO_ADD)
4636
4637 /* Vector Single-Width Floating-Point Reduction Instructions */
4638 #define GEN_VEXT_FRED(NAME, TD, TS2, HD, HS2, OP) \
4639 void HELPER(NAME)(void *vd, void *v0, void *vs1, \
4640 void *vs2, CPURISCVState *env, \
4641 uint32_t desc) \
4642 { \
4643 uint32_t vm = vext_vm(desc); \
4644 uint32_t vl = env->vl; \
4645 uint32_t esz = sizeof(TD); \
4646 uint32_t vlenb = simd_maxsz(desc); \
4647 uint32_t vta = vext_vta(desc); \
4648 uint32_t i; \
4649 TD s1 = *((TD *)vs1 + HD(0)); \
4650 \
4651 for (i = env->vstart; i < vl; i++) { \
4652 TS2 s2 = *((TS2 *)vs2 + HS2(i)); \
4653 if (!vm && !vext_elem_mask(v0, i)) { \
4654 continue; \
4655 } \
4656 s1 = OP(s1, (TD)s2, &env->fp_status); \
4657 } \
4658 *((TD *)vd + HD(0)) = s1; \
4659 env->vstart = 0; \
4660 /* set tail elements to 1s */ \
4661 vext_set_elems_1s(vd, vta, esz, vlenb); \
4662 }
4663
4664 /* Unordered sum */
4665 GEN_VEXT_FRED(vfredsum_vs_h, uint16_t, uint16_t, H2, H2, float16_add)
4666 GEN_VEXT_FRED(vfredsum_vs_w, uint32_t, uint32_t, H4, H4, float32_add)
4667 GEN_VEXT_FRED(vfredsum_vs_d, uint64_t, uint64_t, H8, H8, float64_add)
4668
4669 /* Maximum value */
4670 GEN_VEXT_FRED(vfredmax_vs_h, uint16_t, uint16_t, H2, H2, float16_maximum_number)
4671 GEN_VEXT_FRED(vfredmax_vs_w, uint32_t, uint32_t, H4, H4, float32_maximum_number)
4672 GEN_VEXT_FRED(vfredmax_vs_d, uint64_t, uint64_t, H8, H8, float64_maximum_number)
4673
4674 /* Minimum value */
4675 GEN_VEXT_FRED(vfredmin_vs_h, uint16_t, uint16_t, H2, H2, float16_minimum_number)
4676 GEN_VEXT_FRED(vfredmin_vs_w, uint32_t, uint32_t, H4, H4, float32_minimum_number)
4677 GEN_VEXT_FRED(vfredmin_vs_d, uint64_t, uint64_t, H8, H8, float64_minimum_number)
4678
4679 /* Vector Widening Floating-Point Reduction Instructions */
4680 /* Unordered reduce 2*SEW = 2*SEW + sum(promote(SEW)) */
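/*
 * The widening reductions are written out explicitly rather than via
 * GEN_VEXT_FRED because each active SEW element of vs2 is first
 * promoted to 2*SEW before being added to the 2*SEW accumulator.
 */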
4681 void HELPER(vfwredsum_vs_h)(void *vd, void *v0, void *vs1,
4682 void *vs2, CPURISCVState *env, uint32_t desc)
4683 {
4684 uint32_t vm = vext_vm(desc);
4685 uint32_t vl = env->vl;
4686 uint32_t esz = sizeof(uint32_t);
4687 uint32_t vlenb = simd_maxsz(desc);
4688 uint32_t vta = vext_vta(desc);
4689 uint32_t i;
4690 uint32_t s1 = *((uint32_t *)vs1 + H4(0));
4691
4692 for (i = env->vstart; i < vl; i++) {
4693 uint16_t s2 = *((uint16_t *)vs2 + H2(i));
4694 if (!vm && !vext_elem_mask(v0, i)) {
4695 continue;
4696 }
4697 s1 = float32_add(s1, float16_to_float32(s2, true, &env->fp_status),
4698 &env->fp_status);
4699 }
4700 *((uint32_t *)vd + H4(0)) = s1;
4701 env->vstart = 0;
4702 /* set tail elements to 1s */
4703 vext_set_elems_1s(vd, vta, esz, vlenb);
4704 }
4705
4706 void HELPER(vfwredsum_vs_w)(void *vd, void *v0, void *vs1,
4707 void *vs2, CPURISCVState *env, uint32_t desc)
4708 {
4709 uint32_t vm = vext_vm(desc);
4710 uint32_t vl = env->vl;
4711 uint32_t esz = sizeof(uint64_t);
4712 uint32_t vlenb = simd_maxsz(desc);
4713 uint32_t vta = vext_vta(desc);
4714 uint32_t i;
4715 uint64_t s1 = *((uint64_t *)vs1);
4716
4717 for (i = env->vstart; i < vl; i++) {
4718 uint32_t s2 = *((uint32_t *)vs2 + H4(i));
4719 if (!vm && !vext_elem_mask(v0, i)) {
4720 continue;
4721 }
4722 s1 = float64_add(s1, float32_to_float64(s2, &env->fp_status),
4723 &env->fp_status);
4724 }
4725 *((uint64_t *)vd) = s1;
4726 env->vstart = 0;
4727 /* set tail elements to 1s */
4728 vext_set_elems_1s(vd, vta, esz, vlenb);
4729 }
4730
4731 /*
4732 *** Vector Mask Operations
4733 */
4734 /* Vector Mask-Register Logical Instructions */
4735 #define GEN_VEXT_MASK_VV(NAME, OP) \
4736 void HELPER(NAME)(void *vd, void *v0, void *vs1, \
4737 void *vs2, CPURISCVState *env, \
4738 uint32_t desc) \
4739 { \
4740 uint32_t vl = env->vl; \
4741 uint32_t total_elems = env_archcpu(env)->cfg.vlen; \
4742 uint32_t vta_all_1s = vext_vta_all_1s(desc); \
4743 uint32_t i; \
4744 int a, b; \
4745 \
4746 for (i = env->vstart; i < vl; i++) { \
4747 a = vext_elem_mask(vs1, i); \
4748 b = vext_elem_mask(vs2, i); \
4749 vext_set_elem_mask(vd, i, OP(b, a)); \
4750 } \
4751 env->vstart = 0; \
4752 /* mask destination register is always tail- \
4753 * agnostic \
4754 */ \
4755 /* set tail elements to 1s */ \
4756 if (vta_all_1s) { \
4757 for (; i < total_elems; i++) { \
4758 vext_set_elem_mask(vd, i, 1); \
4759 } \
4760 } \
4761 }
4762
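/*
 * The operands here are single mask bits (0 or 1), so logical negation
 * is sufficient to complement them.
 */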
4763 #define DO_NAND(N, M) (!(N & M))
4764 #define DO_ANDNOT(N, M) (N & !M)
4765 #define DO_NOR(N, M) (!(N | M))
4766 #define DO_ORNOT(N, M) (N | !M)
4767 #define DO_XNOR(N, M) (!(N ^ M))
4768
4769 GEN_VEXT_MASK_VV(vmand_mm, DO_AND)
4770 GEN_VEXT_MASK_VV(vmnand_mm, DO_NAND)
4771 GEN_VEXT_MASK_VV(vmandn_mm, DO_ANDNOT)
4772 GEN_VEXT_MASK_VV(vmxor_mm, DO_XOR)
4773 GEN_VEXT_MASK_VV(vmor_mm, DO_OR)
4774 GEN_VEXT_MASK_VV(vmnor_mm, DO_NOR)
4775 GEN_VEXT_MASK_VV(vmorn_mm, DO_ORNOT)
4776 GEN_VEXT_MASK_VV(vmxnor_mm, DO_XNOR)
4777
4778 /* Vector count population in mask vcpop */
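/*
 * vcpop.m counts the set bits of vs2 among the active elements and
 * returns the count in a scalar register, so no tail handling is needed.
 */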
4779 target_ulong HELPER(vcpop_m)(void *v0, void *vs2, CPURISCVState *env,
4780 uint32_t desc)
4781 {
4782 target_ulong cnt = 0;
4783 uint32_t vm = vext_vm(desc);
4784 uint32_t vl = env->vl;
4785 int i;
4786
4787 for (i = env->vstart; i < vl; i++) {
4788 if (vm || vext_elem_mask(v0, i)) {
4789 if (vext_elem_mask(vs2, i)) {
4790 cnt++;
4791 }
4792 }
4793 }
4794 env->vstart = 0;
4795 return cnt;
4796 }
4797
4798 /* vfirst find-first-set mask bit */
4799 target_ulong HELPER(vfirst_m)(void *v0, void *vs2, CPURISCVState *env,
4800 uint32_t desc)
4801 {
4802 uint32_t vm = vext_vm(desc);
4803 uint32_t vl = env->vl;
4804 int i;
4805
4806 for (i = env->vstart; i < vl; i++) {
4807 if (vm || vext_elem_mask(v0, i)) {
4808 if (vext_elem_mask(vs2, i)) {
4809 return i;
4810 }
4811 }
4812 }
4813 env->vstart = 0;
4814 return -1LL;
4815 }
4816
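/*
 * vmsetm() below implements the three set-X-first mask instructions,
 * selected by set_mask_type:
 *   BEFORE_FIRST  - vmsbf.m: set active bits before the first set bit
 *   INCLUDE_FIRST - vmsif.m: set active bits up to and including it
 *   ONLY_FIRST    - vmsof.m: set only the first set bit itself
 */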
4817 enum set_mask_type {
4818 ONLY_FIRST = 1,
4819 INCLUDE_FIRST,
4820 BEFORE_FIRST,
4821 };
4822
4823 static void vmsetm(void *vd, void *v0, void *vs2, CPURISCVState *env,
4824 uint32_t desc, enum set_mask_type type)
4825 {
4826 uint32_t vm = vext_vm(desc);
4827 uint32_t vl = env->vl;
4828 uint32_t total_elems = env_archcpu(env)->cfg.vlen;
4829 uint32_t vta_all_1s = vext_vta_all_1s(desc);
4830 int i;
4831 bool first_mask_bit = false;
4832
4833 for (i = env->vstart; i < vl; i++) {
4834 if (!vm && !vext_elem_mask(v0, i)) {
4835 continue;
4836 }
4837 /* write a zero to all following active elements */
4838 if (first_mask_bit) {
4839 vext_set_elem_mask(vd, i, 0);
4840 continue;
4841 }
4842 if (vext_elem_mask(vs2, i)) {
4843 first_mask_bit = true;
4844 if (type == BEFORE_FIRST) {
4845 vext_set_elem_mask(vd, i, 0);
4846 } else {
4847 vext_set_elem_mask(vd, i, 1);
4848 }
4849 } else {
4850 if (type == ONLY_FIRST) {
4851 vext_set_elem_mask(vd, i, 0);
4852 } else {
4853 vext_set_elem_mask(vd, i, 1);
4854 }
4855 }
4856 }
4857 env->vstart = 0;
4858 /* mask destination register is always tail-agnostic */
4859 /* set tail elements to 1s */
4860 if (vta_all_1s) {
4861 for (; i < total_elems; i++) {
4862 vext_set_elem_mask(vd, i, 1);
4863 }
4864 }
4865 }
4866
4867 void HELPER(vmsbf_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4868 uint32_t desc)
4869 {
4870 vmsetm(vd, v0, vs2, env, desc, BEFORE_FIRST);
4871 }
4872
4873 void HELPER(vmsif_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4874 uint32_t desc)
4875 {
4876 vmsetm(vd, v0, vs2, env, desc, INCLUDE_FIRST);
4877 }
4878
4879 void HELPER(vmsof_m)(void *vd, void *v0, void *vs2, CPURISCVState *env,
4880 uint32_t desc)
4881 {
4882 vmsetm(vd, v0, vs2, env, desc, ONLY_FIRST);
4883 }
4884
4885 /* Vector Iota Instruction */
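/*
 * viota.m writes to each active element the number of mask bits set in
 * vs2 at element positions strictly below it (an exclusive prefix sum).
 * E.g. with all elements active and vs2 = {1, 0, 1, 1} the result is
 * vd = {0, 1, 1, 2}.
 */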
4886 #define GEN_VEXT_VIOTA_M(NAME, ETYPE, H) \
4887 void HELPER(NAME)(void *vd, void *v0, void *vs2, CPURISCVState *env, \
4888 uint32_t desc) \
4889 { \
4890 uint32_t vm = vext_vm(desc); \
4891 uint32_t vl = env->vl; \
4892 uint32_t esz = sizeof(ETYPE); \
4893 uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
4894 uint32_t vta = vext_vta(desc); \
4895 uint32_t sum = 0; \
4896 int i; \
4897 \
4898 for (i = env->vstart; i < vl; i++) { \
4899 if (!vm && !vext_elem_mask(v0, i)) { \
4900 continue; \
4901 } \
4902 *((ETYPE *)vd + H(i)) = sum; \
4903 if (vext_elem_mask(vs2, i)) { \
4904 sum++; \
4905 } \
4906 } \
4907 env->vstart = 0; \
4908 /* set tail elements to 1s */ \
4909 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
4910 }
4911
4912 GEN_VEXT_VIOTA_M(viota_m_b, uint8_t, H1)
4913 GEN_VEXT_VIOTA_M(viota_m_h, uint16_t, H2)
4914 GEN_VEXT_VIOTA_M(viota_m_w, uint32_t, H4)
4915 GEN_VEXT_VIOTA_M(viota_m_d, uint64_t, H8)
4916
4917 /* Vector Element Index Instruction */
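/* vid.v simply writes the element index i to every active element of vd. */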
4918 #define GEN_VEXT_VID_V(NAME, ETYPE, H) \
4919 void HELPER(NAME)(void *vd, void *v0, CPURISCVState *env, uint32_t desc) \
4920 { \
4921 uint32_t vm = vext_vm(desc); \
4922 uint32_t vl = env->vl; \
4923 uint32_t esz = sizeof(ETYPE); \
4924 uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
4925 uint32_t vta = vext_vta(desc); \
4926 int i; \
4927 \
4928 for (i = env->vstart; i < vl; i++) { \
4929 if (!vm && !vext_elem_mask(v0, i)) { \
4930 continue; \
4931 } \
4932 *((ETYPE *)vd + H(i)) = i; \
4933 } \
4934 env->vstart = 0; \
4935 /* set tail elements to 1s */ \
4936 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
4937 }
4938
4939 GEN_VEXT_VID_V(vid_v_b, uint8_t, H1)
4940 GEN_VEXT_VID_V(vid_v_h, uint16_t, H2)
4941 GEN_VEXT_VID_V(vid_v_w, uint32_t, H4)
4942 GEN_VEXT_VID_V(vid_v_d, uint64_t, H8)
4943
4944 /*
4945 *** Vector Permutation Instructions
4946 */
4947
4948 /* Vector Slide Instructions */
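/*
 * vslideup.vx writes vs2[i - OFFSET] to active elements vd[i] with
 * i >= OFFSET, leaving vd[0..OFFSET-1] undisturbed; hence the loop below
 * starts at MAX(vstart, offset).  vslidedown.vx writes vs2[i + OFFSET] to
 * vd[i], substituting zero once the source index would run past vlmax.
 */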
4949 #define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H) \
4950 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
4951 CPURISCVState *env, uint32_t desc) \
4952 { \
4953 uint32_t vm = vext_vm(desc); \
4954 uint32_t vl = env->vl; \
4955 uint32_t esz = sizeof(ETYPE); \
4956 uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
4957 uint32_t vta = vext_vta(desc); \
4958 target_ulong offset = s1, i_min, i; \
4959 \
4960 i_min = MAX(env->vstart, offset); \
4961 for (i = i_min; i < vl; i++) { \
4962 if (!vm && !vext_elem_mask(v0, i)) { \
4963 continue; \
4964 } \
4965 *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset)); \
4966 } \
env->vstart = 0; \
4967 /* set tail elements to 1s */ \
4968 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
4969 }
4970
4971 /* vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i] */
4972 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_b, uint8_t, H1)
4973 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_h, uint16_t, H2)
4974 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_w, uint32_t, H4)
4975 GEN_VEXT_VSLIDEUP_VX(vslideup_vx_d, uint64_t, H8)
4976
4977 #define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H) \
4978 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
4979 CPURISCVState *env, uint32_t desc) \
4980 { \
4981 uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE))); \
4982 uint32_t vm = vext_vm(desc); \
4983 uint32_t vl = env->vl; \
4984 uint32_t esz = sizeof(ETYPE); \
4985 uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
4986 uint32_t vta = vext_vta(desc); \
4987 target_ulong i_max, i; \
4988 \
4989 i_max = MAX(MIN(s1 < vlmax ? vlmax - s1 : 0, vl), env->vstart); \
4990 for (i = env->vstart; i < i_max; ++i) { \
4991 if (vm || vext_elem_mask(v0, i)) { \
4992 *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + s1)); \
4993 } \
4994 } \
4995 \
4996 for (i = i_max; i < vl; ++i) { \
4997 if (vm || vext_elem_mask(v0, i)) { \
4998 *((ETYPE *)vd + H(i)) = 0; \
4999 } \
5000 } \
5001 \
5002 env->vstart = 0; \
5003 /* set tail elements to 1s */ \
5004 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
5005 }
5006
5007 /* vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1] */
5008 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_b, uint8_t, H1)
5009 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_h, uint16_t, H2)
5010 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_w, uint32_t, H4)
5011 GEN_VEXT_VSLIDEDOWN_VX(vslidedown_vx_d, uint64_t, H8)
5012
5013 #define GEN_VEXT_VSLIDE1UP(BITWIDTH, H) \
5014 static void vslide1up_##BITWIDTH(void *vd, void *v0, target_ulong s1, \
5015 void *vs2, CPURISCVState *env, uint32_t desc) \
5016 { \
5017 typedef uint##BITWIDTH##_t ETYPE; \
5018 uint32_t vm = vext_vm(desc); \
5019 uint32_t vl = env->vl; \
5020 uint32_t esz = sizeof(ETYPE); \
5021 uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5022 uint32_t vta = vext_vta(desc); \
5023 uint32_t i; \
5024 \
5025 for (i = env->vstart; i < vl; i++) { \
5026 if (!vm && !vext_elem_mask(v0, i)) { \
5027 continue; \
5028 } \
5029 if (i == 0) { \
5030 *((ETYPE *)vd + H(i)) = s1; \
5031 } else { \
5032 *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1)); \
5033 } \
5034 } \
5035 env->vstart = 0; \
5036 /* set tail elements to 1s */ \
5037 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
5038 }
5039
5040 GEN_VEXT_VSLIDE1UP(8, H1)
5041 GEN_VEXT_VSLIDE1UP(16, H2)
5042 GEN_VEXT_VSLIDE1UP(32, H4)
5043 GEN_VEXT_VSLIDE1UP(64, H8)
5044
5045 #define GEN_VEXT_VSLIDE1UP_VX(NAME, BITWIDTH) \
5046 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5047 CPURISCVState *env, uint32_t desc) \
5048 { \
5049 vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc); \
5050 }
5051
5052 /* vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1] = vs2[i] */
5053 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_b, 8)
5054 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_h, 16)
5055 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_w, 32)
5056 GEN_VEXT_VSLIDE1UP_VX(vslide1up_vx_d, 64)
5057
5058 #define GEN_VEXT_VSLIDE1DOWN(BITWIDTH, H) \
5059 static void vslide1down_##BITWIDTH(void *vd, void *v0, target_ulong s1, \
5060 void *vs2, CPURISCVState *env, uint32_t desc) \
5061 { \
5062 typedef uint##BITWIDTH##_t ETYPE; \
5063 uint32_t vm = vext_vm(desc); \
5064 uint32_t vl = env->vl; \
5065 uint32_t esz = sizeof(ETYPE); \
5066 uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5067 uint32_t vta = vext_vta(desc); \
5068 uint32_t i; \
5069 \
5070 for (i = env->vstart; i < vl; i++) { \
5071 if (!vm && !vext_elem_mask(v0, i)) { \
5072 continue; \
5073 } \
5074 if (i == vl - 1) { \
5075 *((ETYPE *)vd + H(i)) = s1; \
5076 } else { \
5077 *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + 1)); \
5078 } \
5079 } \
5080 env->vstart = 0; \
5081 /* set tail elements to 1s */ \
5082 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
5083 }
5084
5085 GEN_VEXT_VSLIDE1DOWN(8, H1)
5086 GEN_VEXT_VSLIDE1DOWN(16, H2)
5087 GEN_VEXT_VSLIDE1DOWN(32, H4)
5088 GEN_VEXT_VSLIDE1DOWN(64, H8)
5089
5090 #define GEN_VEXT_VSLIDE1DOWN_VX(NAME, BITWIDTH) \
5091 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5092 CPURISCVState *env, uint32_t desc) \
5093 { \
5094 vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc); \
5095 }
5096
5097 /* vslide1down.vx vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=x[rs1] */
5098 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_b, 8)
5099 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_h, 16)
5100 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_w, 32)
5101 GEN_VEXT_VSLIDE1DOWN_VX(vslide1down_vx_d, 64)
5102
5103 /* Vector Floating-Point Slide Instructions */
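/*
 * The floating-point variants reuse the integer slide1 helpers above:
 * f[rs1] arrives as a raw bit pattern in s1, so no floating-point
 * arithmetic or conversion is needed here.
 */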
5104 #define GEN_VEXT_VFSLIDE1UP_VF(NAME, BITWIDTH) \
5105 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5106 CPURISCVState *env, uint32_t desc) \
5107 { \
5108 vslide1up_##BITWIDTH(vd, v0, s1, vs2, env, desc); \
5109 }
5110
5111 /* vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1] = vs2[i] */
5112 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_h, 16)
5113 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_w, 32)
5114 GEN_VEXT_VFSLIDE1UP_VF(vfslide1up_vf_d, 64)
5115
5116 #define GEN_VEXT_VFSLIDE1DOWN_VF(NAME, BITWIDTH) \
5117 void HELPER(NAME)(void *vd, void *v0, uint64_t s1, void *vs2, \
5118 CPURISCVState *env, uint32_t desc) \
5119 { \
5120 vslide1down_##BITWIDTH(vd, v0, s1, vs2, env, desc); \
5121 }
5122
5123 /* vfslide1down.vf vd, vs2, rs1, vm # vd[i] = vs2[i+1], vd[vl-1]=f[rs1] */
5124 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_h, 16)
5125 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_w, 32)
5126 GEN_VEXT_VFSLIDE1DOWN_VF(vfslide1down_vf_d, 64)
5127
5128 /* Vector Register Gather Instruction */
5129 #define GEN_VEXT_VRGATHER_VV(NAME, TS1, TS2, HS1, HS2) \
5130 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \
5131 CPURISCVState *env, uint32_t desc) \
5132 { \
5133 uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(TS2))); \
5134 uint32_t vm = vext_vm(desc); \
5135 uint32_t vl = env->vl; \
5136 uint32_t esz = sizeof(TS2); \
5137 uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5138 uint32_t vta = vext_vta(desc); \
5139 uint64_t index; \
5140 uint32_t i; \
5141 \
5142 for (i = env->vstart; i < vl; i++) { \
5143 if (!vm && !vext_elem_mask(v0, i)) { \
5144 continue; \
5145 } \
5146 index = *((TS1 *)vs1 + HS1(i)); \
5147 if (index >= vlmax) { \
5148 *((TS2 *)vd + HS2(i)) = 0; \
5149 } else { \
5150 *((TS2 *)vd + HS2(i)) = *((TS2 *)vs2 + HS2(index)); \
5151 } \
5152 } \
5153 env->vstart = 0; \
5154 /* set tail elements to 1s */ \
5155 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
5156 }
5157
5158 /* vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]; */
5159 GEN_VEXT_VRGATHER_VV(vrgather_vv_b, uint8_t, uint8_t, H1, H1)
5160 GEN_VEXT_VRGATHER_VV(vrgather_vv_h, uint16_t, uint16_t, H2, H2)
5161 GEN_VEXT_VRGATHER_VV(vrgather_vv_w, uint32_t, uint32_t, H4, H4)
5162 GEN_VEXT_VRGATHER_VV(vrgather_vv_d, uint64_t, uint64_t, H8, H8)
5163
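/*
 * vrgatherei16.vv always reads its indices as 16-bit elements (TS1),
 * independent of the data element width (TS2).
 */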
5164 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_b, uint16_t, uint8_t, H2, H1)
5165 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_h, uint16_t, uint16_t, H2, H2)
5166 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_w, uint16_t, uint32_t, H2, H4)
5167 GEN_VEXT_VRGATHER_VV(vrgatherei16_vv_d, uint16_t, uint64_t, H2, H8)
5168
5169 #define GEN_VEXT_VRGATHER_VX(NAME, ETYPE, H) \
5170 void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \
5171 CPURISCVState *env, uint32_t desc) \
5172 { \
5173 uint32_t vlmax = vext_max_elems(desc, ctzl(sizeof(ETYPE))); \
5174 uint32_t vm = vext_vm(desc); \
5175 uint32_t vl = env->vl; \
5176 uint32_t esz = sizeof(ETYPE); \
5177 uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5178 uint32_t vta = vext_vta(desc); \
5179 uint64_t index = s1; \
5180 uint32_t i; \
5181 \
5182 for (i = env->vstart; i < vl; i++) { \
5183 if (!vm && !vext_elem_mask(v0, i)) { \
5184 continue; \
5185 } \
5186 if (index >= vlmax) { \
5187 *((ETYPE *)vd + H(i)) = 0; \
5188 } else { \
5189 *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(index)); \
5190 } \
5191 } \
5192 env->vstart = 0; \
5193 /* set tail elements to 1s */ \
5194 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
5195 }
5196
5197 /* vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]] */
5198 GEN_VEXT_VRGATHER_VX(vrgather_vx_b, uint8_t, H1)
5199 GEN_VEXT_VRGATHER_VX(vrgather_vx_h, uint16_t, H2)
5200 GEN_VEXT_VRGATHER_VX(vrgather_vx_w, uint32_t, H4)
5201 GEN_VEXT_VRGATHER_VX(vrgather_vx_d, uint64_t, H8)
5202
5203 /* Vector Compress Instruction */
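/*
 * vcompress.vm is always unmasked (v0 is unused); vs1 supplies the mask.
 * Selected elements of vs2 are packed into consecutive elements starting
 * at vd[0], tracked by the num counter.
 */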
5204 #define GEN_VEXT_VCOMPRESS_VM(NAME, ETYPE, H) \
5205 void HELPER(NAME)(void *vd, void *v0, void *vs1, void *vs2, \
5206 CPURISCVState *env, uint32_t desc) \
5207 { \
5208 uint32_t vl = env->vl; \
5209 uint32_t esz = sizeof(ETYPE); \
5210 uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5211 uint32_t vta = vext_vta(desc); \
5212 uint32_t num = 0, i; \
5213 \
5214 for (i = env->vstart; i < vl; i++) { \
5215 if (!vext_elem_mask(vs1, i)) { \
5216 continue; \
5217 } \
5218 *((ETYPE *)vd + H(num)) = *((ETYPE *)vs2 + H(i)); \
5219 num++; \
5220 } \
5221 env->vstart = 0; \
5222 /* set tail elements to 1s */ \
5223 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
5224 }
5225
5226 /* Compress into vd elements of vs2 where vs1 is enabled */
5227 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_b, uint8_t, H1)
5228 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_h, uint16_t, H2)
5229 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_w, uint32_t, H4)
5230 GEN_VEXT_VCOMPRESS_VM(vcompress_vm_d, uint64_t, H8)
5231
5232 /* Vector Whole Register Move */
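/*
 * vmv<nr>r.v copies whole vector registers.  Copying starts at byte
 * vstart * (SEW / 8) so that a move interrupted by a trap can resume
 * where it left off.
 */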
5233 void HELPER(vmvr_v)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
5234 {
5235 /* EEW = SEW */
5236 uint32_t maxsz = simd_maxsz(desc);
5237 uint32_t sewb = 1 << FIELD_EX64(env->vtype, VTYPE, VSEW);
5238 uint32_t startb = env->vstart * sewb;
5239 uint32_t i = startb;
5240
5241 memcpy((uint8_t *)vd + H1(i),
5242 (uint8_t *)vs2 + H1(i),
5243 maxsz - startb);
5244
5245 env->vstart = 0;
5246 }
5247
5248 /* Vector Integer Extension */
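/*
 * vzext.vf<N> / vsext.vf<N> widen each DTYPE source element of vs2 by a
 * factor of N (2, 4 or 8) into an ETYPE destination element; whether the
 * value is zero- or sign-extended follows from the unsigned or signed
 * types used in the instantiations below.
 */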
5249 #define GEN_VEXT_INT_EXT(NAME, ETYPE, DTYPE, HD, HS1) \
5250 void HELPER(NAME)(void *vd, void *v0, void *vs2, \
5251 CPURISCVState *env, uint32_t desc) \
5252 { \
5253 uint32_t vl = env->vl; \
5254 uint32_t vm = vext_vm(desc); \
5255 uint32_t esz = sizeof(ETYPE); \
5256 uint32_t total_elems = vext_get_total_elems(env, desc, esz); \
5257 uint32_t vta = vext_vta(desc); \
5258 uint32_t i; \
5259 \
5260 for (i = env->vstart; i < vl; i++) { \
5261 if (!vm && !vext_elem_mask(v0, i)) { \
5262 continue; \
5263 } \
5264 *((ETYPE *)vd + HD(i)) = *((DTYPE *)vs2 + HS1(i)); \
5265 } \
5266 env->vstart = 0; \
5267 /* set tail elements to 1s */ \
5268 vext_set_elems_1s(vd, vta, vl * esz, total_elems * esz); \
5269 }
5270
5271 GEN_VEXT_INT_EXT(vzext_vf2_h, uint16_t, uint8_t, H2, H1)
5272 GEN_VEXT_INT_EXT(vzext_vf2_w, uint32_t, uint16_t, H4, H2)
5273 GEN_VEXT_INT_EXT(vzext_vf2_d, uint64_t, uint32_t, H8, H4)
5274 GEN_VEXT_INT_EXT(vzext_vf4_w, uint32_t, uint8_t, H4, H1)
5275 GEN_VEXT_INT_EXT(vzext_vf4_d, uint64_t, uint16_t, H8, H2)
5276 GEN_VEXT_INT_EXT(vzext_vf8_d, uint64_t, uint8_t, H8, H1)
5277
5278 GEN_VEXT_INT_EXT(vsext_vf2_h, int16_t, int8_t, H2, H1)
5279 GEN_VEXT_INT_EXT(vsext_vf2_w, int32_t, int16_t, H4, H2)
5280 GEN_VEXT_INT_EXT(vsext_vf2_d, int64_t, int32_t, H8, H4)
5281 GEN_VEXT_INT_EXT(vsext_vf4_w, int32_t, int8_t, H4, H1)
5282 GEN_VEXT_INT_EXT(vsext_vf4_d, int64_t, int16_t, H8, H2)
5283 GEN_VEXT_INT_EXT(vsext_vf8_d, int64_t, int8_t, H8, H1)