]>
git.proxmox.com Git - mirror_qemu.git/blob - tests/tcg/hexagon/scatter_gather.c
2 * Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, see <http://www.gnu.org/licenses/>.
19 * This example tests the HVX scatter/gather instructions
21 * See section 5.13 of the V68 HVX Programmer's Reference
23 * There are 3 main classes of operations
24 * _16 16-bit elements and 16-bit offsets
25 * _32 32-bit elements and 32-bit offsets
26 * _16_32 16-bit elements and 32-bit offsets
28 * There are also masked and accumulate versions
36 typedef long HVX_Vector
__attribute__((__vector_size__(128)))
37 __attribute__((aligned(128)));
38 typedef long HVX_VectorPair
__attribute__((__vector_size__(256)))
39 __attribute__((aligned(128)));
40 typedef long HVX_VectorPred
__attribute__((__vector_size__(128)))
41 __attribute__((aligned(128)));
/*
 * Shorthand for the 128-byte HVX scatter builtins.
 *   BASE  address of the scatter region; cast to int for the builtin
 *   RGN   region length argument (callers in this file pass region_len)
 *   OFF   vector of offsets (a vector pair for the _16_32 forms)
 *   VALS  vector of values
 *   MASK  vector predicate, masked forms only
 * The _ACC forms map to the *_add builtins.
 * Every argument is parenthesised so that an expression argument is cast
 * or passed as a whole.
 */
#define VSCATTER_16(BASE, RGN, OFF, VALS) \
    __builtin_HEXAGON_V6_vscattermh_128B((int)(BASE), (RGN), (OFF), (VALS))
#define VSCATTER_16_MASKED(MASK, BASE, RGN, OFF, VALS) \
    __builtin_HEXAGON_V6_vscattermhq_128B((MASK), (int)(BASE), (RGN), (OFF), \
                                          (VALS))
#define VSCATTER_32(BASE, RGN, OFF, VALS) \
    __builtin_HEXAGON_V6_vscattermw_128B((int)(BASE), (RGN), (OFF), (VALS))
#define VSCATTER_32_MASKED(MASK, BASE, RGN, OFF, VALS) \
    __builtin_HEXAGON_V6_vscattermwq_128B((MASK), (int)(BASE), (RGN), (OFF), \
                                          (VALS))
#define VSCATTER_16_32(BASE, RGN, OFF, VALS) \
    __builtin_HEXAGON_V6_vscattermhw_128B((int)(BASE), (RGN), (OFF), (VALS))
#define VSCATTER_16_32_MASKED(MASK, BASE, RGN, OFF, VALS) \
    __builtin_HEXAGON_V6_vscattermhwq_128B((MASK), (int)(BASE), (RGN), (OFF), \
                                           (VALS))
#define VSCATTER_16_ACC(BASE, RGN, OFF, VALS) \
    __builtin_HEXAGON_V6_vscattermh_add_128B((int)(BASE), (RGN), (OFF), (VALS))
#define VSCATTER_32_ACC(BASE, RGN, OFF, VALS) \
    __builtin_HEXAGON_V6_vscattermw_add_128B((int)(BASE), (RGN), (OFF), (VALS))
#define VSCATTER_16_32_ACC(BASE, RGN, OFF, VALS) \
    __builtin_HEXAGON_V6_vscattermhw_add_128B((int)(BASE), (RGN), (OFF), \
                                              (VALS))

/*
 * Shorthand for the 128-byte HVX gather builtins.
 *   DSTADDR  pointer to the vector that receives the gathered elements
 *   MASK     vector predicate, masked forms only
 *   BASE, RGN, OFF  as for the scatter macros
 */
#define VGATHER_16(DSTADDR, BASE, RGN, OFF) \
    __builtin_HEXAGON_V6_vgathermh_128B((DSTADDR), (int)(BASE), (RGN), (OFF))
#define VGATHER_16_MASKED(DSTADDR, MASK, BASE, RGN, OFF) \
    __builtin_HEXAGON_V6_vgathermhq_128B((DSTADDR), (MASK), (int)(BASE), \
                                         (RGN), (OFF))
#define VGATHER_32(DSTADDR, BASE, RGN, OFF) \
    __builtin_HEXAGON_V6_vgathermw_128B((DSTADDR), (int)(BASE), (RGN), (OFF))
#define VGATHER_32_MASKED(DSTADDR, MASK, BASE, RGN, OFF) \
    __builtin_HEXAGON_V6_vgathermwq_128B((DSTADDR), (MASK), (int)(BASE), \
                                         (RGN), (OFF))
#define VGATHER_16_32(DSTADDR, BASE, RGN, OFF) \
    __builtin_HEXAGON_V6_vgathermhw_128B((DSTADDR), (int)(BASE), (RGN), (OFF))
#define VGATHER_16_32_MASKED(DSTADDR, MASK, BASE, RGN, OFF) \
    __builtin_HEXAGON_V6_vgathermhwq_128B((DSTADDR), (MASK), (int)(BASE), \
                                          (RGN), (OFF))
76 __builtin_HEXAGON_V6_vshuffh_128B(V)
78 __builtin_HEXAGON_V6_lvsplath_128B(X)
79 #define VAND_VAL(PRED, VAL) \
80 __builtin_HEXAGON_V6_vandvrt_128B(PRED, VAL)
82 __builtin_HEXAGON_V6_vdealh_128B(V)
86 /* define the number of rows/cols in a square matrix */
87 #define MATRIX_SIZE 64
89 /* define the size of the scatter buffer */
90 #define SCATTER_BUFFER_SIZE (MATRIX_SIZE * MATRIX_SIZE)
92 /* fake vtcm - put buffers together and force alignment */
94 unsigned short vscatter16
[SCATTER_BUFFER_SIZE
];
95 unsigned short vgather16
[MATRIX_SIZE
];
96 unsigned int vscatter32
[SCATTER_BUFFER_SIZE
];
97 unsigned int vgather32
[MATRIX_SIZE
];
98 unsigned short vscatter16_32
[SCATTER_BUFFER_SIZE
];
99 unsigned short vgather16_32
[MATRIX_SIZE
];
100 } vtcm
__attribute__((aligned(0x10000)));
102 /* declare the arrays of reference values */
103 unsigned short vscatter16_ref
[SCATTER_BUFFER_SIZE
];
104 unsigned short vgather16_ref
[MATRIX_SIZE
];
105 unsigned int vscatter32_ref
[SCATTER_BUFFER_SIZE
];
106 unsigned int vgather32_ref
[MATRIX_SIZE
];
107 unsigned short vscatter16_32_ref
[SCATTER_BUFFER_SIZE
];
108 unsigned short vgather16_32_ref
[MATRIX_SIZE
];
110 /* declare the arrays of offsets */
111 unsigned short half_offsets
[MATRIX_SIZE
];
112 unsigned int word_offsets
[MATRIX_SIZE
];
114 /* declare the arrays of values */
115 unsigned short half_values
[MATRIX_SIZE
];
116 unsigned short half_values_acc
[MATRIX_SIZE
];
117 unsigned short half_values_masked
[MATRIX_SIZE
];
118 unsigned int word_values
[MATRIX_SIZE
];
119 unsigned int word_values_acc
[MATRIX_SIZE
];
120 unsigned int word_values_masked
[MATRIX_SIZE
];
122 /* declare the arrays of predicates */
123 unsigned short half_predicates
[MATRIX_SIZE
];
124 unsigned int word_predicates
[MATRIX_SIZE
];
126 /* make this big enough for all the intrinsics */
127 const size_t region_len
= sizeof(vtcm
);
129 /* optionally add sync instructions */
130 #define SYNC_VECTOR 1
132 static void sync_scatter(void *addr
)
136 * Do the scatter release followed by a dummy load to complete the
137 * synchronization. Normally the dummy load would be deferred as
138 * long as possible to minimize stalls.
140 asm volatile("vmem(%0 + #0):scatter_release\n" : : "r"(addr
));
141 /* use volatile to force the load */
142 volatile HVX_Vector vDummy
= *(HVX_Vector
*)addr
; vDummy
= vDummy
;
146 static void sync_gather(void *addr
)
149 /* use volatile to force the load */
150 volatile HVX_Vector vDummy
= *(HVX_Vector
*)addr
; vDummy
= vDummy
;
154 /* optionally print the results */
157 #define FILL_CHAR '.'
159 /* fill vtcm scratch with ee */
160 void prefill_vtcm_scratch(void)
162 memset(&vtcm
, FILL_CHAR
, sizeof(vtcm
));
165 /* create byte offsets to be a diagonal of the matrix with 16 bit elements */
166 void create_offsets_values_preds_16(void)
168 unsigned short half_element
= 0;
169 unsigned short half_element_masked
= 0;
171 char letter_masked
= '@';
173 for (int i
= 0; i
< MATRIX_SIZE
; i
++) {
174 half_offsets
[i
] = i
* (2 * MATRIX_SIZE
+ 2);
177 half_element_masked
= 0;
178 for (int j
= 0; j
< 2; j
++) {
179 half_element
|= letter
<< j
* 8;
180 half_element_masked
|= letter_masked
<< j
* 8;
183 half_values
[i
] = half_element
;
184 half_values_acc
[i
] = ((i
% 10) << 8) + (i
% 10);
185 half_values_masked
[i
] = half_element_masked
;
193 half_predicates
[i
] = (i
% 3 == 0 || i
% 5 == 0) ? ~0 : 0;
197 /* create byte offsets to be a diagonal of the matrix with 32 bit elements */
198 void create_offsets_values_preds_32(void)
200 unsigned int word_element
= 0;
201 unsigned int word_element_masked
= 0;
203 char letter_masked
= '&';
205 for (int i
= 0; i
< MATRIX_SIZE
; i
++) {
206 word_offsets
[i
] = i
* (4 * MATRIX_SIZE
+ 4);
209 word_element_masked
= 0;
210 for (int j
= 0; j
< 4; j
++) {
211 word_element
|= letter
<< j
* 8;
212 word_element_masked
|= letter_masked
<< j
* 8;
215 word_values
[i
] = word_element
;
216 word_values_acc
[i
] = ((i
% 10) << 8) + (i
% 10);
217 word_values_masked
[i
] = word_element_masked
;
225 word_predicates
[i
] = (i
% 4 == 0 || i
% 7 == 0) ? ~0 : 0;
230 * create byte offsets to be a diagonal of the matrix with 16 bit elements
233 void create_offsets_values_preds_16_32(void)
235 unsigned short half_element
= 0;
236 unsigned short half_element_masked
= 0;
238 char letter_masked
= '$';
240 for (int i
= 0; i
< MATRIX_SIZE
; i
++) {
241 word_offsets
[i
] = i
* (2 * MATRIX_SIZE
+ 2);
244 half_element_masked
= 0;
245 for (int j
= 0; j
< 2; j
++) {
246 half_element
|= letter
<< j
* 8;
247 half_element_masked
|= letter_masked
<< j
* 8;
250 half_values
[i
] = half_element
;
251 half_values_acc
[i
] = ((i
% 10) << 8) + (i
% 10);
252 half_values_masked
[i
] = half_element_masked
;
260 half_predicates
[i
] = (i
% 2 == 0 || i
% 13 == 0) ? ~0 : 0;
264 /* scatter the 16 bit elements using intrinsics */
265 void vector_scatter_16(void)
267 /* copy the offsets and values to vectors */
268 HVX_Vector offsets
= *(HVX_Vector
*)half_offsets
;
269 HVX_Vector values
= *(HVX_Vector
*)half_values
;
271 VSCATTER_16(&vtcm
.vscatter16
, region_len
, offsets
, values
);
273 sync_scatter(vtcm
.vscatter16
);
276 /* scatter-accumulate the 16 bit elements using intrinsics */
277 void vector_scatter_16_acc(void)
279 /* copy the offsets and values to vectors */
280 HVX_Vector offsets
= *(HVX_Vector
*)half_offsets
;
281 HVX_Vector values
= *(HVX_Vector
*)half_values_acc
;
283 VSCATTER_16_ACC(&vtcm
.vscatter16
, region_len
, offsets
, values
);
285 sync_scatter(vtcm
.vscatter16
);
288 /* scatter the 16 bit elements using intrinsics */
289 void vector_scatter_16_masked(void)
291 /* copy the offsets and values to vectors */
292 HVX_Vector offsets
= *(HVX_Vector
*)half_offsets
;
293 HVX_Vector values
= *(HVX_Vector
*)half_values_masked
;
294 HVX_Vector pred_reg
= *(HVX_Vector
*)half_predicates
;
295 HVX_VectorPred preds
= VAND_VAL(pred_reg
, ~0);
297 VSCATTER_16_MASKED(preds
, &vtcm
.vscatter16
, region_len
, offsets
, values
);
299 sync_scatter(vtcm
.vscatter16
);
302 /* scatter the 32 bit elements using intrinsics */
303 void vector_scatter_32(void)
305 /* copy the offsets and values to vectors */
306 HVX_Vector offsetslo
= *(HVX_Vector
*)word_offsets
;
307 HVX_Vector offsetshi
= *(HVX_Vector
*)&word_offsets
[MATRIX_SIZE
/ 2];
308 HVX_Vector valueslo
= *(HVX_Vector
*)word_values
;
309 HVX_Vector valueshi
= *(HVX_Vector
*)&word_values
[MATRIX_SIZE
/ 2];
311 VSCATTER_32(&vtcm
.vscatter32
, region_len
, offsetslo
, valueslo
);
312 VSCATTER_32(&vtcm
.vscatter32
, region_len
, offsetshi
, valueshi
);
314 sync_scatter(vtcm
.vscatter32
);
317 /* scatter-acc the 32 bit elements using intrinsics */
318 void vector_scatter_32_acc(void)
320 /* copy the offsets and values to vectors */
321 HVX_Vector offsetslo
= *(HVX_Vector
*)word_offsets
;
322 HVX_Vector offsetshi
= *(HVX_Vector
*)&word_offsets
[MATRIX_SIZE
/ 2];
323 HVX_Vector valueslo
= *(HVX_Vector
*)word_values_acc
;
324 HVX_Vector valueshi
= *(HVX_Vector
*)&word_values_acc
[MATRIX_SIZE
/ 2];
326 VSCATTER_32_ACC(&vtcm
.vscatter32
, region_len
, offsetslo
, valueslo
);
327 VSCATTER_32_ACC(&vtcm
.vscatter32
, region_len
, offsetshi
, valueshi
);
329 sync_scatter(vtcm
.vscatter32
);
332 /* scatter the 32 bit elements using intrinsics */
333 void vector_scatter_32_masked(void)
335 /* copy the offsets and values to vectors */
336 HVX_Vector offsetslo
= *(HVX_Vector
*)word_offsets
;
337 HVX_Vector offsetshi
= *(HVX_Vector
*)&word_offsets
[MATRIX_SIZE
/ 2];
338 HVX_Vector valueslo
= *(HVX_Vector
*)word_values_masked
;
339 HVX_Vector valueshi
= *(HVX_Vector
*)&word_values_masked
[MATRIX_SIZE
/ 2];
340 HVX_Vector pred_reglo
= *(HVX_Vector
*)word_predicates
;
341 HVX_Vector pred_reghi
= *(HVX_Vector
*)&word_predicates
[MATRIX_SIZE
/ 2];
342 HVX_VectorPred predslo
= VAND_VAL(pred_reglo
, ~0);
343 HVX_VectorPred predshi
= VAND_VAL(pred_reghi
, ~0);
345 VSCATTER_32_MASKED(predslo
, &vtcm
.vscatter32
, region_len
, offsetslo
,
347 VSCATTER_32_MASKED(predshi
, &vtcm
.vscatter32
, region_len
, offsetshi
,
350 sync_scatter(vtcm
.vscatter16
);
353 /* scatter the 16 bit elements with 32 bit offsets using intrinsics */
354 void vector_scatter_16_32(void)
356 HVX_VectorPair offsets
;
359 /* get the word offsets in a vector pair */
360 offsets
= *(HVX_VectorPair
*)word_offsets
;
362 /* these values need to be shuffled for the scatter */
363 values
= *(HVX_Vector
*)half_values
;
364 values
= VSHUFF_H(values
);
366 VSCATTER_16_32(&vtcm
.vscatter16_32
, region_len
, offsets
, values
);
368 sync_scatter(vtcm
.vscatter16_32
);
371 /* scatter-acc the 16 bit elements with 32 bit offsets using intrinsics */
372 void vector_scatter_16_32_acc(void)
374 HVX_VectorPair offsets
;
377 /* get the word offsets in a vector pair */
378 offsets
= *(HVX_VectorPair
*)word_offsets
;
380 /* these values need to be shuffled for the scatter */
381 values
= *(HVX_Vector
*)half_values_acc
;
382 values
= VSHUFF_H(values
);
384 VSCATTER_16_32_ACC(&vtcm
.vscatter16_32
, region_len
, offsets
, values
);
386 sync_scatter(vtcm
.vscatter16_32
);
389 /* masked scatter the 16 bit elements with 32 bit offsets using intrinsics */
390 void vector_scatter_16_32_masked(void)
392 HVX_VectorPair offsets
;
396 /* get the word offsets in a vector pair */
397 offsets
= *(HVX_VectorPair
*)word_offsets
;
399 /* these values need to be shuffled for the scatter */
400 values
= *(HVX_Vector
*)half_values_masked
;
401 values
= VSHUFF_H(values
);
403 pred_reg
= *(HVX_Vector
*)half_predicates
;
404 pred_reg
= VSHUFF_H(pred_reg
);
405 HVX_VectorPred preds
= VAND_VAL(pred_reg
, ~0);
407 VSCATTER_16_32_MASKED(preds
, &vtcm
.vscatter16_32
, region_len
, offsets
,
410 sync_scatter(vtcm
.vscatter16_32
);
413 /* gather the elements from the scatter16 buffer */
414 void vector_gather_16(void)
416 HVX_Vector
*vgather
= (HVX_Vector
*)&vtcm
.vgather16
;
417 HVX_Vector offsets
= *(HVX_Vector
*)half_offsets
;
419 VGATHER_16(vgather
, &vtcm
.vscatter16
, region_len
, offsets
);
421 sync_gather(vgather
);
424 static unsigned short gather_16_masked_init(void)
427 return letter
| (letter
<< 8);
430 void vector_gather_16_masked(void)
432 HVX_Vector
*vgather
= (HVX_Vector
*)&vtcm
.vgather16
;
433 HVX_Vector offsets
= *(HVX_Vector
*)half_offsets
;
434 HVX_Vector pred_reg
= *(HVX_Vector
*)half_predicates
;
435 HVX_VectorPred preds
= VAND_VAL(pred_reg
, ~0);
437 *vgather
= VSPLAT_H(gather_16_masked_init());
438 VGATHER_16_MASKED(vgather
, preds
, &vtcm
.vscatter16
, region_len
, offsets
);
440 sync_gather(vgather
);
443 /* gather the elements from the scatter32 buffer */
444 void vector_gather_32(void)
446 HVX_Vector
*vgatherlo
= (HVX_Vector
*)&vtcm
.vgather32
;
447 HVX_Vector
*vgatherhi
=
448 (HVX_Vector
*)((int)&vtcm
.vgather32
+ (MATRIX_SIZE
* 2));
449 HVX_Vector offsetslo
= *(HVX_Vector
*)word_offsets
;
450 HVX_Vector offsetshi
= *(HVX_Vector
*)&word_offsets
[MATRIX_SIZE
/ 2];
452 VGATHER_32(vgatherlo
, &vtcm
.vscatter32
, region_len
, offsetslo
);
453 VGATHER_32(vgatherhi
, &vtcm
.vscatter32
, region_len
, offsetshi
);
455 sync_gather(vgatherhi
);
458 static unsigned int gather_32_masked_init(void)
461 return letter
| (letter
<< 8) | (letter
<< 16) | (letter
<< 24);
464 void vector_gather_32_masked(void)
466 HVX_Vector
*vgatherlo
= (HVX_Vector
*)&vtcm
.vgather32
;
467 HVX_Vector
*vgatherhi
=
468 (HVX_Vector
*)((int)&vtcm
.vgather32
+ (MATRIX_SIZE
* 2));
469 HVX_Vector offsetslo
= *(HVX_Vector
*)word_offsets
;
470 HVX_Vector offsetshi
= *(HVX_Vector
*)&word_offsets
[MATRIX_SIZE
/ 2];
471 HVX_Vector pred_reglo
= *(HVX_Vector
*)word_predicates
;
472 HVX_VectorPred predslo
= VAND_VAL(pred_reglo
, ~0);
473 HVX_Vector pred_reghi
= *(HVX_Vector
*)&word_predicates
[MATRIX_SIZE
/ 2];
474 HVX_VectorPred predshi
= VAND_VAL(pred_reghi
, ~0);
476 *vgatherlo
= VSPLAT_H(gather_32_masked_init());
477 *vgatherhi
= VSPLAT_H(gather_32_masked_init());
478 VGATHER_32_MASKED(vgatherlo
, predslo
, &vtcm
.vscatter32
, region_len
,
480 VGATHER_32_MASKED(vgatherhi
, predshi
, &vtcm
.vscatter32
, region_len
,
483 sync_gather(vgatherlo
);
484 sync_gather(vgatherhi
);
487 /* gather the elements from the scatter16_32 buffer */
488 void vector_gather_16_32(void)
491 HVX_VectorPair offsets
;
494 /* get the vtcm address to gather from */
495 vgather
= (HVX_Vector
*)&vtcm
.vgather16_32
;
497 /* get the word offsets in a vector pair */
498 offsets
= *(HVX_VectorPair
*)word_offsets
;
500 VGATHER_16_32(vgather
, &vtcm
.vscatter16_32
, region_len
, offsets
);
502 /* deal the elements to get the order back */
503 values
= *(HVX_Vector
*)vgather
;
504 values
= VDEAL_H(values
);
506 /* write it back to vtcm address */
507 *(HVX_Vector
*)vgather
= values
;
510 void vector_gather_16_32_masked(void)
513 HVX_VectorPair offsets
;
515 HVX_VectorPred preds
;
518 /* get the vtcm address to gather from */
519 vgather
= (HVX_Vector
*)&vtcm
.vgather16_32
;
521 /* get the word offsets in a vector pair */
522 offsets
= *(HVX_VectorPair
*)word_offsets
;
523 pred_reg
= *(HVX_Vector
*)half_predicates
;
524 pred_reg
= VSHUFF_H(pred_reg
);
525 preds
= VAND_VAL(pred_reg
, ~0);
527 *vgather
= VSPLAT_H(gather_16_masked_init());
528 VGATHER_16_32_MASKED(vgather
, preds
, &vtcm
.vscatter16_32
, region_len
,
531 /* deal the elements to get the order back */
532 values
= *(HVX_Vector
*)vgather
;
533 values
= VDEAL_H(values
);
535 /* write it back to vtcm address */
536 *(HVX_Vector
*)vgather
= values
;
539 static void check_buffer(const char *name
, void *c
, void *r
, size_t size
)
541 char *check
= (char *)c
;
542 char *ref
= (char *)r
;
543 for (int i
= 0; i
< size
; i
++) {
544 if (check
[i
] != ref
[i
]) {
545 printf("ERROR %s [%d]: 0x%x (%c) != 0x%x (%c)\n", name
, i
,
546 check
[i
], check
[i
], ref
[i
], ref
[i
]);
553 * These scalar functions are the C equivalents of the vector functions that
557 /* scatter the 16 bit elements using C */
558 void scalar_scatter_16(unsigned short *vscatter16
)
560 for (int i
= 0; i
< MATRIX_SIZE
; ++i
) {
561 vscatter16
[half_offsets
[i
] / 2] = half_values
[i
];
565 void check_scatter_16()
567 memset(vscatter16_ref
, FILL_CHAR
,
568 SCATTER_BUFFER_SIZE
* sizeof(unsigned short));
569 scalar_scatter_16(vscatter16_ref
);
570 check_buffer(__func__
, vtcm
.vscatter16
, vscatter16_ref
,
571 SCATTER_BUFFER_SIZE
* sizeof(unsigned short));
574 /* scatter the 16 bit elements using C */
575 void scalar_scatter_16_acc(unsigned short *vscatter16
)
577 for (int i
= 0; i
< MATRIX_SIZE
; ++i
) {
578 vscatter16
[half_offsets
[i
] / 2] += half_values_acc
[i
];
582 void check_scatter_16_acc()
584 memset(vscatter16_ref
, FILL_CHAR
,
585 SCATTER_BUFFER_SIZE
* sizeof(unsigned short));
586 scalar_scatter_16(vscatter16_ref
);
587 scalar_scatter_16_acc(vscatter16_ref
);
588 check_buffer(__func__
, vtcm
.vscatter16
, vscatter16_ref
,
589 SCATTER_BUFFER_SIZE
* sizeof(unsigned short));
592 /* scatter the 16 bit elements using C */
593 void scalar_scatter_16_masked(unsigned short *vscatter16
)
595 for (int i
= 0; i
< MATRIX_SIZE
; i
++) {
596 if (half_predicates
[i
]) {
597 vscatter16
[half_offsets
[i
] / 2] = half_values_masked
[i
];
603 void check_scatter_16_masked()
605 memset(vscatter16_ref
, FILL_CHAR
,
606 SCATTER_BUFFER_SIZE
* sizeof(unsigned short));
607 scalar_scatter_16(vscatter16_ref
);
608 scalar_scatter_16_acc(vscatter16_ref
);
609 scalar_scatter_16_masked(vscatter16_ref
);
610 check_buffer(__func__
, vtcm
.vscatter16
, vscatter16_ref
,
611 SCATTER_BUFFER_SIZE
* sizeof(unsigned short));
614 /* scatter the 32 bit elements using C */
615 void scalar_scatter_32(unsigned int *vscatter32
)
617 for (int i
= 0; i
< MATRIX_SIZE
; ++i
) {
618 vscatter32
[word_offsets
[i
] / 4] = word_values
[i
];
622 void check_scatter_32()
624 memset(vscatter32_ref
, FILL_CHAR
,
625 SCATTER_BUFFER_SIZE
* sizeof(unsigned int));
626 scalar_scatter_32(vscatter32_ref
);
627 check_buffer(__func__
, vtcm
.vscatter32
, vscatter32_ref
,
628 SCATTER_BUFFER_SIZE
* sizeof(unsigned int));
631 /* scatter the 32 bit elements using C */
632 void scalar_scatter_32_acc(unsigned int *vscatter32
)
634 for (int i
= 0; i
< MATRIX_SIZE
; ++i
) {
635 vscatter32
[word_offsets
[i
] / 4] += word_values_acc
[i
];
639 void check_scatter_32_acc()
641 memset(vscatter32_ref
, FILL_CHAR
,
642 SCATTER_BUFFER_SIZE
* sizeof(unsigned int));
643 scalar_scatter_32(vscatter32_ref
);
644 scalar_scatter_32_acc(vscatter32_ref
);
645 check_buffer(__func__
, vtcm
.vscatter32
, vscatter32_ref
,
646 SCATTER_BUFFER_SIZE
* sizeof(unsigned int));
649 /* scatter the 32 bit elements using C */
650 void scalar_scatter_32_masked(unsigned int *vscatter32
)
652 for (int i
= 0; i
< MATRIX_SIZE
; i
++) {
653 if (word_predicates
[i
]) {
654 vscatter32
[word_offsets
[i
] / 4] = word_values_masked
[i
];
659 void check_scatter_32_masked()
661 memset(vscatter32_ref
, FILL_CHAR
,
662 SCATTER_BUFFER_SIZE
* sizeof(unsigned int));
663 scalar_scatter_32(vscatter32_ref
);
664 scalar_scatter_32_acc(vscatter32_ref
);
665 scalar_scatter_32_masked(vscatter32_ref
);
666 check_buffer(__func__
, vtcm
.vscatter32
, vscatter32_ref
,
667 SCATTER_BUFFER_SIZE
* sizeof(unsigned int));
670 /* scatter the 32 bit elements using C */
671 void scalar_scatter_16_32(unsigned short *vscatter16_32
)
673 for (int i
= 0; i
< MATRIX_SIZE
; ++i
) {
674 vscatter16_32
[word_offsets
[i
] / 2] = half_values
[i
];
678 void check_scatter_16_32()
680 memset(vscatter16_32_ref
, FILL_CHAR
,
681 SCATTER_BUFFER_SIZE
* sizeof(unsigned short));
682 scalar_scatter_16_32(vscatter16_32_ref
);
683 check_buffer(__func__
, vtcm
.vscatter16_32
, vscatter16_32_ref
,
684 SCATTER_BUFFER_SIZE
* sizeof(unsigned short));
687 /* scatter the 32 bit elements using C */
688 void scalar_scatter_16_32_acc(unsigned short *vscatter16_32
)
690 for (int i
= 0; i
< MATRIX_SIZE
; ++i
) {
691 vscatter16_32
[word_offsets
[i
] / 2] += half_values_acc
[i
];
695 void check_scatter_16_32_acc()
697 memset(vscatter16_32_ref
, FILL_CHAR
,
698 SCATTER_BUFFER_SIZE
* sizeof(unsigned short));
699 scalar_scatter_16_32(vscatter16_32_ref
);
700 scalar_scatter_16_32_acc(vscatter16_32_ref
);
701 check_buffer(__func__
, vtcm
.vscatter16_32
, vscatter16_32_ref
,
702 SCATTER_BUFFER_SIZE
* sizeof(unsigned short));
705 void scalar_scatter_16_32_masked(unsigned short *vscatter16_32
)
707 for (int i
= 0; i
< MATRIX_SIZE
; i
++) {
708 if (half_predicates
[i
]) {
709 vscatter16_32
[word_offsets
[i
] / 2] = half_values_masked
[i
];
714 void check_scatter_16_32_masked()
716 memset(vscatter16_32_ref
, FILL_CHAR
,
717 SCATTER_BUFFER_SIZE
* sizeof(unsigned short));
718 scalar_scatter_16_32(vscatter16_32_ref
);
719 scalar_scatter_16_32_acc(vscatter16_32_ref
);
720 scalar_scatter_16_32_masked(vscatter16_32_ref
);
721 check_buffer(__func__
, vtcm
.vscatter16_32
, vscatter16_32_ref
,
722 SCATTER_BUFFER_SIZE
* sizeof(unsigned short));
725 /* gather the elements from the scatter buffer using C */
726 void scalar_gather_16(unsigned short *vgather16
)
728 for (int i
= 0; i
< MATRIX_SIZE
; ++i
) {
729 vgather16
[i
] = vtcm
.vscatter16
[half_offsets
[i
] / 2];
733 void check_gather_16()
735 memset(vgather16_ref
, 0, MATRIX_SIZE
* sizeof(unsigned short));
736 scalar_gather_16(vgather16_ref
);
737 check_buffer(__func__
, vtcm
.vgather16
, vgather16_ref
,
738 MATRIX_SIZE
* sizeof(unsigned short));
741 void scalar_gather_16_masked(unsigned short *vgather16
)
743 for (int i
= 0; i
< MATRIX_SIZE
; ++i
) {
744 if (half_predicates
[i
]) {
745 vgather16
[i
] = vtcm
.vscatter16
[half_offsets
[i
] / 2];
750 void check_gather_16_masked()
752 memset(vgather16_ref
, gather_16_masked_init(),
753 MATRIX_SIZE
* sizeof(unsigned short));
754 scalar_gather_16_masked(vgather16_ref
);
755 check_buffer(__func__
, vtcm
.vgather16
, vgather16_ref
,
756 MATRIX_SIZE
* sizeof(unsigned short));
759 /* gather the elements from the scatter buffer using C */
760 void scalar_gather_32(unsigned int *vgather32
)
762 for (int i
= 0; i
< MATRIX_SIZE
; ++i
) {
763 vgather32
[i
] = vtcm
.vscatter32
[word_offsets
[i
] / 4];
767 void check_gather_32(void)
769 memset(vgather32_ref
, 0, MATRIX_SIZE
* sizeof(unsigned int));
770 scalar_gather_32(vgather32_ref
);
771 check_buffer(__func__
, vtcm
.vgather32
, vgather32_ref
,
772 MATRIX_SIZE
* sizeof(unsigned int));
775 void scalar_gather_32_masked(unsigned int *vgather32
)
777 for (int i
= 0; i
< MATRIX_SIZE
; ++i
) {
778 if (word_predicates
[i
]) {
779 vgather32
[i
] = vtcm
.vscatter32
[word_offsets
[i
] / 4];
785 void check_gather_32_masked(void)
787 memset(vgather32_ref
, gather_32_masked_init(),
788 MATRIX_SIZE
* sizeof(unsigned int));
789 scalar_gather_32_masked(vgather32_ref
);
790 check_buffer(__func__
, vtcm
.vgather32
,
791 vgather32_ref
, MATRIX_SIZE
* sizeof(unsigned int));
794 /* gather the elements from the scatter buffer using C */
795 void scalar_gather_16_32(unsigned short *vgather16_32
)
797 for (int i
= 0; i
< MATRIX_SIZE
; ++i
) {
798 vgather16_32
[i
] = vtcm
.vscatter16_32
[word_offsets
[i
] / 2];
802 void check_gather_16_32(void)
804 memset(vgather16_32_ref
, 0, MATRIX_SIZE
* sizeof(unsigned short));
805 scalar_gather_16_32(vgather16_32_ref
);
806 check_buffer(__func__
, vtcm
.vgather16_32
, vgather16_32_ref
,
807 MATRIX_SIZE
* sizeof(unsigned short));
810 void scalar_gather_16_32_masked(unsigned short *vgather16_32
)
812 for (int i
= 0; i
< MATRIX_SIZE
; ++i
) {
813 if (half_predicates
[i
]) {
814 vgather16_32
[i
] = vtcm
.vscatter16_32
[word_offsets
[i
] / 2];
820 void check_gather_16_32_masked(void)
822 memset(vgather16_32_ref
, gather_16_masked_init(),
823 MATRIX_SIZE
* sizeof(unsigned short));
824 scalar_gather_16_32_masked(vgather16_32_ref
);
825 check_buffer(__func__
, vtcm
.vgather16_32
, vgather16_32_ref
,
826 MATRIX_SIZE
* sizeof(unsigned short));
829 /* print scatter16 buffer */
830 void print_scatter16_buffer(void)
833 printf("\n\nPrinting the 16 bit scatter buffer");
835 for (int i
= 0; i
< SCATTER_BUFFER_SIZE
; i
++) {
836 if ((i
% MATRIX_SIZE
) == 0) {
839 for (int j
= 0; j
< 2; j
++) {
840 printf("%c", (char)((vtcm
.vscatter16
[i
] >> j
* 8) & 0xff));
848 /* print the gather 16 buffer */
849 void print_gather_result_16(void)
852 printf("\n\nPrinting the 16 bit gather result\n");
854 for (int i
= 0; i
< MATRIX_SIZE
; i
++) {
855 for (int j
= 0; j
< 2; j
++) {
856 printf("%c", (char)((vtcm
.vgather16
[i
] >> j
* 8) & 0xff));
864 /* print the scatter32 buffer */
865 void print_scatter32_buffer(void)
868 printf("\n\nPrinting the 32 bit scatter buffer");
870 for (int i
= 0; i
< SCATTER_BUFFER_SIZE
; i
++) {
871 if ((i
% MATRIX_SIZE
) == 0) {
874 for (int j
= 0; j
< 4; j
++) {
875 printf("%c", (char)((vtcm
.vscatter32
[i
] >> j
* 8) & 0xff));
883 /* print the gather 32 buffer */
884 void print_gather_result_32(void)
887 printf("\n\nPrinting the 32 bit gather result\n");
889 for (int i
= 0; i
< MATRIX_SIZE
; i
++) {
890 for (int j
= 0; j
< 4; j
++) {
891 printf("%c", (char)((vtcm
.vgather32
[i
] >> j
* 8) & 0xff));
899 /* print the scatter16_32 buffer */
900 void print_scatter16_32_buffer(void)
903 printf("\n\nPrinting the 16_32 bit scatter buffer");
905 for (int i
= 0; i
< SCATTER_BUFFER_SIZE
; i
++) {
906 if ((i
% MATRIX_SIZE
) == 0) {
909 for (int j
= 0; j
< 2; j
++) {
911 (unsigned char)((vtcm
.vscatter16_32
[i
] >> j
* 8) & 0xff));
919 /* print the gather 16_32 buffer */
920 void print_gather_result_16_32(void)
923 printf("\n\nPrinting the 16_32 bit gather result\n");
925 for (int i
= 0; i
< MATRIX_SIZE
; i
++) {
926 for (int j
= 0; j
< 2; j
++) {
928 (unsigned char)((vtcm
.vgather16_32
[i
] >> j
* 8) & 0xff));
938 prefill_vtcm_scratch();
940 /* 16 bit elements with 16 bit offsets */
941 create_offsets_values_preds_16();
944 print_scatter16_buffer();
948 print_gather_result_16();
951 vector_gather_16_masked();
952 print_gather_result_16();
953 check_gather_16_masked();
955 vector_scatter_16_acc();
956 print_scatter16_buffer();
957 check_scatter_16_acc();
959 vector_scatter_16_masked();
960 print_scatter16_buffer();
961 check_scatter_16_masked();
963 /* 32 bit elements with 32 bit offsets */
964 create_offsets_values_preds_32();
967 print_scatter32_buffer();
971 print_gather_result_32();
974 vector_gather_32_masked();
975 print_gather_result_32();
976 check_gather_32_masked();
978 vector_scatter_32_acc();
979 print_scatter32_buffer();
980 check_scatter_32_acc();
982 vector_scatter_32_masked();
983 print_scatter32_buffer();
984 check_scatter_32_masked();
986 /* 16 bit elements with 32 bit offsets */
987 create_offsets_values_preds_16_32();
989 vector_scatter_16_32();
990 print_scatter16_32_buffer();
991 check_scatter_16_32();
993 vector_gather_16_32();
994 print_gather_result_16_32();
995 check_gather_16_32();
997 vector_gather_16_32_masked();
998 print_gather_result_16_32();
999 check_gather_16_32_masked();
1001 vector_scatter_16_32_acc();
1002 print_scatter16_32_buffer();
1003 check_scatter_16_32_acc();
1005 vector_scatter_16_32_masked();
1006 print_scatter16_32_buffer();
1007 check_scatter_16_32_masked();
1009 puts(err
? "FAIL" : "PASS");