]> git.proxmox.com Git - mirror_qemu.git/blob - tests/tcg/hexagon/scatter_gather.c
virtio: fix reachable assertion due to stale value of cached region size
[mirror_qemu.git] / tests / tcg / hexagon / scatter_gather.c
1 /*
2 * Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, see <http://www.gnu.org/licenses/>.
16 */
17
18 /*
19 * This example tests the HVX scatter/gather instructions
20 *
21 * See section 5.13 of the V68 HVX Programmer's Reference
22 *
23 * There are 3 main classes operations
24 * _16 16-bit elements and 16-bit offsets
25 * _32 32-bit elements and 32-bit offsets
26 * _16_32 16-bit elements and 32-bit offsets
27 *
28 * There are also masked and accumulate versions
29 */
30
31 #include <stdio.h>
32 #include <string.h>
33 #include <stdlib.h>
34 #include <inttypes.h>
35
/* 128-byte vector type, 128-byte aligned; operand type of the HVX builtins */
typedef long HVX_Vector
    __attribute__((__vector_size__(128), aligned(128)));
/* 256-byte vector type (used for the 32-bit offsets of the _16_32 ops) */
typedef long HVX_VectorPair
    __attribute__((__vector_size__(256), aligned(128)));
/* type of the MASK argument of the masked scatter/gather builtins */
typedef long HVX_VectorPred
    __attribute__((__vector_size__(128), aligned(128)));
42
/*
 * Wrappers around the scatter builtins.
 *
 * BASE is the address of the scatter region, RGN the region length passed to
 * the builtin, OFF the vector of byte offsets and VALS the vector of values.
 * The _MASKED forms take an additional predicate (MASK), the _ACC forms use
 * the "_add" (accumulate) builtins.
 *
 * BASE is parenthesized before the cast so that an expression argument
 * (e.g. "ptr + 1") is converted as a whole.
 */
#define VSCATTER_16(BASE, RGN, OFF, VALS) \
    __builtin_HEXAGON_V6_vscattermh_128B((int)(BASE), RGN, OFF, VALS)
#define VSCATTER_16_MASKED(MASK, BASE, RGN, OFF, VALS) \
    __builtin_HEXAGON_V6_vscattermhq_128B(MASK, (int)(BASE), RGN, OFF, VALS)
#define VSCATTER_32(BASE, RGN, OFF, VALS) \
    __builtin_HEXAGON_V6_vscattermw_128B((int)(BASE), RGN, OFF, VALS)
#define VSCATTER_32_MASKED(MASK, BASE, RGN, OFF, VALS) \
    __builtin_HEXAGON_V6_vscattermwq_128B(MASK, (int)(BASE), RGN, OFF, VALS)
#define VSCATTER_16_32(BASE, RGN, OFF, VALS) \
    __builtin_HEXAGON_V6_vscattermhw_128B((int)(BASE), RGN, OFF, VALS)
#define VSCATTER_16_32_MASKED(MASK, BASE, RGN, OFF, VALS) \
    __builtin_HEXAGON_V6_vscattermhwq_128B(MASK, (int)(BASE), RGN, OFF, VALS)
#define VSCATTER_16_ACC(BASE, RGN, OFF, VALS) \
    __builtin_HEXAGON_V6_vscattermh_add_128B((int)(BASE), RGN, OFF, VALS)
#define VSCATTER_32_ACC(BASE, RGN, OFF, VALS) \
    __builtin_HEXAGON_V6_vscattermw_add_128B((int)(BASE), RGN, OFF, VALS)
#define VSCATTER_16_32_ACC(BASE, RGN, OFF, VALS) \
    __builtin_HEXAGON_V6_vscattermhw_add_128B((int)(BASE), RGN, OFF, VALS)
61
/*
 * Wrappers around the gather builtins.
 *
 * DSTADDR is where the gathered vector is written, BASE the address of the
 * region to gather from, RGN the region length passed to the builtin and OFF
 * the vector of byte offsets.  The _MASKED forms take an additional
 * predicate (MASK).
 *
 * BASE is parenthesized before the cast so that an expression argument is
 * converted as a whole.
 */
#define VGATHER_16(DSTADDR, BASE, RGN, OFF) \
    __builtin_HEXAGON_V6_vgathermh_128B(DSTADDR, (int)(BASE), RGN, OFF)
#define VGATHER_16_MASKED(DSTADDR, MASK, BASE, RGN, OFF) \
    __builtin_HEXAGON_V6_vgathermhq_128B(DSTADDR, MASK, (int)(BASE), RGN, OFF)
#define VGATHER_32(DSTADDR, BASE, RGN, OFF) \
    __builtin_HEXAGON_V6_vgathermw_128B(DSTADDR, (int)(BASE), RGN, OFF)
#define VGATHER_32_MASKED(DSTADDR, MASK, BASE, RGN, OFF) \
    __builtin_HEXAGON_V6_vgathermwq_128B(DSTADDR, MASK, (int)(BASE), RGN, OFF)
#define VGATHER_16_32(DSTADDR, BASE, RGN, OFF) \
    __builtin_HEXAGON_V6_vgathermhw_128B(DSTADDR, (int)(BASE), RGN, OFF)
#define VGATHER_16_32_MASKED(DSTADDR, MASK, BASE, RGN, OFF) \
    __builtin_HEXAGON_V6_vgathermhwq_128B(DSTADDR, MASK, (int)(BASE), RGN, OFF)
74
/* thin wrappers over the remaining HVX builtins used by this test */
#define VSHUFF_H(V)         __builtin_HEXAGON_V6_vshuffh_128B(V)
#define VSPLAT_H(X)         __builtin_HEXAGON_V6_lvsplath_128B(X)
#define VAND_VAL(PRED, VAL) __builtin_HEXAGON_V6_vandvrt_128B(PRED, VAL)
#define VDEAL_H(V)          __builtin_HEXAGON_V6_vdealh_128B(V)
83
/* number of mismatching bytes found so far; incremented by check_buffer() */
int err;

/* define the number of rows/cols in a square matrix */
#define MATRIX_SIZE 64

/* define the size of the scatter buffer (in elements, one per matrix cell) */
#define SCATTER_BUFFER_SIZE (MATRIX_SIZE * MATRIX_SIZE)
91
/*
 * fake vtcm - put buffers together and force alignment
 *
 * Each scatter buffer is followed by the gather destination used with it.
 * The whole structure is aligned to 0x10000 bytes and its size is what
 * region_len passes to the builtins.
 */
static struct {
    unsigned short vscatter16[SCATTER_BUFFER_SIZE];
    unsigned short vgather16[MATRIX_SIZE];
    unsigned int vscatter32[SCATTER_BUFFER_SIZE];
    unsigned int vgather32[MATRIX_SIZE];
    unsigned short vscatter16_32[SCATTER_BUFFER_SIZE];
    unsigned short vgather16_32[MATRIX_SIZE];
} vtcm __attribute__((aligned(0x10000)));
101
/*
 * declare the arrays of reference values
 * (filled by the scalar_* functions and compared against vtcm)
 */
unsigned short vscatter16_ref[SCATTER_BUFFER_SIZE];
unsigned short vgather16_ref[MATRIX_SIZE];
unsigned int vscatter32_ref[SCATTER_BUFFER_SIZE];
unsigned int vgather32_ref[MATRIX_SIZE];
unsigned short vscatter16_32_ref[SCATTER_BUFFER_SIZE];
unsigned short vgather16_32_ref[MATRIX_SIZE];

/* declare the arrays of offsets (byte offsets into the scatter buffer) */
unsigned short half_offsets[MATRIX_SIZE];
unsigned int word_offsets[MATRIX_SIZE];

/* declare the arrays of values (plain, accumulate and masked variants) */
unsigned short half_values[MATRIX_SIZE];
unsigned short half_values_acc[MATRIX_SIZE];
unsigned short half_values_masked[MATRIX_SIZE];
unsigned int word_values[MATRIX_SIZE];
unsigned int word_values_acc[MATRIX_SIZE];
unsigned int word_values_masked[MATRIX_SIZE];

/* declare the arrays of predicates (all ones = selected, zero = skipped) */
unsigned short half_predicates[MATRIX_SIZE];
unsigned int word_predicates[MATRIX_SIZE];
125
/*
 * make this big enough for all the intrinsics
 * (every scatter/gather in this file passes the size of the whole vtcm)
 */
const size_t region_len = sizeof(vtcm);

/* optionally add sync instructions (set to 0 to compile them out) */
#define SYNC_VECTOR 1
131
/*
 * Wait for the scatters to @addr to complete (no-op unless SYNC_VECTOR).
 *
 * A scatter release is issued on @addr, then a vector is loaded from the
 * same address to finish the synchronization.  Real code would postpone that
 * load as long as possible to limit stalls.
 */
static void sync_scatter(void *addr)
{
#if SYNC_VECTOR
    asm volatile("vmem(%0 + #0):scatter_release\n" : : "r"(addr));

    /* volatile keeps the compiler from dropping the dummy load */
    volatile HVX_Vector sink = *(HVX_Vector *)addr;
    sink = sink;
#endif
}
145
/*
 * Wait for a gather into @addr to complete (no-op unless SYNC_VECTOR)
 * by loading a vector from the destination.
 */
static void sync_gather(void *addr)
{
#if SYNC_VECTOR
    /* volatile keeps the compiler from dropping the dummy load */
    volatile HVX_Vector sink = *(HVX_Vector *)addr;
    sink = sink;
#endif
}
153
/* optionally print the results (set to 1 to enable the print_* functions) */
#define PRINT_DATA 0

/* byte used to prefill vtcm and the scatter reference buffers */
#define FILL_CHAR '.'
158
159 /* fill vtcm scratch with ee */
160 void prefill_vtcm_scratch(void)
161 {
162 memset(&vtcm, FILL_CHAR, sizeof(vtcm));
163 }
164
165 /* create byte offsets to be a diagonal of the matrix with 16 bit elements */
166 void create_offsets_values_preds_16(void)
167 {
168 unsigned short half_element = 0;
169 unsigned short half_element_masked = 0;
170 char letter = 'A';
171 char letter_masked = '@';
172
173 for (int i = 0; i < MATRIX_SIZE; i++) {
174 half_offsets[i] = i * (2 * MATRIX_SIZE + 2);
175
176 half_element = 0;
177 half_element_masked = 0;
178 for (int j = 0; j < 2; j++) {
179 half_element |= letter << j * 8;
180 half_element_masked |= letter_masked << j * 8;
181 }
182
183 half_values[i] = half_element;
184 half_values_acc[i] = ((i % 10) << 8) + (i % 10);
185 half_values_masked[i] = half_element_masked;
186
187 letter++;
188 /* reset to 'A' */
189 if (letter == 'M') {
190 letter = 'A';
191 }
192
193 half_predicates[i] = (i % 3 == 0 || i % 5 == 0) ? ~0 : 0;
194 }
195 }
196
197 /* create byte offsets to be a diagonal of the matrix with 32 bit elements */
198 void create_offsets_values_preds_32(void)
199 {
200 unsigned int word_element = 0;
201 unsigned int word_element_masked = 0;
202 char letter = 'A';
203 char letter_masked = '&';
204
205 for (int i = 0; i < MATRIX_SIZE; i++) {
206 word_offsets[i] = i * (4 * MATRIX_SIZE + 4);
207
208 word_element = 0;
209 word_element_masked = 0;
210 for (int j = 0; j < 4; j++) {
211 word_element |= letter << j * 8;
212 word_element_masked |= letter_masked << j * 8;
213 }
214
215 word_values[i] = word_element;
216 word_values_acc[i] = ((i % 10) << 8) + (i % 10);
217 word_values_masked[i] = word_element_masked;
218
219 letter++;
220 /* reset to 'A' */
221 if (letter == 'M') {
222 letter = 'A';
223 }
224
225 word_predicates[i] = (i % 4 == 0 || i % 7 == 0) ? ~0 : 0;
226 }
227 }
228
229 /*
230 * create byte offsets to be a diagonal of the matrix with 16 bit elements
231 * and 32 bit offsets
232 */
233 void create_offsets_values_preds_16_32(void)
234 {
235 unsigned short half_element = 0;
236 unsigned short half_element_masked = 0;
237 char letter = 'D';
238 char letter_masked = '$';
239
240 for (int i = 0; i < MATRIX_SIZE; i++) {
241 word_offsets[i] = i * (2 * MATRIX_SIZE + 2);
242
243 half_element = 0;
244 half_element_masked = 0;
245 for (int j = 0; j < 2; j++) {
246 half_element |= letter << j * 8;
247 half_element_masked |= letter_masked << j * 8;
248 }
249
250 half_values[i] = half_element;
251 half_values_acc[i] = ((i % 10) << 8) + (i % 10);
252 half_values_masked[i] = half_element_masked;
253
254 letter++;
255 /* reset to 'A' */
256 if (letter == 'P') {
257 letter = 'D';
258 }
259
260 half_predicates[i] = (i % 2 == 0 || i % 13 == 0) ? ~0 : 0;
261 }
262 }
263
264 /* scatter the 16 bit elements using intrinsics */
265 void vector_scatter_16(void)
266 {
267 /* copy the offsets and values to vectors */
268 HVX_Vector offsets = *(HVX_Vector *)half_offsets;
269 HVX_Vector values = *(HVX_Vector *)half_values;
270
271 VSCATTER_16(&vtcm.vscatter16, region_len, offsets, values);
272
273 sync_scatter(vtcm.vscatter16);
274 }
275
276 /* scatter-accumulate the 16 bit elements using intrinsics */
277 void vector_scatter_16_acc(void)
278 {
279 /* copy the offsets and values to vectors */
280 HVX_Vector offsets = *(HVX_Vector *)half_offsets;
281 HVX_Vector values = *(HVX_Vector *)half_values_acc;
282
283 VSCATTER_16_ACC(&vtcm.vscatter16, region_len, offsets, values);
284
285 sync_scatter(vtcm.vscatter16);
286 }
287
288 /* scatter the 16 bit elements using intrinsics */
289 void vector_scatter_16_masked(void)
290 {
291 /* copy the offsets and values to vectors */
292 HVX_Vector offsets = *(HVX_Vector *)half_offsets;
293 HVX_Vector values = *(HVX_Vector *)half_values_masked;
294 HVX_Vector pred_reg = *(HVX_Vector *)half_predicates;
295 HVX_VectorPred preds = VAND_VAL(pred_reg, ~0);
296
297 VSCATTER_16_MASKED(preds, &vtcm.vscatter16, region_len, offsets, values);
298
299 sync_scatter(vtcm.vscatter16);
300 }
301
302 /* scatter the 32 bit elements using intrinsics */
303 void vector_scatter_32(void)
304 {
305 /* copy the offsets and values to vectors */
306 HVX_Vector offsetslo = *(HVX_Vector *)word_offsets;
307 HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
308 HVX_Vector valueslo = *(HVX_Vector *)word_values;
309 HVX_Vector valueshi = *(HVX_Vector *)&word_values[MATRIX_SIZE / 2];
310
311 VSCATTER_32(&vtcm.vscatter32, region_len, offsetslo, valueslo);
312 VSCATTER_32(&vtcm.vscatter32, region_len, offsetshi, valueshi);
313
314 sync_scatter(vtcm.vscatter32);
315 }
316
317 /* scatter-acc the 32 bit elements using intrinsics */
318 void vector_scatter_32_acc(void)
319 {
320 /* copy the offsets and values to vectors */
321 HVX_Vector offsetslo = *(HVX_Vector *)word_offsets;
322 HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
323 HVX_Vector valueslo = *(HVX_Vector *)word_values_acc;
324 HVX_Vector valueshi = *(HVX_Vector *)&word_values_acc[MATRIX_SIZE / 2];
325
326 VSCATTER_32_ACC(&vtcm.vscatter32, region_len, offsetslo, valueslo);
327 VSCATTER_32_ACC(&vtcm.vscatter32, region_len, offsetshi, valueshi);
328
329 sync_scatter(vtcm.vscatter32);
330 }
331
332 /* scatter the 32 bit elements using intrinsics */
333 void vector_scatter_32_masked(void)
334 {
335 /* copy the offsets and values to vectors */
336 HVX_Vector offsetslo = *(HVX_Vector *)word_offsets;
337 HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
338 HVX_Vector valueslo = *(HVX_Vector *)word_values_masked;
339 HVX_Vector valueshi = *(HVX_Vector *)&word_values_masked[MATRIX_SIZE / 2];
340 HVX_Vector pred_reglo = *(HVX_Vector *)word_predicates;
341 HVX_Vector pred_reghi = *(HVX_Vector *)&word_predicates[MATRIX_SIZE / 2];
342 HVX_VectorPred predslo = VAND_VAL(pred_reglo, ~0);
343 HVX_VectorPred predshi = VAND_VAL(pred_reghi, ~0);
344
345 VSCATTER_32_MASKED(predslo, &vtcm.vscatter32, region_len, offsetslo,
346 valueslo);
347 VSCATTER_32_MASKED(predshi, &vtcm.vscatter32, region_len, offsetshi,
348 valueshi);
349
350 sync_scatter(vtcm.vscatter16);
351 }
352
353 /* scatter the 16 bit elements with 32 bit offsets using intrinsics */
354 void vector_scatter_16_32(void)
355 {
356 HVX_VectorPair offsets;
357 HVX_Vector values;
358
359 /* get the word offsets in a vector pair */
360 offsets = *(HVX_VectorPair *)word_offsets;
361
362 /* these values need to be shuffled for the scatter */
363 values = *(HVX_Vector *)half_values;
364 values = VSHUFF_H(values);
365
366 VSCATTER_16_32(&vtcm.vscatter16_32, region_len, offsets, values);
367
368 sync_scatter(vtcm.vscatter16_32);
369 }
370
371 /* scatter-acc the 16 bit elements with 32 bit offsets using intrinsics */
372 void vector_scatter_16_32_acc(void)
373 {
374 HVX_VectorPair offsets;
375 HVX_Vector values;
376
377 /* get the word offsets in a vector pair */
378 offsets = *(HVX_VectorPair *)word_offsets;
379
380 /* these values need to be shuffled for the scatter */
381 values = *(HVX_Vector *)half_values_acc;
382 values = VSHUFF_H(values);
383
384 VSCATTER_16_32_ACC(&vtcm.vscatter16_32, region_len, offsets, values);
385
386 sync_scatter(vtcm.vscatter16_32);
387 }
388
389 /* masked scatter the 16 bit elements with 32 bit offsets using intrinsics */
390 void vector_scatter_16_32_masked(void)
391 {
392 HVX_VectorPair offsets;
393 HVX_Vector values;
394 HVX_Vector pred_reg;
395
396 /* get the word offsets in a vector pair */
397 offsets = *(HVX_VectorPair *)word_offsets;
398
399 /* these values need to be shuffled for the scatter */
400 values = *(HVX_Vector *)half_values_masked;
401 values = VSHUFF_H(values);
402
403 pred_reg = *(HVX_Vector *)half_predicates;
404 pred_reg = VSHUFF_H(pred_reg);
405 HVX_VectorPred preds = VAND_VAL(pred_reg, ~0);
406
407 VSCATTER_16_32_MASKED(preds, &vtcm.vscatter16_32, region_len, offsets,
408 values);
409
410 sync_scatter(vtcm.vscatter16_32);
411 }
412
413 /* gather the elements from the scatter16 buffer */
414 void vector_gather_16(void)
415 {
416 HVX_Vector *vgather = (HVX_Vector *)&vtcm.vgather16;
417 HVX_Vector offsets = *(HVX_Vector *)half_offsets;
418
419 VGATHER_16(vgather, &vtcm.vscatter16, region_len, offsets);
420
421 sync_gather(vgather);
422 }
423
/*
 * Value the 16 bit gather destinations are preset to before a masked
 * gather: '?' in both bytes.
 */
static unsigned short gather_16_masked_init(void)
{
    const char fill = '?';

    return (fill << 8) | fill;
}
429
430 void vector_gather_16_masked(void)
431 {
432 HVX_Vector *vgather = (HVX_Vector *)&vtcm.vgather16;
433 HVX_Vector offsets = *(HVX_Vector *)half_offsets;
434 HVX_Vector pred_reg = *(HVX_Vector *)half_predicates;
435 HVX_VectorPred preds = VAND_VAL(pred_reg, ~0);
436
437 *vgather = VSPLAT_H(gather_16_masked_init());
438 VGATHER_16_MASKED(vgather, preds, &vtcm.vscatter16, region_len, offsets);
439
440 sync_gather(vgather);
441 }
442
443 /* gather the elements from the scatter32 buffer */
444 void vector_gather_32(void)
445 {
446 HVX_Vector *vgatherlo = (HVX_Vector *)&vtcm.vgather32;
447 HVX_Vector *vgatherhi =
448 (HVX_Vector *)((int)&vtcm.vgather32 + (MATRIX_SIZE * 2));
449 HVX_Vector offsetslo = *(HVX_Vector *)word_offsets;
450 HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
451
452 VGATHER_32(vgatherlo, &vtcm.vscatter32, region_len, offsetslo);
453 VGATHER_32(vgatherhi, &vtcm.vscatter32, region_len, offsetshi);
454
455 sync_gather(vgatherhi);
456 }
457
/*
 * Value the 32 bit gather destinations are preset to before a masked
 * gather: '?' in all four bytes.
 */
static unsigned int gather_32_masked_init(void)
{
    const char fill = '?';
    const unsigned int low_half = (fill << 8) | fill;

    return (low_half << 16) | low_half;
}
463
464 void vector_gather_32_masked(void)
465 {
466 HVX_Vector *vgatherlo = (HVX_Vector *)&vtcm.vgather32;
467 HVX_Vector *vgatherhi =
468 (HVX_Vector *)((int)&vtcm.vgather32 + (MATRIX_SIZE * 2));
469 HVX_Vector offsetslo = *(HVX_Vector *)word_offsets;
470 HVX_Vector offsetshi = *(HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
471 HVX_Vector pred_reglo = *(HVX_Vector *)word_predicates;
472 HVX_VectorPred predslo = VAND_VAL(pred_reglo, ~0);
473 HVX_Vector pred_reghi = *(HVX_Vector *)&word_predicates[MATRIX_SIZE / 2];
474 HVX_VectorPred predshi = VAND_VAL(pred_reghi, ~0);
475
476 *vgatherlo = VSPLAT_H(gather_32_masked_init());
477 *vgatherhi = VSPLAT_H(gather_32_masked_init());
478 VGATHER_32_MASKED(vgatherlo, predslo, &vtcm.vscatter32, region_len,
479 offsetslo);
480 VGATHER_32_MASKED(vgatherhi, predshi, &vtcm.vscatter32, region_len,
481 offsetshi);
482
483 sync_gather(vgatherlo);
484 sync_gather(vgatherhi);
485 }
486
487 /* gather the elements from the scatter16_32 buffer */
488 void vector_gather_16_32(void)
489 {
490 HVX_Vector *vgather;
491 HVX_VectorPair offsets;
492 HVX_Vector values;
493
494 /* get the vtcm address to gather from */
495 vgather = (HVX_Vector *)&vtcm.vgather16_32;
496
497 /* get the word offsets in a vector pair */
498 offsets = *(HVX_VectorPair *)word_offsets;
499
500 VGATHER_16_32(vgather, &vtcm.vscatter16_32, region_len, offsets);
501
502 /* deal the elements to get the order back */
503 values = *(HVX_Vector *)vgather;
504 values = VDEAL_H(values);
505
506 /* write it back to vtcm address */
507 *(HVX_Vector *)vgather = values;
508 }
509
510 void vector_gather_16_32_masked(void)
511 {
512 HVX_Vector *vgather;
513 HVX_VectorPair offsets;
514 HVX_Vector pred_reg;
515 HVX_VectorPred preds;
516 HVX_Vector values;
517
518 /* get the vtcm address to gather from */
519 vgather = (HVX_Vector *)&vtcm.vgather16_32;
520
521 /* get the word offsets in a vector pair */
522 offsets = *(HVX_VectorPair *)word_offsets;
523 pred_reg = *(HVX_Vector *)half_predicates;
524 pred_reg = VSHUFF_H(pred_reg);
525 preds = VAND_VAL(pred_reg, ~0);
526
527 *vgather = VSPLAT_H(gather_16_masked_init());
528 VGATHER_16_32_MASKED(vgather, preds, &vtcm.vscatter16_32, region_len,
529 offsets);
530
531 /* deal the elements to get the order back */
532 values = *(HVX_Vector *)vgather;
533 values = VDEAL_H(values);
534
535 /* write it back to vtcm address */
536 *(HVX_Vector *)vgather = values;
537 }
538
539 static void check_buffer(const char *name, void *c, void *r, size_t size)
540 {
541 char *check = (char *)c;
542 char *ref = (char *)r;
543 for (int i = 0; i < size; i++) {
544 if (check[i] != ref[i]) {
545 printf("ERROR %s [%d]: 0x%x (%c) != 0x%x (%c)\n", name, i,
546 check[i], check[i], ref[i], ref[i]);
547 err++;
548 }
549 }
550 }
551
552 /*
553 * These scalar functions are the C equivalents of the vector functions that
554 * use HVX
555 */
556
557 /* scatter the 16 bit elements using C */
558 void scalar_scatter_16(unsigned short *vscatter16)
559 {
560 for (int i = 0; i < MATRIX_SIZE; ++i) {
561 vscatter16[half_offsets[i] / 2] = half_values[i];
562 }
563 }
564
565 void check_scatter_16()
566 {
567 memset(vscatter16_ref, FILL_CHAR,
568 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
569 scalar_scatter_16(vscatter16_ref);
570 check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
571 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
572 }
573
574 /* scatter the 16 bit elements using C */
575 void scalar_scatter_16_acc(unsigned short *vscatter16)
576 {
577 for (int i = 0; i < MATRIX_SIZE; ++i) {
578 vscatter16[half_offsets[i] / 2] += half_values_acc[i];
579 }
580 }
581
582 void check_scatter_16_acc()
583 {
584 memset(vscatter16_ref, FILL_CHAR,
585 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
586 scalar_scatter_16(vscatter16_ref);
587 scalar_scatter_16_acc(vscatter16_ref);
588 check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
589 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
590 }
591
592 /* scatter the 16 bit elements using C */
593 void scalar_scatter_16_masked(unsigned short *vscatter16)
594 {
595 for (int i = 0; i < MATRIX_SIZE; i++) {
596 if (half_predicates[i]) {
597 vscatter16[half_offsets[i] / 2] = half_values_masked[i];
598 }
599 }
600
601 }
602
603 void check_scatter_16_masked()
604 {
605 memset(vscatter16_ref, FILL_CHAR,
606 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
607 scalar_scatter_16(vscatter16_ref);
608 scalar_scatter_16_acc(vscatter16_ref);
609 scalar_scatter_16_masked(vscatter16_ref);
610 check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
611 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
612 }
613
614 /* scatter the 32 bit elements using C */
615 void scalar_scatter_32(unsigned int *vscatter32)
616 {
617 for (int i = 0; i < MATRIX_SIZE; ++i) {
618 vscatter32[word_offsets[i] / 4] = word_values[i];
619 }
620 }
621
622 void check_scatter_32()
623 {
624 memset(vscatter32_ref, FILL_CHAR,
625 SCATTER_BUFFER_SIZE * sizeof(unsigned int));
626 scalar_scatter_32(vscatter32_ref);
627 check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
628 SCATTER_BUFFER_SIZE * sizeof(unsigned int));
629 }
630
631 /* scatter the 32 bit elements using C */
632 void scalar_scatter_32_acc(unsigned int *vscatter32)
633 {
634 for (int i = 0; i < MATRIX_SIZE; ++i) {
635 vscatter32[word_offsets[i] / 4] += word_values_acc[i];
636 }
637 }
638
639 void check_scatter_32_acc()
640 {
641 memset(vscatter32_ref, FILL_CHAR,
642 SCATTER_BUFFER_SIZE * sizeof(unsigned int));
643 scalar_scatter_32(vscatter32_ref);
644 scalar_scatter_32_acc(vscatter32_ref);
645 check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
646 SCATTER_BUFFER_SIZE * sizeof(unsigned int));
647 }
648
649 /* scatter the 32 bit elements using C */
650 void scalar_scatter_32_masked(unsigned int *vscatter32)
651 {
652 for (int i = 0; i < MATRIX_SIZE; i++) {
653 if (word_predicates[i]) {
654 vscatter32[word_offsets[i] / 4] = word_values_masked[i];
655 }
656 }
657 }
658
659 void check_scatter_32_masked()
660 {
661 memset(vscatter32_ref, FILL_CHAR,
662 SCATTER_BUFFER_SIZE * sizeof(unsigned int));
663 scalar_scatter_32(vscatter32_ref);
664 scalar_scatter_32_acc(vscatter32_ref);
665 scalar_scatter_32_masked(vscatter32_ref);
666 check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
667 SCATTER_BUFFER_SIZE * sizeof(unsigned int));
668 }
669
670 /* scatter the 32 bit elements using C */
671 void scalar_scatter_16_32(unsigned short *vscatter16_32)
672 {
673 for (int i = 0; i < MATRIX_SIZE; ++i) {
674 vscatter16_32[word_offsets[i] / 2] = half_values[i];
675 }
676 }
677
678 void check_scatter_16_32()
679 {
680 memset(vscatter16_32_ref, FILL_CHAR,
681 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
682 scalar_scatter_16_32(vscatter16_32_ref);
683 check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
684 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
685 }
686
687 /* scatter the 32 bit elements using C */
688 void scalar_scatter_16_32_acc(unsigned short *vscatter16_32)
689 {
690 for (int i = 0; i < MATRIX_SIZE; ++i) {
691 vscatter16_32[word_offsets[i] / 2] += half_values_acc[i];
692 }
693 }
694
695 void check_scatter_16_32_acc()
696 {
697 memset(vscatter16_32_ref, FILL_CHAR,
698 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
699 scalar_scatter_16_32(vscatter16_32_ref);
700 scalar_scatter_16_32_acc(vscatter16_32_ref);
701 check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
702 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
703 }
704
705 void scalar_scatter_16_32_masked(unsigned short *vscatter16_32)
706 {
707 for (int i = 0; i < MATRIX_SIZE; i++) {
708 if (half_predicates[i]) {
709 vscatter16_32[word_offsets[i] / 2] = half_values_masked[i];
710 }
711 }
712 }
713
714 void check_scatter_16_32_masked()
715 {
716 memset(vscatter16_32_ref, FILL_CHAR,
717 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
718 scalar_scatter_16_32(vscatter16_32_ref);
719 scalar_scatter_16_32_acc(vscatter16_32_ref);
720 scalar_scatter_16_32_masked(vscatter16_32_ref);
721 check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
722 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
723 }
724
725 /* gather the elements from the scatter buffer using C */
726 void scalar_gather_16(unsigned short *vgather16)
727 {
728 for (int i = 0; i < MATRIX_SIZE; ++i) {
729 vgather16[i] = vtcm.vscatter16[half_offsets[i] / 2];
730 }
731 }
732
733 void check_gather_16()
734 {
735 memset(vgather16_ref, 0, MATRIX_SIZE * sizeof(unsigned short));
736 scalar_gather_16(vgather16_ref);
737 check_buffer(__func__, vtcm.vgather16, vgather16_ref,
738 MATRIX_SIZE * sizeof(unsigned short));
739 }
740
741 void scalar_gather_16_masked(unsigned short *vgather16)
742 {
743 for (int i = 0; i < MATRIX_SIZE; ++i) {
744 if (half_predicates[i]) {
745 vgather16[i] = vtcm.vscatter16[half_offsets[i] / 2];
746 }
747 }
748 }
749
750 void check_gather_16_masked()
751 {
752 memset(vgather16_ref, gather_16_masked_init(),
753 MATRIX_SIZE * sizeof(unsigned short));
754 scalar_gather_16_masked(vgather16_ref);
755 check_buffer(__func__, vtcm.vgather16, vgather16_ref,
756 MATRIX_SIZE * sizeof(unsigned short));
757 }
758
759 /* gather the elements from the scatter buffer using C */
760 void scalar_gather_32(unsigned int *vgather32)
761 {
762 for (int i = 0; i < MATRIX_SIZE; ++i) {
763 vgather32[i] = vtcm.vscatter32[word_offsets[i] / 4];
764 }
765 }
766
767 void check_gather_32(void)
768 {
769 memset(vgather32_ref, 0, MATRIX_SIZE * sizeof(unsigned int));
770 scalar_gather_32(vgather32_ref);
771 check_buffer(__func__, vtcm.vgather32, vgather32_ref,
772 MATRIX_SIZE * sizeof(unsigned int));
773 }
774
775 void scalar_gather_32_masked(unsigned int *vgather32)
776 {
777 for (int i = 0; i < MATRIX_SIZE; ++i) {
778 if (word_predicates[i]) {
779 vgather32[i] = vtcm.vscatter32[word_offsets[i] / 4];
780 }
781 }
782 }
783
784
785 void check_gather_32_masked(void)
786 {
787 memset(vgather32_ref, gather_32_masked_init(),
788 MATRIX_SIZE * sizeof(unsigned int));
789 scalar_gather_32_masked(vgather32_ref);
790 check_buffer(__func__, vtcm.vgather32,
791 vgather32_ref, MATRIX_SIZE * sizeof(unsigned int));
792 }
793
794 /* gather the elements from the scatter buffer using C */
795 void scalar_gather_16_32(unsigned short *vgather16_32)
796 {
797 for (int i = 0; i < MATRIX_SIZE; ++i) {
798 vgather16_32[i] = vtcm.vscatter16_32[word_offsets[i] / 2];
799 }
800 }
801
802 void check_gather_16_32(void)
803 {
804 memset(vgather16_32_ref, 0, MATRIX_SIZE * sizeof(unsigned short));
805 scalar_gather_16_32(vgather16_32_ref);
806 check_buffer(__func__, vtcm.vgather16_32, vgather16_32_ref,
807 MATRIX_SIZE * sizeof(unsigned short));
808 }
809
810 void scalar_gather_16_32_masked(unsigned short *vgather16_32)
811 {
812 for (int i = 0; i < MATRIX_SIZE; ++i) {
813 if (half_predicates[i]) {
814 vgather16_32[i] = vtcm.vscatter16_32[word_offsets[i] / 2];
815 }
816 }
817
818 }
819
820 void check_gather_16_32_masked(void)
821 {
822 memset(vgather16_32_ref, gather_16_masked_init(),
823 MATRIX_SIZE * sizeof(unsigned short));
824 scalar_gather_16_32_masked(vgather16_32_ref);
825 check_buffer(__func__, vtcm.vgather16_32, vgather16_32_ref,
826 MATRIX_SIZE * sizeof(unsigned short));
827 }
828
829 /* print scatter16 buffer */
830 void print_scatter16_buffer(void)
831 {
832 if (PRINT_DATA) {
833 printf("\n\nPrinting the 16 bit scatter buffer");
834
835 for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
836 if ((i % MATRIX_SIZE) == 0) {
837 printf("\n");
838 }
839 for (int j = 0; j < 2; j++) {
840 printf("%c", (char)((vtcm.vscatter16[i] >> j * 8) & 0xff));
841 }
842 printf(" ");
843 }
844 printf("\n");
845 }
846 }
847
848 /* print the gather 16 buffer */
849 void print_gather_result_16(void)
850 {
851 if (PRINT_DATA) {
852 printf("\n\nPrinting the 16 bit gather result\n");
853
854 for (int i = 0; i < MATRIX_SIZE; i++) {
855 for (int j = 0; j < 2; j++) {
856 printf("%c", (char)((vtcm.vgather16[i] >> j * 8) & 0xff));
857 }
858 printf(" ");
859 }
860 printf("\n");
861 }
862 }
863
864 /* print the scatter32 buffer */
865 void print_scatter32_buffer(void)
866 {
867 if (PRINT_DATA) {
868 printf("\n\nPrinting the 32 bit scatter buffer");
869
870 for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
871 if ((i % MATRIX_SIZE) == 0) {
872 printf("\n");
873 }
874 for (int j = 0; j < 4; j++) {
875 printf("%c", (char)((vtcm.vscatter32[i] >> j * 8) & 0xff));
876 }
877 printf(" ");
878 }
879 printf("\n");
880 }
881 }
882
883 /* print the gather 32 buffer */
884 void print_gather_result_32(void)
885 {
886 if (PRINT_DATA) {
887 printf("\n\nPrinting the 32 bit gather result\n");
888
889 for (int i = 0; i < MATRIX_SIZE; i++) {
890 for (int j = 0; j < 4; j++) {
891 printf("%c", (char)((vtcm.vgather32[i] >> j * 8) & 0xff));
892 }
893 printf(" ");
894 }
895 printf("\n");
896 }
897 }
898
899 /* print the scatter16_32 buffer */
900 void print_scatter16_32_buffer(void)
901 {
902 if (PRINT_DATA) {
903 printf("\n\nPrinting the 16_32 bit scatter buffer");
904
905 for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
906 if ((i % MATRIX_SIZE) == 0) {
907 printf("\n");
908 }
909 for (int j = 0; j < 2; j++) {
910 printf("%c",
911 (unsigned char)((vtcm.vscatter16_32[i] >> j * 8) & 0xff));
912 }
913 printf(" ");
914 }
915 printf("\n");
916 }
917 }
918
919 /* print the gather 16_32 buffer */
920 void print_gather_result_16_32(void)
921 {
922 if (PRINT_DATA) {
923 printf("\n\nPrinting the 16_32 bit gather result\n");
924
925 for (int i = 0; i < MATRIX_SIZE; i++) {
926 for (int j = 0; j < 2; j++) {
927 printf("%c",
928 (unsigned char)((vtcm.vgather16_32[i] >> j * 8) & 0xff));
929 }
930 printf(" ");
931 }
932 printf("\n");
933 }
934 }
935
936 int main()
937 {
938 prefill_vtcm_scratch();
939
940 /* 16 bit elements with 16 bit offsets */
941 create_offsets_values_preds_16();
942
943 vector_scatter_16();
944 print_scatter16_buffer();
945 check_scatter_16();
946
947 vector_gather_16();
948 print_gather_result_16();
949 check_gather_16();
950
951 vector_gather_16_masked();
952 print_gather_result_16();
953 check_gather_16_masked();
954
955 vector_scatter_16_acc();
956 print_scatter16_buffer();
957 check_scatter_16_acc();
958
959 vector_scatter_16_masked();
960 print_scatter16_buffer();
961 check_scatter_16_masked();
962
963 /* 32 bit elements with 32 bit offsets */
964 create_offsets_values_preds_32();
965
966 vector_scatter_32();
967 print_scatter32_buffer();
968 check_scatter_32();
969
970 vector_gather_32();
971 print_gather_result_32();
972 check_gather_32();
973
974 vector_gather_32_masked();
975 print_gather_result_32();
976 check_gather_32_masked();
977
978 vector_scatter_32_acc();
979 print_scatter32_buffer();
980 check_scatter_32_acc();
981
982 vector_scatter_32_masked();
983 print_scatter32_buffer();
984 check_scatter_32_masked();
985
986 /* 16 bit elements with 32 bit offsets */
987 create_offsets_values_preds_16_32();
988
989 vector_scatter_16_32();
990 print_scatter16_32_buffer();
991 check_scatter_16_32();
992
993 vector_gather_16_32();
994 print_gather_result_16_32();
995 check_gather_16_32();
996
997 vector_gather_16_32_masked();
998 print_gather_result_16_32();
999 check_gather_16_32_masked();
1000
1001 vector_scatter_16_32_acc();
1002 print_scatter16_32_buffer();
1003 check_scatter_16_32_acc();
1004
1005 vector_scatter_16_32_masked();
1006 print_scatter16_32_buffer();
1007 check_scatter_16_32_masked();
1008
1009 puts(err ? "FAIL" : "PASS");
1010 return err;
1011 }