]> git.proxmox.com Git - mirror_qemu.git/blame - tests/tcg/hexagon/scatter_gather.c
Merge tag 'for_upstream' of https://git.kernel.org/pub/scm/virt/kvm/mst/qemu into...
[mirror_qemu.git] / tests / tcg / hexagon / scatter_gather.c
CommitLineData
62e93b08 1/*
c3679385 2 * Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
62e93b08
TS
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, see <http://www.gnu.org/licenses/>.
16 */
17
18/*
19 * This example tests the HVX scatter/gather instructions
20 *
21 * See section 5.13 of the V68 HVX Programmer's Reference
22 *
23 * There are 3 main classes of operations
24 * _16 16-bit elements and 16-bit offsets
25 * _32 32-bit elements and 32-bit offsets
26 * _16_32 16-bit elements and 32-bit offsets
27 *
28 * There are also masked and accumulate versions
29 */
30
31#include <stdio.h>
32#include <string.h>
33#include <stdlib.h>
34#include <inttypes.h>
35
36typedef long HVX_Vector __attribute__((__vector_size__(128)))
37 __attribute__((aligned(128)));
38typedef long HVX_VectorPair __attribute__((__vector_size__(256)))
39 __attribute__((aligned(128)));
40typedef long HVX_VectorPred __attribute__((__vector_size__(128)))
41 __attribute__((aligned(128)));
42
62e93b08
TS
43int err;
44
45/* define the number of rows/cols in a square matrix */
46#define MATRIX_SIZE 64
47
48/* define the size of the scatter buffer */
49#define SCATTER_BUFFER_SIZE (MATRIX_SIZE * MATRIX_SIZE)
50
51/* fake vtcm - put buffers together and force alignment */
52static struct {
53 unsigned short vscatter16[SCATTER_BUFFER_SIZE];
54 unsigned short vgather16[MATRIX_SIZE];
55 unsigned int vscatter32[SCATTER_BUFFER_SIZE];
56 unsigned int vgather32[MATRIX_SIZE];
57 unsigned short vscatter16_32[SCATTER_BUFFER_SIZE];
58 unsigned short vgather16_32[MATRIX_SIZE];
59} vtcm __attribute__((aligned(0x10000)));
60
61/* declare the arrays of reference values */
62unsigned short vscatter16_ref[SCATTER_BUFFER_SIZE];
63unsigned short vgather16_ref[MATRIX_SIZE];
64unsigned int vscatter32_ref[SCATTER_BUFFER_SIZE];
65unsigned int vgather32_ref[MATRIX_SIZE];
66unsigned short vscatter16_32_ref[SCATTER_BUFFER_SIZE];
67unsigned short vgather16_32_ref[MATRIX_SIZE];
68
69/* declare the arrays of offsets */
c3679385
TS
70unsigned short half_offsets[MATRIX_SIZE] __attribute__((aligned(128)));
71unsigned int word_offsets[MATRIX_SIZE] __attribute__((aligned(128)));
62e93b08
TS
72
73/* declare the arrays of values */
c3679385
TS
74unsigned short half_values[MATRIX_SIZE] __attribute__((aligned(128)));
75unsigned short half_values_acc[MATRIX_SIZE] __attribute__((aligned(128)));
76unsigned short half_values_masked[MATRIX_SIZE] __attribute__((aligned(128)));
77unsigned int word_values[MATRIX_SIZE] __attribute__((aligned(128)));
78unsigned int word_values_acc[MATRIX_SIZE] __attribute__((aligned(128)));
79unsigned int word_values_masked[MATRIX_SIZE] __attribute__((aligned(128)));
62e93b08
TS
80
81/* declare the arrays of predicates */
c3679385
TS
82unsigned short half_predicates[MATRIX_SIZE] __attribute__((aligned(128)));
83unsigned int word_predicates[MATRIX_SIZE] __attribute__((aligned(128)));
62e93b08 84
c3679385 85/* make this big enough for all the operations */
62e93b08
TS
86const size_t region_len = sizeof(vtcm);
87
88/* optionally add sync instructions */
89#define SYNC_VECTOR 1
90
/*
 * Complete outstanding scatter operations for the region starting at addr.
 *
 * When SYNC_VECTOR is enabled this issues a scatter_release on addr and then
 * performs a vector-sized load from the same address; the load goes through
 * a volatile object so the compiler cannot drop it.  Normally that dummy
 * load would be deferred as long as possible to minimize stalls.
 * When SYNC_VECTOR is 0 the function does nothing.
 */
static void sync_scatter(void *addr)
{
#if SYNC_VECTOR
    HVX_Vector *vec = (HVX_Vector *)addr;

    asm volatile("vmem(%0 + #0):scatter_release\n" : : "r"(addr));

    /* volatile sink forces the load to be emitted */
    volatile HVX_Vector sink = *vec;
    sink = sink;
#endif
}
104
/*
 * Complete an outstanding gather into the buffer at addr.
 *
 * When SYNC_VECTOR is enabled this reads one vector from addr through a
 * volatile object so the load cannot be optimized away; otherwise it is a
 * no-op.
 */
static void sync_gather(void *addr)
{
#if SYNC_VECTOR
    HVX_Vector *vec = (HVX_Vector *)addr;

    /* volatile sink forces the load to be emitted */
    volatile HVX_Vector sink = *vec;
    sink = sink;
#endif
}
112
113/* optionally print the results */
114#define PRINT_DATA 0
115
116#define FILL_CHAR '.'
117
118/* fill vtcm scratch with ee */
119void prefill_vtcm_scratch(void)
120{
121 memset(&vtcm, FILL_CHAR, sizeof(vtcm));
122}
123
124/* create byte offsets to be a diagonal of the matrix with 16 bit elements */
125void create_offsets_values_preds_16(void)
126{
127 unsigned short half_element = 0;
128 unsigned short half_element_masked = 0;
129 char letter = 'A';
130 char letter_masked = '@';
131
132 for (int i = 0; i < MATRIX_SIZE; i++) {
133 half_offsets[i] = i * (2 * MATRIX_SIZE + 2);
134
135 half_element = 0;
136 half_element_masked = 0;
137 for (int j = 0; j < 2; j++) {
138 half_element |= letter << j * 8;
139 half_element_masked |= letter_masked << j * 8;
140 }
141
142 half_values[i] = half_element;
143 half_values_acc[i] = ((i % 10) << 8) + (i % 10);
144 half_values_masked[i] = half_element_masked;
145
146 letter++;
147 /* reset to 'A' */
148 if (letter == 'M') {
149 letter = 'A';
150 }
151
152 half_predicates[i] = (i % 3 == 0 || i % 5 == 0) ? ~0 : 0;
153 }
154}
155
156/* create byte offsets to be a diagonal of the matrix with 32 bit elements */
157void create_offsets_values_preds_32(void)
158{
159 unsigned int word_element = 0;
160 unsigned int word_element_masked = 0;
161 char letter = 'A';
162 char letter_masked = '&';
163
164 for (int i = 0; i < MATRIX_SIZE; i++) {
165 word_offsets[i] = i * (4 * MATRIX_SIZE + 4);
166
167 word_element = 0;
168 word_element_masked = 0;
169 for (int j = 0; j < 4; j++) {
170 word_element |= letter << j * 8;
171 word_element_masked |= letter_masked << j * 8;
172 }
173
174 word_values[i] = word_element;
175 word_values_acc[i] = ((i % 10) << 8) + (i % 10);
176 word_values_masked[i] = word_element_masked;
177
178 letter++;
179 /* reset to 'A' */
180 if (letter == 'M') {
181 letter = 'A';
182 }
183
184 word_predicates[i] = (i % 4 == 0 || i % 7 == 0) ? ~0 : 0;
185 }
186}
187
188/*
189 * create byte offsets to be a diagonal of the matrix with 16 bit elements
190 * and 32 bit offsets
191 */
192void create_offsets_values_preds_16_32(void)
193{
194 unsigned short half_element = 0;
195 unsigned short half_element_masked = 0;
196 char letter = 'D';
197 char letter_masked = '$';
198
199 for (int i = 0; i < MATRIX_SIZE; i++) {
200 word_offsets[i] = i * (2 * MATRIX_SIZE + 2);
201
202 half_element = 0;
203 half_element_masked = 0;
204 for (int j = 0; j < 2; j++) {
205 half_element |= letter << j * 8;
206 half_element_masked |= letter_masked << j * 8;
207 }
208
209 half_values[i] = half_element;
210 half_values_acc[i] = ((i % 10) << 8) + (i % 10);
211 half_values_masked[i] = half_element_masked;
212
213 letter++;
214 /* reset to 'A' */
215 if (letter == 'P') {
216 letter = 'D';
217 }
218
219 half_predicates[i] = (i % 2 == 0 || i % 13 == 0) ? ~0 : 0;
220 }
221}
222
c3679385 223/* scatter the 16 bit elements using HVX */
62e93b08
TS
224void vector_scatter_16(void)
225{
c3679385
TS
226 asm ("m0 = %1\n\t"
227 "v0 = vmem(%2 + #0)\n\t"
228 "v1 = vmem(%3 + #0)\n\t"
229 "vscatter(%0, m0, v0.h).h = v1\n\t"
230 : : "r"(vtcm.vscatter16), "r"(region_len),
231 "r"(half_offsets), "r"(half_values)
232 : "m0", "v0", "v1", "memory");
62e93b08
TS
233
234 sync_scatter(vtcm.vscatter16);
235}
236
c3679385 237/* scatter-accumulate the 16 bit elements using HVX */
62e93b08
TS
238void vector_scatter_16_acc(void)
239{
c3679385
TS
240 asm ("m0 = %1\n\t"
241 "v0 = vmem(%2 + #0)\n\t"
242 "v1 = vmem(%3 + #0)\n\t"
243 "vscatter(%0, m0, v0.h).h += v1\n\t"
244 : : "r"(vtcm.vscatter16), "r"(region_len),
245 "r"(half_offsets), "r"(half_values_acc)
246 : "m0", "v0", "v1", "memory");
62e93b08
TS
247
248 sync_scatter(vtcm.vscatter16);
249}
250
c3679385 251/* masked scatter the 16 bit elements using HVX */
62e93b08
TS
252void vector_scatter_16_masked(void)
253{
c3679385
TS
254 asm ("r1 = #-1\n\t"
255 "v0 = vmem(%0 + #0)\n\t"
256 "q0 = vand(v0, r1)\n\t"
257 "m0 = %2\n\t"
258 "v0 = vmem(%3 + #0)\n\t"
259 "v1 = vmem(%4 + #0)\n\t"
260 "if (q0) vscatter(%1, m0, v0.h).h = v1\n\t"
261 : : "r"(half_predicates), "r"(vtcm.vscatter16), "r"(region_len),
262 "r"(half_offsets), "r"(half_values_masked)
263 : "r1", "q0", "m0", "q0", "v0", "v1", "memory");
62e93b08
TS
264
265 sync_scatter(vtcm.vscatter16);
266}
267
c3679385 268/* scatter the 32 bit elements using HVX */
62e93b08
TS
269void vector_scatter_32(void)
270{
c3679385
TS
271 HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
272 HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
273 HVX_Vector *valueslo = (HVX_Vector *)word_values;
274 HVX_Vector *valueshi = (HVX_Vector *)&word_values[MATRIX_SIZE / 2];
275
276 asm ("m0 = %1\n\t"
277 "v0 = vmem(%2 + #0)\n\t"
278 "v1 = vmem(%3 + #0)\n\t"
279 "vscatter(%0, m0, v0.w).w = v1\n\t"
280 : : "r"(vtcm.vscatter32), "r"(region_len),
281 "r"(offsetslo), "r"(valueslo)
282 : "m0", "v0", "v1", "memory");
283 asm ("m0 = %1\n\t"
284 "v0 = vmem(%2 + #0)\n\t"
285 "v1 = vmem(%3 + #0)\n\t"
286 "vscatter(%0, m0, v0.w).w = v1\n\t"
287 : : "r"(vtcm.vscatter32), "r"(region_len),
288 "r"(offsetshi), "r"(valueshi)
289 : "m0", "v0", "v1", "memory");
62e93b08
TS
290
291 sync_scatter(vtcm.vscatter32);
292}
293
c3679385 294/* scatter-accumulate the 32 bit elements using HVX */
62e93b08
TS
295void vector_scatter_32_acc(void)
296{
c3679385
TS
297 HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
298 HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
299 HVX_Vector *valueslo = (HVX_Vector *)word_values_acc;
300 HVX_Vector *valueshi = (HVX_Vector *)&word_values_acc[MATRIX_SIZE / 2];
301
302 asm ("m0 = %1\n\t"
303 "v0 = vmem(%2 + #0)\n\t"
304 "v1 = vmem(%3 + #0)\n\t"
305 "vscatter(%0, m0, v0.w).w += v1\n\t"
306 : : "r"(vtcm.vscatter32), "r"(region_len),
307 "r"(offsetslo), "r"(valueslo)
308 : "m0", "v0", "v1", "memory");
309 asm ("m0 = %1\n\t"
310 "v0 = vmem(%2 + #0)\n\t"
311 "v1 = vmem(%3 + #0)\n\t"
312 "vscatter(%0, m0, v0.w).w += v1\n\t"
313 : : "r"(vtcm.vscatter32), "r"(region_len),
314 "r"(offsetshi), "r"(valueshi)
315 : "m0", "v0", "v1", "memory");
62e93b08
TS
316
317 sync_scatter(vtcm.vscatter32);
318}
319
c3679385 320/* masked scatter the 32 bit elements using HVX */
62e93b08
TS
321void vector_scatter_32_masked(void)
322{
c3679385
TS
323 HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
324 HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
325 HVX_Vector *valueslo = (HVX_Vector *)word_values_masked;
326 HVX_Vector *valueshi = (HVX_Vector *)&word_values_masked[MATRIX_SIZE / 2];
327 HVX_Vector *predslo = (HVX_Vector *)word_predicates;
328 HVX_Vector *predshi = (HVX_Vector *)&word_predicates[MATRIX_SIZE / 2];
329
330 asm ("r1 = #-1\n\t"
331 "v0 = vmem(%0 + #0)\n\t"
332 "q0 = vand(v0, r1)\n\t"
333 "m0 = %2\n\t"
334 "v0 = vmem(%3 + #0)\n\t"
335 "v1 = vmem(%4 + #0)\n\t"
336 "if (q0) vscatter(%1, m0, v0.w).w = v1\n\t"
337 : : "r"(predslo), "r"(vtcm.vscatter32), "r"(region_len),
338 "r"(offsetslo), "r"(valueslo)
339 : "r1", "q0", "m0", "q0", "v0", "v1", "memory");
340 asm ("r1 = #-1\n\t"
341 "v0 = vmem(%0 + #0)\n\t"
342 "q0 = vand(v0, r1)\n\t"
343 "m0 = %2\n\t"
344 "v0 = vmem(%3 + #0)\n\t"
345 "v1 = vmem(%4 + #0)\n\t"
346 "if (q0) vscatter(%1, m0, v0.w).w = v1\n\t"
347 : : "r"(predshi), "r"(vtcm.vscatter32), "r"(region_len),
348 "r"(offsetshi), "r"(valueshi)
349 : "r1", "q0", "m0", "q0", "v0", "v1", "memory");
62e93b08 350
c3679385 351 sync_scatter(vtcm.vscatter32);
62e93b08
TS
352}
353
c3679385 354/* scatter the 16 bit elements with 32 bit offsets using HVX */
62e93b08
TS
355void vector_scatter_16_32(void)
356{
c3679385
TS
357 asm ("m0 = %1\n\t"
358 "v0 = vmem(%2 + #0)\n\t"
359 "v1 = vmem(%2 + #1)\n\t"
360 "v2 = vmem(%3 + #0)\n\t"
361 "v2.h = vshuff(v2.h)\n\t" /* shuffle the values for the scatter */
362 "vscatter(%0, m0, v1:0.w).h = v2\n\t"
363 : : "r"(vtcm.vscatter16_32), "r"(region_len),
364 "r"(word_offsets), "r"(half_values)
365 : "m0", "v0", "v1", "v2", "memory");
62e93b08
TS
366
367 sync_scatter(vtcm.vscatter16_32);
368}
369
c3679385 370/* scatter-accumulate the 16 bit elements with 32 bit offsets using HVX */
62e93b08
TS
371void vector_scatter_16_32_acc(void)
372{
c3679385
TS
373 asm ("m0 = %1\n\t"
374 "v0 = vmem(%2 + #0)\n\t"
375 "v1 = vmem(%2 + #1)\n\t"
376 "v2 = vmem(%3 + #0)\n\t" \
377 "v2.h = vshuff(v2.h)\n\t" /* shuffle the values for the scatter */
378 "vscatter(%0, m0, v1:0.w).h += v2\n\t"
379 : : "r"(vtcm.vscatter16_32), "r"(region_len),
380 "r"(word_offsets), "r"(half_values_acc)
381 : "m0", "v0", "v1", "v2", "memory");
62e93b08
TS
382
383 sync_scatter(vtcm.vscatter16_32);
384}
385
c3679385 386/* masked scatter the 16 bit elements with 32 bit offsets using HVX */
62e93b08
TS
387void vector_scatter_16_32_masked(void)
388{
c3679385
TS
389 asm ("r1 = #-1\n\t"
390 "v0 = vmem(%0 + #0)\n\t"
391 "v0.h = vshuff(v0.h)\n\t" /* shuffle the predicates */
392 "q0 = vand(v0, r1)\n\t"
393 "m0 = %2\n\t"
394 "v0 = vmem(%3 + #0)\n\t"
395 "v1 = vmem(%3 + #1)\n\t"
396 "v2 = vmem(%4 + #0)\n\t" \
397 "v2.h = vshuff(v2.h)\n\t" /* shuffle the values for the scatter */
398 "if (q0) vscatter(%1, m0, v1:0.w).h = v2\n\t"
399 : : "r"(half_predicates), "r"(vtcm.vscatter16_32), "r"(region_len),
400 "r"(word_offsets), "r"(half_values_masked)
401 : "r1", "q0", "m0", "v0", "v1", "v2", "memory");
62e93b08
TS
402
403 sync_scatter(vtcm.vscatter16_32);
404}
405
c3679385 406/* gather the elements from the scatter16 buffer using HVX */
62e93b08
TS
407void vector_gather_16(void)
408{
c3679385
TS
409 asm ("m0 = %1\n\t"
410 "v0 = vmem(%2 + #0)\n\t"
411 "{ vtmp.h = vgather(%0, m0, v0.h).h\n\t"
412 " vmem(%3 + #0) = vtmp.new }\n\t"
413 : : "r"(vtcm.vscatter16), "r"(region_len),
414 "r"(half_offsets), "r"(vtcm.vgather16)
415 : "m0", "v0", "memory");
62e93b08 416
c3679385 417 sync_gather(vtcm.vgather16);
62e93b08
TS
418}
419
/*
 * Return the 16 bit pattern used to pre-initialize the destination of a
 * masked gather: the character '?' in both bytes.
 */
static unsigned short gather_16_masked_init(void)
{
    const int fill = '?';

    return (fill << 8) | fill;
}
425
c3679385 426/* masked gather the elements from the scatter16 buffer using HVX */
62e93b08
TS
427void vector_gather_16_masked(void)
428{
c3679385 429 unsigned short init = gather_16_masked_init();
62e93b08 430
c3679385
TS
431 asm ("v0.h = vsplat(%5)\n\t"
432 "vmem(%4 + #0) = v0\n\t" /* initialize the write area */
433 "r1 = #-1\n\t"
434 "v0 = vmem(%0 + #0)\n\t"
435 "q0 = vand(v0, r1)\n\t"
436 "m0 = %2\n\t"
437 "v0 = vmem(%3 + #0)\n\t"
438 "{ if (q0) vtmp.h = vgather(%1, m0, v0.h).h\n\t"
439 " vmem(%4 + #0) = vtmp.new }\n\t"
440 : : "r"(half_predicates), "r"(vtcm.vscatter16), "r"(region_len),
441 "r"(half_offsets), "r"(vtcm.vgather16), "r"(init)
442 : "r1", "q0", "m0", "v0", "memory");
62e93b08 443
c3679385 444 sync_gather(vtcm.vgather16);
62e93b08
TS
445}
446
c3679385 447/* gather the elements from the scatter32 buffer using HVX */
62e93b08
TS
448void vector_gather_32(void)
449{
c3679385
TS
450 HVX_Vector *vgatherlo = (HVX_Vector *)vtcm.vgather32;
451 HVX_Vector *vgatherhi = (HVX_Vector *)&vtcm.vgather32[MATRIX_SIZE / 2];
452 HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
453 HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
454
455 asm ("m0 = %1\n\t"
456 "v0 = vmem(%2 + #0)\n\t"
457 "{ vtmp.w = vgather(%0, m0, v0.w).w\n\t"
458 " vmem(%3 + #0) = vtmp.new }\n\t"
459 : : "r"(vtcm.vscatter32), "r"(region_len),
460 "r"(offsetslo), "r"(vgatherlo)
461 : "m0", "v0", "memory");
462 asm ("m0 = %1\n\t"
463 "v0 = vmem(%2 + #0)\n\t"
464 "{ vtmp.w = vgather(%0, m0, v0.w).w\n\t"
465 " vmem(%3 + #0) = vtmp.new }\n\t"
466 : : "r"(vtcm.vscatter32), "r"(region_len),
467 "r"(offsetshi), "r"(vgatherhi)
468 : "m0", "v0", "memory");
62e93b08 469
c3679385 470 sync_gather(vgatherlo);
62e93b08
TS
471 sync_gather(vgatherhi);
472}
473
/*
 * Return the 32 bit pattern used to pre-initialize the destination of a
 * masked gather: the character '?' in all four bytes.
 */
static unsigned int gather_32_masked_init(void)
{
    unsigned int pattern = 0;

    for (int shift = 0; shift < 32; shift += 8) {
        pattern |= (unsigned int)'?' << shift;
    }
    return pattern;
}
479
c3679385 480/* masked gather the elements from the scatter32 buffer using HVX */
62e93b08
TS
481void vector_gather_32_masked(void)
482{
c3679385
TS
483 unsigned int init = gather_32_masked_init();
484 HVX_Vector *vgatherlo = (HVX_Vector *)vtcm.vgather32;
485 HVX_Vector *vgatherhi = (HVX_Vector *)&vtcm.vgather32[MATRIX_SIZE / 2];
486 HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
487 HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
488 HVX_Vector *predslo = (HVX_Vector *)word_predicates;
489 HVX_Vector *predshi = (HVX_Vector *)&word_predicates[MATRIX_SIZE / 2];
490
491 asm ("v0.h = vsplat(%5)\n\t"
492 "vmem(%4 + #0) = v0\n\t" /* initialize the write area */
493 "r1 = #-1\n\t"
494 "v0 = vmem(%0 + #0)\n\t"
495 "q0 = vand(v0, r1)\n\t"
496 "m0 = %2\n\t"
497 "v0 = vmem(%3 + #0)\n\t"
498 "{ if (q0) vtmp.w = vgather(%1, m0, v0.w).w\n\t"
499 " vmem(%4 + #0) = vtmp.new }\n\t"
500 : : "r"(predslo), "r"(vtcm.vscatter32), "r"(region_len),
501 "r"(offsetslo), "r"(vgatherlo), "r"(init)
502 : "r1", "q0", "m0", "v0", "memory");
503 asm ("v0.h = vsplat(%5)\n\t"
504 "vmem(%4 + #0) = v0\n\t" /* initialize the write area */
505 "r1 = #-1\n\t"
506 "v0 = vmem(%0 + #0)\n\t"
507 "q0 = vand(v0, r1)\n\t"
508 "m0 = %2\n\t"
509 "v0 = vmem(%3 + #0)\n\t"
510 "{ if (q0) vtmp.w = vgather(%1, m0, v0.w).w\n\t"
511 " vmem(%4 + #0) = vtmp.new }\n\t"
512 : : "r"(predshi), "r"(vtcm.vscatter32), "r"(region_len),
513 "r"(offsetshi), "r"(vgatherhi), "r"(init)
514 : "r1", "q0", "m0", "v0", "memory");
62e93b08
TS
515
516 sync_gather(vgatherlo);
517 sync_gather(vgatherhi);
518}
519
c3679385 520/* gather the elements from the scatter16_32 buffer using HVX */
62e93b08
TS
521void vector_gather_16_32(void)
522{
c3679385
TS
523 asm ("m0 = %1\n\t"
524 "v0 = vmem(%2 + #0)\n\t"
525 "v1 = vmem(%2 + #1)\n\t"
526 "{ vtmp.h = vgather(%0, m0, v1:0.w).h\n\t"
527 " vmem(%3 + #0) = vtmp.new }\n\t"
528 "v0 = vmem(%3 + #0)\n\t"
529 "v0.h = vdeal(v0.h)\n\t" /* deal the elements to get the order back */
530 "vmem(%3 + #0) = v0\n\t"
531 : : "r"(vtcm.vscatter16_32), "r"(region_len),
532 "r"(word_offsets), "r"(vtcm.vgather16_32)
533 : "m0", "v0", "v1", "memory");
62e93b08 534
c3679385 535 sync_gather(vtcm.vgather16_32);
62e93b08
TS
536}
537
c3679385 538/* masked gather the elements from the scatter16_32 buffer using HVX */
62e93b08
TS
539void vector_gather_16_32_masked(void)
540{
c3679385
TS
541 unsigned short init = gather_16_masked_init();
542
543 asm ("v0.h = vsplat(%5)\n\t"
544 "vmem(%4 + #0) = v0\n\t" /* initialize the write area */
545 "r1 = #-1\n\t"
546 "v0 = vmem(%0 + #0)\n\t"
547 "v0.h = vshuff(v0.h)\n\t" /* shuffle the predicates */
548 "q0 = vand(v0, r1)\n\t"
549 "m0 = %2\n\t"
550 "v0 = vmem(%3 + #0)\n\t"
551 "v1 = vmem(%3 + #1)\n\t"
552 "{ if (q0) vtmp.h = vgather(%1, m0, v1:0.w).h\n\t"
553 " vmem(%4 + #0) = vtmp.new }\n\t"
554 "v0 = vmem(%4 + #0)\n\t"
555 "v0.h = vdeal(v0.h)\n\t" /* deal the elements to get the order back */
556 "vmem(%4 + #0) = v0\n\t"
557 : : "r"(half_predicates), "r"(vtcm.vscatter16_32), "r"(region_len),
558 "r"(word_offsets), "r"(vtcm.vgather16_32), "r"(init)
559 : "r1", "q0", "m0", "v0", "v1", "memory");
560
561 sync_gather(vtcm.vgather16_32);
62e93b08
TS
562}
563
564static void check_buffer(const char *name, void *c, void *r, size_t size)
565{
566 char *check = (char *)c;
567 char *ref = (char *)r;
568 for (int i = 0; i < size; i++) {
569 if (check[i] != ref[i]) {
570 printf("ERROR %s [%d]: 0x%x (%c) != 0x%x (%c)\n", name, i,
571 check[i], check[i], ref[i], ref[i]);
572 err++;
573 }
574 }
575}
576
577/*
578 * These scalar functions are the C equivalents of the vector functions that
579 * use HVX
580 */
581
582/* scatter the 16 bit elements using C */
583void scalar_scatter_16(unsigned short *vscatter16)
584{
585 for (int i = 0; i < MATRIX_SIZE; ++i) {
586 vscatter16[half_offsets[i] / 2] = half_values[i];
587 }
588}
589
590void check_scatter_16()
591{
592 memset(vscatter16_ref, FILL_CHAR,
593 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
594 scalar_scatter_16(vscatter16_ref);
595 check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
596 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
597}
598
599/* scatter the 16 bit elements using C */
600void scalar_scatter_16_acc(unsigned short *vscatter16)
601{
602 for (int i = 0; i < MATRIX_SIZE; ++i) {
603 vscatter16[half_offsets[i] / 2] += half_values_acc[i];
604 }
605}
606
c3679385 607/* scatter-accumulate the 16 bit elements using C */
62e93b08
TS
608void check_scatter_16_acc()
609{
610 memset(vscatter16_ref, FILL_CHAR,
611 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
612 scalar_scatter_16(vscatter16_ref);
613 scalar_scatter_16_acc(vscatter16_ref);
614 check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
615 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
616}
617
c3679385 618/* masked scatter the 16 bit elements using C */
62e93b08
TS
619void scalar_scatter_16_masked(unsigned short *vscatter16)
620{
621 for (int i = 0; i < MATRIX_SIZE; i++) {
622 if (half_predicates[i]) {
623 vscatter16[half_offsets[i] / 2] = half_values_masked[i];
624 }
625 }
626
627}
628
629void check_scatter_16_masked()
630{
631 memset(vscatter16_ref, FILL_CHAR,
632 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
633 scalar_scatter_16(vscatter16_ref);
634 scalar_scatter_16_acc(vscatter16_ref);
635 scalar_scatter_16_masked(vscatter16_ref);
636 check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
637 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
638}
639
640/* scatter the 32 bit elements using C */
641void scalar_scatter_32(unsigned int *vscatter32)
642{
643 for (int i = 0; i < MATRIX_SIZE; ++i) {
644 vscatter32[word_offsets[i] / 4] = word_values[i];
645 }
646}
647
648void check_scatter_32()
649{
650 memset(vscatter32_ref, FILL_CHAR,
651 SCATTER_BUFFER_SIZE * sizeof(unsigned int));
652 scalar_scatter_32(vscatter32_ref);
653 check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
654 SCATTER_BUFFER_SIZE * sizeof(unsigned int));
655}
656
c3679385 657/* scatter-accumulate the 32 bit elements using C */
62e93b08
TS
658void scalar_scatter_32_acc(unsigned int *vscatter32)
659{
660 for (int i = 0; i < MATRIX_SIZE; ++i) {
661 vscatter32[word_offsets[i] / 4] += word_values_acc[i];
662 }
663}
664
665void check_scatter_32_acc()
666{
667 memset(vscatter32_ref, FILL_CHAR,
668 SCATTER_BUFFER_SIZE * sizeof(unsigned int));
669 scalar_scatter_32(vscatter32_ref);
670 scalar_scatter_32_acc(vscatter32_ref);
671 check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
672 SCATTER_BUFFER_SIZE * sizeof(unsigned int));
673}
674
c3679385 675/* masked scatter the 32 bit elements using C */
62e93b08
TS
676void scalar_scatter_32_masked(unsigned int *vscatter32)
677{
678 for (int i = 0; i < MATRIX_SIZE; i++) {
679 if (word_predicates[i]) {
680 vscatter32[word_offsets[i] / 4] = word_values_masked[i];
681 }
682 }
683}
684
685void check_scatter_32_masked()
686{
687 memset(vscatter32_ref, FILL_CHAR,
688 SCATTER_BUFFER_SIZE * sizeof(unsigned int));
689 scalar_scatter_32(vscatter32_ref);
690 scalar_scatter_32_acc(vscatter32_ref);
691 scalar_scatter_32_masked(vscatter32_ref);
692 check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
693 SCATTER_BUFFER_SIZE * sizeof(unsigned int));
694}
695
c3679385 696/* scatter the 16 bit elements with 32 bit offsets using C */
62e93b08
TS
697void scalar_scatter_16_32(unsigned short *vscatter16_32)
698{
699 for (int i = 0; i < MATRIX_SIZE; ++i) {
700 vscatter16_32[word_offsets[i] / 2] = half_values[i];
701 }
702}
703
704void check_scatter_16_32()
705{
706 memset(vscatter16_32_ref, FILL_CHAR,
707 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
708 scalar_scatter_16_32(vscatter16_32_ref);
709 check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
710 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
711}
712
c3679385 713/* scatter-accumulate the 16 bit elements with 32 bit offsets using C */
62e93b08
TS
714void scalar_scatter_16_32_acc(unsigned short *vscatter16_32)
715{
716 for (int i = 0; i < MATRIX_SIZE; ++i) {
717 vscatter16_32[word_offsets[i] / 2] += half_values_acc[i];
718 }
719}
720
721void check_scatter_16_32_acc()
722{
723 memset(vscatter16_32_ref, FILL_CHAR,
724 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
725 scalar_scatter_16_32(vscatter16_32_ref);
726 scalar_scatter_16_32_acc(vscatter16_32_ref);
727 check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
728 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
729}
730
c3679385 731/* masked scatter the 16 bit elements with 32 bit offsets using C */
62e93b08
TS
732void scalar_scatter_16_32_masked(unsigned short *vscatter16_32)
733{
734 for (int i = 0; i < MATRIX_SIZE; i++) {
735 if (half_predicates[i]) {
736 vscatter16_32[word_offsets[i] / 2] = half_values_masked[i];
737 }
738 }
739}
740
741void check_scatter_16_32_masked()
742{
743 memset(vscatter16_32_ref, FILL_CHAR,
744 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
745 scalar_scatter_16_32(vscatter16_32_ref);
746 scalar_scatter_16_32_acc(vscatter16_32_ref);
747 scalar_scatter_16_32_masked(vscatter16_32_ref);
748 check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
749 SCATTER_BUFFER_SIZE * sizeof(unsigned short));
750}
751
752/* gather the elements from the scatter buffer using C */
753void scalar_gather_16(unsigned short *vgather16)
754{
755 for (int i = 0; i < MATRIX_SIZE; ++i) {
756 vgather16[i] = vtcm.vscatter16[half_offsets[i] / 2];
757 }
758}
759
760void check_gather_16()
761{
762 memset(vgather16_ref, 0, MATRIX_SIZE * sizeof(unsigned short));
763 scalar_gather_16(vgather16_ref);
764 check_buffer(__func__, vtcm.vgather16, vgather16_ref,
765 MATRIX_SIZE * sizeof(unsigned short));
766}
767
c3679385 768/* masked gather the elements from the scatter buffer using C */
62e93b08
TS
769void scalar_gather_16_masked(unsigned short *vgather16)
770{
771 for (int i = 0; i < MATRIX_SIZE; ++i) {
772 if (half_predicates[i]) {
773 vgather16[i] = vtcm.vscatter16[half_offsets[i] / 2];
774 }
775 }
776}
777
778void check_gather_16_masked()
779{
780 memset(vgather16_ref, gather_16_masked_init(),
781 MATRIX_SIZE * sizeof(unsigned short));
782 scalar_gather_16_masked(vgather16_ref);
783 check_buffer(__func__, vtcm.vgather16, vgather16_ref,
784 MATRIX_SIZE * sizeof(unsigned short));
785}
786
c3679385 787/* gather the elements from the scatter32 buffer using C */
62e93b08
TS
788void scalar_gather_32(unsigned int *vgather32)
789{
790 for (int i = 0; i < MATRIX_SIZE; ++i) {
791 vgather32[i] = vtcm.vscatter32[word_offsets[i] / 4];
792 }
793}
794
795void check_gather_32(void)
796{
797 memset(vgather32_ref, 0, MATRIX_SIZE * sizeof(unsigned int));
798 scalar_gather_32(vgather32_ref);
799 check_buffer(__func__, vtcm.vgather32, vgather32_ref,
800 MATRIX_SIZE * sizeof(unsigned int));
801}
802
c3679385 803/* masked gather the elements from the scatter32 buffer using C */
62e93b08
TS
804void scalar_gather_32_masked(unsigned int *vgather32)
805{
806 for (int i = 0; i < MATRIX_SIZE; ++i) {
807 if (word_predicates[i]) {
808 vgather32[i] = vtcm.vscatter32[word_offsets[i] / 4];
809 }
810 }
811}
812
62e93b08
TS
813void check_gather_32_masked(void)
814{
815 memset(vgather32_ref, gather_32_masked_init(),
816 MATRIX_SIZE * sizeof(unsigned int));
817 scalar_gather_32_masked(vgather32_ref);
818 check_buffer(__func__, vtcm.vgather32,
819 vgather32_ref, MATRIX_SIZE * sizeof(unsigned int));
820}
821
c3679385 822/* gather the elements from the scatter16_32 buffer using C */
62e93b08
TS
823void scalar_gather_16_32(unsigned short *vgather16_32)
824{
825 for (int i = 0; i < MATRIX_SIZE; ++i) {
826 vgather16_32[i] = vtcm.vscatter16_32[word_offsets[i] / 2];
827 }
828}
829
830void check_gather_16_32(void)
831{
832 memset(vgather16_32_ref, 0, MATRIX_SIZE * sizeof(unsigned short));
833 scalar_gather_16_32(vgather16_32_ref);
834 check_buffer(__func__, vtcm.vgather16_32, vgather16_32_ref,
835 MATRIX_SIZE * sizeof(unsigned short));
836}
837
c3679385 838/* masked gather the elements from the scatter16_32 buffer using C */
62e93b08
TS
839void scalar_gather_16_32_masked(unsigned short *vgather16_32)
840{
841 for (int i = 0; i < MATRIX_SIZE; ++i) {
842 if (half_predicates[i]) {
843 vgather16_32[i] = vtcm.vscatter16_32[word_offsets[i] / 2];
844 }
845 }
846
847}
848
849void check_gather_16_32_masked(void)
850{
851 memset(vgather16_32_ref, gather_16_masked_init(),
852 MATRIX_SIZE * sizeof(unsigned short));
853 scalar_gather_16_32_masked(vgather16_32_ref);
854 check_buffer(__func__, vtcm.vgather16_32, vgather16_32_ref,
855 MATRIX_SIZE * sizeof(unsigned short));
856}
857
858/* print scatter16 buffer */
859void print_scatter16_buffer(void)
860{
861 if (PRINT_DATA) {
862 printf("\n\nPrinting the 16 bit scatter buffer");
863
864 for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
865 if ((i % MATRIX_SIZE) == 0) {
866 printf("\n");
867 }
868 for (int j = 0; j < 2; j++) {
869 printf("%c", (char)((vtcm.vscatter16[i] >> j * 8) & 0xff));
870 }
871 printf(" ");
872 }
873 printf("\n");
874 }
875}
876
877/* print the gather 16 buffer */
878void print_gather_result_16(void)
879{
880 if (PRINT_DATA) {
881 printf("\n\nPrinting the 16 bit gather result\n");
882
883 for (int i = 0; i < MATRIX_SIZE; i++) {
884 for (int j = 0; j < 2; j++) {
885 printf("%c", (char)((vtcm.vgather16[i] >> j * 8) & 0xff));
886 }
887 printf(" ");
888 }
889 printf("\n");
890 }
891}
892
893/* print the scatter32 buffer */
894void print_scatter32_buffer(void)
895{
896 if (PRINT_DATA) {
897 printf("\n\nPrinting the 32 bit scatter buffer");
898
899 for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
900 if ((i % MATRIX_SIZE) == 0) {
901 printf("\n");
902 }
903 for (int j = 0; j < 4; j++) {
904 printf("%c", (char)((vtcm.vscatter32[i] >> j * 8) & 0xff));
905 }
906 printf(" ");
907 }
908 printf("\n");
909 }
910}
911
912/* print the gather 32 buffer */
913void print_gather_result_32(void)
914{
915 if (PRINT_DATA) {
916 printf("\n\nPrinting the 32 bit gather result\n");
917
918 for (int i = 0; i < MATRIX_SIZE; i++) {
919 for (int j = 0; j < 4; j++) {
920 printf("%c", (char)((vtcm.vgather32[i] >> j * 8) & 0xff));
921 }
922 printf(" ");
923 }
924 printf("\n");
925 }
926}
927
928/* print the scatter16_32 buffer */
929void print_scatter16_32_buffer(void)
930{
931 if (PRINT_DATA) {
932 printf("\n\nPrinting the 16_32 bit scatter buffer");
933
934 for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
935 if ((i % MATRIX_SIZE) == 0) {
936 printf("\n");
937 }
938 for (int j = 0; j < 2; j++) {
939 printf("%c",
940 (unsigned char)((vtcm.vscatter16_32[i] >> j * 8) & 0xff));
941 }
942 printf(" ");
943 }
944 printf("\n");
945 }
946}
947
948/* print the gather 16_32 buffer */
949void print_gather_result_16_32(void)
950{
951 if (PRINT_DATA) {
952 printf("\n\nPrinting the 16_32 bit gather result\n");
953
954 for (int i = 0; i < MATRIX_SIZE; i++) {
955 for (int j = 0; j < 2; j++) {
956 printf("%c",
957 (unsigned char)((vtcm.vgather16_32[i] >> j * 8) & 0xff));
958 }
959 printf(" ");
960 }
961 printf("\n");
962 }
963}
964
965int main()
966{
967 prefill_vtcm_scratch();
968
969 /* 16 bit elements with 16 bit offsets */
970 create_offsets_values_preds_16();
971
972 vector_scatter_16();
973 print_scatter16_buffer();
974 check_scatter_16();
975
976 vector_gather_16();
977 print_gather_result_16();
978 check_gather_16();
979
980 vector_gather_16_masked();
981 print_gather_result_16();
982 check_gather_16_masked();
983
984 vector_scatter_16_acc();
985 print_scatter16_buffer();
986 check_scatter_16_acc();
987
988 vector_scatter_16_masked();
989 print_scatter16_buffer();
990 check_scatter_16_masked();
991
992 /* 32 bit elements with 32 bit offsets */
993 create_offsets_values_preds_32();
994
995 vector_scatter_32();
996 print_scatter32_buffer();
997 check_scatter_32();
998
999 vector_gather_32();
1000 print_gather_result_32();
1001 check_gather_32();
1002
1003 vector_gather_32_masked();
1004 print_gather_result_32();
1005 check_gather_32_masked();
1006
1007 vector_scatter_32_acc();
1008 print_scatter32_buffer();
1009 check_scatter_32_acc();
1010
1011 vector_scatter_32_masked();
1012 print_scatter32_buffer();
1013 check_scatter_32_masked();
1014
1015 /* 16 bit elements with 32 bit offsets */
1016 create_offsets_values_preds_16_32();
1017
1018 vector_scatter_16_32();
1019 print_scatter16_32_buffer();
1020 check_scatter_16_32();
1021
1022 vector_gather_16_32();
1023 print_gather_result_16_32();
1024 check_gather_16_32();
1025
1026 vector_gather_16_32_masked();
1027 print_gather_result_16_32();
1028 check_gather_16_32_masked();
1029
1030 vector_scatter_16_32_acc();
1031 print_scatter16_32_buffer();
1032 check_scatter_16_32_acc();
1033
1034 vector_scatter_16_32_masked();
1035 print_scatter16_32_buffer();
1036 check_scatter_16_32_masked();
1037
1038 puts(err ? "FAIL" : "PASS");
1039 return err;
1040}