]>
Commit | Line | Data |
---|---|---|
62e93b08 | 1 | /* |
c3679385 | 2 | * Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved. |
62e93b08 TS |
3 | * |
4 | * This program is free software; you can redistribute it and/or modify | |
5 | * it under the terms of the GNU General Public License as published by | |
6 | * the Free Software Foundation; either version 2 of the License, or | |
7 | * (at your option) any later version. | |
8 | * | |
9 | * This program is distributed in the hope that it will be useful, | |
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
12 | * GNU General Public License for more details. | |
13 | * | |
14 | * You should have received a copy of the GNU General Public License | |
15 | * along with this program; if not, see <http://www.gnu.org/licenses/>. | |
16 | */ | |
17 | ||
18 | /* | |
19 | * This example tests the HVX scatter/gather instructions | |
20 | * | |
21 | * See section 5.13 of the V68 HVX Programmer's Reference | |
22 | * | |
23 | * There are 3 main classes operations | |
24 | * _16 16-bit elements and 16-bit offsets | |
25 | * _32 32-bit elements and 32-bit offsets | |
26 | * _16_32 16-bit elements and 32-bit offsets | |
27 | * | |
28 | * There are also masked and accumulate versions | |
29 | */ | |
30 | ||
31 | #include <stdio.h> | |
32 | #include <string.h> | |
33 | #include <stdlib.h> | |
34 | #include <inttypes.h> | |
35 | ||
/* 128-byte HVX vector register image, forced to 128-byte alignment */
typedef long HVX_Vector __attribute__((__vector_size__(128)))
    __attribute__((aligned(128)));
/* pair of HVX vectors (256 bytes), used for wide (v1:0) operands */
typedef long HVX_VectorPair __attribute__((__vector_size__(256)))
    __attribute__((aligned(128)));
/* vector predicate, stored as a full vector image */
typedef long HVX_VectorPred __attribute__((__vector_size__(128)))
    __attribute__((aligned(128)));
42 | ||
/* error counter, incremented by check_buffer() for every mismatched byte */
int err;

/* define the number of rows/cols in a square matrix */
#define MATRIX_SIZE 64

/* define the size of the scatter buffer */
#define SCATTER_BUFFER_SIZE (MATRIX_SIZE * MATRIX_SIZE)

/* fake vtcm - put buffers together and force alignment */
static struct {
    unsigned short vscatter16[SCATTER_BUFFER_SIZE];
    unsigned short vgather16[MATRIX_SIZE];
    unsigned int vscatter32[SCATTER_BUFFER_SIZE];
    unsigned int vgather32[MATRIX_SIZE];
    unsigned short vscatter16_32[SCATTER_BUFFER_SIZE];
    unsigned short vgather16_32[MATRIX_SIZE];
} vtcm __attribute__((aligned(0x10000)));

/* declare the arrays of reference values (filled by the scalar versions) */
unsigned short vscatter16_ref[SCATTER_BUFFER_SIZE];
unsigned short vgather16_ref[MATRIX_SIZE];
unsigned int vscatter32_ref[SCATTER_BUFFER_SIZE];
unsigned int vgather32_ref[MATRIX_SIZE];
unsigned short vscatter16_32_ref[SCATTER_BUFFER_SIZE];
unsigned short vgather16_32_ref[MATRIX_SIZE];

/* declare the arrays of byte offsets (vector-aligned for vmem loads) */
unsigned short half_offsets[MATRIX_SIZE] __attribute__((aligned(128)));
unsigned int word_offsets[MATRIX_SIZE] __attribute__((aligned(128)));

/* declare the arrays of values */
unsigned short half_values[MATRIX_SIZE] __attribute__((aligned(128)));
unsigned short half_values_acc[MATRIX_SIZE] __attribute__((aligned(128)));
unsigned short half_values_masked[MATRIX_SIZE] __attribute__((aligned(128)));
unsigned int word_values[MATRIX_SIZE] __attribute__((aligned(128)));
unsigned int word_values_acc[MATRIX_SIZE] __attribute__((aligned(128)));
unsigned int word_values_masked[MATRIX_SIZE] __attribute__((aligned(128)));

/* declare the arrays of predicates (all-ones = lane enabled) */
unsigned short half_predicates[MATRIX_SIZE] __attribute__((aligned(128)));
unsigned int word_predicates[MATRIX_SIZE] __attribute__((aligned(128)));

/* region length for scatter/gather; make this big enough for all the ops */
const size_t region_len = sizeof(vtcm);

/* optionally add sync instructions */
#define SYNC_VECTOR 1
90 | ||
/* wait for a preceding vector scatter to commit before reading the region */
static void sync_scatter(void *addr)
{
#if SYNC_VECTOR
    /*
     * Do the scatter release followed by a dummy load to complete the
     * synchronization. Normally the dummy load would be deferred as
     * long as possible to minimize stalls.
     */
    asm volatile("vmem(%0 + #0):scatter_release\n" : : "r"(addr));
    /* use volatile to force the load */
    volatile HVX_Vector vDummy = *(HVX_Vector *)addr; vDummy = vDummy;
#endif
}
104 | ||
/* wait for a preceding vector gather to land before reading the result */
static void sync_gather(void *addr)
{
#if SYNC_VECTOR
    /* use volatile to force the load */
    volatile HVX_Vector vDummy = *(HVX_Vector *)addr; vDummy = vDummy;
#endif
}
112 | ||
/* optionally print the results */
#define PRINT_DATA 0

/* byte value used to pre-fill the fake-VTCM scratch area */
#define FILL_CHAR '.'

/* fill vtcm scratch with FILL_CHAR so untouched bytes are recognizable */
void prefill_vtcm_scratch(void)
{
    memset(&vtcm, FILL_CHAR, sizeof(vtcm));
}
123 | ||
124 | /* create byte offsets to be a diagonal of the matrix with 16 bit elements */ | |
125 | void create_offsets_values_preds_16(void) | |
126 | { | |
127 | unsigned short half_element = 0; | |
128 | unsigned short half_element_masked = 0; | |
129 | char letter = 'A'; | |
130 | char letter_masked = '@'; | |
131 | ||
132 | for (int i = 0; i < MATRIX_SIZE; i++) { | |
133 | half_offsets[i] = i * (2 * MATRIX_SIZE + 2); | |
134 | ||
135 | half_element = 0; | |
136 | half_element_masked = 0; | |
137 | for (int j = 0; j < 2; j++) { | |
138 | half_element |= letter << j * 8; | |
139 | half_element_masked |= letter_masked << j * 8; | |
140 | } | |
141 | ||
142 | half_values[i] = half_element; | |
143 | half_values_acc[i] = ((i % 10) << 8) + (i % 10); | |
144 | half_values_masked[i] = half_element_masked; | |
145 | ||
146 | letter++; | |
147 | /* reset to 'A' */ | |
148 | if (letter == 'M') { | |
149 | letter = 'A'; | |
150 | } | |
151 | ||
152 | half_predicates[i] = (i % 3 == 0 || i % 5 == 0) ? ~0 : 0; | |
153 | } | |
154 | } | |
155 | ||
156 | /* create byte offsets to be a diagonal of the matrix with 32 bit elements */ | |
157 | void create_offsets_values_preds_32(void) | |
158 | { | |
159 | unsigned int word_element = 0; | |
160 | unsigned int word_element_masked = 0; | |
161 | char letter = 'A'; | |
162 | char letter_masked = '&'; | |
163 | ||
164 | for (int i = 0; i < MATRIX_SIZE; i++) { | |
165 | word_offsets[i] = i * (4 * MATRIX_SIZE + 4); | |
166 | ||
167 | word_element = 0; | |
168 | word_element_masked = 0; | |
169 | for (int j = 0; j < 4; j++) { | |
170 | word_element |= letter << j * 8; | |
171 | word_element_masked |= letter_masked << j * 8; | |
172 | } | |
173 | ||
174 | word_values[i] = word_element; | |
175 | word_values_acc[i] = ((i % 10) << 8) + (i % 10); | |
176 | word_values_masked[i] = word_element_masked; | |
177 | ||
178 | letter++; | |
179 | /* reset to 'A' */ | |
180 | if (letter == 'M') { | |
181 | letter = 'A'; | |
182 | } | |
183 | ||
184 | word_predicates[i] = (i % 4 == 0 || i % 7 == 0) ? ~0 : 0; | |
185 | } | |
186 | } | |
187 | ||
188 | /* | |
189 | * create byte offsets to be a diagonal of the matrix with 16 bit elements | |
190 | * and 32 bit offsets | |
191 | */ | |
192 | void create_offsets_values_preds_16_32(void) | |
193 | { | |
194 | unsigned short half_element = 0; | |
195 | unsigned short half_element_masked = 0; | |
196 | char letter = 'D'; | |
197 | char letter_masked = '$'; | |
198 | ||
199 | for (int i = 0; i < MATRIX_SIZE; i++) { | |
200 | word_offsets[i] = i * (2 * MATRIX_SIZE + 2); | |
201 | ||
202 | half_element = 0; | |
203 | half_element_masked = 0; | |
204 | for (int j = 0; j < 2; j++) { | |
205 | half_element |= letter << j * 8; | |
206 | half_element_masked |= letter_masked << j * 8; | |
207 | } | |
208 | ||
209 | half_values[i] = half_element; | |
210 | half_values_acc[i] = ((i % 10) << 8) + (i % 10); | |
211 | half_values_masked[i] = half_element_masked; | |
212 | ||
213 | letter++; | |
214 | /* reset to 'A' */ | |
215 | if (letter == 'P') { | |
216 | letter = 'D'; | |
217 | } | |
218 | ||
219 | half_predicates[i] = (i % 2 == 0 || i % 13 == 0) ? ~0 : 0; | |
220 | } | |
221 | } | |
222 | ||
/* scatter the 16 bit elements using HVX */
void vector_scatter_16(void)
{
    /*
     * m0 = region length, v0 = halfword offsets, v1 = values;
     * each v1 halfword is stored at vscatter16 + corresponding offset.
     */
    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "v1 = vmem(%3 + #0)\n\t"
         "vscatter(%0, m0, v0.h).h = v1\n\t"
         : : "r"(vtcm.vscatter16), "r"(region_len),
           "r"(half_offsets), "r"(half_values)
         : "m0", "v0", "v1", "memory");

    sync_scatter(vtcm.vscatter16);
}
236 | ||
/* scatter-accumulate the 16 bit elements using HVX */
void vector_scatter_16_acc(void)
{
    /* same as vector_scatter_16 but "+=" adds into the target elements */
    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "v1 = vmem(%3 + #0)\n\t"
         "vscatter(%0, m0, v0.h).h += v1\n\t"
         : : "r"(vtcm.vscatter16), "r"(region_len),
           "r"(half_offsets), "r"(half_values_acc)
         : "m0", "v0", "v1", "memory");

    sync_scatter(vtcm.vscatter16);
}
250 | ||
c3679385 | 251 | /* masked scatter the 16 bit elements using HVX */ |
62e93b08 TS |
252 | void vector_scatter_16_masked(void) |
253 | { | |
c3679385 TS |
254 | asm ("r1 = #-1\n\t" |
255 | "v0 = vmem(%0 + #0)\n\t" | |
256 | "q0 = vand(v0, r1)\n\t" | |
257 | "m0 = %2\n\t" | |
258 | "v0 = vmem(%3 + #0)\n\t" | |
259 | "v1 = vmem(%4 + #0)\n\t" | |
260 | "if (q0) vscatter(%1, m0, v0.h).h = v1\n\t" | |
261 | : : "r"(half_predicates), "r"(vtcm.vscatter16), "r"(region_len), | |
262 | "r"(half_offsets), "r"(half_values_masked) | |
263 | : "r1", "q0", "m0", "q0", "v0", "v1", "memory"); | |
62e93b08 TS |
264 | |
265 | sync_scatter(vtcm.vscatter16); | |
266 | } | |
267 | ||
/* scatter the 32 bit elements using HVX */
void vector_scatter_32(void)
{
    /* 64 words need two 128-byte vectors, so scatter in two halves */
    HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
    HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
    HVX_Vector *valueslo = (HVX_Vector *)word_values;
    HVX_Vector *valueshi = (HVX_Vector *)&word_values[MATRIX_SIZE / 2];

    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "v1 = vmem(%3 + #0)\n\t"
         "vscatter(%0, m0, v0.w).w = v1\n\t"
         : : "r"(vtcm.vscatter32), "r"(region_len),
           "r"(offsetslo), "r"(valueslo)
         : "m0", "v0", "v1", "memory");
    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "v1 = vmem(%3 + #0)\n\t"
         "vscatter(%0, m0, v0.w).w = v1\n\t"
         : : "r"(vtcm.vscatter32), "r"(region_len),
           "r"(offsetshi), "r"(valueshi)
         : "m0", "v0", "v1", "memory");

    sync_scatter(vtcm.vscatter32);
}
293 | ||
/* scatter-accumulate the 32 bit elements using HVX */
void vector_scatter_32_acc(void)
{
    /* 64 words need two 128-byte vectors, so accumulate in two halves */
    HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
    HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
    HVX_Vector *valueslo = (HVX_Vector *)word_values_acc;
    HVX_Vector *valueshi = (HVX_Vector *)&word_values_acc[MATRIX_SIZE / 2];

    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "v1 = vmem(%3 + #0)\n\t"
         "vscatter(%0, m0, v0.w).w += v1\n\t"
         : : "r"(vtcm.vscatter32), "r"(region_len),
           "r"(offsetslo), "r"(valueslo)
         : "m0", "v0", "v1", "memory");
    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "v1 = vmem(%3 + #0)\n\t"
         "vscatter(%0, m0, v0.w).w += v1\n\t"
         : : "r"(vtcm.vscatter32), "r"(region_len),
           "r"(offsetshi), "r"(valueshi)
         : "m0", "v0", "v1", "memory");

    sync_scatter(vtcm.vscatter32);
}
319 | ||
c3679385 | 320 | /* masked scatter the 32 bit elements using HVX */ |
62e93b08 TS |
321 | void vector_scatter_32_masked(void) |
322 | { | |
c3679385 TS |
323 | HVX_Vector *offsetslo = (HVX_Vector *)word_offsets; |
324 | HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2]; | |
325 | HVX_Vector *valueslo = (HVX_Vector *)word_values_masked; | |
326 | HVX_Vector *valueshi = (HVX_Vector *)&word_values_masked[MATRIX_SIZE / 2]; | |
327 | HVX_Vector *predslo = (HVX_Vector *)word_predicates; | |
328 | HVX_Vector *predshi = (HVX_Vector *)&word_predicates[MATRIX_SIZE / 2]; | |
329 | ||
330 | asm ("r1 = #-1\n\t" | |
331 | "v0 = vmem(%0 + #0)\n\t" | |
332 | "q0 = vand(v0, r1)\n\t" | |
333 | "m0 = %2\n\t" | |
334 | "v0 = vmem(%3 + #0)\n\t" | |
335 | "v1 = vmem(%4 + #0)\n\t" | |
336 | "if (q0) vscatter(%1, m0, v0.w).w = v1\n\t" | |
337 | : : "r"(predslo), "r"(vtcm.vscatter32), "r"(region_len), | |
338 | "r"(offsetslo), "r"(valueslo) | |
339 | : "r1", "q0", "m0", "q0", "v0", "v1", "memory"); | |
340 | asm ("r1 = #-1\n\t" | |
341 | "v0 = vmem(%0 + #0)\n\t" | |
342 | "q0 = vand(v0, r1)\n\t" | |
343 | "m0 = %2\n\t" | |
344 | "v0 = vmem(%3 + #0)\n\t" | |
345 | "v1 = vmem(%4 + #0)\n\t" | |
346 | "if (q0) vscatter(%1, m0, v0.w).w = v1\n\t" | |
347 | : : "r"(predshi), "r"(vtcm.vscatter32), "r"(region_len), | |
348 | "r"(offsetshi), "r"(valueshi) | |
349 | : "r1", "q0", "m0", "q0", "v0", "v1", "memory"); | |
62e93b08 | 350 | |
c3679385 | 351 | sync_scatter(vtcm.vscatter32); |
62e93b08 TS |
352 | } |
353 | ||
/* scatter the 16 bit elements with 32 bit offsets using HVX */
void vector_scatter_16_32(void)
{
    /* offsets occupy a vector pair (v1:0); values must be pre-shuffled */
    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "v1 = vmem(%2 + #1)\n\t"
         "v2 = vmem(%3 + #0)\n\t"
         "v2.h = vshuff(v2.h)\n\t" /* shuffle the values for the scatter */
         "vscatter(%0, m0, v1:0.w).h = v2\n\t"
         : : "r"(vtcm.vscatter16_32), "r"(region_len),
           "r"(word_offsets), "r"(half_values)
         : "m0", "v0", "v1", "v2", "memory");

    sync_scatter(vtcm.vscatter16_32);
}
369 | ||
c3679385 | 370 | /* scatter-accumulate the 16 bit elements with 32 bit offsets using HVX */ |
62e93b08 TS |
371 | void vector_scatter_16_32_acc(void) |
372 | { | |
c3679385 TS |
373 | asm ("m0 = %1\n\t" |
374 | "v0 = vmem(%2 + #0)\n\t" | |
375 | "v1 = vmem(%2 + #1)\n\t" | |
376 | "v2 = vmem(%3 + #0)\n\t" \ | |
377 | "v2.h = vshuff(v2.h)\n\t" /* shuffle the values for the scatter */ | |
378 | "vscatter(%0, m0, v1:0.w).h += v2\n\t" | |
379 | : : "r"(vtcm.vscatter16_32), "r"(region_len), | |
380 | "r"(word_offsets), "r"(half_values_acc) | |
381 | : "m0", "v0", "v1", "v2", "memory"); | |
62e93b08 TS |
382 | |
383 | sync_scatter(vtcm.vscatter16_32); | |
384 | } | |
385 | ||
c3679385 | 386 | /* masked scatter the 16 bit elements with 32 bit offsets using HVX */ |
62e93b08 TS |
387 | void vector_scatter_16_32_masked(void) |
388 | { | |
c3679385 TS |
389 | asm ("r1 = #-1\n\t" |
390 | "v0 = vmem(%0 + #0)\n\t" | |
391 | "v0.h = vshuff(v0.h)\n\t" /* shuffle the predicates */ | |
392 | "q0 = vand(v0, r1)\n\t" | |
393 | "m0 = %2\n\t" | |
394 | "v0 = vmem(%3 + #0)\n\t" | |
395 | "v1 = vmem(%3 + #1)\n\t" | |
396 | "v2 = vmem(%4 + #0)\n\t" \ | |
397 | "v2.h = vshuff(v2.h)\n\t" /* shuffle the values for the scatter */ | |
398 | "if (q0) vscatter(%1, m0, v1:0.w).h = v2\n\t" | |
399 | : : "r"(half_predicates), "r"(vtcm.vscatter16_32), "r"(region_len), | |
400 | "r"(word_offsets), "r"(half_values_masked) | |
401 | : "r1", "q0", "m0", "v0", "v1", "v2", "memory"); | |
62e93b08 TS |
402 | |
403 | sync_scatter(vtcm.vscatter16_32); | |
404 | } | |
405 | ||
/* gather the elements from the scatter16 buffer using HVX */
void vector_gather_16(void)
{
    /* vgather writes through vtmp.new directly into vtcm.vgather16 */
    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "{ vtmp.h = vgather(%0, m0, v0.h).h\n\t"
         "  vmem(%3 + #0) = vtmp.new }\n\t"
         : : "r"(vtcm.vscatter16), "r"(region_len),
           "r"(half_offsets), "r"(vtcm.vgather16)
         : "m0", "v0", "memory");

    sync_gather(vtcm.vgather16);
}
419 | ||
/* halfword fill pattern ("??") used to pre-init masked-gather results */
static unsigned short gather_16_masked_init(void)
{
    const unsigned char fill = '?';
    return (unsigned short)((fill << 8) | fill);
}
425 | ||
/* masked gather the elements from the scatter16 buffer using HVX */
void vector_gather_16_masked(void)
{
    /* pre-fill the destination so unmasked lanes show the init pattern */
    unsigned short init = gather_16_masked_init();

    asm ("v0.h = vsplat(%5)\n\t"
         "vmem(%4 + #0) = v0\n\t" /* initialize the write area */
         "r1 = #-1\n\t"
         "v0 = vmem(%0 + #0)\n\t"
         "q0 = vand(v0, r1)\n\t"
         "m0 = %2\n\t"
         "v0 = vmem(%3 + #0)\n\t"
         "{ if (q0) vtmp.h = vgather(%1, m0, v0.h).h\n\t"
         "  vmem(%4 + #0) = vtmp.new }\n\t"
         : : "r"(half_predicates), "r"(vtcm.vscatter16), "r"(region_len),
           "r"(half_offsets), "r"(vtcm.vgather16), "r"(init)
         : "r1", "q0", "m0", "v0", "memory");

    sync_gather(vtcm.vgather16);
}
446 | ||
/* gather the elements from the scatter32 buffer using HVX */
void vector_gather_32(void)
{
    /* 64 words span two 128-byte vectors, so gather in two halves */
    HVX_Vector *vgatherlo = (HVX_Vector *)vtcm.vgather32;
    HVX_Vector *vgatherhi = (HVX_Vector *)&vtcm.vgather32[MATRIX_SIZE / 2];
    HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
    HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];

    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "{ vtmp.w = vgather(%0, m0, v0.w).w\n\t"
         "  vmem(%3 + #0) = vtmp.new }\n\t"
         : : "r"(vtcm.vscatter32), "r"(region_len),
           "r"(offsetslo), "r"(vgatherlo)
         : "m0", "v0", "memory");
    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "{ vtmp.w = vgather(%0, m0, v0.w).w\n\t"
         "  vmem(%3 + #0) = vtmp.new }\n\t"
         : : "r"(vtcm.vscatter32), "r"(region_len),
           "r"(offsetshi), "r"(vgatherhi)
         : "m0", "v0", "memory");

    sync_gather(vgatherlo);
    sync_gather(vgatherhi);
}
473 | ||
/* word fill pattern ("????") used to pre-init masked-gather results */
static unsigned int gather_32_masked_init(void)
{
    const unsigned int fill = '?';
    return fill | (fill << 8) | (fill << 16) | (fill << 24);
}
479 | ||
/* masked gather the elements from the scatter32 buffer using HVX */
void vector_gather_32_masked(void)
{
    /* pre-fill the destination so unmasked lanes show the init pattern */
    unsigned int init = gather_32_masked_init();
    /* 64 words span two 128-byte vectors, so gather in two halves */
    HVX_Vector *vgatherlo = (HVX_Vector *)vtcm.vgather32;
    HVX_Vector *vgatherhi = (HVX_Vector *)&vtcm.vgather32[MATRIX_SIZE / 2];
    HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
    HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
    HVX_Vector *predslo = (HVX_Vector *)word_predicates;
    HVX_Vector *predshi = (HVX_Vector *)&word_predicates[MATRIX_SIZE / 2];

    asm ("v0.h = vsplat(%5)\n\t"
         "vmem(%4 + #0) = v0\n\t" /* initialize the write area */
         "r1 = #-1\n\t"
         "v0 = vmem(%0 + #0)\n\t"
         "q0 = vand(v0, r1)\n\t"
         "m0 = %2\n\t"
         "v0 = vmem(%3 + #0)\n\t"
         "{ if (q0) vtmp.w = vgather(%1, m0, v0.w).w\n\t"
         "  vmem(%4 + #0) = vtmp.new }\n\t"
         : : "r"(predslo), "r"(vtcm.vscatter32), "r"(region_len),
           "r"(offsetslo), "r"(vgatherlo), "r"(init)
         : "r1", "q0", "m0", "v0", "memory");
    asm ("v0.h = vsplat(%5)\n\t"
         "vmem(%4 + #0) = v0\n\t" /* initialize the write area */
         "r1 = #-1\n\t"
         "v0 = vmem(%0 + #0)\n\t"
         "q0 = vand(v0, r1)\n\t"
         "m0 = %2\n\t"
         "v0 = vmem(%3 + #0)\n\t"
         "{ if (q0) vtmp.w = vgather(%1, m0, v0.w).w\n\t"
         "  vmem(%4 + #0) = vtmp.new }\n\t"
         : : "r"(predshi), "r"(vtcm.vscatter32), "r"(region_len),
           "r"(offsetshi), "r"(vgatherhi), "r"(init)
         : "r1", "q0", "m0", "v0", "memory");

    sync_gather(vgatherlo);
    sync_gather(vgatherhi);
}
519 | ||
/* gather the elements from the scatter16_32 buffer using HVX */
void vector_gather_16_32(void)
{
    /* offsets occupy a vector pair (v1:0); result needs a vdeal pass */
    asm ("m0 = %1\n\t"
         "v0 = vmem(%2 + #0)\n\t"
         "v1 = vmem(%2 + #1)\n\t"
         "{ vtmp.h = vgather(%0, m0, v1:0.w).h\n\t"
         "  vmem(%3 + #0) = vtmp.new }\n\t"
         "v0 = vmem(%3 + #0)\n\t"
         "v0.h = vdeal(v0.h)\n\t" /* deal the elements to get the order back */
         "vmem(%3 + #0) = v0\n\t"
         : : "r"(vtcm.vscatter16_32), "r"(region_len),
           "r"(word_offsets), "r"(vtcm.vgather16_32)
         : "m0", "v0", "v1", "memory");

    sync_gather(vtcm.vgather16_32);
}
537 | ||
/* masked gather the elements from the scatter16_32 buffer using HVX */
void vector_gather_16_32_masked(void)
{
    /* pre-fill the destination so unmasked lanes show the init pattern */
    unsigned short init = gather_16_masked_init();

    asm ("v0.h = vsplat(%5)\n\t"
         "vmem(%4 + #0) = v0\n\t" /* initialize the write area */
         "r1 = #-1\n\t"
         "v0 = vmem(%0 + #0)\n\t"
         "v0.h = vshuff(v0.h)\n\t" /* shuffle the predicates */
         "q0 = vand(v0, r1)\n\t"
         "m0 = %2\n\t"
         "v0 = vmem(%3 + #0)\n\t"
         "v1 = vmem(%3 + #1)\n\t"
         "{ if (q0) vtmp.h = vgather(%1, m0, v1:0.w).h\n\t"
         "  vmem(%4 + #0) = vtmp.new }\n\t"
         "v0 = vmem(%4 + #0)\n\t"
         "v0.h = vdeal(v0.h)\n\t" /* deal the elements to get the order back */
         "vmem(%4 + #0) = v0\n\t"
         : : "r"(half_predicates), "r"(vtcm.vscatter16_32), "r"(region_len),
           "r"(word_offsets), "r"(vtcm.vgather16_32), "r"(init)
         : "r1", "q0", "m0", "v0", "v1", "memory");

    sync_gather(vtcm.vgather16_32);
}
563 | ||
564 | static void check_buffer(const char *name, void *c, void *r, size_t size) | |
565 | { | |
566 | char *check = (char *)c; | |
567 | char *ref = (char *)r; | |
568 | for (int i = 0; i < size; i++) { | |
569 | if (check[i] != ref[i]) { | |
570 | printf("ERROR %s [%d]: 0x%x (%c) != 0x%x (%c)\n", name, i, | |
571 | check[i], check[i], ref[i], ref[i]); | |
572 | err++; | |
573 | } | |
574 | } | |
575 | } | |
576 | ||
577 | /* | |
578 | * These scalar functions are the C equivalents of the vector functions that | |
579 | * use HVX | |
580 | */ | |
581 | ||
582 | /* scatter the 16 bit elements using C */ | |
583 | void scalar_scatter_16(unsigned short *vscatter16) | |
584 | { | |
585 | for (int i = 0; i < MATRIX_SIZE; ++i) { | |
586 | vscatter16[half_offsets[i] / 2] = half_values[i]; | |
587 | } | |
588 | } | |
589 | ||
590 | void check_scatter_16() | |
591 | { | |
592 | memset(vscatter16_ref, FILL_CHAR, | |
593 | SCATTER_BUFFER_SIZE * sizeof(unsigned short)); | |
594 | scalar_scatter_16(vscatter16_ref); | |
595 | check_buffer(__func__, vtcm.vscatter16, vscatter16_ref, | |
596 | SCATTER_BUFFER_SIZE * sizeof(unsigned short)); | |
597 | } | |
598 | ||
599 | /* scatter the 16 bit elements using C */ | |
600 | void scalar_scatter_16_acc(unsigned short *vscatter16) | |
601 | { | |
602 | for (int i = 0; i < MATRIX_SIZE; ++i) { | |
603 | vscatter16[half_offsets[i] / 2] += half_values_acc[i]; | |
604 | } | |
605 | } | |
606 | ||
c3679385 | 607 | /* scatter-accumulate the 16 bit elements using C */ |
62e93b08 TS |
608 | void check_scatter_16_acc() |
609 | { | |
610 | memset(vscatter16_ref, FILL_CHAR, | |
611 | SCATTER_BUFFER_SIZE * sizeof(unsigned short)); | |
612 | scalar_scatter_16(vscatter16_ref); | |
613 | scalar_scatter_16_acc(vscatter16_ref); | |
614 | check_buffer(__func__, vtcm.vscatter16, vscatter16_ref, | |
615 | SCATTER_BUFFER_SIZE * sizeof(unsigned short)); | |
616 | } | |
617 | ||
c3679385 | 618 | /* masked scatter the 16 bit elements using C */ |
62e93b08 TS |
619 | void scalar_scatter_16_masked(unsigned short *vscatter16) |
620 | { | |
621 | for (int i = 0; i < MATRIX_SIZE; i++) { | |
622 | if (half_predicates[i]) { | |
623 | vscatter16[half_offsets[i] / 2] = half_values_masked[i]; | |
624 | } | |
625 | } | |
626 | ||
627 | } | |
628 | ||
629 | void check_scatter_16_masked() | |
630 | { | |
631 | memset(vscatter16_ref, FILL_CHAR, | |
632 | SCATTER_BUFFER_SIZE * sizeof(unsigned short)); | |
633 | scalar_scatter_16(vscatter16_ref); | |
634 | scalar_scatter_16_acc(vscatter16_ref); | |
635 | scalar_scatter_16_masked(vscatter16_ref); | |
636 | check_buffer(__func__, vtcm.vscatter16, vscatter16_ref, | |
637 | SCATTER_BUFFER_SIZE * sizeof(unsigned short)); | |
638 | } | |
639 | ||
640 | /* scatter the 32 bit elements using C */ | |
641 | void scalar_scatter_32(unsigned int *vscatter32) | |
642 | { | |
643 | for (int i = 0; i < MATRIX_SIZE; ++i) { | |
644 | vscatter32[word_offsets[i] / 4] = word_values[i]; | |
645 | } | |
646 | } | |
647 | ||
648 | void check_scatter_32() | |
649 | { | |
650 | memset(vscatter32_ref, FILL_CHAR, | |
651 | SCATTER_BUFFER_SIZE * sizeof(unsigned int)); | |
652 | scalar_scatter_32(vscatter32_ref); | |
653 | check_buffer(__func__, vtcm.vscatter32, vscatter32_ref, | |
654 | SCATTER_BUFFER_SIZE * sizeof(unsigned int)); | |
655 | } | |
656 | ||
c3679385 | 657 | /* scatter-accumulate the 32 bit elements using C */ |
62e93b08 TS |
658 | void scalar_scatter_32_acc(unsigned int *vscatter32) |
659 | { | |
660 | for (int i = 0; i < MATRIX_SIZE; ++i) { | |
661 | vscatter32[word_offsets[i] / 4] += word_values_acc[i]; | |
662 | } | |
663 | } | |
664 | ||
665 | void check_scatter_32_acc() | |
666 | { | |
667 | memset(vscatter32_ref, FILL_CHAR, | |
668 | SCATTER_BUFFER_SIZE * sizeof(unsigned int)); | |
669 | scalar_scatter_32(vscatter32_ref); | |
670 | scalar_scatter_32_acc(vscatter32_ref); | |
671 | check_buffer(__func__, vtcm.vscatter32, vscatter32_ref, | |
672 | SCATTER_BUFFER_SIZE * sizeof(unsigned int)); | |
673 | } | |
674 | ||
c3679385 | 675 | /* masked scatter the 32 bit elements using C */ |
62e93b08 TS |
676 | void scalar_scatter_32_masked(unsigned int *vscatter32) |
677 | { | |
678 | for (int i = 0; i < MATRIX_SIZE; i++) { | |
679 | if (word_predicates[i]) { | |
680 | vscatter32[word_offsets[i] / 4] = word_values_masked[i]; | |
681 | } | |
682 | } | |
683 | } | |
684 | ||
685 | void check_scatter_32_masked() | |
686 | { | |
687 | memset(vscatter32_ref, FILL_CHAR, | |
688 | SCATTER_BUFFER_SIZE * sizeof(unsigned int)); | |
689 | scalar_scatter_32(vscatter32_ref); | |
690 | scalar_scatter_32_acc(vscatter32_ref); | |
691 | scalar_scatter_32_masked(vscatter32_ref); | |
692 | check_buffer(__func__, vtcm.vscatter32, vscatter32_ref, | |
693 | SCATTER_BUFFER_SIZE * sizeof(unsigned int)); | |
694 | } | |
695 | ||
c3679385 | 696 | /* scatter the 16 bit elements with 32 bit offsets using C */ |
62e93b08 TS |
697 | void scalar_scatter_16_32(unsigned short *vscatter16_32) |
698 | { | |
699 | for (int i = 0; i < MATRIX_SIZE; ++i) { | |
700 | vscatter16_32[word_offsets[i] / 2] = half_values[i]; | |
701 | } | |
702 | } | |
703 | ||
704 | void check_scatter_16_32() | |
705 | { | |
706 | memset(vscatter16_32_ref, FILL_CHAR, | |
707 | SCATTER_BUFFER_SIZE * sizeof(unsigned short)); | |
708 | scalar_scatter_16_32(vscatter16_32_ref); | |
709 | check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref, | |
710 | SCATTER_BUFFER_SIZE * sizeof(unsigned short)); | |
711 | } | |
712 | ||
c3679385 | 713 | /* scatter-accumulate the 16 bit elements with 32 bit offsets using C */ |
62e93b08 TS |
714 | void scalar_scatter_16_32_acc(unsigned short *vscatter16_32) |
715 | { | |
716 | for (int i = 0; i < MATRIX_SIZE; ++i) { | |
717 | vscatter16_32[word_offsets[i] / 2] += half_values_acc[i]; | |
718 | } | |
719 | } | |
720 | ||
721 | void check_scatter_16_32_acc() | |
722 | { | |
723 | memset(vscatter16_32_ref, FILL_CHAR, | |
724 | SCATTER_BUFFER_SIZE * sizeof(unsigned short)); | |
725 | scalar_scatter_16_32(vscatter16_32_ref); | |
726 | scalar_scatter_16_32_acc(vscatter16_32_ref); | |
727 | check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref, | |
728 | SCATTER_BUFFER_SIZE * sizeof(unsigned short)); | |
729 | } | |
730 | ||
c3679385 | 731 | /* masked scatter the 16 bit elements with 32 bit offsets using C */ |
62e93b08 TS |
732 | void scalar_scatter_16_32_masked(unsigned short *vscatter16_32) |
733 | { | |
734 | for (int i = 0; i < MATRIX_SIZE; i++) { | |
735 | if (half_predicates[i]) { | |
736 | vscatter16_32[word_offsets[i] / 2] = half_values_masked[i]; | |
737 | } | |
738 | } | |
739 | } | |
740 | ||
741 | void check_scatter_16_32_masked() | |
742 | { | |
743 | memset(vscatter16_32_ref, FILL_CHAR, | |
744 | SCATTER_BUFFER_SIZE * sizeof(unsigned short)); | |
745 | scalar_scatter_16_32(vscatter16_32_ref); | |
746 | scalar_scatter_16_32_acc(vscatter16_32_ref); | |
747 | scalar_scatter_16_32_masked(vscatter16_32_ref); | |
748 | check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref, | |
749 | SCATTER_BUFFER_SIZE * sizeof(unsigned short)); | |
750 | } | |
751 | ||
752 | /* gather the elements from the scatter buffer using C */ | |
753 | void scalar_gather_16(unsigned short *vgather16) | |
754 | { | |
755 | for (int i = 0; i < MATRIX_SIZE; ++i) { | |
756 | vgather16[i] = vtcm.vscatter16[half_offsets[i] / 2]; | |
757 | } | |
758 | } | |
759 | ||
760 | void check_gather_16() | |
761 | { | |
762 | memset(vgather16_ref, 0, MATRIX_SIZE * sizeof(unsigned short)); | |
763 | scalar_gather_16(vgather16_ref); | |
764 | check_buffer(__func__, vtcm.vgather16, vgather16_ref, | |
765 | MATRIX_SIZE * sizeof(unsigned short)); | |
766 | } | |
767 | ||
c3679385 | 768 | /* masked gather the elements from the scatter buffer using C */ |
62e93b08 TS |
769 | void scalar_gather_16_masked(unsigned short *vgather16) |
770 | { | |
771 | for (int i = 0; i < MATRIX_SIZE; ++i) { | |
772 | if (half_predicates[i]) { | |
773 | vgather16[i] = vtcm.vscatter16[half_offsets[i] / 2]; | |
774 | } | |
775 | } | |
776 | } | |
777 | ||
778 | void check_gather_16_masked() | |
779 | { | |
780 | memset(vgather16_ref, gather_16_masked_init(), | |
781 | MATRIX_SIZE * sizeof(unsigned short)); | |
782 | scalar_gather_16_masked(vgather16_ref); | |
783 | check_buffer(__func__, vtcm.vgather16, vgather16_ref, | |
784 | MATRIX_SIZE * sizeof(unsigned short)); | |
785 | } | |
786 | ||
c3679385 | 787 | /* gather the elements from the scatter32 buffer using C */ |
62e93b08 TS |
788 | void scalar_gather_32(unsigned int *vgather32) |
789 | { | |
790 | for (int i = 0; i < MATRIX_SIZE; ++i) { | |
791 | vgather32[i] = vtcm.vscatter32[word_offsets[i] / 4]; | |
792 | } | |
793 | } | |
794 | ||
795 | void check_gather_32(void) | |
796 | { | |
797 | memset(vgather32_ref, 0, MATRIX_SIZE * sizeof(unsigned int)); | |
798 | scalar_gather_32(vgather32_ref); | |
799 | check_buffer(__func__, vtcm.vgather32, vgather32_ref, | |
800 | MATRIX_SIZE * sizeof(unsigned int)); | |
801 | } | |
802 | ||
c3679385 | 803 | /* masked gather the elements from the scatter32 buffer using C */ |
62e93b08 TS |
804 | void scalar_gather_32_masked(unsigned int *vgather32) |
805 | { | |
806 | for (int i = 0; i < MATRIX_SIZE; ++i) { | |
807 | if (word_predicates[i]) { | |
808 | vgather32[i] = vtcm.vscatter32[word_offsets[i] / 4]; | |
809 | } | |
810 | } | |
811 | } | |
812 | ||
62e93b08 TS |
813 | void check_gather_32_masked(void) |
814 | { | |
815 | memset(vgather32_ref, gather_32_masked_init(), | |
816 | MATRIX_SIZE * sizeof(unsigned int)); | |
817 | scalar_gather_32_masked(vgather32_ref); | |
818 | check_buffer(__func__, vtcm.vgather32, | |
819 | vgather32_ref, MATRIX_SIZE * sizeof(unsigned int)); | |
820 | } | |
821 | ||
c3679385 | 822 | /* gather the elements from the scatter16_32 buffer using C */ |
62e93b08 TS |
823 | void scalar_gather_16_32(unsigned short *vgather16_32) |
824 | { | |
825 | for (int i = 0; i < MATRIX_SIZE; ++i) { | |
826 | vgather16_32[i] = vtcm.vscatter16_32[word_offsets[i] / 2]; | |
827 | } | |
828 | } | |
829 | ||
830 | void check_gather_16_32(void) | |
831 | { | |
832 | memset(vgather16_32_ref, 0, MATRIX_SIZE * sizeof(unsigned short)); | |
833 | scalar_gather_16_32(vgather16_32_ref); | |
834 | check_buffer(__func__, vtcm.vgather16_32, vgather16_32_ref, | |
835 | MATRIX_SIZE * sizeof(unsigned short)); | |
836 | } | |
837 | ||
c3679385 | 838 | /* masked gather the elements from the scatter16_32 buffer using C */ |
62e93b08 TS |
839 | void scalar_gather_16_32_masked(unsigned short *vgather16_32) |
840 | { | |
841 | for (int i = 0; i < MATRIX_SIZE; ++i) { | |
842 | if (half_predicates[i]) { | |
843 | vgather16_32[i] = vtcm.vscatter16_32[word_offsets[i] / 2]; | |
844 | } | |
845 | } | |
846 | ||
847 | } | |
848 | ||
849 | void check_gather_16_32_masked(void) | |
850 | { | |
851 | memset(vgather16_32_ref, gather_16_masked_init(), | |
852 | MATRIX_SIZE * sizeof(unsigned short)); | |
853 | scalar_gather_16_32_masked(vgather16_32_ref); | |
854 | check_buffer(__func__, vtcm.vgather16_32, vgather16_32_ref, | |
855 | MATRIX_SIZE * sizeof(unsigned short)); | |
856 | } | |
857 | ||
858 | /* print scatter16 buffer */ | |
859 | void print_scatter16_buffer(void) | |
860 | { | |
861 | if (PRINT_DATA) { | |
862 | printf("\n\nPrinting the 16 bit scatter buffer"); | |
863 | ||
864 | for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) { | |
865 | if ((i % MATRIX_SIZE) == 0) { | |
866 | printf("\n"); | |
867 | } | |
868 | for (int j = 0; j < 2; j++) { | |
869 | printf("%c", (char)((vtcm.vscatter16[i] >> j * 8) & 0xff)); | |
870 | } | |
871 | printf(" "); | |
872 | } | |
873 | printf("\n"); | |
874 | } | |
875 | } | |
876 | ||
877 | /* print the gather 16 buffer */ | |
878 | void print_gather_result_16(void) | |
879 | { | |
880 | if (PRINT_DATA) { | |
881 | printf("\n\nPrinting the 16 bit gather result\n"); | |
882 | ||
883 | for (int i = 0; i < MATRIX_SIZE; i++) { | |
884 | for (int j = 0; j < 2; j++) { | |
885 | printf("%c", (char)((vtcm.vgather16[i] >> j * 8) & 0xff)); | |
886 | } | |
887 | printf(" "); | |
888 | } | |
889 | printf("\n"); | |
890 | } | |
891 | } | |
892 | ||
893 | /* print the scatter32 buffer */ | |
894 | void print_scatter32_buffer(void) | |
895 | { | |
896 | if (PRINT_DATA) { | |
897 | printf("\n\nPrinting the 32 bit scatter buffer"); | |
898 | ||
899 | for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) { | |
900 | if ((i % MATRIX_SIZE) == 0) { | |
901 | printf("\n"); | |
902 | } | |
903 | for (int j = 0; j < 4; j++) { | |
904 | printf("%c", (char)((vtcm.vscatter32[i] >> j * 8) & 0xff)); | |
905 | } | |
906 | printf(" "); | |
907 | } | |
908 | printf("\n"); | |
909 | } | |
910 | } | |
911 | ||
912 | /* print the gather 32 buffer */ | |
913 | void print_gather_result_32(void) | |
914 | { | |
915 | if (PRINT_DATA) { | |
916 | printf("\n\nPrinting the 32 bit gather result\n"); | |
917 | ||
918 | for (int i = 0; i < MATRIX_SIZE; i++) { | |
919 | for (int j = 0; j < 4; j++) { | |
920 | printf("%c", (char)((vtcm.vgather32[i] >> j * 8) & 0xff)); | |
921 | } | |
922 | printf(" "); | |
923 | } | |
924 | printf("\n"); | |
925 | } | |
926 | } | |
927 | ||
928 | /* print the scatter16_32 buffer */ | |
929 | void print_scatter16_32_buffer(void) | |
930 | { | |
931 | if (PRINT_DATA) { | |
932 | printf("\n\nPrinting the 16_32 bit scatter buffer"); | |
933 | ||
934 | for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) { | |
935 | if ((i % MATRIX_SIZE) == 0) { | |
936 | printf("\n"); | |
937 | } | |
938 | for (int j = 0; j < 2; j++) { | |
939 | printf("%c", | |
940 | (unsigned char)((vtcm.vscatter16_32[i] >> j * 8) & 0xff)); | |
941 | } | |
942 | printf(" "); | |
943 | } | |
944 | printf("\n"); | |
945 | } | |
946 | } | |
947 | ||
948 | /* print the gather 16_32 buffer */ | |
949 | void print_gather_result_16_32(void) | |
950 | { | |
951 | if (PRINT_DATA) { | |
952 | printf("\n\nPrinting the 16_32 bit gather result\n"); | |
953 | ||
954 | for (int i = 0; i < MATRIX_SIZE; i++) { | |
955 | for (int j = 0; j < 2; j++) { | |
956 | printf("%c", | |
957 | (unsigned char)((vtcm.vgather16_32[i] >> j * 8) & 0xff)); | |
958 | } | |
959 | printf(" "); | |
960 | } | |
961 | printf("\n"); | |
962 | } | |
963 | } | |
964 | ||
965 | int main() | |
966 | { | |
967 | prefill_vtcm_scratch(); | |
968 | ||
969 | /* 16 bit elements with 16 bit offsets */ | |
970 | create_offsets_values_preds_16(); | |
971 | ||
972 | vector_scatter_16(); | |
973 | print_scatter16_buffer(); | |
974 | check_scatter_16(); | |
975 | ||
976 | vector_gather_16(); | |
977 | print_gather_result_16(); | |
978 | check_gather_16(); | |
979 | ||
980 | vector_gather_16_masked(); | |
981 | print_gather_result_16(); | |
982 | check_gather_16_masked(); | |
983 | ||
984 | vector_scatter_16_acc(); | |
985 | print_scatter16_buffer(); | |
986 | check_scatter_16_acc(); | |
987 | ||
988 | vector_scatter_16_masked(); | |
989 | print_scatter16_buffer(); | |
990 | check_scatter_16_masked(); | |
991 | ||
992 | /* 32 bit elements with 32 bit offsets */ | |
993 | create_offsets_values_preds_32(); | |
994 | ||
995 | vector_scatter_32(); | |
996 | print_scatter32_buffer(); | |
997 | check_scatter_32(); | |
998 | ||
999 | vector_gather_32(); | |
1000 | print_gather_result_32(); | |
1001 | check_gather_32(); | |
1002 | ||
1003 | vector_gather_32_masked(); | |
1004 | print_gather_result_32(); | |
1005 | check_gather_32_masked(); | |
1006 | ||
1007 | vector_scatter_32_acc(); | |
1008 | print_scatter32_buffer(); | |
1009 | check_scatter_32_acc(); | |
1010 | ||
1011 | vector_scatter_32_masked(); | |
1012 | print_scatter32_buffer(); | |
1013 | check_scatter_32_masked(); | |
1014 | ||
1015 | /* 16 bit elements with 32 bit offsets */ | |
1016 | create_offsets_values_preds_16_32(); | |
1017 | ||
1018 | vector_scatter_16_32(); | |
1019 | print_scatter16_32_buffer(); | |
1020 | check_scatter_16_32(); | |
1021 | ||
1022 | vector_gather_16_32(); | |
1023 | print_gather_result_16_32(); | |
1024 | check_gather_16_32(); | |
1025 | ||
1026 | vector_gather_16_32_masked(); | |
1027 | print_gather_result_16_32(); | |
1028 | check_gather_16_32_masked(); | |
1029 | ||
1030 | vector_scatter_16_32_acc(); | |
1031 | print_scatter16_32_buffer(); | |
1032 | check_scatter_16_32_acc(); | |
1033 | ||
1034 | vector_scatter_16_32_masked(); | |
1035 | print_scatter16_32_buffer(); | |
1036 | check_scatter_16_32_masked(); | |
1037 | ||
1038 | puts(err ? "FAIL" : "PASS"); | |
1039 | return err; | |
1040 | } |