2 ;; Copyright (c) 2019, Intel Corporation
4 ;; Redistribution and use in source and binary forms, with or without
5 ;; modification, are permitted provided that the following conditions are met:
7 ;; * Redistributions of source code must retain the above copyright notice,
8 ;; this list of conditions and the following disclaimer.
9 ;; * Redistributions in binary form must reproduce the above copyright
10 ;; notice, this list of conditions and the following disclaimer in the
11 ;; documentation and/or other materials provided with the distribution.
12 ;; * Neither the name of Intel Corporation nor the names of its contributors
13 ;; may be used to endorse or promote products derived from this software
14 ;; without specific prior written permission.
16 ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20 ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 %include "include/os.asm"
29 %include "include/reg_sizes.asm"
;; Constant tables used by the lookup routines.
;; NOTE(review): the labels for these rows are not visible in this view; the
;; names given below are inferred from the [rel ...] operands in the routines
;; and must be confirmed against the full file.
;; Byte values 0..15 (presumably idx_tab8: first 16 indices for the 8-bit lookup)
36 db 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
37 db 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF,
;; Sixteen bytes of 0x10 (presumably add_16: step added to the byte indices)
41 db 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
42 db 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10
;; Word values 0..7 (presumably idx_tab16: first 8 indices for the 16-bit lookup)
46 dw 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
;; Eight words of 8 (presumably add_8: step added to the word indices)
50 dw 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8
;; Byte pair 0x00,0x01 repeated eight times (presumably bcast_mask). Used as a
;; pshufb control, this pattern copies source bytes 0 and 1 into every 16-bit lane.
69 db 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
70 db 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01
;; XMM register aliases shared by the lookup routines.
;; bcast_idx : the index to look up, replicated across the register
84 %define bcast_idx xmm0
;; accum_val : accumulator, zeroed at the start of every routine
86 %define accum_val xmm2
97 ; uint8_t lookup_8bit_sse(const void *table, const uint32_t idx, const uint32_t size);
98 ; arg 1 : pointer to table to look up
99 ; arg 2 : index to look up
100 ; arg 3 : size of table to look up (multiple of 16 bytes)
;
; SSE lookup of one 8-bit table entry. Instead of indexing memory with idx,
; the routine compares idx against a vector of candidate indices and loads
; the table 16 bytes at a time.
; The table is loaded with movdqa, so table + offset must be 16-byte aligned.
; NOTE(review): the entry label, loop control and several instructions are
; not visible in this view; the comments describe only the visible lines.
101 MKGLOBAL(lookup_8bit_sse,function,internal)
104 ;; Number of loop iters = table size / 16 (number of 8-bit values in XMM)
110 ;; Broadcast idx to look up
111 movd bcast_idx, DWORD(idx)
113 pxor accum_val, accum_val
;; xtmp is the shuffle control; an all-zero control makes pshufb copy byte 0
;; to all 16 bytes (the setup of xtmp is not visible here - confirm)
114 pshufb bcast_idx, xtmp
;; xadd = per-iteration index step, xindices = first 16 candidate indices
116 movdqa xadd, [rel add_16]
117 movdqa xindices, [rel idx_tab8]
;; Work on a copy because pcmpeqb overwrites its destination
120 movdqa xtmp, xindices
122 ;; Compare indices with idx
123 ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
124 pcmpeqb xtmp, bcast_idx
126 ;; Load next 16 values
127 movdqa xtmp2, [table + offset]
129 ;; This generates data with all 0s except the value we are looking for in the index to look up
134 ;; Get next 16 indices
142 ;; Extract value from XMM register
;; Each step shifts a copy of accum_val left by half the previous distance.
;; Presumably each copy is merged back into accum_val (the merge instructions
;; are not visible here) so that the one non-zero byte ends up in byte 15.
143 movdqa xtmp, accum_val
144 pslldq xtmp, 8 ; shift left by 64 bits
147 movdqa xtmp, accum_val
148 pslldq xtmp, 4 ; shift left by 32 bits
151 movdqa xtmp, accum_val
152 pslldq xtmp, 2 ; shift left by 16 bits
155 movdqa xtmp, accum_val
156 pslldq xtmp, 1 ; shift left by 8 bits
;; Return byte 15 of the accumulator, zero-extended into rax
159 pextrb rax, accum_val, 15
164 ; uint8_t lookup_8bit_avx(const void *table, const uint32_t idx, const uint32_t size);
165 ; arg 1 : pointer to table to look up
166 ; arg 2 : index to look up
167 ; arg 3 : size of table to look up (multiple of 16 bytes)
;
; AVX (VEX-encoded, 128-bit) lookup of one 8-bit table entry. idx is compared
; against a vector of candidate indices and the table is loaded 16 bytes at a
; time instead of being indexed with idx.
; The table is loaded with vmovdqa, so table + offset must be 16-byte aligned.
; NOTE(review): the entry label, loop control and several instructions are
; not visible in this view; the comments describe only the visible lines.
168 MKGLOBAL(lookup_8bit_avx,function,internal)
170 ;; Number of loop iters = table size / 16 (number of 8-bit values in XMM)
176 ;; Broadcast idx to look up
177 vmovd bcast_idx, DWORD(idx)
179 vpxor accum_val, accum_val
;; xtmp is the shuffle control; an all-zero control makes vpshufb copy byte 0
;; to all 16 bytes (the setup of xtmp is not visible here - confirm)
180 vpshufb bcast_idx, xtmp
;; xadd = per-iteration index step, xindices = first 16 candidate indices
182 vmovdqa xadd, [rel add_16]
183 vmovdqa xindices, [rel idx_tab8]
186 ;; Compare indices with idx
187 ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
188 vpcmpeqb xtmp, xindices, bcast_idx
190 ;; Load next 16 values
191 vmovdqa xtmp2, [table + offset]
193 ;; This generates data with all 0s except the value we are looking for in the index to look up
;; (the instruction that applies the mask to xtmp2 is not visible in this view)
;; Fold the data into the accumulator
196 vpor accum_val, xtmp2
198 ;; Get next 16 indices
199 vpaddb xindices, xadd
206 ;; Extract value from XMM register
;; Each step writes a left-shifted copy of accum_val into xtmp, halving the
;; distance each time. Presumably each copy is merged back into accum_val
;; (the merge instructions are not visible here) so that the one non-zero
;; byte ends up in byte 15.
207 vpslldq xtmp, accum_val, 8 ; shift left by 64 bits
210 vpslldq xtmp, accum_val, 4 ; shift left by 32 bits
213 vpslldq xtmp, accum_val, 2 ; shift left by 16 bits
216 vpslldq xtmp, accum_val, 1 ; shift left by 8 bits
;; Return byte 15 of the accumulator, zero-extended into rax
219 vpextrb rax, accum_val, 15
225 ; uint16_t lookup_16bit_sse(const void *table, const uint32_t idx, const uint32_t size);
226 ; arg 1 : pointer to table to look up
227 ; arg 2 : index to look up
228 ; arg 3 : size of table to look up
;
; SSE lookup of one 16-bit table entry. idx is compared against a vector of
; candidate indices and the table is loaded 8 words at a time instead of being
; indexed with idx.
; The table is loaded with movdqa, so table + offset must be 16-byte aligned.
; NOTE(review): the entry label, loop control and several instructions are
; not visible in this view; the comments describe only the visible lines.
229 MKGLOBAL(lookup_16bit_sse,function,internal)
232 ;; Number of loop iters = table size / 8 (number of 16-bit values in XMM)
238 ;; Broadcast idx to look up
239 movd bcast_idx, DWORD(idx)
;; xtmp = pshufb control that replicates idx into every 16-bit lane
240 movdqa xtmp, [rel bcast_mask]
241 pxor accum_val, accum_val
242 pshufb bcast_idx, xtmp
;; xadd = per-iteration index step, xindices = first 8 candidate indices
244 movdqa xadd, [rel add_8]
245 movdqa xindices, [rel idx_tab16]
;; Work on a copy because pcmpeqw overwrites its destination
249 movdqa xtmp, xindices
251 ;; Compare indices with idx
252 ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
253 pcmpeqw xtmp, bcast_idx
255 ;; Load next 8 values
256 movdqa xtmp2, [table + offset]
258 ;; This generates data with all 0s except the value we are looking for in the index to look up
263 ;; Get next 8 indices
270 ;; Extract value from XMM register
;; Each step shifts a copy of accum_val left by half the previous distance.
;; Presumably each copy is merged back into accum_val (the merge instructions
;; are not visible here) so that the one non-zero word ends up in word 7.
271 movdqa xtmp, accum_val
272 pslldq xtmp, 8 ; shift left by 64 bits
275 movdqa xtmp, accum_val
276 pslldq xtmp, 4 ; shift left by 32 bits
279 movdqa xtmp, accum_val
280 pslldq xtmp, 2 ; shift left by 16 bits
;; Return word 7 of the accumulator, zero-extended into rax
283 pextrw rax, accum_val, 7
288 ; uint16_t lookup_16bit_avx(const void *table, const uint32_t idx, const uint32_t size);
289 ; arg 1 : pointer to table to look up
290 ; arg 2 : index to look up
291 ; arg 3 : size of table to look up
;
; AVX (VEX-encoded, 128-bit) lookup of one 16-bit table entry. idx is compared
; against a vector of candidate indices and the table is loaded 8 words at a
; time instead of being indexed with idx.
; The table is loaded with vmovdqa, so table + offset must be 16-byte aligned.
; NOTE(review): the entry label, loop control and several instructions are
; not visible in this view; the comments describe only the visible lines.
292 MKGLOBAL(lookup_16bit_avx,function,internal)
295 ;; Number of loop iters = table size / 8 (number of 16-bit values in XMM)
301 ;; Broadcast idx to look up
302 vmovd bcast_idx, DWORD(idx)
;; xtmp = vpshufb control that replicates idx into every 16-bit lane
303 vmovdqa xtmp, [rel bcast_mask]
304 vpxor accum_val, accum_val
305 vpshufb bcast_idx, xtmp
;; xadd = per-iteration index step, xindices = first 8 candidate indices
307 vmovdqa xadd, [rel add_8]
308 vmovdqa xindices, [rel idx_tab16]
312 ;; Compare indices with idx
313 ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
314 vpcmpeqw xtmp, xindices, bcast_idx
316 ;; Load next 8 values
317 vmovdqa xtmp2, [table + offset]
319 ;; This generates data with all 0s except the value we are looking for in the index to look up
;; (the instruction that applies the mask to xtmp2 is not visible in this view)
;; Fold the data into the accumulator
322 vpor accum_val, xtmp2
324 ;; Get next 8 indices
325 vpaddw xindices, xadd
331 ;; Extract value from XMM register
;; Each step writes a left-shifted copy of accum_val into xtmp, halving the
;; distance each time. Presumably each copy is merged back into accum_val
;; (the merge instructions are not visible here) so that the one non-zero
;; word ends up in word 7.
332 vpslldq xtmp, accum_val, 8 ; shift left by 64 bits
335 vpslldq xtmp, accum_val, 4 ; shift left by 32 bits
338 vpslldq xtmp, accum_val, 2 ; shift left by 16 bits
;; Return word 7 of the accumulator, zero-extended into rax
341 vpextrw rax, accum_val, 7
346 ; uint32_t lookup_32bit_sse(const void *table, const uint32_t idx, const uint32_t size);
347 ; arg 1 : pointer to table to look up
348 ; arg 2 : index to look up
349 ; arg 3 : size of table to look up
;
; SSE lookup of one 32-bit table entry. idx is compared against a vector of
; candidate indices and the table is loaded 4 dwords at a time instead of
; being indexed with idx.
; The table is loaded with movdqa, so table + offset must be 16-byte aligned.
; NOTE(review): the entry label, loop control, the final move of the result
; into the return register and several other instructions are not visible in
; this view; the comments describe only the visible lines.
350 MKGLOBAL(lookup_32bit_sse,function,internal)
353 ;; Number of loop iters = table size / 4 (number of 32-bit values in XMM)
359 ;; Broadcast idx to look up
360 movd bcast_idx, DWORD(idx)
361 pxor accum_val, accum_val
;; Shuffle selector 0 copies dword 0 into all four dword lanes
362 pshufd bcast_idx, bcast_idx, 0
;; xadd = per-iteration index step, xindices = first 4 candidate indices
364 movdqa xadd, [rel add_4]
365 movdqa xindices, [rel idx_tab32]
;; Work on a copy because pcmpeqd overwrites its destination
368 movdqa xtmp, xindices
370 ;; Compare indices with idx
371 ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
372 pcmpeqd xtmp, bcast_idx
374 ;; Load next 4 values
375 movdqa xtmp2, [table + offset]
377 ;; This generates data with all 0s except the value we are looking for in the index to look up
382 ;; Get next 4 indices
389 ;; Extract value from XMM register
;; Each step shifts a copy of accum_val right by half the previous distance.
;; Presumably each copy is merged back into accum_val (the merge instructions
;; are not visible here) so that the one non-zero dword ends up in dword 0.
390 movdqa xtmp, accum_val
391 psrldq xtmp, 8 ; shift right by 64 bits
394 movdqa xtmp, accum_val
395 psrldq xtmp, 4 ; shift right by 32 bits
404 ; uint32_t lookup_32bit_avx(const void *table, const uint32_t idx, const uint32_t size);
405 ; arg 1 : pointer to table to look up
406 ; arg 2 : index to look up
407 ; arg 3 : size of table to look up
;
; AVX (VEX-encoded, 128-bit) lookup of one 32-bit table entry. idx is compared
; against a vector of candidate indices and the table is loaded 4 dwords at a
; time instead of being indexed with idx.
; The table is loaded with vmovdqa, so table + offset must be 16-byte aligned.
; NOTE(review): the entry label, loop control, the final move of the result
; into the return register and several other instructions are not visible in
; this view; the comments describe only the visible lines.
408 MKGLOBAL(lookup_32bit_avx,function,internal)
410 ;; Number of loop iters = table size / 4 (number of 32-bit values in XMM)
416 ;; Broadcast idx to look up
417 vmovd bcast_idx, DWORD(idx)
418 vpxor accum_val, accum_val
;; Shuffle selector 0 copies dword 0 into all four dword lanes
419 vpshufd bcast_idx, bcast_idx, 0
;; xadd = per-iteration index step, xindices = first 4 candidate indices
421 vmovdqa xadd, [rel add_4]
422 vmovdqa xindices, [rel idx_tab32]
425 ;; Compare indices with idx
426 ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
427 vpcmpeqd xtmp, xindices, bcast_idx
429 ;; Load next 4 values
430 vmovdqa xtmp2, [table + offset]
432 ;; This generates data with all 0s except the value we are looking for in the index to look up
;; (the instruction that applies the mask to xtmp2 is not visible in this view)
;; Fold the data into the accumulator
435 vpor accum_val, xtmp2
437 ;; Get next 4 indices
438 vpaddd xindices, xadd
444 ;; Extract value from XMM register
;; Each step writes a right-shifted copy of accum_val into xtmp, halving the
;; distance each time. Presumably each copy is merged back into accum_val
;; (the merge instructions are not visible here) so that the one non-zero
;; dword ends up in dword 0.
445 vpsrldq xtmp, accum_val, 8 ; shift right by 64 bits
448 vpsrldq xtmp, accum_val, 4 ; shift right by 32 bits
457 ; uint64_t lookup_64bit_sse(const void *table, const uint32_t idx, const uint32_t size);
458 ; arg 1 : pointer to table to look up
459 ; arg 2 : index to look up
460 ; arg 3 : size of table to look up
;
; SSE lookup of one 64-bit table entry. idx is compared against a vector of
; candidate indices and the table is loaded 2 qwords at a time instead of
; being indexed with idx.
; The table is loaded with movdqa, so table + offset must be 16-byte aligned.
; NOTE(review): the entry label, loop control, the final move of the result
; into the return register and several other instructions are not visible in
; this view; the comments describe only the visible lines.
461 MKGLOBAL(lookup_64bit_sse,function,internal)
463 ;; Number of loop iters = table size / 2 (number of 64-bit values in XMM)
469 ;; Broadcast idx to look up
471 pxor accum_val, accum_val
;; Place idx in the upper qword (the write of the lower qword is not visible here).
;; NOTE(review): pinsrq takes a 64-bit GPR, yet idx is declared uint32_t;
;; confirm the upper 32 bits of that register are zero before the 64-bit compare.
472 pinsrq bcast_idx, idx, 1
;; xadd = per-iteration index step, xindices = first 2 candidate indices
474 movdqa xadd, [rel add_2]
475 movdqa xindices, [rel idx_tab64]
;; Work on a copy because pcmpeqq overwrites its destination
478 movdqa xtmp, xindices
480 ;; Compare indices with idx
481 ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
482 pcmpeqq xtmp, bcast_idx
484 ;; Load next 2 values
485 movdqa xtmp2, [table + offset]
487 ;; This generates data with all 0s except the value we are looking for in the index to look up
492 ;; Get next 2 indices
499 ;; Extract value from XMM register
;; Presumably the right-shifted copy is merged back into accum_val (the merge
;; instruction is not visible here) so that the one non-zero qword ends up in qword 0.
500 movdqa xtmp, accum_val
501 psrldq xtmp, 8 ; shift right by 64 bits
510 ; uint64_t lookup_64bit_avx(const void *table, const uint32_t idx, const uint32_t size);
511 ; arg 1 : pointer to table to look up
512 ; arg 2 : index to look up
513 ; arg 3 : size of table to look up
;
; AVX (VEX-encoded, 128-bit) lookup of one 64-bit table entry. idx is compared
; against a vector of candidate indices and the table is loaded 2 qwords at a
; time instead of being indexed with idx.
; The table is loaded with vmovdqa, so table + offset must be 16-byte aligned.
; NOTE(review): the entry label, loop control, the final move of the result
; into the return register and several other instructions are not visible in
; this view; the comments describe only the visible lines.
514 MKGLOBAL(lookup_64bit_avx,function,internal)
516 ;; Number of loop iters = table size / 2 (number of 64-bit values in XMM)
523 vpxor accum_val, accum_val
;; Place idx in the upper qword (the write of the lower qword is not visible here).
;; NOTE(review): vpinsrq takes a 64-bit GPR, yet idx is declared uint32_t;
;; confirm the upper 32 bits of that register are zero before the 64-bit compare.
524 vpinsrq bcast_idx, idx, 1
;; xadd = per-iteration index step, xindices = first 2 candidate indices
526 vmovdqa xadd, [rel add_2]
527 vmovdqa xindices, [rel idx_tab64]
530 ;; Compare indices with idx
531 ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
532 vpcmpeqq xtmp, xindices, bcast_idx
534 ;; Load next 2 values
535 vmovdqa xtmp2, [table + offset]
537 ;; This generates data with all 0s except the value we are looking for in the index to look up
;; (the instruction that applies the mask to xtmp2 is not visible in this view)
;; Fold the data into the accumulator
540 vpor accum_val, xtmp2
542 ;; Get next 2 indices
543 vpaddq xindices, xadd
549 ;; Extract value from XMM register
;; Presumably the right-shifted copy in xtmp is merged back into accum_val (the
;; merge instruction is not visible here) so that the one non-zero qword ends up in qword 0.
550 vpsrldq xtmp, accum_val, 8 ; shift right by 64 bits
;; Empty .note.GNU-stack section: tells GNU toolchains that this object does
;; not require an executable stack.
560 section .note.GNU-stack noalloc noexec nowrite progbits