]> git.proxmox.com Git - ceph.git/blob - ceph/src/spdk/intel-ipsec-mb/include/constant_lookup.asm
update source to Ceph Pacific 16.2.2
[ceph.git] / ceph / src / spdk / intel-ipsec-mb / include / constant_lookup.asm
1 ;;
2 ;; Copyright (c) 2019, Intel Corporation
3 ;;
4 ;; Redistribution and use in source and binary forms, with or without
5 ;; modification, are permitted provided that the following conditions are met:
6 ;;
7 ;; * Redistributions of source code must retain the above copyright notice,
8 ;; this list of conditions and the following disclaimer.
9 ;; * Redistributions in binary form must reproduce the above copyright
10 ;; notice, this list of conditions and the following disclaimer in the
11 ;; documentation and/or other materials provided with the distribution.
12 ;; * Neither the name of Intel Corporation nor the names of its contributors
13 ;; may be used to endorse or promote products derived from this software
14 ;; without specific prior written permission.
15 ;;
16 ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20 ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 ;;
27
28 %include "include/os.asm"
29 %include "include/reg_sizes.asm"
30
section .data
default rel

;; Constant vectors used by the lookup_* routines in this file.
;;  - idx_tabN : lane indices of the first 16-byte row of a table made of
;;               N-bit entries (0..15, 0..7, 0..3 or 0..1)
;;  - add_M    : M (the number of lanes in a row) in every lane; added to the
;;               index vector after each row so it tracks the row being scanned
;;  - bcast_mask : pshufb control selecting source bytes 0,1 for every word lane
;; All of them are loaded with movdqa/vmovdqa, so each one is 16-byte aligned.

align 16
idx_tab8:
        db      0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
        db      0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF

align 16
add_16:
        db      0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10
        db      0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10

align 16
idx_tab16:
        dw      0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7

align 16
add_8:
        dw      0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8

align 16
idx_tab32:
        dd      0x0, 0x1, 0x2, 0x3

align 16
add_4:
        dd      0x4, 0x4, 0x4, 0x4

align 16
idx_tab64:
        dq      0x0, 0x1

align 16
add_2:
        dq      0x2, 0x2

align 16
bcast_mask:
        db      0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01
        db      0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01
71
section .text

;; Registers carrying the first three integer arguments:
;; rdi/rsi/rdx when LINUX is defined, rcx/rdx/r8 otherwise
%ifndef LINUX
%define arg1            rcx
%define arg2            rdx
%define arg3            r8
%else
%define arg1            rdi
%define arg2            rsi
%define arg3            rdx
%endif

;; Function arguments, named as in the C prototypes
%define table           arg1
%define idx             arg2
%define size            arg3

;; General purpose scratch registers
%define tmp             r9
%define offset          r10

;; XMM register roles shared by all lookup routines
%define bcast_idx       xmm0
%define xadd            xmm1
%define accum_val       xmm2
%define xindices        xmm3
%define xtmp            xmm4
%define xtmp2           xmm5
96
; uint8_t lookup_8bit_sse(const void *table, const uint32_t idx, const uint32_t size);
; arg 1 : pointer to table to look up (read with aligned 16-byte loads)
; arg 2 : index to look up (only its low 8 bits are compared)
; arg 3 : size of table to look up (multiple of 16 bytes)
; Returns the selected byte zero-extended in rax; 0 if size < 16 or if no
; position matches idx. Every 16-byte row is read whatever the value of idx.
; Clobbers: r10, the size register, xmm0-xmm5, flags
MKGLOBAL(lookup_8bit_sse,function,internal)
lookup_8bit_sse:
        ;; Give the early-exit path a defined return value
        xor     eax, eax

        ;; Number of loop iters = table size / 16 (number of values in XMM).
        ;; size is a 32-bit argument: operate on the dword register so that
        ;; whatever the caller left in the upper half of the 64-bit register
        ;; cannot affect the trip count.
        shr     DWORD(size), 4
        jz      .exit

        xor     offset, offset

        ;; Broadcast idx to look up
        movd    bcast_idx, DWORD(idx)
        pxor    xtmp, xtmp
        pxor    accum_val, accum_val
        pshufb  bcast_idx, xtmp         ; all-zero control copies byte 0 to every lane

        movdqa  xadd, [rel add_16]
        movdqa  xindices, [rel idx_tab8]

.loop:
        ;; Compare indices with idx
        ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
        movdqa  xtmp, xindices
        pcmpeqb xtmp, bcast_idx

        ;; Load next 16 values
        movdqa  xtmp2, [table + offset]

        ;; Keep only the value at the matching position and accumulate it
        pand    xtmp2, xtmp
        por     accum_val, xtmp2

        ;; Get next 16 indices
        paddb   xindices, xadd

        add     offset, 16
        dec     DWORD(size)
        jnz     .loop

        ;; Fold the accumulator so that byte 15 is the OR of all 16 bytes
        movdqa  xtmp, accum_val
        pslldq  xtmp, 8                 ; shift left by 64 bits
        por     accum_val, xtmp

        movdqa  xtmp, accum_val
        pslldq  xtmp, 4                 ; shift left by 32 bits
        por     accum_val, xtmp

        movdqa  xtmp, accum_val
        pslldq  xtmp, 2                 ; shift left by 16 bits
        por     accum_val, xtmp

        movdqa  xtmp, accum_val
        pslldq  xtmp, 1                 ; shift left by 8 bits
        por     accum_val, xtmp

        pextrb  rax, accum_val, 15

.exit:
        ret
163
; uint8_t lookup_8bit_avx(const void *table, const uint32_t idx, const uint32_t size);
; arg 1 : pointer to table to look up (read with aligned 16-byte loads)
; arg 2 : index to look up (only its low 8 bits are compared)
; arg 3 : size of table to look up (multiple of 16 bytes)
; Returns the selected byte zero-extended in rax; 0 if size < 16 or if no
; position matches idx. Every 16-byte row is read whatever the value of idx.
; Clobbers: r10, the size register, xmm0-xmm5, flags
MKGLOBAL(lookup_8bit_avx,function,internal)
lookup_8bit_avx:
        ;; Give the early-exit path a defined return value
        xor     eax, eax

        ;; Number of loop iters = table size / 16 (number of values in XMM).
        ;; size is a 32-bit argument: operate on the dword register so that
        ;; whatever the caller left in the upper half of the 64-bit register
        ;; cannot affect the trip count.
        shr     DWORD(size), 4
        jz      .exit

        xor     offset, offset

        ;; Broadcast idx to look up
        vmovd   bcast_idx, DWORD(idx)
        vpxor   xtmp, xtmp, xtmp
        vpxor   accum_val, accum_val, accum_val
        vpshufb bcast_idx, bcast_idx, xtmp      ; all-zero control copies byte 0 to every lane

        vmovdqa xadd, [rel add_16]
        vmovdqa xindices, [rel idx_tab8]

.loop:
        ;; Compare indices with idx
        ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
        vpcmpeqb xtmp, xindices, bcast_idx

        ;; Load next 16 values
        vmovdqa xtmp2, [table + offset]

        ;; Keep only the value at the matching position and accumulate it
        vpand   xtmp2, xtmp2, xtmp
        vpor    accum_val, accum_val, xtmp2

        ;; Get next 16 indices
        vpaddb  xindices, xindices, xadd

        add     offset, 16
        dec     DWORD(size)
        jnz     .loop

        ;; Fold the accumulator so that byte 15 is the OR of all 16 bytes
        vpslldq xtmp, accum_val, 8      ; shift left by 64 bits
        vpor    accum_val, accum_val, xtmp

        vpslldq xtmp, accum_val, 4      ; shift left by 32 bits
        vpor    accum_val, accum_val, xtmp

        vpslldq xtmp, accum_val, 2      ; shift left by 16 bits
        vpor    accum_val, accum_val, xtmp

        vpslldq xtmp, accum_val, 1      ; shift left by 8 bits
        vpor    accum_val, accum_val, xtmp

        vpextrb rax, accum_val, 15

.exit:
        ret
224
; uint16_t lookup_16bit_sse(const void *table, const uint32_t idx, const uint32_t size);
; arg 1 : pointer to table to look up (read with aligned 16-byte loads)
; arg 2 : index to look up (only its low 16 bits are compared)
; arg 3 : size of table to look up, in 16-bit entries; entries are scanned in
;         whole groups of 8, a remainder of size % 8 entries is not read
; Returns the selected word zero-extended in rax; 0 if size < 8 or if no
; position matches idx. Every scanned row is read whatever the value of idx.
; Clobbers: r10, the size register, xmm0-xmm5, flags
MKGLOBAL(lookup_16bit_sse,function,internal)
lookup_16bit_sse:
        ;; Give the early-exit path a defined return value
        xor     eax, eax

        ;; Number of loop iters = table size / 8 (number of values in XMM).
        ;; size is a 32-bit argument: operate on the dword register so that
        ;; whatever the caller left in the upper half of the 64-bit register
        ;; cannot affect the trip count.
        shr     DWORD(size), 3
        jz      .exit

        xor     offset, offset

        ;; Broadcast idx to look up
        movd    bcast_idx, DWORD(idx)
        movdqa  xtmp, [rel bcast_mask]
        pxor    accum_val, accum_val
        pshufb  bcast_idx, xtmp         ; control 0,1,0,1,... copies word 0 to every lane

        movdqa  xadd, [rel add_8]
        movdqa  xindices, [rel idx_tab16]

.loop:
        ;; Compare indices with idx
        ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
        movdqa  xtmp, xindices
        pcmpeqw xtmp, bcast_idx

        ;; Load next 8 values
        movdqa  xtmp2, [table + offset]

        ;; Keep only the value at the matching position and accumulate it
        pand    xtmp2, xtmp
        por     accum_val, xtmp2

        ;; Get next 8 indices
        paddw   xindices, xadd

        add     offset, 16
        dec     DWORD(size)
        jnz     .loop

        ;; Fold the accumulator so that word 7 is the OR of all 8 words
        movdqa  xtmp, accum_val
        pslldq  xtmp, 8                 ; shift left by 64 bits
        por     accum_val, xtmp

        movdqa  xtmp, accum_val
        pslldq  xtmp, 4                 ; shift left by 32 bits
        por     accum_val, xtmp

        movdqa  xtmp, accum_val
        pslldq  xtmp, 2                 ; shift left by 16 bits
        por     accum_val, xtmp

        pextrw  rax, accum_val, 7

.exit:
        ret
287
; uint16_t lookup_16bit_avx(const void *table, const uint32_t idx, const uint32_t size);
; arg 1 : pointer to table to look up (read with aligned 16-byte loads)
; arg 2 : index to look up (only its low 16 bits are compared)
; arg 3 : size of table to look up, in 16-bit entries; entries are scanned in
;         whole groups of 8, a remainder of size % 8 entries is not read
; Returns the selected word zero-extended in rax; 0 if size < 8 or if no
; position matches idx. Every scanned row is read whatever the value of idx.
; Clobbers: r10, the size register, xmm0-xmm5, flags
MKGLOBAL(lookup_16bit_avx,function,internal)
lookup_16bit_avx:
        ;; Give the early-exit path a defined return value
        xor     eax, eax

        ;; Number of loop iters = table size / 8 (number of values in XMM).
        ;; size is a 32-bit argument: operate on the dword register so that
        ;; whatever the caller left in the upper half of the 64-bit register
        ;; cannot affect the trip count.
        shr     DWORD(size), 3
        jz      .exit

        xor     offset, offset

        ;; Broadcast idx to look up
        vmovd   bcast_idx, DWORD(idx)
        vmovdqa xtmp, [rel bcast_mask]
        vpxor   accum_val, accum_val, accum_val
        vpshufb bcast_idx, bcast_idx, xtmp      ; control 0,1,0,1,... copies word 0 to every lane

        vmovdqa xadd, [rel add_8]
        vmovdqa xindices, [rel idx_tab16]

.loop:
        ;; Compare indices with idx
        ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
        vpcmpeqw xtmp, xindices, bcast_idx

        ;; Load next 8 values
        vmovdqa xtmp2, [table + offset]

        ;; Keep only the value at the matching position and accumulate it
        vpand   xtmp2, xtmp2, xtmp
        vpor    accum_val, accum_val, xtmp2

        ;; Get next 8 indices
        vpaddw  xindices, xindices, xadd

        add     offset, 16
        dec     DWORD(size)
        jnz     .loop

        ;; Fold the accumulator so that word 7 is the OR of all 8 words
        vpslldq xtmp, accum_val, 8      ; shift left by 64 bits
        vpor    accum_val, accum_val, xtmp

        vpslldq xtmp, accum_val, 4      ; shift left by 32 bits
        vpor    accum_val, accum_val, xtmp

        vpslldq xtmp, accum_val, 2      ; shift left by 16 bits
        vpor    accum_val, accum_val, xtmp

        vpextrw rax, accum_val, 7

.exit:
        ret
345
; uint32_t lookup_32bit_sse(const void *table, const uint32_t idx, const uint32_t size);
; arg 1 : pointer to table to look up (read with aligned 16-byte loads)
; arg 2 : index to look up
; arg 3 : size of table to look up, in 32-bit entries; entries are scanned in
;         whole groups of 4, a remainder of size % 4 entries is not read
; Returns the selected dword in eax; 0 if size < 4 or if no position matches
; idx. Every scanned row is read whatever the value of idx.
; Clobbers: r10, the size register, xmm0-xmm5, flags
MKGLOBAL(lookup_32bit_sse,function,internal)
lookup_32bit_sse:
        ;; Give the early-exit path a defined return value
        xor     eax, eax

        ;; Number of loop iters = table size / 4 (number of values in XMM).
        ;; size is a 32-bit argument: operate on the dword register so that
        ;; whatever the caller left in the upper half of the 64-bit register
        ;; cannot affect the trip count.
        shr     DWORD(size), 2
        jz      .exit

        xor     offset, offset

        ;; Broadcast idx to look up
        movd    bcast_idx, DWORD(idx)
        pxor    accum_val, accum_val
        pshufd  bcast_idx, bcast_idx, 0

        movdqa  xadd, [rel add_4]
        movdqa  xindices, [rel idx_tab32]

.loop:
        ;; Compare indices with idx
        ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
        movdqa  xtmp, xindices
        pcmpeqd xtmp, bcast_idx

        ;; Load next 4 values
        movdqa  xtmp2, [table + offset]

        ;; Keep only the value at the matching position and accumulate it
        pand    xtmp2, xtmp
        por     accum_val, xtmp2

        ;; Get next 4 indices
        paddd   xindices, xadd

        add     offset, 16
        dec     DWORD(size)
        jnz     .loop

        ;; Fold the accumulator so that dword 0 is the OR of all 4 dwords
        movdqa  xtmp, accum_val
        psrldq  xtmp, 8                 ; shift right by 64 bits
        por     accum_val, xtmp

        movdqa  xtmp, accum_val
        psrldq  xtmp, 4                 ; shift right by 32 bits
        por     accum_val, xtmp

        movd    eax, accum_val

.exit:
        ret
402
403
; uint32_t lookup_32bit_avx(const void *table, const uint32_t idx, const uint32_t size);
; arg 1 : pointer to table to look up (read with aligned 16-byte loads)
; arg 2 : index to look up
; arg 3 : size of table to look up, in 32-bit entries; entries are scanned in
;         whole groups of 4, a remainder of size % 4 entries is not read
; Returns the selected dword in eax; 0 if size < 4 or if no position matches
; idx. Every scanned row is read whatever the value of idx.
; Clobbers: r10, the size register, xmm0-xmm5, flags
MKGLOBAL(lookup_32bit_avx,function,internal)
lookup_32bit_avx:
        ;; Give the early-exit path a defined return value
        xor     eax, eax

        ;; Number of loop iters = table size / 4 (number of values in XMM).
        ;; size is a 32-bit argument: operate on the dword register so that
        ;; whatever the caller left in the upper half of the 64-bit register
        ;; cannot affect the trip count.
        shr     DWORD(size), 2
        jz      .exit

        xor     offset, offset

        ;; Broadcast idx to look up
        vmovd   bcast_idx, DWORD(idx)
        vpxor   accum_val, accum_val, accum_val
        vpshufd bcast_idx, bcast_idx, 0

        vmovdqa xadd, [rel add_4]
        vmovdqa xindices, [rel idx_tab32]

.loop:
        ;; Compare indices with idx
        ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
        vpcmpeqd xtmp, xindices, bcast_idx

        ;; Load next 4 values
        vmovdqa xtmp2, [table + offset]

        ;; Keep only the value at the matching position and accumulate it
        vpand   xtmp2, xtmp2, xtmp
        vpor    accum_val, accum_val, xtmp2

        ;; Get next 4 indices
        vpaddd  xindices, xindices, xadd

        add     offset, 16
        dec     DWORD(size)
        jnz     .loop

        ;; Fold the accumulator so that dword 0 is the OR of all 4 dwords
        vpsrldq xtmp, accum_val, 8      ; shift right by 64 bits
        vpor    accum_val, accum_val, xtmp

        vpsrldq xtmp, accum_val, 4      ; shift right by 32 bits
        vpor    accum_val, accum_val, xtmp

        vmovd   eax, accum_val

.exit:
        ret
455
456
; uint64_t lookup_64bit_sse(const void *table, const uint32_t idx, const uint32_t size);
; arg 1 : pointer to table to look up (read with aligned 16-byte loads)
; arg 2 : index to look up
; arg 3 : size of table to look up, in 64-bit entries; entries are scanned in
;         pairs, an odd final entry is not read
; Returns the selected qword in rax; 0 if size < 2 or if no position matches
; idx. Every scanned row is read whatever the value of idx.
; Clobbers: r10, the size register, xmm0-xmm5, flags
MKGLOBAL(lookup_64bit_sse,function,internal)
lookup_64bit_sse:
        ;; Give the early-exit path a defined return value
        xor     eax, eax

        ;; Number of loop iters = table size / 2 (number of values in XMM).
        ;; size is a 32-bit argument: operate on the dword register so that
        ;; whatever the caller left in the upper half of the 64-bit register
        ;; cannot affect the trip count.
        shr     DWORD(size), 1
        jz      .exit

        xor     offset, offset

        ;; Broadcast idx to look up.
        ;; idx is a 32-bit argument: movd takes only its low dword and clears
        ;; the rest of the XMM register, so each qword lane ends up holding
        ;; idx zero-extended to 64 bits.
        movd    bcast_idx, DWORD(idx)
        pxor    accum_val, accum_val
        punpcklqdq bcast_idx, bcast_idx

        movdqa  xadd, [rel add_2]
        movdqa  xindices, [rel idx_tab64]

.loop:
        ;; Compare indices with idx
        ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
        movdqa  xtmp, xindices
        pcmpeqq xtmp, bcast_idx

        ;; Load next 2 values
        movdqa  xtmp2, [table + offset]

        ;; Keep only the value at the matching position and accumulate it
        pand    xtmp2, xtmp
        por     accum_val, xtmp2

        ;; Get next 2 indices
        paddq   xindices, xadd

        add     offset, 16
        dec     DWORD(size)
        jnz     .loop

        ;; Fold the accumulator so that qword 0 is the OR of both qwords
        movdqa  xtmp, accum_val
        psrldq  xtmp, 8                 ; shift right by 64 bits
        por     accum_val, xtmp

        movq    rax, accum_val

.exit:
        ret
508
509
; uint64_t lookup_64bit_avx(const void *table, const uint32_t idx, const uint32_t size);
; arg 1 : pointer to table to look up (read with aligned 16-byte loads)
; arg 2 : index to look up
; arg 3 : size of table to look up, in 64-bit entries; entries are scanned in
;         pairs, an odd final entry is not read
; Returns the selected qword in rax; 0 if size < 2 or if no position matches
; idx. Every scanned row is read whatever the value of idx.
; Clobbers: r10, the size register, xmm0-xmm5, flags
MKGLOBAL(lookup_64bit_avx,function,internal)
lookup_64bit_avx:
        ;; Give the early-exit path a defined return value
        xor     eax, eax

        ;; Number of loop iters = table size / 2 (number of values in XMM).
        ;; size is a 32-bit argument: operate on the dword register so that
        ;; whatever the caller left in the upper half of the 64-bit register
        ;; cannot affect the trip count.
        shr     DWORD(size), 1
        jz      .exit

        xor     offset, offset

        ;; Broadcast idx to look up.
        ;; idx is a 32-bit argument: vmovd takes only its low dword and clears
        ;; the rest of the XMM register, so each qword lane ends up holding
        ;; idx zero-extended to 64 bits.
        vmovd   bcast_idx, DWORD(idx)
        vpxor   accum_val, accum_val, accum_val
        vpunpcklqdq bcast_idx, bcast_idx, bcast_idx

        vmovdqa xadd, [rel add_2]
        vmovdqa xindices, [rel idx_tab64]

.loop:
        ;; Compare indices with idx
        ;; This generates a mask with all 0s except for the position where idx matches (all 1s here)
        vpcmpeqq xtmp, xindices, bcast_idx

        ;; Load next 2 values
        vmovdqa xtmp2, [table + offset]

        ;; Keep only the value at the matching position and accumulate it
        vpand   xtmp2, xtmp2, xtmp
        vpor    accum_val, accum_val, xtmp2

        ;; Get next 2 indices
        vpaddq  xindices, xindices, xadd

        add     offset, 16
        dec     DWORD(size)
        jnz     .loop

        ;; Fold the accumulator so that qword 0 is the OR of both qwords
        vpsrldq xtmp, accum_val, 8      ; shift right by 64 bits
        vpor    accum_val, accum_val, xtmp

        vmovq   rax, accum_val

.exit:
        ret
557
558
;; LINUX builds only: emit an empty .note.GNU-stack section with the noexec
;; attribute (the conventional marker that this object does not need an
;; executable stack).
%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif