;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;       Function API:
;       uint64_t crc64_iso_refl_by8(
;               uint64_t init_crc,        //initial CRC value, 64 bits
;               const unsigned char *buf, //buffer pointer to calculate CRC on
;               uint64_t len              //buffer length in bytes (64-bit data)
;       );
;
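;       For reference, the value this routine computes is equivalent to the
;       following bitwise C sketch (a hedged illustration, not part of the
;       original source; it assumes the standard reflected CRC64-ISO
;       definition: polynomial x^64 + x^4 + x^3 + x + 1, whose bit-reflected
;       constant is 0xD800000000000000, with the seed and result complemented
;       as done by the not instructions below):
;
;       /* requires <stdint.h> */
;       uint64_t crc64_iso_refl_ref(uint64_t init_crc,
;                                   const unsigned char *buf, uint64_t len)
;       {
;               uint64_t rem = ~init_crc;
;               while (len--) {
;                       rem ^= *buf++;
;                       for (int i = 0; i < 8; i++)
;                               rem = (rem >> 1) ^
;                                     ((rem & 1) ? 0xD800000000000000ULL : 0);
;               }
;               return ~rem;
;       }
;
;       A typical first call passes init_crc = 0; subsequent calls chain the
;       previous return value.
;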
%include "reg_sizes.asm"

%define	fetch_dist	1024

[bits 64]
default rel

section .text


%ifidn __OUTPUT_FORMAT__, win64
	%xdefine	arg1 rcx
	%xdefine	arg2 rdx
	%xdefine	arg3 r8
%else
	%xdefine	arg1 rdi
	%xdefine	arg2 rsi
	%xdefine	arg3 rdx
%endif

%define TMP 16*0
%ifidn __OUTPUT_FORMAT__, win64
	%define XMM_SAVE 16*2
	%define VARIABLE_OFFSET 16*10+8
%else
	%define VARIABLE_OFFSET 16*2+8
%endif


align 16
global crc64_iso_refl_by8:function
crc64_iso_refl_by8:
	; uint64_t c = crc ^ 0xffffffffffffffffL;
	not	arg1
	sub	rsp, VARIABLE_OFFSET

%ifidn __OUTPUT_FORMAT__, win64
	; save the callee-saved xmm registers on the stack, as required by
	; the Windows x64 calling convention
	movdqa	[rsp + XMM_SAVE + 16*0], xmm6
	movdqa	[rsp + XMM_SAVE + 16*1], xmm7
	movdqa	[rsp + XMM_SAVE + 16*2], xmm8
	movdqa	[rsp + XMM_SAVE + 16*3], xmm9
	movdqa	[rsp + XMM_SAVE + 16*4], xmm10
	movdqa	[rsp + XMM_SAVE + 16*5], xmm11
	movdqa	[rsp + XMM_SAVE + 16*6], xmm12
	movdqa	[rsp + XMM_SAVE + 16*7], xmm13
%endif

	; check if smaller than 256B
	cmp	arg3, 256

	; for sizes less than 256B, we can't fold 128B at a time...
	jl	_less_than_256


	; load the initial crc value
	movq	xmm10, arg1	; initial crc
	; load the initial 128B of data, then xor in the initial crc value
	movdqu	xmm0, [arg2+16*0]
	movdqu	xmm1, [arg2+16*1]
	movdqu	xmm2, [arg2+16*2]
	movdqu	xmm3, [arg2+16*3]
	movdqu	xmm4, [arg2+16*4]
	movdqu	xmm5, [arg2+16*5]
	movdqu	xmm6, [arg2+16*6]
	movdqu	xmm7, [arg2+16*7]

	; XOR the initial_crc value
	pxor	xmm0, xmm10
	movdqa	xmm10, [rk3]	; xmm10 has rk3 and rk4; the imm value of the
				; pclmulqdq instruction selects which constant is used
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	; we subtract 256 instead of 128 to save one instruction from the loop
	sub	arg3, 256

	; at this point there are 128*x+y (0 <= y < 128) bytes of buffer left;
	; _fold_128_B_loop will fold 128B at a time until 128+y bytes remain


	; fold 128B at a time; this section of the code folds 8 xmm registers in parallel
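	; each 16B lane below computes new = (old.lo * rk4) xor (old.hi * rk3)
	; xor data, carried out as two pclmulqdq halves. a conceptual C
	; intrinsics sketch of one lane (a hedged illustration, not part of
	; the original source):
	;
	;       #include <immintrin.h>
	;       static inline __m128i fold_16(__m128i fold, __m128i rk3_rk4,
	;                                     __m128i data)
	;       {
	;               /* imm 0x10: fold.lo * rk4 (high qword of rk3_rk4) */
	;               __m128i lo = _mm_clmulepi64_si128(fold, rk3_rk4, 0x10);
	;               /* imm 0x01: fold.hi * rk3 (low qword of rk3_rk4) */
	;               __m128i hi = _mm_clmulepi64_si128(fold, rk3_rk4, 0x01);
	;               return _mm_xor_si128(_mm_xor_si128(lo, hi), data);
	;       }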
_fold_128_B_loop:

	; update the buffer pointer
	add	arg2, 128

	prefetchnta [arg2+fetch_dist+0]
	movdqu	xmm9, [arg2+16*0]
	movdqu	xmm12, [arg2+16*1]
	movdqa	xmm8, xmm0
	movdqa	xmm13, xmm1
	pclmulqdq	xmm0, xmm10, 0x10
	pclmulqdq	xmm8, xmm10, 0x01
	pclmulqdq	xmm1, xmm10, 0x10
	pclmulqdq	xmm13, xmm10, 0x01
	pxor	xmm0, xmm9
	xorps	xmm0, xmm8
	pxor	xmm1, xmm12
	xorps	xmm1, xmm13

	prefetchnta [arg2+fetch_dist+32]
	movdqu	xmm9, [arg2+16*2]
	movdqu	xmm12, [arg2+16*3]
	movdqa	xmm8, xmm2
	movdqa	xmm13, xmm3
	pclmulqdq	xmm2, xmm10, 0x10
	pclmulqdq	xmm8, xmm10, 0x01
	pclmulqdq	xmm3, xmm10, 0x10
	pclmulqdq	xmm13, xmm10, 0x01
	pxor	xmm2, xmm9
	xorps	xmm2, xmm8
	pxor	xmm3, xmm12
	xorps	xmm3, xmm13

	prefetchnta [arg2+fetch_dist+64]
	movdqu	xmm9, [arg2+16*4]
	movdqu	xmm12, [arg2+16*5]
	movdqa	xmm8, xmm4
	movdqa	xmm13, xmm5
	pclmulqdq	xmm4, xmm10, 0x10
	pclmulqdq	xmm8, xmm10, 0x01
	pclmulqdq	xmm5, xmm10, 0x10
	pclmulqdq	xmm13, xmm10, 0x01
	pxor	xmm4, xmm9
	xorps	xmm4, xmm8
	pxor	xmm5, xmm12
	xorps	xmm5, xmm13

	prefetchnta [arg2+fetch_dist+96]
	movdqu	xmm9, [arg2+16*6]
	movdqu	xmm12, [arg2+16*7]
	movdqa	xmm8, xmm6
	movdqa	xmm13, xmm7
	pclmulqdq	xmm6, xmm10, 0x10
	pclmulqdq	xmm8, xmm10, 0x01
	pclmulqdq	xmm7, xmm10, 0x10
	pclmulqdq	xmm13, xmm10, 0x01
	pxor	xmm6, xmm9
	xorps	xmm6, xmm8
	pxor	xmm7, xmm12
	xorps	xmm7, xmm13

	sub	arg3, 128

	; check if there is another 128B in the buffer to be able to fold
	jge	_fold_128_B_loop
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	add	arg2, 128
	; at this point, the buffer pointer is pointing at the last y bytes
	; of the buffer, where 0 <= y < 128
	; the 128B of folded data is in 8 xmm registers: xmm0 through xmm7


	; fold the 8 xmm registers to 1 xmm register with different constants
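	; each constant pair bridges the distance between its source register
	; and xmm7: rk9/rk10 fold xmm0 across 112B, rk11/rk12 fold xmm1 across
	; 96B, and so on down to rk1/rk2, which fold xmm6 across the final 16B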
	; xmm0 to xmm7
	movdqa	xmm10, [rk9]
	movdqa	xmm8, xmm0
	pclmulqdq	xmm0, xmm10, 0x01
	pclmulqdq	xmm8, xmm10, 0x10
	pxor	xmm7, xmm8
	xorps	xmm7, xmm0
	; xmm1 to xmm7
	movdqa	xmm10, [rk11]
	movdqa	xmm8, xmm1
	pclmulqdq	xmm1, xmm10, 0x01
	pclmulqdq	xmm8, xmm10, 0x10
	pxor	xmm7, xmm8
	xorps	xmm7, xmm1
	; xmm2 to xmm7
	movdqa	xmm10, [rk13]
	movdqa	xmm8, xmm2
	pclmulqdq	xmm2, xmm10, 0x01
	pclmulqdq	xmm8, xmm10, 0x10
	pxor	xmm7, xmm8
	pxor	xmm7, xmm2
	; xmm3 to xmm7
	movdqa	xmm10, [rk15]
	movdqa	xmm8, xmm3
	pclmulqdq	xmm3, xmm10, 0x01
	pclmulqdq	xmm8, xmm10, 0x10
	pxor	xmm7, xmm8
	xorps	xmm7, xmm3
	; xmm4 to xmm7
	movdqa	xmm10, [rk17]
	movdqa	xmm8, xmm4
	pclmulqdq	xmm4, xmm10, 0x01
	pclmulqdq	xmm8, xmm10, 0x10
	pxor	xmm7, xmm8
	pxor	xmm7, xmm4
	; xmm5 to xmm7
	movdqa	xmm10, [rk19]
	movdqa	xmm8, xmm5
	pclmulqdq	xmm5, xmm10, 0x01
	pclmulqdq	xmm8, xmm10, 0x10
	pxor	xmm7, xmm8
	xorps	xmm7, xmm5
	; xmm6 to xmm7
	movdqa	xmm10, [rk1]
	movdqa	xmm8, xmm6
	pclmulqdq	xmm6, xmm10, 0x01
	pclmulqdq	xmm8, xmm10, 0x10
	pxor	xmm7, xmm8
	pxor	xmm7, xmm6


	; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
	; instead of a cmp instruction, we use the sign flag with the jl instruction
	add	arg3, 128-16
	jl	_final_reduction_for_128

	; now we have 16+y bytes left to reduce. 16 bytes are in register xmm7
	; and the rest is in memory; we can fold 16 bytes at a time if y >= 16
	; continue folding 16B at a time

_16B_reduction_loop:
	movdqa	xmm8, xmm7
	pclmulqdq	xmm7, xmm10, 0x01
	pclmulqdq	xmm8, xmm10, 0x10
	pxor	xmm7, xmm8
	movdqu	xmm0, [arg2]
	pxor	xmm7, xmm0
	add	arg2, 16
	sub	arg3, 16
	; instead of a cmp instruction, we utilize the flags with the jge instruction
	; equivalent of: cmp arg3, 16-16
	; check if there is any more 16B in the buffer to be able to fold
	jge	_16B_reduction_loop

	; now we have 16+z bytes left to reduce, where 0 <= z < 16.
	; first, we reduce the data in the xmm7 register


_final_reduction_for_128:
	add	arg3, 16
	je	_128_done
	; here we are getting data that is less than 16 bytes.
	; since we know that there was data before the pointer, we can offset
	; the input pointer before the actual point to receive exactly 16 bytes.
	; after that, the registers need to be adjusted.
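	; the mask loaded from pshufb_shf_table below shifts the folded
	; remainder in xmm7 into position; the complementary mask (xor with
	; mask3 flips the top bit of each byte) selects the opposite bytes
	; for xmm2, and pblendvb merges in the reloaded tail from xmm1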
_get_last_two_xmms:


	movdqa	xmm2, xmm7
	movdqu	xmm1, [arg2 - 16 + arg3]

	; get rid of the extra data that was loaded before
	; load the shift constant
	lea	rax, [pshufb_shf_table]
	add	rax, arg3
	movdqu	xmm0, [rax]


	pshufb	xmm7, xmm0
	pxor	xmm0, [mask3]
	pshufb	xmm2, xmm0

	pblendvb	xmm2, xmm1	; xmm0 is implicit
	;;;;;;;;;;
	movdqa	xmm8, xmm7
	pclmulqdq	xmm7, xmm10, 0x01

	pclmulqdq	xmm8, xmm10, 0x10
	pxor	xmm7, xmm8
	pxor	xmm7, xmm2

_128_done:
	; compute crc of a 128-bit value
	movdqa	xmm10, [rk5]
	movdqa	xmm0, xmm7

	; 64b fold
	pclmulqdq	xmm7, xmm10, 0
	psrldq	xmm0, 8
	pxor	xmm7, xmm0

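	; the remaining 128-bit value is reduced to a 64-bit crc via Barrett
	; reduction, using rk7 = floor(2^128/Q) and rk8 = Q. a conceptual C
	; intrinsics sketch of the same steps, reusing the immintrin.h
	; intrinsics above (a hedged illustration, not part of this source):
	;
	;       static inline uint64_t barrett64(__m128i crc, __m128i rk7_rk8)
	;       {
	;               /* imm 0x00: crc.lo * rk7 (low qword of rk7_rk8)  */
	;               __m128i t1 = _mm_clmulepi64_si128(crc, rk7_rk8, 0x00);
	;               /* imm 0x10: t1.lo * rk8 (high qword of rk7_rk8)  */
	;               __m128i t2 = _mm_clmulepi64_si128(t1, rk7_rk8, 0x10);
	;               t2 = _mm_xor_si128(t2, _mm_slli_si128(t1, 8));
	;               t2 = _mm_xor_si128(t2, crc);
	;               return (uint64_t)_mm_extract_epi64(t2, 1); /* high qword */
	;       }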
; barrett reduction
_barrett:
	movdqa	xmm1, xmm7
	movdqa	xmm10, [rk7]

	pclmulqdq	xmm7, xmm10, 0
	movdqa	xmm2, xmm7
	pclmulqdq	xmm7, xmm10, 0x10
	pslldq	xmm2, 8
	pxor	xmm7, xmm2
	pxor	xmm7, xmm1
	pextrq	rax, xmm7, 1

_cleanup:
	; return c ^ 0xffffffffffffffffL;
	not	rax


%ifidn __OUTPUT_FORMAT__, win64
	movdqa	xmm6, [rsp + XMM_SAVE + 16*0]
	movdqa	xmm7, [rsp + XMM_SAVE + 16*1]
	movdqa	xmm8, [rsp + XMM_SAVE + 16*2]
	movdqa	xmm9, [rsp + XMM_SAVE + 16*3]
	movdqa	xmm10, [rsp + XMM_SAVE + 16*4]
	movdqa	xmm11, [rsp + XMM_SAVE + 16*5]
	movdqa	xmm12, [rsp + XMM_SAVE + 16*6]
	movdqa	xmm13, [rsp + XMM_SAVE + 16*7]
%endif
	add	rsp, VARIABLE_OFFSET
	ret

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
_less_than_256:

	; check if there is enough buffer to be able to fold 16B at a time
	cmp	arg3, 32
	jl	_less_than_32

	; if there is, load the constants
	movdqa	xmm10, [rk1]	; rk1 and rk2 in xmm10

	movq	xmm0, arg1	; get the initial crc value
	movdqu	xmm7, [arg2]	; load the plaintext
	pxor	xmm7, xmm0

	; update the buffer pointer
	add	arg2, 16

	; update the counter. subtract 32 instead of 16 to save one instruction from the loop
	sub	arg3, 32

	jmp	_16B_reduction_loop

align 16
_less_than_32:
	; move the initial crc to the return value. this is necessary for zero-length buffers.
	mov	rax, arg1
	test	arg3, arg3
	je	_cleanup

	movq	xmm0, arg1	; get the initial crc value

	cmp	arg3, 16
	je	_exact_16_left
	jl	_less_than_16_left

	movdqu	xmm7, [arg2]	; load the plaintext
	pxor	xmm7, xmm0	; xor the initial crc value
	add	arg2, 16
	sub	arg3, 16
	movdqa	xmm10, [rk1]	; rk1 and rk2 in xmm10
	jmp	_get_last_two_xmms


align 16
_less_than_16_left:
	; use stack space to load data less than 16 bytes; zero out the 16B of memory first

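	; conceptually, the byte-granular copies below stage the tail in a
	; zeroed 16B stack slot, as in this hedged C illustration (not part
	; of the original source):
	;
	;       unsigned char tmp[16] = {0};
	;       memcpy(tmp, buf, len);  /* len < 16 */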
	pxor	xmm1, xmm1
	mov	r11, rsp
	movdqa	[r11], xmm1

	; back up the counter value
	mov	r9, arg3
	cmp	arg3, 8
	jl	_less_than_8_left

	; load 8 bytes
	mov	rax, [arg2]
	mov	[r11], rax
	add	r11, 8
	sub	arg3, 8
	add	arg2, 8
_less_than_8_left:

	cmp	arg3, 4
	jl	_less_than_4_left

	; load 4 bytes
	mov	eax, [arg2]
	mov	[r11], eax
	add	r11, 4
	sub	arg3, 4
	add	arg2, 4
_less_than_4_left:

	cmp	arg3, 2
	jl	_less_than_2_left

	; load 2 bytes
	mov	ax, [arg2]
	mov	[r11], ax
	add	r11, 2
	sub	arg3, 2
	add	arg2, 2
_less_than_2_left:
	cmp	arg3, 1
	jl	_zero_left

	; load 1 byte
	mov	al, [arg2]
	mov	[r11], al

_zero_left:
	movdqa	xmm7, [rsp]
	pxor	xmm7, xmm0	; xor the initial crc value

	lea	rax, [pshufb_shf_table]

	cmp	r9, 8
	jl	_end_1to7

_end_8to15:
	movdqu	xmm0, [rax + r9]
	pshufb	xmm7, xmm0
	jmp	_128_done

_end_1to7:
	; left shift (8-length) bytes in XMM
	movdqu	xmm0, [rax + r9 + 8]
	pshufb	xmm7, xmm0

	jmp	_barrett

align 16
_exact_16_left:
	movdqu	xmm7, [arg2]
	pxor	xmm7, xmm0	; xor the initial crc value

	jmp	_128_done

section .data

; precomputed constants
align 16
; rk7 = floor(2^128/Q)
; rk8 = Q
rk1:
	DQ 0xf500000000000001
rk2:
	DQ 0x6b70000000000001
rk3:
	DQ 0xb001000000010000
rk4:
	DQ 0xf501b0000001b000
rk5:
	DQ 0xf500000000000001
rk6:
	DQ 0x0000000000000000
rk7:
	DQ 0xb000000000000001
rk8:
	DQ 0xb000000000000000
rk9:
	DQ 0xe014514514501501
rk10:
	DQ 0x771db6db6db71c71
rk11:
	DQ 0xa101101101110001
rk12:
	DQ 0x1ab1ab1ab1aab001
rk13:
	DQ 0xf445014445000001
rk14:
	DQ 0x6aab71daab700001
rk15:
	DQ 0xb100010100000001
rk16:
	DQ 0x01b001b1b0000001
rk17:
	DQ 0xe145150000000001
rk18:
	DQ 0x76db6c7000000001
rk19:
	DQ 0xa011000000000001
rk20:
	DQ 0x1b1ab00000000001
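
; each rk pair above is a precomputed remainder of a power of x modulo the
; (bit-reflected) polynomial Q, chosen to match the fold distance at the
; site where it is used; rk7/rk8 are the Barrett pair noted above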

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
;	dq 0x8887868584838281, 0x008f8e8d8c8b8a89	; shl 15 (16-1) / shr1
;	dq 0x8988878685848382, 0x01008f8e8d8c8b8a	; shl 14 (16-2) / shr2
;	dq 0x8a89888786858483, 0x0201008f8e8d8c8b	; shl 13 (16-3) / shr3
;	dq 0x8b8a898887868584, 0x030201008f8e8d8c	; shl 12 (16-4) / shr4
;	dq 0x8c8b8a8988878685, 0x04030201008f8e8d	; shl 11 (16-5) / shr5
;	dq 0x8d8c8b8a89888786, 0x0504030201008f8e	; shl 10 (16-6) / shr6
;	dq 0x8e8d8c8b8a898887, 0x060504030201008f	; shl 9  (16-7) / shr7
;	dq 0x8f8e8d8c8b8a8988, 0x0706050403020100	; shl 8  (16-8) / shr8
;	dq 0x008f8e8d8c8b8a89, 0x0807060504030201	; shl 7  (16-9) / shr9
;	dq 0x01008f8e8d8c8b8a, 0x0908070605040302	; shl 6  (16-10) / shr10
;	dq 0x0201008f8e8d8c8b, 0x0a09080706050403	; shl 5  (16-11) / shr11
;	dq 0x030201008f8e8d8c, 0x0b0a090807060504	; shl 4  (16-12) / shr12
;	dq 0x04030201008f8e8d, 0x0c0b0a0908070605	; shl 3  (16-13) / shr13
;	dq 0x0504030201008f8e, 0x0d0c0b0a09080706	; shl 2  (16-14) / shr14
;	dq 0x060504030201008f, 0x0e0d0c0b0a090807	; shl 1  (16-15) / shr15
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908


mask:
	dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
mask2:
	dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
mask3:
	dq 0x8080808080808080, 0x8080808080808080

;;;       func            core, ver, snum
slversion crc64_iso_refl_by8, 01, 00, 0023