;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
;       Function API:
;       UINT32 crc32_ieee_by4(
;               UINT32 init_crc,          // initial CRC value, 32 bits
;               const unsigned char *buf, // buffer pointer to calculate CRC on
;               UINT64 len                // buffer length in bytes (64-bit data)
;       );
;
;       Authors:
;               Erdinc Ozturk
;               Vinodh Gopal
;               James Guilford
;
;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
;       URL: http://download.intel.com/design/intarch/papers/323102.pdf
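;
;       Method overview (see the reference paper above): the CRC is the
;       remainder of the message, viewed as a polynomial over GF(2), modulo
;       P(x) = 0x104c11db7 (the IEEE 802.3 polynomial). PCLMULQDQ performs a
;       64x64 -> 128-bit carry-less multiply, which lets a 128-bit chunk T be
;       folded N bits ahead with two multiplies against the precomputed
;       constants (x^(N+64) mod P) and (x^N mod P) instead of dividing the
;       whole message at once:
;           T * x^N mod P = (T_hi * x^(N+64) + T_lo * x^N) mod P
;
;       A minimal C-side usage sketch (a sketch only: it assumes the usual
;       ISA-L convention of seeding with 0 and chaining the previous return
;       value; buf/len are placeholder names):
;
;           uint32_t crc = crc32_ieee_by4(0, buf, len);    // one-shot
;           crc = crc32_ieee_by4(crc, buf2, len2);         // continue a stream
;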

%include "reg_sizes.asm"

[bits 64]
default rel

section .text

%ifidn __OUTPUT_FORMAT__, win64
	%xdefine	arg1 rcx
	%xdefine	arg2 rdx
	%xdefine	arg3 r8

	%xdefine	arg1_low32 ecx
%else
	%xdefine	arg1 rdi
	%xdefine	arg2 rsi
	%xdefine	arg3 rdx

	%xdefine	arg1_low32 edi
%endif

%ifidn __OUTPUT_FORMAT__, win64
	%define XMM_SAVE 16*2
	%define VARIABLE_OFFSET 16*4+8
%else
	%define VARIABLE_OFFSET 16*2+8
%endif

align 16
global crc32_ieee_by4:function
crc32_ieee_by4:

	not	arg1_low32		; invert the initial crc; inverted back at _cleanup

	sub	rsp, VARIABLE_OFFSET

%ifidn __OUTPUT_FORMAT__, win64
	; save xmm6 and xmm7 on the stack: they are callee-saved under the win64 ABI
	movdqa	[rsp + XMM_SAVE + 16*0], xmm6
	movdqa	[rsp + XMM_SAVE + 16*1], xmm7
%endif

	; check if the buffer is smaller than 128B
	cmp	arg3, 128
	jl	_less_than_128

	; load the initial crc value
	movd	xmm6, arg1_low32	; initial crc
	; the crc value does not need to be byte-reflected, but it does need to
	; be moved to the high part of the register, because the data will be
	; byte-reflected and will then align with the initial crc at the
	; correct place
	pslldq	xmm6, 12

	movdqa	xmm7, [SHUF_MASK]
	; load the initial 64B of data and xor in the initial crc value
	movdqu	xmm0, [arg2]
	movdqu	xmm1, [arg2+16]
	movdqu	xmm2, [arg2+32]
	movdqu	xmm3, [arg2+48]

	pshufb	xmm0, xmm7
	; XOR the initial_crc value
	pxor	xmm0, xmm6
	pshufb	xmm1, xmm7
	pshufb	xmm2, xmm7
	pshufb	xmm3, xmm7

	movdqa	xmm6, [rk3]	; k3 = 2^480 mod POLY << 32
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	; we subtract 128 instead of 64 to save one instruction from the loop
	sub	arg3, 128

	; at this point there are 64*x+y (0 <= y < 64) bytes of buffer left.
	; the _fold_64_B_loop will fold 64B at a time until only 64+y bytes
	; of buffer remain

	; fold 64B at a time. This section of the code folds 4 xmm registers in parallel
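	; per iteration, each of the four 128-bit lanes is folded 64B forward:
	;     T = (T_hi clmul k_hi) xor (T_lo clmul k_lo) xor next_data
	; the two pclmulqdq ops advance the lane's running remainder by
	; 512 bits (one 64B stride) modulo P before the next 64B of
	; byte-reflected input is xored in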
_fold_64_B_loop:

	; update the buffer pointer
	add	arg2, 64

	movdqa	xmm4, xmm0
	movdqa	xmm5, xmm1

	pclmulqdq	xmm0, xmm6, 0x11
	pclmulqdq	xmm1, xmm6, 0x11

	pclmulqdq	xmm4, xmm6, 0x0
	pclmulqdq	xmm5, xmm6, 0x0

	pxor	xmm0, xmm4
	pxor	xmm1, xmm5

	movdqa	xmm4, xmm2
	movdqa	xmm5, xmm3

	pclmulqdq	xmm2, xmm6, 0x11
	pclmulqdq	xmm3, xmm6, 0x11

	pclmulqdq	xmm4, xmm6, 0x0
	pclmulqdq	xmm5, xmm6, 0x0

	pxor	xmm2, xmm4
	pxor	xmm3, xmm5

	movdqu	xmm4, [arg2]
	movdqu	xmm5, [arg2+16]
	pshufb	xmm4, xmm7
	pshufb	xmm5, xmm7
	pxor	xmm0, xmm4
	pxor	xmm1, xmm5

	movdqu	xmm4, [arg2+32]
	movdqu	xmm5, [arg2+48]
	pshufb	xmm4, xmm7
	pshufb	xmm5, xmm7

	pxor	xmm2, xmm4
	pxor	xmm3, xmm5

	sub	arg3, 64

	; check if there is another 64B in the buffer to be able to fold
	jge	_fold_64_B_loop
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	add	arg2, 64
	; at this point, arg2 is pointing at the last y bytes of the buffer
	; and the 64B of folded data is in 4 of the xmm registers: xmm0, xmm1, xmm2, xmm3

	movdqa	xmm6, [rk1]	; k1

	; fold the 4 xmm registers down to 1 xmm register, 16 bytes at a time,
	; using the rk1/rk2 constants
	movdqa	xmm4, xmm0
	pclmulqdq	xmm0, xmm6, 0x11
	pclmulqdq	xmm4, xmm6, 0x0
	pxor	xmm1, xmm4
	xorps	xmm1, xmm0

	movdqa	xmm4, xmm1
	pclmulqdq	xmm1, xmm6, 0x11
	pclmulqdq	xmm4, xmm6, 0x0
	pxor	xmm2, xmm4
	xorps	xmm2, xmm1

	movdqa	xmm4, xmm2
	pclmulqdq	xmm2, xmm6, 0x11
	pclmulqdq	xmm4, xmm6, 0x0
	pxor	xmm3, xmm4
	pxor	xmm3, xmm2
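	; each of the three combine steps above folds one register forward by
	; 128 bits and xors it into its neighbor:
	;     xmm(i+1) ^= (xmm(i)_hi clmul k1) xor (xmm(i)_lo clmul k2)
	; after the chain, xmm3 holds the single 128-bit running remainder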

	; instead of 64, we add 48 to the loop counter to save 1 instruction from the loop
	; instead of a cmp instruction, we use the sign flag with the jl instruction
	add	arg3, 64-16
	jl	_final_reduction_for_128

	; now we have 16+y bytes left to reduce. 16 bytes are in register xmm3
	; and the rest is in memory; we can fold 16 bytes at a time if y >= 16
	; continue folding 16B at a time

_16B_reduction_loop:
	movdqa	xmm4, xmm3
	pclmulqdq	xmm3, xmm6, 0x11
	pclmulqdq	xmm4, xmm6, 0x0
	pxor	xmm3, xmm4
	movdqu	xmm0, [arg2]
	pshufb	xmm0, xmm7
	pxor	xmm3, xmm0
	add	arg2, 16
	sub	arg3, 16
	; instead of a cmp instruction, we utilize the flags with the jge instruction
	; equivalent of: cmp arg3, 16-16
	; check if there is any more 16B in the buffer to be able to fold
	jge	_16B_reduction_loop

	; now we have 16+z bytes left to reduce, where 0 <= z < 16.
	; first, we reduce the data in the xmm3 register

_final_reduction_for_128:
	; check if any more data to fold. If not, compute the CRC of the final 128 bits
	add	arg3, 16
	je	_128_done

	; here we are getting data that is less than 16 bytes.
	; since we know that there was data before the pointer, we can offset
	; the input pointer before the actual point to receive exactly 16 bytes.
	; after that, the registers need to be adjusted.
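	; how the adjustment works: the pshufb_shf_table lookup (indexed by the
	; number of leftover bytes) yields a byte-permutation mask. pshufb with
	; that mask shifts one copy of the remainder; xoring the mask with
	; [mask3] (0x80 in every byte) flips each lane's pshufb "zero this
	; lane" bit, producing the complementary shift for the other copy.
	; pblendvb then merges the shifted-out remainder bytes with the newly
	; loaded tail bytes, leaving one full 16B block plus a correctly
	; aligned remainder to fold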
_get_last_two_xmms:
	movdqa	xmm2, xmm3

	movdqu	xmm1, [arg2 - 16 + arg3]
	pshufb	xmm1, xmm7

	shl	arg3, 4
	lea	rax, [pshufb_shf_table + 15*16]
	sub	rax, arg3
	movdqu	xmm0, [rax]

	pshufb	xmm2, xmm0

	pxor	xmm0, [mask3]

	pshufb	xmm3, xmm0

	pblendvb	xmm1, xmm2	; xmm0 is implicit

	movdqa	xmm2, xmm1

	movdqa	xmm4, xmm3
	pclmulqdq	xmm3, xmm6, 0x11

	pclmulqdq	xmm4, xmm6, 0x0
	pxor	xmm3, xmm4
	pxor	xmm3, xmm2

_128_done:
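	; reduce the 128-bit remainder in xmm3 down to 64 bits and then to
	; 32 bits, folding the upper halves down with the rk5/rk6 constants
	; before the final Barrett reduction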

	movdqa	xmm6, [rk5]
	movdqa	xmm0, xmm3

	; 64b fold
	pclmulqdq	xmm3, xmm6, 0x1
	pslldq	xmm0, 8
	pxor	xmm3, xmm0

	; 32b fold
	movdqa	xmm0, xmm3

	pand	xmm0, [mask4]

	psrldq	xmm3, 12
	pclmulqdq	xmm3, xmm6, 0x10
	pxor	xmm3, xmm0

	; barrett reduction
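	; Barrett reduction computes the final remainder without a division
	; (per the reference paper): with mu = floor(x^64 / P(x)) precomputed
	; in rk7 and P(x) itself in rk8,
	;     T1  = floor(R / x^32) * mu
	;     T2  = floor(T1 / x^32) * P
	;     CRC = (R xor T2) mod x^32
	; all products carry-less; the quotient estimate mu lets two
	; pclmulqdq ops replace the long division by the polynomial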
_barrett:
	movdqa	xmm6, [rk7]
	movdqa	xmm0, xmm3
	pclmulqdq	xmm3, xmm6, 0x01
	pslldq	xmm3, 4
	pclmulqdq	xmm3, xmm6, 0x11

	pslldq	xmm3, 4
	pxor	xmm3, xmm0
	pextrd	eax, xmm3, 1

_cleanup:
	not	eax
%ifidn __OUTPUT_FORMAT__, win64
	movdqa	xmm6, [rsp + XMM_SAVE + 16*0]
	movdqa	xmm7, [rsp + XMM_SAVE + 16*1]
%endif
	add	rsp, VARIABLE_OFFSET
	ret

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
_less_than_128:

	; check if there is enough buffer to be able to fold 16B at a time
	cmp	arg3, 32
	jl	_less_than_32
	movdqa	xmm7, [SHUF_MASK]

	; if there is, load the constants
	movdqa	xmm6, [rk1]	; k1

	movd	xmm0, arg1_low32
	pslldq	xmm0, 12
	movdqu	xmm3, [arg2]
	pshufb	xmm3, xmm7
	pxor	xmm3, xmm0

	; update the buffer pointer
	add	arg2, 16

	; update the counter. subtract 32 instead of 16 to save one instruction from the loop
	sub	arg3, 32

	jmp	_16B_reduction_loop

align 16
_less_than_32:
	mov	eax, arg1_low32
	test	arg3, arg3
	je	_cleanup

	movdqa	xmm7, [SHUF_MASK]

	movd	xmm0, arg1_low32
	pslldq	xmm0, 12

	cmp	arg3, 16
	je	_exact_16_left
	jl	_less_than_16_left
	; xmm0 already holds the shifted initial crc from above
	movdqu	xmm3, [arg2]
	pshufb	xmm3, xmm7
	pxor	xmm3, xmm0
	add	arg2, 16
	sub	arg3, 16
	movdqa	xmm6, [rk1]	; k1
	jmp	_get_last_two_xmms

align 16
_less_than_16_left:
	; use stack space to load data less than 16 bytes; zero out the 16B of
	; memory first

	pxor	xmm1, xmm1
	mov	r11, rsp
	movdqa	[r11], xmm1

	cmp	arg3, 4
	jl	_only_less_than_4

	; backup the counter value
	mov	r9, arg3

	cmp	arg3, 8
	jl	_less_than_8_left
	; copy 8 bytes
	mov	rax, [arg2]
	mov	[r11], rax
	add	r11, 8
	sub	arg3, 8
	add	arg2, 8
_less_than_8_left:

	cmp	arg3, 4
	jl	_less_than_4_left
	; copy 4 bytes
	mov	eax, [arg2]
	mov	[r11], eax
	add	r11, 4
	sub	arg3, 4
	add	arg2, 4
_less_than_4_left:

	cmp	arg3, 2
	jl	_less_than_2_left
	; copy 2 bytes
	mov	ax, [arg2]
	mov	[r11], ax
	add	r11, 2
	sub	arg3, 2
	add	arg2, 2
_less_than_2_left:
	cmp	arg3, 1
	jl	_zero_left
	; copy 1 byte
	mov	al, [arg2]
	mov	[r11], al

_zero_left:
	movdqa	xmm3, [rsp]
	pshufb	xmm3, xmm7
	pxor	xmm3, xmm0	; xor the initial crc value

	; shift xmm3 right by (16 - r9) bytes via the pshufb table so the
	; r9 data bytes land in the low end of the register
	shl	r9, 4
	lea	rax, [pshufb_shf_table + 15*16]
	sub	rax, r9
	movdqu	xmm0, [rax]
	pxor	xmm0, [mask3]

	pshufb	xmm3, xmm0
	jmp	_128_done

align 16
_exact_16_left:
	movdqu	xmm3, [arg2]
	pshufb	xmm3, xmm7
	pxor	xmm3, xmm0	; xor the initial crc value

	jmp	_128_done

_only_less_than_4:
	cmp	arg3, 3
	jl	_only_less_than_3

	; copy 3 bytes
	mov	al, [arg2]
	mov	[r11], al

	mov	al, [arg2+1]
	mov	[r11+1], al

	mov	al, [arg2+2]
	mov	[r11+2], al

	movdqa	xmm3, [rsp]
	pshufb	xmm3, xmm7
	pxor	xmm3, xmm0	; xor the initial crc value

	psrldq	xmm3, 5

	jmp	_barrett
_only_less_than_3:
	cmp	arg3, 2
	jl	_only_less_than_2

	; copy 2 bytes
	mov	al, [arg2]
	mov	[r11], al

	mov	al, [arg2+1]
	mov	[r11+1], al

	movdqa	xmm3, [rsp]
	pshufb	xmm3, xmm7
	pxor	xmm3, xmm0	; xor the initial crc value

	psrldq	xmm3, 6

	jmp	_barrett
_only_less_than_2:
	; copy 1 byte
	mov	al, [arg2]
	mov	[r11], al

	movdqa	xmm3, [rsp]
	pshufb	xmm3, xmm7
	pxor	xmm3, xmm0	; xor the initial crc value

	psrldq	xmm3, 7

	jmp	_barrett
; precomputed constants
section .data

align 16
rk1:
dq 0xf200aa6600000000
rk2:
dq 0x17d3315d00000000
rk3:
dq 0xd3504ec700000000
rk4:
dq 0x57a8445500000000
rk5:
dq 0xf200aa6600000000
rk6:
dq 0x490d678d00000000
rk7:
dq 0x0000000104d101df
rk8:
dq 0x0000000104c11db7
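; rk1..rk6 are fold constants of the form (x^n mod P) placed in the upper
; 32 bits of the qword (hence the trailing 00000000), one pair per fold
; distance used above; rk7 is the Barrett constant floor(x^64 / P(x)) and
; rk8 is the polynomial P(x) itself (0x104c11db7)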
mask:
dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
mask2:
dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
mask3:
dq 0x8080808080808080, 0x8080808080808080
mask4:
dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
align 32
pshufb_shf_table:
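; each 16B entry is a pshufb control mask: control bytes 0x00-0x0f select
; a source byte, while any byte with bit 7 set (the 0x8n values) zeroes
; the destination lane. xoring an entry with mask3 (0x80 per byte) flips
; every lane between "select" and "zero", turning a right-shift mask into
; the complementary left-shift mask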
dq 0x8887868584838281, 0x008f8e8d8c8b8a89	; shl 15 (16-1)  / shr1
dq 0x8988878685848382, 0x01008f8e8d8c8b8a	; shl 14 (16-2)  / shr2
dq 0x8a89888786858483, 0x0201008f8e8d8c8b	; shl 13 (16-3)  / shr3
dq 0x8b8a898887868584, 0x030201008f8e8d8c	; shl 12 (16-4)  / shr4
dq 0x8c8b8a8988878685, 0x04030201008f8e8d	; shl 11 (16-5)  / shr5
dq 0x8d8c8b8a89888786, 0x0504030201008f8e	; shl 10 (16-6)  / shr6
dq 0x8e8d8c8b8a898887, 0x060504030201008f	; shl 9  (16-7)  / shr7
dq 0x8f8e8d8c8b8a8988, 0x0706050403020100	; shl 8  (16-8)  / shr8
dq 0x008f8e8d8c8b8a89, 0x0807060504030201	; shl 7  (16-9)  / shr9
dq 0x01008f8e8d8c8b8a, 0x0908070605040302	; shl 6  (16-10) / shr10
dq 0x0201008f8e8d8c8b, 0x0a09080706050403	; shl 5  (16-11) / shr11
dq 0x030201008f8e8d8c, 0x0b0a090807060504	; shl 4  (16-12) / shr12
dq 0x04030201008f8e8d, 0x0c0b0a0908070605	; shl 3  (16-13) / shr13
dq 0x0504030201008f8e, 0x0d0c0b0a09080706	; shl 2  (16-14) / shr14
dq 0x060504030201008f, 0x0e0d0c0b0a090807	; shl 1  (16-15) / shr15

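; SHUF_MASK is a pshufb control that reverses all 16 bytes of a register,
; converting the loaded little-endian data to the big-endian byte order
; that the non-reflected polynomial arithmetic expects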
SHUF_MASK	dq 0x08090A0B0C0D0E0F, 0x0001020304050607

;;;       func            core, ver, snum
slversion crc32_ieee_by4, 05,   02,  0017