;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Function API:
;	UINT32 crc32_ieee_by4(
;		UINT32 init_crc,          //initial CRC value, 32 bits
;		const unsigned char *buf, //buffer pointer to calculate CRC on
;		UINT64 len                //buffer length in bytes (64-bit data)
;	);
;
; Authors:
;	Erdinc Ozturk
;	Vinodh Gopal
;	James Guilford
;
; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
; URL: http://download.intel.com/design/intarch/papers/323102.pdf
;
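; Illustrative use from C (a sketch, not part of this file; the
; zero-seed and chaining conventions are inferred from the not/not
; pair at function entry and exit):
;	uint32_t crc;
;	crc = crc32_ieee_by4(0, buf, len);     // CRC of one buffer
;	crc = crc32_ieee_by4(crc, buf2, len2); // extend over a second buffer
;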

%include "reg_sizes.asm"

%define fetch_dist 1024

[bits 64]
default rel

section .text

%ifidn __OUTPUT_FORMAT__, win64
	%xdefine	arg1 rcx
	%xdefine	arg2 rdx
	%xdefine	arg3 r8

	%xdefine	arg1_low32 ecx
%else
	%xdefine	arg1 rdi
	%xdefine	arg2 rsi
	%xdefine	arg3 rdx

	%xdefine	arg1_low32 edi
%endif

%ifidn __OUTPUT_FORMAT__, win64
	%define XMM_SAVE 16*2
	%define VARIABLE_OFFSET 16*4+8
%else
	%define VARIABLE_OFFSET 16*2+8
%endif

align 16
mk_global crc32_ieee_by4, function
crc32_ieee_by4:
	endbranch

	not arg1_low32

	sub rsp, VARIABLE_OFFSET

%ifidn __OUTPUT_FORMAT__, win64
	; save xmm6 and xmm7 on the stack (callee-saved in the win64 ABI)
	movdqa [rsp + XMM_SAVE + 16*0], xmm6
	movdqa [rsp + XMM_SAVE + 16*1], xmm7
%endif

	; check if smaller than 128B
	cmp arg3, 128
	jl _less_than_128


	; load the initial crc value
	movd xmm6, arg1_low32 ; initial crc
	; the crc value does not need to be byte-reflected, but it does need
	; to be moved to the high part of the register, because the data will
	; be byte-reflected and will line up with the initial crc there
	pslldq xmm6, 12
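	; (shifting left by 12 bytes puts the 32-bit crc in bits [127:96];
	; after the pshufb byte-reflection below, buffer byte 0 sits in the
	; top byte lane, so the crc xors against the first 4 message bytes)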

	movdqa xmm7, [SHUF_MASK]
	; load the initial 64B of data and xor in the initial crc value
	movdqu xmm0, [arg2]
	movdqu xmm1, [arg2+16]
	movdqu xmm2, [arg2+32]
	movdqu xmm3, [arg2+48]

	pshufb xmm0, xmm7
	; XOR the initial_crc value
	pxor xmm0, xmm6
	pshufb xmm1, xmm7
	pshufb xmm2, xmm7
	pshufb xmm3, xmm7

	movdqa xmm6, [rk3] ;k3=2^480 mod POLY << 32
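	; xmm6 now holds the qword pair {rk3, rk4}: pclmulqdq with imm8 0x0
	; multiplies by the low qword (rk3) and imm8 0x11 by the high qword
	; (rk4). Each fold step below computes, with carry-less multiplies
	; in GF(2)[x]:
	;	xmmN = (xmmN[127:64] * rk4) xor (xmmN[63:0] * rk3) xor next_16B
	; which folds each register onto the data 64B (512 bits) ahead of it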
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	; we subtract 128 instead of 64 to save one instruction from the loop
	sub arg3, 128

	; at this point there are 64*x+y (0<=y<64) bytes of buffer; the
	; _fold_64_B_loop will fold 64B at a time until 64+y bytes remain


	; fold 64B at a time; this section folds 4 xmm registers in parallel
_fold_64_B_loop:

	; update the buffer pointer
	add arg2, 64

	prefetchnta [arg2+fetch_dist+0]
	movdqa xmm4, xmm0
	movdqa xmm5, xmm1

	pclmulqdq xmm0, xmm6, 0x11
	pclmulqdq xmm1, xmm6, 0x11

	pclmulqdq xmm4, xmm6, 0x0
	pclmulqdq xmm5, xmm6, 0x0

	pxor xmm0, xmm4
	pxor xmm1, xmm5

	prefetchnta [arg2+fetch_dist+32]
	movdqa xmm4, xmm2
	movdqa xmm5, xmm3

	pclmulqdq xmm2, xmm6, 0x11
	pclmulqdq xmm3, xmm6, 0x11

	pclmulqdq xmm4, xmm6, 0x0
	pclmulqdq xmm5, xmm6, 0x0

	pxor xmm2, xmm4
	pxor xmm3, xmm5

	movdqu xmm4, [arg2]
	movdqu xmm5, [arg2+16]
	pshufb xmm4, xmm7
	pshufb xmm5, xmm7
	pxor xmm0, xmm4
	pxor xmm1, xmm5

	movdqu xmm4, [arg2+32]
	movdqu xmm5, [arg2+48]
	pshufb xmm4, xmm7
	pshufb xmm5, xmm7

	pxor xmm2, xmm4
	pxor xmm3, xmm5

	sub arg3, 64

	; check if there is another 64B in the buffer to be able to fold
	jge _fold_64_B_loop
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


	add arg2, 64
	; at this point arg2 points at the last y bytes of the buffer and
	; the folded 64B of data is in 4 xmm registers: xmm0, xmm1, xmm2, xmm3


	movdqa xmm6, [rk1] ;k1
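	; xmm6 holds the pair {rk1, rk2}, the constants for folding a single
	; 16B (128-bit) distance; imm8 0x11 multiplies the high qwords
	; (by rk2) and imm8 0x0 the low qwords (by rk1)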

	; fold the 4 xmm registers down to 1 xmm register
	movdqa xmm4, xmm0
	pclmulqdq xmm0, xmm6, 0x11
	pclmulqdq xmm4, xmm6, 0x0
	pxor xmm1, xmm4
	xorps xmm1, xmm0

	movdqa xmm4, xmm1
	pclmulqdq xmm1, xmm6, 0x11
	pclmulqdq xmm4, xmm6, 0x0
	pxor xmm2, xmm4
	xorps xmm2, xmm1

	movdqa xmm4, xmm2
	pclmulqdq xmm2, xmm6, 0x11
	pclmulqdq xmm4, xmm6, 0x0
	pxor xmm3, xmm4
	pxor xmm3, xmm2


	; instead of 64, we add 64-16=48 to the loop counter to save one
	; instruction; instead of a cmp instruction, we use the sign flag
	; with the jl instruction
	add arg3, 64-16
	jl _final_reduction_for_128

; now we have 16+y bytes left to reduce; 16 bytes are in register xmm3 and
; the rest is in memory; we can fold 16 bytes at a time if y>=16, so
; continue folding 16B at a time

_16B_reduction_loop:
	movdqa xmm4, xmm3
	pclmulqdq xmm3, xmm6, 0x11
	pclmulqdq xmm4, xmm6, 0x0
	pxor xmm3, xmm4
	movdqu xmm0, [arg2]
	pshufb xmm0, xmm7
	pxor xmm3, xmm0
	add arg2, 16
	sub arg3, 16
	; instead of a cmp instruction, we utilize the flags with the jge
	; instruction (equivalent of: cmp arg3, 16-16)
	; check if there is any more 16B in the buffer to be able to fold
	jge _16B_reduction_loop

	; now we have 16+z bytes left to reduce, where 0<=z<16;
	; first, we reduce the data in the xmm3 register



_final_reduction_for_128:
	; check if any more data to fold. If not, compute the CRC of the final 128 bits
	add arg3, 16
	je _128_done

	; here fewer than 16 bytes of tail data remain. Since we know there
	; was data before the current pointer, we can offset the input
	; pointer backwards so that a 16-byte load ends exactly at the end
	; of the buffer; after that the registers need to be adjusted.
_get_last_two_xmms:
	movdqa xmm2, xmm3

	movdqu xmm1, [arg2 - 16 + arg3]
	pshufb xmm1, xmm7

	shl arg3, 4
	lea rax, [pshufb_shf_table + 15*16]
	sub rax, arg3
	movdqu xmm0, [rax]

	pshufb xmm2, xmm0

	pxor xmm0, [mask3]

	pshufb xmm3, xmm0

	pblendvb xmm1, xmm2 ;xmm0 is implicit

	movdqa xmm2, xmm1

	movdqa xmm4, xmm3
	pclmulqdq xmm3, xmm6, 0x11

	pclmulqdq xmm4, xmm6, 0x0
	pxor xmm3, xmm4
	pxor xmm3, xmm2

_128_done:

	movdqa xmm6, [rk5]
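	; xmm6 holds the pair {rk5, rk6}: the low qword (rk5, via imm8 0x1)
	; is used by the 64b fold below and the high qword (rk6, via imm8
	; 0x10) by the 32b fold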
	movdqa xmm0, xmm3

	;64b fold
	pclmulqdq xmm3, xmm6, 0x1
	pslldq xmm0, 8
	pxor xmm3, xmm0

	;32b fold
	movdqa xmm0, xmm3

	pand xmm0, [mask4]

	psrldq xmm3, 12
	pclmulqdq xmm3, xmm6, 0x10
	pxor xmm3, xmm0

	;barrett reduction
_barrett:
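	; schematically (see the Intel paper linked above), with all
	; multiplies carry-less in GF(2)[x]:
	;	T1  = floor(R / x^32) * rk7	; rk7 = floor(x^64 / P(x))
	;	T2  = floor(T1 / x^32) * rk8	; rk8 = P(x) = 0x104c11db7
	;	crc = (R xor T2) bits [63:32]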
	movdqa xmm6, [rk7]
	movdqa xmm0, xmm3
	pclmulqdq xmm3, xmm6, 0x01
	pslldq xmm3, 4
	pclmulqdq xmm3, xmm6, 0x11

	pslldq xmm3, 4
	pxor xmm3, xmm0
	pextrd eax, xmm3, 1

_cleanup:
	not eax
%ifidn __OUTPUT_FORMAT__, win64
	movdqa xmm6, [rsp + XMM_SAVE + 16*0]
	movdqa xmm7, [rsp + XMM_SAVE + 16*1]
%endif
	add rsp, VARIABLE_OFFSET

	ret



;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
_less_than_128:

	; check if there is enough buffer to be able to fold 16B at a time
	cmp arg3, 32
	jl _less_than_32
	movdqa xmm7, [SHUF_MASK]

	; if there is, load the constants
	movdqa xmm6, [rk1] ;k1

	movd xmm0, arg1_low32
	pslldq xmm0, 12
	movdqu xmm3, [arg2]
	pshufb xmm3, xmm7
	pxor xmm3, xmm0


	; update the buffer pointer
	add arg2, 16

	; update the counter; subtract 32 instead of 16 to save one
	; instruction from the loop
	sub arg3, 32

	jmp _16B_reduction_loop


align 16
_less_than_32:
	mov eax, arg1_low32
	test arg3, arg3
	je _cleanup

	movdqa xmm7, [SHUF_MASK]

	movd xmm0, arg1_low32
	pslldq xmm0, 12

	cmp arg3, 16
	je _exact_16_left
	jl _less_than_16_left
	movd xmm0, arg1_low32
	pslldq xmm0, 12
	movdqu xmm3, [arg2]
	pshufb xmm3, xmm7
	pxor xmm3, xmm0
	add arg2, 16
	sub arg3, 16
	movdqa xmm6, [rk1] ;k1
	jmp _get_last_two_xmms


align 16
_less_than_16_left:
	; use stack space to load data less than 16 bytes;
	; zero out the 16B of stack memory first
	pxor xmm1, xmm1
	mov r11, rsp
	movdqa [r11], xmm1


	cmp arg3, 4
	jl _only_less_than_4
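	; copy the remaining 4..15 bytes onto the zeroed stack slot in
	; descending chunks (8, 4, 2, then 1 bytes); r9 keeps the original
	; byte count for the shift-mask lookup at _zero_left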

	mov r9, arg3


	cmp arg3, 8
	jl _less_than_8_left
	mov rax, [arg2]
	mov [r11], rax
	add r11, 8
	sub arg3, 8
	add arg2, 8
_less_than_8_left:

	cmp arg3, 4
	jl _less_than_4_left
	mov eax, [arg2]
	mov [r11], eax
	add r11, 4
	sub arg3, 4
	add arg2, 4
_less_than_4_left:

	cmp arg3, 2
	jl _less_than_2_left
	mov ax, [arg2]
	mov [r11], ax
	add r11, 2
	sub arg3, 2
	add arg2, 2
_less_than_2_left:
	cmp arg3, 1
	jl _zero_left

	mov al, [arg2]
	mov [r11], al

_zero_left:
	movdqa xmm3, [rsp]
	pshufb xmm3, xmm7
	pxor xmm3, xmm0

	shl r9, 4
	lea rax, [pshufb_shf_table + 15*16]
	sub rax, r9
	movdqu xmm0, [rax]
	pxor xmm0, [mask3]

	pshufb xmm3, xmm0
	jmp _128_done

align 16
_exact_16_left:
	movdqu xmm3, [arg2]
	pshufb xmm3, xmm7
	pxor xmm3, xmm0

	jmp _128_done

_only_less_than_4:
	cmp arg3, 3
	jl _only_less_than_3
	mov al, [arg2]
	mov [r11], al

	mov al, [arg2+1]
	mov [r11+1], al

	mov al, [arg2+2]
	mov [r11+2], al

	movdqa xmm3, [rsp]
	pshufb xmm3, xmm7
	pxor xmm3, xmm0

	psrldq xmm3, 5

	jmp _barrett
_only_less_than_3:
	cmp arg3, 2
	jl _only_less_than_2
	mov al, [arg2]
	mov [r11], al

	mov al, [arg2+1]
	mov [r11+1], al

	movdqa xmm3, [rsp]
	pshufb xmm3, xmm7
	pxor xmm3, xmm0

	psrldq xmm3, 6

	jmp _barrett
_only_less_than_2:
	mov al, [arg2]
	mov [r11], al

	movdqa xmm3, [rsp]
	pshufb xmm3, xmm7
	pxor xmm3, xmm0

	psrldq xmm3, 7

	jmp _barrett
; precomputed constants
section .data
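; rk1-rk6 are folding constants of the form (x^n mod P(x)) << 32 for the
; IEEE polynomial P(x) = 0x104c11db7 (rk3, for example, is referenced
; above as 2^480 mod POLY << 32); rk7 is the Barrett constant
; floor(x^64 / P(x)) and rk8 is P(x) itself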

align 16
rk1:
DQ 0xf200aa6600000000
rk2:
DQ 0x17d3315d00000000
rk3:
DQ 0xd3504ec700000000
rk4:
DQ 0x57a8445500000000
rk5:
DQ 0xf200aa6600000000
rk6:
DQ 0x490d678d00000000
rk7:
DQ 0x0000000104d101df
rk8:
DQ 0x0000000104c11db7
mask:
dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
mask2:
dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
mask3:
dq 0x8080808080808080, 0x8080808080808080
mask4:
dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
align 32
pshufb_shf_table:
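; each 16-byte entry, applied with pshufb, shifts a register left by
; (16-n) bytes: mask bytes with the top bit set (0x80..) make pshufb
; write zero, and xor-ing an entry with mask3 (all 0x80) turns it into
; the complementary right-shift mask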

	dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
	dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
	dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
	dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
	dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
	dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
	dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
	dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
	dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
	dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
	dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
	dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
	dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
	dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
	dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15

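; byte-reflection mask: pshufb with SHUF_MASK reverses the 16 bytes of
; an xmm register (this implementation works on the non-reflected bit
; order, so input data is byte-swapped as it is loaded)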
SHUF_MASK dq 0x08090A0B0C0D0E0F, 0x0001020304050607

;;;       func            core, ver, snum
slversion crc32_ieee_by4, 05, 02, 0017