;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;   * Redistributions of source code must retain the above copyright
;     notice, this list of conditions and the following disclaimer.
;   * Redistributions in binary form must reproduce the above copyright
;     notice, this list of conditions and the following disclaimer in
;     the documentation and/or other materials provided with the
;     distribution.
;   * Neither the name of Intel Corporation nor the names of its
;     contributors may be used to endorse or promote products derived
;     from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Function API:
;       UINT32 crc32_ieee_by4(
;               UINT32 init_crc,          // initial CRC value, 32 bits
;               const unsigned char *buf, // buffer pointer to calculate CRC on
;               UINT64 len                // buffer length in bytes (64-bit data)
;       );
;
; Authors:
;       Erdinc Ozturk
;       Vinodh Gopal
;       James Guilford
;
; Reference paper titled "Fast CRC Computation for Generic Polynomials
; Using PCLMULQDQ Instruction"
; URL: http://download.intel.com/design/intarch/papers/323102.pdf
;
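; Illustrative note (editor's addition, not part of the original header):
; a plain C sketch of the bit-at-a-time CRC this routine accelerates. It
; assumes the standard non-reflected IEEE polynomial 0x04C11DB7, with the
; input/output inversion matching the `not` instructions at function entry
; and exit; the function name is illustrative only:
;
;     #include <stdint.h>
;
;     uint32_t crc32_ieee_ref(uint32_t init_crc, const uint8_t *buf,
;                             uint64_t len)
;     {
;         uint32_t crc = ~init_crc;
;         while (len--) {
;             crc ^= (uint32_t)(*buf++) << 24;
;             for (int i = 0; i < 8; i++)
;                 crc = (crc & 0x80000000) ? (crc << 1) ^ 0x04C11DB7
;                                          : (crc << 1);
;         }
;         return ~crc;
;     }
;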

%include "reg_sizes.asm"

%define fetch_dist 1024
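; fetch_dist is a prefetch-distance tuning value: the prefetchnta
; instructions in the main loop below fetch data this many bytes ahead of
; the current read position.
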
[bits 64]
default rel

section .text

%ifidn __OUTPUT_FORMAT__, win64
        %xdefine        arg1 rcx
        %xdefine        arg2 rdx
        %xdefine        arg3 r8

        %xdefine        arg1_low32 ecx
%else
        %xdefine        arg1 rdi
        %xdefine        arg2 rsi
        %xdefine        arg3 rdx

        %xdefine        arg1_low32 edi
%endif

%ifidn __OUTPUT_FORMAT__, win64
        %define XMM_SAVE 16*2
        %define VARIABLE_OFFSET 16*4+8
%else
        %define VARIABLE_OFFSET 16*2+8
%endif

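; Stack-frame note (editor's addition): the call instruction leaves rsp at
; 8 mod 16, and both VARIABLE_OFFSET values (40 and 72) are also 8 mod 16,
; so after the sub below rsp is 16-byte aligned. That alignment is required
; by the movdqa accesses to the win64 xmm save area and to the 16-byte
; staging slot used for short inputs at _less_than_16_left.
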
align 16
global crc32_ieee_by4:function
crc32_ieee_by4:

        not     arg1_low32

        sub     rsp, VARIABLE_OFFSET

%ifidn __OUTPUT_FORMAT__, win64
        ; save the xmm registers on the stack: xmm6 and xmm7 are
        ; callee-saved in the win64 ABI and are clobbered below
        movdqa  [rsp + XMM_SAVE + 16*0], xmm6
        movdqa  [rsp + XMM_SAVE + 16*1], xmm7
%endif

        ; check if the buffer is smaller than 128 bytes
        cmp     arg3, 128
        jl      _less_than_128

        ; load the initial crc value
        movd    xmm6, arg1_low32        ; initial crc
        ; the crc value does not need to be byte-reflected, but it does
        ; need to be moved to the high part of the register, because the
        ; data will be byte-reflected and will line up with the initial
        ; crc in the correct place
        pslldq  xmm6, 12

        movdqa  xmm7, [SHUF_MASK]
        ; load the initial 64B of data and xor in the initial crc value
        movdqu  xmm0, [arg2]
        movdqu  xmm1, [arg2+16]
        movdqu  xmm2, [arg2+32]
        movdqu  xmm3, [arg2+48]

        pshufb  xmm0, xmm7
        ; xor the initial_crc value
        pxor    xmm0, xmm6
        pshufb  xmm1, xmm7
        pshufb  xmm2, xmm7
        pshufb  xmm3, xmm7

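        ; pshufb mechanics (editor's note): each mask byte selects a source
        ; byte by index, so the 15,14,...,1,0 pattern in SHUF_MASK (defined
        ; at the end of this file) reverses the 16 bytes of each register,
        ; putting the data in the MSB-first order that the non-reflected
        ; polynomial arithmetic expects.
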
        movdqa  xmm6, [rk3]     ; k3 = 2^480 mod POLY << 32
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ; we subtract 128 instead of 64 to save one instruction from the loop
        sub     arg3, 128

        ; at this point there are 64*x+y (0 <= y < 64) bytes of buffer left.
        ; the _fold_64_B_loop will fold 64B at a time until 64+y bytes of
        ; buffer remain

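        ; folding identity (editor's note): pclmulqdq is a 64x64 -> 128 bit
        ; carry-less multiply; immediate 0x11 multiplies the two high qwords
        ; of its operands and 0x00 the two low qwords. Each iteration below
        ; replaces a 128-bit block D with (D_hi x K_hi) xor (D_lo x K_lo),
        ; where K_hi/K_lo (the rk4/rk3 pair here) are precomputed powers of
        ; x modulo the polynomial for the 512-bit folding distance, keeping
        ; the running value congruent to the CRC of all data consumed so far.
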
        ; fold 64B at a time; this section folds 4 xmm registers in parallel
_fold_64_B_loop:

        ; update the buffer pointer
        add     arg2, 64

        prefetchnta [arg2+fetch_dist+0]
        movdqa  xmm4, xmm0
        movdqa  xmm5, xmm1

        pclmulqdq xmm0, xmm6, 0x11
        pclmulqdq xmm1, xmm6, 0x11

        pclmulqdq xmm4, xmm6, 0x0
        pclmulqdq xmm5, xmm6, 0x0

        pxor    xmm0, xmm4
        pxor    xmm1, xmm5

        prefetchnta [arg2+fetch_dist+32]
        movdqa  xmm4, xmm2
        movdqa  xmm5, xmm3

        pclmulqdq xmm2, xmm6, 0x11
        pclmulqdq xmm3, xmm6, 0x11

        pclmulqdq xmm4, xmm6, 0x0
        pclmulqdq xmm5, xmm6, 0x0

        pxor    xmm2, xmm4
        pxor    xmm3, xmm5

        movdqu  xmm4, [arg2]
        movdqu  xmm5, [arg2+16]
        pshufb  xmm4, xmm7
        pshufb  xmm5, xmm7
        pxor    xmm0, xmm4
        pxor    xmm1, xmm5

        movdqu  xmm4, [arg2+32]
        movdqu  xmm5, [arg2+48]
        pshufb  xmm4, xmm7
        pshufb  xmm5, xmm7

        pxor    xmm2, xmm4
        pxor    xmm3, xmm5

        sub     arg3, 64

        ; check if there is another 64B in the buffer to be able to fold
        jge     _fold_64_B_loop
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        add     arg2, 64
        ; at this point arg2 points at the last y bytes of the buffer,
        ; and the 64B of folded data is in the registers xmm0 - xmm3

        movdqa  xmm6, [rk1]     ; k1
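        ; note (editor's addition): this 16-byte load brings in the rk1/rk2
        ; constant pair (rk2 in the high qword of xmm6), so the 0x11/0x00
        ; immediates below select rk2 and rk1 respectively for the 128-bit
        ; folding distance.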

        ; fold the 4 xmm registers into 1 xmm register with different constants
        movdqa  xmm4, xmm0
        pclmulqdq xmm0, xmm6, 0x11
        pclmulqdq xmm4, xmm6, 0x0
        pxor    xmm1, xmm4
        xorps   xmm1, xmm0

        movdqa  xmm4, xmm1
        pclmulqdq xmm1, xmm6, 0x11
        pclmulqdq xmm4, xmm6, 0x0
        pxor    xmm2, xmm4
        xorps   xmm2, xmm1

        movdqa  xmm4, xmm2
        pclmulqdq xmm2, xmm6, 0x11
        pclmulqdq xmm4, xmm6, 0x0
        pxor    xmm3, xmm4
        pxor    xmm3, xmm2

        ; instead of 64, we add 48 to the loop counter to save one
        ; instruction from the loop; instead of a cmp instruction, we rely
        ; on the sign of the result with the jl instruction
        add     arg3, 64-16
        jl      _final_reduction_for_128

        ; now we have 16+y bytes left to reduce; 16 bytes are in register
        ; xmm3 and the rest is in memory. We can fold 16 bytes at a time if
        ; y >= 16, so continue folding 16B at a time

_16B_reduction_loop:
        movdqa  xmm4, xmm3
        pclmulqdq xmm3, xmm6, 0x11
        pclmulqdq xmm4, xmm6, 0x0
        pxor    xmm3, xmm4
        movdqu  xmm0, [arg2]
        pshufb  xmm0, xmm7
        pxor    xmm3, xmm0
        add     arg2, 16
        sub     arg3, 16
        ; instead of a cmp instruction, we utilize the flags with the jge
        ; instruction (equivalent of: cmp arg3, 16-16);
        ; check if there is any more 16B in the buffer to be able to fold
        jge     _16B_reduction_loop

        ; now we have 16+z bytes left to reduce, where 0 <= z < 16.
        ; first, we reduce the data in the xmm3 register

_final_reduction_for_128:
        ; check if any more data to fold; if not, compute the CRC of the
        ; final 128 bits
        add     arg3, 16
        je      _128_done

        ; here the remaining data is less than 16 bytes. Since we know there
        ; was data before the current pointer, we can back the input pointer
        ; up so the load reads exactly 16 bytes, and then adjust the
        ; registers to compensate.
_get_last_two_xmms:
        movdqa  xmm2, xmm3

        movdqu  xmm1, [arg2 - 16 + arg3]
        pshufb  xmm1, xmm7

        shl     arg3, 4
        lea     rax, [pshufb_shf_table + 15*16]
        sub     rax, arg3
        movdqu  xmm0, [rax]

        pshufb  xmm2, xmm0

        pxor    xmm0, [mask3]

        pshufb  xmm3, xmm0

        pblendvb xmm1, xmm2     ; xmm0 is implicit

        movdqa  xmm2, xmm1

        movdqa  xmm4, xmm3
        pclmulqdq xmm3, xmm6, 0x11

        pclmulqdq xmm4, xmm6, 0x0
        pxor    xmm3, xmm4
        pxor    xmm3, xmm2

_128_done:

        movdqa  xmm6, [rk5]
        movdqa  xmm0, xmm3

        ; 64b fold
        pclmulqdq xmm3, xmm6, 0x1
        pslldq  xmm0, 8
        pxor    xmm3, xmm0

        ; 32b fold
        movdqa  xmm0, xmm3

        pand    xmm0, [mask4]

        psrldq  xmm3, 12
        pclmulqdq xmm3, xmm6, 0x10
        pxor    xmm3, xmm0

        ; barrett reduction
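        ; (editor's note) with P = 0x104c11db7 (rk8, the polynomial with its
        ; x^32 term included) and u = 0x104d101df (rk7, floor(x^64 / P)),
        ; the 64-bit remainder R is reduced to the final 32-bit CRC as:
        ;       T1  = floor(R / x^32) * u
        ;       T2  = floor(T1 / x^32) * P
        ;       CRC = (R xor T2) mod x^32
        ; with the divisions replaced by shifts and the multiplies done
        ; carry-lessly by pclmulqdq.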
_barrett:
        movdqa  xmm6, [rk7]
        movdqa  xmm0, xmm3
        pclmulqdq xmm3, xmm6, 0x01
        pslldq  xmm3, 4
        pclmulqdq xmm3, xmm6, 0x11

        pslldq  xmm3, 4
        pxor    xmm3, xmm0
        pextrd  eax, xmm3, 1

_cleanup:
        not     eax
%ifidn __OUTPUT_FORMAT__, win64
        movdqa  xmm6, [rsp + XMM_SAVE + 16*0]
        movdqa  xmm7, [rsp + XMM_SAVE + 16*1]
%endif
        add     rsp, VARIABLE_OFFSET

        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
_less_than_128:

        ; check if there is enough buffer to be able to fold 16B at a time
        cmp     arg3, 32
        jl      _less_than_32
        movdqa  xmm7, [SHUF_MASK]

        ; if there is, load the constants
        movdqa  xmm6, [rk1]     ; k1

        movd    xmm0, arg1_low32
        pslldq  xmm0, 12
        movdqu  xmm3, [arg2]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0

        ; update the buffer pointer
        add     arg2, 16

        ; update the counter: subtract 32 instead of 16 to save one
        ; instruction from the loop
        sub     arg3, 32

        jmp     _16B_reduction_loop

align 16
_less_than_32:
        mov     eax, arg1_low32
        test    arg3, arg3
        je      _cleanup

        movdqa  xmm7, [SHUF_MASK]

        movd    xmm0, arg1_low32
        pslldq  xmm0, 12

        cmp     arg3, 16
        je      _exact_16_left
        jl      _less_than_16_left
        movd    xmm0, arg1_low32
        pslldq  xmm0, 12
        movdqu  xmm3, [arg2]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0
        add     arg2, 16
        sub     arg3, 16
        movdqa  xmm6, [rk1]     ; k1
        jmp     _get_last_two_xmms

align 16
_less_than_16_left:
        ; use stack space to load data less than 16 bytes; zero out the
        ; 16B stack slot first
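        ; staging sketch (editor's note, illustrative C only):
        ;       uint8_t tmp[16] = {0};    /* the zeroed slot at [rsp]       */
        ;       memcpy(tmp, buf, len);    /* done below as 8/4/2/1B moves   */
        ; tmp is then shifted into position and reduced as a full block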

        pxor    xmm1, xmm1
        mov     r11, rsp
        movdqa  [r11], xmm1

        cmp     arg3, 4
        jl      _only_less_than_4

        ; back up the counter value
        mov     r9, arg3

        cmp     arg3, 8
        jl      _less_than_8_left
        ; copy 8 bytes
        mov     rax, [arg2]
        mov     [r11], rax
        add     r11, 8
        sub     arg3, 8
        add     arg2, 8
_less_than_8_left:

        cmp     arg3, 4
        jl      _less_than_4_left
        ; copy 4 bytes
        mov     eax, [arg2]
        mov     [r11], eax
        add     r11, 4
        sub     arg3, 4
        add     arg2, 4
_less_than_4_left:

        cmp     arg3, 2
        jl      _less_than_2_left
        ; copy 2 bytes
        mov     ax, [arg2]
        mov     [r11], ax
        add     r11, 2
        sub     arg3, 2
        add     arg2, 2
_less_than_2_left:
        cmp     arg3, 1
        jl      _zero_left
        ; copy 1 byte
        mov     al, [arg2]
        mov     [r11], al

_zero_left:
        movdqa  xmm3, [rsp]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0

        shl     r9, 4
        lea     rax, [pshufb_shf_table + 15*16]
        sub     rax, r9
        movdqu  xmm0, [rax]
        pxor    xmm0, [mask3]

        pshufb  xmm3, xmm0
        jmp     _128_done

align 16
_exact_16_left:
        movdqu  xmm3, [arg2]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0

        jmp     _128_done

_only_less_than_4:
        cmp     arg3, 3
        jl      _only_less_than_3
        ; copy 3 bytes
        mov     al, [arg2]
        mov     [r11], al

        mov     al, [arg2+1]
        mov     [r11+1], al

        mov     al, [arg2+2]
        mov     [r11+2], al

        movdqa  xmm3, [rsp]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0

        psrldq  xmm3, 5

        jmp     _barrett

_only_less_than_3:
        cmp     arg3, 2
        jl      _only_less_than_2
        ; copy 2 bytes
        mov     al, [arg2]
        mov     [r11], al

        mov     al, [arg2+1]
        mov     [r11+1], al

        movdqa  xmm3, [rsp]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0

        psrldq  xmm3, 6

        jmp     _barrett

_only_less_than_2:
        ; copy 1 byte
        mov     al, [arg2]
        mov     [r11], al

        movdqa  xmm3, [rsp]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0

        psrldq  xmm3, 7

        jmp     _barrett

; precomputed constants
section .data

align 16
rk1: DQ 0xf200aa6600000000
rk2: DQ 0x17d3315d00000000
rk3: DQ 0xd3504ec700000000
rk4: DQ 0x57a8445500000000
rk5: DQ 0xf200aa6600000000
rk6: DQ 0x490d678d00000000
rk7: DQ 0x0000000104d101df
rk8: DQ 0x0000000104c11db7
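
; the rk constants are used as 16-byte pairs: [rk1] loads rk1|rk2,
; [rk3] loads rk3|rk4, [rk5] loads rk5|rk6, and [rk7] loads rk7|rk8,
; with the second constant of each pair in the high qword.
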
mask:  dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
mask2: dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
mask3: dq 0x8080808080808080, 0x8080808080808080
mask4: dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF

align 32
pshufb_shf_table:
        dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1)  / shr 1
        dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2)  / shr 2
        dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3)  / shr 3
        dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4)  / shr 4
        dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5)  / shr 5
        dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6)  / shr 6
        dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7)  / shr 7
        dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8)  / shr 8
        dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9)  / shr 9
        dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr 10
        dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr 11
        dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr 12
        dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr 13
        dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr 14
        dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr 15

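; SHUF_MASK holds byte indices 15..0: pshufb with this mask reverses the
; byte order of a 16-byte register (see the note above the main loop)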
SHUF_MASK dq 0x08090A0B0C0D0E0F, 0x0001020304050607

;;; func            core, ver, snum
slversion crc32_ieee_by4, 05, 02, 0017