1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
31 ; UINT32 crc32_ieee_by4(
32 ; UINT32 init_crc, //initial CRC value, 32 bits
33 ; const unsigned char *buf, //buffer pointer to calculate CRC on
34 ; UINT64 len //buffer length in bytes (64-bit data)
42 ; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
43 ; URL: http://download.intel.com/design/intarch/papers/323102.pdf
; File-level definitions used by crc32_ieee_by4.
; NOTE(review): the %else/%endif lines of the two conditionals below are not
; visible in this view; the pairing described here is inferred from the order
; of the visible lines - confirm against the full file.
46 %include "reg_sizes.asm"
; fetch_dist: byte offset added to arg2 by the prefetchnta hints in the fold loop
48 %define fetch_dist 1024
; arg1_low32: 32-bit register from which the initial crc is read (movd into an
; xmm register). ecx is the definition that directly follows the win64 test;
; edi is the other visible definition.
55 %ifidn __OUTPUT_FORMAT__, win64
60 %xdefine arg1_low32 ecx
66 %xdefine arg1_low32 edi
; VARIABLE_OFFSET: number of bytes the routine reserves on the stack with
; "sub rsp,VARIABLE_OFFSET" and releases with the matching add.
; The first value directly follows the win64 test; presumably it is larger to
; make room for the xmm6/xmm7 save area at XMM_SAVE - TODO confirm.
69 %ifidn __OUTPUT_FORMAT__, win64
71 %define VARIABLE_OFFSET 16*4+8
73 %define VARIABLE_OFFSET 16*2+8
; crc32_ieee_by4(init_crc, buf, len): entry and main folding path.
; Visible structure: reserve stack, (win64) save xmm6/xmm7, load the initial
; crc and the first 64B of input, fold 64B per iteration with pclmulqdq on
; four accumulators (xmm0-xmm3), fold those into xmm3, then fold further
; input 16B at a time.
; NOTE(review): labels and a number of instructions of this routine are not
; visible in this view; comments below describe only the visible lines.
77 mk_global crc32_ieee_by4, function
; reserve VARIABLE_OFFSET bytes of stack; released by the matching add before return
83 sub rsp,VARIABLE_OFFSET
85 %ifidn __OUTPUT_FORMAT__, win64
86 ; win64 only: save xmm6 and xmm7 on the stack (they are callee-saved in the
; Microsoft x64 calling convention) so they can be restored before returning
87 movdqa [rsp + XMM_SAVE + 16*0],xmm6
88 movdqa [rsp + XMM_SAVE + 16*1],xmm7
91 ; check if smaller than 128B
97 ; load the initial crc value
98 movd xmm6, arg1_low32 ; initial crc
99 ; the crc value does not need to be byte-reflected, but it needs to be
100 ; moved to the high part of the register, because the data will be
101 ; byte-reflected and the initial crc must line up with it at the
102 ; correct place.
; xmm7 = SHUF_MASK, the byte-reversing shuffle pattern defined in the data section
107 movdqa xmm7, [SHUF_MASK]
108 ; load the initial 64B of data; the initial crc value is XORed in below
; (the load from [arg2+0] is not visible in this view)
110 movdqu xmm1, [arg2+16]
111 movdqu xmm2, [arg2+32]
112 movdqu xmm3, [arg2+48]
117 ; XOR the initial_crc value
123 movdqa xmm6, [rk3] ; xmm6 = 16 bytes of fold constants starting at rk3; k3=2^480 mod POLY << 32
124 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
125 ; we subtract 128 instead of 64 to save one instruction from the loop
128 ; at this section of the code, there are 64*x+y (0<=y<64) bytes of
129 ; buffer. The _fold_64_B_loop loop will fold 64B at a time until we
130 ; have 64+y Bytes of buffer
133 ; fold 64B at a time. This section of the code folds 4 xmm registers in parallel
136 ; update the buffer pointer
; non-temporal prefetch hint for the data fetch_dist bytes past arg2
139 prefetchnta [arg2+fetch_dist+0]
; pclmulqdq imm8 selects the qwords that are multiplied:
;   0x11 = high qword of destination x high qword of source (xmm6)
;   0x0  = low qword of destination  x low qword of source (xmm6)
; xmm4/xmm5 are scratch registers for the low-half products.
143 pclmulqdq xmm0, xmm6 , 0x11
144 pclmulqdq xmm1, xmm6 , 0x11
146 pclmulqdq xmm4, xmm6, 0x0
147 pclmulqdq xmm5, xmm6, 0x0
; second prefetch hint, 32 bytes further on
152 prefetchnta [arg2+fetch_dist+32]
156 pclmulqdq xmm2, xmm6, 0x11
157 pclmulqdq xmm3, xmm6, 0x11
159 pclmulqdq xmm4, xmm6, 0x0
160 pclmulqdq xmm5, xmm6, 0x0
; input at arg2+16/+32/+48 is loaded (unaligned) into the scratch registers
; (the load from [arg2+0] is not visible in this view)
166 movdqu xmm5, [arg2+16]
172 movdqu xmm4, [arg2+32]
173 movdqu xmm5, [arg2+48]
182 ; check if there is another 64B in the buffer to be able to fold
184 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
188 ; at this point, arg2 is pointing at the last y Bytes of the buffer
189 ; the 64B of data is in 4 of the xmm registers: xmm0, xmm1, xmm2, xmm3
192 movdqa xmm6, [rk1] ; xmm6 = 16 bytes of fold constants starting at rk1 (k1)
194 ; fold the 4 xmm registers to 1 xmm register with different constants
; NOTE(review): in the visible lines xmm6 is loaded only once (from rk1) for
; the three folds below - confirm whether "different constants" still applies.
196 pclmulqdq xmm0, xmm6, 0x11
197 pclmulqdq xmm4, xmm6, 0x0
202 pclmulqdq xmm1, xmm6, 0x11
203 pclmulqdq xmm4, xmm6, 0x0
208 pclmulqdq xmm2, xmm6, 0x11
209 pclmulqdq xmm4, xmm6, 0x0
214 ; instead of 64, we add 48 to the loop counter to save 1 instruction from the loop
215 ; instead of a cmp instruction, we branch on the flags left by that add:
; jl is taken when the signed result is negative (SF != OF)
217 jl _final_reduction_for_128
219 ; now we have 16+y bytes left to reduce. 16 Bytes are in register xmm3 and the rest is in memory
220 ; we can fold 16 bytes at a time if y>=16
221 ; continue folding 16B at a time
225 pclmulqdq xmm3, xmm6, 0x11
226 pclmulqdq xmm4, xmm6, 0x0
233 ; instead of a cmp instruction, we utilize the flags with the jge instruction
234 ; equivalent of: cmp arg3, 16-16
235 ; check if there is any more 16B in the buffer to be able to fold
236 jge _16B_reduction_loop
238 ; now we have 16+z bytes left to reduce, where 0<= z < 16.
239 ; first, we reduce the data in the xmm3 register
243 _final_reduction_for_128:
; Handles the tail of the buffer (fewer than 16 remaining bytes), performs the
; final pclmulqdq reduction steps on xmm3, then restores saved state.
; NOTE(review): several instructions of this section are not visible in this
; view; comments below describe only the visible lines.
244 ; check if any more data to fold. If not, compute the CRC of the final 128 bits
248 ; here we are getting data that is less than 16 bytes.
249 ; since we know that there was data before the pointer, we can offset
250 ; the input pointer before the actual point, to receive exactly 16 bytes.
251 ; after that the registers need to be adjusted.
; load the 16 bytes that end exactly at arg2+arg3 (the end of the buffer)
255 movdqu xmm1, [arg2 - 16 + arg3]
; rax = pshufb_shf_table + 240 (15 entries of 16 bytes past the table start);
; presumably adjusted by the remaining length to select a shift pattern - the
; instructions that use rax are not visible in this view
259 lea rax, [pshufb_shf_table + 15*16]
; byte-wise blend of xmm2 into xmm1, controlled by the top bit of each byte of xmm0
269 pblendvb xmm1, xmm2 ;xmm0 is implicit
; fold xmm3 once more: 0x11 = high x high qwords, 0x0 = low x low qwords
274 pclmulqdq xmm3, xmm6, 0x11
276 pclmulqdq xmm4, xmm6, 0x0
; final reduction multiplies; imm8 selects the operand halves:
;   0x1 / 0x01 = high qword of xmm3 x low qword of xmm6
;   0x10       = low qword of xmm3  x high qword of xmm6
;   0x11       = high qword of xmm3 x high qword of xmm6
; (the constant loads and shifts between these steps are not visible in this view)
286 pclmulqdq xmm3, xmm6, 0x1
296 pclmulqdq xmm3, xmm6, 0x10
303 pclmulqdq xmm3, xmm6, 0x01
305 pclmulqdq xmm3, xmm6, 0x11
313 %ifidn __OUTPUT_FORMAT__, win64
; win64 only: restore xmm6/xmm7 from the slots they were saved to on entry
314 movdqa xmm6, [rsp + XMM_SAVE + 16*0]
315 movdqa xmm7, [rsp + XMM_SAVE + 16*1]
; release the stack space reserved on entry
317 add rsp,VARIABLE_OFFSET
328 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
329 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
330 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
331 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Short-buffer paths: these set up xmm7 (SHUF_MASK), xmm6 (constants at rk1)
; and xmm0 (initial crc) and then join the main path at _16B_reduction_loop or
; _get_last_two_xmms, or handle fewer than 16 bytes separately.
; NOTE(review): the labels and most instructions of these paths are not
; visible in this view; comments below describe only the visible lines.
336 ; check if there is enough buffer to be able to fold 16B at a time
339 movdqa xmm7, [SHUF_MASK]
341 ; if there is, load the constants
342 movdqa xmm6, [rk1] ;k1
; xmm0 = initial crc (low 32 bits of the first argument)
344 movd xmm0, arg1_low32
351 ; update the buffer pointer
354 ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
357 jmp _16B_reduction_loop
366 movdqa xmm7, [SHUF_MASK]
; xmm0 = initial crc (low 32 bits of the first argument)
368 movd xmm0, arg1_low32
; fewer than 16 bytes remain: handled separately (the compare that sets the
; flags is not visible in this view)
373 jl _less_than_16_left
; NOTE(review): xmm0 was already loaded from arg1_low32 a few instructions
; earlier; this reload looks redundant unless an instruction not visible here
; depends on it - confirm before removing.
374 movd xmm0, arg1_low32
381 movdqa xmm6, [rk1] ;k1
382 jmp _get_last_two_xmms
387 ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
; rax = pshufb_shf_table + 240 (15 entries of 16 bytes past the table start);
; the instructions that use rax are not visible in this view
439 lea rax, [pshufb_shf_table + 15*16]
501 ; precomputed constants
; NOTE(review): the labels attached to the values below are not visible in
; this view. The code loads 16 bytes at a time from rk1 and rk3, so the
; constants are consumed as adjacent qword pairs - confirm the label layout
; against the full file.
506 DQ 0xf200aa6600000000
508 DQ 0x17d3315d00000000
510 DQ 0xd3504ec700000000
512 DQ 0x57a8445500000000
514 DQ 0xf200aa6600000000
516 DQ 0x490d678d00000000
518 DQ 0x0000000104d101df
; 0x104c11db7 matches the CRC-32 IEEE 802.3 generator polynomial (0x04C11DB7)
; with the x^32 term included
520 DQ 0x0000000104c11db7
; 128-bit byte masks (labels not visible in this view)
522 dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
524 dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
526 dq 0x8080808080808080, 0x8080808080808080
528 dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
; pshufb control patterns, one 16-byte entry per shift amount. A control byte
; with its top bit set (0x8x) makes pshufb write zero to that position; the
; other bytes are source byte indices. Each entry is the previous one moved
; by one byte. The trailing comments give the byte shift each entry encodes.
532 dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
534 dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
536 dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
538 dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
540 dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
542 dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
544 dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
546 dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
548 dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
550 dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
552 dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
554 dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
556 dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
558 dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
560 dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
; SHUF_MASK: in memory the bytes are 0x0F,0x0E,...,0x00, so as a pshufb control
; it reverses the byte order of a 16-byte register (byte i takes source byte 15-i).
; The code loads it into xmm7 with movdqa, which requires 16-byte alignment.
563 SHUF_MASK dq 0x08090A0B0C0D0E0F, 0x0001020304050607
565 ;;; func core, ver, snum
; slversion is a macro defined outside this view (presumably in reg_sizes.asm -
; confirm); its arguments here follow the order named above:
; func=crc32_ieee_by4, core=05, ver=02, snum=0017.
566 slversion crc32_ieee_by4, 05, 02, 0017