1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
31 ; UINT32 crc32_ieee_01(
32 ; UINT32 init_crc, //initial CRC value, 32 bits
33 ; const unsigned char *buf, //buffer pointer to calculate CRC on
34 ; UINT64 len //buffer length in bytes (64-bit data)
42 ; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
43 ; URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
45 %include "reg_sizes.asm"
; arg1_low32 names the 32-bit register that carries init_crc (it is inverted with
; "not arg1_low32" at function entry): ecx when assembling for win64, edi otherwise.
52 %ifidn __OUTPUT_FORMAT__, win64
57 %xdefine arg1_low32 ecx
63 %xdefine arg1_low32 edi
; VARIABLE_OFFSET is the number of bytes the function subtracts from rsp on entry.
; The win64 value is larger; presumably the extra 16*8 bytes hold the xmm6-xmm13
; save area addressed through XMM_SAVE -- confirm against the XMM_SAVE definition.
67 %ifidn __OUTPUT_FORMAT__, win64
69 %define VARIABLE_OFFSET 16*10+8
71 %define VARIABLE_OFFSET 16*2+8
;-----------------------------------------------------------------------
; crc32_ieee_01: CRC32 over a byte buffer, computed by folding the data
; with pclmulqdq and the precomputed rk constants.
; In:    arg1_low32 = init_crc (inverted on entry), arg2 = buffer pointer,
;        arg3 = byte counter (presumably len from the prototype -- confirm)
; Stack: VARIABLE_OFFSET bytes are reserved; on win64 xmm6-xmm13 are saved
;        there on entry and restored before the stack is released.
;-----------------------------------------------------------------------
74 global crc32_ieee_01:function
77 not arg1_low32 ;~init_crc
79 sub rsp,VARIABLE_OFFSET
81 %ifidn __OUTPUT_FORMAT__, win64
82 ; save xmm6-xmm13 on the stack: they are callee-saved under the win64 ABI and are overwritten below
83 movdqa [rsp + XMM_SAVE + 16*0], xmm6
84 movdqa [rsp + XMM_SAVE + 16*1], xmm7
85 movdqa [rsp + XMM_SAVE + 16*2], xmm8
86 movdqa [rsp + XMM_SAVE + 16*3], xmm9
87 movdqa [rsp + XMM_SAVE + 16*4], xmm10
88 movdqa [rsp + XMM_SAVE + 16*5], xmm11
89 movdqa [rsp + XMM_SAVE + 16*6], xmm12
90 movdqa [rsp + XMM_SAVE + 16*7], xmm13
94 ; check if smaller than 256
97 ; for sizes less than 256, we can't fold 128B at a time...
101 ; load the initial crc value
102 movd xmm10, arg1_low32 ; initial crc
104 ; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register,
105 ; because the data will be byte-reflected and will then align with the initial crc at the correct place.
108 movdqa xmm11, [SHUF_MASK]
109 ; receive the initial 128B data, xor the initial crc value
110 movdqu xmm0, [arg2+16*0]
111 movdqu xmm1, [arg2+16*1]
112 movdqu xmm2, [arg2+16*2]
113 movdqu xmm3, [arg2+16*3]
114 movdqu xmm4, [arg2+16*4]
115 movdqu xmm5, [arg2+16*5]
116 movdqu xmm6, [arg2+16*6]
117 movdqu xmm7, [arg2+16*7]
120 ; XOR the initial_crc value
130 movdqa xmm10, [rk3] ;xmm10 has rk3 and rk4
131 ;imm value of pclmulqdq instruction will determine which constant to use
132 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
133 ; we subtract 256 instead of 128 to save one instruction from the loop
136 ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
137 ; loop will fold 128B at a time until we have 128+y Bytes of buffer
140 ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
143 ; update the buffer pointer
144 add arg2, 128 ; buf += 128;
; each group below loads the next two 16B blocks (xmm9, xmm12) and multiplies
; two accumulators by the low (imm 0x0) and high (imm 0x11) qword of xmm10
146 movdqu xmm9, [arg2+16*0]
147 movdqu xmm12, [arg2+16*1]
152 pclmulqdq xmm0, xmm10, 0x0
153 pclmulqdq xmm8, xmm10 , 0x11
154 pclmulqdq xmm1, xmm10, 0x0
155 pclmulqdq xmm13, xmm10 , 0x11
161 movdqu xmm9, [arg2+16*2]
162 movdqu xmm12, [arg2+16*3]
167 pclmulqdq xmm2, xmm10, 0x0
168 pclmulqdq xmm8, xmm10 , 0x11
169 pclmulqdq xmm3, xmm10, 0x0
170 pclmulqdq xmm13, xmm10 , 0x11
176 movdqu xmm9, [arg2+16*4]
177 movdqu xmm12, [arg2+16*5]
182 pclmulqdq xmm4, xmm10, 0x0
183 pclmulqdq xmm8, xmm10 , 0x11
184 pclmulqdq xmm5, xmm10, 0x0
185 pclmulqdq xmm13, xmm10 , 0x11
191 movdqu xmm9, [arg2+16*6]
192 movdqu xmm12, [arg2+16*7]
197 pclmulqdq xmm6, xmm10, 0x0
198 pclmulqdq xmm8, xmm10 , 0x11
199 pclmulqdq xmm7, xmm10, 0x0
200 pclmulqdq xmm13, xmm10 , 0x11
208 ; check if there is another 128B in the buffer to be able to fold
210 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
214 ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
215 ; the 128B of folded data is in 8 of the xmm registers: xmm0 - xmm7
218 ; fold the 8 xmm registers to 1 xmm register with different constants
222 pclmulqdq xmm0, xmm10, 0x11
223 pclmulqdq xmm8, xmm10, 0x0
229 pclmulqdq xmm1, xmm10, 0x11
230 pclmulqdq xmm8, xmm10, 0x0
236 pclmulqdq xmm2, xmm10, 0x11
237 pclmulqdq xmm8, xmm10, 0x0
243 pclmulqdq xmm3, xmm10, 0x11
244 pclmulqdq xmm8, xmm10, 0x0
250 pclmulqdq xmm4, xmm10, 0x11
251 pclmulqdq xmm8, xmm10, 0x0
257 pclmulqdq xmm5, xmm10, 0x11
258 pclmulqdq xmm8, xmm10, 0x0
262 movdqa xmm10, [rk1] ;xmm10 has rk1 and rk2
263 ;imm value of pclmulqdq instruction will determine which constant to use
265 pclmulqdq xmm6, xmm10, 0x11
266 pclmulqdq xmm8, xmm10, 0x0
271 ; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop
272 ; instead of a cmp instruction, we use the negative flag with the jl instruction
274 jl _final_reduction_for_128
276 ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
277 ; we can fold 16 bytes at a time if y>=16
278 ; continue folding 16B at a time
282 pclmulqdq xmm7, xmm10, 0x11
283 pclmulqdq xmm8, xmm10, 0x0
290 ; instead of a cmp instruction, we utilize the flags with the jge instruction
291 ; equivalent of: cmp arg3, 16-16
292 ; check if there is any more 16B in the buffer to be able to fold
293 jge _16B_reduction_loop
295 ;now we have 16+z bytes left to reduce, where 0<= z < 16.
296 ;first, we reduce the data in the xmm7 register
299 _final_reduction_for_128:
300 ; check if any more data to fold. If not, compute the CRC of the final 128 bits
304 ; here we are getting data that is less than 16 bytes.
305 ; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
306 ; after that the registers need to be adjusted.
310 movdqu xmm1, [arg2 - 16 + arg3]
313 ; get rid of the extra data that was loaded before
314 ; load the shift constant
315 lea rax, [pshufb_shf_table + 16]
319 ; shift xmm2 to the left by arg3 bytes
322 ; shift xmm7 to the right by 16-arg3 bytes
325 pblendvb xmm1, xmm2 ;xmm0 is implicit
330 pclmulqdq xmm7, xmm10, 0x11
331 pclmulqdq xmm8, xmm10, 0x0
336 ; compute crc of a 128-bit value
337 movdqa xmm10, [rk5] ; rk5 and rk6 in xmm10
341 pclmulqdq xmm7, xmm10, 0x1
351 pclmulqdq xmm7, xmm10, 0x10
356 movdqa xmm10, [rk7] ; rk7 and rk8 in xmm10
358 pclmulqdq xmm7, xmm10, 0x01
360 pclmulqdq xmm7, xmm10, 0x11
; win64 only: restore the xmm6-xmm13 values that were saved on entry
368 %ifidn __OUTPUT_FORMAT__, win64
369 movdqa xmm6, [rsp + XMM_SAVE + 16*0]
370 movdqa xmm7, [rsp + XMM_SAVE + 16*1]
371 movdqa xmm8, [rsp + XMM_SAVE + 16*2]
372 movdqa xmm9, [rsp + XMM_SAVE + 16*3]
373 movdqa xmm10, [rsp + XMM_SAVE + 16*4]
374 movdqa xmm11, [rsp + XMM_SAVE + 16*5]
375 movdqa xmm12, [rsp + XMM_SAVE + 16*6]
376 movdqa xmm13, [rsp + XMM_SAVE + 16*7]
; release the stack space reserved on entry
378 add rsp,VARIABLE_OFFSET
382 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
383 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
384 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
385 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
390 ; check if there is enough buffer to be able to fold 16B at a time
393 movdqa xmm11, [SHUF_MASK]
395 ; if there is, load the constants
396 movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
398 movd xmm0, arg1_low32 ; get the initial crc value
399 pslldq xmm0, 12 ; align it to its correct place
400 movdqu xmm7, [arg2] ; load the plaintext
401 pshufb xmm7, xmm11 ; byte-reflect the plaintext
405 ; update the buffer pointer
408 ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
411 jmp _16B_reduction_loop
416 ; mov initial crc to the return value. this is necessary for zero-length buffers.
421 movdqa xmm11, [SHUF_MASK]
423 movd xmm0, arg1_low32 ; get the initial crc value
424 pslldq xmm0, 12 ; align it to its correct place
428 jl _less_than_16_left
430 movdqu xmm7, [arg2] ; load the plaintext
431 pshufb xmm7, xmm11 ; byte-reflect the plaintext
432 pxor xmm7, xmm0 ; xor the initial crc value
435 movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
436 jmp _get_last_two_xmms
441 ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
450 ; backup the counter value
493 pxor xmm7, xmm0 ; xor the initial crc value
496 lea rax, [pshufb_shf_table + 16]
508 pxor xmm7, xmm0 ; xor the initial crc value
528 pxor xmm7, xmm0 ; xor the initial crc value
546 pxor xmm7, xmm0 ; xor the initial crc value
559 pxor xmm7, xmm0 ; xor the initial crc value
567 ; precomputed constants
571 DQ 0xf200aa6600000000
573 DQ 0x17d3315d00000000
575 DQ 0x022ffca500000000
577 DQ 0x9d9ee22f00000000
579 DQ 0xf200aa6600000000
581 DQ 0x490d678d00000000
583 DQ 0x0000000104d101df
585 DQ 0x0000000104c11db7
587 DQ 0x6ac7e7d700000000
589 DQ 0xfcd922af00000000
591 DQ 0x34e45a6300000000
593 DQ 0x8762c1f600000000
595 DQ 0x5395a0ea00000000
597 DQ 0x54f2d5c700000000
599 DQ 0xd3504ec700000000
601 DQ 0x57a8445500000000
603 DQ 0xc053585d00000000
605 DQ 0x766f1b7800000000
607 DQ 0xcd8c54b500000000
609 DQ 0xab40b71e00000000
620 dq 0x8080808080808080, 0x8080808080808080
622 dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
; byte indices 15 down to 0: as a pshufb control this reverses the 16 bytes of a register
625 dq 0x08090A0B0C0D0E0F, 0x0001020304050607
628 ; use these values for shift constants for the pshufb instruction
629 ; different alignments result in values as shown:
630 ; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
631 ; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
632 ; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
633 ; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
634 ; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
635 ; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
636 ; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
637 ; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
638 ; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
639 ; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
640 ; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
641 ; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
642 ; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
643 ; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
644 ; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
645 dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
646 dq 0x0706050403020100, 0x000e0d0c0b0a0908
648 ;;; func core, ver, snum
649 slversion crc32_ieee_01, 01, 06, 0011