1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
31 ; UINT32 crc32_ieee_01(
32 ; UINT32 init_crc, //initial CRC value, 32 bits
33 ; const unsigned char *buf, //buffer pointer to calculate CRC on
34 ; UINT64 len //buffer length in bytes (64-bit data)
42 ; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
43 ; URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
45 %include "reg_sizes.asm"
; Software-prefetch distance (bytes ahead of the current fold position)
; used by the prefetchnta instructions inside the 128B fold loop.
47 %define fetch_dist 1024
; ABI-dependent argument mapping: Microsoft x64 passes arg1 in rcx,
; System V AMD64 passes it in rdi.  arg1_low32 is the 32-bit init_crc.
; NOTE(review): the %else/%endif lines of this conditional are not visible
; in this excerpt of the file.
53 %ifidn __OUTPUT_FORMAT__, win64
58 %xdefine arg1_low32 ecx
64 %xdefine arg1_low32 edi
; Stack-frame size.  win64 reserves extra room (16*8 bytes at XMM_SAVE) to
; preserve xmm6-xmm13, which are callee-saved in the Microsoft x64 ABI;
; System V needs only scratch space.  NOTE(review): the matching
; %else/%endif lines are not visible in this excerpt.
68 %ifidn __OUTPUT_FORMAT__, win64
70 %define VARIABLE_OFFSET 16*10+8
72 %define VARIABLE_OFFSET 16*2+8
75 global crc32_ieee_01:function
;------------------------------------------------------------------------
; UINT32 crc32_ieee_01(UINT32 init_crc, const unsigned char *buf, UINT64 len)
;
; Carry-less-multiply (PCLMULQDQ) folding CRC.  The 0x04c11db7 constant in
; the rk table below identifies the IEEE 802.3 CRC-32 polynomial.  The
; algorithm follows the Intel "Fast CRC Computation for Generic Polynomials
; Using PCLMULQDQ Instruction" white paper cited in the file header:
; fold 128 bytes at a time across 8 xmm registers, reduce 8 -> 1 register,
; fold remaining 16B chunks, then perform a final 128-bit reduction.
;
; NOTE(review): this excerpt is a sampled view of the file -- the function
; entry label, counter updates, several branch targets and %else/%endif
; lines are missing here.  Comments below describe only the visible code.
;------------------------------------------------------------------------
; init_crc is bitwise-complemented on entry (standard CRC pre-conditioning)
78 not arg1_low32 ;~init_crc
; allocate the stack frame (size is ABI-dependent, see VARIABLE_OFFSET)
80 sub rsp,VARIABLE_OFFSET
82 %ifidn __OUTPUT_FORMAT__, win64
83 ; push the xmm registers into the stack to maintain
; xmm6-xmm13 are callee-saved in the Microsoft x64 ABI and are clobbered
; by the fold loops below, so they are spilled here and restored on exit.
84 movdqa [rsp + XMM_SAVE + 16*0], xmm6
85 movdqa [rsp + XMM_SAVE + 16*1], xmm7
86 movdqa [rsp + XMM_SAVE + 16*2], xmm8
87 movdqa [rsp + XMM_SAVE + 16*3], xmm9
88 movdqa [rsp + XMM_SAVE + 16*4], xmm10
89 movdqa [rsp + XMM_SAVE + 16*5], xmm11
90 movdqa [rsp + XMM_SAVE + 16*6], xmm12
91 movdqa [rsp + XMM_SAVE + 16*7], xmm13
95 ; check if smaller than 256
98 ; for sizes less than 256, we can't fold 128B at a time...
102 ; load the initial crc value
103 movd xmm10, arg1_low32 ; initial crc
105 ; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
106 ; because data will be byte-reflected and will align with initial crc at correct place.
; SHUF_MASK (data section below) byte-reflects a 16B lane via pshufb
109 movdqa xmm11, [SHUF_MASK]
110 ; receive the initial 128B data, xor the initial crc value
111 movdqu xmm0, [arg2+16*0]
112 movdqu xmm1, [arg2+16*1]
113 movdqu xmm2, [arg2+16*2]
114 movdqu xmm3, [arg2+16*3]
115 movdqu xmm4, [arg2+16*4]
116 movdqu xmm5, [arg2+16*5]
117 movdqu xmm6, [arg2+16*6]
118 movdqu xmm7, [arg2+16*7]
121 ; XOR the initial_crc value
; rk3/rk4 are the fold-by-128-byte constants (x^(128*8+64) and x^(128*8)
; mod P per the white-paper scheme -- TODO confirm, rk labels not visible)
131 movdqa xmm10, [rk3] ;xmm10 has rk3 and rk4
132 ;imm value of pclmulqdq instruction will determine which constant to use
133 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
134 ; we subtract 256 instead of 128 to save one instruction from the loop
137 ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
138 ; loop will fold 128B at a time until we have 128+y Bytes of buffer
141 ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
144 ; update the buffer pointer
145 add arg2, 128 ; buf += 128;
; non-temporal prefetch: stream the buffer through without polluting cache
147 prefetchnta [arg2+fetch_dist+0]
148 movdqu xmm9, [arg2+16*0]
149 movdqu xmm12, [arg2+16*1]
; each pair folds one xmm: low*rk3 (imm 0x0) xor high*rk4 (imm 0x11),
; then (in lines not visible here) xor with the newly loaded data
154 pclmulqdq xmm0, xmm10, 0x0
155 pclmulqdq xmm8, xmm10 , 0x11
156 pclmulqdq xmm1, xmm10, 0x0
157 pclmulqdq xmm13, xmm10 , 0x11
163 prefetchnta [arg2+fetch_dist+32]
164 movdqu xmm9, [arg2+16*2]
165 movdqu xmm12, [arg2+16*3]
170 pclmulqdq xmm2, xmm10, 0x0
171 pclmulqdq xmm8, xmm10 , 0x11
172 pclmulqdq xmm3, xmm10, 0x0
173 pclmulqdq xmm13, xmm10 , 0x11
179 prefetchnta [arg2+fetch_dist+64]
180 movdqu xmm9, [arg2+16*4]
181 movdqu xmm12, [arg2+16*5]
186 pclmulqdq xmm4, xmm10, 0x0
187 pclmulqdq xmm8, xmm10 , 0x11
188 pclmulqdq xmm5, xmm10, 0x0
189 pclmulqdq xmm13, xmm10 , 0x11
195 prefetchnta [arg2+fetch_dist+96]
196 movdqu xmm9, [arg2+16*6]
197 movdqu xmm12, [arg2+16*7]
202 pclmulqdq xmm6, xmm10, 0x0
203 pclmulqdq xmm8, xmm10 , 0x11
204 pclmulqdq xmm7, xmm10, 0x0
205 pclmulqdq xmm13, xmm10 , 0x11
213 ; check if there is another 128B in the buffer to be able to fold
215 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
219 ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
220 ; the 128 of folded data is in 4 of the xmm registers: xmm0, xmm1, xmm2, xmm3
223 ; fold the 8 xmm registers to 1 xmm register with different constants
; cascade: each step folds one more register into the running xmm7
; accumulator using a distance-specific constant pair (loads not visible)
227 pclmulqdq xmm0, xmm10, 0x11
228 pclmulqdq xmm8, xmm10, 0x0
234 pclmulqdq xmm1, xmm10, 0x11
235 pclmulqdq xmm8, xmm10, 0x0
241 pclmulqdq xmm2, xmm10, 0x11
242 pclmulqdq xmm8, xmm10, 0x0
248 pclmulqdq xmm3, xmm10, 0x11
249 pclmulqdq xmm8, xmm10, 0x0
255 pclmulqdq xmm4, xmm10, 0x11
256 pclmulqdq xmm8, xmm10, 0x0
262 pclmulqdq xmm5, xmm10, 0x11
263 pclmulqdq xmm8, xmm10, 0x0
; rk1/rk2 fold by a single 16B distance; also used by _16B_reduction_loop
267 movdqa xmm10, [rk1] ;xmm10 has rk1 and rk2
268 ;imm value of pclmulqdq instruction will determine which constant to use
270 pclmulqdq xmm6, xmm10, 0x11
271 pclmulqdq xmm8, xmm10, 0x0
276 ; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop
277 ; instead of a cmp instruction, we use the negative flag with the jl instruction
279 jl _final_reduction_for_128
281 ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
282 ; we can fold 16 bytes at a time if y>=16
283 ; continue folding 16B at a time
; fold accumulator one 16B step: high half * rk2, low half * rk1
287 pclmulqdq xmm7, xmm10, 0x11
288 pclmulqdq xmm8, xmm10, 0x0
295 ; instead of a cmp instruction, we utilize the flags with the jge instruction
296 ; equivalent of: cmp arg3, 16-16
297 ; check if there is any more 16B in the buffer to be able to fold
298 jge _16B_reduction_loop
300 ;now we have 16+z bytes left to reduce, where 0<= z < 16.
301 ;first, we reduce the data in the xmm7 register
304 _final_reduction_for_128:
305 ; check if any more data to fold. If not, compute the CRC of the final 128 bits
309 ; here we are getting data that is less than 16 bytes.
310 ; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
311 ; after that the registers need to be adjusted.
; overlapped tail load: last 16 bytes ending exactly at buf+len
315 movdqu xmm1, [arg2 - 16 + arg3]
318 ; get rid of the extra data that was loaded before
319 ; load the shift constant
; pshufb_shf_table+16-arg3 yields the control mask that shifts by arg3
320 lea rax, [pshufb_shf_table + 16]
324 ; shift xmm2 to the left by arg3 bytes
327 ; shift xmm7 to the right by 16-arg3 bytes
; merge the shifted old accumulator with the tail bytes (xmm0 = mask)
330 pblendvb xmm1, xmm2 ;xmm0 is implicit
; fold the merged two registers down to one
335 pclmulqdq xmm7, xmm10, 0x11
336 pclmulqdq xmm8, xmm10, 0x0
341 ; compute crc of a 128-bit value
; rk5/rk6 reduce 128 bits -> 64 bits
342 movdqa xmm10, [rk5] ; rk5 and rk6 in xmm10
346 pclmulqdq xmm7, xmm10, 0x1
356 pclmulqdq xmm7, xmm10, 0x10
; Barrett reduction: rk7 = floor(x^64/P) quotient, rk8 = polynomial P
361 movdqa xmm10, [rk7] ; rk7 and rk8 in xmm10
363 pclmulqdq xmm7, xmm10, 0x01
365 pclmulqdq xmm7, xmm10, 0x11
373 %ifidn __OUTPUT_FORMAT__, win64
; restore the callee-saved xmm registers spilled in the prologue
374 movdqa xmm6, [rsp + XMM_SAVE + 16*0]
375 movdqa xmm7, [rsp + XMM_SAVE + 16*1]
376 movdqa xmm8, [rsp + XMM_SAVE + 16*2]
377 movdqa xmm9, [rsp + XMM_SAVE + 16*3]
378 movdqa xmm10, [rsp + XMM_SAVE + 16*4]
379 movdqa xmm11, [rsp + XMM_SAVE + 16*5]
380 movdqa xmm12, [rsp + XMM_SAVE + 16*6]
381 movdqa xmm13, [rsp + XMM_SAVE + 16*7]
; release the stack frame (matches the prologue sub)
383 add rsp,VARIABLE_OFFSET
387 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
388 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
389 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
390 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; --- small-buffer path: len < 256, fold 16B at a time ---
395 ; check if there is enough buffer to be able to fold 16B at a time
398 movdqa xmm11, [SHUF_MASK]
400 ; if there is, load the constants
401 movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
403 movd xmm0, arg1_low32 ; get the initial crc value
404 pslldq xmm0, 12 ; align it to its correct place
405 movdqu xmm7, [arg2] ; load the plaintext
406 pshufb xmm7, xmm11 ; byte-reflect the plaintext
410 ; update the buffer pointer
413 ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
416 jmp _16B_reduction_loop
; --- very small buffer path (len < 32) ---
421 ; mov initial crc to the return value. this is necessary for zero-length buffers.
426 movdqa xmm11, [SHUF_MASK]
428 movd xmm0, arg1_low32 ; get the initial crc value
429 pslldq xmm0, 12 ; align it to its correct place
433 jl _less_than_16_left
; exactly 16..31 bytes: load one full block and fold/merge the remainder
435 movdqu xmm7, [arg2] ; load the plaintext
436 pshufb xmm7, xmm11 ; byte-reflect the plaintext
437 pxor xmm7, xmm0 ; xor the initial crc value
440 movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
441 jmp _get_last_two_xmms
446 ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
455 ; backup the counter value
; the remaining visible lines are the per-size tail handlers; each merges
; the byte-reflected input with the positioned initial crc value
498 pxor xmm7, xmm0 ; xor the initial crc value
501 lea rax, [pshufb_shf_table + 16]
513 pxor xmm7, xmm0 ; xor the initial crc value
533 pxor xmm7, xmm0 ; xor the initial crc value
551 pxor xmm7, xmm0 ; xor the initial crc value
564 pxor xmm7, xmm0 ; xor the initial crc value
572 ; precomputed constants
; Fold/reduction constants for the PCLMULQDQ CRC scheme (white paper cited
; in the header).  NOTE(review): the rk1:..rk20: labels between these DQ
; lines are not visible in this excerpt; pairing below is presumed from the
; [rk1]/[rk3]/[rk5]/[rk7] loads in the code -- confirm against full file.
576 DQ 0xf200aa6600000000
578 DQ 0x17d3315d00000000
580 DQ 0x022ffca500000000
582 DQ 0x9d9ee22f00000000
584 DQ 0xf200aa6600000000
586 DQ 0x490d678d00000000
; Barrett constants: quotient floor(x^64/P) and the IEEE 802.3 polynomial
; P = 0x104c11db7 (the presence of 0x04c11db7 grounds the "ieee" name)
588 DQ 0x0000000104d101df
590 DQ 0x0000000104c11db7
; remaining constants presumably serve the 8-register fold cascade
; (rk9..rk20) -- TODO confirm, labels not visible here
592 DQ 0x6ac7e7d700000000
594 DQ 0xfcd922af00000000
596 DQ 0x34e45a6300000000
598 DQ 0x8762c1f600000000
600 DQ 0x5395a0ea00000000
602 DQ 0x54f2d5c700000000
604 DQ 0xd3504ec700000000
606 DQ 0x57a8445500000000
608 DQ 0xc053585d00000000
610 DQ 0x766f1b7800000000
612 DQ 0xcd8c54b500000000
614 DQ 0xab40b71e00000000
; mask data below (labels not visible): 0x80-filled pshufb zeroing pattern,
; a low-96-bit keep mask, and the SHUF_MASK byte-reflection control
625 dq 0x8080808080808080, 0x8080808080808080
627 dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
628 dq 0x08090A0B0C0D0E0F, 0x0001020304050607
633 ; use these values for shift constants for the pshufb instruction
634 ; different alignments result in values as shown:
; (loaded via lea rax,[pshufb_shf_table+16]; sub rax,len so the control
; mask read at [rax] straddles the two 16B rows below)
635 ; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
636 ; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
637 ; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
638 ; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
639 ; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
640 ; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
641 ; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
642 ; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
643 ; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
644 ; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
645 ; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
646 ; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
647 ; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
648 ; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
649 ; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
; The 0x00 bytes at offsets 0 and 31 are never read: len here is 1..15, so
; the load window [16-len, 31-len] excludes both extreme bytes.
650 dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
651 dq 0x0706050403020100, 0x000e0d0c0b0a0908
653 ;;; func core, ver, snum
; embeds a version string for this function (slversion macro presumably
; defined in reg_sizes.asm -- not visible in this excerpt)
654 slversion crc32_ieee_01, 01, 06, 0011