1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
30 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
32 ; UINT32 crc32_ieee_by16_10(
33 ; UINT32 init_crc, //initial CRC value, 32 bits
34 ; const unsigned char *buf, //buffer pointer to calculate CRC on
35 ; UINT64 len //buffer length in bytes (64-bit data)
43 ; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
44 ; URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
48 %include "reg_sizes.asm"
; Build the IEEE (non-reflected, polynomial 0x04C11DB7 — see rk8 below) CRC32
; variant of the generic by16_10 VPCLMULQDQ/AVX-512 folding kernel.
51 %define FUNCTION_NAME crc32_ieee_by16_10
; Assembler must support AVX-512 + VPCLMULQDQ encodings (feature level 10);
; otherwise the %else branch at the bottom of the file emits a stub symbol.
54 %if (AS_FEATURE_LEVEL) >= 10
; arg1 (init_crc, 32 bits) arrives in ecx on Win64, edi on SysV.
; NOTE(review): the %else/%endif lines between the two %xdefines are elided
; in this excerpt — each definition belongs to one ABI branch.
62 %ifidn __OUTPUT_FORMAT__, win64
67 %xdefine arg1_low32 ecx
73 %xdefine arg1_low32 edi
; Stack frame size: Win64 must preserve xmm6-xmm15, so it reserves ten extra
; 16B save slots (16*12+8) vs. SysV's minimal frame (16*2+8). The +8 keeps
; rsp 16-byte aligned given the return address already on the stack.
77 %ifidn __OUTPUT_FORMAT__, win64
79 %define VARIABLE_OFFSET 16*12+8
81 %define VARIABLE_OFFSET 16*2+8
; Export the function symbol (mk_global handles per-platform name decoration).
85 mk_global FUNCTION_NAME, function
; --- prologue: carve the stack frame sized above -------------------------
90 sub rsp, VARIABLE_OFFSET
92 %ifidn __OUTPUT_FORMAT__, win64
93 ; push the xmm registers into the stack to maintain the Win64 ABI contract
;   (xmm6-xmm15 are callee-saved on Windows; restored in the epilogue)
94 vmovdqa [rsp + XMM_SAVE + 16*0], xmm6
95 vmovdqa [rsp + XMM_SAVE + 16*1], xmm7
96 vmovdqa [rsp + XMM_SAVE + 16*2], xmm8
97 vmovdqa [rsp + XMM_SAVE + 16*3], xmm9
98 vmovdqa [rsp + XMM_SAVE + 16*4], xmm10
99 vmovdqa [rsp + XMM_SAVE + 16*5], xmm11
100 vmovdqa [rsp + XMM_SAVE + 16*6], xmm12
101 vmovdqa [rsp + XMM_SAVE + 16*7], xmm13
102 vmovdqa [rsp + XMM_SAVE + 16*8], xmm14
103 vmovdqa [rsp + XMM_SAVE + 16*9], xmm15
; zmm18 = byte-reflection shuffle mask, replicated to all four 128b lanes;
; every 64B load below is passed through vpshufb zmmX, zmm18.
106 vbroadcasti32x4 zmm18, [SHUF_MASK]
110 ; load the initial crc value
111 vmovd xmm10, arg1_low32 ; initial crc
113 ; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
114 ; because data will be byte-reflected and will align with initial crc at correct place.
; NOTE(review): the vpslldq/xor of the initial crc into zmm0 is elided in
; this excerpt, as is the length check that routes short buffers elsewhere.
117 ; receive the initial 64B data, xor the initial crc value
118 vmovdqu8 zmm0, [arg2+16*0]
119 vmovdqu8 zmm4, [arg2+16*4]
120 vpshufb zmm0, zmm0, zmm18
121 vpshufb zmm4, zmm4, zmm18
123 vbroadcasti32x4 zmm10, [rk3] ;xmm10 has rk3 and rk4
124 ;imm value of pclmulqdq instruction will determine which constant to use
; zmm0/zmm4/zmm7/zmm8 now hold the first 256B of (reflected) input.
130 vmovdqu8 zmm7, [arg2+16*8]
131 vmovdqu8 zmm8, [arg2+16*12]
132 vpshufb zmm7, zmm7, zmm18
133 vpshufb zmm8, zmm8, zmm18
134 vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
; --- fold 512B -> 256B using rk_1/rk_2 -----------------------------------
; Each 64B lane group is multiplied by the distance constants and xored with
; the next 256B of input.  NOTE(review): the pointer/counter advance between
; the load groups above and these loads is elided in this excerpt — arg2 has
; presumably been advanced by 256 before these re-reads of [arg2+16*0..12].
139 vmovdqu8 zmm3, [arg2+16*0]
140 vpshufb zmm3, zmm3, zmm18
141 vpclmulqdq zmm1, zmm0, zmm16, 0x00
142 vpclmulqdq zmm2, zmm0, zmm16, 0x11
143 vpxorq zmm0, zmm1, zmm2
144 vpxorq zmm0, zmm0, zmm3
146 vmovdqu8 zmm9, [arg2+16*4]
147 vpshufb zmm9, zmm9, zmm18
148 vpclmulqdq zmm5, zmm4, zmm16, 0x00
149 vpclmulqdq zmm6, zmm4, zmm16, 0x11
150 vpxorq zmm4, zmm5, zmm6
151 vpxorq zmm4, zmm4, zmm9
153 vmovdqu8 zmm11, [arg2+16*8]
154 vpshufb zmm11, zmm11, zmm18
155 vpclmulqdq zmm12, zmm7, zmm16, 0x00
156 vpclmulqdq zmm13, zmm7, zmm16, 0x11
157 vpxorq zmm7, zmm12, zmm13
158 vpxorq zmm7, zmm7, zmm11
160 vmovdqu8 zmm17, [arg2+16*12]
161 vpshufb zmm17, zmm17, zmm18
162 vpclmulqdq zmm14, zmm8, zmm16, 0x00
163 vpclmulqdq zmm15, zmm8, zmm16, 0x11
164 vpxorq zmm8, zmm14, zmm15
165 vpxorq zmm8, zmm8, zmm17
; --- fold 256B -> 128B using rk3/rk4 (zmm10) -----------------------------
; vpternlogq imm 0x96 is a three-way xor: dst = dst ^ src1 ^ src2.
172 vpclmulqdq zmm1, zmm0, zmm10, 0x00
173 vpclmulqdq zmm2, zmm0, zmm10, 0x11
174 vpternlogq zmm7, zmm1, zmm2, 0x96 ; xor ABC
176 vpclmulqdq zmm5, zmm4, zmm10, 0x00
177 vpclmulqdq zmm6, zmm4, zmm10, 0x11
178 vpternlogq zmm8, zmm5, zmm6, 0x96 ; xor ABC
; Working state is now 128B in zmm7/zmm8; skip the loop's reload and go
; straight to the 8-way collapse.  NOTE(review): intervening register
; shuffling / counter updates are elided in this excerpt.
184 jmp .fold_128_B_register
188 ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The fold_128_B_loop
189 ; loop will fold 128B at a time until we have 128+y Bytes of buffer
191 ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
; NOTE(review): the .fold_128_B_loop label, the arg2 advance, and the
; counter decrement/branch that close this loop are elided in this excerpt.
; Loop body: fold the running 128B state (zmm0, zmm4) forward by 128 bytes
; using rk3/rk4 (in zmm10) and xor in the next 128B of reflected input.
194 vmovdqu8 zmm8, [arg2+16*0]
195 vpshufb zmm8, zmm8, zmm18
196 vpclmulqdq zmm2, zmm0, zmm10, 0x00
197 vpclmulqdq zmm1, zmm0, zmm10, 0x11
198 vpxorq zmm0, zmm2, zmm1
199 vpxorq zmm0, zmm0, zmm8
201 vmovdqu8 zmm9, [arg2+16*4]
202 vpshufb zmm9, zmm9, zmm18
203 vpclmulqdq zmm5, zmm4, zmm10, 0x00
204 vpclmulqdq zmm6, zmm4, zmm10, 0x11
205 vpxorq zmm4, zmm5, zmm6
206 vpxorq zmm4, zmm4, zmm9
210 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
213 ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
214 ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
216 .fold_128_B_register:
217 ; fold the 8 128b parts into 1 xmm register with different constants
; Each 128b lane needs a different distance constant, so full-width 64B
; constant vectors (rk9..rk16 and rk17..rk20,rk1,rk2,0,0) are loaded here
; instead of broadcasts.
218 vmovdqu8 zmm16, [rk9] ; multiply by rk9-rk16
219 vmovdqu8 zmm11, [rk17] ; multiply by rk17-rk20, rk1,rk2, 0,0
220 vpclmulqdq zmm1, zmm0, zmm16, 0x00
221 vpclmulqdq zmm2, zmm0, zmm16, 0x11
222 vextracti64x2 xmm7, zmm4, 3 ; save last that has no multiplicand
224 vpclmulqdq zmm5, zmm4, zmm11, 0x00
225 vpclmulqdq zmm6, zmm4, zmm11, 0x11
226 vmovdqa xmm10, [rk1] ; Needed later in reduction loop
227 vpternlogq zmm1, zmm2, zmm5, 0x96 ; xor ABC
228 vpternlogq zmm1, zmm6, zmm7, 0x96 ; xor ABC
; Horizontal xor of zmm1's four 128b lanes down to a single xmm:
; swap-halves + xor (512->256), then extract + xor (256->128) into xmm7.
230 vshufi64x2 zmm8, zmm1, zmm1, 0x4e ; Swap 1,0,3,2 - 01 00 11 10
231 vpxorq ymm8, ymm8, ymm1
232 vextracti64x2 xmm5, ymm8, 1
233 vpxorq xmm7, xmm5, xmm8
235 ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
236 ; instead of a cmp instruction, we use the negative flag with the jl instruction
; NOTE(review): the `add arg3, 128-16` that sets these flags is elided in
; this excerpt.
238 jl .final_reduction_for_128
240 ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
241 ; we can fold 16 bytes at a time if y>=16
242 ; continue folding 16B at a time
; 16B reduction loop body: fold xmm7 forward by 16 bytes with rk1/rk2
; (xmm10) and xor in the next reflected 16B of input.
; NOTE(review): the .16B_reduction_loop label, the xmm0 load from [arg2],
; the xor into xmm7, and the arg2/arg3 updates are elided in this excerpt.
245 vpclmulqdq xmm8, xmm7, xmm10, 0x11
246 vpclmulqdq xmm7, xmm7, xmm10, 0x00
249 vpshufb xmm0, xmm0, xmm18
253 ; instead of a cmp instruction, we utilize the flags with the jge instruction
254 ; equivalent of: cmp arg3, 16-16
255 ; check if there is any more 16B in the buffer to be able to fold
256 jge .16B_reduction_loop
258 ;now we have 16+z bytes left to reduce, where 0<= z < 16.
259 ;first, we reduce the data in the xmm7 register
262 .final_reduction_for_128:
266 ; here we are getting data that is less than 16 bytes.
267 ; since we know that there was data before the pointer, we can offset
268 ; the input pointer before the actual point, to receive exactly 16 bytes.
269 ; after that the registers need to be adjusted.
; Overlapping load: the last 16B of the buffer (may re-read bytes already
; folded; the pshufb_shf_table shift below discards the duplicates).
273 vmovdqu xmm1, [arg2 - 16 + arg3]
276 ; get rid of the extra data that was loaded before
277 ; load the shift constant
; rax = &pshufb_shf_table[16]; the remaining-byte count (arg3) indexes
; backwards into the table to select the shl/shr shuffle pair.
278 lea rax, [pshufb_shf_table + 16]
; Blend old folded bytes with the fresh tail under the mask built above,
; then fold the combined value once more with rk1/rk2.
285 vpblendvb xmm1, xmm1, xmm2, xmm0
287 vpclmulqdq xmm8, xmm7, xmm10, 0x11
288 vpclmulqdq xmm7, xmm7, xmm10, 0x00
293 ; compute crc of a 128-bit value
; 128 -> 64 -> 32 bit reduction using rk5/rk6, then Barrett reduction with
; rk7 (floor(x^64/P)) and rk8 (the polynomial) to produce the final CRC.
; NOTE(review): the rk5 load, the vpslldq/vpsrldq shifts and xors between
; these multiplies, and the final vpextrd/return are elided in this excerpt.
; These vpclmulqdq use the NASM 3-operand form: dst is also the first source.
298 vpclmulqdq xmm7, xmm10, 0x01 ; H*L
306 vpclmulqdq xmm7, xmm10, 0x10
311 vmovdqa xmm10, [rk7] ; rk7 and rk8 in xmm10
313 vpclmulqdq xmm7, xmm10, 0x01
315 vpclmulqdq xmm7, xmm10, 0x11
; --- epilogue: restore Win64 callee-saved xmm regs and the stack ---------
; (mirrors the prologue saves; SysV has nothing to restore here)
325 %ifidn __OUTPUT_FORMAT__, win64
326 vmovdqa xmm6, [rsp + XMM_SAVE + 16*0]
327 vmovdqa xmm7, [rsp + XMM_SAVE + 16*1]
328 vmovdqa xmm8, [rsp + XMM_SAVE + 16*2]
329 vmovdqa xmm9, [rsp + XMM_SAVE + 16*3]
330 vmovdqa xmm10, [rsp + XMM_SAVE + 16*4]
331 vmovdqa xmm11, [rsp + XMM_SAVE + 16*5]
332 vmovdqa xmm12, [rsp + XMM_SAVE + 16*6]
333 vmovdqa xmm13, [rsp + XMM_SAVE + 16*7]
334 vmovdqa xmm14, [rsp + XMM_SAVE + 16*8]
335 vmovdqa xmm15, [rsp + XMM_SAVE + 16*9]
; NOTE(review): the %endif and the ret are elided in this excerpt.
337 add rsp, VARIABLE_OFFSET
341 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
342 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
343 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
344 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; --- buffers too small for the 256B bulk path ----------------------------
; NOTE(review): this region is heavily elided in the excerpt; the labels
; these fragments belong to (.less_than_256 / .less_than_32 /
; .less_than_16_left etc.) and most of their bodies are not visible.
349 ; check if there is enough buffer to be able to fold 16B at a time
353 ; if there is, load the constants
354 vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
; Seed xmm7 with the first 16B of input xored with the crc placed in the
; top dword (vpslldq 12 = shift left by 12 bytes).
356 vmovd xmm0, arg1_low32 ; get the initial crc value
357 vpslldq xmm0, 12 ; align it to its correct place
358 vmovdqu xmm7, [arg2] ; load the plaintext
359 vpshufb xmm7, xmm18 ; byte-reflect the plaintext
362 ; update the buffer pointer
365 ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
368 jmp .16B_reduction_loop
373 ; mov initial crc to the return value. this is necessary for zero-length buffers.
; <32B path: same crc seeding, then either one 16B fold or the sub-16B path.
378 vmovd xmm0, arg1_low32 ; get the initial crc value
379 vpslldq xmm0, 12 ; align it to its correct place
383 jl .less_than_16_left
385 vmovdqu xmm7, [arg2] ; load the plaintext
387 vpxor xmm7, xmm0 ; xor the initial crc value
390 vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
391 jmp .get_last_two_xmms
395 ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
404 ; backup the counter value
; The fragments below are the per-length tails (byte-at-a-time copies to the
; stack, then one shifted fold); only the crc-xor lines survive the excerpt.
448 vpxor xmm7, xmm0 ; xor the initial crc value
450 lea rax, [pshufb_shf_table + 16]
462 vpxor xmm7, xmm0 ; xor the initial crc value
481 vpxor xmm7, xmm0 ; xor the initial crc value
499 vpxor xmm7, xmm0 ; xor the initial crc value
511 vpxor xmm7, xmm0 ; xor the initial crc value
520 ; precomputed constants
; Fold/reduction constants for the non-reflected IEEE CRC-32
; (rk8 below is 0x104C11DB7, the CRC-32 polynomial with the implicit x^32).
; rk_1/rk_2 drive the widest fold (loaded into zmm16 above); rk3/rk4 drive
; the 128B loop (zmm10); rk1/rk2 the 16B reduction loop; rk5/rk6 the final
; 128->64->32 reduction; rk7/rk8 the Barrett reduction; rk9..rk20 are the
; per-lane distances used to collapse 8 x 128b into one xmm.
521 rk_1: dq 0x1851689900000000
522 rk_2: dq 0xa3dc855100000000
523 rk1: dq 0xf200aa6600000000
524 rk2: dq 0x17d3315d00000000
525 rk3: dq 0x022ffca500000000
526 rk4: dq 0x9d9ee22f00000000
527 rk5: dq 0xf200aa6600000000
528 rk6: dq 0x490d678d00000000
529 rk7: dq 0x0000000104d101df
530 rk8: dq 0x0000000104c11db7
531 rk9: dq 0x6ac7e7d700000000
532 rk10: dq 0xfcd922af00000000
533 rk11: dq 0x34e45a6300000000
534 rk12: dq 0x8762c1f600000000
535 rk13: dq 0x5395a0ea00000000
536 rk14: dq 0x54f2d5c700000000
537 rk15: dq 0xd3504ec700000000
538 rk16: dq 0x57a8445500000000
539 rk17: dq 0xc053585d00000000
540 rk18: dq 0x766f1b7800000000
541 rk19: dq 0xcd8c54b500000000
542 rk20: dq 0xab40b71e00000000
; rk_1b/rk_2b duplicate rk1/rk2 (plus zero padding) so the [rk17] 64B load
; above picks up rk17-rk20, rk1, rk2, 0, 0 contiguously.
544 rk_1b: dq 0xf200aa6600000000
545 rk_2b: dq 0x17d3315d00000000
546 dq 0x0000000000000000
547 dq 0x0000000000000000
; mask1: pshufb "zero this byte" pattern; mask2: keep low 96 bits.
552 mask1: dq 0x8080808080808080, 0x8080808080808080
553 mask2: dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
; Byte-reverse shuffle for vpshufb (reflects each 16B lane end-to-end).
555 SHUF_MASK: dq 0x08090A0B0C0D0E0F, 0x0001020304050607
558 ; use these values for shift constants for the pshufb instruction
559 ; different alignments result in values as shown:
560 ; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
561 ; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
562 ; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
563 ; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
564 ; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
565 ; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
566 ; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
567 ; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
568 ; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
569 ; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
570 ; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
571 ; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
572 ; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
573 ; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
574 ; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
; Table data proper: indexed via [pshufb_shf_table + 16 - remaining] style
; addressing (see the `lea rax, [pshufb_shf_table + 16]` uses above).
; High bit set (0x80) in a selector byte makes vpshufb emit a zero byte.
; NOTE(review): the pshufb_shf_table: label line itself is elided in this
; excerpt; it immediately precedes these dq lines in the full file.
575 dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
576 dq 0x0706050403020100, 0x000e0d0c0b0a0908
577 dq 0x8080808080808080, 0x0f0e0d0c0b0a0908
578 dq 0x8080808080808080, 0x8080808080808080
; Fallback when the assembler lacks AVX-512/VPCLMULQDQ support: emit a
; dummy `no_<name>` symbol so Win64 builds still link (some toolchains
; reject object files with no symbols at all).
580 %else ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
581 %ifidn __OUTPUT_FORMAT__, win64
582 global no_ %+ FUNCTION_NAME
583 no_ %+ FUNCTION_NAME %+ :
; NOTE(review): the %endif closing the win64 check is elided in this excerpt.
585 %endif ; (AS_FEATURE_LEVEL) >= 10