1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
30 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
32 ; UINT32 crc32_iscsi_by16_10(
33 ; UINT32 init_crc, //initial CRC value, 32 bits
34 ; const unsigned char *buf, //buffer pointer to calculate CRC on
35 ; UINT64 len //buffer length in bytes (64-bit data)
43 ; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
44 ; URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
; ----------------------------------------------------------------------
; Build-time configuration for crc32_iscsi_by16_10 (CRC32C / Castagnoli,
; the CRC used by iSCSI).
; NOTE(review): this view of the file is elided (embedded original line
; numbers jump), so the %xdefines for arg1/arg2/arg3 themselves are not
; visible here — only the arg1_low32 aliases are.
; ----------------------------------------------------------------------
48 %include "reg_sizes.asm"
; Exported symbol name for this implementation variant.
51 %define FUNCTION_NAME crc32_iscsi_by16_10
; Assemble the AVX-512/VPCLMULQDQ body only when the assembler supports
; those opcodes (feature level >= 10); otherwise the stub at the bottom
; of the file is emitted instead.
54 %if (AS_FEATURE_LEVEL) >= 10
62 %ifidn __OUTPUT_FORMAT__, win64
; win64: the initial CRC is read from r8d — the low 32 bits of the third
; Microsoft-x64 integer-argument register (so the C-level call order
; presumably places the CRC third; confirm against the C prototype).
67 %xdefine arg1_low32 r8d
; SysV: the initial CRC is read from edx — low 32 bits of the third
; System V AMD64 integer-argument register (rdx).
73 %xdefine arg1_low32 edx
77 %ifidn __OUTPUT_FORMAT__, win64
; win64 frame: scratch space plus ten 16B slots to preserve the
; callee-saved xmm6-xmm15 (see the spills in the prologue below).
79 %define VARIABLE_OFFSET 16*12+8
; SysV frame: scratch space only — no xmm registers are callee-saved.
81 %define VARIABLE_OFFSET 16*2+8
; ----------------------------------------------------------------------
; UINT32 crc32_iscsi_by16_10(...) — CRC32C via carry-less multiplication
; (VPCLMULQDQ), folding 256 bytes of input per iteration on the fast
; path, per Intel's "Fast CRC Computation ... Using PCLMULQDQ" paper.
; NOTE(review): lines are elided in this view (embedded source line
; numbers jump); instructions such as the initial `not` of the CRC, the
; length dispatch (`cmp`/`jl`), loop-counter updates, and labels are not
; all visible.  Comments below describe only what the visible code shows.
; ----------------------------------------------------------------------
85 mk_global FUNCTION_NAME, function
; Prologue: reserve the local frame (size is ABI-dependent, see
; VARIABLE_OFFSET above).
88 sub rsp, VARIABLE_OFFSET
90 %ifidn __OUTPUT_FORMAT__, win64
91 ; xmm6-xmm15 are callee-saved in the Microsoft x64 ABI: spill them to
; the frame here; they are restored in the epilogue at the bottom.
92 vmovdqa [rsp + XMM_SAVE + 16*0], xmm6
93 vmovdqa [rsp + XMM_SAVE + 16*1], xmm7
94 vmovdqa [rsp + XMM_SAVE + 16*2], xmm8
95 vmovdqa [rsp + XMM_SAVE + 16*3], xmm9
96 vmovdqa [rsp + XMM_SAVE + 16*4], xmm10
97 vmovdqa [rsp + XMM_SAVE + 16*5], xmm11
98 vmovdqa [rsp + XMM_SAVE + 16*6], xmm12
99 vmovdqa [rsp + XMM_SAVE + 16*7], xmm13
100 vmovdqa [rsp + XMM_SAVE + 16*8], xmm14
101 vmovdqa [rsp + XMM_SAVE + 16*9], xmm15
104 ; check if smaller than 256B
108 ; load the initial crc value
109 vmovd xmm10, arg1_low32 ; initial crc
111 ; receive the initial 64B data, xor the initial crc value
; First 128 bytes of input land in zmm0/zmm4 (64B each); the xor of the
; initial CRC into the lowest lane is among the elided lines.
112 vmovdqu8 zmm0, [arg2+16*0]
113 vmovdqu8 zmm4, [arg2+16*4]
; rk3/rk4 broadcast to every 128-bit lane: the fold-by-128B multipliers.
115 vbroadcasti32x4 zmm10, [rk3] ;xmm10 has rk3 and rk4
116 ;imm value of pclmulqdq instruction will determine which constant to use
; Second 128 bytes into zmm7/zmm8 — running state is 256B in zmm0/4/7/8.
122 vmovdqu8 zmm7, [arg2+16*8]
123 vmovdqu8 zmm8, [arg2+16*12]
; rk_1/rk_2: fold-by-256B multipliers used by the main loop below.
124 vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
; --- main fold-by-256B loop body (label/counter updates elided) -------
; Each 64B state register is folded over the next 64B of input:
; CLMUL the state with rk_1/rk_2 (imm 0x10 / 0x01 select which 64-bit
; half of each 128-bit lane is multiplied), then xor in the new data.
129 vmovdqu8 zmm3, [arg2+16*0]
130 vpclmulqdq zmm1, zmm0, zmm16, 0x10
131 vpclmulqdq zmm2, zmm0, zmm16, 0x01
132 vpxorq zmm0, zmm1, zmm2
133 vpxorq zmm0, zmm0, zmm3
135 vmovdqu8 zmm9, [arg2+16*4]
136 vpclmulqdq zmm5, zmm4, zmm16, 0x10
137 vpclmulqdq zmm6, zmm4, zmm16, 0x01
138 vpxorq zmm4, zmm5, zmm6
139 vpxorq zmm4, zmm4, zmm9
141 vmovdqu8 zmm11, [arg2+16*8]
142 vpclmulqdq zmm12, zmm7, zmm16, 0x10
143 vpclmulqdq zmm13, zmm7, zmm16, 0x01
144 vpxorq zmm7, zmm12, zmm13
145 vpxorq zmm7, zmm7, zmm11
147 vmovdqu8 zmm17, [arg2+16*12]
148 vpclmulqdq zmm14, zmm8, zmm16, 0x10
149 vpclmulqdq zmm15, zmm8, zmm16, 0x01
150 vpxorq zmm8, zmm14, zmm15
151 vpxorq zmm8, zmm8, zmm17
; --- collapse 256B state to 128B ---------------------------------------
; Fold zmm0 into zmm7 and zmm4 into zmm8 with rk3/rk4 (zmm10).
; vpternlogq imm 0x96 computes the 3-way XOR of its operands.
158 vpclmulqdq zmm1, zmm0, zmm10, 0x01
159 vpclmulqdq zmm2, zmm0, zmm10, 0x10
160 vpternlogq zmm7, zmm1, zmm2, 0x96 ; xor ABC
162 vpclmulqdq zmm5, zmm4, zmm10, 0x01
163 vpclmulqdq zmm6, zmm4, zmm10, 0x10
164 vpternlogq zmm8, zmm5, zmm6, 0x96 ; xor ABC
; State is now 128B (zmm0/zmm4 per the comments below); skip the
; 128B-loop entry and go straight to the register-collapse step.
170 jmp .fold_128_B_register
174 ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The fold_128_B_loop
175 ; loop will fold 128B at a time until we have 128+y Bytes of buffer
177 ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
; --- fold-by-128B loop body (label/counter updates elided) ------------
180 vmovdqu8 zmm8, [arg2+16*0]
181 vpclmulqdq zmm2, zmm0, zmm10, 0x10
182 vpclmulqdq zmm1, zmm0, zmm10, 0x01
183 vpxorq zmm0, zmm2, zmm1
184 vpxorq zmm0, zmm0, zmm8
186 vmovdqu8 zmm9, [arg2+16*4]
187 vpclmulqdq zmm5, zmm4, zmm10, 0x10
188 vpclmulqdq zmm6, zmm4, zmm10, 0x01
189 vpxorq zmm4, zmm5, zmm6
190 vpxorq zmm4, zmm4, zmm9
194 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
197 ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
198 ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
200 .fold_128_B_register:
201 ; fold the 8 128b parts into 1 xmm register with different constants
; Each of the eight 16B lanes gets its own distance-specific multiplier
; (rk9..rk20), so all lanes can be folded toward the last lane at once.
202 vmovdqu8 zmm16, [rk9] ; multiply by rk9-rk16
203 vmovdqu8 zmm11, [rk17] ; multiply by rk17-rk20, rk1,rk2, 0,0
204 vpclmulqdq zmm1, zmm0, zmm16, 0x01
205 vpclmulqdq zmm2, zmm0, zmm16, 0x10
206 vextracti64x2 xmm7, zmm4, 3 ; save last that has no multiplicand
208 vpclmulqdq zmm5, zmm4, zmm11, 0x01
209 vpclmulqdq zmm6, zmm4, zmm11, 0x10
210 vmovdqa xmm10, [rk1] ; Needed later in reduction loop
211 vpternlogq zmm1, zmm2, zmm5, 0x96 ; xor ABC
212 vpternlogq zmm1, zmm6, zmm7, 0x96 ; xor ABC
; Horizontal XOR of zmm1's four 128-bit lanes down to a single xmm7:
; swap halves, xor, extract, xor again.
214 vshufi64x2 zmm8, zmm1, zmm1, 0x4e ; Swap 1,0,3,2 - 01 00 11 10
215 vpxorq ymm8, ymm8, ymm1
216 vextracti64x2 xmm5, ymm8, 1
217 vpxorq xmm7, xmm5, xmm8
219 ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
220 ; instead of a cmp instruction, we use the negative flag with the jl instruction
222 jl .final_reduction_for_128
224 ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
225 ; we can fold 16 bytes at a time if y>=16
226 ; continue folding 16B at a time
; 16B reduction loop body: fold xmm7 over the next 16B using rk1/rk2 in
; xmm10.  NOTE(review): the immediates read 0x1/0x10 here — possibly
; elided digits in this view; verify against the upstream source.
229 vpclmulqdq xmm8, xmm7, xmm10, 0x1
230 vpclmulqdq xmm7, xmm7, xmm10, 0x10
236 ; instead of a cmp instruction, we utilize the flags with the jge instruction
237 ; equivalent of: cmp arg3, 16-16
238 ; check if there is any more 16B in the buffer to be able to fold
239 jge .16B_reduction_loop
241 ;now we have 16+z bytes left to reduce, where 0<= z < 16.
242 ;first, we reduce the data in the xmm7 register
245 .final_reduction_for_128:
249 ; here we are getting data that is less than 16 bytes.
250 ; since we know that there was data before the pointer, we can offset
251 ; the input pointer before the actual point, to receive exactly 16 bytes.
252 ; after that the registers need to be adjusted.
; Unaligned load of the buffer's last 16 bytes (may overlap data already
; folded; the shuffle/blend below discards the overlap).
256 vmovdqu xmm1, [arg2 - 16 + arg3]
258 ; get rid of the extra data that was loaded before
259 ; load the shift constant
260 lea rax, [pshufb_shf_table]
; Blend the re-shuffled tail with the folded state (mask in xmm0 comes
; from the pshufb_shf_table entry; the pshufb itself is elided here).
268 vpblendvb xmm2, xmm2, xmm1, xmm0
; Final 16B fold of the combined value.  NOTE(review): immediates
; 0x1/0x10 — see the note on the reduction loop above.
270 vpclmulqdq xmm8, xmm7, xmm10, 0x1
271 vpclmulqdq xmm7, xmm7, xmm10, 0x10
276 ; compute crc of a 128-bit value
; --- 128-bit -> 32-bit reduction (Barrett) -----------------------------
; NOTE(review): the four vpclmulqdq lines below show only three operands
; (invalid for the VEX form), so an operand per line was lost in this
; view; do not edit without the complete upstream text.
281 vpclmulqdq xmm7, xmm10, 0
288 vpclmulqdq xmm7, xmm10, 0x10
299 vpclmulqdq xmm7, xmm10, 0
303 vpclmulqdq xmm7, xmm10, 0x10
; Epilogue: restore the win64 callee-saved xmm registers spilled in the
; prologue, then release the frame.  (The final `not`/`mov eax` that
; produce the return value are among the elided lines.)
310 %ifidn __OUTPUT_FORMAT__, win64
311 vmovdqa xmm6, [rsp + XMM_SAVE + 16*0]
312 vmovdqa xmm7, [rsp + XMM_SAVE + 16*1]
313 vmovdqa xmm8, [rsp + XMM_SAVE + 16*2]
314 vmovdqa xmm9, [rsp + XMM_SAVE + 16*3]
315 vmovdqa xmm10, [rsp + XMM_SAVE + 16*4]
316 vmovdqa xmm11, [rsp + XMM_SAVE + 16*5]
317 vmovdqa xmm12, [rsp + XMM_SAVE + 16*6]
318 vmovdqa xmm13, [rsp + XMM_SAVE + 16*7]
319 vmovdqa xmm14, [rsp + XMM_SAVE + 16*8]
320 vmovdqa xmm15, [rsp + XMM_SAVE + 16*9]
322 add rsp, VARIABLE_OFFSET
326 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
327 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
328 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
329 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; ----------------------------------------------------------------------
; Slow paths for buffers shorter than the 256B fast-path threshold.
; NOTE(review): labels and several instructions are elided in this view;
; the comments below describe only what the visible lines establish.
; ----------------------------------------------------------------------
334 ; check if there is enough buffer to be able to fold 16B at a time
338 ; if there is, load the constants
; Mid-size path: seed xmm7 with the first 16B xor the initial CRC (the
; xor itself is elided), then reuse the 16B reduction loop above.
339 vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
341 vmovd xmm0, arg1_low32 ; get the initial crc value
342 vmovdqu xmm7, [arg2] ; load the plaintext
345 ; update the buffer pointer
348 ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
351 jmp .16B_reduction_loop
356 ; mov initial crc to the return value. this is necessary for zero-length buffers.
; 16..31-byte path: one 16B block in xmm7, remainder handled by the
; get_last_two_xmms tail logic.
361 vmovd xmm0, arg1_low32 ; get the initial crc value
365 jl .less_than_16_left
367 vmovdqu xmm7, [arg2] ; load the plaintext
368 vpxor xmm7, xmm0 ; xor the initial crc value
371 vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
372 jmp .get_last_two_xmms
376 ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
385 ; backup the counter value
; <16-byte path: the per-size copy sequences are elided; each size case
; ends by xoring the initial CRC into the staged 16B block (the repeated
; vpxor lines below belong to those separate, size-specific cases).
428 vpxor xmm7, xmm0 ; xor the initial crc value
; Shift/align the partial block with the pshufb_shf_table entry indexed
; by the byte count in r9 (the pshufb itself is elided here).
430 lea rax,[pshufb_shf_table]
431 vmovdqu xmm0, [rax + r9]
438 vpxor xmm7, xmm0 ; xor the initial crc value
456 vpxor xmm7, xmm0 ; xor the initial crc value
473 vpxor xmm7, xmm0 ; xor the initial crc value
484 vpxor xmm7, xmm0 ; xor the initial crc value
493 ; precomputed constants
; Distance-specific fold multipliers for the CRC32C polynomial, derived
; per Intel's "Fast CRC Computation for Generic Polynomials Using
; PCLMULQDQ" paper: rk_1/rk_2 fold by 256B, rk1/rk2 by 16B (rk1 =
; 0x493c7d27 is the well-known CRC32C constant), rk9..rk20 collapse the
; eight parallel 16B lanes; the remaining rk values serve the final
; reduction steps.
494 rk_1: dq 0x00000000b9e02b86
495 rk_2: dq 0x00000000dcb17aa4
496 rk1: dq 0x00000000493c7d27
497 rk2: dq 0x0000000ec1068c50
498 rk3: dq 0x0000000206e38d70
499 rk4: dq 0x000000006992cea2
500 rk5: dq 0x00000000493c7d27
501 rk6: dq 0x00000000dd45aab8
502 rk7: dq 0x00000000dea713f0
503 rk8: dq 0x0000000105ec76f0
504 rk9: dq 0x0000000047db8317
505 rk10: dq 0x000000002ad91c30
506 rk11: dq 0x000000000715ce53
507 rk12: dq 0x00000000c49f4f67
508 rk13: dq 0x0000000039d3b296
509 rk14: dq 0x00000000083a6eec
510 rk15: dq 0x000000009e4addf8
511 rk16: dq 0x00000000740eef02
512 rk17: dq 0x00000000ddc0152b
513 rk18: dq 0x000000001c291d04
514 rk19: dq 0x00000000ba4fc28e
515 rk20: dq 0x000000003da6d0cb
; rk_1b/rk_2b duplicate rk1/rk2, zero-padded to a full 32B slot.
517 rk_1b: dq 0x00000000493c7d27
518 rk_2b: dq 0x0000000ec1068c50
519 dq 0x0000000000000000
520 dq 0x0000000000000000
527 ; use these values for shift constants for the pshufb instruction
528 ; different alignments result in values as shown:
529 ; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
530 ; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
531 ; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
532 ; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
533 ; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
534 ; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
535 ; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
536 ; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
537 ; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
538 ; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
539 ; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
540 ; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
541 ; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
542 ; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
543 ; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
; Base table: indexing at offset r9 (bytes present) yields the pshufb
; control that shifts/aligns a partial 16B block, per the legend above.
544 dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
545 dq 0x0706050403020100, 0x000e0d0c0b0a0908
; mask/mask2 select low-8B / drop-low-4B lanes; mask3 is the pshufb
; high-bit pattern (0x80 = zero the output byte).
547 mask: dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
548 mask2: dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
549 mask3: dq 0x8080808080808080, 0x8080808080808080
551 %else ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
552 %ifidn __OUTPUT_FORMAT__, win64
; Emit a placeholder symbol so the win64 object file is not empty when
; the AVX-512/VPCLMULQDQ body cannot be assembled (presumably to keep
; librarian/linker tooling happy — confirm against the build system).
553 global no_ %+ FUNCTION_NAME
554 no_ %+ FUNCTION_NAME %+ :
; NOTE(review): the matching %endif for the %ifidn above is elided in
; this view; this %endif closes the AS_FEATURE_LEVEL guard.
556 %endif ; (AS_FEATURE_LEVEL) >= 10