1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
30 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
32 ; uint64_t crc64_iso_refl_by16_10(
33 ; uint64_t init_crc, //initial CRC value, 64 bits
34 ; const unsigned char *buf, //buffer pointer to calculate CRC on
35 ; uint64_t len //buffer length in bytes (64-bit data)
; Build-time configuration for the AVX-512 CRC64-ISO (reflected) routine.
38 %include "reg_sizes.asm"
41 %define FUNCTION_NAME crc64_iso_refl_by16_10
; Only assemble the real body when the assembler supports the AVX-512 /
; VPCLMULQDQ opcodes (feature level >= 10); otherwise the %else stub at
; the bottom of the file is emitted instead.
44 %if (AS_FEATURE_LEVEL) >= 10
; Prefetch distance in bytes ahead of the current buffer pointer —
; NOTE(review): the prefetch instructions themselves are in elided lines.
46 %define fetch_dist 1024
; win64 vs SysV argument-register aliases are defined in this (elided) block.
54 %ifidn __OUTPUT_FORMAT__, win64
65 %ifidn __OUTPUT_FORMAT__, win64
; win64 needs stack room to spill callee-saved xmm6-xmm15 (10 x 16B)
; plus scratch/alignment; SysV only needs the small scratch area.
67 %define VARIABLE_OFFSET 16*12+8
69 %define VARIABLE_OFFSET 16*2+8
;-----------------------------------------------------------------------
; uint64_t crc64_iso_refl_by16_10(uint64_t init_crc, const unsigned char *buf,
;                                 uint64_t len)
; Bit-reflected CRC64-ISO computed with carry-less multiply (VPCLMULQDQ)
; folding over zmm registers.  arg1 = init_crc, arg2 = buf, arg3 = len
; (register aliases presumably defined in the elided ABI block above —
; TODO confirm).
; NOTE(review): the embedded original line numbers below have gaps — a
; number of instructions (length checks, pointer/counter updates, loop
; labels, the final mov-to-rax/ret) are elided from this view.
;-----------------------------------------------------------------------
73 global FUNCTION_NAME:ISAL_SYM_TYPE_FUNCTION
; reserve the stack save/scratch area sized by VARIABLE_OFFSET above
76 sub rsp, VARIABLE_OFFSET
78 %ifidn __OUTPUT_FORMAT__, win64
79 ; push the xmm registers into the stack to maintain the Win64 ABI
; (xmm6-xmm15 are callee-saved on Windows x64 and are clobbered below)
80 vmovdqa [rsp + XMM_SAVE + 16*0], xmm6
81 vmovdqa [rsp + XMM_SAVE + 16*1], xmm7
82 vmovdqa [rsp + XMM_SAVE + 16*2], xmm8
83 vmovdqa [rsp + XMM_SAVE + 16*3], xmm9
84 vmovdqa [rsp + XMM_SAVE + 16*4], xmm10
85 vmovdqa [rsp + XMM_SAVE + 16*5], xmm11
86 vmovdqa [rsp + XMM_SAVE + 16*6], xmm12
87 vmovdqa [rsp + XMM_SAVE + 16*7], xmm13
88 vmovdqa [rsp + XMM_SAVE + 16*8], xmm14
89 vmovdqa [rsp + XMM_SAVE + 16*9], xmm15
95 ; load the initial crc value
96 vmovq xmm10, arg1 ; initial crc
98 ; receive the initial 128B data, xor the initial crc value
; (the xor of xmm10 into the first data vector occurs in elided lines)
99 vmovdqu8 zmm0, [arg2+16*0]
100 vmovdqu8 zmm4, [arg2+16*4]
102 vbroadcasti32x4 zmm10, [rk3] ;zmm10 has rk3 and rk4
103 ;imm value of pclmulqdq instruction will determine which constant to use
; load the second 128B of the initial 256B block and the 256B fold constants
109 vmovdqu8 zmm7, [arg2+16*8]
110 vmovdqu8 zmm8, [arg2+16*12]
111 vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk_1 and rk_2
; Fold each 64B lane forward by 256B:
;   new = (qwords sel. by imm 0x10 * rk) xor (qwords sel. by imm 0x01 * rk)
;         xor next-256B-of-data
; NOTE(review): arg2 is presumably advanced by 256 in elided lines before
; these reloads of [arg2+16*0..12] — confirm against the full source.
116 vmovdqu8 zmm3, [arg2+16*0]
117 vpclmulqdq zmm1, zmm0, zmm16, 0x10
118 vpclmulqdq zmm2, zmm0, zmm16, 0x01
119 vpxorq zmm0, zmm1, zmm2
120 vpxorq zmm0, zmm0, zmm3
122 vmovdqu8 zmm9, [arg2+16*4]
123 vpclmulqdq zmm5, zmm4, zmm16, 0x10
124 vpclmulqdq zmm6, zmm4, zmm16, 0x01
125 vpxorq zmm4, zmm5, zmm6
126 vpxorq zmm4, zmm4, zmm9
128 vmovdqu8 zmm11, [arg2+16*8]
129 vpclmulqdq zmm12, zmm7, zmm16, 0x10
130 vpclmulqdq zmm13, zmm7, zmm16, 0x01
131 vpxorq zmm7, zmm12, zmm13
132 vpxorq zmm7, zmm7, zmm11
134 vmovdqu8 zmm17, [arg2+16*12]
135 vpclmulqdq zmm14, zmm8, zmm16, 0x10
136 vpclmulqdq zmm15, zmm8, zmm16, 0x01
137 vpxorq zmm8, zmm14, zmm15
138 vpxorq zmm8, zmm8, zmm17
; Collapse the 256B working set down to 128B (zmm0/zmm4 style) using the
; rk3/rk4 constants in zmm10; vpternlogq imm 0x96 computes A xor B xor C.
145 vpclmulqdq zmm1, zmm0, zmm10, 0x01
146 vpclmulqdq zmm2, zmm0, zmm10, 0x10
147 vpternlogq zmm7, zmm1, zmm2, 0x96 ; xor ABC
149 vpclmulqdq zmm5, zmm4, zmm10, 0x01
150 vpclmulqdq zmm6, zmm4, zmm10, 0x10
151 vpternlogq zmm8, zmm5, zmm6, 0x96 ; xor ABC
; fewer than 128B remain: skip the 128B loop and go straight to the
; register-to-xmm reduction below
157 jmp _fold_128_B_register
159 ; fold 128B at a time. This section of the code folds 2 zmm registers in parallel
; NOTE(review): the loop label and loop-counter update are in elided lines.
161 add arg2, 128 ; update the buffer pointer
162 vmovdqu8 zmm8, [arg2+16*0]
163 vpclmulqdq zmm1, zmm0, zmm10, 0x10
164 vpclmulqdq zmm2, zmm0, zmm10, 0x01
165 vpxorq zmm0, zmm1, zmm2
166 vpxorq zmm0, zmm0, zmm8
168 vmovdqu8 zmm9, [arg2+16*4]
169 vpclmulqdq zmm5, zmm4, zmm10, 0x10
170 vpclmulqdq zmm6, zmm4, zmm10, 0x01
171 vpxorq zmm4, zmm5, zmm6
172 vpxorq zmm4, zmm4, zmm9
176 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
179 ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
180 ; the 128B of folded data is in 2 zmm registers: zmm0, zmm4
182 _fold_128_B_register:
183 ; fold the 8 128b parts into 1 xmm register with different constants
184 vmovdqu8 zmm16, [rk9] ; multiply by rk9-rk16
185 vmovdqu8 zmm11, [rk17] ; multiply by rk17-rk20, rk1,rk2, 0,0
186 vpclmulqdq zmm1, zmm0, zmm16, 0x01
187 vpclmulqdq zmm2, zmm0, zmm16, 0x10
188 vextracti64x2 xmm7, zmm4, 3 ; save last that has no multiplicand
190 vpclmulqdq zmm5, zmm4, zmm11, 0x01
191 vpclmulqdq zmm6, zmm4, zmm11, 0x10
192 vmovdqa xmm10, [rk1] ; Needed later in reduction loop
193 vpternlogq zmm1, zmm2, zmm5, 0x96 ; xor ABC
194 vpternlogq zmm1, zmm6, zmm7, 0x96 ; xor ABC
; horizontally xor the four 128-bit lanes of zmm1 down to a single xmm7
196 vshufi64x2 zmm8, zmm1, zmm1, 0x4e ; Swap 1,0,3,2 - 01 00 11 10
197 vpxorq ymm8, ymm8, ymm1
198 vextracti64x2 xmm5, ymm8, 1
199 vpxorq xmm7, xmm5, xmm8
201 ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
202 ; instead of a cmp instruction, we use the negative flag with the jl instruction
; (the add that sets the flags for this jl is in elided lines)
204 jl _final_reduction_for_128
206 ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
207 ; we can fold 16 bytes at a time if y>=16
208 ; continue folding 16B at a time
; NOTE(review): the _16B_reduction_loop label itself is in elided lines;
; the two-operand vpclmulqdq forms below use the destination as src1.
212 vpclmulqdq xmm7, xmm10, 0x1
213 vpclmulqdq xmm8, xmm10, 0x10
219 ; instead of a cmp instruction, we utilize the flags with the jge instruction
220 ; equivalent of: cmp arg3, 16-16
221 ; check if there is any more 16B in the buffer to be able to fold
222 jge _16B_reduction_loop
224 ;now we have 16+z bytes left to reduce, where 0<= z < 16.
225 ;first, we reduce the data in the xmm7 register
228 _final_reduction_for_128:
; _get_last_two_xmms path (label in elided lines): merge the final
; partial block with the folded state
231 ; here we are getting data that is less than 16 bytes.
232 ; since we know that there was data before the pointer, we can offset
233 ; the input pointer before the actual point, to receive exactly 16 bytes.
234 ; after that the registers need to be adjusted.
; load the last 16 bytes of the buffer (overlapping previously-read data)
239 vmovdqu xmm1, [arg2 - 16 + arg3]
241 ; get rid of the extra data that was loaded before
242 ; load the shift constant
243 lea rax, [pshufb_shf_table]
; blend the freshly-loaded tail with the shifted folded data
252 vpblendvb xmm2, xmm2, xmm1, xmm0
; fold the combined two xmms into one
255 vpclmulqdq xmm7, xmm10, 0x1
257 vpclmulqdq xmm8, xmm10, 0x10
262 ; compute crc of a 128-bit value
; 128b -> 64b reduction followed by Barrett reduction (constants loaded
; into xmm10 in elided lines)
267 vpclmulqdq xmm7, xmm10, 0
276 vpclmulqdq xmm7, xmm10, 0
278 vpclmulqdq xmm7, xmm10, 0x10
; epilogue: restore win64 callee-saved xmm registers and the stack
; (the final move of the CRC into rax and the ret are in elided lines)
288 %ifidn __OUTPUT_FORMAT__, win64
289 vmovdqa xmm6, [rsp + XMM_SAVE + 16*0]
290 vmovdqa xmm7, [rsp + XMM_SAVE + 16*1]
291 vmovdqa xmm8, [rsp + XMM_SAVE + 16*2]
292 vmovdqa xmm9, [rsp + XMM_SAVE + 16*3]
293 vmovdqa xmm10, [rsp + XMM_SAVE + 16*4]
294 vmovdqa xmm11, [rsp + XMM_SAVE + 16*5]
295 vmovdqa xmm12, [rsp + XMM_SAVE + 16*6]
296 vmovdqa xmm13, [rsp + XMM_SAVE + 16*7]
297 vmovdqa xmm14, [rsp + XMM_SAVE + 16*8]
298 vmovdqa xmm15, [rsp + XMM_SAVE + 16*9]
300 add rsp, VARIABLE_OFFSET
303 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
304 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
305 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
306 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Short-buffer entry paths.  NOTE(review): the labels these branches
; belong to (e.g. the less-than-256 / less-than-32 entries) are in
; elided lines of this view.
311 ; check if there is enough buffer to be able to fold 16B at a time
315 ; if there is, load the constants
316 vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
318 vmovq xmm0, arg1 ; get the initial crc value
319 vmovdqu xmm7, [arg2] ; load the plaintext
; the initial crc in xmm0 is presumably xor-ed into xmm7 in elided lines
322 ; update the buffer pointer
325 ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
328 jmp _16B_reduction_loop
; ---- len < 32 path ----
332 ; mov initial crc to the return value. this is necessary for zero-length buffers.
337 vmovq xmm0, arg1 ; get the initial crc value
341 jl _less_than_16_left
; 16 <= len < 32: one full 16B load plus a partial tail
343 vmovdqu xmm7, [arg2] ; load the plaintext
344 vpxor xmm7, xmm0 ; xor the initial crc value
347 vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
348 jmp _get_last_two_xmms
; ---- len < 16 path (_less_than_16_left, label in elided lines) ----
353 ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
359 ; backup the counter value
; (the byte-copy of the tail into the stack buffer is in elided lines)
402 vpxor xmm7, xmm0 ; xor the initial crc value
; shift xmm7 by (16 - len) bytes using the pshufb table indexed by r9
404 lea rax,[pshufb_shf_table]
410 vmovdqu xmm0, [rax + r9]
415 ; Left shift (8-length) bytes in XMM
416 vmovdqu xmm0, [rax + r9 + 8]
; NOTE(review): the trailing comment below may be stale — xmm0 was just
; reloaded from the shuffle table, though elided lines may change it;
; confirm against the full source.
424 vpxor xmm7, xmm0 ; xor the initial crc value
432 ; precomputed constants
; Carry-less-multiply folding/reduction constants for the reflected
; CRC64-ISO polynomial.  rk_1/rk_2 are broadcast for the 256B fold,
; rk3/rk4 for the 128B fold, rk9-rk20 collapse the folded registers,
; and the remaining rkN values serve the 16B loop and final reduction
; (exact pairing is selected by the vpclmulqdq immediates above).
433 rk_1: dq 0x45000000b0000000
434 rk_2: dq 0x6b700000f5000000
435 rk1: dq 0xf500000000000001
436 rk2: dq 0x6b70000000000001
437 rk3: dq 0xb001000000010000
438 rk4: dq 0xf501b0000001b000
439 rk5: dq 0xf500000000000001
440 rk6: dq 0x0000000000000000
441 rk7: dq 0xb000000000000001
442 rk8: dq 0xb000000000000000
443 rk9: dq 0xe014514514501501
444 rk10: dq 0x771db6db6db71c71
445 rk11: dq 0xa101101101110001
446 rk12: dq 0x1ab1ab1ab1aab001
447 rk13: dq 0xf445014445000001
448 rk14: dq 0x6aab71daab700001
449 rk15: dq 0xb100010100000001
450 rk16: dq 0x01b001b1b0000001
451 rk17: dq 0xe145150000000001
452 rk18: dq 0x76db6c7000000001
453 rk19: dq 0xa011000000000001
454 rk20: dq 0x1b1ab00000000001
; duplicate of rk1/rk2 padded with zeros (consumed as one 256-bit load)
456 rk_1b: dq 0xf500000000000001
457 rk_2b: dq 0x6b70000000000001
458 dq 0x0000000000000000
459 dq 0x0000000000000000
; NOTE(review): the pshufb_shf_table label itself is in elided lines
; just above these comments.
465 ; use these values for shift constants for the pshufb instruction
466 ; different alignments result in values as shown:
467 ; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
468 ; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
469 ; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
470 ; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
471 ; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
472 ; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
473 ; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
474 ; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
475 ; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
476 ; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
477 ; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
478 ; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
479 ; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
480 ; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
481 ; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
482 dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
483 dq 0x0706050403020100, 0x000e0d0c0b0a0908
; byte masks used when isolating/merging partial-block data
485 mask: dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
486 mask2: dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
487 mask3: dq 0x8080808080808080, 0x8080808080808080
489 %else ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
490 %ifidn __OUTPUT_FORMAT__, win64
; export a dummy no_<FUNCTION_NAME> symbol so the win64 object file is
; not empty when the AVX-512 body cannot be assembled
491 global no_ %+ FUNCTION_NAME
492 no_ %+ FUNCTION_NAME %+ :
494 %endif ; (AS_FEATURE_LEVEL) >= 10