1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
30 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
32 ; uint64_t crc64_iso_refl_by16_10(
33 ; uint64_t init_crc, //initial CRC value, 64 bits
34 ; const unsigned char *buf, //buffer pointer to calculate CRC on
35 ; uint64_t len //buffer length in bytes (64-bit data)
; Build-time configuration for FUNCTION_NAME.
; NOTE(review): reg_sizes.asm is an external include; presumably it provides the
; mk_global and arg1..arg3 macros used further down -- confirm.
38 %include "reg_sizes.asm"
; Exported symbol name; it is also pasted into the win64-only no_<name>
; placeholder symbol defined in the %else branch of the feature-level check.
41 %define FUNCTION_NAME crc64_iso_refl_by16_10
; The implementation is assembled only when AS_FEATURE_LEVEL is at least 10.
44 %if (AS_FEATURE_LEVEL) >= 10
; NOTE(review): presumably a prefetch distance in bytes -- confirm where
; fetch_dist is consumed.
46 %define fetch_dist 1024
54 %ifidn __OUTPUT_FORMAT__, win64
65 %ifidn __OUTPUT_FORMAT__, win64
; VARIABLE_OFFSET is the number of bytes the prologue subtracts from rsp.
; NOTE(review): 16*12+8 is presumably the win64 size (room for the xmm6-xmm15
; save area) and 16*2+8 the size for other output formats; the +8 presumably
; brings rsp back to 16-byte alignment after the return address -- confirm.
67 %define VARIABLE_OFFSET 16*12+8
69 %define VARIABLE_OFFSET 16*2+8
; Function entry: export the symbol, reserve VARIABLE_OFFSET bytes of stack and,
; on win64, spill xmm6-xmm15 (callee-saved under the Microsoft x64 ABI) into
; ten consecutive 16-byte slots starting at rsp + XMM_SAVE.
73 mk_global FUNCTION_NAME, function
77 sub rsp, VARIABLE_OFFSET
79 %ifidn __OUTPUT_FORMAT__, win64
80 ; save the callee-saved xmm registers on the stack to maintain the win64 calling convention
; vmovdqa is the aligned form, so rsp + XMM_SAVE must be 16-byte aligned here.
81 vmovdqa [rsp + XMM_SAVE + 16*0], xmm6
82 vmovdqa [rsp + XMM_SAVE + 16*1], xmm7
83 vmovdqa [rsp + XMM_SAVE + 16*2], xmm8
84 vmovdqa [rsp + XMM_SAVE + 16*3], xmm9
85 vmovdqa [rsp + XMM_SAVE + 16*4], xmm10
86 vmovdqa [rsp + XMM_SAVE + 16*5], xmm11
87 vmovdqa [rsp + XMM_SAVE + 16*6], xmm12
88 vmovdqa [rsp + XMM_SAVE + 16*7], xmm13
89 vmovdqa [rsp + XMM_SAVE + 16*8], xmm14
90 vmovdqa [rsp + XMM_SAVE + 16*9], xmm15
; Set up four 64-byte accumulators (zmm0, zmm4, zmm7, zmm8) from the start of
; the buffer, then fold further input into them 4 x 64 bytes at a time.
; For vpclmulqdq, imm 0x10 multiplies the low qword of the first source by the
; high qword of the second source, and imm 0x01 multiplies the high qword of the
; first source by the low qword of the second source, in every 128-bit lane.
96 ; load the initial crc value
97 vmovq xmm10, arg1 ; initial crc
99 ; receive the initial 128B data, xor the initial crc value
100 vmovdqu8 zmm0, [arg2+16*0]
101 vmovdqu8 zmm4, [arg2+16*4]
; Broadcast the 16 bytes at rk3 (rk3 = low qword, rk4 = high qword) to all four
; 128-bit lanes. This replaces the crc value that was placed in xmm10 above.
103 vbroadcasti32x4 zmm10, [rk3] ;zmm10 has rk3 and rk4
104 ;imm value of pclmulqdq instruction will determine which constant to use
; Second 128 bytes of input go into zmm7 / zmm8.
110 vmovdqu8 zmm7, [arg2+16*8]
111 vmovdqu8 zmm8, [arg2+16*12]
; Broadcast the rk_1 (low qword) / rk_2 (high qword) pair to every lane of zmm16.
112 vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
; Fold step, repeated for each accumulator:
;   acc = clmul(acc.lo, rk_2) xor clmul(acc.hi, rk_1) xor next 64 input bytes
; zmm0 <- fold(zmm0) xor [arg2+16*0]
117 vmovdqu8 zmm3, [arg2+16*0]
118 vpclmulqdq zmm1, zmm0, zmm16, 0x10
119 vpclmulqdq zmm2, zmm0, zmm16, 0x01
120 vpxorq zmm0, zmm1, zmm2
121 vpxorq zmm0, zmm0, zmm3
; zmm4 <- fold(zmm4) xor [arg2+16*4]
123 vmovdqu8 zmm9, [arg2+16*4]
124 vpclmulqdq zmm5, zmm4, zmm16, 0x10
125 vpclmulqdq zmm6, zmm4, zmm16, 0x01
126 vpxorq zmm4, zmm5, zmm6
127 vpxorq zmm4, zmm4, zmm9
; zmm7 <- fold(zmm7) xor [arg2+16*8]
129 vmovdqu8 zmm11, [arg2+16*8]
130 vpclmulqdq zmm12, zmm7, zmm16, 0x10
131 vpclmulqdq zmm13, zmm7, zmm16, 0x01
132 vpxorq zmm7, zmm12, zmm13
133 vpxorq zmm7, zmm7, zmm11
; zmm8 <- fold(zmm8) xor [arg2+16*12]
135 vmovdqu8 zmm17, [arg2+16*12]
136 vpclmulqdq zmm14, zmm8, zmm16, 0x10
137 vpclmulqdq zmm15, zmm8, zmm16, 0x01
138 vpxorq zmm8, zmm14, zmm15
139 vpxorq zmm8, zmm8, zmm17
; Collapse four accumulators into two using the rk3/rk4 pair in zmm10:
; zmm7 ^= fold(zmm0) and zmm8 ^= fold(zmm4). vpternlogq with imm 0x96 is a
; three-input XOR of the destination and both sources.
146 vpclmulqdq zmm1, zmm0, zmm10, 0x01
147 vpclmulqdq zmm2, zmm0, zmm10, 0x10
148 vpternlogq zmm7, zmm1, zmm2, 0x96 ; xor ABC
150 vpclmulqdq zmm5, zmm4, zmm10, 0x01
151 vpclmulqdq zmm6, zmm4, zmm10, 0x10
152 vpternlogq zmm8, zmm5, zmm6, 0x96 ; xor ABC
; Continue with the reduction of the remaining 128 bytes of state to 16 bytes.
158 jmp _fold_128_B_register
160 ; fold 128B at a time. This section of the code folds 2 zmm registers in parallel
; Each accumulator is carry-less multiplied by the constant pair in zmm10
; (imm 0x10: acc low qword x constant high qword; imm 0x01: acc high qword x
; constant low qword), the two products are XORed together, and the result is
; XORed with the next 64 bytes of input.
162 add arg2, 128 ; update the buffer pointer
; zmm0 <- fold(zmm0) xor bytes [arg2, arg2+64)
163 vmovdqu8 zmm8, [arg2+16*0]
164 vpclmulqdq zmm1, zmm0, zmm10, 0x10
165 vpclmulqdq zmm2, zmm0, zmm10, 0x01
166 vpxorq zmm0, zmm1, zmm2
167 vpxorq zmm0, zmm0, zmm8
; zmm4 <- fold(zmm4) xor bytes [arg2+64, arg2+128)
169 vmovdqu8 zmm9, [arg2+16*4]
170 vpclmulqdq zmm5, zmm4, zmm10, 0x10
171 vpclmulqdq zmm6, zmm4, zmm10, 0x01
172 vpxorq zmm4, zmm5, zmm6
173 vpxorq zmm4, zmm4, zmm9
177 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
180 ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
181 ; the 128B of folded data is in 2 zmm registers: zmm0, zmm4
183 _fold_128_B_register:
184 ; fold the 8 128b parts into 1 xmm register with different constants
; zmm16 <- the 64 bytes starting at rk9 (rk9..rk16, one constant pair per lane).
185 vmovdqu8 zmm16, [rk9] ; multiply by rk9-rk16
; zmm11 <- the 64 bytes starting at rk17: rk17..rk20, then rk_1b/rk_2b (same
; values as rk1/rk2), then two zero qwords, so the top lane multiplies by zero.
186 vmovdqu8 zmm11, [rk17] ; multiply by rk17-rk20, rk1,rk2, 0,0
187 vpclmulqdq zmm1, zmm0, zmm16, 0x01
188 vpclmulqdq zmm2, zmm0, zmm16, 0x10
; Top 128-bit lane of zmm4 is copied to xmm7 (upper lanes of zmm7 are zeroed);
; it is XORed in unmultiplied by the second vpternlogq below.
189 vextracti64x2 xmm7, zmm4, 3 ; save last that has no multiplicand
191 vpclmulqdq zmm5, zmm4, zmm11, 0x01
192 vpclmulqdq zmm6, zmm4, zmm11, 0x10
193 vmovdqa xmm10, [rk1] ; Needed later in reduction loop
; zmm1 = zmm1 ^ zmm2 ^ zmm5 ^ zmm6 ^ zmm7 (imm 0x96 selects three-input XOR).
194 vpternlogq zmm1, zmm2, zmm5, 0x96 ; xor ABC
195 vpternlogq zmm1, zmm6, zmm7, 0x96 ; xor ABC
; Horizontal XOR of the four 128-bit lanes of zmm1 into xmm7:
; swap the 256-bit halves, XOR the low halves, then XOR the two remaining lanes.
197 vshufi64x2 zmm8, zmm1, zmm1, 0x4e ; Swap 1,0,3,2 - 01 00 11 10
198 vpxorq ymm8, ymm8, ymm1
199 vextracti64x2 xmm5, ymm8, 1
200 vpxorq xmm7, xmm5, xmm8
202 ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
203 ; instead of a cmp instruction, we use the negative flag with the jl instruction
; The vector instructions above do not modify RFLAGS, so jl tests the flags
; left by the counter adjustment described in the two comments above.
205 jl _final_reduction_for_128
207 ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
208 ; we can fold 16 bytes at a time if y>=16
209 ; continue folding 16B at a time
; Two-operand form: the destination is also the first source.
; xmm7 <- clmul(xmm7 high qword, xmm10 low qword)
; xmm8 <- clmul(xmm8 low qword,  xmm10 high qword)
; NOTE(review): xmm10 presumably still holds the rk1/rk2 pair and xmm8 a copy of
; xmm7 at this point -- confirm.
213 vpclmulqdq xmm7, xmm10, 0x1
214 vpclmulqdq xmm8, xmm10, 0x10
220 ; instead of a cmp instruction, we utilize the flags with the jge instruction
221 ; equivalent of: cmp arg3, 16-16
222 ; check if there is any more 16B in the buffer to be able to fold
223 jge _16B_reduction_loop
225 ;now we have 16+z bytes left to reduce, where 0<= z < 16.
226 ;first, we reduce the data in the xmm7 register
229 _final_reduction_for_128:
232 ; here we are getting data that is less than 16 bytes.
233 ; since we know that there was data before the pointer, we can offset
234 ; the input pointer before the actual point, to receive exactly 16 bytes.
235 ; after that the registers need to be adjusted.
; Unaligned load of the 16 bytes that end at address arg2 + arg3.
240 vmovdqu xmm1, [arg2 - 16 + arg3]
242 ; get rid of the extra data that was loaded before
243 ; load the shift constant
244 lea rax, [pshufb_shf_table]
; Per-byte select: where the top bit of the xmm0 byte is set the result byte
; comes from xmm1, otherwise from xmm2.
253 vpblendvb xmm2, xmm2, xmm1, xmm0
; Same two-operand fold as in the 16-byte loop: high qword of xmm7 times low
; qword of xmm10, and low qword of xmm8 times high qword of xmm10.
256 vpclmulqdq xmm7, xmm10, 0x1
258 vpclmulqdq xmm8, xmm10, 0x10
263 ; compute crc of a 128-bit value
; imm 0 multiplies the low qwords of both operands; imm 0x10 multiplies the low
; qword of xmm7 by the high qword of xmm10.
; NOTE(review): confirm which constant pair xmm10 holds at each multiply below.
268 vpclmulqdq xmm7, xmm10, 0
277 vpclmulqdq xmm7, xmm10, 0
279 vpclmulqdq xmm7, xmm10, 0x10
; Epilogue: on win64, reload xmm6-xmm15 from the same rsp + XMM_SAVE slots the
; prologue stored them in, then release the VARIABLE_OFFSET bytes of stack.
289 %ifidn __OUTPUT_FORMAT__, win64
290 vmovdqa xmm6, [rsp + XMM_SAVE + 16*0]
291 vmovdqa xmm7, [rsp + XMM_SAVE + 16*1]
292 vmovdqa xmm8, [rsp + XMM_SAVE + 16*2]
293 vmovdqa xmm9, [rsp + XMM_SAVE + 16*3]
294 vmovdqa xmm10, [rsp + XMM_SAVE + 16*4]
295 vmovdqa xmm11, [rsp + XMM_SAVE + 16*5]
296 vmovdqa xmm12, [rsp + XMM_SAVE + 16*6]
297 vmovdqa xmm13, [rsp + XMM_SAVE + 16*7]
298 vmovdqa xmm14, [rsp + XMM_SAVE + 16*8]
299 vmovdqa xmm15, [rsp + XMM_SAVE + 16*9]
; Undo the "sub rsp, VARIABLE_OFFSET" from function entry.
301 add rsp, VARIABLE_OFFSET
304 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
305 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
306 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
307 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Paths for buffers too short for the wide folding code. Each path moves the
; initial crc into xmm0 (vmovq from a general register zeroes the upper 64 bits
; of xmm0) and the input data into xmm7.
312 ; check if there is enough buffer to be able to fold 16B at a time
316 ; if there is, load the constants
317 vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
319 vmovq xmm0, arg1 ; get the initial crc value
320 vmovdqu xmm7, [arg2] ; load the plaintext
323 ; update the buffer pointer
326 ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
329 jmp _16B_reduction_loop
333 ; mov initial crc to the return value. this is necessary for zero-length buffers.
338 vmovq xmm0, arg1 ; get the initial crc value
; Fewer than 16 bytes: handled separately at _less_than_16_left.
342 jl _less_than_16_left
; At least one full 16-byte block: load it, XOR the crc into its low 8 bytes,
; load the rk1/rk2 pair and join the common tail at _get_last_two_xmms.
344 vmovdqu xmm7, [arg2] ; load the plaintext
345 vpxor xmm7, xmm0 ; xor the initial crc value
348 vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
349 jmp _get_last_two_xmms
354 ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
360 ; backup the counter value
403 vpxor xmm7, xmm0 ; xor the initial crc value
; rax <- address of the byte-shuffle table; the loads below fetch a 16-byte
; shuffle mask at a byte offset given by r9.
; NOTE(review): r9 is presumably derived from the remaining length -- confirm.
405 lea rax,[pshufb_shf_table]
411 vmovdqu xmm0, [rax + r9]
416 ; Left shift (8-length) bytes in XMM
417 vmovdqu xmm0, [rax + r9 + 8]
425 vpxor xmm7, xmm0 ; xor the initial crc value
433 ; precomputed constants
; The code reads these as adjacent qword pairs (low qword first) or as whole
; 64-byte vectors, so the order and adjacency of the entries matter.
; NOTE(review): presumably carry-less folding constants for the reflected
; CRC64-ISO polynomial -- confirm how each value is derived.
; rk_1/rk_2: broadcast as a 128-bit pair into zmm16 for the 256-byte fold.
434 rk_1: dq 0x45000000b0000000
435 rk_2: dq 0x6b700000f5000000
; rk1/rk2: loaded as a 128-bit pair into xmm10 with vmovdqa (aligned load).
436 rk1: dq 0xf500000000000001
437 rk2: dq 0x6b70000000000001
; rk3/rk4: broadcast as a 128-bit pair into zmm10 for the 128-byte fold.
438 rk3: dq 0xb001000000010000
439 rk4: dq 0xf501b0000001b000
440 rk5: dq 0xf500000000000001
441 rk6: dq 0x0000000000000000
442 rk7: dq 0xb000000000000001
443 rk8: dq 0xb000000000000000
; rk9..rk16: loaded as one 64-byte vector into zmm16 in _fold_128_B_register.
444 rk9: dq 0xe014514514501501
445 rk10: dq 0x771db6db6db71c71
446 rk11: dq 0xa101101101110001
447 rk12: dq 0x1ab1ab1ab1aab001
448 rk13: dq 0xf445014445000001
449 rk14: dq 0x6aab71daab700001
450 rk15: dq 0xb100010100000001
451 rk16: dq 0x01b001b1b0000001
; rk17 through the two trailing zero qwords: loaded as one 64-byte vector into
; zmm11 in _fold_128_B_register.
452 rk17: dq 0xe145150000000001
453 rk18: dq 0x76db6c7000000001
454 rk19: dq 0xa011000000000001
455 rk20: dq 0x1b1ab00000000001
; rk_1b/rk_2b repeat the rk1/rk2 values so they sit in the third lane of that
; vector; the final zero pair makes the fourth lane multiply to zero.
457 rk_1b: dq 0xf500000000000001
458 rk_2b: dq 0x6b70000000000001
459 dq 0x0000000000000000
460 dq 0x0000000000000000
466 ; use these values for shift constants for the pshufb instruction
467 ; different alignments result in values as shown:
; An unaligned 16-byte load at byte offset k (1..15) into the 32 bytes of data
; below yields the row marked "shr k". Index bytes with the top bit set
; (0x80..0x8f) make pshufb write a zero byte.
468 ; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
469 ; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
470 ; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
471 ; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
472 ; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
473 ; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
474 ; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
475 ; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
476 ; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
477 ; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
478 ; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
479 ; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
480 ; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
481 ; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
482 ; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
; NOTE(review): these two rows are presumably the data behind the
; pshufb_shf_table symbol that the code addresses with lea -- confirm.
483 dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
484 dq 0x0706050403020100, 0x000e0d0c0b0a0908
; 128-bit mask constants (low qword listed first).
; mask:  low 64 bits set, high 64 bits clear.
; mask2: low 32 bits clear, all other bits set.
; mask3: 0x80 in every byte.
486 mask: dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
487 mask2: dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
488 mask3: dq 0x8080808080808080, 0x8080808080808080
; Fallback branch of the AS_FEATURE_LEVEL check: the implementation is not
; assembled. For win64 output only, an empty global label named
; no_<FUNCTION_NAME> (built with the %+ token-paste operator) is defined.
490 %else ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
491 %ifidn __OUTPUT_FORMAT__, win64
492 global no_ %+ FUNCTION_NAME
493 no_ %+ FUNCTION_NAME %+ :
495 %endif ; (AS_FEATURE_LEVEL) >= 10