1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
30 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
32 ; uint64_t crc64_iso_norm_by16_10(
33 ; uint64_t init_crc, //initial CRC value, 64 bits
34 ; const unsigned char *buf, //buffer pointer to calculate CRC on
35 ; uint64_t len //buffer length in bytes (64-bit data)
38 %include "reg_sizes.asm"
41 %define FUNCTION_NAME crc64_iso_norm_by16_10
; Only assemble the real body if the assembler supports AVX-512 + VPCLMULQDQ
; encodings (AS_FEATURE_LEVEL >= 10); otherwise the %else stub at file end is used.
44 %if (AS_FEATURE_LEVEL) >= 10
46 %define fetch_dist 1024
54 %ifidn __OUTPUT_FORMAT__, win64
; Win64 needs 10 extra 16-byte slots because xmm6-xmm15 are callee-saved
; under the Microsoft x64 ABI; SysV only needs scratch space.
64 %ifidn __OUTPUT_FORMAT__, win64
66 %define VARIABLE_OFFSET 16*12+8
68 %define VARIABLE_OFFSET 16*2+8
72 global FUNCTION_NAME:ISAL_SYM_TYPE_FUNCTION
; NOTE(review): chunk is elided here (original line numbers skip); the
; function label and arg1/arg2/arg3 register mapping are defined in the
; missing lines.
75 sub rsp, VARIABLE_OFFSET
77 %ifidn __OUTPUT_FORMAT__, win64
78 ; push the xmm registers into the stack to maintain
; them across calls (xmm6-xmm15 are callee-saved on Win64)
79 vmovdqa [rsp + XMM_SAVE + 16*0], xmm6
80 vmovdqa [rsp + XMM_SAVE + 16*1], xmm7
81 vmovdqa [rsp + XMM_SAVE + 16*2], xmm8
82 vmovdqa [rsp + XMM_SAVE + 16*3], xmm9
83 vmovdqa [rsp + XMM_SAVE + 16*4], xmm10
84 vmovdqa [rsp + XMM_SAVE + 16*5], xmm11
85 vmovdqa [rsp + XMM_SAVE + 16*6], xmm12
86 vmovdqa [rsp + XMM_SAVE + 16*7], xmm13
87 vmovdqa [rsp + XMM_SAVE + 16*8], xmm14
88 vmovdqa [rsp + XMM_SAVE + 16*9], xmm15
; zmm18 = per-lane byte-swap mask (SHUF_MASK flips endianness of each 16B lane)
90 vbroadcasti32x4 zmm18, [SHUF_MASK]
94 ; load the initial crc value
95 vmovq xmm10, arg1 ; initial crc
97 ; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
98 ; because data will be byte-reflected and will align with initial crc at correct place.
101 ; receive the initial 128B data, xor the initial crc value
102 vmovdqu8 zmm0, [arg2+16*0]
103 vmovdqu8 zmm4, [arg2+16*4]
; byte-reflect every 16B lane of the input so it lines up with the CRC
104 vpshufb zmm0, zmm0, zmm18
105 vpshufb zmm4, zmm4, zmm18
107 vbroadcasti32x4 zmm10, [rk3] ;zmm10 has rk3 and rk4
108 ;imm value of pclmulqdq instruction will determine which constant to use
; NOTE(review): elided lines here presumably xor the initial CRC into the
; top lane and advance arg2/arg3 — confirm against the full source.
113 vmovdqu8 zmm7, [arg2+16*8]
114 vmovdqu8 zmm8, [arg2+16*12]
115 vpshufb zmm7, zmm7, zmm18
116 vpshufb zmm8, zmm8, zmm18
117 vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
; Fold the four 64B accumulators (zmm0/4/7/8) against the next 256B of input.
; Each fold is: acc = clmul(acc_lo, k) ^ clmul(acc_hi, k') ^ new_data.
122 vmovdqu8 zmm3, [arg2+16*0]
123 vpshufb zmm3, zmm3, zmm18
124 vpclmulqdq zmm1, zmm0, zmm16, 0x00
125 vpclmulqdq zmm2, zmm0, zmm16, 0x11
126 vpxorq zmm0, zmm1, zmm2
127 vpxorq zmm0, zmm0, zmm3
129 vmovdqu8 zmm9, [arg2+16*4]
130 vpshufb zmm9, zmm9, zmm18
131 vpclmulqdq zmm5, zmm4, zmm16, 0x00
132 vpclmulqdq zmm6, zmm4, zmm16, 0x11
133 vpxorq zmm4, zmm5, zmm6
134 vpxorq zmm4, zmm4, zmm9
136 vmovdqu8 zmm11, [arg2+16*8]
137 vpshufb zmm11, zmm11, zmm18
138 vpclmulqdq zmm12, zmm7, zmm16, 0x00
139 vpclmulqdq zmm13, zmm7, zmm16, 0x11
140 vpxorq zmm7, zmm12, zmm13
141 vpxorq zmm7, zmm7, zmm11
143 vmovdqu8 zmm17, [arg2+16*12]
144 vpshufb zmm17, zmm17, zmm18
145 vpclmulqdq zmm14, zmm8, zmm16, 0x00
146 vpclmulqdq zmm15, zmm8, zmm16, 0x11
147 vpxorq zmm8, zmm14, zmm15
148 vpxorq zmm8, zmm8, zmm17
; Collapse 256B of state down to 128B: fold zmm0 into zmm7 and zmm4 into zmm8
; with the rk3/rk4 constants, using vpternlogq 0x96 as a 3-way xor.
155 vpclmulqdq zmm1, zmm0, zmm10, 0x00
156 vpclmulqdq zmm2, zmm0, zmm10, 0x11
157 vpternlogq zmm7, zmm1, zmm2, 0x96 ; xor ABC
159 vpclmulqdq zmm5, zmm4, zmm10, 0x00
160 vpclmulqdq zmm6, zmm4, zmm10, 0x11
161 vpternlogq zmm8, zmm5, zmm6, 0x96 ; xor ABC
; NOTE(review): elided lines likely move zmm7/zmm8 into zmm0/zmm4 before
; this jump — the 128B fold loop below operates on zmm0/zmm4.
167 jmp _fold_128_B_register
169 ; fold 128B at a time. This section of the code folds 2 zmm registers in parallel
171 add arg2, 128 ; update the buffer pointer
; Fold zmm0 against the next 64B: acc = clmul(lo)*rk ^ clmul(hi)*rk' ^ data
172 vmovdqu8 zmm8, [arg2+16*0]
173 vpshufb zmm8, zmm8, zmm18 ; byte-reflect each 16B lane
174 vpclmulqdq zmm1, zmm0, zmm10, 0x00
175 vpclmulqdq zmm2, zmm0, zmm10, 0x11
176 vpxorq zmm0, zmm1, zmm2
177 vpxorq zmm0, zmm0, zmm8
; Same fold for the second 64B accumulator, zmm4
179 vmovdqu8 zmm9, [arg2+16*4]
180 vpshufb zmm9, zmm9, zmm18
181 vpclmulqdq zmm5, zmm4, zmm10, 0x00
182 vpclmulqdq zmm6, zmm4, zmm10, 0x11
183 vpxorq zmm4, zmm5, zmm6
184 vpxorq zmm4, zmm4, zmm9
187 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
190 ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
191 ; the 128B of folded data is in 2 zmm registers: zmm0, zmm4
193 _fold_128_B_register:
194 ; fold the 8 128b parts into 1 xmm register with different constants
; zmm16/zmm11 hold per-lane fold constants: each 128b lane is multiplied by
; a different rk pair so all eight lanes land on the same final 128b value.
195 vmovdqu8 zmm16, [rk9] ; multiply by rk9-rk16
196 vmovdqu8 zmm11, [rk17] ; multiply by rk17-rk20, rk1,rk2, 0,0
197 vpclmulqdq zmm1, zmm0, zmm16, 0x00
198 vpclmulqdq zmm2, zmm0, zmm16, 0x11
199 vextracti64x2 xmm7, zmm4, 3 ; save last that has no multiplicand
201 vpclmulqdq zmm5, zmm4, zmm11, 0x00
202 vpclmulqdq zmm6, zmm4, zmm11, 0x11
203 vmovdqa xmm10, [rk1] ; Needed later in reduction loop
204 vpternlogq zmm1, zmm2, zmm5, 0x96 ; xor ABC
205 vpternlogq zmm1, zmm6, zmm7, 0x96 ; xor ABC
; Horizontal xor of zmm1's four 128b lanes down to one xmm (xmm7):
; swap hi/lo 256b halves, xor, then xor the two remaining 128b lanes.
207 vshufi64x2 zmm8, zmm1, zmm1, 0x4e ; Swap 1,0,3,2 - 01 00 11 10
208 vpxorq ymm8, ymm8, ymm1
209 vextracti64x2 xmm5, ymm8, 1
210 vpxorq xmm7, xmm5, xmm8
212 ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
213 ; instead of a cmp instruction, we use the negative flag with the jl instruction
215 jl _final_reduction_for_128
217 ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
218 ; we can fold 16 bytes at a time if y>=16
219 ; continue folding 16B at a time
; Fold xmm7 by rk1/rk2: two-operand VEX form — dest doubles as first source,
; so the pre-fold xmm7 must have been copied aside in the elided lines.
223 vpclmulqdq xmm7, xmm10, 0x11
224 vpclmulqdq xmm8, xmm10, 0x00
; byte-reflect the freshly loaded 16B before xoring it into the fold
227 vpshufb xmm0, xmm0, xmm18
231 ; instead of a cmp instruction, we utilize the flags with the jge instruction
232 ; equivalent of: cmp arg3, 16-16
233 ; check if there is any more 16B in the buffer to be able to fold
234 jge _16B_reduction_loop
236 ;now we have 16+z bytes left to reduce, where 0<= z < 16.
237 ;first, we reduce the data in the xmm7 register
240 _final_reduction_for_128:
243 ; here we are getting data that is less than 16 bytes.
244 ; since we know that there was data before the pointer, we can offset
245 ; the input pointer before the actual point, to receive exactly 16 bytes.
246 ; after that the registers need to be adjusted.
; Overlapping tail load: reads the final 16B ending exactly at the buffer end.
250 vmovdqu xmm1, [arg2 - 16 + arg3]
253 ; get rid of the extra data that was loaded before
254 ; load the shift constant
; pshufb_shf_table+16-arg3 yields a mask that shifts/blends out the bytes
; that were already consumed by the overlapping load.
255 lea rax, [pshufb_shf_table + 16]
259 ; shift xmm2 to the left by arg3 bytes
262 ; shift xmm7 to the right by 16-arg3 bytes
265 vpblendvb xmm1, xmm1, xmm2, xmm0
; final 32B -> 16B fold of the two remaining xmm values
270 vpclmulqdq xmm7, xmm10, 0x11
271 vpclmulqdq xmm8, xmm10, 0x0
276 ; compute crc of a 128-bit value
; Reduce the 128-bit folded value to the final 64-bit CRC:
; first fold the high half down (rk5/rk6 region, elided), then Barrett
; reduction with rk7 (1/poly) and rk8 (poly) below.
281 vpclmulqdq xmm7, xmm10, 0x01 ; H*L
287 vmovdqa xmm10, [rk7] ; rk7 and rk8 in xmm10
; Barrett reduction: T1 = floor(R/poly) via rk7, T2 = T1*poly via rk8,
; final CRC = R xor T2 (surrounding moves/xors are in elided lines).
292 vpclmulqdq xmm7, xmm10, 0x01
295 vpclmulqdq xmm7, xmm10, 0x11
303 %ifidn __OUTPUT_FORMAT__, win64
; restore the Win64 callee-saved xmm registers pushed in the prologue
304 vmovdqa xmm6, [rsp + XMM_SAVE + 16*0]
305 vmovdqa xmm7, [rsp + XMM_SAVE + 16*1]
306 vmovdqa xmm8, [rsp + XMM_SAVE + 16*2]
307 vmovdqa xmm9, [rsp + XMM_SAVE + 16*3]
308 vmovdqa xmm10, [rsp + XMM_SAVE + 16*4]
309 vmovdqa xmm11, [rsp + XMM_SAVE + 16*5]
310 vmovdqa xmm12, [rsp + XMM_SAVE + 16*6]
311 vmovdqa xmm13, [rsp + XMM_SAVE + 16*7]
312 vmovdqa xmm14, [rsp + XMM_SAVE + 16*8]
313 vmovdqa xmm15, [rsp + XMM_SAVE + 16*9]
; undo the prologue's stack reservation (ret follows in elided lines)
315 add rsp, VARIABLE_OFFSET
318 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
319 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
320 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
321 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
326 ; check if there is enough buffer to be able to fold 16B at a time
330 ; if there is, load the constants
331 vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
333 vmovq xmm0, arg1 ; get the initial crc value
334 vpslldq xmm0, 8 ; align it to its correct place
335 vmovdqu xmm7, [arg2] ; load the plaintext
336 vpshufb xmm7, xmm18 ; byte-reflect the plaintext
; NOTE(review): the xor of xmm0 into xmm7 for this path sits in elided lines.
339 ; update the buffer pointer
342 ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
345 jmp _16B_reduction_loop
; Path for buffers between 0 and 32 bytes (label in elided lines):
349 ; mov initial crc to the return value. this is necessary for zero-length buffers.
354 vmovq xmm0, arg1 ; get the initial crc value
355 vpslldq xmm0, 8 ; align it to its correct place
359 jl _less_than_16_left
361 vmovdqu xmm7, [arg2] ; load the plaintext
362 vpshufb xmm7, xmm18 ; byte-reflect the plaintext
363 vpxor xmm7, xmm0 ; xor the initial crc value
366 vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
367 jmp _get_last_two_xmms
372 ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
378 ; backup the counter value
; (byte-copy loop and sub-8-byte handling are in elided lines)
422 vpxor xmm7, xmm0 ; xor the initial crc value
424 lea rax, [pshufb_shf_table + 16]
438 ; Right shift (8-length) bytes in XMM
449 vpxor xmm7, xmm0 ; xor the initial crc value
457 ; precomputed constants
; Folding constants (rk*) for the CRC64-ISO polynomial in "normal"
; (non-reflected) bit order; each rk pair folds the accumulator across a
; specific distance. rk7/rk8 are the Barrett-reduction pair.
458 rk_1: dq 0x0000001a00000144
459 rk_2: dq 0x0000015e00001dac
460 rk1: dq 0x0000000000000145
461 rk2: dq 0x0000000000001db7
462 rk3: dq 0x000100000001001a
463 rk4: dq 0x001b0000001b015e
464 rk5: dq 0x0000000000000145
465 rk6: dq 0x0000000000000000
466 rk7: dq 0x000000000000001b
467 rk8: dq 0x000000000000001b
; rk9-rk20: per-lane constants consumed as two zmm loads ([rk9], [rk17])
; when collapsing the eight 128b lanes down to one.
468 rk9: dq 0x0150145145145015
469 rk10: dq 0x1c71db6db6db71c7
470 rk11: dq 0x0001110110110111
471 rk12: dq 0x001aab1ab1ab1aab
472 rk13: dq 0x0000014445014445
473 rk14: dq 0x00001daab71daab7
474 rk15: dq 0x0000000101000101
475 rk16: dq 0x0000001b1b001b1b
476 rk17: dq 0x0000000001514515
477 rk18: dq 0x000000001c6db6c7
478 rk19: dq 0x0000000000011011
479 rk20: dq 0x00000000001ab1ab
481 rk_1b: dq 0x0000000000000145
482 rk_2b: dq 0x0000000000001db7
483 dq 0x0000000000000000
484 dq 0x0000000000000000
; mask1: pshufb "zero this byte" pattern (high bit set in every byte)
489 mask1: dq 0x8080808080808080, 0x8080808080808080
490 mask2: dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
491 mask3: dq 0x0000000000000000, 0xFFFFFFFFFFFFFFFF
; SHUF_MASK reverses the 16 bytes of a lane (big<->little endian swap)
493 SHUF_MASK: dq 0x08090A0B0C0D0E0F, 0x0001020304050607
496 ; use these values for shift constants for the pshufb instruction
497 ; different alignments result in values as shown:
498 ; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
499 ; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
500 ; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
501 ; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
502 ; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
503 ; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
504 ; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
505 ; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
506 ; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
507 ; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
508 ; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
509 ; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
510 ; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
511 ; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
512 ; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
513 dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
514 dq 0x0706050403020100, 0x0f0e0d0c0b0a0908
515 dq 0x8080808080808080, 0x0f0e0d0c0b0a0908
516 dq 0x8080808080808080, 0x8080808080808080
519 %else ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
520 %ifidn __OUTPUT_FORMAT__, win64
; Emit a dummy "no_<name>" symbol so Win64 builds still link when the
; AVX-512 implementation cannot be assembled.
521 global no_ %+ FUNCTION_NAME
522 no_ %+ FUNCTION_NAME %+ :
524 %endif ; (AS_FEATURE_LEVEL) >= 10